import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable  # legacy wrapper; a no-op on modern PyTorch
import matplotlib.pyplot as plt
import numpy as np
import math
import random
import os
import gym

# Hyper parameters
STATE_DIM = 4
ACTION_DIM = 2
STEP = 2000        # values were garbled in the source; 2000 training iterations
SAMPLE_NUMS = 30   # and 30-step rollouts are the usual settings for this example

class ActorNetwork(nn.Module):

    def __init__(self, input_size, hidden_size, action_size):
        super(ActorNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = F.log_softmax(self.fc3(out), dim=1)  # log-probabilities over actions
        return out
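# A quick sanity check for the actor (an illustrative addition, not part of
# the original script): because forward() returns log_softmax outputs,
# exponentiating them must give a probability distribution over the two
# CartPole actions.
def _check_actor_outputs():
    actor = ActorNetwork(STATE_DIM, 40, ACTION_DIM)
    log_probs = actor(Variable(torch.Tensor([[0.0, 0.0, 0.0, 0.0]])))
    probs = torch.exp(log_probs)                    # shape (1, ACTION_DIM)
    assert abs(float(probs.data.sum()) - 1.0) < 1e-5  # row sums to one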
class ValueNetwork(nn.Module):

    def __init__(self, input_size, hidden_size, output_size):
        super(ValueNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)  # raw state value, no activation
        return out

def roll_out(actor_network, task, sample_nums, value_network, init_state):
    # Collect up to sample_nums transitions, resuming from init_state.
    # task.reset()
    states = []
    actions = []
    rewards = []
    is_done = False
    final_r = 0
    state = init_state

    for j in range(sample_nums):
        states.append(state)
        log_softmax_action = actor_network(Variable(torch.Tensor([state])))
        softmax_action = torch.exp(log_softmax_action)
        # Sample an action from the current policy's distribution.
        action = np.random.choice(ACTION_DIM, p=softmax_action.cpu().data.numpy()[0])
        one_hot_action = [int(k == action) for k in range(ACTION_DIM)]
        next_state, reward, done, _ = task.step(action)
        # fix_reward = -10 if done else 1
        actions.append(one_hot_action)
        rewards.append(reward)
        final_state = next_state
        state = next_state
        if done:
            is_done = True
            state = task.reset()
            break
    if not is_done:
        # Rollout was truncated mid-episode: bootstrap the tail of the
        # return with the critic's estimate V(s_T).
        final_r = value_network(Variable(torch.Tensor([final_state]))).cpu().data.numpy()[0][0]
    return states, actions, rewards, final_r, state
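# Note on final_r (explanatory addition, not in the original): when roll_out
# stops after sample_nums steps without the episode terminating, the rest of
# the return is unknown, so it is bootstrapped with V(s_T). discount_reward
# below seeds its running sum with that value; for rewards [r0, r1] and
# bootstrap v, the discounted returns come out as
#   R1 = r1 + gamma * v
#   R0 = r0 + gamma * R1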
def discount_reward(r, gamma, final_r):
    # Backward pass over the rewards: discounted_r[t] = r[t] + gamma * R[t+1],
    # with the running sum seeded by the bootstrap value final_r.
    discounted_r = np.zeros_like(r)
    running_add = final_r
    for t in reversed(range(0, len(r))):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
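# A minimal numeric check of discount_reward (illustrative addition, not in
# the original): with rewards [1, 1, 1], gamma = 0.9 and no bootstrap value,
# the returns are [1 + 0.9 * 1.9, 1 + 0.9 * 1, 1] = [2.71, 1.9, 1.0].
def _check_discount_reward():
    got = discount_reward([1.0, 1.0, 1.0], 0.9, 0)
    assert np.allclose(got, [2.71, 1.9, 1.0])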
def main():
    # Init a task generator for data fetching.
    task = gym.make("CartPole-v0")
    init_state = task.reset()

    # Init value network (the critic).
    value_network = ValueNetwork(input_size=STATE_DIM, hidden_size=40, output_size=1)
    value_network_optim = torch.optim.Adam(value_network.parameters(), lr=0.01)

    # Init actor network (the policy).
    actor_network = ActorNetwork(STATE_DIM, 40, ACTION_DIM)
    actor_network_optim = torch.optim.Adam(actor_network.parameters(), lr=0.01)

    steps = []
    task_episodes = []
    test_results = []

    for step in range(STEP):
        states, actions, rewards, final_r, current_state = roll_out(
            actor_network, task, SAMPLE_NUMS, value_network, init_state)
        init_state = current_state
        actions_var = Variable(torch.Tensor(actions).view(-1, ACTION_DIM))
        states_var = Variable(torch.Tensor(states).view(-1, STATE_DIM))

        # Train actor network.
        actor_network_optim.zero_grad()
        log_softmax_actions = actor_network(states_var)
        # Flatten V(s) to shape (N,) so `qs - vs` stays element-wise instead of
        # broadcasting (N,) against (N, 1) into an (N, N) matrix.
        vs = value_network(states_var).detach().view(-1)
        # Calculate qs: discounted returns serve as the Q estimates.
        qs = Variable(torch.Tensor(discount_reward(rewards, 0.99, final_r)))

        advantages = qs - vs
        # Policy gradient: the one-hot actions_var picks out log pi(a_t | s_t),
        # which is weighted by the advantage and maximized (hence the minus sign).
        actor_network_loss = -torch.mean(torch.sum(log_softmax_actions * actions_var, 1) * advantages)
        actor_network_loss.backward()
        torch.nn.utils.clip_grad_norm_(actor_network.parameters(), 0.5)
        actor_network_optim.step()

        # Train value network: regress V(s) toward the discounted returns.
        value_network_optim.zero_grad()
        target_values = qs
        values = value_network(states_var).view(-1)
        criterion = nn.MSELoss()
        value_network_loss = criterion(values, target_values)
        value_network_loss.backward()
        torch.nn.utils.clip_grad_norm_(value_network.parameters(), 0.5)
        value_network_optim.step()
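        # Evaluation note (explanatory addition, not in the original): every 50
        # training iterations the current policy is run greedily (argmax instead
        # of sampling) for 10 test episodes of at most 200 steps each, and the
        # average episode reward is reported. CartPole-v0 caps episodes at 200
        # steps, so 200.0 is the best possible score here.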
        # Testing.
        if (step + 1) % 50 == 0:
            result = 0
            test_task = gym.make("CartPole-v0")
            for test_epi in range(10):
                state = test_task.reset()
                for test_step in range(200):
                    softmax_action = torch.exp(actor_network(Variable(torch.Tensor([state]))))
                    # print(softmax_action.data)
                    action = np.argmax(softmax_action.data.numpy()[0])
                    next_state, reward, done, _ = test_task.step(action)
                    result += reward
                    state = next_state
                    if done:
                        break
            print("step:", step + 1, "test result:", result / 10.0)
            steps.append(step + 1)
            test_results.append(result / 10)

if __name__ == '__main__':
    main()
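# Optional plotting helper (illustrative addition, not in the original script;
# it is what the matplotlib import above would be for). main() only prints and
# locally accumulates `steps` and `test_results`; if it were changed to return
# them (an assumption), the learning curve could be drawn like this:
def plot_test_results(steps, test_results):
    plt.plot(steps, test_results)
    plt.xlabel("training step")
    plt.ylabel("average test reward over 10 episodes")
    plt.title("A2C on CartPole-v0")
    plt.show()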