import numpy as np

step = 0
done = False
episode_reward = 0
episode_num = 0
obs = env.reset()

# Run for a total of max_timesteps environment steps
while step < max_timesteps:
    # done marks the end of an episode: reset the environment and counters
    if done:
        obs = env.reset()
        done = False
        episode_reward = 0
        episode_num += 1
    # For the first random_steps steps (e.g. 10% of the run), sample random
    # actions so the replay buffer is filled with diverse, exploratory data
    if step < random_steps:
        action = env.action_space.sample()
    else:
        # After the warm-up, let the policy pick an action for the current obs
        action = self.model.select_action(np.array(obs))
    new_obs, reward, done, _ = env.step(action)
    # Accumulate the reward of the current episode
    episode_reward += reward
    obs = new_obs
    step += 1
The most important piece, however, is the call into the model's own training step.
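The loop above only collects experience; in reference implementations of this family of algorithms, each step also stores the transition in a replay buffer and, at the end of an episode, calls the policy's train method on the collected data. A minimal sketch of how that call could plug into the loop, where replay_buffer, episode_timesteps, and the train(...) signature are assumptions modeled on common DDPG/TD3 code, not taken from this post:

# Hypothetical sketch: store the transition, then train at episode end.
replay_buffer.add((obs, new_obs, action, reward, float(done)))

if done:
    self.model.train(replay_buffer, episode_timesteps,
                     batch_size=100,   # transitions sampled per gradient step
                     discount=0.99,    # gamma in the Bellman target
                     tau=0.005)        # soft-update rate for target networks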
Network architecture:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)
        self.max_action = max_action

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        # tanh bounds the output to [-1, 1]; scaling by max_action maps it
        # onto the environment's action range
        x = self.max_action * torch.tanh(self.l3(x))
        return x

class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.l1 = nn.Linear(state_dim, 400)
        # the action is concatenated after the first hidden layer
        self.l2 = nn.Linear(400 + action_dim, 300)
        self.l3 = nn.Linear(300, 1)

    def forward(self, x, u):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(torch.cat([x, u], 1)))
        x = self.l3(x)  # scalar Q(s, a) estimate
        return x
The Actor and the Critic are both simple three-layer fully connected networks, so there is no need to describe them in detail.
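As a quick sanity check, here is a hedged usage sketch: the Actor maps a batch of states to actions bounded by max_action, and the Critic maps a (state, action) pair to a scalar Q-value. The dimensions below are arbitrary example values, not taken from the post.

# Illustrative smoke test; state_dim, action_dim and max_action are
# arbitrary example values chosen for demonstration.
state_dim, action_dim, max_action = 3, 1, 2.0
actor = Actor(state_dim, action_dim, max_action)
critic = Critic(state_dim, action_dim)

state = torch.randn(5, state_dim)   # batch of 5 states
action = actor(state)               # shape (5, 1), values in [-2.0, 2.0]
q_value = critic(state, action)     # shape (5, 1), one Q(s, a) per sample
print(action.shape, q_value.shape)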