I am trying to recreate the REINFORCE algorithm using the lunarlander-v2 environment but I am having trouble with my model updating. The total rewards for an episode is almost always negative. The values, after around 3-5 thousand iterations, stay within the range [-150,0]. My loss seems to get slightly better but it isn’t reflected in the scores. I cannot tell if it is an issue with my model architecture (being too simple?) or with the training function. If anyone has accomplished a passing score on lunar lander using Reinforce please let me know if there are tips you may have. My learning rate is 0.005, gamma is 0.99.

```
class PolicyNetwork(nn.Module):
def __init__(self, action_space, state_space):
super(PolicyNetwork, self).__init__()
self.fc1 = nn.Linear(state_space, 128)
self.fc2 = nn.Linear(128, 128)
self.fc3 = nn.Linear(128, action_space)
self.init_weights()
def init_weights(self):
nn.init.kaiming_normal_(self.fc1.weight)
nn.init.kaiming_normal_(self.fc2.weight)
nn.init.kaiming_normal_(self.fc3.weight)
self.fc1.bias.data.fill_(0.0)
self.fc2.bias.data.fill_(0.0)
self.fc3.bias.data.fill_(0.0)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return F.softmax(x, dim = 1)
def act(self, state):
state = torch.from_numpy(state).float().unsqueeze(0).to(device)
logits = self.forward(state)
prob_dist = Categorical(logits=logits)
action = prob_dist.sample()
log_prob = prob_dist.log_prob(action)
return action.item(), log_prob
def train_reinforce(model, env, max_timesteps, num_episodes, optimizer, gamma):
for epi in range(num_episodes):
rewards = []
log_probs = []
returns = deque()
state = env.reset()
for t in range(max_timesteps):
action, log_prob = model.act(state)
new_state, reward, done, _ = env.step(action)
rewards.append(reward)
log_probs.append(log_prob)
if done:
break
state = new_state
# Calculate Return for the episode using memoization
for t in reversed(range(len(rewards))):
discounted_return = (returns[0] if len(returns) > 0 else 0)
returns.appendleft(discounted_return*gamma + rewards[t])
returns = torch.tensor(returns).to(device)
returns = returns - returns.mean()
log_probs = torch.stack(log_probs).to(device)
loss = -log_probs * returns
loss = torch.sum(loss)
optimizer.zero_grad()
loss.backward()
optimizer.step()
```