Neural network oscillating with PyTorch REINFORCE on LunarLander-v2

I am trying to recreate the REINFORCE algorithm in PyTorch on the LunarLander-v2 environment, but I am having trouble getting my model to improve. The total reward for an episode is almost always negative; after around 3,000-5,000 iterations the scores settle into the range [-150, 0] and just oscillate there. My loss seems to improve slightly, but that isn't reflected in the scores. I cannot tell whether the issue is my model architecture (too simple?) or my training function. If anyone has reached a passing score on LunarLander with REINFORCE, please share any tips you have. My learning rate is 0.005 and gamma is 0.99.
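
Concretely, the update I am trying to implement is the vanilla REINFORCE objective: for each episode, loss = -sum_t (G_t - b) * log pi(a_t | s_t), where G_t is the discounted return from timestep t onward and b is the mean return of the episode, used as a simple baseline. The code below is my attempt at computing exactly that.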

from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class PolicyNetwork(nn.Module):
    def __init__(self, action_space, state_space):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(state_space, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_space)

        self.init_weights()

    def init_weights(self):
        # He (Kaiming) initialization for the ReLU hidden layers, biases zeroed below
        nn.init.kaiming_normal_(self.fc1.weight)
        nn.init.kaiming_normal_(self.fc2.weight)
        nn.init.kaiming_normal_(self.fc3.weight)

        self.fc1.bias.data.fill_(0.0)
        self.fc2.bias.data.fill_(0.0)
        self.fc3.bias.data.fill_(0.0)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=1)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state)
        # forward() already applies softmax, so pass the output as probs, not logits
        prob_dist = Categorical(probs=probs)
        action = prob_dist.sample()
        log_prob = prob_dist.log_prob(action)
        return action.item(), log_prob

def train_reinforce(model, env, max_timesteps, num_episodes, optimizer, gamma):
    for epi in range(num_episodes):
        rewards = []
        log_probs = []
        returns = deque()
        state = env.reset()
        # Roll out one episode with the current policy
        for t in range(max_timesteps):
            action, log_prob = model.act(state)
            new_state, reward, done, _ = env.step(action)
            rewards.append(reward)
            log_probs.append(log_prob)
            if done:
                break
            state = new_state

        # Compute the discounted return G_t for each timestep by working backwards
        # through the episode: G_t = r_t + gamma * G_{t+1}
        for t in reversed(range(len(rewards))):
            discounted_return = returns[0] if len(returns) > 0 else 0
            returns.appendleft(discounted_return * gamma + rewards[t])

        returns = torch.tensor(returns).to(device)
        returns = returns - returns.mean()  # subtract the episode mean as a baseline
        # cat the per-step (1,)-shaped log-probs into a single (T,) tensor so it
        # broadcasts elementwise against returns (stack would give (T, 1) and blow
        # the product up to (T, T))
        log_probs = torch.cat(log_probs).to(device)

        # REINFORCE loss: -sum_t (G_t - baseline) * log pi(a_t | s_t)
        loss = -log_probs * returns
        loss = torch.sum(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
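
For completeness, here is roughly how everything is wired together (the optimizer shown and the exact episode/timestep counts below are approximate; the learning rate and gamma are the values quoted above):

import gym
import torch.optim as optim

env = gym.make("LunarLander-v2")              # old gym API: reset() -> obs, step() -> (obs, reward, done, info)
state_space = env.observation_space.shape[0]  # 8-dimensional observation
action_space = env.action_space.n             # 4 discrete actions

model = PolicyNetwork(action_space, state_space).to(device)

# The choice of Adam here is approximate; lr=0.005 and gamma=0.99 are the exact values I am using.
optimizer = optim.Adam(model.parameters(), lr=0.005)

# 1000 steps is LunarLander-v2's per-episode limit; the episode count is approximate.
train_reinforce(model, env, max_timesteps=1000, num_episodes=5000,
                optimizer=optimizer, gamma=0.99)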