Part A done: implement discounted return, discounted reward-to-go, and mean/std normalization of advantages
This commit is contained in:
@@ -103,7 +103,7 @@ class PGAgent(nn.Module):
|
||||
q_values = None
|
||||
############################
|
||||
# YOUR IMPLEMENTATION HERE #
|
||||
|
||||
q_values = [self._discounted_return(reward) for reward in rewards]
|
||||
############################
|
||||
|
||||
else:
|
||||
@@ -114,7 +114,7 @@ class PGAgent(nn.Module):
|
||||
|
||||
############################
|
||||
# YOUR IMPLEMENTATION HERE #
|
||||
|
||||
q_values = [self._discounted_reward_to_go(reward) for reward in rewards]
|
||||
############################
|
||||
|
||||
return q_values
|
||||
@@ -148,7 +148,10 @@ class PGAgent(nn.Module):
|
||||
advantages = None
|
||||
############################
|
||||
# YOUR IMPLEMENTATION HERE #
|
||||
|
||||
source = rewards.copy()
|
||||
mean = np.mean(source)
|
||||
std = np.std(source)
|
||||
advantages = (source - mean)/std
|
||||
############################
|
||||
|
||||
return advantages
|
||||
@@ -166,9 +169,9 @@ class PGAgent(nn.Module):
|
||||
|
||||
############################
|
||||
# YOUR IMPLEMENTATION HERE #
|
||||
|
||||
q_value=sum(self.gamma ** i * reward for i, reward in enumerate(rewards))
|
||||
return [q_value] * len(rewards)
|
||||
############################
|
||||
pass
|
||||
|
||||
|
||||
def _discounted_reward_to_go(self, rewards: Sequence[float]) -> Sequence[float]:
|
||||
@@ -181,6 +184,12 @@ class PGAgent(nn.Module):
|
||||
|
||||
############################
|
||||
# YOUR IMPLEMENTATION HERE #
|
||||
|
||||
q_values = []
|
||||
current_sum = 0
|
||||
for t in range (len(rewards)-1,-1,-1):
|
||||
current_sum *= self.gamma
|
||||
current_sum += rewards[t]
|
||||
q_values.append(current_sum)
|
||||
q_values.reverse()
|
||||
return q_values
|
||||
############################
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user