done done done part A
.gitignore (vendored, 12 changed lines)
@@ -1,7 +1,7 @@
 # Python
-# __pycache__/
-# *.pyc
-# *.pyo
-# *.pyd
-# *.pyw
-# *.pyz
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.pyw
+*.pyz
environment.yml (new file, 17 lines)
@@ -0,0 +1,17 @@
+name: DRL
+channels:
+  - pytorch
+  - nvidia
+  - defaults
+dependencies:
+  - python=3.10
+  - pip=23.0.1
+  - pip:
+    - gymnasium[classic-control]==0.27.1
+    - hydra-core==1.3.2
+    - matplotlib==3.7.1
+    - moviepy==1.0.3
+    - torch torchvision --index-url https://download.pytorch.org/whl/cu126  # change this to your own cuda version
+    - opencv-python
+    - tensorboardX==2.6.4
+    - tensorboard==2.20.0
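A minimal smoke test (not part of the commit) for verifying this environment after conda env create -f environment.yml and conda activate DRL; it assumes only the packages pinned above and a CUDA-enabled torch wheel. If the combined torch torchvision --index-url entry does not resolve on a given setup, installing those two packages separately against the same index URL is a common workaround.

# Environment smoke test (hypothetical helper, not part of the commit).
import gymnasium as gym
import torch

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())

env = gym.make("CartPole-v1")
obs, info = env.reset(seed=0)
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
print("CartPole step OK, reward =", reward)
env.close()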
hw3/bash/1-2-experiments.sh (new file, 8 lines)
@@ -0,0 +1,8 @@
+python run.py --env_name CartPole-v1 -n 200 -b 1000 --exp_name cartpole
+python run.py --env_name CartPole-v1 -n 200 -b 1000 -rtg --exp_name cartpole_rtg
+python run.py --env_name CartPole-v1 -n 200 -b 1000 -na --exp_name cartpole_na
+python run.py --env_name CartPole-v1 -n 200 -b 1000 -rtg -na --exp_name cartpole_rtg_na
+python run.py --env_name CartPole-v1 -n 200 -b 4000 --exp_name cartpole_lb
+python run.py --env_name CartPole-v1 -n 200 -b 4000 -rtg --exp_name cartpole_lb_rtg
+python run.py --env_name CartPole-v1 -n 200 -b 4000 -na --exp_name cartpole_lb_na
+python run.py --env_name CartPole-v1 -n 200 -b 4000 -rtg -na --exp_name cartpole_lb_rtg_na
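The script sweeps two batch sizes (1000 and 4000, the latter tagged _lb) against every combination of -rtg and -na, which presumably toggle reward-to-go returns and advantage normalization in run.py, matching the code added later in this commit. A hypothetical Python generator for the same eight commands, useful if the grid grows:

# Hypothetical generator for the experiment grid in 1-2-experiments.sh; assumes
# -rtg and -na are the reward-to-go and advantage-normalization flags of run.py.
from itertools import product

batch_sizes = {1000: "", 4000: "_lb"}
flags = {"": "", "-rtg": "_rtg", "-na": "_na", "-rtg -na": "_rtg_na"}

for (b, b_tag), (flag, f_tag) in product(batch_sizes.items(), flags.items()):
    name = f"cartpole{b_tag}{f_tag}"
    cmd = f"python run.py --env_name CartPole-v1 -n 200 -b {b} {flag} --exp_name {name}"
    print(cmd.replace("  ", " "))  # drop the double space left by an empty flag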
hw3/bash/read-results.sh (new file, 1 line)
@@ -0,0 +1 @@
+tensorboard --logdir data/pg_cartpole_CartPole-v1_25-10-2025_15-00-04
11 binary files changed (not shown).
@@ -103,7 +103,7 @@ class PGAgent(nn.Module):
             q_values = None
             ############################
             # YOUR IMPLEMENTATION HERE #
+            q_values = [self._discounted_return(reward) for reward in rewards]
             ############################
 
         else:
@@ -114,7 +114,7 @@ class PGAgent(nn.Module):
             ############################
             # YOUR IMPLEMENTATION HERE #
+            q_values = [self._discounted_reward_to_go(reward) for reward in rewards]
             ############################
 
         return q_values
@@ -148,7 +148,10 @@ class PGAgent(nn.Module):
         advantages = None
         ############################
         # YOUR IMPLEMENTATION HERE #
+        source = rewards.copy()
+        mean = np.mean(source)
+        std = np.std(source)
+        advantages = (source - mean) / std
         ############################
 
         return advantages
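The normalization above divides by the raw standard deviation, which is undefined when every value in the batch is identical. A minimal sketch, not part of the commit, of the same transformation with an epsilon guard, a common variant:

# Epsilon-guarded normalization (an assumption, not the commit's code): protects
# against std == 0 when all values are equal.
import numpy as np

def normalize(values: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    values = np.asarray(values, dtype=np.float32)
    return (values - values.mean()) / (values.std() + eps)

print(normalize(np.array([1.0, 2.0, 3.0])))  # approx [-1.2247, 0.0, 1.2247]
print(normalize(np.array([5.0, 5.0, 5.0])))  # all zeros instead of NaN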
@@ -166,9 +169,9 @@ class PGAgent(nn.Module):
         ############################
         # YOUR IMPLEMENTATION HERE #
+        q_value = sum(self.gamma**i * reward for i, reward in enumerate(rewards))
+        return [q_value] * len(rewards)
         ############################
         pass
 
 
     def _discounted_reward_to_go(self, rewards: Sequence[float]) -> Sequence[float]:
@@ -181,6 +184,12 @@ class PGAgent(nn.Module):
         ############################
         # YOUR IMPLEMENTATION HERE #
+        q_values = []
+        current_sum = 0
+        for t in range(len(rewards) - 1, -1, -1):
+            current_sum *= self.gamma
+            current_sum += rewards[t]
+            q_values.append(current_sum)
+        q_values.reverse()
+        return q_values
         ############################
         pass
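The two helpers differ only in whether the discounted sum is shared by every timestep or truncated to each timestep's future. A standalone numerical check (free functions that mirror _discounted_return and _discounted_reward_to_go above, with gamma passed explicitly rather than read from self):

# Numerical check of the two return estimators on a toy reward sequence.
def discounted_return(rewards, gamma):
    total = sum(gamma**i * r for i, r in enumerate(rewards))
    return [total] * len(rewards)          # same full-trajectory return at every t

def discounted_reward_to_go(rewards, gamma):
    out, running = [], 0.0
    for r in reversed(rewards):             # accumulate from the end, as above
        running = r + gamma * running
        out.append(running)
    out.reverse()
    return out

rewards, gamma = [1.0, 1.0, 1.0], 0.5
print(discounted_return(rewards, gamma))        # [1.75, 1.75, 1.75]
print(discounted_reward_to_go(rewards, gamma))  # [1.75, 1.5, 1.0]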
@@ -63,7 +63,9 @@ class MLPPolicy(nn.Module):
         ############################
         # YOUR IMPLEMENTATION HERE #
+        obs = ptu.from_numpy(obs)
+        action_tensor = self.forward(obs)
+        action = ptu.to_numpy(action_tensor.sample())
         ############################
 
         return action
@@ -80,7 +82,8 @@ class MLPPolicy(nn.Module):
             ############################
             # YOUR IMPLEMENTATION HERE #
+            logits_prob = self.logits_net(obs)
+            action = distributions.Categorical(logits=logits_prob)
             ############################
 
         else:
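distributions.Categorical(logits=...) treats the raw network outputs as unnormalized log-probabilities, so sampling and log_prob work directly on integer action indices, which is what the update step below relies on. A small sketch with dummy logits for a two-action space such as CartPole:

# Behavior of Categorical(logits=...) on a batch of dummy logits (made-up values).
import torch
from torch import distributions

logits = torch.tensor([[0.2, 1.5], [0.0, 0.0]])   # batch of 2 observations, 2 actions
dist = distributions.Categorical(logits=logits)

actions = dist.sample()            # shape (2,), integer action indices
log_p = dist.log_prob(actions)     # shape (2,), log pi(a|s) for the sampled actions
print(actions, log_p, dist.probs)  # probs are softmax(logits) per row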
@@ -116,7 +119,21 @@ class MLPPolicyPG(MLPPolicy):
         ############################
         # YOUR IMPLEMENTATION HERE #
+        dist = self.forward(obs)
+
+        if self.discrete:
+            log_p = dist.log_prob(actions.long())
+        else:
+            log_p = dist.log_prob(actions)
+
+        loss = -torch.mean(advantages * log_p)
+
+        # update gradients
+        self.optimizer.zero_grad()
+        loss.backward()
+        self.optimizer.step()
+
         ############################
 
         return {
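The loss above is the standard REINFORCE surrogate: the negative mean of advantage-weighted log-probabilities, so minimizing it performs gradient ascent on expected return. A dummy-tensor check of the same expression, using a throwaway linear policy rather than the homework's MLPPolicyPG:

# Dummy-tensor check of the policy-gradient surrogate loss (all values made up).
import torch
from torch import nn, distributions

obs = torch.randn(4, 3)                      # 4 observations, 3 features
actions = torch.tensor([0, 1, 1, 0])
advantages = torch.tensor([1.0, -0.5, 2.0, 0.3])

policy = nn.Linear(3, 2)                     # toy policy over 2 discrete actions
dist = distributions.Categorical(logits=policy(obs))
loss = -torch.mean(advantages * dist.log_prob(actions))

loss.backward()                              # gradients favor high-advantage actions
print(loss.item(), policy.weight.grad.shape)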
@@ -33,7 +33,10 @@ def sample_trajectory(
         ############################
         # YOUR IMPLEMENTATION HERE #
+        ac = policy.get_action(ob)
+
+        next_ob, rew, terminated, truncated, _ = env.step(ac)
+        rollout_done = terminated or truncated
         ############################
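The step call follows Gymnasium's five-tuple API, and the rollout ends on either termination or truncation. A compact sketch of a full rollout loop built around that logic, with a random action standing in for MLPPolicy.get_action; it assumes only gymnasium and CartPole-v1:

# Rollout-loop sketch around the step logic above (not the homework's sample_trajectory).
import gymnasium as gym

env = gym.make("CartPole-v1")
ob, _ = env.reset(seed=0)
rewards = []

while True:
    ac = env.action_space.sample()                      # stand-in for policy.get_action(ob)
    next_ob, rew, terminated, truncated, _ = env.step(ac)
    rewards.append(rew)
    if terminated or truncated:                         # rollout_done
        break
    ob = next_ob

print("episode length:", len(rewards), "return:", sum(rewards))
env.close()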