done done done part A

Zheyuan Wu
2025-10-25 16:01:19 -05:00
parent 892131cfd4
commit 0f109ac389
18 changed files with 70 additions and 15 deletions

.gitignore vendored

@@ -1,7 +1,7 @@
# Python
# __pycache__/
# *.pyc
# *.pyo
# *.pyd
# *.pyw
# *.pyz
__pycache__/
*.pyc
*.pyo
*.pyd
*.pyw
*.pyz

environment.yml Normal file

@@ -0,0 +1,17 @@
name: DRL
channels:
  - pytorch
  - nvidia
  - defaults
dependencies:
  - python=3.10
  - pip=23.0.1
  - pip:
    - gymnasium[classic-control]==0.27.1
    - hydra-core==1.3.2
    - matplotlib==3.7.1
    - moviepy==1.0.3
    - --extra-index-url https://download.pytorch.org/whl/cu126  # change cu126 to match your CUDA version
    - torch
    - torchvision
    - opencv-python
    - tensorboardX==2.6.4
    - tensorboard==2.20.0


@@ -0,0 +1,8 @@
python run.py --env_name CartPole-v1 -n 200 -b 1000 --exp_name cartpole
python run.py --env_name CartPole-v1 -n 200 -b 1000 -rtg --exp_name cartpole_rtg
python run.py --env_name CartPole-v1 -n 200 -b 1000 -na --exp_name cartpole_na
python run.py --env_name CartPole-v1 -n 200 -b 1000 -rtg -na --exp_name cartpole_rtg_na
python run.py --env_name CartPole-v1 -n 200 -b 4000 --exp_name cartpole_lb
python run.py --env_name CartPole-v1 -n 200 -b 4000 -rtg --exp_name cartpole_lb_rtg
python run.py --env_name CartPole-v1 -n 200 -b 4000 -na --exp_name cartpole_lb_na
python run.py --env_name CartPole-v1 -n 200 -b 4000 -rtg -na --exp_name cartpole_lb_rtg_na

hw3/bash/read-results.sh Normal file

@@ -0,0 +1 @@
tensorboard --logdir data/pg_cartpole_CartPole-v1_25-10-2025_15-00-04


@@ -103,7 +103,7 @@ class PGAgent(nn.Module):
q_values = None
############################
# YOUR IMPLEMENTATION HERE #
q_values = [self._discounted_return(reward) for reward in rewards]
############################
else:
@@ -114,7 +114,7 @@ class PGAgent(nn.Module):
############################
# YOUR IMPLEMENTATION HERE #
q_values = [self._discounted_reward_to_go(reward) for reward in rewards]
############################
return q_values
@@ -148,7 +148,10 @@ class PGAgent(nn.Module):
advantages = None
############################
# YOUR IMPLEMENTATION HERE #
mean = np.mean(rewards)
std = np.std(rewards)
# normalize to zero mean and unit variance; epsilon guards against a zero std
advantages = (rewards - mean) / (std + 1e-8)
############################
return advantages
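Aside (not part of the commit): a minimal NumPy sketch, with made-up numbers, of what the normalization in the hunk above does: it rescales the batch of Q estimates to zero mean and unit variance, which lowers the variance of the gradient estimate without changing which actions score above average.

import numpy as np

q_values = np.array([10.0, 12.0, 30.0, 8.0])   # toy per-timestep Q estimates

# same normalization as above, with an epsilon in case the batch is degenerate
advantages = (q_values - q_values.mean()) / (q_values.std() + 1e-8)

print(advantages.mean())   # ~0.0
print(advantages.std())    # ~1.0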
@@ -166,9 +169,9 @@ class PGAgent(nn.Module):
############################
# YOUR IMPLEMENTATION HERE #
# total discounted return of the trajectory, used as the Q estimate at every timestep
q_value = sum(self.gamma ** i * reward for i, reward in enumerate(rewards))
return [q_value] * len(rewards)
############################
def _discounted_reward_to_go(self, rewards: Sequence[float]) -> Sequence[float]:
@@ -181,6 +184,12 @@ class PGAgent(nn.Module):
############################
# YOUR IMPLEMENTATION HERE #
# accumulate the discounted sum from the last timestep backwards
q_values = []
current_sum = 0.0
for t in range(len(rewards) - 1, -1, -1):
    current_sum = rewards[t] + self.gamma * current_sum
    q_values.append(current_sum)
q_values.reverse()
return q_values
############################

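Aside (not part of the commit): a standalone sanity check, with arbitrary gamma and rewards, that the backward accumulation in _discounted_reward_to_go matches the direct definition Q_t = sum over t' >= t of gamma^(t'-t) * r_t', and that the full-return variant assigns one total to every timestep.

gamma = 0.9
rewards = [1.0, 2.0, 3.0]

# direct definition of reward-to-go
direct = [sum(gamma ** (tp - t) * rewards[tp] for tp in range(t, len(rewards)))
          for t in range(len(rewards))]

# backward accumulation, as in the implementation above
rtg, running = [], 0.0
for r in reversed(rewards):
    running = r + gamma * running
    rtg.append(running)
rtg.reverse()

assert all(abs(a - b) < 1e-9 for a, b in zip(direct, rtg))   # both give [5.23, 4.7, 3.0]

# the full-trajectory estimator replicates one number for the whole trajectory
full = sum(gamma ** i * r for i, r in enumerate(rewards))    # 5.23 here
print(direct, full)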

@@ -63,7 +63,9 @@ class MLPPolicy(nn.Module):
############################
# YOUR IMPLEMENTATION HERE #
obs = ptu.from_numpy(obs)
dist = self.forward(obs)   # forward() returns an action distribution
action = ptu.to_numpy(dist.sample())
############################
return action
@@ -80,7 +82,8 @@ class MLPPolicy(nn.Module):
############################
# YOUR IMPLEMENTATION HERE #
logits_prob = self.logits_net(obs)
action = distributions.Categorical(logits=logits_prob)
############################
else:
@@ -116,7 +119,21 @@ class MLPPolicyPG(MLPPolicy):
############################
# YOUR IMPLEMENTATION HERE #
dist = self.forward(obs)
if self.discrete:
    log_p = dist.log_prob(actions.long())
else:
    log_p = dist.log_prob(actions)
# policy gradient surrogate loss: advantage-weighted negative log-likelihood
loss = -torch.mean(advantages * log_p)
# take one gradient step on the policy parameters
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
############################
return {

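Aside (not part of the commit): a minimal PyTorch sketch of the same surrogate loss on a toy discrete batch (made-up logits, actions, and advantages). A gradient-descent step on -mean(advantages * log_prob) raises the probability of actions with positive advantage and lowers it for negative ones.

import torch
from torch import distributions

logits = torch.zeros(3, 2, requires_grad=True)   # batch of 3 uniform policies over 2 actions
dist = distributions.Categorical(logits=logits)
actions = torch.tensor([0, 1, 0])
advantages = torch.tensor([1.0, -0.5, 2.0])

loss = -torch.mean(advantages * dist.log_prob(actions))
loss.backward()
print(loss.item())
print(logits.grad)   # descending this gradient favors the taken action where advantage > 0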

@@ -33,7 +33,10 @@ def sample_trajectory(
############################
# YOUR IMPLEMENTATION HERE #
ac = policy.get_action(ob)
next_ob, rew, terminated, truncated, _ = env.step(ac)
rollout_done = terminated or truncated
############################
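Aside (not part of the commit): a standalone illustration of the gymnasium API relied on above (gymnasium 0.27 style). env.step returns five values, and a rollout ends when either terminated (environment-defined end) or truncated (time limit) is set, which is what rollout_done captures.

import gymnasium as gym

env = gym.make("CartPole-v1")
ob, info = env.reset(seed=0)
ac = env.action_space.sample()   # stand-in for policy.get_action(ob)
next_ob, rew, terminated, truncated, info = env.step(ac)
rollout_done = terminated or truncated
env.close()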