hw2/buffer.py (new file, 202 lines)
@@ -0,0 +1,202 @@
import torch
import numpy as np
from collections import deque


def get_buffer(cfg, **args):
    assert isinstance(cfg.nstep, int) and cfg.nstep > 0, 'nstep must be a positive integer'
    if not cfg.use_per:
        if cfg.nstep == 1:
            return ReplayBuffer(cfg.capacity, **args)
        else:
            return NStepReplayBuffer(cfg.capacity, cfg.nstep, cfg.gamma, **args)
    else:
        if cfg.nstep == 1:
            return PrioritizedReplayBuffer(cfg.capacity, cfg.per_eps, cfg.per_alpha, cfg.per_beta, **args)
        else:
            return PrioritizedNStepReplayBuffer(cfg.capacity, cfg.per_eps, cfg.per_alpha, cfg.per_beta, cfg.nstep, cfg.gamma, **args)

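# Illustrative usage (not part of the assignment code): get_buffer expects a config
# object exposing nstep, gamma, use_per, capacity, per_eps, per_alpha and per_beta,
# plus keyword arguments that are forwarded to the chosen buffer's constructor
# (state_size and device below). A hypothetical call might look like:
#
#   from types import SimpleNamespace
#   cfg = SimpleNamespace(capacity=10_000, nstep=3, gamma=0.99, use_per=True,
#                         per_eps=1e-2, per_alpha=0.6, per_beta=0.4)
#   buffer = get_buffer(cfg, state_size=4, device='cpu')
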

class ReplayBuffer:
    def __init__(self, capacity, state_size, device):
        self.device = device
        # transitions are stored in pre-allocated tensors, kept on CPU until sampling
        self.state = torch.empty(capacity, state_size, dtype=torch.float)
        self.action = torch.empty(capacity, 1, dtype=torch.float)
        self.reward = torch.empty(capacity, dtype=torch.float)
        self.next_state = torch.empty(capacity, state_size, dtype=torch.float)
        self.done = torch.empty(capacity, dtype=torch.int)

        self.idx = 0
        self.size = 0
        self.capacity = capacity

    def __repr__(self) -> str:
        return 'NormalReplayBuffer'

    def add(self, transition):
        state, action, reward, next_state, done = transition

        # store transition in the buffer and update the index and size of the buffer
        # you may need to convert the data type to torch.tensor

        ############################
        # YOUR IMPLEMENTATION HERE #
        # keep the stored tensors on CPU; the batch is moved to self.device in sample()
        self.state[self.idx] = torch.as_tensor(state)
        self.action[self.idx] = torch.as_tensor(action)
        self.reward[self.idx] = torch.as_tensor(reward)
        self.next_state[self.idx] = torch.as_tensor(next_state)
        self.done[self.idx] = torch.as_tensor(done)
        # advance the write index circularly and grow size up to capacity
        self.idx = (self.idx + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)
        ############################

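    # Worked example (illustrative): with capacity = 3, adding four transitions fills
    # slots 0, 1, 2 and then overwrites slot 0, leaving idx = 1 and size = 3, so the
    # oldest transition is the first to be discarded once the buffer is full.
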
    def sample(self, batch_size):
        # sample batch_size data from the buffer without replacement
        sample_idxs = np.random.choice(self.size, batch_size, replace=False)
        # get a batch of data from the buffer according to the sample_idxs
        # please transfer the data to the corresponding device before return
        ############################
        # YOUR IMPLEMENTATION HERE #
        idxs = torch.from_numpy(sample_idxs)
        # gather the sampled rows and move them to the target device only now,
        # since the buffer itself is kept on CPU
        batch = (
            torch.index_select(self.state, 0, idxs).to(self.device),
            torch.index_select(self.action, 0, idxs).to(self.device),
            torch.index_select(self.reward, 0, idxs).to(self.device),
            torch.index_select(self.next_state, 0, idxs).to(self.device),
            torch.index_select(self.done, 0, idxs).to(self.device),
        )
        ############################
        return batch

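    # Illustrative check (assumption about downstream use): with the layout above,
    # sample() returns tensors of shapes (batch_size, state_size), (batch_size, 1),
    # (batch_size,), (batch_size, state_size) and (batch_size,), e.g.
    #
    #   buf = ReplayBuffer(capacity=100, state_size=4, device='cpu')
    #   for _ in range(10):
    #       buf.add((np.zeros(4), 0, 0.0, np.zeros(4), False))
    #   states, actions, rewards, next_states, dones = buf.sample(8)
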

class NStepReplayBuffer(ReplayBuffer):
    def __init__(self, capacity, n_step, gamma, state_size, device):
        super().__init__(capacity, state_size, device=device)
        self.n_step = n_step
        self.n_step_buffer = deque([], maxlen=n_step)
        self.gamma = gamma

    def __repr__(self) -> str:
        return f'{self.n_step}StepReplayBuffer'

    def n_step_handler(self):
        """Get the n-step state, action, reward and done for the transition,
        discarding rewards that come after done=True."""
        ############################
        # YOUR IMPLEMENTATION HERE #
        state, action, reward, done = self.n_step_buffer[0]
        # accumulate the discounted n-step return, stopping at an episode boundary
        gamma = self.gamma
        for i in range(1, len(self.n_step_buffer)):
            if done:
                break
            reward += gamma * self.n_step_buffer[i][2]
            gamma *= self.gamma
            # propagate the done flag of the i-th transition so later rewards are discarded
            done = self.n_step_buffer[i][3]
        ############################
        return state, action, reward, done

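    # Worked example (illustrative): with n_step = 3, gamma = 0.9 and buffered rewards
    # r0 = 1, r1 = 2, r2 = 3 and no terminal flag, the handler returns
    # reward = 1 + 0.9 * 2 + 0.81 * 3 = 5.23; if the second transition were terminal,
    # the third reward would be discarded and reward = 1 + 0.9 * 2 = 2.8.
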
    def add(self, transition):
        state, action, reward, next_state, done = transition
        self.n_step_buffer.append((state, action, reward, done))
        # wait until n transitions have accumulated before writing an n-step transition
        if len(self.n_step_buffer) < self.n_step:
            return
        state, action, reward, done = self.n_step_handler()
        super().add((state, action, reward, next_state, done))


class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, capacity, eps, alpha, beta, state_size, device):
        self.weights = np.zeros(capacity, dtype=np.float32)  # per-slot priorities used for sampling
        self.eps = eps  # minimal priority, added for numerical stability
        self.alpha = alpha  # how much prioritization is used; alpha = 0 corresponds to the uniform case
        self.beta = beta  # amount of importance-sampling correction; beta = 1 fully compensates for the non-uniform sampling probabilities
        self.max_priority = eps  # priority assigned to new samples, initialized to eps
        super().__init__(capacity, state_size, device=device)

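    # For reference (standard prioritized replay, Schaul et al. 2016): with priorities
    # p_i, index i is sampled with probability P(i) = p_i^alpha / sum_k p_k^alpha, and
    # the induced bias is corrected with importance-sampling weights
    # w_i = (N * P(i))^(-beta), normalized by max_j w_j. Here self.weights already
    # stores p_i^alpha (see update_priorities), so sampling proportional to it gives P(i).
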
    def add(self, transition):
        """
        Add a new experience to memory, and set its priority to the current max_priority.
        """
        ############################
        # YOUR IMPLEMENTATION HERE #
        # record the priority for the slot that will be written, before super().add()
        # advances self.idx to the next slot
        self.weights[self.idx] = self.max_priority
        super().add(transition)
        ############################

    def sample(self, batch_size):
        """
        Sample a batch of experiences from the buffer with priority, and calculate the
        importance-sampling weights used to correct the bias in the Q-learning update.
        Returns:
            batch: a batch of experiences as in the normal replay buffer
            weights: torch.Tensor (batch_size, 1), importance-sampling weight for each sample
            sample_idxs: numpy.ndarray (batch_size, ), the indexes of the samples in the buffer
        """
        ############################
        # YOUR IMPLEMENTATION HERE #
        # sample with replacement, proportional to the stored priorities of the filled slots
        priorities = torch.from_numpy(self.weights[:self.size])
        sample_idxs_tensor = torch.multinomial(priorities, batch_size, replacement=True)
        sample_idxs = sample_idxs_tensor.numpy()
        # gather the sampled rows and move them to the target device only now,
        # since the buffer itself is kept on CPU
        batch = (
            torch.index_select(self.state, 0, sample_idxs_tensor).to(self.device),
            torch.index_select(self.action, 0, sample_idxs_tensor).to(self.device),
            torch.index_select(self.reward, 0, sample_idxs_tensor).to(self.device),
            torch.index_select(self.next_state, 0, sample_idxs_tensor).to(self.device),
            torch.index_select(self.done, 0, sample_idxs_tensor).to(self.device),
        )
        # importance-sampling correction: w_i = (N * P(i))^(-beta), normalized by its max
        probs = self.weights[sample_idxs] / self.weights[:self.size].sum()
        is_weights = (self.size * probs) ** (-self.beta)
        is_weights = is_weights / is_weights.max()
        weights = torch.tensor(is_weights, dtype=torch.float, device=self.device).unsqueeze(1)
        ############################
        return batch, weights, sample_idxs

    def update_priorities(self, data_idxs, priorities: np.ndarray):
        # raise priorities to the power alpha (with eps for stability) and track the running maximum
        priorities = (priorities + self.eps) ** self.alpha

        self.weights[data_idxs] = priorities
        self.max_priority = self.weights.max()

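    # Illustrative usage (assumed training-loop pattern, not dictated by this file):
    # after computing TD errors for a sampled batch, the agent would typically call
    #
    #   batch, weights, idxs = buffer.sample(batch_size)
    #   ...compute td_error of shape (batch_size,) from the batch...
    #   buffer.update_priorities(idxs, td_error.abs().detach().cpu().numpy())
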
    def __repr__(self) -> str:
        return 'PrioritizedReplayBuffer'


# Avoid diamond inheritance: inherit from PrioritizedReplayBuffer and re-implement the n-step logic
class PrioritizedNStepReplayBuffer(PrioritizedReplayBuffer):
    def __init__(self, capacity, eps, alpha, beta, n_step, gamma, state_size, device):
        ############################
        # YOUR IMPLEMENTATION HERE #
        super().__init__(capacity, eps, alpha, beta, state_size, device)
        self.n_step = n_step
        self.n_step_buffer = deque([], maxlen=n_step)
        self.gamma = gamma
        ############################

    def __repr__(self) -> str:
        return f'Prioritized{self.n_step}StepReplayBuffer'

    def add(self, transition):
        ############################
        # YOUR IMPLEMENTATION HERE #
        state, action, reward, next_state, done = transition
        self.n_step_buffer.append((state, action, reward, done))
        # wait until n transitions have accumulated before writing an n-step transition
        if len(self.n_step_buffer) < self.n_step:
            return
        state, action, reward, done = self.n_step_handler()
        super().add((state, action, reward, next_state, done))
        ############################

    # define any other class methods as you need

    def n_step_handler(self):
        """Get the n-step state, action, reward and done for the transition,
        discarding rewards that come after done=True."""
        ############################
        # YOUR IMPLEMENTATION HERE #
        state, action, reward, done = self.n_step_buffer[0]
        # accumulate the discounted n-step return, stopping at an episode boundary
        gamma = self.gamma
        for i in range(1, len(self.n_step_buffer)):
            if done:
                break
            reward += gamma * self.n_step_buffer[i][2]
            gamma *= self.gamma
            # propagate the done flag of the i-th transition so later rewards are discarded
            done = self.n_step_buffer[i][3]
        ############################
        return state, action, reward, done
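
# Illustrative smoke test (not part of the assignment), exercising the classes above:
#
#   buf = PrioritizedNStepReplayBuffer(capacity=50, eps=1e-2, alpha=0.6, beta=0.4,
#                                      n_step=3, gamma=0.99, state_size=4, device='cpu')
#   for t in range(20):
#       buf.add((np.ones(4) * t, 0, 1.0, np.ones(4) * (t + 1), t == 19))
#   batch, weights, idxs = buf.sample(8)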