v0.5 to main (#10)
* v0.5 (#9) * update idql configs * update awr configs * update dipo configs * update qsm configs * update dqm configs * update project version to 0.5.0
This commit is contained in:
parent
dd14c5887c
commit
e0842e71dc
1
.gitignore
vendored
1
.gitignore
vendored
@ -10,6 +10,7 @@ checkpoints/
|
||||
out/
|
||||
err/
|
||||
*.pkl
|
||||
*.sh
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
@ -11,10 +11,15 @@ import torch
|
||||
import logging
|
||||
import pickle
|
||||
import random
|
||||
from tqdm import tqdm
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
Batch = namedtuple("Batch", "actions conditions")
|
||||
Transition = namedtuple("Transition", "actions conditions rewards dones")
|
||||
TransitionWithReturn = namedtuple(
|
||||
"Transition", "actions conditions rewards dones reward_to_gos"
|
||||
)
|
||||
|
||||
|
||||
class StitchedSequenceDataset(torch.utils.data.Dataset):
|
||||
@ -49,6 +54,8 @@ class StitchedSequenceDataset(torch.utils.data.Dataset):
|
||||
self.img_cond_steps = img_cond_steps
|
||||
self.device = device
|
||||
self.use_img = use_img
|
||||
self.max_n_episodes = max_n_episodes
|
||||
self.dataset_path = dataset_path
|
||||
|
||||
# Load dataset to device specified
|
||||
if dataset_path.endswith(".npz"):
|
||||
@ -87,7 +94,7 @@ class StitchedSequenceDataset(torch.utils.data.Dataset):
|
||||
"""
|
||||
start, num_before_start = self.indices[idx]
|
||||
end = start + self.horizon_steps
|
||||
states = self.states[(start - num_before_start) : end]
|
||||
states = self.states[(start - num_before_start) : (start + 1)]
|
||||
actions = self.actions[start:end]
|
||||
states = torch.stack(
|
||||
[
|
||||
@ -116,9 +123,9 @@ class StitchedSequenceDataset(torch.utils.data.Dataset):
|
||||
indices = []
|
||||
cur_traj_index = 0
|
||||
for traj_length in traj_lengths:
|
||||
max_start = cur_traj_index + traj_length - horizon_steps + 1
|
||||
max_start = cur_traj_index + traj_length - horizon_steps
|
||||
indices += [
|
||||
(i, i - cur_traj_index) for i in range(cur_traj_index, max_start)
|
||||
(i, i - cur_traj_index) for i in range(cur_traj_index, max_start + 1)
|
||||
]
|
||||
cur_traj_index += traj_length
|
||||
return indices
|
||||
@ -135,3 +142,151 @@ class StitchedSequenceDataset(torch.utils.data.Dataset):
|
||||
|
||||
def __len__(self):
|
||||
return len(self.indices)
|
||||
|
||||
|
||||
class StitchedSequenceQLearningDataset(StitchedSequenceDataset):
    """
    Extends StitchedSequenceDataset to include rewards and dones for Q learning.

    Each item is a ``Transition`` (actions, conditions, rewards, dones), or a
    ``TransitionWithReturn`` with an extra discounted reward-to-go entry when
    ``get_mc_return`` is set. ``conditions`` holds the stacked "state" and
    "next_state" histories (plus "rgb" when ``use_img`` is set).

    The last step of **truncated** episodes is not indexed, since the correct
    next state for the final step of such an episode is not available.
    Truncation is detected as: last step of an episode with terminal=False.
    """

    def __init__(
        self,
        dataset_path,
        max_n_episodes=10000,
        discount_factor=1.0,
        device="cuda:0",
        get_mc_return=False,
        **kwargs,
    ):
        """
        Args:
            dataset_path: path to a ``.npz`` or ``.pkl`` dataset containing the
                keys "traj_lengths", "rewards" and "terminals".
            max_n_episodes: only the first ``max_n_episodes`` trajectories are used.
            discount_factor: discount applied when computing reward-to-go.
            device: torch device on which rewards/dones are stored.
            get_mc_return: if True, precompute the discounted reward-to-go of
                every step and return it with each item.
            **kwargs: forwarded unchanged to ``StitchedSequenceDataset.__init__``.

        Raises:
            ValueError: if ``dataset_path`` is neither ``.npz`` nor ``.pkl``.
        """
        traj_lengths, rewards, terminals = self._load_reward_arrays(dataset_path)
        traj_lengths = traj_lengths[:max_n_episodes]
        total_num_steps = np.sum(traj_lengths)

        # discount factor
        self.discount_factor = discount_factor

        # rewards and dones(terminals)
        # NOTE(review): these are assigned before super().__init__() because
        # make_indices() below reads self.dones --- presumably the base
        # constructor calls make_indices(); confirm against the base class.
        self.rewards = torch.from_numpy(rewards[:total_num_steps]).float().to(device)
        log.info(f"Rewards shape/type: {self.rewards.shape, self.rewards.dtype}")
        self.dones = torch.from_numpy(terminals[:total_num_steps]).to(device).float()
        log.info(f"Dones shape/type: {self.dones.shape, self.dones.dtype}")

        super().__init__(
            dataset_path=dataset_path,
            max_n_episodes=max_n_episodes,
            device=device,
            **kwargs,
        )
        log.info(f"Total number of transitions using: {len(self)}")

        # compute discounted reward-to-go for each trajectory
        self.get_mc_return = get_mc_return
        if get_mc_return:
            self.reward_to_go = torch.zeros_like(self.rewards)
            traj_start = 0
            # iterate over the exclusive end offset of every trajectory; passing
            # the array itself (not enumerate) lets tqdm display the total
            for traj_end in tqdm(
                np.cumsum(traj_lengths), desc="Computing reward-to-go"
            ):
                traj_rewards = self.rewards[traj_start:traj_end]
                returns = torch.zeros_like(traj_rewards)
                prev_return = 0
                # backward recursion: G_t = r_t + discount * G_{t+1}
                for t in reversed(range(len(traj_rewards))):
                    returns[t] = traj_rewards[t] + self.discount_factor * prev_return
                    prev_return = returns[t]
                self.reward_to_go[traj_start:traj_end] = returns
                traj_start = traj_end
            log.info("Computed reward-to-go for each trajectory.")

    @staticmethod
    def _load_reward_arrays(dataset_path):
        """Return (traj_lengths, rewards, terminals) read from ``dataset_path``."""
        if dataset_path.endswith(".npz"):
            # NpzFile keeps the archive open until closed; the context manager
            # releases the file handle once the arrays have been read.
            with np.load(dataset_path, allow_pickle=False) as dataset:
                return (
                    dataset["traj_lengths"],
                    dataset["rewards"],
                    dataset["terminals"],
                )
        if dataset_path.endswith(".pkl"):
            # NOTE: unpickling can execute arbitrary code --- only load dataset
            # files that come from a trusted source.
            with open(dataset_path, "rb") as f:
                dataset = pickle.load(f)
            return (
                dataset["traj_lengths"],
                dataset["rewards"],
                dataset["terminals"],
            )
        raise ValueError(f"Unsupported file format: {dataset_path}")

    def make_indices(self, traj_lengths, horizon_steps):
        """
        Build the list of (start, num_before_start) pairs, where ``start`` is a
        global step index and ``num_before_start`` is its offset within its
        trajectory. Skips the last step of truncated episodes.
        """
        num_skip = 0
        indices = []
        cur_traj_index = 0
        for traj_length in traj_lengths:
            max_start = cur_traj_index + traj_length - horizon_steps
            if not self.dones[cur_traj_index + traj_length - 1]:  # truncation
                max_start -= 1
                num_skip += 1
            indices += [
                (i, i - cur_traj_index) for i in range(cur_traj_index, max_start + 1)
            ]
            cur_traj_index += traj_length
        log.info(f"Number of transitions skipped due to truncation: {num_skip}")
        return indices

    def __getitem__(self, idx):
        """
        Return the transition at dataset index ``idx`` as a ``Transition``, or a
        ``TransitionWithReturn`` when ``get_mc_return`` is enabled.
        """
        start, num_before_start = self.indices[idx]
        end = start + self.horizon_steps
        states = self.states[(start - num_before_start) : (start + 1)]
        actions = self.actions[start:end]
        rewards = self.rewards[start : (start + 1)]
        dones = self.dones[start : (start + 1)]

        # Account for action horizon
        if idx < len(self.indices) - self.horizon_steps:
            next_states = self.states[
                (start - num_before_start + self.horizon_steps) : start
                + 1
                + self.horizon_steps
            ]  # even if this uses the first state(s) of the next episode, done=True will prevent bootstrapping. We have already filtered out cases where done=False but end of episode (truncation).
        else:
            # prevents indexing error, but ignored since done=True
            next_states = torch.zeros_like(states)

        # stack obs history
        states = torch.stack(
            [
                states[max(num_before_start - t, 0)]
                for t in reversed(range(self.cond_steps))
            ]
        )  # more recent is at the end
        next_states = torch.stack(
            [
                next_states[max(num_before_start - t, 0)]
                for t in reversed(range(self.cond_steps))
            ]
        )  # more recent is at the end
        conditions = {"state": states, "next_state": next_states}
        if self.use_img:
            images = self.images[(start - num_before_start) : end]
            images = torch.stack(
                [
                    images[max(num_before_start - t, 0)]
                    for t in reversed(range(self.img_cond_steps))
                ]
            )
            conditions["rgb"] = images
        if self.get_mc_return:
            reward_to_gos = self.reward_to_go[start : (start + 1)]
            batch = TransitionWithReturn(
                actions,
                conditions,
                rewards,
                dones,
                reward_to_gos,
            )
        else:
            batch = Transition(
                actions,
                conditions,
                rewards,
                dones,
            )
        return batch
|
||||
|
@ -36,7 +36,7 @@ class EvalDiffusionAgent(EvalAgent):
|
||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -57,9 +57,13 @@ class EvalDiffusionAgent(EvalAgent):
|
||||
action_venv = output_venv[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv)
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = terminated_venv | truncated_venv
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
|
@ -40,7 +40,7 @@ class EvalImgDiffusionAgent(EvalAgent):
|
||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -60,9 +60,13 @@ class EvalImgDiffusionAgent(EvalAgent):
|
||||
action_venv = output_venv[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv)
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = terminated_venv | truncated_venv
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
|
@ -36,7 +36,7 @@ class EvalGaussianAgent(EvalAgent):
|
||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -55,9 +55,13 @@ class EvalGaussianAgent(EvalAgent):
|
||||
action_venv = output_venv[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv)
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = terminated_venv | truncated_venv
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
|
@ -40,7 +40,7 @@ class EvalImgGaussianAgent(EvalAgent):
|
||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -58,9 +58,13 @@ class EvalImgGaussianAgent(EvalAgent):
|
||||
action_venv = output_venv[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(action_venv)
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = terminated_venv | truncated_venv
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
|
@ -26,7 +26,7 @@ from util.scheduler import CosineAnnealingWarmupRestarts
|
||||
def td_values(
|
||||
states,
|
||||
rewards,
|
||||
dones,
|
||||
terminateds,
|
||||
state_values,
|
||||
gamma=0.99,
|
||||
alpha=0.95,
|
||||
@ -43,21 +43,20 @@ def td_values(
|
||||
"""
|
||||
sample_count = len(states)
|
||||
tds = np.zeros_like(state_values, dtype=np.float32)
|
||||
dones[-1] = 1
|
||||
next_value = 1 - dones[-1]
|
||||
next_value = state_values[-1].copy()
|
||||
next_value[terminateds[-1]] = 0.0
|
||||
|
||||
val = 0.0
|
||||
for i in range(sample_count - 1, -1, -1):
|
||||
# next_value = 0.0 if dones[i] else state_values[i + 1]
|
||||
|
||||
# get next_value for vectorized
|
||||
if i < sample_count - 1:
|
||||
next_value = state_values[i + 1]
|
||||
next_value = next_value * (1 - dones[i])
|
||||
next_value = next_value * (1 - terminateds[i])
|
||||
|
||||
state_value = state_values[i]
|
||||
error = rewards[i] + gamma * next_value - state_value
|
||||
val = alpha * error + gamma * lam * (1 - dones[i]) * val
|
||||
val = alpha * error + gamma * lam * (1 - terminateds[i]) * val
|
||||
|
||||
tds[i] = val + state_value
|
||||
return tds
|
||||
@ -127,12 +126,12 @@ class TrainAWRDiffusionAgent(TrainAgent):
|
||||
obs_buffer = deque(maxlen=self.buffer_size)
|
||||
action_buffer = deque(maxlen=self.buffer_size)
|
||||
reward_buffer = deque(maxlen=self.buffer_size)
|
||||
done_buffer = deque(maxlen=self.buffer_size)
|
||||
first_buffer = deque(maxlen=self.buffer_size)
|
||||
terminated_buffer = deque(maxlen=self.buffer_size)
|
||||
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
@ -156,10 +155,9 @@ class TrainAWRDiffusionAgent(TrainAgent):
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
firsts_trajs[0] = (
|
||||
done_venv # if done at the end of last iteration, then the envs are just reset
|
||||
)
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -184,21 +182,26 @@ class TrainAWRDiffusionAgent(TrainAgent):
|
||||
action_venv = samples[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
|
||||
action_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# add to buffer
|
||||
obs_buffer.append(prev_obs_venv["state"])
|
||||
action_buffer.append(action_venv)
|
||||
reward_buffer.append(reward_venv * self.scale_reward_factor)
|
||||
done_buffer.append(done_venv)
|
||||
first_buffer.append(firsts_trajs[step])
|
||||
if not eval_mode:
|
||||
obs_buffer.append(prev_obs_venv["state"])
|
||||
action_buffer.append(action_venv)
|
||||
reward_buffer.append(reward_venv * self.scale_reward_factor)
|
||||
terminated_buffer.append(terminated_venv)
|
||||
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
@ -240,7 +243,7 @@ class TrainAWRDiffusionAgent(TrainAgent):
|
||||
if not eval_mode:
|
||||
obs_trajs = np.array(deepcopy(obs_buffer)) # assume only state
|
||||
reward_trajs = np.array(deepcopy(reward_buffer))
|
||||
dones_trajs = np.array(deepcopy(done_buffer))
|
||||
terminated_trajs = np.array(deepcopy(terminated_buffer))
|
||||
obs_t = einops.rearrange(
|
||||
torch.from_numpy(obs_trajs).float().to(self.device),
|
||||
"s e h d -> (s e) h d",
|
||||
@ -248,7 +251,9 @@ class TrainAWRDiffusionAgent(TrainAgent):
|
||||
values_trajs = np.array(
|
||||
self.model.critic({"state": obs_t}).detach().cpu().numpy()
|
||||
).reshape(-1, self.n_envs)
|
||||
td_trajs = td_values(obs_trajs, reward_trajs, dones_trajs, values_trajs)
|
||||
td_trajs = td_values(
|
||||
obs_trajs, reward_trajs, terminated_trajs, values_trajs
|
||||
)
|
||||
td_t = torch.from_numpy(td_trajs.flatten()).float().to(self.device)
|
||||
|
||||
# Update critic
|
||||
@ -268,7 +273,7 @@ class TrainAWRDiffusionAgent(TrainAgent):
|
||||
obs_trajs = np.array(deepcopy(obs_buffer))
|
||||
samples_trajs = np.array(deepcopy(action_buffer))
|
||||
reward_trajs = np.array(deepcopy(reward_buffer))
|
||||
dones_trajs = np.array(deepcopy(done_buffer))
|
||||
terminated_trajs = np.array(deepcopy(terminated_buffer))
|
||||
obs_t = einops.rearrange(
|
||||
torch.from_numpy(obs_trajs).float().to(self.device),
|
||||
"s e h d -> (s e) h d",
|
||||
@ -276,7 +281,9 @@ class TrainAWRDiffusionAgent(TrainAgent):
|
||||
values_trajs = np.array(
|
||||
self.model.critic({"state": obs_t}).detach().cpu().numpy()
|
||||
).reshape(-1, self.n_envs)
|
||||
td_trajs = td_values(obs_trajs, reward_trajs, dones_trajs, values_trajs)
|
||||
td_trajs = td_values(
|
||||
obs_trajs, reward_trajs, terminated_trajs, values_trajs
|
||||
)
|
||||
advantages_trajs = td_trajs - values_trajs
|
||||
|
||||
# flatten
|
||||
@ -315,13 +322,13 @@ class TrainAWRDiffusionAgent(TrainAgent):
|
||||
advantages_b_scaled.clamp_(max=self.max_adv_weight)
|
||||
|
||||
# Update policy with collected trajectories
|
||||
loss = self.model.loss(
|
||||
loss_actor = self.model.loss(
|
||||
actions_b,
|
||||
obs_b,
|
||||
advantages_b_scaled.detach(),
|
||||
)
|
||||
self.actor_optimizer.zero_grad()
|
||||
loss.backward()
|
||||
loss_actor.backward()
|
||||
if self.itr >= self.n_critic_warmup_itr:
|
||||
if self.max_grad_norm is not None:
|
||||
torch.nn.utils.clip_grad_norm_(
|
||||
@ -341,10 +348,12 @@ class TrainAWRDiffusionAgent(TrainAgent):
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.itr % self.log_freq == 0:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
@ -365,12 +374,13 @@ class TrainAWRDiffusionAgent(TrainAgent):
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"loss": loss,
|
||||
"total env step": cnt_train_step,
|
||||
"loss - actor": loss_actor,
|
||||
"loss - critic": loss_critic,
|
||||
"avg episode reward - train": avg_episode_reward,
|
||||
"num episode - train": num_episode_finished,
|
||||
@ -378,10 +388,7 @@ class TrainAWRDiffusionAgent(TrainAgent):
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["loss"] = loss
|
||||
run_results[-1]["loss_critic"] = loss_critic
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["time"] = time
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
||||
|
501
agent/finetune/train_calql_agent.py
Normal file
501
agent/finetune/train_calql_agent.py
Normal file
@ -0,0 +1,501 @@
|
||||
"""
|
||||
Calibrated Q-Learning (Cal-QL) agent training script.
|
||||
|
||||
Does not support image observations right now.
|
||||
"""
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import numpy as np
|
||||
import torch
|
||||
import logging
|
||||
import wandb
|
||||
import hydra
|
||||
from collections import deque
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
from util.timer import Timer
|
||||
from agent.finetune.train_agent import TrainAgent
|
||||
from util.scheduler import CosineAnnealingWarmupRestarts
|
||||
|
||||
|
||||
class TrainCalQLAgent(TrainAgent):
|
||||
def __init__(self, cfg):
|
||||
super().__init__(cfg)
|
||||
assert self.n_envs == 1, "Cal-QL only supports single env for now"
|
||||
|
||||
# Train mode (offline or online)
|
||||
self.train_online = cfg.train.train_online
|
||||
|
||||
# Build dataset
|
||||
self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset)
|
||||
|
||||
# note the discount factor gamma here is applied to reward every act_steps, instead of every env step
|
||||
self.gamma = cfg.train.gamma
|
||||
|
||||
# Optimizer
|
||||
self.actor_optimizer = torch.optim.AdamW(
|
||||
self.model.network.parameters(),
|
||||
lr=cfg.train.actor_lr,
|
||||
weight_decay=cfg.train.actor_weight_decay,
|
||||
)
|
||||
self.actor_lr_scheduler = CosineAnnealingWarmupRestarts(
|
||||
self.actor_optimizer,
|
||||
first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps,
|
||||
cycle_mult=1.0,
|
||||
max_lr=cfg.train.actor_lr,
|
||||
min_lr=cfg.train.actor_lr_scheduler.min_lr,
|
||||
warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps,
|
||||
gamma=1.0,
|
||||
)
|
||||
self.critic_optimizer = torch.optim.AdamW(
|
||||
self.model.critic.parameters(),
|
||||
lr=cfg.train.critic_lr,
|
||||
weight_decay=cfg.train.critic_weight_decay,
|
||||
)
|
||||
self.critic_lr_scheduler = CosineAnnealingWarmupRestarts(
|
||||
self.critic_optimizer,
|
||||
first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps,
|
||||
cycle_mult=1.0,
|
||||
max_lr=cfg.train.critic_lr,
|
||||
min_lr=cfg.train.critic_lr_scheduler.min_lr,
|
||||
warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps,
|
||||
gamma=1.0,
|
||||
)
|
||||
|
||||
# Target EMA rate (cfg.train.target_ema_rate)
|
||||
self.target_ema_rate = cfg.train.target_ema_rate
|
||||
|
||||
# Number of random actions to sample for Cal-QL
|
||||
self.n_random_actions = cfg.train.n_random_actions
|
||||
|
||||
# Reward scale
|
||||
self.scale_reward_factor = cfg.train.scale_reward_factor
|
||||
|
||||
# Number of critic updates
|
||||
self.num_update = cfg.train.num_update
|
||||
|
||||
# Buffer size
|
||||
self.buffer_size = cfg.train.buffer_size
|
||||
|
||||
# Online only configs
|
||||
if self.train_online:
|
||||
# number of episodes to collect per epoch for training
|
||||
self.n_episode_per_epoch = cfg.train.n_episode_per_epoch
|
||||
# UTD ratio
|
||||
self.online_utd_ratio = cfg.train.online_utd_ratio
|
||||
|
||||
# Eval episodes
|
||||
self.n_eval_episode = cfg.train.n_eval_episode
|
||||
|
||||
# Exploration steps at the beginning - using randomly sampled action
|
||||
self.n_explore_steps = cfg.train.n_explore_steps
|
||||
|
||||
# Initialize temperature parameter for entropy
|
||||
init_temperature = cfg.train.init_temperature
|
||||
self.log_alpha = torch.tensor(np.log(init_temperature)).to(self.device)
|
||||
self.log_alpha.requires_grad = True
|
||||
self.automatic_entropy_tuning = cfg.train.automatic_entropy_tuning
|
||||
self.target_entropy = cfg.train.target_entropy
|
||||
self.log_alpha_optimizer = torch.optim.Adam(
|
||||
[self.log_alpha],
|
||||
lr=cfg.train.critic_lr,
|
||||
)
|
||||
|
||||
def run(self):
|
||||
# make a FIFO replay buffer for obs, action, and reward
|
||||
obs_buffer = deque(maxlen=self.buffer_size)
|
||||
next_obs_buffer = deque(maxlen=self.buffer_size)
|
||||
action_buffer = deque(maxlen=self.buffer_size)
|
||||
reward_buffer = deque(maxlen=self.buffer_size)
|
||||
reward_to_go_buffer = deque(maxlen=self.buffer_size)
|
||||
terminated_buffer = deque(maxlen=self.buffer_size)
|
||||
if not self.train_online:
|
||||
obs_array = np.array(obs_buffer)
|
||||
next_obs_array = np.array(next_obs_buffer)
|
||||
actions_array = np.array(action_buffer)
|
||||
rewards_array = np.array(reward_buffer)
|
||||
reward_to_go_array = np.array(reward_to_go_buffer)
|
||||
terminated_array = np.array(terminated_buffer)
|
||||
|
||||
# load offline dataset into replay buffer
|
||||
dataloader_offline = torch.utils.data.DataLoader(
|
||||
self.dataset_offline,
|
||||
batch_size=len(self.dataset_offline),
|
||||
drop_last=False,
|
||||
)
|
||||
for batch in dataloader_offline:
|
||||
actions, states_and_next, rewards, terminated, reward_to_go = batch
|
||||
states = states_and_next["state"]
|
||||
next_states = states_and_next["next_state"]
|
||||
obs_buffer_off = states.cpu().numpy()
|
||||
next_obs_buffer_off = next_states.cpu().numpy()
|
||||
action_buffer_off = actions.cpu().numpy()
|
||||
reward_buffer_off = rewards.cpu().numpy().flatten()
|
||||
reward_to_go_buffer_off = reward_to_go.cpu().numpy().flatten()
|
||||
terminated_buffer_off = terminated.cpu().numpy().flatten()
|
||||
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
if self.itr % 1000 == 0:
|
||||
print(f"Finished training iteration {self.itr} of {self.n_train_itr}")
|
||||
|
||||
# Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
|
||||
options_venv = [{} for _ in range(self.n_envs)]
|
||||
if self.itr % self.render_freq == 0 and self.render_video:
|
||||
for env_ind in range(self.n_render):
|
||||
options_venv[env_ind]["video_path"] = os.path.join(
|
||||
self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
|
||||
)
|
||||
|
||||
# Define train or eval - all envs restart
|
||||
eval_mode = (
|
||||
self.itr % self.val_freq == 0
|
||||
and self.itr >= self.n_explore_steps
|
||||
and not self.force_train
|
||||
)
|
||||
# during eval, we collect a fixed number of episodes, so we set n_steps to a large value
|
||||
if eval_mode:
|
||||
n_steps = int(1e5)
|
||||
elif not self.train_online:
|
||||
n_steps = 0
|
||||
else:
|
||||
n_steps = int(1e5) # use episodes
|
||||
self.model.eval() if eval_mode else self.model.train()
|
||||
|
||||
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning
|
||||
firsts_trajs = np.zeros((n_steps + 1, self.n_envs))
|
||||
if self.reset_at_iteration or eval_mode or self.itr == 0:
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
reward_trajs = np.zeros((n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
cnt_episode = 0
|
||||
for step in range(n_steps):
|
||||
if step % 100 == 0:
|
||||
print(f"Completed environment step {step}")
|
||||
|
||||
# Select action
|
||||
if self.itr < self.n_explore_steps:
|
||||
action_venv = self.venv.action_space.sample()
|
||||
else:
|
||||
with torch.no_grad():
|
||||
cond = {
|
||||
"state": torch.from_numpy(prev_obs_venv["state"])
|
||||
.float()
|
||||
.to(self.device)
|
||||
}
|
||||
samples = (
|
||||
self.model(
|
||||
cond=cond,
|
||||
deterministic=eval_mode,
|
||||
)
|
||||
.cpu()
|
||||
.numpy()
|
||||
) # n_env x horizon x act
|
||||
action_venv = samples[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# add to buffer in train mode
|
||||
if not eval_mode:
|
||||
for i in range(self.n_envs):
|
||||
obs_buffer.append(prev_obs_venv["state"][i])
|
||||
if truncated_venv[i]:
|
||||
next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
|
||||
else: # first obs in new episode
|
||||
next_obs_buffer.append(obs_venv["state"][i])
|
||||
action_buffer.append(action_venv[i])
|
||||
reward_buffer.extend(
|
||||
(reward_venv * self.scale_reward_factor).tolist()
|
||||
)
|
||||
terminated_buffer.extend(terminated_venv.tolist())
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# check if enough eval episodes are done
|
||||
cnt_episode += np.sum(done_venv)
|
||||
if eval_mode and cnt_episode >= self.n_eval_episode:
|
||||
break
|
||||
if not eval_mode and cnt_episode >= self.n_episode_per_epoch:
|
||||
break
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
|
||||
for i in range(len(env_steps) - 1):
|
||||
start = env_steps[i]
|
||||
end = env_steps[i + 1]
|
||||
if end - start > 1:
|
||||
episodes_start_end.append((env_ind, start, end - 1))
|
||||
if len(episodes_start_end) > 0:
|
||||
reward_trajs_split = [
|
||||
reward_trajs[start : end + 1, env_ind]
|
||||
for env_ind, start, end in episodes_start_end
|
||||
]
|
||||
|
||||
# compute episode returns
|
||||
returns_trajs_split = [
|
||||
np.zeros_like(reward_trajs) for reward_trajs in reward_trajs_split
|
||||
]
|
||||
for traj_rewards, traj_returns in zip(
|
||||
reward_trajs_split, returns_trajs_split
|
||||
):
|
||||
prev_return = 0
|
||||
for t in range(len(traj_rewards)):
|
||||
traj_returns[-t - 1] = (
|
||||
traj_rewards[-t - 1] + self.gamma * prev_return
|
||||
)
|
||||
prev_return = traj_returns[-t - 1]
|
||||
|
||||
# flatten (note: only works for single env!)
|
||||
returns_trajs_split = np.concatenate(returns_trajs_split)
|
||||
|
||||
# extend buffer
|
||||
reward_to_go_buffer.extend(returns_trajs_split)
|
||||
|
||||
num_episode_finished = len(reward_trajs_split)
|
||||
episode_reward = np.array(
|
||||
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
|
||||
)
|
||||
episode_best_reward = np.array(
|
||||
[
|
||||
np.max(reward_traj) / self.act_steps
|
||||
for reward_traj in reward_trajs_split
|
||||
]
|
||||
)
|
||||
avg_episode_reward = np.mean(episode_reward)
|
||||
avg_best_reward = np.mean(episode_best_reward)
|
||||
success_rate = np.mean(
|
||||
episode_best_reward >= self.best_reward_threshold_for_success
|
||||
)
|
||||
else:
|
||||
episode_reward = np.array([])
|
||||
num_episode_finished = 0
|
||||
avg_episode_reward = 0
|
||||
avg_best_reward = 0
|
||||
success_rate = 0
|
||||
|
||||
# Update models
|
||||
if not eval_mode and self.itr >= self.n_explore_steps:
|
||||
# TODO: is this slow in online?
|
||||
if self.train_online:
|
||||
obs_array = np.array(obs_buffer)
|
||||
next_obs_array = np.array(next_obs_buffer)
|
||||
actions_array = np.array(action_buffer)
|
||||
rewards_array = np.array(reward_buffer)
|
||||
reward_to_go_array = np.array(reward_to_go_buffer)
|
||||
terminated_array = np.array(terminated_buffer)
|
||||
|
||||
# override num_update
|
||||
if self.train_online:
|
||||
num_update = len(reward_trajs) # assume one env!
|
||||
else:
|
||||
num_update = self.num_update
|
||||
for _ in range(num_update):
|
||||
# Sample from OFFLINE buffer
|
||||
inds = np.random.choice(
|
||||
len(obs_buffer_off),
|
||||
self.batch_size // 2 if self.train_online else self.batch_size,
|
||||
)
|
||||
obs_b = (
|
||||
torch.from_numpy(obs_buffer_off[inds]).float().to(self.device)
|
||||
)
|
||||
next_obs_b = (
|
||||
torch.from_numpy(next_obs_buffer_off[inds])
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
actions_b = (
|
||||
torch.from_numpy(action_buffer_off[inds])
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
rewards_b = (
|
||||
torch.from_numpy(reward_buffer_off[inds])
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
terminated_b = (
|
||||
torch.from_numpy(terminated_buffer_off[inds])
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
reward_to_go_b = (
|
||||
torch.from_numpy(reward_to_go_buffer_off[inds])
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
|
||||
# Sample from ONLINE buffer
|
||||
if self.train_online:
|
||||
inds = np.random.choice(len(obs_buffer), self.batch_size // 2)
|
||||
obs_b_on = (
|
||||
torch.from_numpy(obs_array[inds]).float().to(self.device)
|
||||
)
|
||||
next_obs_b_on = (
|
||||
torch.from_numpy(next_obs_array[inds])
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
actions_b_on = (
|
||||
torch.from_numpy(actions_array[inds])
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
rewards_b_on = (
|
||||
torch.from_numpy(rewards_array[inds])
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
terminated_b_on = (
|
||||
torch.from_numpy(terminated_array[inds])
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
reward_to_go_b_on = (
|
||||
torch.from_numpy(reward_to_go_array[inds])
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
|
||||
# merge offline and online data
|
||||
obs_b = torch.cat([obs_b, obs_b_on], dim=0)
|
||||
next_obs_b = torch.cat([next_obs_b, next_obs_b_on], dim=0)
|
||||
actions_b = torch.cat([actions_b, actions_b_on], dim=0)
|
||||
rewards_b = torch.cat([rewards_b, rewards_b_on], dim=0)
|
||||
terminated_b = torch.cat([terminated_b, terminated_b_on], dim=0)
|
||||
reward_to_go_b = torch.cat(
|
||||
[reward_to_go_b, reward_to_go_b_on], dim=0
|
||||
)
|
||||
|
||||
# Get a random action for Cal-QL
|
||||
random_actions = (
|
||||
torch.rand(
|
||||
(
|
||||
self.batch_size,
|
||||
self.n_random_actions,
|
||||
self.horizon_steps,
|
||||
self.action_dim,
|
||||
)
|
||||
).to(self.device)
|
||||
* 2
|
||||
- 1
|
||||
) # scale to [-1, 1]
|
||||
|
||||
# Update critic
|
||||
alpha = self.log_alpha.exp().item()
|
||||
loss_critic = self.model.loss_critic(
|
||||
{"state": obs_b},
|
||||
{"state": next_obs_b},
|
||||
actions_b,
|
||||
random_actions,
|
||||
rewards_b,
|
||||
reward_to_go_b,
|
||||
terminated_b,
|
||||
self.gamma,
|
||||
alpha,
|
||||
)
|
||||
self.critic_optimizer.zero_grad()
|
||||
loss_critic.backward()
|
||||
self.critic_optimizer.step()
|
||||
|
||||
# Update target critic
|
||||
self.model.update_target_critic(self.target_ema_rate)
|
||||
|
||||
# Update actor
|
||||
loss_actor = self.model.loss_actor(
|
||||
{"state": obs_b},
|
||||
alpha,
|
||||
)
|
||||
self.actor_optimizer.zero_grad()
|
||||
loss_actor.backward()
|
||||
self.actor_optimizer.step()
|
||||
|
||||
# Update temperature parameter
|
||||
if self.automatic_entropy_tuning:
|
||||
self.log_alpha_optimizer.zero_grad()
|
||||
loss_alpha = self.model.loss_temperature(
|
||||
{"state": obs_b},
|
||||
self.log_alpha.exp(), # with grad
|
||||
self.target_entropy,
|
||||
)
|
||||
loss_alpha.backward()
|
||||
self.log_alpha_optimizer.step()
|
||||
|
||||
# Update lr
|
||||
self.actor_lr_scheduler.step()
|
||||
self.critic_lr_scheduler.step()
|
||||
|
||||
# Save model
|
||||
if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
|
||||
self.save_model()
|
||||
|
||||
# Log loss and save metrics
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.itr % self.log_freq == 0 and self.itr >= self.n_explore_steps:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"success rate - eval": success_rate,
|
||||
"avg episode reward - eval": avg_episode_reward,
|
||||
"avg best reward - eval": avg_best_reward,
|
||||
"num episode - eval": num_episode_finished,
|
||||
},
|
||||
step=self.itr,
|
||||
commit=False,
|
||||
)
|
||||
run_results[-1]["eval_success_rate"] = success_rate
|
||||
run_results[-1]["eval_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | alpha {alpha:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"total env step": cnt_train_step,
|
||||
"loss - actor": loss_actor,
|
||||
"loss - critic": loss_critic,
|
||||
"entropy coeff": alpha,
|
||||
"avg episode reward - train": avg_episode_reward,
|
||||
"num episode - train": num_episode_finished,
|
||||
},
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
@ -65,11 +65,14 @@ class TrainDIPODiffusionAgent(TrainAgent):
|
||||
gamma=1.0,
|
||||
)
|
||||
|
||||
# target update rate
|
||||
self.target_ema_rate = cfg.train.target_ema_rate
|
||||
|
||||
# Buffer size
|
||||
self.buffer_size = cfg.train.buffer_size
|
||||
|
||||
# Perturbation scale
|
||||
self.eta = cfg.train.eta
|
||||
# Action gradient scaling
|
||||
self.action_lr = cfg.train.action_lr
|
||||
|
||||
# Updates
|
||||
self.replay_ratio = cfg.train.replay_ratio
|
||||
@ -80,6 +83,9 @@ class TrainDIPODiffusionAgent(TrainAgent):
|
||||
# Apply action gradient many steps
|
||||
self.action_gradient_steps = cfg.train.action_gradient_steps
|
||||
|
||||
# Max grad norm for action
|
||||
self.action_grad_norm = self.action_dim * self.act_steps * 0.1
|
||||
|
||||
def run(self):
|
||||
|
||||
# make a FIFO replay buffer for obs, action, and reward
|
||||
@ -87,12 +93,12 @@ class TrainDIPODiffusionAgent(TrainAgent):
|
||||
next_obs_buffer = deque(maxlen=self.buffer_size)
|
||||
action_buffer = deque(maxlen=self.buffer_size)
|
||||
reward_buffer = deque(maxlen=self.buffer_size)
|
||||
done_buffer = deque(maxlen=self.buffer_size)
|
||||
first_buffer = deque(maxlen=self.buffer_size)
|
||||
terminated_buffer = deque(maxlen=self.buffer_size)
|
||||
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
@ -116,10 +122,9 @@ class TrainDIPODiffusionAgent(TrainAgent):
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
firsts_trajs[0] = (
|
||||
done_venv # if done at the end of last iteration, then the envs are just reset
|
||||
)
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -144,23 +149,33 @@ class TrainDIPODiffusionAgent(TrainAgent):
|
||||
action_venv = samples[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
|
||||
action_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# add to buffer
|
||||
for i in range(self.n_envs):
|
||||
obs_buffer.append(prev_obs_venv["state"][i])
|
||||
next_obs_buffer.append(obs_venv["state"][i])
|
||||
action_buffer.append(action_venv[i])
|
||||
reward_buffer.append(reward_venv[i] * self.scale_reward_factor)
|
||||
done_buffer.append(done_venv[i])
|
||||
first_buffer.append(firsts_trajs[step])
|
||||
if not eval_mode:
|
||||
for i in range(self.n_envs):
|
||||
obs_buffer.append(prev_obs_venv["state"][i])
|
||||
if truncated_venv[i]: # truncated
|
||||
next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
|
||||
else:
|
||||
next_obs_buffer.append(obs_venv["state"][i])
|
||||
action_buffer.append(action_venv[i])
|
||||
reward_buffer.extend(
|
||||
(reward_venv * self.scale_reward_factor).tolist()
|
||||
)
|
||||
terminated_buffer.extend(terminated_venv.tolist())
|
||||
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
@ -200,40 +215,31 @@ class TrainDIPODiffusionAgent(TrainAgent):
|
||||
|
||||
# Update models
|
||||
if not eval_mode:
|
||||
num_batch = self.replay_ratio
|
||||
num_batch = int(
|
||||
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
|
||||
)
|
||||
# only worth converting first with parallel envs - large number of updates below
|
||||
obs_array = np.array(obs_buffer)
|
||||
next_obs_array = np.array(next_obs_buffer)
|
||||
action_array = np.array(action_buffer)
|
||||
reward_array = np.array(reward_buffer)
|
||||
terminated_array = np.array(terminated_buffer)
|
||||
|
||||
# Critic learning
|
||||
for _ in range(num_batch):
|
||||
# Sample batch
|
||||
inds = np.random.choice(len(obs_buffer), self.batch_size)
|
||||
obs_b = (
|
||||
torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
obs_b = torch.from_numpy(obs_array[inds]).float().to(self.device)
|
||||
next_obs_b = (
|
||||
torch.from_numpy(
|
||||
np.vstack([next_obs_buffer[i][None] for i in inds])
|
||||
)
|
||||
.float()
|
||||
.to(self.device)
|
||||
torch.from_numpy(next_obs_array[inds]).float().to(self.device)
|
||||
)
|
||||
actions_b = (
|
||||
torch.from_numpy(
|
||||
np.vstack([action_buffer[i][None] for i in inds])
|
||||
)
|
||||
.float()
|
||||
.to(self.device)
|
||||
torch.from_numpy(action_array[inds]).float().to(self.device)
|
||||
)
|
||||
rewards_b = (
|
||||
torch.from_numpy(np.vstack([reward_buffer[i] for i in inds]))
|
||||
.float()
|
||||
.to(self.device)
|
||||
torch.from_numpy(reward_array[inds]).float().to(self.device)
|
||||
)
|
||||
dones_b = (
|
||||
torch.from_numpy(np.vstack([done_buffer[i] for i in inds]))
|
||||
.float()
|
||||
.to(self.device)
|
||||
terminated_b = (
|
||||
torch.from_numpy(terminated_array[inds]).float().to(self.device)
|
||||
)
|
||||
|
||||
# Update critic
|
||||
@ -242,78 +248,77 @@ class TrainDIPODiffusionAgent(TrainAgent):
|
||||
{"state": next_obs_b},
|
||||
actions_b,
|
||||
rewards_b,
|
||||
dones_b,
|
||||
terminated_b,
|
||||
self.gamma,
|
||||
)
|
||||
self.critic_optimizer.zero_grad()
|
||||
loss_critic.backward()
|
||||
self.critic_optimizer.step()
|
||||
|
||||
# Actor learning
|
||||
for _ in range(num_batch):
|
||||
# Sample batch
|
||||
inds = np.random.choice(len(obs_buffer), self.batch_size)
|
||||
obs_b = (
|
||||
torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
actions_b = (
|
||||
torch.from_numpy(
|
||||
np.vstack([action_buffer[i][None] for i in inds])
|
||||
)
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
|
||||
# Replace actions in buffer with guided actions
|
||||
guided_action_list = []
|
||||
|
||||
# get Q-perturbed actions by optimizing
|
||||
actions_flat = actions_b.reshape(actions_b.shape[0], -1)
|
||||
actions_optim = torch.optim.Adam(
|
||||
[actions_flat], lr=self.eta, eps=1e-5
|
||||
)
|
||||
for _ in range(self.action_gradient_steps):
|
||||
actions_flat.requires_grad_(True)
|
||||
q_values_1, q_values_2 = self.model.critic(
|
||||
{"state": obs_b}, actions_flat
|
||||
)
|
||||
q_values = torch.min(q_values_1, q_values_2)
|
||||
action_opt_loss = -q_values.sum()
|
||||
|
||||
actions_optim.zero_grad()
|
||||
action_opt_loss.backward(torch.ones_like(action_opt_loss))
|
||||
|
||||
# get the perturbed action
|
||||
actions_optim.step()
|
||||
|
||||
actions_flat.requires_grad_(False)
|
||||
actions_flat.clamp_(-1.0, 1.0)
|
||||
guided_action = actions_flat.detach()
|
||||
guided_action = guided_action.reshape(
|
||||
guided_action.shape[0], -1, self.action_dim
|
||||
)
|
||||
guided_action_list.append(guided_action)
|
||||
guided_action_stacked = torch.cat(guided_action_list, 0)
|
||||
|
||||
# Add to buffer (need separate indices since we're working with a limited subset)
|
||||
for i, i_buf in enumerate(inds):
|
||||
action_buffer[i_buf] = (
|
||||
guided_action_stacked[i].detach().cpu().numpy()
|
||||
)
|
||||
|
||||
# Update policy with collected trajectories
|
||||
loss = self.model.loss(guided_action.detach(), {"state": obs_b})
|
||||
self.actor_optimizer.zero_grad()
|
||||
loss.backward()
|
||||
# Actor learning
|
||||
loss_actor = 0.0
|
||||
if self.itr >= self.n_critic_warmup_itr:
|
||||
inds = np.random.choice(len(obs_buffer), self.batch_size)
|
||||
obs_b = (
|
||||
torch.from_numpy(obs_array[inds]).float().to(self.device)
|
||||
)
|
||||
actions_b = (
|
||||
torch.from_numpy(action_array[inds]).float().to(self.device)
|
||||
)
|
||||
|
||||
# get Q-perturbed actions by optimizing
|
||||
actions_flat = actions_b.reshape(len(actions_b), -1)
|
||||
actions_optim = torch.optim.Adam(
|
||||
[actions_flat], lr=self.action_lr, eps=1e-5
|
||||
)
|
||||
for _ in range(self.action_gradient_steps):
|
||||
actions_flat.requires_grad_(True)
|
||||
q_values_1, q_values_2 = self.model.critic(
|
||||
{"state": obs_b}, actions_flat
|
||||
)
|
||||
q_values = torch.min(q_values_1, q_values_2)
|
||||
action_opt_loss = -q_values.sum()
|
||||
|
||||
actions_optim.zero_grad()
|
||||
action_opt_loss.backward(torch.ones_like(action_opt_loss))
|
||||
torch.nn.utils.clip_grad_norm_(
|
||||
[actions_flat],
|
||||
max_norm=self.action_grad_norm,
|
||||
norm_type=2,
|
||||
)
|
||||
actions_optim.step()
|
||||
|
||||
actions_flat.requires_grad_(False)
|
||||
actions_flat.clamp_(-1.0, 1.0)
|
||||
guided_action = actions_flat.reshape(
|
||||
len(actions_flat), self.horizon_steps, self.action_dim
|
||||
)
|
||||
guided_action_np = guided_action.detach().cpu().numpy()
|
||||
|
||||
# Add back to buffer
|
||||
action_array[inds] = guided_action_np
|
||||
|
||||
# Update policy with collected trajectories
|
||||
loss_actor = self.model.loss(
|
||||
guided_action.detach(), {"state": obs_b}
|
||||
)
|
||||
self.actor_optimizer.zero_grad()
|
||||
loss_actor.backward()
|
||||
if self.max_grad_norm is not None:
|
||||
torch.nn.utils.clip_grad_norm_(
|
||||
self.model.actor.parameters(), self.max_grad_norm
|
||||
)
|
||||
self.actor_optimizer.step()
|
||||
|
||||
# Update target critic and actor
|
||||
self.model.update_target_critic(self.target_ema_rate)
|
||||
self.model.update_target_actor(self.target_ema_rate)
|
||||
|
||||
# convert back to buffer
|
||||
action_buffer = deque(
|
||||
[action for action in action_array], maxlen=self.buffer_size
|
||||
)
|
||||
|
||||
# Update lr
|
||||
self.actor_lr_scheduler.step()
|
||||
self.critic_lr_scheduler.step()
|
||||
@ -326,10 +331,12 @@ class TrainDIPODiffusionAgent(TrainAgent):
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.itr % self.log_freq == 0:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
@ -350,23 +357,19 @@ class TrainDIPODiffusionAgent(TrainAgent):
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss - critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"loss": loss,
|
||||
"loss - critic": loss_critic,
|
||||
"avg episode reward - train": avg_episode_reward,
|
||||
"num episode - train": num_episode_finished,
|
||||
},
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["loss"] = loss
|
||||
run_results[-1]["loss_critic"] = loss_critic
|
||||
wandb_log = {
|
||||
"total env step": cnt_train_step,
|
||||
"loss - critic": loss_critic,
|
||||
"avg episode reward - train": avg_episode_reward,
|
||||
"num episode - train": num_episode_finished,
|
||||
}
|
||||
if type(loss_actor) == torch.Tensor:
|
||||
wandb_log["loss - actor"] = loss_actor
|
||||
wandb.log(wandb_log, step=self.itr, commit=True)
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["time"] = time
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
||||
|
@ -77,6 +77,9 @@ class TrainDQLDiffusionAgent(TrainAgent):
|
||||
# Updates
|
||||
self.replay_ratio = cfg.train.replay_ratio
|
||||
|
||||
# critic target update rate
|
||||
self.target_ema_rate = cfg.train.target_ema_rate
|
||||
|
||||
def run(self):
|
||||
|
||||
# make a FIFO replay buffer for obs, action, and reward
|
||||
@ -84,12 +87,12 @@ class TrainDQLDiffusionAgent(TrainAgent):
|
||||
next_obs_buffer = deque(maxlen=self.buffer_size)
|
||||
action_buffer = deque(maxlen=self.buffer_size)
|
||||
reward_buffer = deque(maxlen=self.buffer_size)
|
||||
done_buffer = deque(maxlen=self.buffer_size)
|
||||
first_buffer = deque(maxlen=self.buffer_size)
|
||||
terminated_buffer = deque(maxlen=self.buffer_size)
|
||||
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
@ -113,10 +116,9 @@ class TrainDQLDiffusionAgent(TrainAgent):
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
firsts_trajs[0] = (
|
||||
done_venv # if done at the end of last iteration, then the envs are just reset
|
||||
)
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -141,23 +143,33 @@ class TrainDQLDiffusionAgent(TrainAgent):
|
||||
action_venv = samples[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
|
||||
action_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# add to buffer
|
||||
for i in range(self.n_envs):
|
||||
obs_buffer.append(prev_obs_venv["state"][i])
|
||||
next_obs_buffer.append(obs_venv["state"][i])
|
||||
action_buffer.append(action_venv[i])
|
||||
reward_buffer.append(reward_venv[i] * self.scale_reward_factor)
|
||||
done_buffer.append(done_venv[i])
|
||||
first_buffer.append(firsts_trajs[step])
|
||||
if not eval_mode:
|
||||
for i in range(self.n_envs):
|
||||
obs_buffer.append(prev_obs_venv["state"][i])
|
||||
if truncated_venv[i]: # truncated
|
||||
next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
|
||||
else:
|
||||
next_obs_buffer.append(obs_venv["state"][i])
|
||||
action_buffer.append(action_venv[i])
|
||||
reward_buffer.extend(
|
||||
(reward_venv * self.scale_reward_factor).tolist()
|
||||
)
|
||||
terminated_buffer.extend(terminated_venv.tolist())
|
||||
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
@ -197,41 +209,24 @@ class TrainDQLDiffusionAgent(TrainAgent):
|
||||
|
||||
# Update models
|
||||
if not eval_mode:
|
||||
num_batch = self.replay_ratio
|
||||
num_batch = int(
|
||||
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
|
||||
)
|
||||
# only worth converting first with parallel envs - large number of updates below
|
||||
obs_array = np.array(obs_buffer)
|
||||
next_obs_array = np.array(next_obs_buffer)
|
||||
action_array = np.array(action_buffer)
|
||||
reward_array = np.array(reward_buffer)
|
||||
terminated_array = np.array(terminated_buffer)
|
||||
|
||||
# Critic learning
|
||||
for _ in range(num_batch):
|
||||
# Sample batch
|
||||
inds = np.random.choice(len(obs_buffer), self.batch_size)
|
||||
obs_b = (
|
||||
torch.from_numpy(np.vstack([obs_buffer[i][None] for i in inds]))
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
next_obs_b = (
|
||||
torch.from_numpy(
|
||||
np.vstack([next_obs_buffer[i][None] for i in inds])
|
||||
)
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
actions_b = (
|
||||
torch.from_numpy(
|
||||
np.vstack([action_buffer[i][None] for i in inds])
|
||||
)
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
rewards_b = (
|
||||
torch.from_numpy(np.vstack([reward_buffer[i] for i in inds]))
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
dones_b = (
|
||||
torch.from_numpy(np.vstack([done_buffer[i] for i in inds]))
|
||||
.float()
|
||||
.to(self.device)
|
||||
)
|
||||
obs_b = torch.from_numpy(obs_array[inds]).float().to(self.device)
|
||||
next_obs_b = torch.from_numpy(next_obs_array[inds]).float().to(self.device)
|
||||
actions_b = torch.from_numpy(action_array[inds]).float().to(self.device)
|
||||
rewards_b = torch.from_numpy(reward_array[inds]).float().to(self.device)
|
||||
terminated_b = torch.from_numpy(terminated_array[inds]).float().to(self.device)
|
||||
|
||||
# Update critic
|
||||
loss_critic = self.model.loss_critic(
|
||||
@ -239,39 +234,30 @@ class TrainDQLDiffusionAgent(TrainAgent):
|
||||
{"state": next_obs_b},
|
||||
actions_b,
|
||||
rewards_b,
|
||||
dones_b,
|
||||
terminated_b,
|
||||
self.gamma,
|
||||
)
|
||||
self.critic_optimizer.zero_grad()
|
||||
loss_critic.backward()
|
||||
self.critic_optimizer.step()
|
||||
|
||||
# get the new action and q values
|
||||
samples = self.model.forward_train(
|
||||
cond={"state": obs_b},
|
||||
deterministic=eval_mode,
|
||||
)
|
||||
action_venv = samples[:, : self.act_steps] # n_env x horizon x act
|
||||
q_values_b = self.model.critic({"state": obs_b}, action_venv)
|
||||
q1_new_action, q2_new_action = q_values_b
|
||||
|
||||
# Update policy with collected trajectories
|
||||
self.actor_optimizer.zero_grad()
|
||||
actor_loss = self.model.loss_actor(
|
||||
loss_actor = self.model.loss_actor(
|
||||
{"state": obs_b},
|
||||
actions_b,
|
||||
q1_new_action,
|
||||
q2_new_action,
|
||||
self.eta,
|
||||
self.act_steps,
|
||||
)
|
||||
actor_loss.backward()
|
||||
loss_actor.backward()
|
||||
if self.itr >= self.n_critic_warmup_itr:
|
||||
if self.max_grad_norm is not None:
|
||||
torch.nn.utils.clip_grad_norm_(
|
||||
self.model.actor.parameters(), self.max_grad_norm
|
||||
)
|
||||
self.actor_optimizer.step()
|
||||
loss = actor_loss
|
||||
|
||||
# update target
|
||||
self.model.update_target_critic(self.target_ema_rate)
|
||||
|
||||
# Update lr
|
||||
self.actor_lr_scheduler.step()
|
||||
@ -285,10 +271,12 @@ class TrainDQLDiffusionAgent(TrainAgent):
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.itr % self.log_freq == 0:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
@ -309,12 +297,13 @@ class TrainDQLDiffusionAgent(TrainAgent):
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"loss": loss,
|
||||
"total env step": cnt_train_step,
|
||||
"loss - actor": loss_actor,
|
||||
"loss - critic": loss_critic,
|
||||
"avg episode reward - train": avg_episode_reward,
|
||||
"num episode - train": num_episode_finished,
|
||||
@ -322,10 +311,7 @@ class TrainDQLDiffusionAgent(TrainAgent):
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["loss"] = loss
|
||||
run_results[-1]["loss_critic"] = loss_critic
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["time"] = time
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
||||
|
352
agent/finetune/train_ibrl_agent.py
Normal file
352
agent/finetune/train_ibrl_agent.py
Normal file
@ -0,0 +1,352 @@
|
||||
"""
|
||||
Imitation Bootstrapped Reinforcement Learning (IBRL) agent training script.
|
||||
|
||||
Does not support image observations right now.
|
||||
"""
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import numpy as np
|
||||
import torch
|
||||
import logging
|
||||
import wandb
|
||||
import hydra
|
||||
from collections import deque
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
from util.timer import Timer
|
||||
from agent.finetune.train_agent import TrainAgent
|
||||
from util.scheduler import CosineAnnealingWarmupRestarts
|
||||
|
||||
|
||||
class TrainIBRLAgent(TrainAgent):
|
||||
def __init__(self, cfg):
    """
    Build the offline dataset, the actor/critic optimizers with their
    learning-rate schedulers, and read the training hyperparameters.

    Args:
        cfg: config object; this method reads `cfg.offline_dataset` and the
            entries under `cfg.train` listed below.
    """
    super().__init__(cfg)
    train_cfg = cfg.train

    def _cosine_schedule(optimizer, peak_lr, sched_cfg):
        # Both lr schedulers are built the same way; cycle_mult and gamma
        # are fixed at 1.0 for each of them.
        return CosineAnnealingWarmupRestarts(
            optimizer,
            first_cycle_steps=sched_cfg.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=peak_lr,
            min_lr=sched_cfg.min_lr,
            warmup_steps=sched_cfg.warmup_steps,
            gamma=1.0,
        )

    # Offline dataset, instantiated from its config node
    self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset)

    # Discount factor --- applied to the reward once per act_steps, not once per env step
    self.gamma = train_cfg.gamma

    # Actor optimizer and lr schedule
    self.actor_optimizer = torch.optim.AdamW(
        self.model.network.parameters(),
        lr=train_cfg.actor_lr,
        weight_decay=train_cfg.actor_weight_decay,
    )
    self.actor_lr_scheduler = _cosine_schedule(
        self.actor_optimizer,
        train_cfg.actor_lr,
        train_cfg.actor_lr_scheduler,
    )

    # Critic optimizer and lr schedule; the ensemble parameters are passed
    # as dict values, see https://github.com/pytorch/pytorch/issues/120581
    self.critic_optimizer = torch.optim.AdamW(
        self.model.ensemble_params.values(),
        lr=train_cfg.critic_lr,
        weight_decay=train_cfg.critic_weight_decay,
    )
    self.critic_lr_scheduler = _cosine_schedule(
        self.critic_optimizer,
        train_cfg.critic_lr,
        train_cfg.critic_lr_scheduler,
    )

    # Target update (EMA) rate
    self.target_ema_rate = train_cfg.target_ema_rate

    # Multiplier applied to rewards
    self.scale_reward_factor = train_cfg.scale_reward_factor

    # How many critic updates to run
    self.critic_num_update = train_cfg.critic_num_update

    # How often updates happen
    self.update_freq = train_cfg.update_freq

    # Capacity of the replay buffers
    self.buffer_size = train_cfg.buffer_size

    # Number of evaluation episodes
    self.n_eval_episode = train_cfg.n_eval_episode

    # Initial exploration steps, taken with randomly sampled actions
    self.n_explore_steps = train_cfg.n_explore_steps
|
||||
|
||||
def run(self):
    """Run the training loop with a replay buffer seeded from offline data.

    The replay buffer is first filled from ``self.dataset_offline``. Then,
    until ``self.itr`` reaches ``self.n_train_itr``, each iteration:

    1. rolls out the current policy in ``self.venv`` (train or eval mode),
    2. stores the resulting transitions in the replay buffer (train mode only),
    3. summarizes the rewards of episodes that finished within the iteration,
    4. updates the critic ``self.critic_num_update`` times and the actor once
       (only after ``self.n_explore_steps`` iterations and every
       ``self.update_freq`` iterations),
    5. steps the learning-rate schedulers, saves checkpoints, and logs metrics
       to the logger, optionally to wandb, and to ``self.result_path``.

    All replay-buffer deques hold exactly one entry per (step, env) transition
    so that a single sampled index addresses the same transition in each.
    """
    # make a FIFO replay buffer for obs, action, and reward
    obs_buffer = deque(maxlen=self.buffer_size)
    next_obs_buffer = deque(maxlen=self.buffer_size)
    action_buffer = deque(maxlen=self.buffer_size)
    reward_buffer = deque(maxlen=self.buffer_size)
    terminated_buffer = deque(maxlen=self.buffer_size)

    # load offline dataset into replay buffer (a single batch covering the
    # whole dataset)
    dataloader_offline = torch.utils.data.DataLoader(
        self.dataset_offline,
        batch_size=len(self.dataset_offline),
        drop_last=False,
    )
    for batch in dataloader_offline:
        actions, states_and_next, rewards, terminated = batch
        states = states_and_next["state"]
        next_states = states_and_next["next_state"]
        obs_buffer.extend(states.cpu().numpy())
        next_obs_buffer.extend(next_states.cpu().numpy())
        action_buffer.extend(actions.cpu().numpy())
        reward_buffer.extend(rewards.cpu().numpy().flatten())
        terminated_buffer.extend(terminated.cpu().numpy().flatten())

    # Start training loop
    timer = Timer()
    run_results = []
    cnt_train_step = 0
    done_venv = np.zeros((1, self.n_envs))
    while self.itr < self.n_train_itr:
        if self.itr % 1000 == 0:
            print(f"Finished training iteration {self.itr} of {self.n_train_itr}")

        # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
        options_venv = [{} for _ in range(self.n_envs)]
        if self.itr % self.render_freq == 0 and self.render_video:
            for env_ind in range(self.n_render):
                options_venv[env_ind]["video_path"] = os.path.join(
                    self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
                )

        # Define train or eval - all envs restart
        eval_mode = (
            self.itr % self.val_freq == 0
            and self.itr > self.n_explore_steps
            and not self.force_train
        )
        n_steps = (
            self.n_steps if not eval_mode else int(1e5)
        )  # large number for eval mode; the rollout breaks out early below
        if eval_mode:
            self.model.eval()
        else:
            self.model.train()

        # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning
        firsts_trajs = np.zeros((n_steps + 1, self.n_envs))
        if self.reset_at_iteration or eval_mode or self.itr == 0:
            prev_obs_venv = self.reset_env_all(options_venv=options_venv)
            firsts_trajs[0] = 1
        else:
            # if done at the end of last iteration, the envs are just reset
            firsts_trajs[0] = done_venv
        reward_trajs = np.zeros((n_steps, self.n_envs))

        # Collect a set of trajectories from env
        cnt_episode = 0
        for step in range(n_steps):

            # Select action
            with torch.no_grad():
                cond = {
                    "state": torch.from_numpy(prev_obs_venv["state"])
                    .float()
                    .to(self.device)
                }
                samples = (
                    self.model(
                        cond=cond,
                        deterministic=eval_mode,
                    )
                    .cpu()
                    .numpy()
                )  # n_env x horizon x act
            action_venv = samples[:, : self.act_steps]

            # Apply multi-step action
            obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
                self.venv.step(action_venv)
            )
            done_venv = terminated_venv | truncated_venv
            reward_trajs[step] = reward_venv
            firsts_trajs[step + 1] = done_venv

            # add to buffer in train mode
            if not eval_mode:
                for i in range(self.n_envs):
                    obs_buffer.append(prev_obs_venv["state"][i])
                    # NOTE(review): presence of "final_obs" is treated as a
                    # truncation here --- confirm the env wrapper only sets
                    # it in that case.
                    if "final_obs" in info_venv[i]:  # truncated
                        next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
                        terminated_venv[i] = False
                    else:  # first obs in new episode
                        next_obs_buffer.append(obs_venv["state"][i])
                    action_buffer.append(action_venv[i])
                reward_buffer.extend(
                    (reward_venv * self.scale_reward_factor).tolist()
                )
                # extend (not append): one flag per env, so this buffer stays
                # index-aligned with the obs/action/reward buffers above
                terminated_buffer.extend(terminated_venv.tolist())

            # update for next step
            prev_obs_venv = obs_venv

            # count steps --- not accounting for done within action chunk
            cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0

            # check if enough eval episodes are done
            cnt_episode += np.sum(done_venv)
            if eval_mode and cnt_episode >= self.n_eval_episode:
                break

        # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
        episodes_start_end = []
        for env_ind in range(self.n_envs):
            env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
            for i in range(len(env_steps) - 1):
                start = env_steps[i]
                end = env_steps[i + 1]
                if end - start > 1:
                    episodes_start_end.append((env_ind, start, end - 1))
        if episodes_start_end:
            reward_trajs_split = [
                reward_trajs[start : end + 1, env_ind]
                for env_ind, start, end in episodes_start_end
            ]
            num_episode_finished = len(reward_trajs_split)
            episode_reward = np.array(
                [np.sum(reward_traj) for reward_traj in reward_trajs_split]
            )
            episode_best_reward = np.array(
                [
                    np.max(reward_traj) / self.act_steps
                    for reward_traj in reward_trajs_split
                ]
            )
            avg_episode_reward = np.mean(episode_reward)
            avg_best_reward = np.mean(episode_best_reward)
            success_rate = np.mean(
                episode_best_reward >= self.best_reward_threshold_for_success
            )
        else:
            episode_reward = np.array([])
            num_episode_finished = 0
            avg_episode_reward = 0
            avg_best_reward = 0
            success_rate = 0

        # Update models
        if (
            not eval_mode
            and self.itr > self.n_explore_steps
            and self.itr % self.update_freq == 0
        ):
            # Update critic more frequently
            for _ in range(self.critic_num_update):
                # Sample from online buffer
                inds = np.random.choice(len(obs_buffer), self.batch_size)
                obs_b = (
                    torch.from_numpy(np.array([obs_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                next_obs_b = (
                    torch.from_numpy(np.array([next_obs_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                actions_b = (
                    torch.from_numpy(np.array([action_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                rewards_b = (
                    torch.from_numpy(np.array([reward_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                terminated_b = (
                    torch.from_numpy(np.array([terminated_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                loss_critic = self.model.loss_critic(
                    {"state": obs_b},
                    {"state": next_obs_b},
                    actions_b,
                    rewards_b,
                    terminated_b,
                    self.gamma,
                )
                self.critic_optimizer.zero_grad()
                loss_critic.backward()
                self.critic_optimizer.step()

                # Update target critic every critic update
                self.model.update_target_critic(self.target_ema_rate)

            # Update actor once with the final batch
            loss_actor = self.model.loss_actor(
                {"state": obs_b},
            )
            self.actor_optimizer.zero_grad()
            loss_actor.backward()
            self.actor_optimizer.step()

            # Update target actor
            self.model.update_target_actor(self.target_ema_rate)

        # Update lr
        self.actor_lr_scheduler.step()
        self.critic_lr_scheduler.step()

        # Save model
        if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
            self.save_model()

        # Log loss and save metrics
        run_results.append(
            {
                "itr": self.itr,
                "step": cnt_train_step,
            }
        )
        if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps:
            time = timer()
            run_results[-1]["time"] = time
            if eval_mode:
                log.info(
                    f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
                )
                if self.use_wandb:
                    wandb.log(
                        {
                            "success rate - eval": success_rate,
                            "avg episode reward - eval": avg_episode_reward,
                            "avg best reward - eval": avg_best_reward,
                            "num episode - eval": num_episode_finished,
                        },
                        step=self.itr,
                        commit=False,
                    )
                run_results[-1]["eval_success_rate"] = success_rate
                run_results[-1]["eval_episode_reward"] = avg_episode_reward
                run_results[-1]["eval_best_reward"] = avg_best_reward
            else:
                log.info(
                    f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
                )
                if self.use_wandb:
                    wandb.log(
                        {
                            "total env step": cnt_train_step,
                            "loss - actor": loss_actor,
                            "loss - critic": loss_critic,
                            "avg episode reward - train": avg_episode_reward,
                            "num episode - train": num_episode_finished,
                        },
                        step=self.itr,
                        commit=True,
                    )
                run_results[-1]["train_episode_reward"] = avg_episode_reward
            with open(self.result_path, "wb") as f:
                pickle.dump(run_results, f)
        self.itr += 1
|
@ -102,12 +102,12 @@ class TrainIDQLDiffusionAgent(TrainAgent):
|
||||
next_obs_buffer = deque(maxlen=self.buffer_size)
|
||||
action_buffer = deque(maxlen=self.buffer_size)
|
||||
reward_buffer = deque(maxlen=self.buffer_size)
|
||||
done_buffer = deque(maxlen=self.buffer_size)
|
||||
first_buffer = deque(maxlen=self.buffer_size)
|
||||
terminated_buffer = deque(maxlen=self.buffer_size)
|
||||
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
@ -131,10 +131,9 @@ class TrainIDQLDiffusionAgent(TrainAgent):
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
firsts_trajs[0] = (
|
||||
done_venv # if done at the end of last iteration, then the envs are just reset
|
||||
)
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -161,22 +160,33 @@ class TrainIDQLDiffusionAgent(TrainAgent):
|
||||
action_venv = samples[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
|
||||
action_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# add to buffer
|
||||
obs_buffer.append(prev_obs_venv["state"])
|
||||
next_obs_buffer.append(obs_venv["state"])
|
||||
action_buffer.append(action_venv)
|
||||
reward_buffer.append(reward_venv * self.scale_reward_factor)
|
||||
done_buffer.append(done_venv)
|
||||
first_buffer.append(firsts_trajs[step])
|
||||
if not eval_mode:
|
||||
obs_venv_copy = obs_venv.copy()
|
||||
for i in range(self.n_envs):
|
||||
if truncated_venv[i]:
|
||||
obs_venv_copy["state"][i] = info_venv[i]["final_obs"][
|
||||
"state"
|
||||
]
|
||||
obs_buffer.append(prev_obs_venv["state"])
|
||||
next_obs_buffer.append(obs_venv_copy["state"])
|
||||
action_buffer.append(action_venv)
|
||||
reward_buffer.append(reward_venv * self.scale_reward_factor)
|
||||
terminated_buffer.append(terminated_venv)
|
||||
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
@ -216,13 +226,15 @@ class TrainIDQLDiffusionAgent(TrainAgent):
|
||||
|
||||
# Update models
|
||||
if not eval_mode:
|
||||
num_batch = int(
|
||||
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
|
||||
)
|
||||
|
||||
obs_trajs = np.array(deepcopy(obs_buffer))
|
||||
action_trajs = np.array(deepcopy(action_buffer))
|
||||
next_obs_trajs = np.array(deepcopy(next_obs_buffer))
|
||||
reward_trajs = np.array(deepcopy(reward_buffer))
|
||||
done_trajs = np.array(deepcopy(done_buffer))
|
||||
first_trajs = np.array(deepcopy(first_buffer))
|
||||
terminated_trajs = np.array(deepcopy(terminated_buffer))
|
||||
|
||||
# flatten
|
||||
obs_trajs = einops.rearrange(
|
||||
@ -238,13 +250,7 @@ class TrainIDQLDiffusionAgent(TrainAgent):
|
||||
"s e h d -> (s e) h d",
|
||||
)
|
||||
reward_trajs = reward_trajs.reshape(-1)
|
||||
done_trajs = done_trajs.reshape(-1)
|
||||
first_trajs = first_trajs.reshape(-1)
|
||||
|
||||
num_batch = int(
|
||||
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
|
||||
)
|
||||
|
||||
terminated_trajs = terminated_trajs.reshape(-1)
|
||||
for _ in range(num_batch):
|
||||
|
||||
# Sample batch
|
||||
@ -259,7 +265,9 @@ class TrainIDQLDiffusionAgent(TrainAgent):
|
||||
reward_b = (
|
||||
torch.from_numpy(reward_trajs[inds]).float().to(self.device)
|
||||
)
|
||||
done_b = torch.from_numpy(done_trajs[inds]).float().to(self.device)
|
||||
terminated_b = (
|
||||
torch.from_numpy(terminated_trajs[inds]).float().to(self.device)
|
||||
)
|
||||
|
||||
# update critic value function
|
||||
critic_loss_v = self.model.loss_critic_v(
|
||||
@ -275,7 +283,7 @@ class TrainIDQLDiffusionAgent(TrainAgent):
|
||||
{"state": next_obs_b},
|
||||
actions_b,
|
||||
reward_b,
|
||||
done_b,
|
||||
terminated_b,
|
||||
self.gamma,
|
||||
)
|
||||
self.critic_q_optimizer.zero_grad()
|
||||
@ -284,16 +292,15 @@ class TrainIDQLDiffusionAgent(TrainAgent):
|
||||
|
||||
# update target q function
|
||||
self.model.update_target_critic(self.critic_tau)
|
||||
|
||||
loss_critic = critic_loss_q.detach() + critic_loss_v.detach()
|
||||
|
||||
# Update policy with collected trajectories - no weighting
|
||||
loss = self.model.loss(
|
||||
loss_actor = self.model.loss(
|
||||
actions_b,
|
||||
{"state": obs_b},
|
||||
)
|
||||
self.actor_optimizer.zero_grad()
|
||||
loss.backward()
|
||||
loss_actor.backward()
|
||||
if self.itr >= self.n_critic_warmup_itr:
|
||||
if self.max_grad_norm is not None:
|
||||
torch.nn.utils.clip_grad_norm_(
|
||||
@ -314,10 +321,12 @@ class TrainIDQLDiffusionAgent(TrainAgent):
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.itr % self.log_freq == 0:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
@ -338,12 +347,13 @@ class TrainIDQLDiffusionAgent(TrainAgent):
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"loss": loss,
|
||||
"total env step": cnt_train_step,
|
||||
"loss - actor": loss_actor,
|
||||
"loss - critic": loss_critic,
|
||||
"avg episode reward - train": avg_episode_reward,
|
||||
"num episode - train": num_episode_finished,
|
||||
@ -351,10 +361,7 @@ class TrainIDQLDiffusionAgent(TrainAgent):
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["loss"] = loss
|
||||
run_results[-1]["loss_critic"] = loss_critic
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["time"] = time
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
||||
|
@ -50,6 +50,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
@ -68,34 +69,36 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
last_itr_eval = eval_mode
|
||||
|
||||
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
||||
dones_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
firsts_trajs[0] = (
|
||||
done_venv # if done at the end of last iteration, then the envs are just reset
|
||||
)
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
|
||||
# Holder
|
||||
obs_trajs = {
|
||||
"state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
|
||||
"state": np.zeros(
|
||||
(self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim)
|
||||
)
|
||||
}
|
||||
chains_trajs = np.empty(
|
||||
chains_trajs = np.zeros(
|
||||
(
|
||||
0,
|
||||
self.n_steps,
|
||||
self.n_envs,
|
||||
self.model.ft_denoising_steps + 1,
|
||||
self.horizon_steps,
|
||||
self.action_dim,
|
||||
)
|
||||
)
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
|
||||
obs_full_trajs = np.vstack(
|
||||
(obs_full_trajs, prev_obs_venv["state"][:, -1][None])
|
||||
) # save current obs
|
||||
terminated_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
if self.save_full_observations: # state-only
|
||||
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
|
||||
obs_full_trajs = np.vstack(
|
||||
(obs_full_trajs, prev_obs_venv["state"][:, -1][None])
|
||||
)
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -123,9 +126,10 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
action_venv = output_venv[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
|
||||
action_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
if self.save_full_observations: # state-only
|
||||
obs_full_venv = np.array(
|
||||
[info["full_obs"]["state"] for info in info_venv]
|
||||
@ -133,15 +137,18 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
obs_full_trajs = np.vstack(
|
||||
(obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
|
||||
)
|
||||
obs_trajs["state"] = np.vstack(
|
||||
(obs_trajs["state"], prev_obs_venv["state"][None])
|
||||
)
|
||||
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
dones_trajs[step] = done_venv
|
||||
obs_trajs["state"][step] = prev_obs_venv["state"]
|
||||
chains_trajs[step] = chains_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
terminated_trajs[step] = terminated_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
@ -238,7 +245,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
)
|
||||
reward_trajs = reward_trajs_transpose.T
|
||||
|
||||
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
|
||||
# bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
|
||||
obs_venv_ts = {
|
||||
"state": torch.from_numpy(obs_venv["state"])
|
||||
.float()
|
||||
@ -256,7 +263,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
)
|
||||
else:
|
||||
nextvalues = values_trajs[t + 1]
|
||||
nonterminal = 1.0 - dones_trajs[t]
|
||||
nonterminal = 1.0 - terminated_trajs[t]
|
||||
# delta = r + gamma*V(st+1) - V(st)
|
||||
delta = (
|
||||
reward_trajs[t] * self.reward_scale_const
|
||||
@ -405,6 +412,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.save_trajs:
|
||||
@ -414,6 +422,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
run_results[-1]["reward_trajs"] = reward_trajs
|
||||
if self.itr % self.log_freq == 0:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
@ -434,11 +443,12 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}"
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"total env step": cnt_train_step,
|
||||
"loss": loss,
|
||||
"pg loss": pg_loss,
|
||||
"value loss": v_loss,
|
||||
@ -459,17 +469,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["loss"] = loss
|
||||
run_results[-1]["pg_loss"] = pg_loss
|
||||
run_results[-1]["value_loss"] = v_loss
|
||||
run_results[-1]["bc_loss"] = bc_loss
|
||||
run_results[-1]["eta"] = eta
|
||||
run_results[-1]["approx_kl"] = approx_kl
|
||||
run_results[-1]["ratio"] = ratio
|
||||
run_results[-1]["clip_frac"] = np.mean(clipfracs)
|
||||
run_results[-1]["explained_variance"] = explained_var
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["time"] = time
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
||||
|
@ -40,6 +40,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
@ -58,31 +59,32 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
||||
last_itr_eval = eval_mode
|
||||
|
||||
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
||||
dones_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
firsts_trajs[0] = (
|
||||
done_venv # if done at the end of last iteration, then the envs are just reset
|
||||
)
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
|
||||
# Holder
|
||||
obs_trajs = {
|
||||
k: np.empty((0, self.n_envs, self.n_cond_step, *self.obs_dims[k]))
|
||||
k: np.zeros(
|
||||
(self.n_steps, self.n_envs, self.n_cond_step, *self.obs_dims[k])
|
||||
)
|
||||
for k in self.obs_dims
|
||||
}
|
||||
chains_trajs = np.empty(
|
||||
chains_trajs = np.zeros(
|
||||
(
|
||||
0,
|
||||
self.n_steps,
|
||||
self.n_envs,
|
||||
self.model.ft_denoising_steps + 1,
|
||||
self.horizon_steps,
|
||||
self.action_dim,
|
||||
)
|
||||
)
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
terminated_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -111,17 +113,23 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
||||
action_venv = output_venv[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
|
||||
action_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
for k in obs_trajs:
|
||||
obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
|
||||
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
dones_trajs[step] = done_venv
|
||||
obs_trajs[k][step] = prev_obs_venv[k]
|
||||
chains_trajs[step] = chains_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
terminated_trajs[step] = terminated_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
@ -235,7 +243,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
||||
)
|
||||
reward_trajs = reward_trajs_transpose.T
|
||||
|
||||
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
|
||||
# bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
|
||||
obs_venv_ts = {
|
||||
key: torch.from_numpy(obs_venv[key]).float().to(self.device)
|
||||
for key in self.obs_dims
|
||||
@ -252,7 +260,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
||||
)
|
||||
else:
|
||||
nextvalues = values_trajs[t + 1]
|
||||
nonterminal = 1.0 - dones_trajs[t]
|
||||
nonterminal = 1.0 - terminated_trajs[t]
|
||||
# delta = r + gamma*V(st+1) - V(st)
|
||||
delta = (
|
||||
reward_trajs[t] * self.reward_scale_const
|
||||
@ -398,10 +406,12 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.itr % self.log_freq == 0:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
@ -422,11 +432,12 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}"
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"total env step": cnt_train_step,
|
||||
"loss": loss,
|
||||
"pg loss": pg_loss,
|
||||
"value loss": v_loss,
|
||||
@ -447,17 +458,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["loss"] = loss
|
||||
run_results[-1]["pg_loss"] = pg_loss
|
||||
run_results[-1]["value_loss"] = v_loss
|
||||
run_results[-1]["bc_loss"] = bc_loss
|
||||
run_results[-1]["eta"] = eta
|
||||
run_results[-1]["approx_kl"] = approx_kl
|
||||
run_results[-1]["ratio"] = ratio
|
||||
run_results[-1]["clip_frac"] = np.mean(clipfracs)
|
||||
run_results[-1]["explained_variance"] = explained_var
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["time"] = time
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
||||
|
@ -32,6 +32,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
@ -50,42 +51,39 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
||||
last_itr_eval = eval_mode
|
||||
|
||||
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
||||
dones_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
firsts_trajs[0] = (
|
||||
done_venv # if done at the end of last iteration, then the envs are just reset
|
||||
)
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
|
||||
# Holder
|
||||
obs_trajs = {
|
||||
"state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
|
||||
"state": np.zeros(
|
||||
(self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim)
|
||||
)
|
||||
}
|
||||
samples_trajs = np.empty(
|
||||
samples_trajs = np.zeros(
|
||||
(
|
||||
0,
|
||||
self.n_steps,
|
||||
self.n_envs,
|
||||
self.horizon_steps,
|
||||
self.action_dim,
|
||||
)
|
||||
)
|
||||
chains_trajs = np.empty(
|
||||
chains_trajs = np.zeros(
|
||||
(
|
||||
0,
|
||||
self.n_steps,
|
||||
self.n_envs,
|
||||
self.model.ft_denoising_steps + 1,
|
||||
self.horizon_steps,
|
||||
self.action_dim,
|
||||
)
|
||||
)
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
|
||||
obs_full_trajs = np.vstack(
|
||||
(obs_full_trajs, prev_obs_venv["state"][:, -1][None])
|
||||
) # save current obs
|
||||
terminated_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -111,28 +109,25 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
||||
samples.chains.cpu().numpy()
|
||||
) # n_env x denoising x horizon x act
|
||||
action_venv = output_venv[:, : self.act_steps]
|
||||
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
|
||||
samples_trajs[step] = output_venv
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
|
||||
action_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
if self.save_full_observations: # state-only
|
||||
obs_full_venv = np.array(
|
||||
[info["full_obs"]["state"] for info in info_venv]
|
||||
) # n_envs x act_steps x obs_dim
|
||||
obs_full_trajs = np.vstack(
|
||||
(obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
|
||||
)
|
||||
obs_trajs["state"] = np.vstack(
|
||||
(obs_trajs["state"], prev_obs_venv["state"][None])
|
||||
)
|
||||
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
dones_trajs[step] = done_venv
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
obs_trajs["state"][step] = prev_obs_venv["state"]
|
||||
chains_trajs[step] = chains_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
terminated_trajs[step] = terminated_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
@ -214,7 +209,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
||||
)
|
||||
reward_trajs = reward_trajs_transpose.T
|
||||
|
||||
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
|
||||
# bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
|
||||
obs_venv_ts = {
|
||||
"state": torch.from_numpy(obs_venv["state"])
|
||||
.float()
|
||||
@ -232,7 +227,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
||||
)
|
||||
else:
|
||||
nextvalues = values_trajs[t + 1]
|
||||
nonterminal = 1.0 - dones_trajs[t]
|
||||
nonterminal = 1.0 - terminated_trajs[t]
|
||||
# delta = r + gamma*V(st+1) - V(st)
|
||||
delta = (
|
||||
reward_trajs[t] * self.reward_scale_const
|
||||
@ -343,20 +338,6 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
||||
np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
|
||||
)
|
||||
|
||||
# Plot state trajectories (only in D3IL)
|
||||
if (
|
||||
self.itr % self.render_freq == 0
|
||||
and self.n_render > 0
|
||||
and self.traj_plotter is not None
|
||||
):
|
||||
self.traj_plotter(
|
||||
obs_full_trajs=obs_full_trajs,
|
||||
n_render=self.n_render,
|
||||
max_episode_steps=self.max_episode_steps,
|
||||
render_dir=self.render_dir,
|
||||
itr=self.itr,
|
||||
)
|
||||
|
||||
# Update lr
|
||||
if self.itr >= self.n_critic_warmup_itr:
|
||||
self.actor_lr_scheduler.step()
|
||||
@ -370,16 +351,17 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.save_trajs:
|
||||
run_results[-1]["obs_full_trajs"] = obs_full_trajs
|
||||
run_results[-1]["obs_trajs"] = obs_trajs
|
||||
run_results[-1]["action_trajs"] = samples_trajs
|
||||
run_results[-1]["chains_trajs"] = chains_trajs
|
||||
run_results[-1]["reward_trajs"] = reward_trajs
|
||||
if self.itr % self.log_freq == 0:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
@ -400,11 +382,12 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"total env step": cnt_train_step,
|
||||
"loss": loss,
|
||||
"pg loss": pg_loss,
|
||||
"value loss": v_loss,
|
||||
@ -417,15 +400,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["loss"] = loss
|
||||
run_results[-1]["pg_loss"] = pg_loss
|
||||
run_results[-1]["value_loss"] = v_loss
|
||||
run_results[-1]["approx_kl"] = approx_kl
|
||||
run_results[-1]["ratio"] = ratio
|
||||
run_results[-1]["clip_frac"] = np.mean(clipfracs)
|
||||
run_results[-1]["explained_variance"] = explained_var
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["time"] = time
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
||||
|
@ -27,6 +27,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
@ -45,33 +46,35 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
||||
last_itr_eval = eval_mode
|
||||
|
||||
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
||||
dones_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
firsts_trajs[0] = (
|
||||
done_venv # if done at the end of last iteration, then the envs are just reset
|
||||
)
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
|
||||
# Holder
|
||||
obs_trajs = {
|
||||
"state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
|
||||
"state": np.zeros(
|
||||
(self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim)
|
||||
)
|
||||
}
|
||||
samples_trajs = np.empty(
|
||||
samples_trajs = np.zeros(
|
||||
(
|
||||
0,
|
||||
self.n_steps,
|
||||
self.n_envs,
|
||||
self.horizon_steps,
|
||||
self.action_dim,
|
||||
)
|
||||
)
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
|
||||
obs_full_trajs = np.vstack(
|
||||
(obs_full_trajs, prev_obs_venv["state"][:, -1][None])
|
||||
) # save current obs
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
terminated_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
if self.save_full_observations:
|
||||
obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
|
||||
obs_full_trajs = np.vstack(
|
||||
(obs_full_trajs, prev_obs_venv["state"][:, -1][None])
|
||||
)
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -93,9 +96,10 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
||||
action_venv = output_venv[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
|
||||
action_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
if self.save_full_observations: # state-only
|
||||
obs_full_venv = np.array(
|
||||
[info["full_obs"]["state"] for info in info_venv]
|
||||
@ -103,15 +107,18 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
||||
obs_full_trajs = np.vstack(
|
||||
(obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
|
||||
)
|
||||
obs_trajs["state"] = np.vstack(
|
||||
(obs_trajs["state"], prev_obs_venv["state"][None])
|
||||
)
|
||||
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
dones_trajs[step] = done_venv
|
||||
obs_trajs["state"][step] = prev_obs_venv["state"]
|
||||
samples_trajs[step] = output_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
terminated_trajs[step] = terminated_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
@ -221,7 +228,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
||||
)
|
||||
else:
|
||||
nextvalues = values_trajs[t + 1]
|
||||
nonterminal = 1.0 - dones_trajs[t]
|
||||
nonterminal = 1.0 - terminated_trajs[t]
|
||||
# delta = r + gamma*V(st+1) - V(st)
|
||||
delta = (
|
||||
reward_trajs[t] * self.reward_scale_const
|
||||
@ -363,6 +370,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.save_trajs:
|
||||
@ -372,6 +380,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
||||
run_results[-1]["reward_trajs"] = reward_trajs
|
||||
if self.itr % self.log_freq == 0:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
@ -392,11 +401,12 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | ent {-entropy_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | ent {-entropy_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"total env step": cnt_train_step,
|
||||
"loss": loss,
|
||||
"pg loss": pg_loss,
|
||||
"value loss": v_loss,
|
||||
@ -412,16 +422,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["loss"] = loss
|
||||
run_results[-1]["pg_loss"] = pg_loss
|
||||
run_results[-1]["value_loss"] = v_loss
|
||||
run_results[-1]["entropy_loss"] = entropy_loss
|
||||
run_results[-1]["approx_kl"] = approx_kl
|
||||
run_results[-1]["ratio"] = ratio
|
||||
run_results[-1]["clip_frac"] = np.mean(clipfracs)
|
||||
run_results[-1]["explained_variance"] = explained_var
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["time"] = time
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
||||
|
@ -40,6 +40,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
@ -58,30 +59,31 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
||||
last_itr_eval = eval_mode
|
||||
|
||||
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
||||
dones_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
firsts_trajs[0] = (
|
||||
done_venv # if done at the end of last iteration, then the envs are just reset
|
||||
)
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
|
||||
# Holder
|
||||
obs_trajs = {
|
||||
k: np.empty((0, self.n_envs, self.n_cond_step, *self.obs_dims[k]))
|
||||
k: np.zeros(
|
||||
(self.n_steps, self.n_envs, self.n_cond_step, *self.obs_dims[k])
|
||||
)
|
||||
for k in self.obs_dims
|
||||
}
|
||||
samples_trajs = np.empty(
|
||||
samples_trajs = np.zeros(
|
||||
(
|
||||
0,
|
||||
self.n_steps,
|
||||
self.n_envs,
|
||||
self.horizon_steps,
|
||||
self.action_dim,
|
||||
)
|
||||
)
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
terminated_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -104,17 +106,23 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
||||
action_venv = output_venv[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
|
||||
action_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
for k in obs_trajs:
|
||||
obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
|
||||
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
dones_trajs[step] = done_venv
|
||||
obs_trajs[k][step] = prev_obs_venv[k]
|
||||
samples_trajs[step] = output_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
terminated_trajs[step] = terminated_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
@ -240,7 +248,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
||||
)
|
||||
else:
|
||||
nextvalues = values_trajs[t + 1]
|
||||
nonterminal = 1.0 - dones_trajs[t]
|
||||
nonterminal = 1.0 - terminated_trajs[t]
|
||||
# delta = r + gamma*V(st+1) - V(st)
|
||||
delta = (
|
||||
reward_trajs[t] * self.reward_scale_const
|
||||
@ -374,10 +382,12 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.itr % self.log_freq == 0:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
@ -398,11 +408,12 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"total env step": cnt_train_step,
|
||||
"loss": loss,
|
||||
"pg loss": pg_loss,
|
||||
"value loss": v_loss,
|
||||
@ -422,17 +433,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["loss"] = loss
|
||||
run_results[-1]["pg_loss"] = pg_loss
|
||||
run_results[-1]["value_loss"] = v_loss
|
||||
run_results[-1]["bc_loss"] = bc_loss
|
||||
run_results[-1]["std"] = std
|
||||
run_results[-1]["approx_kl"] = approx_kl
|
||||
run_results[-1]["ratio"] = ratio
|
||||
run_results[-1]["clip_frac"] = np.mean(clipfracs)
|
||||
run_results[-1]["explained_variance"] = explained_var
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["time"] = time
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
||||
|
@ -80,12 +80,12 @@ class TrainQSMDiffusionAgent(TrainAgent):
|
||||
next_obs_buffer = deque(maxlen=self.buffer_size)
|
||||
action_buffer = deque(maxlen=self.buffer_size)
|
||||
reward_buffer = deque(maxlen=self.buffer_size)
|
||||
done_buffer = deque(maxlen=self.buffer_size)
|
||||
first_buffer = deque(maxlen=self.buffer_size)
|
||||
terminated_buffer = deque(maxlen=self.buffer_size)
|
||||
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
@ -109,10 +109,9 @@ class TrainQSMDiffusionAgent(TrainAgent):
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
firsts_trajs[0] = (
|
||||
done_venv # if done at the end of last iteration, then the envs are just reset
|
||||
)
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -137,22 +136,33 @@ class TrainQSMDiffusionAgent(TrainAgent):
|
||||
action_venv = samples[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
|
||||
action_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# add to buffer
|
||||
obs_buffer.append(prev_obs_venv["state"])
|
||||
next_obs_buffer.append(obs_venv["state"])
|
||||
action_buffer.append(action_venv)
|
||||
reward_buffer.append(reward_venv * self.scale_reward_factor)
|
||||
done_buffer.append(done_venv)
|
||||
first_buffer.append(firsts_trajs[step])
|
||||
if not eval_mode:
|
||||
obs_venv_copy = obs_venv.copy()
|
||||
for i in range(self.n_envs):
|
||||
if truncated_venv[i]:
|
||||
obs_venv_copy["state"][i] = info_venv[i]["final_obs"][
|
||||
"state"
|
||||
]
|
||||
obs_buffer.append(prev_obs_venv["state"])
|
||||
next_obs_buffer.append(obs_venv_copy["state"])
|
||||
action_buffer.append(action_venv)
|
||||
reward_buffer.append(reward_venv * self.scale_reward_factor)
|
||||
terminated_buffer.append(terminated_venv)
|
||||
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
@ -192,13 +202,15 @@ class TrainQSMDiffusionAgent(TrainAgent):
|
||||
|
||||
# Update models
|
||||
if not eval_mode:
|
||||
num_batch = int(
|
||||
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
|
||||
)
|
||||
|
||||
obs_trajs = np.array(deepcopy(obs_buffer))
|
||||
action_trajs = np.array(deepcopy(action_buffer))
|
||||
next_obs_trajs = np.array(deepcopy(next_obs_buffer))
|
||||
reward_trajs = np.array(deepcopy(reward_buffer))
|
||||
done_trajs = np.array(deepcopy(done_buffer))
|
||||
first_trajs = np.array(deepcopy(first_buffer))
|
||||
terminated_trajs = np.array(deepcopy(terminated_buffer))
|
||||
|
||||
# flatten
|
||||
obs_trajs = einops.rearrange(
|
||||
@ -214,16 +226,8 @@ class TrainQSMDiffusionAgent(TrainAgent):
|
||||
"s e h d -> (s e) h d",
|
||||
)
|
||||
reward_trajs = reward_trajs.reshape(-1)
|
||||
done_trajs = done_trajs.reshape(-1)
|
||||
first_trajs = first_trajs.reshape(-1)
|
||||
|
||||
num_batch = int(
|
||||
self.n_steps * self.n_envs / self.batch_size * self.replay_ratio
|
||||
)
|
||||
|
||||
terminated_trajs = terminated_trajs.reshape(-1)
|
||||
for _ in range(num_batch):
|
||||
|
||||
# Sample batch
|
||||
inds = np.random.choice(len(obs_trajs), self.batch_size)
|
||||
obs_b = torch.from_numpy(obs_trajs[inds]).float().to(self.device)
|
||||
next_obs_b = (
|
||||
@ -232,37 +236,34 @@ class TrainQSMDiffusionAgent(TrainAgent):
|
||||
actions_b = (
|
||||
torch.from_numpy(action_trajs[inds]).float().to(self.device)
|
||||
)
|
||||
reward_b = (
|
||||
rewards_b = (
|
||||
torch.from_numpy(reward_trajs[inds]).float().to(self.device)
|
||||
)
|
||||
done_b = torch.from_numpy(done_trajs[inds]).float().to(self.device)
|
||||
terminated_b = (
|
||||
torch.from_numpy(terminated_trajs[inds]).float().to(self.device)
|
||||
)
|
||||
|
||||
# update critic q function
|
||||
critic_loss = self.model.loss_critic(
|
||||
loss_critic = self.model.loss_critic(
|
||||
{"state": obs_b},
|
||||
{"state": next_obs_b},
|
||||
actions_b,
|
||||
reward_b,
|
||||
done_b,
|
||||
rewards_b,
|
||||
terminated_b,
|
||||
self.gamma,
|
||||
)
|
||||
self.critic_optimizer.zero_grad()
|
||||
critic_loss.backward()
|
||||
loss_critic.backward()
|
||||
self.critic_optimizer.step()
|
||||
|
||||
# update target q function
|
||||
self.model.update_target_critic(self.critic_tau)
|
||||
|
||||
loss_critic = critic_loss.detach()
|
||||
|
||||
# Update policy with collected trajectories
|
||||
loss = self.model.loss_actor(
|
||||
loss_actor = self.model.loss_actor(
|
||||
{"state": obs_b},
|
||||
actions_b,
|
||||
self.q_grad_coeff,
|
||||
)
|
||||
self.actor_optimizer.zero_grad()
|
||||
loss.backward()
|
||||
loss_actor.backward()
|
||||
if self.itr >= self.n_critic_warmup_itr:
|
||||
if self.max_grad_norm is not None:
|
||||
torch.nn.utils.clip_grad_norm_(
|
||||
@ -270,6 +271,9 @@ class TrainQSMDiffusionAgent(TrainAgent):
|
||||
)
|
||||
self.actor_optimizer.step()
|
||||
|
||||
# update target critic
|
||||
self.model.update_target_critic(self.critic_tau)
|
||||
|
||||
# Update lr
|
||||
self.actor_lr_scheduler.step()
|
||||
self.critic_lr_scheduler.step()
|
||||
@ -282,10 +286,12 @@ class TrainQSMDiffusionAgent(TrainAgent):
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.itr % self.log_freq == 0:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
@ -306,12 +312,13 @@ class TrainQSMDiffusionAgent(TrainAgent):
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"loss": loss,
|
||||
"total env step": cnt_train_step,
|
||||
"loss - actor": loss_actor,
|
||||
"loss - critic": loss_critic,
|
||||
"avg episode reward - train": avg_episode_reward,
|
||||
"num episode - train": num_episode_finished,
|
||||
@ -319,10 +326,7 @@ class TrainQSMDiffusionAgent(TrainAgent):
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["loss"] = loss
|
||||
run_results[-1]["loss_critic"] = loss_critic
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["time"] = time
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
||||
|
404
agent/finetune/train_rlpd_agent.py
Normal file
404
agent/finetune/train_rlpd_agent.py
Normal file
@ -0,0 +1,404 @@
|
||||
"""
|
||||
Reinforcement Learning with Prior Data (RLPD) agent training script.
|
||||
|
||||
Does not support image observations right now.
|
||||
"""
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import numpy as np
|
||||
import torch
|
||||
import logging
|
||||
import wandb
|
||||
import hydra
|
||||
from collections import deque
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
from util.timer import Timer
|
||||
from agent.finetune.train_agent import TrainAgent
|
||||
from util.scheduler import CosineAnnealingWarmupRestarts
|
||||
|
||||
|
||||
class TrainRLPDAgent(TrainAgent):
|
||||
def __init__(self, cfg):
    """Build the offline dataset, optimizers, LR schedulers and training hyperparameters.

    Args:
        cfg: configuration object. This method reads ``cfg.offline_dataset``
            and the ``cfg.train`` section; the rest is handled by the base
            class constructor.
    """
    super().__init__(cfg)
    train_cfg = cfg.train

    def make_lr_scheduler(optimizer, peak_lr, scheduler_cfg):
        # Both schedulers share the same fixed cycle_mult=1.0 and gamma=1.0;
        # only the optimizer, peak lr and per-network scheduler config differ.
        return CosineAnnealingWarmupRestarts(
            optimizer,
            first_cycle_steps=scheduler_cfg.first_cycle_steps,
            cycle_mult=1.0,
            max_lr=peak_lr,
            min_lr=scheduler_cfg.min_lr,
            warmup_steps=scheduler_cfg.warmup_steps,
            gamma=1.0,
        )

    # Offline (prior) dataset, instantiated from the config
    self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset)

    # NOTE: the discount factor gamma is applied to the reward once every
    # act_steps, not once per environment step
    self.gamma = train_cfg.gamma

    # Actor: optimizer + learning-rate schedule
    # (self.model is presumably set up by the base class --- confirm)
    self.actor_optimizer = torch.optim.AdamW(
        self.model.network.parameters(),
        lr=train_cfg.actor_lr,
        weight_decay=train_cfg.actor_weight_decay,
    )
    self.actor_lr_scheduler = make_lr_scheduler(
        self.actor_optimizer,
        train_cfg.actor_lr,
        train_cfg.actor_lr_scheduler,
    )

    # Critic ensemble: optimizer + learning-rate schedule.
    # The ensemble parameters are passed via .values(), see
    # https://github.com/pytorch/pytorch/issues/120581
    self.critic_optimizer = torch.optim.AdamW(
        self.model.ensemble_params.values(),
        lr=train_cfg.critic_lr,
        weight_decay=train_cfg.critic_weight_decay,
    )
    self.critic_lr_scheduler = make_lr_scheduler(
        self.critic_optimizer,
        train_cfg.critic_lr,
        train_cfg.critic_lr_scheduler,
    )

    # Scalar hyperparameters copied straight from the train config
    # target_ema_rate: presumably the EMA rate for the target critic update,
    # going by the key name --- TODO confirm against where it is used
    self.target_ema_rate = train_cfg.target_ema_rate
    # Multiplier for rewards
    self.scale_reward_factor = train_cfg.scale_reward_factor
    # Number of critic updates
    self.critic_num_update = train_cfg.critic_num_update
    # Replay buffer size
    self.buffer_size = train_cfg.buffer_size
    # Number of evaluation episodes
    self.n_eval_episode = train_cfg.n_eval_episode
    # Exploration steps at the beginning, using randomly sampled actions
    self.n_explore_steps = train_cfg.n_explore_steps

    # Entropy temperature, stored as log(alpha) and trained with its own Adam
    # optimizer; that optimizer uses the critic learning rate
    log_alpha = torch.tensor(np.log(train_cfg.init_temperature)).to(self.device)
    log_alpha.requires_grad = True
    self.log_alpha = log_alpha
    self.target_entropy = train_cfg.target_entropy
    self.log_alpha_optimizer = torch.optim.Adam(
        [self.log_alpha],
        lr=train_cfg.critic_lr,
    )
|
||||
|
||||
def run(self) -> None:
    """Run the training loop, mixing a fixed offline dataset with online data.

    The whole offline dataset is loaded once into numpy arrays. Each
    iteration then either collects ``self.n_steps`` environment steps in
    train mode (pushing the transitions into FIFO replay buffers) or rolls
    out evaluation episodes until ``self.n_eval_episode`` episodes are done.
    In train mode, once ``self.itr >= self.n_explore_steps``, the critic is
    updated ``self.critic_num_update`` times on batches made of half offline
    and half online samples, followed by one actor update and one
    temperature update. Metrics are logged and ``run_results`` is pickled to
    ``self.result_path`` every ``self.log_freq`` iterations.

    NOTE(review): this listing was recovered from a rendering that dropped
    indentation; the nesting below is reconstructed from the code's
    semantics and comments and should be confirmed against the repository.
    """
    # make a FIFO replay buffer for obs, action, and reward
    obs_buffer = deque(maxlen=self.buffer_size)
    next_obs_buffer = deque(maxlen=self.buffer_size)
    action_buffer = deque(maxlen=self.buffer_size)
    reward_buffer = deque(maxlen=self.buffer_size)
    terminated_buffer = deque(maxlen=self.buffer_size)

    # load offline dataset into replay buffer
    # batch_size equals the dataset length, so the loop below runs once and
    # the *_off arrays hold the entire offline dataset
    dataloader_offline = torch.utils.data.DataLoader(
        self.dataset_offline,
        batch_size=len(self.dataset_offline),
        drop_last=False,
    )
    for batch in dataloader_offline:
        actions, states_and_next, rewards, terminated = batch
        states = states_and_next["state"]
        next_states = states_and_next["next_state"]
        obs_buffer_off = states.cpu().numpy()
        next_obs_buffer_off = next_states.cpu().numpy()
        action_buffer_off = actions.cpu().numpy()
        reward_buffer_off = rewards.cpu().numpy().flatten()
        terminated_buffer_off = terminated.cpu().numpy().flatten()

    # Start training loop
    timer = Timer()
    run_results = []
    cnt_train_step = 0
    done_venv = np.zeros((1, self.n_envs))
    while self.itr < self.n_train_itr:
        if self.itr % 1000 == 0:
            print(f"Finished training iteration {self.itr} of {self.n_train_itr}")

        # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
        options_venv = [{} for _ in range(self.n_envs)]
        if self.itr % self.render_freq == 0 and self.render_video:
            for env_ind in range(self.n_render):
                options_venv[env_ind]["video_path"] = os.path.join(
                    self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
                )

        # Define train or eval - all envs restart
        eval_mode = (
            self.itr % self.val_freq == 0
            and self.itr >= self.n_explore_steps
            and not self.force_train
        )
        n_steps = (
            self.n_steps if not eval_mode else int(1e5)
        )  # large number for eval mode
        self.model.eval() if eval_mode else self.model.train()

        # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning
        # firsts_trajs[t, e] == 1 marks that env e starts a new episode at step t
        firsts_trajs = np.zeros((n_steps + 1, self.n_envs))
        if self.reset_at_iteration or eval_mode or self.itr == 0:
            prev_obs_venv = self.reset_env_all(options_venv=options_venv)
            firsts_trajs[0] = 1
        else:
            # if done at the end of last iteration, then the envs are just reset
            firsts_trajs[0] = done_venv
        reward_trajs = np.zeros((n_steps, self.n_envs))

        # Collect a set of trajectories from env
        cnt_episode = 0
        for step in range(n_steps):

            # Select action
            if self.itr < self.n_explore_steps:
                # exploration phase: random actions from the env action space
                action_venv = self.venv.action_space.sample()
            else:
                with torch.no_grad():
                    cond = {
                        "state": torch.from_numpy(prev_obs_venv["state"])
                        .float()
                        .to(self.device)
                    }
                    samples = (
                        self.model(
                            cond=cond,
                            deterministic=eval_mode,
                        )
                        .cpu()
                        .numpy()
                    )  # n_env x horizon x act
                action_venv = samples[:, : self.act_steps]

            # Apply multi-step action
            obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
                self.venv.step(action_venv)
            )
            done_venv = terminated_venv | truncated_venv
            reward_trajs[step] = reward_venv
            firsts_trajs[step + 1] = done_venv

            # add to buffer in train mode
            if not eval_mode:
                for i in range(self.n_envs):
                    obs_buffer.append(prev_obs_venv["state"][i])
                    # on truncation the next observation is read from the
                    # info dict's "final_obs" entry instead of obs_venv
                    if truncated_venv[i]:
                        next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
                    else:
                        next_obs_buffer.append(obs_venv["state"][i])
                    action_buffer.append(action_venv[i])
                # rewards and termination flags are added for all envs at once
                reward_buffer.extend(
                    (reward_venv * self.scale_reward_factor).tolist()
                )
                terminated_buffer.extend(terminated_venv.tolist())

            # update for next step
            prev_obs_venv = obs_venv

            # count steps --- not accounting for done within action chunk
            cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0

            # check if enough eval episodes are done
            cnt_episode += np.sum(done_venv)
            if eval_mode and cnt_episode >= self.n_eval_episode:
                break

        # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
        episodes_start_end = []
        for env_ind in range(self.n_envs):
            env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
            for i in range(len(env_steps) - 1):
                start = env_steps[i]
                end = env_steps[i + 1]
                # episodes spanning a single step are ignored
                if end - start > 1:
                    episodes_start_end.append((env_ind, start, end - 1))
        if len(episodes_start_end) > 0:
            reward_trajs_split = [
                reward_trajs[start : end + 1, env_ind]
                for env_ind, start, end in episodes_start_end
            ]
            num_episode_finished = len(reward_trajs_split)
            episode_reward = np.array(
                [np.sum(reward_traj) for reward_traj in reward_trajs_split]
            )
            episode_best_reward = np.array(
                [
                    np.max(reward_traj) / self.act_steps
                    for reward_traj in reward_trajs_split
                ]
            )
            avg_episode_reward = np.mean(episode_reward)
            avg_best_reward = np.mean(episode_best_reward)
            success_rate = np.mean(
                episode_best_reward >= self.best_reward_threshold_for_success
            )
        else:
            episode_reward = np.array([])
            num_episode_finished = 0
            avg_episode_reward = 0
            avg_best_reward = 0
            success_rate = 0

        # Update models
        if not eval_mode and self.itr >= self.n_explore_steps:

            # Update critic more frequently
            for _ in range(self.critic_num_update):

                # Sample from OFFLINE buffer
                inds = np.random.choice(len(obs_buffer_off), self.batch_size // 2)
                obs_b_off = (
                    torch.from_numpy(obs_buffer_off[inds]).float().to(self.device)
                )
                next_obs_b_off = (
                    torch.from_numpy(next_obs_buffer_off[inds])
                    .float()
                    .to(self.device)
                )
                actions_b_off = (
                    torch.from_numpy(action_buffer_off[inds])
                    .float()
                    .to(self.device)
                )
                rewards_b_off = (
                    torch.from_numpy(reward_buffer_off[inds])
                    .float()
                    .to(self.device)
                )
                terminated_b_off = (
                    torch.from_numpy(terminated_buffer_off[inds])
                    .float()
                    .to(self.device)
                )

                # Sample from ONLINE buffer
                inds = np.random.choice(len(obs_buffer), self.batch_size // 2)
                obs_b_on = (
                    torch.from_numpy(np.array([obs_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                next_obs_b_on = (
                    torch.from_numpy(np.array([next_obs_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                actions_b_on = (
                    torch.from_numpy(np.array([action_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                rewards_b_on = (
                    torch.from_numpy(np.array([reward_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                terminated_b_on = (
                    torch.from_numpy(np.array([terminated_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )

                # merge offline and online data
                obs_b = torch.cat([obs_b_off, obs_b_on], dim=0)
                next_obs_b = torch.cat([next_obs_b_off, next_obs_b_on], dim=0)
                actions_b = torch.cat([actions_b_off, actions_b_on], dim=0)
                rewards_b = torch.cat([rewards_b_off, rewards_b_on], dim=0)
                terminated_b = torch.cat([terminated_b_off, terminated_b_on], dim=0)

                # Update critic
                # alpha is taken as a plain float here, so no gradient flows
                # into log_alpha through the critic loss
                alpha = self.log_alpha.exp().item()
                loss_critic = self.model.loss_critic(
                    {"state": obs_b},
                    {"state": next_obs_b},
                    actions_b,
                    rewards_b,
                    terminated_b,
                    self.gamma,
                    alpha,
                )
                self.critic_optimizer.zero_grad()
                loss_critic.backward()
                self.critic_optimizer.step()

                # Update target critic every critic update
                self.model.update_target_critic(self.target_ema_rate)

            # Update actor once with the final batch
            loss_actor = self.model.loss_actor(
                {"state": obs_b},
                alpha,
            )
            self.actor_optimizer.zero_grad()
            loss_actor.backward()
            self.actor_optimizer.step()

            # Update temperature parameter
            self.log_alpha_optimizer.zero_grad()
            loss_alpha = self.model.loss_temperature(
                {"state": obs_b},
                self.log_alpha.exp(),  # with grad
                self.target_entropy,
            )
            loss_alpha.backward()
            self.log_alpha_optimizer.step()

        # Update lr
        # NOTE(review): placed at iteration level (stepped every iteration);
        # the source listing lost its indentation, so confirm this is not
        # meant to sit inside the update branch above
        self.actor_lr_scheduler.step()
        self.critic_lr_scheduler.step()

        # Save model
        if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
            self.save_model()

        # Log loss and save metrics
        run_results.append(
            {
                "itr": self.itr,
                "step": cnt_train_step,
            }
        )
        if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps:
            time = timer()
            if eval_mode:
                log.info(
                    f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
                )
                if self.use_wandb:
                    wandb.log(
                        {
                            "success rate - eval": success_rate,
                            "avg episode reward - eval": avg_episode_reward,
                            "avg best reward - eval": avg_best_reward,
                            "num episode - eval": num_episode_finished,
                        },
                        step=self.itr,
                        commit=False,
                    )
                run_results[-1]["eval_success_rate"] = success_rate
                run_results[-1]["eval_episode_reward"] = avg_episode_reward
                run_results[-1]["eval_best_reward"] = avg_best_reward
            else:
                log.info(
                    f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | alpha {alpha:8.4f} | t:{time:8.4f}"
                )
                if self.use_wandb:
                    wandb.log(
                        {
                            "total env step": cnt_train_step,
                            "loss - actor": loss_actor,
                            "loss - critic": loss_critic,
                            "entropy coeff": alpha,
                            "avg episode reward - train": avg_episode_reward,
                            "num episode - train": num_episode_finished,
                        },
                        step=self.itr,
                        commit=True,
                    )
                run_results[-1]["train_episode_reward"] = avg_episode_reward
            # the full results list is rewritten on every logging iteration
            with open(self.result_path, "wb") as f:
                pickle.dump(run_results, f)
        self.itr += 1
|
@ -19,7 +19,6 @@ from util.scheduler import CosineAnnealingWarmupRestarts
|
||||
|
||||
|
||||
class TrainRWRDiffusionAgent(TrainAgent):
|
||||
|
||||
def __init__(self, cfg):
|
||||
super().__init__(cfg)
|
||||
|
||||
@ -52,14 +51,13 @@ class TrainRWRDiffusionAgent(TrainAgent):
|
||||
self.update_epochs = cfg.train.update_epochs
|
||||
|
||||
def run(self):
|
||||
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
cnt_train_step = 0
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
|
||||
# Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
|
||||
options_venv = [{} for _ in range(self.n_envs)]
|
||||
if self.itr % self.render_freq == 0 and self.render_video:
|
||||
@ -79,23 +77,24 @@ class TrainRWRDiffusionAgent(TrainAgent):
|
||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||
firsts_trajs[0] = 1
|
||||
else:
|
||||
firsts_trajs[0] = (
|
||||
done_venv # if done at the end of last iteration, then the envs are just reset
|
||||
)
|
||||
# if done at the end of last iteration, the envs are just reset
|
||||
firsts_trajs[0] = done_venv
|
||||
|
||||
# Holder
|
||||
obs_trajs = {
|
||||
"state": np.empty((0, self.n_envs, self.n_cond_step, self.obs_dim))
|
||||
"state": np.zeros(
|
||||
(self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim)
|
||||
)
|
||||
}
|
||||
samples_trajs = np.empty(
|
||||
samples_trajs = np.zeros(
|
||||
(
|
||||
0,
|
||||
self.n_steps,
|
||||
self.n_envs,
|
||||
self.horizon_steps,
|
||||
self.action_dim,
|
||||
)
|
||||
)
|
||||
reward_trajs = np.empty((0, self.n_envs))
|
||||
reward_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||
|
||||
# Collect a set of trajectories from env
|
||||
for step in range(self.n_steps):
|
||||
@ -118,19 +117,25 @@ class TrainRWRDiffusionAgent(TrainAgent):
|
||||
.numpy()
|
||||
) # n_env x horizon x act
|
||||
action_venv = samples[:, : self.act_steps]
|
||||
samples_trajs = np.vstack((samples_trajs, samples[None]))
|
||||
samples_trajs[step] = samples
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, done_venv, info_venv = self.venv.step(
|
||||
action_venv
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
obs_trajs["state"] = np.vstack(
|
||||
(obs_trajs["state"], prev_obs_venv["state"][None])
|
||||
)
|
||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
|
||||
# save
|
||||
obs_trajs["state"][step] = prev_obs_venv["state"]
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
||||
# count steps --- not accounting for done within action chunk
|
||||
cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0
|
||||
|
||||
# Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
|
||||
episodes_start_end = []
|
||||
for env_ind in range(self.n_envs):
|
||||
@ -157,20 +162,23 @@ class TrainRWRDiffusionAgent(TrainAgent):
|
||||
num_episode_finished = len(reward_trajs_split)
|
||||
|
||||
# Compute episode returns
|
||||
discounted_reward_trajs_split = [
|
||||
[
|
||||
self.gamma**t * r
|
||||
for t, r in zip(
|
||||
list(range(end - start + 1)),
|
||||
reward_trajs[start : end + 1, env_ind],
|
||||
)
|
||||
]
|
||||
for env_ind, start, end in episodes_start_end
|
||||
]
|
||||
returns_trajs_split = [
|
||||
np.cumsum(y[::-1])[::-1] for y in discounted_reward_trajs_split
|
||||
np.zeros_like(reward_trajs) for reward_trajs in reward_trajs_split
|
||||
]
|
||||
for traj_rewards, traj_returns in zip(
|
||||
reward_trajs_split, returns_trajs_split
|
||||
):
|
||||
prev_return = 0
|
||||
for t in range(len(traj_rewards)):
|
||||
traj_returns[-t - 1] = (
|
||||
traj_rewards[-t - 1] + self.gamma * prev_return
|
||||
)
|
||||
prev_return = traj_returns[-t - 1]
|
||||
|
||||
# Note: concatenation is okay here since we are concatenating
|
||||
# states and actions later on, in the same order
|
||||
returns_trajs_split = np.concatenate(returns_trajs_split)
|
||||
|
||||
episode_reward = np.array(
|
||||
[np.sum(reward_traj) for reward_traj in reward_trajs_split]
|
||||
)
|
||||
@ -195,7 +203,6 @@ class TrainRWRDiffusionAgent(TrainAgent):
|
||||
|
||||
# Update models
|
||||
if not eval_mode:
|
||||
|
||||
# Tensorize data and put them to device
|
||||
# k for environment step
|
||||
obs_k = {
|
||||
@ -230,7 +237,6 @@ class TrainRWRDiffusionAgent(TrainAgent):
|
||||
total_steps = len(rewards_k_scaled)
|
||||
inds_k = np.arange(total_steps)
|
||||
for _ in range(self.update_epochs):
|
||||
|
||||
# for each epoch, go through all data in batches
|
||||
np.random.shuffle(inds_k)
|
||||
num_batch = max(1, total_steps // self.batch_size) # skip last ones
|
||||
@ -267,10 +273,12 @@ class TrainRWRDiffusionAgent(TrainAgent):
|
||||
run_results.append(
|
||||
{
|
||||
"itr": self.itr,
|
||||
"step": cnt_train_step,
|
||||
}
|
||||
)
|
||||
if self.itr % self.log_freq == 0:
|
||||
time = timer()
|
||||
run_results[-1]["time"] = time
|
||||
if eval_mode:
|
||||
log.info(
|
||||
f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
|
||||
@ -291,11 +299,12 @@ class TrainRWRDiffusionAgent(TrainAgent):
|
||||
run_results[-1]["eval_best_reward"] = avg_best_reward
|
||||
else:
|
||||
log.info(
|
||||
f"{self.itr}: loss {loss:8.4f} | reward {avg_episode_reward:8.4f} |t:{time:8.4f}"
|
||||
f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}"
|
||||
)
|
||||
if self.use_wandb:
|
||||
wandb.log(
|
||||
{
|
||||
"total env step": cnt_train_step,
|
||||
"loss": loss,
|
||||
"avg episode reward - train": avg_episode_reward,
|
||||
"num episode - train": num_episode_finished,
|
||||
@ -303,9 +312,7 @@ class TrainRWRDiffusionAgent(TrainAgent):
|
||||
step=self.itr,
|
||||
commit=True,
|
||||
)
|
||||
run_results[-1]["loss"] = loss
|
||||
run_results[-1]["train_episode_reward"] = avg_episode_reward
|
||||
run_results[-1]["time"] = time
|
||||
with open(self.result_path, "wb") as f:
|
||||
pickle.dump(run_results, f)
|
||||
self.itr += 1
|
||||
|
335
agent/finetune/train_sac_agent.py
Normal file
335
agent/finetune/train_sac_agent.py
Normal file
@ -0,0 +1,335 @@
|
||||
"""
|
||||
Soft Actor Critic (SAC) agent training script.
|
||||
|
||||
Does not support image observations right now.
|
||||
"""
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import numpy as np
|
||||
import torch
|
||||
import logging
|
||||
import wandb
|
||||
from collections import deque
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
from util.timer import Timer
|
||||
from agent.finetune.train_agent import TrainAgent
|
||||
|
||||
|
||||
class TrainSACAgent(TrainAgent):
    """Online Soft Actor Critic training loop.

    Only the ``"state"`` entry of the observation is used; image
    observations are not supported.

    NOTE(review): this class was recovered from a rendering that dropped
    indentation; nesting is reconstructed from the code's semantics and
    comments and should be confirmed against the repository.
    """

    def __init__(self, cfg):
        """Read SAC hyper-parameters from ``cfg.train`` and build optimizers.

        Args:
            cfg: run configuration; forwarded to ``TrainAgent.__init__`` and
                read here for the ``cfg.train.*`` fields used below.
        """
        super().__init__(cfg)

        # note the discount factor gamma here is applied to reward every act_steps, instead of every env step
        self.gamma = cfg.train.gamma

        # Optimizer
        self.actor_optimizer = torch.optim.Adam(
            self.model.network.parameters(),
            lr=cfg.train.actor_lr,
        )
        self.critic_optimizer = torch.optim.Adam(
            self.model.critic.parameters(),
            lr=cfg.train.critic_lr,
        )

        # EMA rate passed to the target critic update
        self.target_ema_rate = cfg.train.target_ema_rate

        # Reward scale
        self.scale_reward_factor = cfg.train.scale_reward_factor

        # Actor/critic update frequency - assume single env
        # Clamp to at least 1: a replay ratio larger than the batch size
        # would otherwise truncate to 0 and make `itr % freq` divide by zero.
        self.critic_update_freq = max(
            1, int(cfg.train.batch_size / cfg.train.critic_replay_ratio)
        )
        self.actor_update_freq = max(
            1, int(cfg.train.batch_size / cfg.train.actor_replay_ratio)
        )

        # Buffer size
        self.buffer_size = cfg.train.buffer_size

        # Eval episodes
        self.n_eval_episode = cfg.train.n_eval_episode

        # Exploration steps at the beginning - using randomly sampled action
        self.n_explore_steps = cfg.train.n_explore_steps

        # Initialize temperature parameter for entropy
        init_temperature = cfg.train.init_temperature
        self.log_alpha = torch.tensor(np.log(init_temperature)).to(self.device)
        self.log_alpha.requires_grad = True
        self.target_entropy = cfg.train.target_entropy
        self.log_alpha_optimizer = torch.optim.Adam(
            [self.log_alpha],
            lr=cfg.train.critic_lr,
        )

    def run(self):
        """Run the SAC training loop until ``self.itr`` reaches ``self.n_train_itr``.

        Each iteration collects ``self.n_steps`` environment steps in train
        mode (stored in FIFO replay buffers) or rolls out evaluation episodes
        until ``self.n_eval_episode`` are done. After the exploration phase
        the critic is updated every ``self.critic_update_freq`` iterations and
        the actor/temperature every ``self.actor_update_freq`` iterations
        (only on iterations where the critic is also updated). Metrics are
        logged and ``run_results`` is pickled to ``self.result_path`` every
        ``self.log_freq`` iterations.
        """
        # make a FIFO replay buffer for obs, action, and reward
        obs_buffer = deque(maxlen=self.buffer_size)
        next_obs_buffer = deque(maxlen=self.buffer_size)
        action_buffer = deque(maxlen=self.buffer_size)
        reward_buffer = deque(maxlen=self.buffer_size)
        terminated_buffer = deque(maxlen=self.buffer_size)

        # Start training loop
        timer = Timer()
        run_results = []
        cnt_train_step = 0
        done_venv = np.zeros((1, self.n_envs))

        # Most recent values for logging. They are bound up front because
        # updates only run on some iterations, while logging may happen on an
        # iteration where no update has run yet.
        loss_actor = None
        loss_critic = None
        alpha = self.log_alpha.exp().item()

        while self.itr < self.n_train_itr:
            if self.itr % 1000 == 0:
                print(f"Finished training iteration {self.itr} of {self.n_train_itr}")

            # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
            options_venv = [{} for _ in range(self.n_envs)]
            if self.itr % self.render_freq == 0 and self.render_video:
                for env_ind in range(self.n_render):
                    options_venv[env_ind]["video_path"] = os.path.join(
                        self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4"
                    )

            # Define train or eval - all envs restart
            eval_mode = (
                self.itr % self.val_freq == 0
                and self.itr > self.n_explore_steps
                and not self.force_train
            )
            n_steps = (
                self.n_steps if not eval_mode else int(1e5)
            )  # large number for eval mode
            self.model.eval() if eval_mode else self.model.train()

            # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning
            # Size the holders with the local n_steps (not self.n_steps): the
            # rollout loop below runs up to n_steps, which is much larger in
            # eval mode, and would otherwise index past the end.
            firsts_trajs = np.zeros((n_steps + 1, self.n_envs))
            if self.reset_at_iteration or eval_mode or self.itr == 0:
                prev_obs_venv = self.reset_env_all(options_venv=options_venv)
                firsts_trajs[0] = 1
            else:
                # if done at the end of last iteration, the envs are just reset
                firsts_trajs[0] = done_venv
            reward_trajs = np.zeros((n_steps, self.n_envs))

            # Collect a set of trajectories from env
            cnt_episode = 0
            for step in range(n_steps):

                # Select action
                if self.itr < self.n_explore_steps:
                    action_venv = self.venv.action_space.sample()
                else:
                    with torch.no_grad():
                        cond = {
                            "state": torch.from_numpy(prev_obs_venv["state"])
                            .float()
                            .to(self.device)
                        }
                        samples = (
                            self.model(
                                cond=cond,
                                deterministic=eval_mode,
                            )
                            .cpu()
                            .numpy()
                        )  # n_env x horizon x act
                    action_venv = samples[:, : self.act_steps]

                # Apply multi-step action
                obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
                    self.venv.step(action_venv)
                )
                done_venv = terminated_venv | truncated_venv
                reward_trajs[step] = reward_venv
                firsts_trajs[step + 1] = done_venv

                # add to buffer in train mode
                if not eval_mode:
                    for i in range(self.n_envs):
                        obs_buffer.append(prev_obs_venv["state"][i])
                        if "final_obs" in info_venv[i]:  # truncated
                            next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
                        else:  # first obs in new episode
                            next_obs_buffer.append(obs_venv["state"][i])
                        action_buffer.append(action_venv[i])
                    # rewards and termination flags are added for all envs at once
                    reward_buffer.extend(
                        (reward_venv * self.scale_reward_factor).tolist()
                    )
                    terminated_buffer.extend(terminated_venv.tolist())

                # update for next step
                prev_obs_venv = obs_venv

                # count steps --- not accounting for done within action chunk
                cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0

                # check if enough eval episodes are done
                cnt_episode += np.sum(done_venv)
                if eval_mode and cnt_episode >= self.n_eval_episode:
                    break

            # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration.
            episodes_start_end = []
            for env_ind in range(self.n_envs):
                env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0]
                for i in range(len(env_steps) - 1):
                    start = env_steps[i]
                    end = env_steps[i + 1]
                    if end - start > 1:
                        episodes_start_end.append((env_ind, start, end - 1))
            if len(episodes_start_end) > 0:
                reward_trajs_split = [
                    reward_trajs[start : end + 1, env_ind]
                    for env_ind, start, end in episodes_start_end
                ]
                num_episode_finished = len(reward_trajs_split)
                episode_reward = np.array(
                    [np.sum(reward_traj) for reward_traj in reward_trajs_split]
                )
                episode_best_reward = np.array(
                    [
                        np.max(reward_traj) / self.act_steps
                        for reward_traj in reward_trajs_split
                    ]
                )
                avg_episode_reward = np.mean(episode_reward)
                avg_best_reward = np.mean(episode_best_reward)
                success_rate = np.mean(
                    episode_best_reward >= self.best_reward_threshold_for_success
                )
            else:
                episode_reward = np.array([])
                num_episode_finished = 0
                avg_episode_reward = 0
                avg_best_reward = 0
                success_rate = 0

            # Update models
            if (
                not eval_mode
                and self.itr > self.n_explore_steps
                and self.itr % self.critic_update_freq == 0
            ):
                inds = np.random.choice(len(obs_buffer), self.batch_size, replace=False)
                obs_b = (
                    torch.from_numpy(np.array([obs_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                next_obs_b = (
                    torch.from_numpy(np.array([next_obs_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                actions_b = (
                    torch.from_numpy(np.array([action_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                rewards_b = (
                    torch.from_numpy(np.array([reward_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )
                terminated_b = (
                    torch.from_numpy(np.array([terminated_buffer[i] for i in inds]))
                    .float()
                    .to(self.device)
                )

                # Update critic
                alpha = self.log_alpha.exp().item()
                loss_critic = self.model.loss_critic(
                    {"state": obs_b},
                    {"state": next_obs_b},
                    actions_b,
                    rewards_b,
                    terminated_b,
                    self.gamma,
                    alpha,
                )
                self.critic_optimizer.zero_grad()
                loss_critic.backward()
                self.critic_optimizer.step()

                # Update target critic every critic update
                self.model.update_target_critic(self.target_ema_rate)

                # Delay update actor
                # None (rather than 0) marks "no actor update this round" so
                # the logging code can tell it apart from a real loss value.
                loss_actor = None
                if self.itr % self.actor_update_freq == 0:
                    for _ in range(2):
                        loss_actor = self.model.loss_actor(
                            {"state": obs_b},
                            alpha,
                        )
                        self.actor_optimizer.zero_grad()
                        loss_actor.backward()
                        self.actor_optimizer.step()

                        # Update temperature parameter
                        # NOTE(review): kept inside the actor loop; the source
                        # listing lost its indentation, confirm the nesting.
                        self.log_alpha_optimizer.zero_grad()
                        loss_alpha = self.model.loss_temperature(
                            {"state": obs_b},
                            self.log_alpha.exp(),  # with grad
                            self.target_entropy,
                        )
                        loss_alpha.backward()
                        self.log_alpha_optimizer.step()

            # Save model
            if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1:
                self.save_model()

            # Log loss and save metrics
            run_results.append(
                {
                    "itr": self.itr,
                    "step": cnt_train_step,
                }
            )
            if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps:
                time = timer()
                if eval_mode:
                    log.info(
                        f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}"
                    )
                    if self.use_wandb:
                        wandb.log(
                            {
                                "success rate - eval": success_rate,
                                "avg episode reward - eval": avg_episode_reward,
                                "avg best reward - eval": avg_best_reward,
                                "num episode - eval": num_episode_finished,
                            },
                            step=self.itr,
                            commit=False,
                        )
                    run_results[-1]["eval_success_rate"] = success_rate
                    run_results[-1]["eval_episode_reward"] = avg_episode_reward
                    run_results[-1]["eval_best_reward"] = avg_best_reward
                else:
                    # Losses that have not been computed are shown as nan in
                    # the text log and left out of the wandb payload.
                    nan = float("nan")
                    loss_actor_shown = nan if loss_actor is None else loss_actor
                    loss_critic_shown = nan if loss_critic is None else loss_critic
                    log.info(
                        f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor_shown:8.4f} | loss critic {loss_critic_shown:8.4f} | reward {avg_episode_reward:8.4f} | alpha {alpha:8.4f} | t {time:8.4f}"
                    )
                    if self.use_wandb:
                        wandb_log_dict = {
                            "total env step": cnt_train_step,
                            "entropy coeff": alpha,
                            "avg episode reward - train": avg_episode_reward,
                            "num episode - train": num_episode_finished,
                        }
                        if loss_critic is not None:
                            wandb_log_dict["loss - critic"] = loss_critic
                        if loss_actor is not None:
                            wandb_log_dict["loss - actor"] = loss_actor
                        wandb.log(
                            wandb_log_dict,
                            step=self.itr,
                            commit=True,
                        )
                    run_results[-1]["train_episode_reward"] = avg_episode_reward
                with open(self.result_path, "wb") as f:
                    pickle.dump(run_results, f)
            self.itr += 1
|
@ -16,7 +16,6 @@ env_name: avoiding-m5
|
||||
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
@ -102,7 +101,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -16,7 +16,6 @@ env_name: avoiding-m5
|
||||
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
act_steps: 4
|
||||
@ -94,7 +93,7 @@ model:
|
||||
learn_fixed_std: False
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
|
@ -16,7 +16,6 @@ env_name: avoiding-m5
|
||||
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
act_steps: 4
|
||||
@ -95,7 +94,7 @@ model:
|
||||
num_modes: ${num_modes}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
|
@ -16,7 +16,6 @@ env_name: avoiding-m5
|
||||
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
@ -102,7 +101,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -16,7 +16,6 @@ env_name: avoiding-m5
|
||||
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
act_steps: 4
|
||||
@ -94,7 +93,7 @@ model:
|
||||
learn_fixed_std: False
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
|
@ -16,7 +16,6 @@ env_name: avoiding-m5
|
||||
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
act_steps: 4
|
||||
@ -95,7 +94,7 @@ model:
|
||||
num_modes: ${num_modes}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
|
@ -16,7 +16,6 @@ env_name: avoiding-m5
|
||||
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
@ -102,7 +101,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -16,7 +16,6 @@ env_name: avoiding-m5
|
||||
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
act_steps: 4
|
||||
@ -94,7 +93,7 @@ model:
|
||||
learn_fixed_std: False
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
|
@ -16,7 +16,6 @@ env_name: avoiding-m5
|
||||
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
act_steps: 4
|
||||
@ -95,7 +94,7 @@ model:
|
||||
num_modes: ${num_modes}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
|
@ -15,7 +15,6 @@ env: avoid
|
||||
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
@ -50,7 +49,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -15,7 +15,6 @@ env: avoid
|
||||
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
|
||||
@ -47,7 +46,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
@ -15,7 +15,6 @@ env: avoid
|
||||
mode: d56_r12 # M1, desired modes 5 and 6, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
num_modes: 5
|
||||
@ -49,7 +48,7 @@ model:
|
||||
num_modes: ${num_modes}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
@ -15,7 +15,6 @@ env: avoid
|
||||
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
@ -50,7 +49,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -15,7 +15,6 @@ env: avoid
|
||||
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
|
||||
@ -47,7 +46,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
@ -15,7 +15,6 @@ env: avoid
|
||||
mode: d57_r12 # M2, desired modes 5 and 7, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
num_modes: 5
|
||||
@ -49,7 +48,7 @@ model:
|
||||
num_modes: ${num_modes}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
@ -15,7 +15,6 @@ env: avoid
|
||||
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
@ -50,7 +49,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -15,7 +15,6 @@ env: avoid
|
||||
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
|
||||
@ -47,7 +46,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
@ -15,7 +15,6 @@ env: avoid
|
||||
mode: d58_r12 # M3, desired modes 5 and 8, required modes 1 and 2
|
||||
obs_dim: 4
|
||||
action_dim: 2
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
num_modes: 5
|
||||
@ -49,7 +48,7 @@ model:
|
||||
num_modes: ${num_modes}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
cond_steps: 1
|
||||
horizon_steps: 8
|
||||
@ -59,7 +58,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -105,7 +104,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -107,7 +106,7 @@ model:
|
||||
cond_predict_scale: True
|
||||
groupnorm_eps: 1e-4
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 8
|
||||
act_steps: 8
|
||||
@ -98,7 +97,7 @@ model:
|
||||
std_max: 0.2
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -105,7 +104,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -106,7 +105,7 @@ model:
|
||||
smaller_encoder: False
|
||||
cond_predict_scale: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 8
|
||||
act_steps: 8
|
||||
@ -98,7 +97,7 @@ model:
|
||||
std_max: 0.2
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -105,7 +104,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -107,7 +106,7 @@ model:
|
||||
cond_predict_scale: True
|
||||
groupnorm_eps: 1e-4
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 8
|
||||
act_steps: 8
|
||||
@ -98,7 +97,7 @@ model:
|
||||
std_max: 0.2
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -105,7 +104,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -107,7 +106,7 @@ model:
|
||||
cond_predict_scale: True
|
||||
groupnorm_eps: 1e-4
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 8
|
||||
act_steps: 8
|
||||
@ -98,7 +97,7 @@ model:
|
||||
std_max: 0.2
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -105,7 +104,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -107,7 +106,7 @@ model:
|
||||
cond_predict_scale: True
|
||||
groupnorm_eps: 1e-4
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 8
|
||||
act_steps: 8
|
||||
@ -98,7 +97,7 @@ model:
|
||||
std_max: 0.2
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -105,7 +104,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
ft_denoising_steps: 5
|
||||
cond_steps: 1
|
||||
@ -106,7 +105,7 @@ model:
|
||||
smaller_encoder: False
|
||||
cond_predict_scale: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 8
|
||||
act_steps: 8
|
||||
@ -98,7 +97,7 @@ model:
|
||||
std_max: 0.2
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -16,7 +16,6 @@ randomness: low
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
@ -52,7 +51,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: low
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 16
|
||||
cond_steps: 1
|
||||
@ -54,7 +53,7 @@ model:
|
||||
cond_predict_scale: True
|
||||
groupnorm_eps: 1e-4 # not important
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: low
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
|
||||
@ -49,7 +48,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
@ -16,7 +16,6 @@ randomness: med
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
@ -52,7 +51,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: med
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 16
|
||||
cond_steps: 1
|
||||
@ -53,7 +52,7 @@ model:
|
||||
smaller_encoder: False
|
||||
cond_predict_scale: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: med
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
|
||||
@ -49,7 +48,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
@ -16,7 +16,6 @@ randomness: low
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
@ -52,7 +51,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: low
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 16
|
||||
cond_steps: 1
|
||||
@ -54,7 +53,7 @@ model:
|
||||
cond_predict_scale: True
|
||||
groupnorm_eps: 1e-4 # not important
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: low
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
|
||||
@ -49,7 +48,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
@ -16,7 +16,6 @@ randomness: med
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
@ -52,7 +51,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: med
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 16
|
||||
cond_steps: 1
|
||||
@ -53,7 +52,7 @@ model:
|
||||
smaller_encoder: False
|
||||
cond_predict_scale: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: med
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 58
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
|
||||
@ -49,7 +48,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
@ -16,7 +16,6 @@ randomness: low
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
@ -52,7 +51,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: low
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 16
|
||||
cond_steps: 1
|
||||
@ -53,7 +52,7 @@ model:
|
||||
smaller_encoder: False
|
||||
cond_predict_scale: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: low
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
|
||||
@ -49,7 +48,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
@ -16,7 +16,6 @@ randomness: med
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
@ -52,7 +51,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: med
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 100
|
||||
horizon_steps: 16
|
||||
cond_steps: 1
|
||||
@ -53,7 +52,7 @@ model:
|
||||
smaller_encoder: False
|
||||
cond_predict_scale: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -16,7 +16,6 @@ randomness: med
|
||||
env: ${task}_${randomness}_dim
|
||||
obs_dim: 44
|
||||
action_dim: 10
|
||||
transition_dim: ${action_dim}
|
||||
horizon_steps: 8
|
||||
cond_steps: 1
|
||||
|
||||
@ -49,7 +48,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
|
61
cfg/gym/eval/halfcheetah-v2/eval_diffusion_mlp.yaml
Normal file
61
cfg/gym/eval/halfcheetah-v2/eval_diffusion_mlp.yaml
Normal file
@ -0,0 +1,61 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
|
||||
|
||||
name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
n_steps: 1000 # each episode can take at most max_episode_steps / act_steps (= 1000 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
|
||||
render_num: 0
|
||||
|
||||
env:
|
||||
n_envs: 40
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion.DiffusionModel
|
||||
predict_epsilon: True
|
||||
denoised_clip_value: 1.0
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
network:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
54
cfg/gym/eval/halfcheetah-v2/eval_gaussian_mlp.yaml
Normal file
54
cfg/gym/eval/halfcheetah-v2/eval_gaussian_mlp.yaml
Normal file
@ -0,0 +1,54 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent
|
||||
|
||||
name: ${env_name}_eval_gaussian_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path:
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
n_steps: 1000 # each episode can take maximum (max_episode_steps / act_steps, =1000 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
|
||||
render_num: 0
|
||||
|
||||
env:
|
||||
n_envs: 40
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
model:
|
||||
_target_: model.common.gaussian.GaussianModel
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
network:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
@ -15,7 +15,6 @@ device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
@ -54,7 +53,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
54
cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml
Normal file
54
cfg/gym/eval/hopper-v2/eval_gaussian_mlp.yaml
Normal file
@ -0,0 +1,54 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent
|
||||
|
||||
name: ${env_name}_eval_gaussian_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
n_steps: 1000 # each episode can take maximum (max_episode_steps / act_steps, =1000 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
|
||||
render_num: 0
|
||||
|
||||
env:
|
||||
n_envs: 40
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
model:
|
||||
_target_: model.common.gaussian.GaussianModel
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
network:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
117
cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml
Normal file
117
cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml
Normal file
@ -0,0 +1,117 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path:
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: calql-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 10000
|
||||
n_steps: 1 # not used
|
||||
n_episode_per_epoch: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 3e-4
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 1
|
||||
# CalQL specific
|
||||
train_online: True
|
||||
batch_size: 256
|
||||
n_random_actions: 4
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1.0
|
||||
num_update: 1000
|
||||
buffer_size: 1000000
|
||||
online_utd_ratio: 1
|
||||
n_eval_episode: 10
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
automatic_entropy_tuning: True
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_calql.CalQL_Gaussian
|
||||
randn_clip_value: 3
|
||||
cql_min_q_weight: 5.0
|
||||
tanh_output: True
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
discount_factor: ${train.gamma}
|
||||
get_mc_return: True
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
|
||||
|
||||
name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
@ -68,7 +68,7 @@ train:
|
||||
max_adv_weight: 100
|
||||
beta: 10
|
||||
buffer_size: 5000
|
||||
batch_size: 256
|
||||
batch_size: 1000
|
||||
replay_ratio: 64
|
||||
critic_update_ratio: 4
|
||||
|
||||
@ -82,7 +82,7 @@ model:
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
|
||||
|
||||
name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
@ -65,11 +65,12 @@ train:
|
||||
num: 0
|
||||
# DIPO specific
|
||||
scale_reward_factor: 0.01
|
||||
eta: 0.0001
|
||||
target_ema_rate: 0.005
|
||||
buffer_size: 1000000
|
||||
action_lr: 0.0001
|
||||
action_gradient_steps: 10
|
||||
buffer_size: 400000
|
||||
batch_size: 5000
|
||||
replay_ratio: 64
|
||||
batch_size: 1000
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_dipo.DIPODiffusion
|
||||
@ -81,7 +82,7 @@ model:
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
|
||||
|
||||
name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
@ -65,10 +65,11 @@ train:
|
||||
num: 0
|
||||
# DQL specific
|
||||
scale_reward_factor: 0.01
|
||||
target_ema_rate: 0.005
|
||||
buffer_size: 1000000
|
||||
eta: 1.0
|
||||
buffer_size: 400000
|
||||
batch_size: 5000
|
||||
replay_ratio: 64
|
||||
replay_ratio: 16
|
||||
batch_size: 1000
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_dql.DQLDiffusion
|
||||
@ -80,7 +81,7 @@ model:
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
|
||||
|
||||
name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
@ -69,9 +69,9 @@ train:
|
||||
eval_sample_num: 20 # how many samples to score during eval
|
||||
critic_tau: 0.001 # rate of target q network update
|
||||
use_expectile_exploration: True
|
||||
buffer_size: 5000
|
||||
batch_size: 512
|
||||
replay_ratio: 16
|
||||
buffer_size: 25000 # * n_envs
|
||||
replay_ratio: 128
|
||||
batch_size: 1000
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_idql.IDQLDiffusion
|
||||
@ -83,7 +83,7 @@ model:
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
|
||||
|
||||
name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
@ -93,7 +93,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_ppo_exact_diffusion_agent.TrainPPOExactDiffusionAgent
|
||||
|
||||
name: ${env_name}_ppo_exact_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
@ -87,7 +87,6 @@ model:
|
||||
sde_min_beta: 1e-10
|
||||
sde_probability_flow: True
|
||||
#
|
||||
gamma_denoising: 0.99
|
||||
clip_ploss_coef: 0.01
|
||||
min_sampling_denoising_std: 0.1
|
||||
min_logprob_denoising_std: 0.1
|
||||
@ -101,7 +100,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
|
||||
|
||||
name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
@ -65,11 +65,11 @@ train:
|
||||
num: 0
|
||||
# QSM specific
|
||||
scale_reward_factor: 0.01
|
||||
q_grad_coeff: 50
|
||||
critic_tau: 0.005 # rate of target q network update
|
||||
buffer_size: 5000
|
||||
batch_size: 256
|
||||
replay_ratio: 32
|
||||
q_grad_coeff: 10
|
||||
critic_tau: 0.005
|
||||
buffer_size: 25000
|
||||
replay_ratio: 16
|
||||
batch_size: 1000
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_qsm.QSMDiffusion
|
||||
@ -81,7 +81,7 @@ model:
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
|
||||
|
||||
name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-04-42/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
@ -73,7 +73,7 @@ model:
|
||||
network:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
|
109
cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml
Normal file
109
cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml
Normal file
@ -0,0 +1,109 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
|
||||
|
||||
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
base_policy_path:
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: ibrl-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-4
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
save_model_freq: 50000
|
||||
val_freq: 2000
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 200
|
||||
# IBRL specific
|
||||
batch_size: 256
|
||||
target_ema_rate: 0.01
|
||||
scale_reward_factor: 1
|
||||
critic_num_update: 5
|
||||
buffer_size: 300000
|
||||
n_eval_episode: 10
|
||||
n_explore_steps: 0
|
||||
update_freq: 2
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_ibrl.IBRL_Gaussian
|
||||
randn_clip_value: 3
|
||||
n_critics: 5
|
||||
soft_action_sample: True
|
||||
soft_action_sample_beta: 10
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: False # use ensemble
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
max_n_episodes: 50
|
@ -6,14 +6,14 @@ hydra:
|
||||
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
|
||||
|
||||
name: ${env_name}_nopre_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 20
|
||||
cond_steps: 1
|
||||
@ -86,7 +86,7 @@ model:
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
|
@ -6,14 +6,14 @@ hydra:
|
||||
_target_: agent.finetune.train_ppo_gaussian_agent.TrainPPOGaussianAgent
|
||||
|
||||
name: ${env_name}_nopre_ppo_gaussian_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
transition_dim: ${action_dim}
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
@ -79,10 +79,10 @@ model:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
residual_style: True
|
||||
residual_style: False # with new logvar head
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
109
cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml
Normal file
109
cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml
Normal file
@ -0,0 +1,109 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
|
||||
|
||||
name: ${env_name}_rlpd_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: rlpd-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 250000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 3e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
critic_lr: 3e-4
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
save_model_freq: 50000
|
||||
val_freq: 5000
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 200
|
||||
# RLPD specific
|
||||
batch_size: 256
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1
|
||||
critic_num_update: 20
|
||||
buffer_size: 1000000
|
||||
n_eval_episode: 10
|
||||
n_explore_steps: 5000
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_rlpd.RLPD_Gaussian
|
||||
randn_clip_value: 10
|
||||
tanh_output: True # squash after sampling
|
||||
backup_entropy: True
|
||||
n_critics: 10 # Ensemble size for critic models
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: False # use ensemble
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
89
cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml
Normal file
89
cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml
Normal file
@ -0,0 +1,89 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_sac_agent.TrainSACAgent
|
||||
|
||||
name: ${env_name}_sac_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: sac-gym-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 3e-4
|
||||
critic_lr: 1e-3
|
||||
save_model_freq: 100000
|
||||
val_freq: 10000
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 200
|
||||
# SAC specific
|
||||
batch_size: 256
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1
|
||||
critic_replay_ratio: 256
|
||||
actor_replay_ratio: 128
|
||||
buffer_size: 1000000
|
||||
n_eval_episode: 10
|
||||
n_explore_steps: 5000
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_sac.SAC_Gaussian
|
||||
randn_clip_value: 10
|
||||
tanh_output: True # squash after sampling
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic: # no layernorm
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256]
|
||||
activation_type: ReLU
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
117
cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml
Normal file
117
cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml
Normal file
@ -0,0 +1,117 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path:
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: calql-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 10000
|
||||
n_steps: 1 # not used
|
||||
n_episode_per_epoch: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 3e-4
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 1
|
||||
# CalQL specific
|
||||
train_online: True
|
||||
batch_size: 256
|
||||
n_random_actions: 4
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1.0
|
||||
num_update: 1000
|
||||
buffer_size: 1000000
|
||||
online_utd_ratio: 1
|
||||
n_eval_episode: 10
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
automatic_entropy_tuning: True
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_calql.CalQL_Gaussian
|
||||
randn_clip_value: 3
|
||||
cql_min_q_weight: 5.0
|
||||
tanh_output: True
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
discount_factor: ${train.gamma}
|
||||
get_mc_return: True
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
|
||||
|
||||
name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
@ -68,7 +68,7 @@ train:
|
||||
max_adv_weight: 100
|
||||
beta: 10
|
||||
buffer_size: 5000
|
||||
batch_size: 256
|
||||
batch_size: 1000
|
||||
replay_ratio: 64
|
||||
critic_update_ratio: 4
|
||||
|
||||
@ -82,7 +82,7 @@ model:
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
|
||||
|
||||
name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
@ -65,11 +65,12 @@ train:
|
||||
num: 0
|
||||
# DIPO specific
|
||||
scale_reward_factor: 0.01
|
||||
eta: 0.0001
|
||||
target_ema_rate: 0.005
|
||||
buffer_size: 1000000
|
||||
action_lr: 0.0001
|
||||
action_gradient_steps: 10
|
||||
buffer_size: 400000
|
||||
batch_size: 5000
|
||||
replay_ratio: 64
|
||||
batch_size: 1000
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_dipo.DIPODiffusion
|
||||
@ -81,7 +82,7 @@ model:
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
|
||||
|
||||
name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
@ -65,10 +65,11 @@ train:
|
||||
num: 0
|
||||
# DQL specific
|
||||
scale_reward_factor: 0.01
|
||||
target_ema_rate: 0.005
|
||||
buffer_size: 1000000
|
||||
eta: 1.0
|
||||
buffer_size: 400000
|
||||
batch_size: 5000
|
||||
replay_ratio: 64
|
||||
replay_ratio: 16
|
||||
batch_size: 1000
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_dql.DQLDiffusion
|
||||
@ -80,7 +81,7 @@ model:
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
|
||||
|
||||
name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
@ -69,9 +69,9 @@ train:
|
||||
eval_sample_num: 20 # how many samples to score during eval
|
||||
critic_tau: 0.001 # rate of target q network update
|
||||
use_expectile_exploration: True
|
||||
buffer_size: 5000
|
||||
batch_size: 512
|
||||
replay_ratio: 16
|
||||
buffer_size: 25000 # * n_envs
|
||||
replay_ratio: 128
|
||||
batch_size: 1000
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_idql.IDQLDiffusion
|
||||
@ -83,7 +83,7 @@ model:
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
|
||||
|
||||
name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/hopper-medium-v2_pre_diffusion_mlp_ta4_td20/2024-06-12_23-10-05/checkpoint/state_3000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
transition_dim: ${action_dim}
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
@ -93,7 +93,7 @@ model:
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
transition_dim: ${transition_dim}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user