v0.6 (#18)
* Sampling over both env and denoising steps in DPPO updates (#13) * sample one from each chain * full random sampling * Add Proficient Human (PH) Configs and Pipeline (#16) * fix missing cfg * add ph config * fix how terminated flags are added to buffer in ibrl * add ph config * offline calql for 1M gradient updates * bug fix: number of calql online gradient steps is the number of new transitions collected * add sample config for DPPO with ta=1 * Sampling over both env and denoising steps in DPPO updates (#13) * sample one from each chain * full random sampling * fix diffusion loss when predicting initial noise * fix dppo inds * fix typo * remove print statement --------- Co-authored-by: Justin M. Lidard <jlidard@neuronic.cs.princeton.edu> Co-authored-by: allenzren <allen.ren@princeton.edu> * update robomimic configs * better calql formulation * optimize calql and ibrl training * optimize data transfer in ppo agents * add kitchen configs * re-organize config folders, rerun calql and rlpd * add scratch gym locomotion configs * add kitchen installation dependencies * use truncated for termination in furniture env * update furniture and gym configs * update README and dependencies with kitchen * add url for new data and checkpoints * update demo RL configs * update batch sizes for furniture unet configs * raise error about dropout in residual mlp * fix observation bug in bc loss --------- Co-authored-by: Justin Lidard <60638575+jlidard@users.noreply.github.com> Co-authored-by: Justin M. Lidard <jlidard@neuronic.cs.princeton.edu>
This commit is contained in:
parent
7b10df690d
commit
dc8e0c9edc
@ -31,12 +31,11 @@ conda activate dppo
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
3. Install specific environment dependencies (Gym / Robomimic / D3IL / Furniture-Bench) or all dependencies
|
||||
3. Install specific environment dependencies (Gym / Kitchen / Robomimic / D3IL / Furniture-Bench) or all dependencies (except for Kitchen, which has dependency conflicts with other tasks).
|
||||
```console
|
||||
pip install -e .[gym] # or [robomimic], [d3il], [furniture]
|
||||
pip install -e .[all]
|
||||
pip install -e .[gym] # or [kitchen], [robomimic], [d3il], [furniture]
|
||||
pip install -e .[all] # except for Kitchen
|
||||
```
|
||||
<!-- **Note**: Please do not set macros for robomimic and robosuite that the warnings suggest --- we will use some different global variables than the ones defined in macro.py -->
|
||||
|
||||
4. [Install MuJoCo for Gym and/or Robomimic](installation/install_mujoco.md). [Install D3IL](installation/install_d3il.md). [Install IsaacGym and Furniture-Bench](installation/install_furniture.md)
|
||||
|
||||
@ -161,6 +160,7 @@ Our diffusion implementation is mostly based on [Diffuser](https://github.com/ja
|
||||
* `model.min_sampling_denoising_std`: <img src="https://latex.codecogs.com/gif.latex?\epsilon^\text{exp}_\text{min} "/>, minimum amount of noise when sampling at a denoising step
|
||||
* `model.min_logprob_denoising_std`: <img src="https://latex.codecogs.com/gif.latex?\epsilon^\text{prob}_\text{min} "/>, minimum standard deviation when evaluating likelihood at a denoising step
|
||||
* `model.clip_ploss_coef`: PPO clipping ratio
|
||||
* `train.batch_size`: you may notice the batch size is rather large --- this is because the PPO update is taken in expectation over both environment steps and denoising steps (new in v0.6).
|
||||
|
||||
### DDIM fine-tuning
|
||||
|
||||
|
@ -82,8 +82,6 @@ class TrainCalQLAgent(TrainAgent):
|
||||
if self.train_online:
|
||||
# number of episodes to collect per epoch for training
|
||||
self.n_episode_per_epoch = cfg.train.n_episode_per_epoch
|
||||
# UTD ratio
|
||||
self.online_utd_ratio = cfg.train.online_utd_ratio
|
||||
|
||||
# Eval episodes
|
||||
self.n_eval_episode = cfg.train.n_eval_episode
|
||||
@ -204,9 +202,13 @@ class TrainCalQLAgent(TrainAgent):
|
||||
action_venv = samples[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
(
|
||||
obs_venv,
|
||||
reward_venv,
|
||||
terminated_venv,
|
||||
truncated_venv,
|
||||
info_venv,
|
||||
) = self.venv.step(action_venv)
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
@ -308,7 +310,8 @@ class TrainCalQLAgent(TrainAgent):
|
||||
|
||||
# override num_update
|
||||
if self.train_online:
|
||||
num_update = len(reward_trajs) # assume one env!
|
||||
# the number of new transitions (single env)
|
||||
num_update = len(reward_trajs_split[0])
|
||||
else:
|
||||
num_update = self.num_update
|
||||
for _ in range(num_update):
|
||||
@ -413,7 +416,6 @@ class TrainCalQLAgent(TrainAgent):
|
||||
reward_to_go_b,
|
||||
terminated_b,
|
||||
self.gamma,
|
||||
alpha,
|
||||
)
|
||||
self.critic_optimizer.zero_grad()
|
||||
loss_critic.backward()
|
||||
|
@ -145,7 +145,6 @@ class TrainIBRLAgent(TrainAgent):
|
||||
# Collect a set of trajectories from env
|
||||
cnt_episode = 0
|
||||
for step in range(n_steps):
|
||||
|
||||
# Select action
|
||||
with torch.no_grad():
|
||||
cond = {
|
||||
@ -164,9 +163,13 @@ class TrainIBRLAgent(TrainAgent):
|
||||
action_venv = samples[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
(
|
||||
obs_venv,
|
||||
reward_venv,
|
||||
terminated_venv,
|
||||
truncated_venv,
|
||||
info_venv,
|
||||
) = self.venv.step(action_venv)
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
reward_trajs[step] = reward_venv
|
||||
firsts_trajs[step + 1] = done_venv
|
||||
@ -177,14 +180,13 @@ class TrainIBRLAgent(TrainAgent):
|
||||
obs_buffer.append(prev_obs_venv["state"][i])
|
||||
if "final_obs" in info_venv[i]: # truncated
|
||||
next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
|
||||
terminated_venv[i] = False
|
||||
else: # first obs in new episode
|
||||
next_obs_buffer.append(obs_venv["state"][i])
|
||||
action_buffer.append(action_venv[i])
|
||||
reward_buffer.extend(
|
||||
(reward_venv * self.scale_reward_factor).tolist()
|
||||
)
|
||||
terminated_buffer.append(terminated_venv.tolist())
|
||||
terminated_buffer.extend(terminated_venv.tolist())
|
||||
|
||||
# update for next step
|
||||
prev_obs_venv = obs_venv
|
||||
|
@ -19,7 +19,6 @@ from util.scheduler import CosineAnnealingWarmupRestarts
|
||||
|
||||
|
||||
class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
|
||||
def __init__(self, cfg):
|
||||
super().__init__(cfg)
|
||||
|
||||
@ -46,7 +45,6 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
)
|
||||
|
||||
def run(self):
|
||||
|
||||
# Start training loop
|
||||
timer = Timer()
|
||||
run_results = []
|
||||
@ -54,7 +52,6 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
last_itr_eval = False
|
||||
done_venv = np.zeros((1, self.n_envs))
|
||||
while self.itr < self.n_train_itr:
|
||||
|
||||
# Prepare video paths for each env --- only applies to the first set of episodes when resets are allowed within an iteration and each iteration has multiple episodes from one env
|
||||
options_venv = [{} for _ in range(self.n_envs)]
|
||||
if self.itr % self.render_freq == 0 and self.render_video:
|
||||
@ -126,9 +123,13 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
action_venv = output_venv[:, : self.act_steps]
|
||||
|
||||
# Apply multi-step action
|
||||
obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
|
||||
self.venv.step(action_venv)
|
||||
)
|
||||
(
|
||||
obs_venv,
|
||||
reward_venv,
|
||||
terminated_venv,
|
||||
truncated_venv,
|
||||
info_venv,
|
||||
) = self.venv.step(action_venv)
|
||||
done_venv = terminated_venv | truncated_venv
|
||||
if self.save_full_observations: # state-only
|
||||
obs_full_venv = np.array(
|
||||
@ -285,40 +286,45 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
)
|
||||
}
|
||||
chains_k = einops.rearrange(
|
||||
torch.tensor(chains_trajs).float().to(self.device),
|
||||
torch.tensor(chains_trajs, device=self.device).float(),
|
||||
"s e t h d -> (s e) t h d",
|
||||
)
|
||||
returns_k = (
|
||||
torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
values_k = (
|
||||
torch.tensor(values_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(values_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
advantages_k = (
|
||||
torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(advantages_trajs, device=self.device)
|
||||
.float()
|
||||
.reshape(-1)
|
||||
)
|
||||
logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
|
||||
logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
|
||||
|
||||
# Update policy and critic
|
||||
total_steps = self.n_steps * self.n_envs
|
||||
inds_k = np.arange(total_steps)
|
||||
total_steps = self.n_steps * self.n_envs * self.model.ft_denoising_steps
|
||||
clipfracs = []
|
||||
for update_epoch in range(self.update_epochs):
|
||||
|
||||
# for each epoch, go through all data in batches
|
||||
flag_break = False
|
||||
np.random.shuffle(inds_k)
|
||||
inds_k = torch.randperm(total_steps, device=self.device)
|
||||
num_batch = max(1, total_steps // self.batch_size) # skip last ones
|
||||
for batch in range(num_batch):
|
||||
start = batch * self.batch_size
|
||||
end = start + self.batch_size
|
||||
inds_b = inds_k[start:end] # b for batch
|
||||
obs_b = {"state": obs_k["state"][inds_b]}
|
||||
chains_b = chains_k[inds_b]
|
||||
returns_b = returns_k[inds_b]
|
||||
values_b = values_k[inds_b]
|
||||
advantages_b = advantages_k[inds_b]
|
||||
logprobs_b = logprobs_k[inds_b]
|
||||
batch_inds_b, denoising_inds_b = torch.unravel_index(
|
||||
inds_b,
|
||||
(self.n_steps * self.n_envs, self.model.ft_denoising_steps),
|
||||
)
|
||||
obs_b = {"state": obs_k["state"][batch_inds_b]}
|
||||
chains_prev_b = chains_k[batch_inds_b, denoising_inds_b]
|
||||
chains_next_b = chains_k[batch_inds_b, denoising_inds_b + 1]
|
||||
returns_b = returns_k[batch_inds_b]
|
||||
values_b = values_k[batch_inds_b]
|
||||
advantages_b = advantages_k[batch_inds_b]
|
||||
logprobs_b = logprobs_k[batch_inds_b, denoising_inds_b]
|
||||
|
||||
# get loss
|
||||
(
|
||||
@ -332,7 +338,9 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
||||
eta,
|
||||
) = self.model.loss(
|
||||
obs_b,
|
||||
chains_b,
|
||||
chains_prev_b,
|
||||
chains_next_b,
|
||||
denoising_inds_b,
|
||||
returns_b,
|
||||
values_b,
|
||||
advantages_b,
|
||||
|
@ -283,40 +283,44 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
||||
for k in obs_trajs
|
||||
}
|
||||
chains_k = einops.rearrange(
|
||||
torch.tensor(chains_trajs).float().to(self.device),
|
||||
torch.tensor(chains_trajs, device=self.device).float(),
|
||||
"s e t h d -> (s e) t h d",
|
||||
)
|
||||
returns_k = (
|
||||
torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
values_k = (
|
||||
torch.tensor(values_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(values_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
advantages_k = (
|
||||
torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
|
||||
logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
|
||||
|
||||
# Update policy and critic
|
||||
total_steps = self.n_steps * self.n_envs
|
||||
inds_k = np.arange(total_steps)
|
||||
total_steps = self.n_steps * self.n_envs * self.model.ft_denoising_steps
|
||||
clipfracs = []
|
||||
for update_epoch in range(self.update_epochs):
|
||||
|
||||
# for each epoch, go through all data in batches
|
||||
flag_break = False
|
||||
np.random.shuffle(inds_k)
|
||||
inds_k = torch.randperm(total_steps, device=self.device)
|
||||
num_batch = max(1, total_steps // self.batch_size) # skip last ones
|
||||
for batch in range(num_batch):
|
||||
start = batch * self.batch_size
|
||||
end = start + self.batch_size
|
||||
inds_b = inds_k[start:end] # b for batch
|
||||
obs_b = {k: obs_k[k][inds_b] for k in obs_k}
|
||||
chains_b = chains_k[inds_b]
|
||||
returns_b = returns_k[inds_b]
|
||||
values_b = values_k[inds_b]
|
||||
advantages_b = advantages_k[inds_b]
|
||||
logprobs_b = logprobs_k[inds_b]
|
||||
batch_inds_b, denoising_inds_b = torch.unravel_index(
|
||||
inds_b,
|
||||
(self.n_steps * self.n_envs, self.model.ft_denoising_steps),
|
||||
)
|
||||
obs_b = {k: obs_k[k][batch_inds_b] for k in obs_k}
|
||||
chains_prev_b = chains_k[batch_inds_b, denoising_inds_b]
|
||||
chains_next_b = chains_k[batch_inds_b, denoising_inds_b + 1]
|
||||
returns_b = returns_k[batch_inds_b]
|
||||
values_b = values_k[batch_inds_b]
|
||||
advantages_b = advantages_k[batch_inds_b]
|
||||
logprobs_b = logprobs_k[batch_inds_b, denoising_inds_b]
|
||||
|
||||
# get loss
|
||||
(
|
||||
@ -330,7 +334,9 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
||||
eta,
|
||||
) = self.model.loss(
|
||||
obs_b,
|
||||
chains_b,
|
||||
chains_prev_b,
|
||||
chains_next_b,
|
||||
denoising_inds_b,
|
||||
returns_b,
|
||||
values_b,
|
||||
advantages_b,
|
||||
|
@ -249,29 +249,28 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
||||
)
|
||||
}
|
||||
samples_k = einops.rearrange(
|
||||
torch.tensor(samples_trajs).float().to(self.device),
|
||||
torch.tensor(samples_trajs, device=self.device).float(),
|
||||
"s e h d -> (s e) h d",
|
||||
)
|
||||
returns_k = (
|
||||
torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
values_k = (
|
||||
torch.tensor(values_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(values_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
advantages_k = (
|
||||
torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
|
||||
logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
|
||||
|
||||
# Update policy and critic
|
||||
total_steps = self.n_steps * self.n_envs
|
||||
inds_k = np.arange(total_steps)
|
||||
clipfracs = []
|
||||
for update_epoch in range(self.update_epochs):
|
||||
|
||||
# for each epoch, go through all data in batches
|
||||
flag_break = False
|
||||
np.random.shuffle(inds_k)
|
||||
inds_k = torch.randperm(total_steps, device=self.device)
|
||||
num_batch = max(1, total_steps // self.batch_size) # skip last ones
|
||||
for batch in range(num_batch):
|
||||
start = batch * self.batch_size
|
||||
|
@ -210,7 +210,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
||||
)
|
||||
reward_trajs = reward_trajs_transpose.T
|
||||
|
||||
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
|
||||
# bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
|
||||
obs_venv_ts = {
|
||||
"state": torch.from_numpy(obs_venv["state"])
|
||||
.float()
|
||||
@ -250,31 +250,28 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
||||
)
|
||||
}
|
||||
samples_k = einops.rearrange(
|
||||
torch.tensor(samples_trajs).float().to(self.device),
|
||||
torch.tensor(samples_trajs, device=self.device).float(),
|
||||
"s e h d -> (s e) h d",
|
||||
)
|
||||
returns_k = (
|
||||
torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
values_k = (
|
||||
torch.tensor(values_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(values_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
advantages_k = (
|
||||
torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
|
||||
)
|
||||
logprobs_k = (
|
||||
torch.tensor(logprobs_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
|
||||
|
||||
# Update policy and critic
|
||||
total_steps = self.n_steps * self.n_envs
|
||||
inds_k = np.arange(total_steps)
|
||||
clipfracs = []
|
||||
for update_epoch in range(self.update_epochs):
|
||||
|
||||
# for each epoch, go through all data in batches
|
||||
flag_break = False
|
||||
np.random.shuffle(inds_k)
|
||||
inds_k = torch.randperm(total_steps, device=self.device)
|
||||
num_batch = max(1, total_steps // self.batch_size) # skip last ones
|
||||
for batch in range(num_batch):
|
||||
start = batch * self.batch_size
|
||||
|
@ -231,7 +231,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
||||
)
|
||||
reward_trajs = reward_trajs_transpose.T
|
||||
|
||||
# bootstrap value with GAE if not done - apply reward scaling with constant if specified
|
||||
# bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
|
||||
obs_venv_ts = {
|
||||
key: torch.from_numpy(obs_venv[key]).float().to(self.device)
|
||||
for key in self.obs_dims
|
||||
@ -271,29 +271,28 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
||||
for k in obs_trajs
|
||||
}
|
||||
samples_k = einops.rearrange(
|
||||
torch.tensor(samples_trajs).float().to(self.device),
|
||||
torch.tensor(samples_trajs, device=self.device).float(),
|
||||
"s e h d -> (s e) h d",
|
||||
)
|
||||
returns_k = (
|
||||
torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
values_k = (
|
||||
torch.tensor(values_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(values_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
advantages_k = (
|
||||
torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
|
||||
torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
|
||||
)
|
||||
logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
|
||||
logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
|
||||
|
||||
# Update policy and critic
|
||||
total_steps = self.n_steps * self.n_envs
|
||||
inds_k = np.arange(total_steps)
|
||||
clipfracs = []
|
||||
for update_epoch in range(self.update_epochs):
|
||||
|
||||
# for each epoch, go through all data in batches
|
||||
flag_break = False
|
||||
np.random.shuffle(inds_k)
|
||||
inds_k = torch.randperm(total_steps, device=self.device)
|
||||
num_batch = max(1, total_steps // self.batch_size) # skip last ones
|
||||
for batch in range(num_batch):
|
||||
start = batch * self.batch_size
|
||||
|
@ -68,7 +68,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 8800
|
||||
batch_size: 40000
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
@ -68,7 +68,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 8800
|
||||
batch_size: 40000
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
@ -68,7 +68,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 8800
|
||||
batch_size: 17600
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
@ -68,7 +68,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 8800
|
||||
batch_size: 30000
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
@ -68,7 +68,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 8800
|
||||
batch_size: 30000
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
@ -68,7 +68,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 8800
|
||||
batch_size: 40000
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
@ -68,7 +68,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 8800
|
||||
batch_size: 40000
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
61
cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml
Normal file
61
cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml
Normal file
@ -0,0 +1,61 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
|
||||
|
||||
name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path:
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-mixed-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
act_steps: 4
|
||||
|
||||
n_steps: 70
|
||||
render_num: 0
|
||||
|
||||
env:
|
||||
n_envs: 40
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion.DiffusionModel
|
||||
predict_epsilon: True
|
||||
denoised_clip_value: 1.0
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
network:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 16
|
||||
mlp_dims: [256, 256, 256]
|
||||
cond_mlp_dims: [128, 32]
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path:
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_calql_mlp_ta1/2024-09-29_22-59-08_42/checkpoint/state_49.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
@ -92,7 +92,7 @@ model:
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
|
@ -68,7 +68,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 5000
|
||||
batch_size: 50000
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
108
cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml
Normal file
108
cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml
Normal file
@ -0,0 +1,108 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
|
||||
|
||||
name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta1_td20/2024-09-29_02-13-10_42/checkpoint/state_1000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 20
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env_name}-finetune
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 501
|
||||
n_critic_warmup_itr: 0
|
||||
n_steps: 1000
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# PPO specific
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 10000
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.99
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.01
|
||||
clip_ploss_coef_rate: 3
|
||||
randn_clip_value: 3
|
||||
min_sampling_denoising_std: 0.1
|
||||
min_logprob_denoising_std: 0.1
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
@ -8,7 +8,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
|
||||
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
base_policy_path:
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_gaussian_mlp_ta1/2024-09-28_18-48-54_42/checkpoint/state_500.pt
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
@ -87,7 +87,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
|
@ -68,7 +68,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 5000
|
||||
batch_size: 50000
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
@ -1,89 +0,0 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_sac_agent.TrainSACAgent
|
||||
|
||||
name: ${env_name}_sac_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: sac-gym-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 3e-4
|
||||
critic_lr: 1e-3
|
||||
save_model_freq: 100000
|
||||
val_freq: 10000
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 200
|
||||
# SAC specific
|
||||
batch_size: 256
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1
|
||||
critic_replay_ratio: 256
|
||||
actor_replay_ratio: 128
|
||||
buffer_size: 1000000
|
||||
n_eval_episode: 10
|
||||
n_explore_steps: 5000
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_sac.SAC_Gaussian
|
||||
randn_clip_value: 10
|
||||
tanh_output: True # squash after sampling
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic: # no layernorm
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256]
|
||||
activation_type: ReLU
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
116
cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml
Normal file
116
cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml
Normal file
@ -0,0 +1,116 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_calql_mlp_ta1/2024-10-26_01-01-33_42/checkpoint/state_999.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-complete-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: calql-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 10000
|
||||
n_steps: 1 # not used
|
||||
n_episode_per_epoch: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 3e-4
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
save_model_freq: 100
|
||||
val_freq: 20
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 1
|
||||
# CalQL specific
|
||||
train_online: True
|
||||
batch_size: 256
|
||||
n_random_actions: 10
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1.0
|
||||
num_update: 1000
|
||||
buffer_size: 1000000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
automatic_entropy_tuning: True
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_calql.CalQL_Gaussian
|
||||
randn_clip_value: 3
|
||||
cql_min_q_weight: 5.0
|
||||
tanh_output: True
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
discount_factor: ${train.gamma}
|
||||
get_mc_return: True
|
108
cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml
Normal file
108
cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml
Normal file
@ -0,0 +1,108 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
|
||||
|
||||
name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-47-42_42/checkpoint/state_8000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-complete-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
act_steps: 4
|
||||
|
||||
env:
|
||||
n_envs: 40
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env_name}-finetune
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 301
|
||||
n_critic_warmup_itr: 0
|
||||
n_steps: 70
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# PPO specific
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 5600
|
||||
update_epochs: 10
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.99
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.01
|
||||
clip_ploss_coef_rate: 3
|
||||
randn_clip_value: 3
|
||||
min_sampling_denoising_std: 0.1
|
||||
min_logprob_denoising_std: 0.1
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 16
|
||||
mlp_dims: [256, 256, 256]
|
||||
cond_mlp_dims: [128, 32]
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
109
cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml
Normal file
109
cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml
Normal file
@ -0,0 +1,109 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
|
||||
|
||||
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_pre_gaussian_mlp_ta1/2024-10-25_14-48-43_42/checkpoint/state_5000.pt
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-complete-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: ibrl-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 50000
|
||||
val_freq: 5000
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 200
|
||||
# IBRL specific
|
||||
batch_size: 256
|
||||
target_ema_rate: 0.01
|
||||
scale_reward_factor: 1
|
||||
critic_num_update: 5
|
||||
buffer_size: 500000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
update_freq: 2
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_ibrl.IBRL_Gaussian
|
||||
randn_clip_value: 3
|
||||
n_critics: 5
|
||||
soft_action_sample: True
|
||||
soft_action_sample_beta: 10
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
activation_type: ReLU
|
||||
dropout: 0.5
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: False # use ensemble
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
max_n_episodes: 50
|
@ -7,15 +7,15 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path:
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_calql_mlp_ta1/2024-10-25_21-36-13_42/checkpoint/state_999.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
env_name: kitchen-mixed-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
@ -23,10 +23,10 @@ act_steps: 1
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
@ -59,7 +59,7 @@ train:
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
val_freq: 20
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
@ -67,13 +67,12 @@ train:
|
||||
# CalQL specific
|
||||
train_online: True
|
||||
batch_size: 256
|
||||
n_random_actions: 4
|
||||
n_random_actions: 10
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1.0
|
||||
num_update: 1000
|
||||
buffer_size: 1000000
|
||||
online_utd_ratio: 1
|
||||
n_eval_episode: 10
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
@ -87,17 +86,17 @@ model:
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256]
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256]
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: True
|
108
cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml
Normal file
108
cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml
Normal file
@ -0,0 +1,108 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
|
||||
|
||||
name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-28_42/checkpoint/state_8000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-mixed-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
act_steps: 4
|
||||
|
||||
env:
|
||||
n_envs: 40
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env_name}-finetune
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 301
|
||||
n_critic_warmup_itr: 0
|
||||
n_steps: 70
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# PPO specific
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 5600
|
||||
update_epochs: 10
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.99
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.01
|
||||
clip_ploss_coef_rate: 3
|
||||
randn_clip_value: 3
|
||||
min_sampling_denoising_std: 0.1
|
||||
min_logprob_denoising_std: 0.1
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 16
|
||||
mlp_dims: [256, 256, 256]
|
||||
cond_mlp_dims: [128, 32]
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
@ -8,14 +8,14 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
|
||||
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
base_policy_path:
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_pre_gaussian_mlp_ta1/2024-10-25_01-39-44_42/checkpoint/state_5000.pt
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
env_name: kitchen-mixed-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
@ -23,10 +23,10 @@ act_steps: 1
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
@ -42,7 +42,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 250000
|
||||
n_train_itr: 1000000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
@ -51,14 +51,14 @@ train:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 50000
|
||||
val_freq: 2000
|
||||
val_freq: 5000
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
@ -68,8 +68,8 @@ train:
|
||||
target_ema_rate: 0.01
|
||||
scale_reward_factor: 1
|
||||
critic_num_update: 5
|
||||
buffer_size: 1000000
|
||||
n_eval_episode: 10
|
||||
buffer_size: 500000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
update_freq: 2
|
||||
|
||||
@ -78,19 +78,19 @@ model:
|
||||
randn_clip_value: 3
|
||||
n_critics: 5
|
||||
soft_action_sample: True
|
||||
soft_action_sample_beta: 0.1
|
||||
network_path: ${base_policy_path}
|
||||
soft_action_sample_beta: 10
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
activation_type: ReLU
|
||||
dropout: 0.5
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: False # use ensemble
|
||||
@ -106,3 +106,4 @@ offline_dataset:
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
max_n_episodes: 50
|
116
cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml
Normal file
116
cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml
Normal file
@ -0,0 +1,116 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_calql_mlp_ta1/2024-10-25_21-26-51_42/checkpoint/state_980.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-partial-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: calql-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 10000
|
||||
n_steps: 1 # not used
|
||||
n_episode_per_epoch: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 3e-4
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
save_model_freq: 100
|
||||
val_freq: 20
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 1
|
||||
# CalQL specific
|
||||
train_online: True
|
||||
batch_size: 256
|
||||
n_random_actions: 10
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1.0
|
||||
num_update: 1000
|
||||
buffer_size: 1000000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
automatic_entropy_tuning: True
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_calql.CalQL_Gaussian
|
||||
randn_clip_value: 3
|
||||
cql_min_q_weight: 5.0
|
||||
tanh_output: True
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
discount_factor: ${train.gamma}
|
||||
get_mc_return: True
|
108
cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml
Normal file
108
cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml
Normal file
@ -0,0 +1,108 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
|
||||
|
||||
name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-29_42/checkpoint/state_8000.pt
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-partial-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 4
|
||||
act_steps: 4
|
||||
|
||||
env:
|
||||
n_envs: 40
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env_name}-finetune
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 301
|
||||
n_critic_warmup_itr: 0
|
||||
n_steps: 70
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# PPO specific
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 5600
|
||||
update_epochs: 10
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.99
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.01
|
||||
clip_ploss_coef_rate: 3
|
||||
randn_clip_value: 3
|
||||
min_sampling_denoising_std: 0.1
|
||||
min_logprob_denoising_std: 0.1
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 16
|
||||
mlp_dims: [256, 256, 256]
|
||||
cond_mlp_dims: [128, 32]
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
109
cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml
Normal file
109
cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml
Normal file
@ -0,0 +1,109 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
|
||||
|
||||
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_pre_gaussian_mlp_ta1/2024-10-25_01-45-52_42/checkpoint/state_5000.pt
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-partial-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: ibrl-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 50000
|
||||
val_freq: 5000
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 200
|
||||
# IBRL specific
|
||||
batch_size: 256
|
||||
target_ema_rate: 0.01
|
||||
scale_reward_factor: 1
|
||||
critic_num_update: 5
|
||||
buffer_size: 500000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
update_freq: 2
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_ibrl.IBRL_Gaussian
|
||||
randn_clip_value: 3
|
||||
n_critics: 5
|
||||
soft_action_sample: True
|
||||
soft_action_sample_beta: 10
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
activation_type: ReLU
|
||||
dropout: 0.5
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: False # use ensemble
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
max_n_episodes: 50
|
@ -68,7 +68,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 5000
|
||||
batch_size: 50000
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
@ -1,103 +0,0 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
|
||||
|
||||
name: ${env_name}_rlpd_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: walker2d-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
denoising_steps: 20
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 40
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: rlpd-gym-${env_name}-finetune
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 2000
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# RLPD specific
|
||||
batch_size: 512
|
||||
entropy_temperature: 1.0 # alpha in RLPD paper
|
||||
target_ema_rate: 0.005 # rho in RLPD paper
|
||||
scale_reward_factor: 1.0 # multiply reward by this amount for more stable value estimation
|
||||
replay_ratio: 64 # number of batches to sample for each learning update
|
||||
buffer_size: 1000000
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_rlpd.RLPD_Gaussian
|
||||
randn_clip_value: 3
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
use_layernorm: True
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
n_critics: 2 # Ensemble size for critic models
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
@ -88,7 +88,7 @@ model:
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
|
113
cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml
Normal file
113
cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml
Normal file
@ -0,0 +1,113 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-complete-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: calql-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_steps: 1 # not used
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 20
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 1
|
||||
# CalQL specific
|
||||
train_online: False
|
||||
batch_size: 256
|
||||
n_random_actions: 10
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1.0
|
||||
num_update: 1000
|
||||
buffer_size: 1000000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
automatic_entropy_tuning: True
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_calql.CalQL_Gaussian
|
||||
randn_clip_value: 3
|
||||
cql_min_q_weight: 5.0
|
||||
tanh_output: True
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
discount_factor: ${train.gamma}
|
||||
get_mc_return: True
|
66
cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml
Normal file
66
cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml
Normal file
@ -0,0 +1,66 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
|
||||
|
||||
name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env: kitchen-complete-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
denoising_steps: 20
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env}-pretrain
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_epochs: 8000
|
||||
batch_size: 128
|
||||
learning_rate: 1e-3
|
||||
weight_decay: 1e-6
|
||||
lr_scheduler:
|
||||
first_cycle_steps: 8000
|
||||
warmup_steps: 1
|
||||
min_lr: 1e-4
|
||||
epoch_start_ema: 10
|
||||
update_ema_freq: 5
|
||||
save_model_freq: 1000
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion.DiffusionModel
|
||||
predict_epsilon: True
|
||||
denoised_clip_value: 1.0
|
||||
network:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 16
|
||||
mlp_dims: [256, 256, 256]
|
||||
cond_mlp_dims: [128, 32]
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
||||
|
||||
ema:
|
||||
decay: 0.995
|
||||
|
||||
train_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceDataset
|
||||
dataset_path: ${train_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
60
cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml
Normal file
60
cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml
Normal file
@ -0,0 +1,60 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
|
||||
|
||||
name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env: kitchen-complete-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
horizon_steps: 1
|
||||
cond_steps: 1
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env}-pretrain
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_epochs: 5000
|
||||
batch_size: 256
|
||||
learning_rate: 1e-4
|
||||
weight_decay: 0
|
||||
lr_scheduler:
|
||||
first_cycle_steps: 5000
|
||||
warmup_steps: 100
|
||||
min_lr: 1e-4
|
||||
epoch_start_ema: 20
|
||||
update_ema_freq: 10
|
||||
save_model_freq: 1000
|
||||
|
||||
model:
|
||||
_target_: model.common.gaussian.GaussianModel
|
||||
network:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
activation_type: ReLU
|
||||
dropout: 0.5
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
ema:
|
||||
decay: 0.995
|
||||
|
||||
train_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceDataset
|
||||
dataset_path: ${train_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
@ -6,15 +6,15 @@ hydra:
|
||||
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
env_name: kitchen-mixed-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
@ -22,10 +22,10 @@ act_steps: 1
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
@ -41,7 +41,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 100
|
||||
n_train_itr: 1000
|
||||
n_steps: 1 # not used
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
@ -50,14 +50,14 @@ train:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 3e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 10
|
||||
val_freq: 10
|
||||
val_freq: 20
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
@ -65,12 +65,12 @@ train:
|
||||
# CalQL specific
|
||||
train_online: False
|
||||
batch_size: 256
|
||||
n_random_actions: 4
|
||||
n_random_actions: 10
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1.0
|
||||
num_update: 1000
|
||||
buffer_size: 1000000
|
||||
n_eval_episode: 10
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
@ -83,17 +83,17 @@ model:
|
||||
tanh_output: True
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256]
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256]
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: True
|
66
cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml
Normal file
66
cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml
Normal file
@ -0,0 +1,66 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
|
||||
|
||||
name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env: kitchen-mixed-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
denoising_steps: 20
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env}-pretrain
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_epochs: 8000
|
||||
batch_size: 256
|
||||
learning_rate: 1e-3
|
||||
weight_decay: 1e-6
|
||||
lr_scheduler:
|
||||
first_cycle_steps: 8000
|
||||
warmup_steps: 1
|
||||
min_lr: 1e-4
|
||||
epoch_start_ema: 10
|
||||
update_ema_freq: 5
|
||||
save_model_freq: 1000
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion.DiffusionModel
|
||||
predict_epsilon: True
|
||||
denoised_clip_value: 1.0
|
||||
network:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 16
|
||||
mlp_dims: [256, 256, 256]
|
||||
cond_mlp_dims: [128, 32]
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
||||
|
||||
ema:
|
||||
decay: 0.995
|
||||
|
||||
train_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceDataset
|
||||
dataset_path: ${train_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
59
cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml
Normal file
59
cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml
Normal file
@ -0,0 +1,59 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
|
||||
|
||||
name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env: kitchen-mixed-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env}-pretrain
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_epochs: 5000
|
||||
batch_size: 128
|
||||
learning_rate: 1e-3
|
||||
weight_decay: 1e-6
|
||||
lr_scheduler:
|
||||
first_cycle_steps: 5000
|
||||
warmup_steps: 1
|
||||
min_lr: 1e-4
|
||||
epoch_start_ema: 10
|
||||
update_ema_freq: 5
|
||||
save_model_freq: 1000
|
||||
|
||||
model:
|
||||
_target_: model.common.gaussian.GaussianModel
|
||||
network:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
ema:
|
||||
decay: 0.995
|
||||
|
||||
train_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceDataset
|
||||
dataset_path: ${train_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
113
cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml
Normal file
113
cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml
Normal file
@ -0,0 +1,113 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-partial-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: calql-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_steps: 1 # not used
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 10
|
||||
val_freq: 20
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 1
|
||||
# CalQL specific
|
||||
train_online: False
|
||||
batch_size: 256
|
||||
n_random_actions: 10
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1.0
|
||||
num_update: 1000
|
||||
buffer_size: 1000000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
automatic_entropy_tuning: True
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_calql.CalQL_Gaussian
|
||||
randn_clip_value: 3
|
||||
cql_min_q_weight: 5.0
|
||||
tanh_output: True
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
discount_factor: ${train.gamma}
|
||||
get_mc_return: True
|
66
cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml
Normal file
66
cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml
Normal file
@ -0,0 +1,66 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
|
||||
|
||||
name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env: kitchen-partial-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
denoising_steps: 20
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env}-pretrain
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_epochs: 8000
|
||||
batch_size: 128
|
||||
learning_rate: 1e-3
|
||||
weight_decay: 1e-5
|
||||
lr_scheduler:
|
||||
first_cycle_steps: 8000
|
||||
warmup_steps: 1
|
||||
min_lr: 1e-4
|
||||
epoch_start_ema: 10
|
||||
update_ema_freq: 5
|
||||
save_model_freq: 1000
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion.DiffusionModel
|
||||
predict_epsilon: True
|
||||
denoised_clip_value: 1.0
|
||||
network:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 16
|
||||
mlp_dims: [256, 256, 256]
|
||||
cond_mlp_dims: [128, 32]
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
||||
|
||||
ema:
|
||||
decay: 0.995
|
||||
|
||||
train_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceDataset
|
||||
dataset_path: ${train_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
59
cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml
Normal file
59
cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml
Normal file
@ -0,0 +1,59 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
|
||||
|
||||
name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env: kitchen-partial-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
horizon_steps: 4
|
||||
cond_steps: 1
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env}-pretrain
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_epochs: 5000
|
||||
batch_size: 128
|
||||
learning_rate: 1e-3
|
||||
weight_decay: 1e-6
|
||||
lr_scheduler:
|
||||
first_cycle_steps: 5000
|
||||
warmup_steps: 1
|
||||
min_lr: 1e-4
|
||||
epoch_start_ema: 10
|
||||
update_ema_freq: 5
|
||||
save_model_freq: 1000
|
||||
|
||||
model:
|
||||
_target_: model.common.gaussian.GaussianModel
|
||||
network:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
ema:
|
||||
decay: 0.995
|
||||
|
||||
train_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceDataset
|
||||
dataset_path: ${train_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
@ -14,8 +14,8 @@ device: cuda:0
|
||||
env_name: halfcheetah-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 20
|
||||
denoising_steps: 10
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
@ -67,7 +67,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 1000
|
||||
batch_size: 10000
|
||||
update_epochs: 10
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
@ -53,7 +53,7 @@ train:
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 10000
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
@ -86,7 +86,7 @@ model:
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
@ -75,7 +75,7 @@ model:
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic: # no layernorm
|
99
cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml
Normal file
99
cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml
Normal file
@ -0,0 +1,99 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
|
||||
|
||||
name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 10
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env_name}-scratch
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_critic_warmup_itr: 0
|
||||
n_steps: 1000
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# AWR specific
|
||||
scale_reward_factor: 0.01
|
||||
max_adv_weight: 100
|
||||
beta: 10
|
||||
buffer_size: 100000 # * n_envs
|
||||
batch_size: 256
|
||||
replay_ratio: 128
|
||||
critic_update_ratio: 4
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_awr.AWRDiffusion
|
||||
# Sampling HPs
|
||||
min_sampling_denoising_std: 0.10
|
||||
randn_clip_value: 3
|
||||
#
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
residual_style: True
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
101
cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml
Normal file
101
cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml
Normal file
@ -0,0 +1,101 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
|
||||
|
||||
name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 10
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env_name}-scratch
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_critic_warmup_itr: 0
|
||||
n_steps: 1000
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# DIPO specific
|
||||
scale_reward_factor: 0.01
|
||||
target_ema_rate: 0.005
|
||||
buffer_size: 1000000
|
||||
action_lr: 0.0001
|
||||
action_gradient_steps: 10
|
||||
replay_ratio: 128
|
||||
batch_size: 256
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_dipo.DIPODiffusion
|
||||
# Sampling HPs
|
||||
min_sampling_denoising_std: 0.10
|
||||
randn_clip_value: 3
|
||||
#
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
residual_style: True
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
100
cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml
Normal file
100
cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml
Normal file
@ -0,0 +1,100 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
|
||||
|
||||
name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 10
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env_name}-scratch
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_critic_warmup_itr: 0
|
||||
n_steps: 1000
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# DQL specific
|
||||
scale_reward_factor: 0.01
|
||||
target_ema_rate: 0.005
|
||||
buffer_size: 1000000
|
||||
eta: 1.0
|
||||
replay_ratio: 128
|
||||
batch_size: 256
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_dql.DQLDiffusion
|
||||
# Sampling HPs
|
||||
min_sampling_denoising_std: 0.10
|
||||
randn_clip_value: 3
|
||||
#
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
residual_style: True
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
108
cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml
Normal file
108
cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml
Normal file
@ -0,0 +1,108 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
|
||||
|
||||
name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 10
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env_name}-scratch
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_critic_warmup_itr: 0
|
||||
n_steps: 1000
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# IDQL specific
|
||||
scale_reward_factor: 0.01
|
||||
eval_deterministic: True
|
||||
eval_sample_num: 10 # how many samples to score during eval
|
||||
critic_tau: 0.001 # rate of target q network update
|
||||
use_expectile_exploration: True
|
||||
buffer_size: 100000 # * n_envs
|
||||
replay_ratio: 128
|
||||
batch_size: 256
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_idql.IDQLDiffusion
|
||||
# Sampling HPs
|
||||
min_sampling_denoising_std: 0.10
|
||||
randn_clip_value: 3
|
||||
#
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
residual_style: True
|
||||
critic_q:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
critic_v:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
@ -14,8 +14,8 @@ device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 20
|
||||
denoising_steps: 10
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
@ -55,7 +55,7 @@ train:
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 10000
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
@ -67,7 +67,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 1000
|
||||
batch_size: 10000
|
||||
update_epochs: 10
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
@ -94,10 +94,10 @@ model:
|
||||
residual_style: True
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
@ -53,7 +53,7 @@ train:
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 10000
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
100
cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml
Normal file
100
cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml
Normal file
@ -0,0 +1,100 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
|
||||
|
||||
name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 10
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env_name}-scratch
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_critic_warmup_itr: 0
|
||||
n_steps: 1000
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# QSM specific
|
||||
scale_reward_factor: 0.01
|
||||
q_grad_coeff: 50
|
||||
critic_tau: 0.005
|
||||
buffer_size: 100000 # * n_envs
|
||||
replay_ratio: 128
|
||||
batch_size: 256
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_qsm.QSMDiffusion
|
||||
# Sampling HPs
|
||||
min_sampling_denoising_std: 0.10
|
||||
randn_clip_value: 3
|
||||
#
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
residual_style: True
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
84
cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml
Normal file
84
cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml
Normal file
@ -0,0 +1,84 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
|
||||
|
||||
name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 10
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: gym-${env_name}-scratch
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_critic_warmup_itr: 0
|
||||
n_steps: 1000
|
||||
gamma: 0.99
|
||||
lr: 1e-4
|
||||
weight_decay: 0
|
||||
lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# RWR specific
|
||||
max_reward_weight: 100
|
||||
beta: 10
|
||||
batch_size: 256
|
||||
update_epochs: 128
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_rwr.RWRDiffusion
|
||||
# Sampling HPs
|
||||
min_sampling_denoising_std: 0.1
|
||||
randn_clip_value: 3
|
||||
#
|
||||
network:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
residual_style: True
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
109
cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml
Normal file
109
cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml
Normal file
@ -0,0 +1,109 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
|
||||
|
||||
name: ${env_name}_rlpd_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-complete-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: rlpd-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 3e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 50000
|
||||
val_freq: 5000
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 200
|
||||
# RLPD specific
|
||||
batch_size: 256
|
||||
target_ema_rate: 0.01
|
||||
scale_reward_factor: 1
|
||||
critic_num_update: 10
|
||||
buffer_size: 400000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_rlpd.RLPD_Gaussian
|
||||
randn_clip_value: 10
|
||||
tanh_output: True # squash after sampling
|
||||
backup_entropy: True
|
||||
n_critics: 5 # Ensemble size for critic models
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: False # use ensemble
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
@ -12,9 +12,9 @@ offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: hopper-medium-v2
|
||||
obs_dim: 11
|
||||
action_dim: 3
|
||||
env_name: kitchen-mixed-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
@ -22,10 +22,10 @@ act_steps: 1
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 1000
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 3
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
@ -41,7 +41,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 250000
|
||||
n_train_itr: 1000000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 3e-4
|
||||
@ -50,12 +50,12 @@ train:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
critic_lr: 3e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 50000
|
||||
val_freq: 5000
|
||||
render:
|
||||
@ -64,12 +64,12 @@ train:
|
||||
log_freq: 200
|
||||
# RLPD specific
|
||||
batch_size: 256
|
||||
target_ema_rate: 0.005
|
||||
target_ema_rate: 0.01
|
||||
scale_reward_factor: 1
|
||||
critic_num_update: 20
|
||||
buffer_size: 1000000
|
||||
n_eval_episode: 10
|
||||
n_explore_steps: 5000
|
||||
critic_num_update: 10
|
||||
buffer_size: 400000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
|
||||
@ -78,20 +78,20 @@ model:
|
||||
randn_clip_value: 10
|
||||
tanh_output: True # squash after sampling
|
||||
backup_entropy: True
|
||||
n_critics: 10 # Ensemble size for critic models
|
||||
n_critics: 5 # Ensemble size for critic models
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256]
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256]
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: False # use ensemble
|
109
cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml
Normal file
109
cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml
Normal file
@ -0,0 +1,109 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
|
||||
|
||||
name: ${env_name}_rlpd_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: kitchen-partial-v0
|
||||
obs_dim: 60
|
||||
action_dim: 9
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 280
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 4
|
||||
wrappers:
|
||||
mujoco_locomotion_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: rlpd-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 3e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 50000
|
||||
val_freq: 5000
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 200
|
||||
# RLPD specific
|
||||
batch_size: 256
|
||||
target_ema_rate: 0.01
|
||||
scale_reward_factor: 1
|
||||
critic_num_update: 10
|
||||
buffer_size: 400000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_rlpd.RLPD_Gaussian
|
||||
randn_clip_value: 10
|
||||
tanh_output: True # squash after sampling
|
||||
backup_entropy: True
|
||||
n_critics: 5 # Ensemble size for critic models
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: False # use ensemble
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
@ -14,8 +14,8 @@ device: cuda:0
|
||||
env_name: walker2d-medium-v2
|
||||
obs_dim: 17
|
||||
action_dim: 6
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 20
|
||||
denoising_steps: 10
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
@ -67,7 +67,7 @@ train:
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 1000
|
||||
batch_size: 10000
|
||||
update_epochs: 10
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
@ -53,7 +53,7 @@ train:
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 10000
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path:
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_calql_mlp_ta1/2024-10-25_22-30-16_42/checkpoint/state_999.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
|
||||
@ -97,7 +97,7 @@ model:
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
|
122
cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml
Normal file
122
cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml
Normal file
@ -0,0 +1,122 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: can
|
||||
obs_dim: 23
|
||||
action_dim: 7
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 300
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
low_dim_keys: ['robot0_eef_pos',
|
||||
'robot0_eef_quat',
|
||||
'robot0_gripper_qpos',
|
||||
'object'] # same order of preprocessed observations
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: calql-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_steps: 1 # not used
|
||||
n_episode_per_epoch: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 3e-4
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 1
|
||||
# CalQL specific
|
||||
train_online: True
|
||||
batch_size: 256
|
||||
n_random_actions: 4
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1.0
|
||||
num_update: 1000
|
||||
buffer_size: 1000000
|
||||
online_utd_ratio: 1
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
automatic_entropy_tuning: True
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_calql.CalQL_Gaussian
|
||||
randn_clip_value: 3
|
||||
cql_min_q_weight: 5.0
|
||||
tanh_output: True
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
discount_factor: ${train.gamma}
|
||||
get_mc_return: True
|
@ -26,7 +26,7 @@ env:
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 300
|
||||
save_video: false
|
||||
save_video: False
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 151
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
|
@ -26,7 +26,7 @@ env:
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 300
|
||||
save_video: false
|
||||
save_video: False
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 151
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
|
@ -47,16 +47,16 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 151
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
actor_lr: 1e-5
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-5
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
|
@ -60,22 +60,22 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 200
|
||||
n_train_itr: 151
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
augment: True
|
||||
grad_accumulate: 15
|
||||
actor_lr: 1e-5
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 200
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-5
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 200
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
@ -96,7 +96,7 @@ train:
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.9
|
||||
gamma_denoising: 0.99
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.001
|
||||
clip_ploss_coef_rate: 3
|
||||
@ -158,10 +158,10 @@ model:
|
||||
embed_style: embed2
|
||||
embed_norm: 0
|
||||
img_cond_steps: ${img_cond_steps}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
|
111
cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml
Normal file
111
cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml
Normal file
@ -0,0 +1,111 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
|
||||
|
||||
name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_diffusion_mlp_ta1_td20/2024-09-29_15-43-07_42/checkpoint/state_8000.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: can
|
||||
obs_dim: 23
|
||||
action_dim: 7
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 50
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 300
|
||||
save_video: False
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
low_dim_keys: ['robot0_eef_pos',
|
||||
'robot0_eef_quat',
|
||||
'robot0_gripper_qpos',
|
||||
'object'] # same order as the preprocessed observations
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: robomimic-${env_name}-finetune
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 301
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# PPO specific
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 15000
|
||||
update_epochs: 10
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.99
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.001
|
||||
clip_ploss_coef_rate: 3
|
||||
randn_clip_value: 3
|
||||
min_sampling_denoising_std: 0.1
|
||||
min_logprob_denoising_std: 0.1
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
111
cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml
Normal file
111
cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml
Normal file
@ -0,0 +1,111 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
|
||||
|
||||
name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: can
|
||||
obs_dim: 23
|
||||
action_dim: 7
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 40
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 300
|
||||
save_video: False
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
low_dim_keys: ['robot0_eef_pos',
|
||||
'robot0_eef_quat',
|
||||
'robot0_gripper_qpos',
|
||||
'object'] # same order as the preprocessed observations
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: robomimic-${env_name}-finetune
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 301
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# PPO specific
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 6000
|
||||
update_epochs: 10
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.9
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.001
|
||||
clip_ploss_coef_rate: 3
|
||||
randn_clip_value: 3
|
||||
min_sampling_denoising_std: 0.1
|
||||
min_logprob_denoising_std: 0.1
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 16
|
||||
mlp_dims: [512, 512, 512]
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 151
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
|
@ -26,7 +26,7 @@ env:
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 300
|
||||
save_video: false
|
||||
save_video: False
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 151
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
|
@ -7,7 +7,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
|
||||
|
||||
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path:
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_gaussian_mlp_ta1/2024-09-28_13-43-59_42/checkpoint/state_5000.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
|
||||
@ -93,7 +93,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
|
115
cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml
Normal file
115
cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml
Normal file
@ -0,0 +1,115 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
|
||||
|
||||
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_gaussian_mlp_ta1/2024-10-08_20-52-04_0/checkpoint/state_5000.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: can
|
||||
obs_dim: 23
|
||||
action_dim: 7
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 250 # IBRL uses 200
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 1
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
low_dim_keys: ['robot0_eef_pos',
|
||||
'robot0_eef_quat',
|
||||
'robot0_gripper_qpos',
|
||||
'object'] # same order as the preprocessed observations
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: ibrl-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-4
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
save_model_freq: 100000
|
||||
val_freq: 10000
|
||||
render:
|
||||
freq: 10000
|
||||
num: 0
|
||||
log_freq: 200
|
||||
# IBRL specific
|
||||
batch_size: 256
|
||||
target_ema_rate: 0.01
|
||||
scale_reward_factor: 1
|
||||
critic_num_update: 3
|
||||
buffer_size: 400000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
update_freq: 2
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_ibrl.IBRL_Gaussian
|
||||
randn_clip_value: 3
|
||||
n_critics: 5
|
||||
soft_action_sample: True
|
||||
soft_action_sample_beta: 10
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
activation_type: ReLU
|
||||
dropout: 0.5
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: False # use ensemble
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
max_n_episodes: 100
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 81
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 81
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 81
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
|
@ -27,7 +27,7 @@ env:
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 300
|
||||
save_video: false
|
||||
save_video: False
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
@ -47,16 +47,16 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 81
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
actor_lr: 1e-5
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-5
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
@ -99,10 +99,10 @@ model:
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
|
@ -60,22 +60,22 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 200
|
||||
n_train_itr: 151
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
augment: True
|
||||
grad_accumulate: 15
|
||||
actor_lr: 1e-5
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 200
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-5
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 200
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
@ -96,7 +96,7 @@ train:
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.9
|
||||
gamma_denoising: 0.99
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.001
|
||||
clip_ploss_coef_rate: 3
|
||||
@ -158,10 +158,10 @@ model:
|
||||
embed_style: embed2
|
||||
embed_norm: 0
|
||||
img_cond_steps: ${img_cond_steps}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 81
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 81
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 300
|
||||
gamma: 0.999
|
||||
|
@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path:
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_calql_mlp_ta1/2024-10-25_22-44-12_42/checkpoint/state_999.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
|
||||
@ -97,7 +97,7 @@ model:
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
|
122
cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml
Normal file
122
cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml
Normal file
@ -0,0 +1,122 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
|
||||
|
||||
name: ${env_name}_calql_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: square
|
||||
obs_dim: 23
|
||||
action_dim: 7
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 400
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
low_dim_keys: ['robot0_eef_pos',
|
||||
'robot0_eef_quat',
|
||||
'robot0_gripper_qpos',
|
||||
'object'] # same order as the preprocessed observations
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: calql-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 10000
|
||||
n_steps: 1 # not used
|
||||
n_episode_per_epoch: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 3e-4
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 3e-4
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
log_freq: 1
|
||||
# CalQL specific
|
||||
train_online: True
|
||||
batch_size: 256
|
||||
n_random_actions: 4
|
||||
target_ema_rate: 0.005
|
||||
scale_reward_factor: 1.0
|
||||
num_update: 1000
|
||||
buffer_size: 1000000
|
||||
online_utd_ratio: 1
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
|
||||
init_temperature: 1
|
||||
automatic_entropy_tuning: True
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_calql.CalQL_Gaussian
|
||||
randn_clip_value: 3
|
||||
cql_min_q_weight: 5.0
|
||||
tanh_output: True
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [512, 512, 512]
|
||||
activation_type: ReLU
|
||||
tanh_output: False # squash after sampling instead
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
std_max: 7.3891
|
||||
std_min: 0.0067
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
discount_factor: ${train.gamma}
|
||||
get_mc_return: True
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
|
@ -47,16 +47,16 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 500
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
actor_lr: 1e-5
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-5
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
@ -100,10 +100,10 @@ model:
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
|
@ -60,7 +60,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 500
|
||||
n_train_itr: 301
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
@ -69,13 +69,13 @@ train:
|
||||
actor_lr: 1e-5
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 500
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-5
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 500
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
@ -96,7 +96,7 @@ train:
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.9
|
||||
gamma_denoising: 0.99
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.001
|
||||
clip_ploss_coef_rate: 3
|
||||
@ -158,10 +158,10 @@ model:
|
||||
embed_style: embed2
|
||||
embed_norm: 0
|
||||
img_cond_steps: ${img_cond_steps}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
|
112
cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml
Normal file
112
cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml
Normal file
@ -0,0 +1,112 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
|
||||
|
||||
name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_diffusion_mlp_ta1_td20/2024-09-29_02-14-14_42/checkpoint/state_8000.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: square
|
||||
obs_dim: 23
|
||||
action_dim: 7
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 50
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 400
|
||||
save_video: false
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
low_dim_keys: ['robot0_eef_pos',
|
||||
'robot0_eef_quat',
|
||||
'robot0_gripper_qpos',
|
||||
'object'] # same order as the preprocessed observations
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: robomimic-${env_name}-finetune
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 301
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# PPO specific
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 20000
|
||||
update_epochs: 10
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.99
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.001
|
||||
clip_ploss_coef_rate: 3
|
||||
randn_clip_value: 3
|
||||
min_sampling_denoising_std: 0.1
|
||||
min_logprob_denoising_std: 0.1
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 32
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
cond_mlp_dims: [512, 64]
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
112
cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml
Normal file
112
cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml
Normal file
@ -0,0 +1,112 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
|
||||
|
||||
name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: square
|
||||
obs_dim: 23
|
||||
action_dim: 7
|
||||
denoising_steps: 20
|
||||
ft_denoising_steps: 10
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 40
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 400
|
||||
save_video: false
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
low_dim_keys: ['robot0_eef_pos',
|
||||
'robot0_eef_quat',
|
||||
'robot0_gripper_qpos',
|
||||
'object'] # same order as the preprocessed observations
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: robomimic-${env_name}-finetune
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 301
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
val_freq: 10
|
||||
render:
|
||||
freq: 1
|
||||
num: 0
|
||||
# PPO specific
|
||||
reward_scale_running: True
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 8000
|
||||
update_epochs: 10
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.9
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.001
|
||||
clip_ploss_coef_rate: 3
|
||||
randn_clip_value: 3
|
||||
min_sampling_denoising_std: 0.1
|
||||
min_logprob_denoising_std: 0.1
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.diffusion.mlp_diffusion.DiffusionMLP
|
||||
time_dim: 32
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
cond_mlp_dims: [512, 64]
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
denoising_steps: ${denoising_steps}
|
||||
device: ${device}
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
|
@ -46,7 +46,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 300
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
|
@ -7,7 +7,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
|
||||
|
||||
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path:
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_gaussian_mlp_ta1/2024-09-28_13-42-43_42/checkpoint/state_5000.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
|
||||
@ -93,7 +93,7 @@ model:
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
|
115
cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml
Normal file
115
cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml
Normal file
@ -0,0 +1,115 @@
|
||||
defaults:
|
||||
- _self_
|
||||
hydra:
|
||||
run:
|
||||
dir: ${logdir}
|
||||
_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
|
||||
|
||||
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
|
||||
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
|
||||
base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_gaussian_mlp_ta1/2024-10-08_20-52-42_0/checkpoint/state_5000.pt
|
||||
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
|
||||
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
|
||||
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
|
||||
|
||||
seed: 42
|
||||
device: cuda:0
|
||||
env_name: square
|
||||
obs_dim: 23
|
||||
action_dim: 7
|
||||
cond_steps: 1
|
||||
horizon_steps: 1
|
||||
act_steps: 1
|
||||
|
||||
env:
|
||||
n_envs: 1
|
||||
name: ${env_name}
|
||||
max_episode_steps: 350 # IBRL uses 300
|
||||
reset_at_iteration: False
|
||||
save_video: False
|
||||
best_reward_threshold_for_success: 1
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
low_dim_keys: ['robot0_eef_pos',
|
||||
'robot0_eef_quat',
|
||||
'robot0_gripper_qpos',
|
||||
'object'] # same order as the preprocessed observations
|
||||
multi_step:
|
||||
n_obs_steps: ${cond_steps}
|
||||
n_action_steps: ${act_steps}
|
||||
max_episode_steps: ${env.max_episode_steps}
|
||||
reset_within_step: True
|
||||
|
||||
wandb:
|
||||
entity: ${oc.env:DPPO_WANDB_ENTITY}
|
||||
project: ibrl-${env_name}
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000000
|
||||
n_steps: 1
|
||||
gamma: 0.99
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-4
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-4
|
||||
save_model_freq: 100000
|
||||
val_freq: 10000
|
||||
render:
|
||||
freq: 10000
|
||||
num: 0
|
||||
log_freq: 200
|
||||
# IBRL specific
|
||||
batch_size: 256
|
||||
target_ema_rate: 0.01
|
||||
scale_reward_factor: 1
|
||||
critic_num_update: 3
|
||||
buffer_size: 400000
|
||||
n_eval_episode: 40
|
||||
n_explore_steps: 0
|
||||
update_freq: 2
|
||||
|
||||
model:
|
||||
_target_: model.rl.gaussian_ibrl.IBRL_Gaussian
|
||||
randn_clip_value: 3
|
||||
n_critics: 5
|
||||
soft_action_sample: True
|
||||
soft_action_sample_beta: 10
|
||||
network_path: ${base_policy_path}
|
||||
actor:
|
||||
_target_: model.common.mlp_gaussian.Gaussian_MLP
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
activation_type: ReLU
|
||||
dropout: 0.5
|
||||
fixed_std: 0.1
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
horizon_steps: ${horizon_steps}
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
mlp_dims: [1024, 1024, 1024]
|
||||
activation_type: ReLU
|
||||
use_layernorm: True
|
||||
double_q: False # use ensemble
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
device: ${device}
|
||||
|
||||
offline_dataset:
|
||||
_target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
|
||||
dataset_path: ${offline_dataset_path}
|
||||
horizon_steps: ${horizon_steps}
|
||||
cond_steps: ${cond_steps}
|
||||
device: ${device}
|
||||
max_n_episodes: 100
|
@ -26,7 +26,7 @@ env:
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 800
|
||||
save_video: false
|
||||
save_video: False
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
@ -49,7 +49,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
@ -58,7 +58,7 @@ train:
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-6
|
||||
min_lr: 1e-5
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
@ -82,7 +82,7 @@ train:
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_awr.AWRDiffusion
|
||||
# Sampling HPs
|
||||
min_sampling_denoising_std: 0.08
|
||||
min_sampling_denoising_std: 0.1
|
||||
randn_clip_value: 3
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
|
@ -49,7 +49,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
@ -58,7 +58,7 @@ train:
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-6
|
||||
min_lr: 1e-5
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
@ -82,7 +82,7 @@ train:
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_dipo.DIPODiffusion
|
||||
# HP to tune
|
||||
min_sampling_denoising_std: 0.08
|
||||
min_sampling_denoising_std: 0.1
|
||||
randn_clip_value: 3
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
@ -96,12 +96,12 @@ model:
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObsAct
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
action_dim: ${action_dim}
|
||||
action_steps: ${act_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
action_dim: ${action_dim}
|
||||
|
@ -26,7 +26,7 @@ env:
|
||||
name: ${env_name}
|
||||
best_reward_threshold_for_success: 1
|
||||
max_episode_steps: 800
|
||||
save_video: false
|
||||
save_video: False
|
||||
wrappers:
|
||||
robomimic_lowdim:
|
||||
normalization_path: ${normalization_path}
|
||||
@ -49,8 +49,8 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_critic_warmup_itr: 2
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
actor_lr: 1e-5
|
||||
@ -58,7 +58,7 @@ train:
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-6
|
||||
min_lr: 1e-5
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
@ -81,7 +81,7 @@ train:
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_dql.DQLDiffusion
|
||||
# Sampling HPs
|
||||
min_sampling_denoising_std: 0.08
|
||||
min_sampling_denoising_std: 0.1
|
||||
randn_clip_value: 3
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
|
@ -49,7 +49,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
@ -58,7 +58,7 @@ train:
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-6
|
||||
min_lr: 1e-5
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
@ -83,7 +83,7 @@ train:
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_idql.IDQLDiffusion
|
||||
# Sampling HPs
|
||||
min_sampling_denoising_std: 0.08
|
||||
min_sampling_denoising_std: 0.1
|
||||
randn_clip_value: 3
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
|
@ -50,16 +50,16 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
actor_lr: 1e-5
|
||||
actor_lr: 1e-4
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-6
|
||||
min_lr: 1e-4
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
@ -76,7 +76,7 @@ train:
|
||||
reward_scale_const: 1.0
|
||||
gae_lambda: 0.95
|
||||
batch_size: 10000
|
||||
update_epochs: 8
|
||||
update_epochs: 5
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
||||
@ -88,7 +88,7 @@ model:
|
||||
clip_ploss_coef_base: 0.001
|
||||
clip_ploss_coef_rate: 3
|
||||
randn_clip_value: 3
|
||||
min_sampling_denoising_std: 0.08
|
||||
min_sampling_denoising_std: 0.1
|
||||
min_logprob_denoising_std: 0.1
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
@ -102,10 +102,10 @@ model:
|
||||
action_dim: ${action_dim}
|
||||
critic:
|
||||
_target_: model.common.critic.CriticObs
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
|
@ -64,7 +64,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 500
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 2
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
@ -73,13 +73,13 @@ train:
|
||||
actor_lr: 1e-5
|
||||
actor_weight_decay: 0
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 500
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-6
|
||||
min_lr: 1e-5
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
first_cycle_steps: 500
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-3
|
||||
save_model_freq: 100
|
||||
@ -93,19 +93,19 @@ train:
|
||||
gae_lambda: 0.95
|
||||
batch_size: 500
|
||||
logprob_batch_size: 1000
|
||||
update_epochs: 8
|
||||
update_epochs: 10
|
||||
vf_coef: 0.5
|
||||
target_kl: 1
|
||||
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_ppo.PPODiffusion
|
||||
# HP to tune
|
||||
gamma_denoising: 0.9
|
||||
gamma_denoising: 0.99
|
||||
clip_ploss_coef: 0.01
|
||||
clip_ploss_coef_base: 0.001
|
||||
clip_ploss_coef_rate: 3
|
||||
randn_clip_value: 3
|
||||
min_sampling_denoising_std: 0.08
|
||||
min_sampling_denoising_std: 0.1
|
||||
min_logprob_denoising_std: 0.1
|
||||
#
|
||||
use_ddim: ${use_ddim}
|
||||
@ -164,10 +164,10 @@ model:
|
||||
embed_style: embed2
|
||||
embed_norm: 0
|
||||
img_cond_steps: ${img_cond_steps}
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
mlp_dims: [256, 256, 256]
|
||||
activation_type: Mish
|
||||
residual_style: True
|
||||
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
|
||||
ft_denoising_steps: ${ft_denoising_steps}
|
||||
horizon_steps: ${horizon_steps}
|
||||
obs_dim: ${obs_dim}
|
||||
|
@ -49,7 +49,7 @@ wandb:
|
||||
run: ${now:%H-%M-%S}_${name}
|
||||
|
||||
train:
|
||||
n_train_itr: 1000
|
||||
n_train_itr: 201
|
||||
n_critic_warmup_itr: 5
|
||||
n_steps: 400
|
||||
gamma: 0.999
|
||||
@ -58,7 +58,7 @@ train:
|
||||
actor_lr_scheduler:
|
||||
first_cycle_steps: 1000
|
||||
warmup_steps: 10
|
||||
min_lr: 1e-6
|
||||
min_lr: 1e-5
|
||||
critic_lr: 1e-3
|
||||
critic_weight_decay: 0
|
||||
critic_lr_scheduler:
|
||||
@ -81,7 +81,7 @@ train:
|
||||
model:
|
||||
_target_: model.diffusion.diffusion_qsm.QSMDiffusion
|
||||
# Sampling HPs
|
||||
min_sampling_denoising_std: 0.08
|
||||
min_sampling_denoising_std: 0.1
|
||||
randn_clip_value: 3
|
||||
#
|
||||
network_path: ${base_policy_path}
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user