diff --git a/README.md b/README.md
index f4f3428..2d64563 100644
--- a/README.md
+++ b/README.md
@@ -31,12 +31,11 @@ conda activate dppo
pip install -e .
```
-3. Install specific environment dependencies (Gym / Robomimic / D3IL / Furniture-Bench) or all dependencies
+3. Install specific environment dependencies (Gym / Kitchen / Robomimic / D3IL / Furniture-Bench) or all dependencies (except for Kitchen, which has dependency conflicts with other tasks).
```console
-pip install -e .[gym] # or [robomimic], [d3il], [furniture]
-pip install -e .[all]
+pip install -e .[gym] # or [kitchen], [robomimic], [d3il], [furniture]
+pip install -e .[all] # except for Kitchen
```
-
4. [Install MuJoCo for Gym and/or Robomimic](installation/install_mujoco.md). [Install D3IL](installation/install_d3il.md). [Install IsaacGym and Furniture-Bench](installation/install_furniture.md)
@@ -161,6 +160,7 @@ Our diffusion implementation is mostly based on [Diffuser](https://github.com/ja
* `model.min_sampling_denoising_std`:
, minimum amount of noise when sampling at a denoising step
* `model.min_logprob_denoising_std`:
, minimum standard deviation when evaluating likelihood at a denoising step
* `model.clip_ploss_coef`: PPO clipping ratio
+* `train.batch_size`: the batch size is intentionally large because the PPO update is taken in expectation over both environment steps and denoising steps, i.e., each batch samples (environment step, denoising step) pairs (new in v0.6).
### DDIM fine-tuning
diff --git a/agent/finetune/train_calql_agent.py b/agent/finetune/train_calql_agent.py
index cd96d0b..c71f488 100644
--- a/agent/finetune/train_calql_agent.py
+++ b/agent/finetune/train_calql_agent.py
@@ -82,8 +82,6 @@ class TrainCalQLAgent(TrainAgent):
if self.train_online:
# number of episode to colect per epoch for training
self.n_episode_per_epoch = cfg.train.n_episode_per_epoch
- # UTD ratio
- self.online_utd_ratio = cfg.train.online_utd_ratio
# Eval episodes
self.n_eval_episode = cfg.train.n_eval_episode
@@ -204,9 +202,13 @@ class TrainCalQLAgent(TrainAgent):
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
- obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
- self.venv.step(action_venv)
- )
+ (
+ obs_venv,
+ reward_venv,
+ terminated_venv,
+ truncated_venv,
+ info_venv,
+ ) = self.venv.step(action_venv)
done_venv = terminated_venv | truncated_venv
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = done_venv
@@ -308,7 +310,8 @@ class TrainCalQLAgent(TrainAgent):
# override num_update
if self.train_online:
- num_update = len(reward_trajs) # assume one env!
+ # number of new transitions (assumes a single env)
+ num_update = len(reward_trajs_split[0])
else:
num_update = self.num_update
for _ in range(num_update):
@@ -413,7 +416,6 @@ class TrainCalQLAgent(TrainAgent):
reward_to_go_b,
terminated_b,
self.gamma,
- alpha,
)
self.critic_optimizer.zero_grad()
loss_critic.backward()
diff --git a/agent/finetune/train_ibrl_agent.py b/agent/finetune/train_ibrl_agent.py
index 0f9a06d..6de7d77 100644
--- a/agent/finetune/train_ibrl_agent.py
+++ b/agent/finetune/train_ibrl_agent.py
@@ -145,7 +145,6 @@ class TrainIBRLAgent(TrainAgent):
# Collect a set of trajectories from env
cnt_episode = 0
for step in range(n_steps):
-
# Select action
with torch.no_grad():
cond = {
@@ -164,9 +163,13 @@ class TrainIBRLAgent(TrainAgent):
action_venv = samples[:, : self.act_steps]
# Apply multi-step action
- obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
- self.venv.step(action_venv)
- )
+ (
+ obs_venv,
+ reward_venv,
+ terminated_venv,
+ truncated_venv,
+ info_venv,
+ ) = self.venv.step(action_venv)
done_venv = terminated_venv | truncated_venv
reward_trajs[step] = reward_venv
firsts_trajs[step + 1] = done_venv
@@ -177,14 +180,13 @@ class TrainIBRLAgent(TrainAgent):
obs_buffer.append(prev_obs_venv["state"][i])
if "final_obs" in info_venv[i]: # truncated
next_obs_buffer.append(info_venv[i]["final_obs"]["state"])
- terminated_venv[i] = False
else: # first obs in new episode
next_obs_buffer.append(obs_venv["state"][i])
action_buffer.append(action_venv[i])
reward_buffer.extend(
(reward_venv * self.scale_reward_factor).tolist()
)
- terminated_buffer.append(terminated_venv.tolist())
+ terminated_buffer.extend(terminated_venv.tolist())
# update for next step
prev_obs_venv = obs_venv
diff --git a/agent/finetune/train_ppo_diffusion_agent.py b/agent/finetune/train_ppo_diffusion_agent.py
index ee073b3..998c638 100644
--- a/agent/finetune/train_ppo_diffusion_agent.py
+++ b/agent/finetune/train_ppo_diffusion_agent.py
@@ -19,7 +19,6 @@ from util.scheduler import CosineAnnealingWarmupRestarts
class TrainPPODiffusionAgent(TrainPPOAgent):
-
def __init__(self, cfg):
super().__init__(cfg)
@@ -46,7 +45,6 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
)
def run(self):
-
# Start training loop
timer = Timer()
run_results = []
@@ -54,7 +52,6 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
last_itr_eval = False
done_venv = np.zeros((1, self.n_envs))
while self.itr < self.n_train_itr:
-
# Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env
options_venv = [{} for _ in range(self.n_envs)]
if self.itr % self.render_freq == 0 and self.render_video:
@@ -126,9 +123,13 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
action_venv = output_venv[:, : self.act_steps]
# Apply multi-step action
- obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = (
- self.venv.step(action_venv)
- )
+ (
+ obs_venv,
+ reward_venv,
+ terminated_venv,
+ truncated_venv,
+ info_venv,
+ ) = self.venv.step(action_venv)
done_venv = terminated_venv | truncated_venv
if self.save_full_observations: # state-only
obs_full_venv = np.array(
@@ -285,40 +286,45 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
)
}
chains_k = einops.rearrange(
- torch.tensor(chains_trajs).float().to(self.device),
+ torch.tensor(chains_trajs, device=self.device).float(),
"s e t h d -> (s e) t h d",
)
returns_k = (
- torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
)
values_k = (
- torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(values_trajs, device=self.device).float().reshape(-1)
)
advantages_k = (
- torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(advantages_trajs, device=self.device)
+ .float()
+ .reshape(-1)
)
- logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+ logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
# Update policy and critic
- total_steps = self.n_steps * self.n_envs
- inds_k = np.arange(total_steps)
+ total_steps = self.n_steps * self.n_envs * self.model.ft_denoising_steps
clipfracs = []
for update_epoch in range(self.update_epochs):
-
# for each epoch, go through all data in batches
flag_break = False
- np.random.shuffle(inds_k)
+ inds_k = torch.randperm(total_steps, device=self.device)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
for batch in range(num_batch):
start = batch * self.batch_size
end = start + self.batch_size
inds_b = inds_k[start:end] # b for batch
- obs_b = {"state": obs_k["state"][inds_b]}
- chains_b = chains_k[inds_b]
- returns_b = returns_k[inds_b]
- values_b = values_k[inds_b]
- advantages_b = advantages_k[inds_b]
- logprobs_b = logprobs_k[inds_b]
+ batch_inds_b, denoising_inds_b = torch.unravel_index(
+ inds_b,
+ (self.n_steps * self.n_envs, self.model.ft_denoising_steps),
+ )
+ obs_b = {"state": obs_k["state"][batch_inds_b]}
+ chains_prev_b = chains_k[batch_inds_b, denoising_inds_b]
+ chains_next_b = chains_k[batch_inds_b, denoising_inds_b + 1]
+ returns_b = returns_k[batch_inds_b]
+ values_b = values_k[batch_inds_b]
+ advantages_b = advantages_k[batch_inds_b]
+ logprobs_b = logprobs_k[batch_inds_b, denoising_inds_b]
# get loss
(
@@ -332,7 +338,9 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
eta,
) = self.model.loss(
obs_b,
- chains_b,
+ chains_prev_b,
+ chains_next_b,
+ denoising_inds_b,
returns_b,
values_b,
advantages_b,
diff --git a/agent/finetune/train_ppo_diffusion_img_agent.py b/agent/finetune/train_ppo_diffusion_img_agent.py
index 9d47b0d..9eb6696 100644
--- a/agent/finetune/train_ppo_diffusion_img_agent.py
+++ b/agent/finetune/train_ppo_diffusion_img_agent.py
@@ -283,40 +283,44 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
for k in obs_trajs
}
chains_k = einops.rearrange(
- torch.tensor(chains_trajs).float().to(self.device),
+ torch.tensor(chains_trajs, device=self.device).float(),
"s e t h d -> (s e) t h d",
)
returns_k = (
- torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
)
values_k = (
- torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(values_trajs, device=self.device).float().reshape(-1)
)
advantages_k = (
- torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
)
- logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+ logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
# Update policy and critic
- total_steps = self.n_steps * self.n_envs
- inds_k = np.arange(total_steps)
+ total_steps = self.n_steps * self.n_envs * self.model.ft_denoising_steps
clipfracs = []
for update_epoch in range(self.update_epochs):
# for each epoch, go through all data in batches
flag_break = False
- np.random.shuffle(inds_k)
+ inds_k = torch.randperm(total_steps, device=self.device)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
for batch in range(num_batch):
start = batch * self.batch_size
end = start + self.batch_size
inds_b = inds_k[start:end] # b for batch
- obs_b = {k: obs_k[k][inds_b] for k in obs_k}
- chains_b = chains_k[inds_b]
- returns_b = returns_k[inds_b]
- values_b = values_k[inds_b]
- advantages_b = advantages_k[inds_b]
- logprobs_b = logprobs_k[inds_b]
+ batch_inds_b, denoising_inds_b = torch.unravel_index(
+ inds_b,
+ (self.n_steps * self.n_envs, self.model.ft_denoising_steps),
+ )
+ obs_b = {k: obs_k[k][batch_inds_b] for k in obs_k}
+ chains_prev_b = chains_k[batch_inds_b, denoising_inds_b]
+ chains_next_b = chains_k[batch_inds_b, denoising_inds_b + 1]
+ returns_b = returns_k[batch_inds_b]
+ values_b = values_k[batch_inds_b]
+ advantages_b = advantages_k[batch_inds_b]
+ logprobs_b = logprobs_k[batch_inds_b, denoising_inds_b]
# get loss
(
@@ -330,7 +334,9 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
eta,
) = self.model.loss(
obs_b,
- chains_b,
+ chains_prev_b,
+ chains_next_b,
+ denoising_inds_b,
returns_b,
values_b,
advantages_b,
diff --git a/agent/finetune/train_ppo_exact_diffusion_agent.py b/agent/finetune/train_ppo_exact_diffusion_agent.py
index 920b03f..6fa425f 100644
--- a/agent/finetune/train_ppo_exact_diffusion_agent.py
+++ b/agent/finetune/train_ppo_exact_diffusion_agent.py
@@ -249,29 +249,28 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
)
}
samples_k = einops.rearrange(
- torch.tensor(samples_trajs).float().to(self.device),
+ torch.tensor(samples_trajs, device=self.device).float(),
"s e h d -> (s e) h d",
)
returns_k = (
- torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
)
values_k = (
- torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(values_trajs, device=self.device).float().reshape(-1)
)
advantages_k = (
- torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
)
- logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+ logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
# Update policy and critic
total_steps = self.n_steps * self.n_envs
- inds_k = np.arange(total_steps)
clipfracs = []
for update_epoch in range(self.update_epochs):
# for each epoch, go through all data in batches
flag_break = False
- np.random.shuffle(inds_k)
+ inds_k = torch.randperm(total_steps, device=self.device)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
for batch in range(num_batch):
start = batch * self.batch_size
diff --git a/agent/finetune/train_ppo_gaussian_agent.py b/agent/finetune/train_ppo_gaussian_agent.py
index 2ad38bd..be1d754 100644
--- a/agent/finetune/train_ppo_gaussian_agent.py
+++ b/agent/finetune/train_ppo_gaussian_agent.py
@@ -210,7 +210,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
)
reward_trajs = reward_trajs_transpose.T
- # bootstrap value with GAE if not done - apply reward scaling with constant if specified
+ # bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
obs_venv_ts = {
"state": torch.from_numpy(obs_venv["state"])
.float()
@@ -250,31 +250,28 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
)
}
samples_k = einops.rearrange(
- torch.tensor(samples_trajs).float().to(self.device),
+ torch.tensor(samples_trajs, device=self.device).float(),
"s e h d -> (s e) h d",
)
returns_k = (
- torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
)
values_k = (
- torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(values_trajs, device=self.device).float().reshape(-1)
)
advantages_k = (
- torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
- )
- logprobs_k = (
- torch.tensor(logprobs_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
)
+ logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
# Update policy and critic
total_steps = self.n_steps * self.n_envs
- inds_k = np.arange(total_steps)
clipfracs = []
for update_epoch in range(self.update_epochs):
# for each epoch, go through all data in batches
flag_break = False
- np.random.shuffle(inds_k)
+ inds_k = torch.randperm(total_steps, device=self.device)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
for batch in range(num_batch):
start = batch * self.batch_size
diff --git a/agent/finetune/train_ppo_gaussian_img_agent.py b/agent/finetune/train_ppo_gaussian_img_agent.py
index de1dbb9..3c404ed 100644
--- a/agent/finetune/train_ppo_gaussian_img_agent.py
+++ b/agent/finetune/train_ppo_gaussian_img_agent.py
@@ -231,7 +231,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
)
reward_trajs = reward_trajs_transpose.T
- # bootstrap value with GAE if not done - apply reward scaling with constant if specified
+ # bootstrap value with GAE if not terminal - apply reward scaling with constant if specified
obs_venv_ts = {
key: torch.from_numpy(obs_venv[key]).float().to(self.device)
for key in self.obs_dims
@@ -271,29 +271,28 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
for k in obs_trajs
}
samples_k = einops.rearrange(
- torch.tensor(samples_trajs).float().to(self.device),
+ torch.tensor(samples_trajs, device=self.device).float(),
"s e h d -> (s e) h d",
)
returns_k = (
- torch.tensor(returns_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(returns_trajs, device=self.device).float().reshape(-1)
)
values_k = (
- torch.tensor(values_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(values_trajs, device=self.device).float().reshape(-1)
)
advantages_k = (
- torch.tensor(advantages_trajs).float().to(self.device).reshape(-1)
+ torch.tensor(advantages_trajs, device=self.device).float().reshape(-1)
)
- logprobs_k = torch.tensor(logprobs_trajs).float().to(self.device)
+ logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float()
# Update policy and critic
total_steps = self.n_steps * self.n_envs
- inds_k = np.arange(total_steps)
clipfracs = []
for update_epoch in range(self.update_epochs):
# for each epoch, go through all data in batches
flag_break = False
- np.random.shuffle(inds_k)
+ inds_k = torch.randperm(total_steps, device=self.device)
num_batch = max(1, total_steps // self.batch_size) # skip last ones
for batch in range(num_batch):
start = batch * self.batch_size
diff --git a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml
index 69011ec..bb58881 100644
--- a/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/lamp_low/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 8800
+ batch_size: 40000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml
index ce73c44..183c2c9 100644
--- a/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/lamp_med/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 8800
+ batch_size: 40000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml
index 44cd23a..92be054 100644
--- a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_mlp.yaml
@@ -68,7 +68,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 8800
+ batch_size: 17600
update_epochs: 5
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml
index 54082c5..79d7a6e 100644
--- a/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/one_leg_low/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 8800
+ batch_size: 30000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml
index 9484c40..86d1406 100644
--- a/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/one_leg_med/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 8800
+ batch_size: 30000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml
index 519d59f..52753d7 100644
--- a/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/round_table_low/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 8800
+ batch_size: 40000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml
index 659fd30..a5f3f96 100644
--- a/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml
+++ b/cfg/furniture/finetune/round_table_med/ft_ppo_diffusion_unet.yaml
@@ -68,7 +68,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 8800
+ batch_size: 40000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml b/cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml
new file mode 100644
index 0000000..f74e0fc
--- /dev/null
+++ b/cfg/gym/eval/kitchen-v0/eval_diffusion_mlp.yaml
@@ -0,0 +1,61 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+n_steps: 70
+render_num: 0
+
+env:
+ n_envs: 40
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+model:
+ _target_: model.diffusion.diffusion.DiffusionModel
+ predict_epsilon: True
+ denoised_clip_value: 1.0
+ #
+ network_path: ${base_policy_path}
+ network:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [256, 256, 256]
+ cond_mlp_dims: [128, 32]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml b/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml
index cef9f0f..311d4a1 100644
--- a/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml
+++ b/cfg/gym/finetune/halfcheetah-v2/calql_mlp_online.yaml
@@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
name: ${env_name}_calql_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_calql_mlp_ta1/2024-09-29_22-59-08_42/checkpoint/state_49.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
@@ -92,7 +92,7 @@ model:
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml
index fbcea35..8e395ff 100644
--- a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp.yaml
@@ -68,7 +68,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 5000
+ batch_size: 50000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml
new file mode 100644
index 0000000..cba7754
--- /dev/null
+++ b/cfg/gym/finetune/halfcheetah-v2/ft_ppo_diffusion_mlp_ta1.yaml
@@ -0,0 +1,108 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_diffusion_mlp_ta1_td20/2024-09-29_02-13-10_42/checkpoint/state_1000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: halfcheetah-medium-v2
+obs_dim: 17
+action_dim: 6
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 20
+ name: ${env_name}
+ max_episode_steps: 1000
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 3 # success rate not relevant for gym tasks
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env_name}-finetune
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 501
+ n_critic_warmup_itr: 0
+ n_steps: 1000
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # PPO specific
+ reward_scale_running: True
+ reward_scale_const: 1.0
+ gae_lambda: 0.95
+ batch_size: 10000
+ update_epochs: 5
+ vf_coef: 0.5
+ target_kl: 1
+
+model:
+ _target_: model.diffusion.diffusion_ppo.PPODiffusion
+ # HP to tune
+ gamma_denoising: 0.99
+ clip_ploss_coef: 0.01
+ clip_ploss_coef_base: 0.01
+ clip_ploss_coef_rate: 3
+ randn_clip_value: 3
+ min_sampling_denoising_std: 0.1
+ min_logprob_denoising_std: 0.1
+ #
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObs
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ ft_denoising_steps: ${ft_denoising_steps}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml b/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml
index adfec91..7ab10bd 100644
--- a/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml
+++ b/cfg/gym/finetune/halfcheetah-v2/ibrl_mlp.yaml
@@ -8,7 +8,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/halfcheetah-medium-v2_pre_gaussian_mlp_ta1/2024-09-28_18-48-54_42/checkpoint/state_500.pt
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
seed: 42
@@ -87,7 +87,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObsAct
mlp_dims: [256, 256, 256]
diff --git a/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml
index d4b9597..5cea98a 100644
--- a/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/gym/finetune/hopper-v2/ft_ppo_diffusion_mlp.yaml
@@ -68,7 +68,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 5000
+ batch_size: 50000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/gym/finetune/hopper-v2/sac_mlp.yaml b/cfg/gym/finetune/hopper-v2/sac_mlp.yaml
deleted file mode 100644
index 6d44909..0000000
--- a/cfg/gym/finetune/hopper-v2/sac_mlp.yaml
+++ /dev/null
@@ -1,89 +0,0 @@
-defaults:
- - _self_
-hydra:
- run:
- dir: ${logdir}
-_target_: agent.finetune.train_sac_agent.TrainSACAgent
-
-name: ${env_name}_sac_mlp_ta${horizon_steps}
-logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
-offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
-
-seed: 42
-device: cuda:0
-env_name: hopper-medium-v2
-obs_dim: 11
-action_dim: 3
-cond_steps: 1
-horizon_steps: 1
-act_steps: 1
-
-env:
- n_envs: 1
- name: ${env_name}
- max_episode_steps: 1000
- reset_at_iteration: False
- save_video: False
- best_reward_threshold_for_success: 3
- wrappers:
- mujoco_locomotion_lowdim:
- normalization_path: ${normalization_path}
- multi_step:
- n_obs_steps: ${cond_steps}
- n_action_steps: ${act_steps}
- max_episode_steps: ${env.max_episode_steps}
- reset_within_step: True
-
-wandb:
- entity: ${oc.env:DPPO_WANDB_ENTITY}
- project: sac-gym-${env_name}
- run: ${now:%H-%M-%S}_${name}
-
-train:
- n_train_itr: 1000000
- n_steps: 1
- gamma: 0.99
- actor_lr: 3e-4
- critic_lr: 1e-3
- save_model_freq: 100000
- val_freq: 10000
- render:
- freq: 1
- num: 0
- log_freq: 200
- # SAC specific
- batch_size: 256
- target_ema_rate: 0.005
- scale_reward_factor: 1
- critic_replay_ratio: 256
- actor_replay_ratio: 128
- buffer_size: 1000000
- n_eval_episode: 10
- n_explore_steps: 5000
- target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
- init_temperature: 1
-
-model:
- _target_: model.rl.gaussian_sac.SAC_Gaussian
- randn_clip_value: 10
- tanh_output: True # squash after sampling
- actor:
- _target_: model.common.mlp_gaussian.Gaussian_MLP
- mlp_dims: [256, 256]
- activation_type: ReLU
- tanh_output: False # squash after sampling instead
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
- horizon_steps: ${horizon_steps}
-
- std_max: 7.3891
- std_min: 0.0067
- critic: # no layernorm
- _target_: model.common.critic.CriticObsAct
- mlp_dims: [256, 256]
- activation_type: ReLU
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
- action_dim: ${action_dim}
- action_steps: ${act_steps}
- horizon_steps: ${horizon_steps}
- device: ${device}
diff --git a/cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml b/cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml
new file mode 100644
index 0000000..1d2eb3a
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-complete-v0/calql_mlp_online.yaml
@@ -0,0 +1,116 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_calql_mlp_ta1/2024-10-26_01-01-33_42/checkpoint/state_999.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: calql-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 10000
+ n_steps: 1 # not used
+ n_episode_per_epoch: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 3e-4
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 3e-4
+ save_model_freq: 100
+ val_freq: 20
+ render:
+ freq: 1
+ num: 0
+ log_freq: 1
+ # CalQL specific
+ train_online: True
+ batch_size: 256
+ n_random_actions: 10
+ target_ema_rate: 0.005
+ scale_reward_factor: 1.0
+ num_update: 1000
+ buffer_size: 1000000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+ automatic_entropy_tuning: True
+
+model:
+ _target_: model.rl.gaussian_calql.CalQL_Gaussian
+ randn_clip_value: 3
+ cql_min_q_weight: 5.0
+ tanh_output: True
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ discount_factor: ${train.gamma}
+ get_mc_return: True
\ No newline at end of file
diff --git a/cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..c73997a
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-complete-v0/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-47-42_42/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+ n_envs: 40
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env_name}-finetune
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 301
+ n_critic_warmup_itr: 0
+ n_steps: 70
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # PPO specific
+ reward_scale_running: True
+ reward_scale_const: 1.0
+ gae_lambda: 0.95
+ batch_size: 5600
+ update_epochs: 10
+ vf_coef: 0.5
+ target_kl: 1
+
+model:
+ _target_: model.diffusion.diffusion_ppo.PPODiffusion
+ # HP to tune
+ gamma_denoising: 0.99
+ clip_ploss_coef: 0.01
+ clip_ploss_coef_base: 0.01
+ clip_ploss_coef_rate: 3
+ randn_clip_value: 3
+ min_sampling_denoising_std: 0.1
+ min_logprob_denoising_std: 0.1
+ #
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [256, 256, 256]
+ cond_mlp_dims: [128, 32]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObs
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ ft_denoising_steps: ${ft_denoising_steps}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml b/cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml
new file mode 100644
index 0000000..d0c62ee
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-complete-v0/ibrl_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
+
+name: ${env_name}_ibrl_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-complete-v0_pre_gaussian_mlp_ta1/2024-10-25_14-48-43_42/checkpoint/state_5000.pt
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: ibrl-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000000
+ n_steps: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 50000
+ val_freq: 5000
+ render:
+ freq: 1
+ num: 0
+ log_freq: 200
+ # IBRL specific
+ batch_size: 256
+ target_ema_rate: 0.01
+ scale_reward_factor: 1
+ critic_num_update: 5
+ buffer_size: 500000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ update_freq: 2
+
+model:
+ _target_: model.rl.gaussian_ibrl.IBRL_Gaussian
+ randn_clip_value: 3
+ n_critics: 5
+ soft_action_sample: True
+ soft_action_sample_beta: 10
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [1024, 1024, 1024]
+ activation_type: ReLU
+ dropout: 0.5
+ fixed_std: 0.1
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [1024, 1024, 1024]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: False # use ensemble
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ max_n_episodes: 50
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml b/cfg/gym/finetune/kitchen-mixed-v0/calql_mlp_online.yaml
similarity index 86%
rename from cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml
rename to cfg/gym/finetune/kitchen-mixed-v0/calql_mlp_online.yaml
index 10204ba..cf8da13 100644
--- a/cfg/gym/finetune/hopper-v2/calql_mlp_online.yaml
+++ b/cfg/gym/finetune/kitchen-mixed-v0/calql_mlp_online.yaml
@@ -7,15 +7,15 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
name: ${env_name}_calql_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_calql_mlp_ta1/2024-10-25_21-36-13_42/checkpoint/state_999.pt
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
seed: 42
device: cuda:0
-env_name: hopper-medium-v2
-obs_dim: 11
-action_dim: 3
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
cond_steps: 1
horizon_steps: 1
act_steps: 1
@@ -23,10 +23,10 @@ act_steps: 1
env:
n_envs: 1
name: ${env_name}
- max_episode_steps: 1000
+ max_episode_steps: 280
reset_at_iteration: False
save_video: False
- best_reward_threshold_for_success: 3
+ best_reward_threshold_for_success: 4
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
@@ -59,7 +59,7 @@ train:
warmup_steps: 10
min_lr: 3e-4
save_model_freq: 100
- val_freq: 10
+ val_freq: 20
render:
freq: 1
num: 0
@@ -67,13 +67,12 @@ train:
# CalQL specific
train_online: True
batch_size: 256
- n_random_actions: 4
+ n_random_actions: 10
target_ema_rate: 0.005
scale_reward_factor: 1.0
num_update: 1000
buffer_size: 1000000
- online_utd_ratio: 1
- n_eval_episode: 10
+ n_eval_episode: 40
n_explore_steps: 0
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
init_temperature: 1
@@ -87,17 +86,17 @@ model:
network_path: ${base_policy_path}
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
- mlp_dims: [256, 256]
+ mlp_dims: [256, 256, 256]
activation_type: ReLU
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
_target_: model.common.critic.CriticObsAct
- mlp_dims: [256, 256]
+ mlp_dims: [256, 256, 256]
activation_type: ReLU
use_layernorm: True
double_q: True
diff --git a/cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..f90294b
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-mixed-v0/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-28_42/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+ n_envs: 40
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env_name}-finetune
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 301
+ n_critic_warmup_itr: 0
+ n_steps: 70
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # PPO specific
+ reward_scale_running: True
+ reward_scale_const: 1.0
+ gae_lambda: 0.95
+ batch_size: 5600
+ update_epochs: 10
+ vf_coef: 0.5
+ target_kl: 1
+
+model:
+ _target_: model.diffusion.diffusion_ppo.PPODiffusion
+ # HP to tune
+ gamma_denoising: 0.99
+ clip_ploss_coef: 0.01
+ clip_ploss_coef_base: 0.01
+ clip_ploss_coef_rate: 3
+ randn_clip_value: 3
+ min_sampling_denoising_std: 0.1
+ min_logprob_denoising_std: 0.1
+ #
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [256, 256, 256]
+ cond_mlp_dims: [128, 32]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObs
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ ft_denoising_steps: ${ft_denoising_steps}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml b/cfg/gym/finetune/kitchen-mixed-v0/ibrl_mlp.yaml
similarity index 78%
rename from cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml
rename to cfg/gym/finetune/kitchen-mixed-v0/ibrl_mlp.yaml
index 1737a1e..d98c3bb 100644
--- a/cfg/gym/finetune/hopper-v2/ibrl_mlp.yaml
+++ b/cfg/gym/finetune/kitchen-mixed-v0/ibrl_mlp.yaml
@@ -8,14 +8,14 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-mixed-v0_pre_gaussian_mlp_ta1/2024-10-25_01-39-44_42/checkpoint/state_5000.pt
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
seed: 42
device: cuda:0
-env_name: hopper-medium-v2
-obs_dim: 11
-action_dim: 3
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
cond_steps: 1
horizon_steps: 1
act_steps: 1
@@ -23,10 +23,10 @@ act_steps: 1
env:
n_envs: 1
name: ${env_name}
- max_episode_steps: 1000
+ max_episode_steps: 280
reset_at_iteration: False
save_video: False
- best_reward_threshold_for_success: 3
+ best_reward_threshold_for_success: 4
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
@@ -42,7 +42,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 250000
+ n_train_itr: 1000000
n_steps: 1
gamma: 0.99
actor_lr: 1e-4
@@ -51,25 +51,25 @@ train:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
- critic_lr: 1e-4
+ critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-4
+ min_lr: 1e-3
save_model_freq: 50000
- val_freq: 2000
+ val_freq: 5000
render:
freq: 1
num: 0
log_freq: 200
# IBRL specific
batch_size: 256
- target_ema_rate: 0.01
+ target_ema_rate: 0.01
scale_reward_factor: 1
critic_num_update: 5
- buffer_size: 1000000
- n_eval_episode: 10
+ buffer_size: 500000
+ n_eval_episode: 40
n_explore_steps: 0
update_freq: 2
@@ -78,19 +78,19 @@ model:
randn_clip_value: 3
n_critics: 5
soft_action_sample: True
- soft_action_sample_beta: 0.1
- network_path: ${base_policy_path}
+ soft_action_sample_beta: 10
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
- mlp_dims: [256, 256, 256]
- activation_type: Mish
+ mlp_dims: [1024, 1024, 1024]
+ activation_type: ReLU
+ dropout: 0.5
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObsAct
- mlp_dims: [256, 256, 256]
+ mlp_dims: [1024, 1024, 1024]
activation_type: ReLU
use_layernorm: True
double_q: False # use ensemble
@@ -105,4 +105,5 @@ offline_dataset:
dataset_path: ${offline_dataset_path}
horizon_steps: ${horizon_steps}
cond_steps: ${cond_steps}
- device: ${device}
\ No newline at end of file
+ device: ${device}
+ max_n_episodes: 50
\ No newline at end of file
diff --git a/cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml b/cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml
new file mode 100644
index 0000000..160bf19
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-partial-v0/calql_mlp_online.yaml
@@ -0,0 +1,116 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_calql_mlp_ta1/2024-10-25_21-26-51_42/checkpoint/state_980.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: calql-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 10000
+ n_steps: 1 # not used
+ n_episode_per_epoch: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 3e-4
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 3e-4
+ save_model_freq: 100
+ val_freq: 20
+ render:
+ freq: 1
+ num: 0
+ log_freq: 1
+ # CalQL specific
+ train_online: True
+ batch_size: 256
+ n_random_actions: 10
+ target_ema_rate: 0.005
+ scale_reward_factor: 1.0
+ num_update: 1000
+ buffer_size: 1000000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+ automatic_entropy_tuning: True
+
+model:
+ _target_: model.rl.gaussian_calql.CalQL_Gaussian
+ randn_clip_value: 3
+ cql_min_q_weight: 5.0
+ tanh_output: True
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ discount_factor: ${train.gamma}
+ get_mc_return: True
\ No newline at end of file
diff --git a/cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml
new file mode 100644
index 0000000..946d86b
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-partial-v0/ft_ppo_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ppo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-29_42/checkpoint/state_8000.pt
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+env:
+ n_envs: 40
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env_name}-finetune
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 301
+ n_critic_warmup_itr: 0
+ n_steps: 70
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # PPO specific
+ reward_scale_running: True
+ reward_scale_const: 1.0
+ gae_lambda: 0.95
+ batch_size: 5600
+ update_epochs: 10
+ vf_coef: 0.5
+ target_kl: 1
+
+model:
+ _target_: model.diffusion.diffusion_ppo.PPODiffusion
+ # HP to tune
+ gamma_denoising: 0.99
+ clip_ploss_coef: 0.01
+ clip_ploss_coef_base: 0.01
+ clip_ploss_coef_rate: 3
+ randn_clip_value: 3
+ min_sampling_denoising_std: 0.1
+ min_logprob_denoising_std: 0.1
+ #
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [256, 256, 256]
+ cond_mlp_dims: [128, 32]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObs
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ ft_denoising_steps: ${ft_denoising_steps}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml b/cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml
new file mode 100644
index 0000000..3d15f16
--- /dev/null
+++ b/cfg/gym/finetune/kitchen-partial-v0/ibrl_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
+
+name: ${env_name}_ibrl_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/kitchen-partial-v0_pre_gaussian_mlp_ta1/2024-10-25_01-45-52_42/checkpoint/state_5000.pt
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: ibrl-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000000
+ n_steps: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 50000
+ val_freq: 5000
+ render:
+ freq: 1
+ num: 0
+ log_freq: 200
+ # IBRL specific
+ batch_size: 256
+ target_ema_rate: 0.01
+ scale_reward_factor: 1
+ critic_num_update: 5
+ buffer_size: 500000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ update_freq: 2
+
+model:
+ _target_: model.rl.gaussian_ibrl.IBRL_Gaussian
+ randn_clip_value: 3
+ n_critics: 5
+ soft_action_sample: True
+ soft_action_sample_beta: 10
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [1024, 1024, 1024]
+ activation_type: ReLU
+ dropout: 0.5
+ fixed_std: 0.1
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [1024, 1024, 1024]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: False # use ensemble
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ max_n_episodes: 50
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml
index 9158042..de70428 100644
--- a/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/gym/finetune/walker2d-v2/ft_ppo_diffusion_mlp.yaml
@@ -68,7 +68,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 5000
+ batch_size: 50000
update_epochs: 5
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml b/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml
deleted file mode 100644
index 42dcdf5..0000000
--- a/cfg/gym/finetune/walker2d-v2/ft_rlpd_mlp.yaml
+++ /dev/null
@@ -1,103 +0,0 @@
-defaults:
- - _self_
-hydra:
- run:
- dir: ${logdir}
-_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
-
-name: ${env_name}_rlpd_mlp_ta${horizon_steps}_td${denoising_steps}
-logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
-offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
-
-seed: 42
-device: cuda:0
-env_name: walker2d-medium-v2
-obs_dim: 17
-action_dim: 6
-denoising_steps: 20
-cond_steps: 1
-horizon_steps: 1
-act_steps: 1
-
-env:
- n_envs: 40
- name: ${env_name}
- max_episode_steps: 1000
- reset_at_iteration: False
- save_video: False
- best_reward_threshold_for_success: 3
- wrappers:
- mujoco_locomotion_lowdim:
- normalization_path: ${normalization_path}
- multi_step:
- n_obs_steps: ${cond_steps}
- n_action_steps: ${act_steps}
- max_episode_steps: ${env.max_episode_steps}
- reset_within_step: True
-
-wandb:
- entity: ${oc.env:DPPO_WANDB_ENTITY}
- project: rlpd-gym-${env_name}-finetune
- run: ${now:%H-%M-%S}_${name}
-
-train:
- n_train_itr: 1000
- n_critic_warmup_itr: 5
- n_steps: 2000
- gamma: 0.99
- actor_lr: 1e-4
- actor_weight_decay: 0
- actor_lr_scheduler:
- first_cycle_steps: 1000
- warmup_steps: 10
- min_lr: 1e-4
- critic_lr: 1e-3
- critic_weight_decay: 0
- critic_lr_scheduler:
- first_cycle_steps: 1000
- warmup_steps: 10
- min_lr: 1e-3
- save_model_freq: 100
- val_freq: 10
- render:
- freq: 1
- num: 0
- # RLPD specific
- batch_size: 512
- entropy_temperature: 1.0 # alpha in RLPD paper
- target_ema_rate: 0.005 # rho in RLPD paper
- scale_reward_factor: 1.0 # multiply reward by this amount for more stable value estimation
- replay_ratio: 64 # number of batches to sample for each learning update
- buffer_size: 1000000
-
-model:
- _target_: model.rl.gaussian_rlpd.RLPD_Gaussian
- randn_clip_value: 3
- actor:
- _target_: model.common.mlp_gaussian.Gaussian_MLP
- mlp_dims: [512, 512, 512]
- activation_type: ReLU
- residual_style: True
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
- horizon_steps: ${horizon_steps}
- action_dim: ${action_dim}
- critic:
- _target_: model.common.critic.CriticObsAct
- action_dim: ${action_dim}
- action_steps: ${act_steps}
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
- mlp_dims: [256, 256, 256]
- activation_type: Mish
- residual_style: True
- use_layernorm: True
- horizon_steps: ${horizon_steps}
- device: ${device}
- n_critics: 2 # Ensemble size for critic models
-
-offline_dataset:
- _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
- dataset_path: ${offline_dataset_path}
- horizon_steps: ${horizon_steps}
- cond_steps: ${cond_steps}
- device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml b/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml
index 7dfd7ed..e73a4d5 100644
--- a/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml
+++ b/cfg/gym/pretrain/halfcheetah-medium-v2/calql_mlp_offline.yaml
@@ -88,7 +88,7 @@ model:
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
diff --git a/cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml b/cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml
new file mode 100644
index 0000000..8a2f462
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-complete-v0/calql_mlp_offline.yaml
@@ -0,0 +1,113 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: calql-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000
+ n_steps: 1 # not used
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 20
+ render:
+ freq: 1
+ num: 0
+ log_freq: 1
+ # CalQL specific
+ train_online: False
+ batch_size: 256
+ n_random_actions: 10
+ target_ema_rate: 0.005
+ scale_reward_factor: 1.0
+ num_update: 1000
+ buffer_size: 1000000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+ automatic_entropy_tuning: True
+
+model:
+ _target_: model.rl.gaussian_calql.CalQL_Gaussian
+ randn_clip_value: 3
+ cql_min_q_weight: 5.0
+ tanh_output: True
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ discount_factor: ${train.gamma}
+ get_mc_return: True
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..092fa00
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml
@@ -0,0 +1,66 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env}-pretrain
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_epochs: 8000
+ batch_size: 128
+ learning_rate: 1e-3
+ weight_decay: 1e-6
+ lr_scheduler:
+ first_cycle_steps: 8000
+ warmup_steps: 1
+ min_lr: 1e-4
+ epoch_start_ema: 10
+ update_ema_freq: 5
+ save_model_freq: 1000
+
+model:
+ _target_: model.diffusion.diffusion.DiffusionModel
+ predict_epsilon: True
+ denoised_clip_value: 1.0
+ network:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [256, 256, 256]
+ cond_mlp_dims: [128, 32]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
+
+ema:
+ decay: 0.995
+
+train_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceDataset
+ dataset_path: ${train_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..9426b6f
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml
@@ -0,0 +1,60 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+horizon_steps: 1
+cond_steps: 1
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env}-pretrain
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_epochs: 5000
+ batch_size: 256
+ learning_rate: 1e-4
+ weight_decay: 0
+ lr_scheduler:
+ first_cycle_steps: 5000
+ warmup_steps: 100
+ min_lr: 1e-4
+ epoch_start_ema: 20
+ update_ema_freq: 10
+ save_model_freq: 1000
+
+model:
+ _target_: model.common.gaussian.GaussianModel
+ network:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [1024, 1024, 1024]
+ activation_type: ReLU
+ dropout: 0.5
+ fixed_std: 0.1
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+ema:
+ decay: 0.995
+
+train_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceDataset
+ dataset_path: ${train_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml b/cfg/gym/pretrain/kitchen-mixed-v0/calql_mlp_offline.yaml
similarity index 85%
rename from cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml
rename to cfg/gym/pretrain/kitchen-mixed-v0/calql_mlp_offline.yaml
index 24f8957..4233314 100644
--- a/cfg/gym/pretrain/hopper-medium-v2/calql_mlp_offline.yaml
+++ b/cfg/gym/pretrain/kitchen-mixed-v0/calql_mlp_offline.yaml
@@ -6,15 +6,15 @@ hydra:
_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
name: ${env_name}_calql_mlp_ta${horizon_steps}
-logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
seed: 42
device: cuda:0
-env_name: hopper-medium-v2
-obs_dim: 11
-action_dim: 3
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
cond_steps: 1
horizon_steps: 1
act_steps: 1
@@ -22,10 +22,10 @@ act_steps: 1
env:
n_envs: 1
name: ${env_name}
- max_episode_steps: 1000
+ max_episode_steps: 280
reset_at_iteration: False
save_video: False
- best_reward_threshold_for_success: 3
+ best_reward_threshold_for_success: 4
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
@@ -41,7 +41,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 100
+ n_train_itr: 1000
n_steps: 1 # not used
gamma: 0.99
actor_lr: 1e-4
@@ -50,14 +50,14 @@ train:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-4
- critic_lr: 3e-4
+ critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 3e-4
+ min_lr: 1e-3
save_model_freq: 10
- val_freq: 10
+ val_freq: 20
render:
freq: 1
num: 0
@@ -65,12 +65,12 @@ train:
# CalQL specific
train_online: False
batch_size: 256
- n_random_actions: 4
+ n_random_actions: 10
target_ema_rate: 0.005
scale_reward_factor: 1.0
num_update: 1000
buffer_size: 1000000
- n_eval_episode: 10
+ n_eval_episode: 40
n_explore_steps: 0
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
init_temperature: 1
@@ -83,17 +83,17 @@ model:
tanh_output: True
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
- mlp_dims: [256, 256]
+ mlp_dims: [256, 256, 256]
activation_type: ReLU
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
_target_: model.common.critic.CriticObsAct
- mlp_dims: [256, 256]
+ mlp_dims: [256, 256, 256]
activation_type: ReLU
use_layernorm: True
double_q: True
diff --git a/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..becf244
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml
@@ -0,0 +1,66 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env}-pretrain
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_epochs: 8000
+ batch_size: 256
+ learning_rate: 1e-3
+ weight_decay: 1e-6
+ lr_scheduler:
+ first_cycle_steps: 8000
+ warmup_steps: 1
+ min_lr: 1e-4
+ epoch_start_ema: 10
+ update_ema_freq: 5
+ save_model_freq: 1000
+
+model:
+ _target_: model.diffusion.diffusion.DiffusionModel
+ predict_epsilon: True
+ denoised_clip_value: 1.0
+ network:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [256, 256, 256]
+ cond_mlp_dims: [128, 32]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
+
+ema:
+ decay: 0.995
+
+train_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceDataset
+ dataset_path: ${train_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..86a6e90
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml
@@ -0,0 +1,59 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env}-pretrain
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_epochs: 5000
+ batch_size: 128
+ learning_rate: 1e-3
+ weight_decay: 1e-6
+ lr_scheduler:
+ first_cycle_steps: 5000
+ warmup_steps: 1
+ min_lr: 1e-4
+ epoch_start_ema: 10
+ update_ema_freq: 5
+ save_model_freq: 1000
+
+model:
+ _target_: model.common.gaussian.GaussianModel
+ network:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ fixed_std: 0.1
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+ema:
+ decay: 0.995
+
+train_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceDataset
+ dataset_path: ${train_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml b/cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml
new file mode 100644
index 0000000..f99edfd
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-partial-v0/calql_mlp_offline.yaml
@@ -0,0 +1,113 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: calql-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000
+ n_steps: 1 # not used
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 10
+ val_freq: 20
+ render:
+ freq: 1
+ num: 0
+ log_freq: 1
+ # CalQL specific
+ train_online: False
+ batch_size: 256
+ n_random_actions: 10
+ target_ema_rate: 0.005
+ scale_reward_factor: 1.0
+ num_update: 1000
+ buffer_size: 1000000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+ automatic_entropy_tuning: True
+
+model:
+ _target_: model.rl.gaussian_calql.CalQL_Gaussian
+ randn_clip_value: 3
+ cql_min_q_weight: 5.0
+ tanh_output: True
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ discount_factor: ${train.gamma}
+ get_mc_return: True
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml b/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml
new file mode 100644
index 0000000..c854707
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml
@@ -0,0 +1,66 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+denoising_steps: 20
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env}-pretrain
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_epochs: 8000
+ batch_size: 128
+ learning_rate: 1e-3
+ weight_decay: 1e-5
+ lr_scheduler:
+ first_cycle_steps: 8000
+ warmup_steps: 1
+ min_lr: 1e-4
+ epoch_start_ema: 10
+ update_ema_freq: 5
+ save_model_freq: 1000
+
+model:
+ _target_: model.diffusion.diffusion.DiffusionModel
+ predict_epsilon: True
+ denoised_clip_value: 1.0
+ network:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [256, 256, 256]
+ cond_mlp_dims: [128, 32]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
+
+ema:
+ decay: 0.995
+
+train_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceDataset
+ dataset_path: ${train_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml b/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml
new file mode 100644
index 0000000..02413a5
--- /dev/null
+++ b/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml
@@ -0,0 +1,59 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
+
+name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+horizon_steps: 4
+cond_steps: 1
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env}-pretrain
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_epochs: 5000
+ batch_size: 128
+ learning_rate: 1e-3
+ weight_decay: 1e-6
+ lr_scheduler:
+ first_cycle_steps: 5000
+ warmup_steps: 1
+ min_lr: 1e-4
+ epoch_start_ema: 10
+ update_ema_freq: 5
+ save_model_freq: 1000
+
+model:
+ _target_: model.common.gaussian.GaussianModel
+ network:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ fixed_std: 0.1
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+ema:
+ decay: 0.995
+
+train_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceDataset
+ dataset_path: ${train_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml
similarity index 96%
rename from cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml
rename to cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml
index 9be391c..49f11ed 100644
--- a/cfg/gym/finetune/halfcheetah-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml
@@ -14,8 +14,8 @@ device: cuda:0
env_name: halfcheetah-medium-v2
obs_dim: 17
action_dim: 6
-denoising_steps: 20
-ft_denoising_steps: 20
+denoising_steps: 10
+ft_denoising_steps: 10
cond_steps: 1
horizon_steps: 1
act_steps: 1
@@ -67,7 +67,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 1000
+ batch_size: 10000
update_epochs: 10
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml
similarity index 97%
rename from cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml
rename to cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml
index f09c664..b0c1241 100644
--- a/cfg/gym/finetune/halfcheetah-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml
@@ -53,7 +53,7 @@ train:
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
- first_cycle_steps: 10000
+ first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
diff --git a/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/rlpd_mlp.yaml
similarity index 98%
rename from cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml
rename to cfg/gym/scratch/halfcheetah-v2/rlpd_mlp.yaml
index 898cf9b..24379c6 100644
--- a/cfg/gym/finetune/halfcheetah-v2/rlpd_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/rlpd_mlp.yaml
@@ -86,7 +86,7 @@ model:
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
diff --git a/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml b/cfg/gym/scratch/halfcheetah-v2/sac_mlp.yaml
similarity index 98%
rename from cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml
rename to cfg/gym/scratch/halfcheetah-v2/sac_mlp.yaml
index 8051c73..35182d5 100644
--- a/cfg/gym/finetune/halfcheetah-v2/sac_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/sac_mlp.yaml
@@ -75,7 +75,7 @@ model:
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic: # no layernorm
diff --git a/cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml
new file mode 100644
index 0000000..2f02c78
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/awr_diffusion_mlp.yaml
@@ -0,0 +1,99 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_awr_diffusion_agent.TrainAWRDiffusionAgent
+
+name: ${env_name}_awr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 10
+ name: ${env_name}
+ max_episode_steps: 1000
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 3
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env_name}-scratch
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000
+ n_critic_warmup_itr: 0
+ n_steps: 1000
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # AWR specific
+ scale_reward_factor: 0.01
+ max_adv_weight: 100
+ beta: 10
+ buffer_size: 100000 # multiplied by n_envs; presumably the effective number of stored transitions (confirm in the agent code)
+ batch_size: 256
+ replay_ratio: 128
+ critic_update_ratio: 4
+
+model:
+ _target_: model.diffusion.diffusion_awr.AWRDiffusion
+ # Sampling HPs
+ min_sampling_denoising_std: 0.10
+ randn_clip_value: 3
+ #
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ time_dim: 16
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ residual_style: True
+ critic:
+ _target_: model.common.critic.CriticObs
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml
new file mode 100644
index 0000000..9eda16e
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/dipo_diffusion_mlp.yaml
@@ -0,0 +1,101 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_dipo_diffusion_agent.TrainDIPODiffusionAgent
+
+name: ${env_name}_dipo_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 10
+ name: ${env_name}
+ max_episode_steps: 1000
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 3
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env_name}-scratch
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000
+ n_critic_warmup_itr: 0
+ n_steps: 1000
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # DIPO specific
+ scale_reward_factor: 0.01
+ target_ema_rate: 0.005
+ buffer_size: 1000000
+ action_lr: 0.0001
+ action_gradient_steps: 10
+ replay_ratio: 128
+ batch_size: 256
+
+model:
+ _target_: model.diffusion.diffusion_dipo.DIPODiffusion
+ # Sampling HPs
+ min_sampling_denoising_std: 0.10
+ randn_clip_value: 3
+ #
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ time_dim: 16
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ residual_style: True
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml
new file mode 100644
index 0000000..9bd4885
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/dql_diffusion_mlp.yaml
@@ -0,0 +1,100 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_dql_diffusion_agent.TrainDQLDiffusionAgent
+
+name: ${env_name}_dql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 10
+ name: ${env_name}
+ max_episode_steps: 1000
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 3
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env_name}-scratch
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000
+ n_critic_warmup_itr: 0
+ n_steps: 1000
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # DQL specific
+ scale_reward_factor: 0.01
+ target_ema_rate: 0.005
+ buffer_size: 1000000
+ eta: 1.0
+ replay_ratio: 128
+ batch_size: 256
+
+model:
+ _target_: model.diffusion.diffusion_dql.DQLDiffusion
+ # Sampling HPs
+ min_sampling_denoising_std: 0.10
+ randn_clip_value: 3
+ #
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ time_dim: 16
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ residual_style: True
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml
new file mode 100644
index 0000000..935263d
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/idql_diffusion_mlp.yaml
@@ -0,0 +1,108 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_idql_diffusion_agent.TrainIDQLDiffusionAgent
+
+name: ${env_name}_idql_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 10
+ name: ${env_name}
+ max_episode_steps: 1000
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 3
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env_name}-scratch
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000
+ n_critic_warmup_itr: 0
+ n_steps: 1000
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # IDQL specific
+ scale_reward_factor: 0.01
+ eval_deterministic: True
+ eval_sample_num: 10 # how many samples to score during eval
+ critic_tau: 0.001 # rate of target q network update
+ use_expectile_exploration: True
+ buffer_size: 100000 # multiplied by n_envs; presumably the effective number of stored transitions (confirm in the agent code)
+ replay_ratio: 128
+ batch_size: 256
+
+model:
+ _target_: model.diffusion.diffusion_idql.IDQLDiffusion
+ # Sampling HPs
+ min_sampling_denoising_std: 0.10
+ randn_clip_value: 3
+ #
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ time_dim: 16
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ residual_style: True
+ critic_q:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ critic_v:
+ _target_: model.common.critic.CriticObs
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml
similarity index 95%
rename from cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml
rename to cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml
index 3f26654..729a0c6 100644
--- a/cfg/gym/finetune/hopper-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml
@@ -1,7 +1,7 @@
defaults:
- _self_
hydra:
- run:
+ run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
@@ -14,8 +14,8 @@ device: cuda:0
env_name: hopper-medium-v2
obs_dim: 11
action_dim: 3
-denoising_steps: 20
-ft_denoising_steps: 20
+denoising_steps: 10
+ft_denoising_steps: 10
cond_steps: 1
horizon_steps: 1
act_steps: 1
@@ -55,7 +55,7 @@ train:
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
- first_cycle_steps: 10000
+ first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
@@ -67,7 +67,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 1000
+ batch_size: 10000
update_epochs: 10
vf_coef: 0.5
target_kl: 1
@@ -94,10 +94,10 @@ model:
residual_style: True
critic:
_target_: model.common.critic.CriticObs
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
ft_denoising_steps: ${ft_denoising_steps}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
diff --git a/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml b/cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml
similarity index 97%
rename from cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml
rename to cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml
index 57eafcb..05f5766 100644
--- a/cfg/gym/finetune/hopper-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml
@@ -53,7 +53,7 @@ train:
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
- first_cycle_steps: 10000
+ first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
diff --git a/cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml
new file mode 100644
index 0000000..9fee721
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/qsm_diffusion_mlp.yaml
@@ -0,0 +1,100 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_qsm_diffusion_agent.TrainQSMDiffusionAgent
+
+name: ${env_name}_qsm_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 10
+ name: ${env_name}
+ max_episode_steps: 1000
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 3
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env_name}-scratch
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000
+ n_critic_warmup_itr: 0
+ n_steps: 1000
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # QSM specific
+ scale_reward_factor: 0.01
+ q_grad_coeff: 50
+ critic_tau: 0.005
+ buffer_size: 100000 # multiplied by n_envs; presumably the effective number of stored transitions (confirm in the agent code)
+ replay_ratio: 128
+ batch_size: 256
+
+model:
+ _target_: model.diffusion.diffusion_qsm.QSMDiffusion
+ # Sampling HPs
+ min_sampling_denoising_std: 0.10
+ randn_clip_value: 3
+ #
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ time_dim: 16
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ residual_style: True
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml b/cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml
new file mode 100644
index 0000000..cdd98a2
--- /dev/null
+++ b/cfg/gym/scratch/hopper-v2/rwr_diffusion_mlp.yaml
@@ -0,0 +1,84 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_rwr_diffusion_agent.TrainRWRDiffusionAgent
+
+name: ${env_name}_rwr_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: hopper-medium-v2
+obs_dim: 11
+action_dim: 3
+denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 10
+ name: ${env_name}
+ max_episode_steps: 1000
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 3
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: gym-${env_name}-scratch
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000
+ n_critic_warmup_itr: 0
+ n_steps: 1000
+ gamma: 0.99
+ lr: 1e-4
+ weight_decay: 0
+ lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # RWR specific
+ max_reward_weight: 100
+ beta: 10
+ batch_size: 256
+ update_epochs: 128
+
+model:
+ _target_: model.diffusion.diffusion_rwr.RWRDiffusion
+ # Sampling HPs
+ min_sampling_denoising_std: 0.1
+ randn_clip_value: 3
+ #
+ network:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ time_dim: 16
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ residual_style: True
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml b/cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml
new file mode 100644
index 0000000..b80a9a8
--- /dev/null
+++ b/cfg/gym/scratch/kitchen-complete-v0/rlpd_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
+
+name: ${env_name}_rlpd_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-complete-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: rlpd-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000000
+ n_steps: 1
+ gamma: 0.99
+ actor_lr: 3e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 3e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 50000
+ val_freq: 5000
+ render:
+ freq: 1
+ num: 0
+ log_freq: 200
+ # RLPD specific
+ batch_size: 256
+ target_ema_rate: 0.01
+ scale_reward_factor: 1
+ critic_num_update: 10
+ buffer_size: 400000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+
+model:
+ _target_: model.rl.gaussian_rlpd.RLPD_Gaussian
+ randn_clip_value: 10
+ tanh_output: True # squash after sampling
+ backup_entropy: True
+ n_critics: 5 # Ensemble size for critic models
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: False # use ensemble
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml b/cfg/gym/scratch/kitchen-mixed-v0/rlpd_mlp.yaml
similarity index 83%
rename from cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml
rename to cfg/gym/scratch/kitchen-mixed-v0/rlpd_mlp.yaml
index 7a33bde..e006e25 100644
--- a/cfg/gym/finetune/hopper-v2/rlpd_mlp.yaml
+++ b/cfg/gym/scratch/kitchen-mixed-v0/rlpd_mlp.yaml
@@ -12,9 +12,9 @@ offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
seed: 42
device: cuda:0
-env_name: hopper-medium-v2
-obs_dim: 11
-action_dim: 3
+env_name: kitchen-mixed-v0
+obs_dim: 60
+action_dim: 9
cond_steps: 1
horizon_steps: 1
act_steps: 1
@@ -22,10 +22,10 @@ act_steps: 1
env:
n_envs: 1
name: ${env_name}
- max_episode_steps: 1000
+ max_episode_steps: 280
reset_at_iteration: False
save_video: False
- best_reward_threshold_for_success: 3
+ best_reward_threshold_for_success: 4
wrappers:
mujoco_locomotion_lowdim:
normalization_path: ${normalization_path}
@@ -41,7 +41,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 250000
+ n_train_itr: 1000000
n_steps: 1
gamma: 0.99
actor_lr: 3e-4
@@ -50,12 +50,12 @@ train:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 3e-4
- critic_lr: 3e-4
+ critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 3e-4
+ min_lr: 1e-3
save_model_freq: 50000
val_freq: 5000
render:
@@ -64,12 +64,12 @@ train:
log_freq: 200
# RLPD specific
batch_size: 256
- target_ema_rate: 0.005
+ target_ema_rate: 0.01
scale_reward_factor: 1
- critic_num_update: 20
- buffer_size: 1000000
- n_eval_episode: 10
- n_explore_steps: 5000
+ critic_num_update: 10
+ buffer_size: 400000
+ n_eval_episode: 40
+ n_explore_steps: 0
target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
init_temperature: 1
@@ -78,20 +78,20 @@ model:
randn_clip_value: 10
tanh_output: True # squash after sampling
backup_entropy: True
- n_critics: 10 # Ensemble size for critic models
+ n_critics: 5 # Ensemble size for critic models
actor:
_target_: model.common.mlp_gaussian.Gaussian_MLP
- mlp_dims: [256, 256]
+ mlp_dims: [256, 256, 256]
activation_type: ReLU
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
_target_: model.common.critic.CriticObsAct
- mlp_dims: [256, 256]
+ mlp_dims: [256, 256, 256]
activation_type: ReLU
use_layernorm: True
double_q: False # use ensemble
diff --git a/cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml b/cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml
new file mode 100644
index 0000000..a9b7781
--- /dev/null
+++ b/cfg/gym/scratch/kitchen-partial-v0/rlpd_mlp.yaml
@@ -0,0 +1,109 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
+
+name: ${env_name}_rlpd_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/train.npz
+
+seed: 42
+device: cuda:0
+env_name: kitchen-partial-v0
+obs_dim: 60
+action_dim: 9
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 280
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 4
+ wrappers:
+ mujoco_locomotion_lowdim:
+ normalization_path: ${normalization_path}
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: rlpd-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000000
+ n_steps: 1
+ gamma: 0.99
+ actor_lr: 3e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 3e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 50000
+ val_freq: 5000
+ render:
+ freq: 1
+ num: 0
+ log_freq: 200
+ # RLPD specific
+ batch_size: 256
+ target_ema_rate: 0.01
+ scale_reward_factor: 1
+ critic_num_update: 10
+ buffer_size: 400000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+
+model:
+ _target_: model.rl.gaussian_rlpd.RLPD_Gaussian
+ randn_clip_value: 10
+ tanh_output: True # squash after sampling
+ backup_entropy: True
+ n_critics: 5 # Ensemble size for critic models
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: False # use ensemble
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml b/cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml
similarity index 96%
rename from cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml
rename to cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml
index 6530d49..2c1769f 100644
--- a/cfg/gym/finetune/walker2d-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml
@@ -14,8 +14,8 @@ device: cuda:0
env_name: walker2d-medium-v2
obs_dim: 17
action_dim: 6
-denoising_steps: 20
-ft_denoising_steps: 20
+denoising_steps: 10
+ft_denoising_steps: 10
cond_steps: 1
horizon_steps: 1
act_steps: 1
@@ -67,7 +67,7 @@ train:
reward_scale_running: True
reward_scale_const: 1.0
gae_lambda: 0.95
- batch_size: 1000
+ batch_size: 10000
update_epochs: 10
vf_coef: 0.5
target_kl: 1
diff --git a/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml b/cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml
similarity index 97%
rename from cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml
rename to cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml
index dff57a3..70b6267 100644
--- a/cfg/gym/finetune/walker2d-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml
@@ -53,7 +53,7 @@ train:
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
- first_cycle_steps: 10000
+ first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
diff --git a/cfg/robomimic/finetune/can/calql_mlp_online.yaml b/cfg/robomimic/finetune/can/calql_mlp_online.yaml
index 8fc1a3c..9fd5db1 100644
--- a/cfg/robomimic/finetune/can/calql_mlp_online.yaml
+++ b/cfg/robomimic/finetune/can/calql_mlp_online.yaml
@@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
name: ${env_name}_calql_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_calql_mlp_ta1/2024-10-25_22-30-16_42/checkpoint/state_999.pt
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
@@ -97,7 +97,7 @@ model:
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
diff --git a/cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml b/cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml
new file mode 100644
index 0000000..cfb4b81
--- /dev/null
+++ b/cfg/robomimic/finetune/can/calql_mlp_online_ph.yaml
@@ -0,0 +1,122 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ best_reward_threshold_for_success: 1
+ max_episode_steps: 300
+ reset_at_iteration: False
+ save_video: False
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: calql-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000
+ n_steps: 1 # not used
+ n_episode_per_epoch: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 3e-4
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 3e-4
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ log_freq: 1
+ # CalQL specific
+ train_online: True
+ batch_size: 256
+ n_random_actions: 4
+ target_ema_rate: 0.005
+ scale_reward_factor: 1.0
+ num_update: 1000
+ buffer_size: 1000000
+ online_utd_ratio: 1
+ n_eval_episode: 40
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+ automatic_entropy_tuning: True
+
+model:
+ _target_: model.rl.gaussian_calql.CalQL_Gaussian
+ randn_clip_value: 3
+ cql_min_q_weight: 5.0
+ tanh_output: True
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ discount_factor: ${train.gamma}
+ get_mc_return: True
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml
index 2a8343a..ab384f1 100644
--- a/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_awr_diffusion_mlp.yaml
@@ -26,7 +26,7 @@ env:
name: ${env_name}
best_reward_threshold_for_success: 1
max_episode_steps: 300
- save_video: false
+ save_video: False
wrappers:
robomimic_lowdim:
normalization_path: ${normalization_path}
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 151
n_critic_warmup_itr: 2
n_steps: 300
gamma: 0.999
diff --git a/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml
index ed9c90f..59cb0a2 100644
--- a/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_dql_diffusion_mlp.yaml
@@ -26,7 +26,7 @@ env:
name: ${env_name}
best_reward_threshold_for_success: 1
max_episode_steps: 300
- save_video: false
+ save_video: False
wrappers:
robomimic_lowdim:
normalization_path: ${normalization_path}
diff --git a/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml
index 24bb53a..12e33b0 100644
--- a/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_idql_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 151
n_critic_warmup_itr: 5
n_steps: 300
gamma: 0.999
diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
index ba1fa16..8256876 100644
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
@@ -47,16 +47,16 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 151
n_critic_warmup_itr: 2
n_steps: 300
gamma: 0.999
- actor_lr: 1e-5
+ actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-5
+ min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
index 0873cb4..54a4ab1 100644
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
@@ -1,7 +1,7 @@
defaults:
- _self_
hydra:
- run:
+ run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
@@ -60,22 +60,22 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 200
+ n_train_itr: 151
n_critic_warmup_itr: 2
n_steps: 300
gamma: 0.999
augment: True
grad_accumulate: 15
- actor_lr: 1e-5
+ actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
- first_cycle_steps: 200
+ first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-5
+ min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
- first_cycle_steps: 200
+ first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
@@ -96,7 +96,7 @@ train:
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
- gamma_denoising: 0.9
+ gamma_denoising: 0.99
clip_ploss_coef: 0.01
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
@@ -158,10 +158,10 @@ model:
embed_style: embed2
embed_norm: 0
img_cond_steps: ${img_cond_steps}
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
ft_denoising_steps: ${ft_denoising_steps}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml
new file mode 100644
index 0000000..86d28df
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1.yaml
@@ -0,0 +1,111 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_diffusion_mlp_ta1_td20/2024-09-29_15-43-07_42/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 50
+ name: ${env_name}
+ best_reward_threshold_for_success: 1
+ max_episode_steps: 300
+ save_video: False
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: robomimic-${env_name}-finetune
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 301
+ n_critic_warmup_itr: 2
+ n_steps: 300
+ gamma: 0.999
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # PPO specific
+ reward_scale_running: True
+ reward_scale_const: 1.0
+ gae_lambda: 0.95
+ batch_size: 15000
+ update_epochs: 10
+ vf_coef: 0.5
+ target_kl: 1
+
+model:
+ _target_: model.diffusion.diffusion_ppo.PPODiffusion
+ # HP to tune
+ gamma_denoising: 0.99
+ clip_ploss_coef: 0.01
+ clip_ploss_coef_base: 0.001
+ clip_ploss_coef_rate: 3
+ randn_clip_value: 3
+ min_sampling_denoising_std: 0.1
+ min_logprob_denoising_std: 0.1
+ #
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [512, 512, 512]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObs
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ ft_denoising_steps: ${ft_denoising_steps}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml
new file mode 100644
index 0000000..3367556
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_ta1_ph.yaml
@@ -0,0 +1,111 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 40
+ name: ${env_name}
+ best_reward_threshold_for_success: 1
+ max_episode_steps: 300
+ save_video: False
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: robomimic-${env_name}-finetune
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 301
+ n_critic_warmup_itr: 2
+ n_steps: 300
+ gamma: 0.999
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # PPO specific
+ reward_scale_running: True
+ reward_scale_const: 1.0
+ gae_lambda: 0.95
+ batch_size: 6000
+ update_epochs: 10
+ vf_coef: 0.5
+ target_kl: 1
+
+model:
+ _target_: model.diffusion.diffusion_ppo.PPODiffusion
+ # HP to tune
+ gamma_denoising: 0.9
+ clip_ploss_coef: 0.01
+ clip_ploss_coef_base: 0.001
+ clip_ploss_coef_rate: 3
+ randn_clip_value: 3
+ min_sampling_denoising_std: 0.1
+ min_logprob_denoising_std: 0.1
+ #
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [512, 512, 512]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObs
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ ft_denoising_steps: ${ft_denoising_steps}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml
index 591f3a9..bbd8bd6 100644
--- a/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_qsm_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 151
n_critic_warmup_itr: 5
n_steps: 300
gamma: 0.999
diff --git a/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml
index 5037605..fa451a3 100644
--- a/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_rwr_diffusion_mlp.yaml
@@ -26,7 +26,7 @@ env:
name: ${env_name}
best_reward_threshold_for_success: 1
max_episode_steps: 300
- save_video: false
+ save_video: False
wrappers:
robomimic_lowdim:
normalization_path: ${normalization_path}
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 151
n_critic_warmup_itr: 2
n_steps: 300
gamma: 0.999
diff --git a/cfg/robomimic/finetune/can/ibrl_mlp.yaml b/cfg/robomimic/finetune/can/ibrl_mlp.yaml
index 7aa8d24..c3ba56e 100644
--- a/cfg/robomimic/finetune/can/ibrl_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ibrl_mlp.yaml
@@ -7,7 +7,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_gaussian_mlp_ta1/2024-09-28_13-43-59_42/checkpoint/state_5000.pt
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
@@ -93,7 +93,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObsAct
mlp_dims: [1024, 1024, 1024]
diff --git a/cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml b/cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml
new file mode 100644
index 0000000..8940658
--- /dev/null
+++ b/cfg/robomimic/finetune/can/ibrl_mlp_ph.yaml
@@ -0,0 +1,115 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
+
+name: ${env_name}_ibrl_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can_pre_gaussian_mlp_ta1/2024-10-08_20-52-04_0/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 250 # IBRL uses 200
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 1
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: ibrl-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000000
+ n_steps: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-4
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ save_model_freq: 100000
+ val_freq: 10000
+ render:
+ freq: 10000
+ num: 0
+ log_freq: 200
+ # IBRL specific
+ batch_size: 256
+ target_ema_rate: 0.01
+ scale_reward_factor: 1
+ critic_num_update: 3
+ buffer_size: 400000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ update_freq: 2
+
+model:
+ _target_: model.rl.gaussian_ibrl.IBRL_Gaussian
+ randn_clip_value: 3
+ n_critics: 5
+ soft_action_sample: True
+ soft_action_sample_beta: 10
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [1024, 1024, 1024]
+ activation_type: ReLU
+ dropout: 0.5
+ fixed_std: 0.1
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [1024, 1024, 1024]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: False # use ensemble
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ max_n_episodes: 100
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml
index bddd57c..6b276bc 100644
--- a/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_awr_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 81
n_critic_warmup_itr: 2
n_steps: 300
gamma: 0.999
diff --git a/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml
index e0353b6..75e7c68 100644
--- a/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_dql_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 81
n_critic_warmup_itr: 5
n_steps: 300
gamma: 0.999
diff --git a/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml
index a0e2567..4bf3a2a 100644
--- a/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_idql_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 81
n_critic_warmup_itr: 5
n_steps: 300
gamma: 0.999
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
index b505b81..16b9485 100644
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
@@ -27,7 +27,7 @@ env:
name: ${env_name}
best_reward_threshold_for_success: 1
max_episode_steps: 300
- save_video: false
+ save_video: False
wrappers:
robomimic_lowdim:
normalization_path: ${normalization_path}
@@ -47,16 +47,16 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 81
n_critic_warmup_itr: 2
n_steps: 300
gamma: 0.999
- actor_lr: 1e-5
+ actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-5
+ min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
@@ -99,10 +99,10 @@ model:
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
ft_denoising_steps: ${ft_denoising_steps}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
index d46c44b..72207d6 100644
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
@@ -1,7 +1,7 @@
defaults:
- _self_
hydra:
- run:
+ run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
@@ -60,22 +60,22 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 200
+ n_train_itr: 151
n_critic_warmup_itr: 2
n_steps: 300
gamma: 0.999
augment: True
grad_accumulate: 15
- actor_lr: 1e-5
+ actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
- first_cycle_steps: 200
+ first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-5
+ min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
- first_cycle_steps: 200
+ first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
@@ -96,7 +96,7 @@ train:
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
- gamma_denoising: 0.9
+ gamma_denoising: 0.99
clip_ploss_coef: 0.01
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
@@ -158,10 +158,10 @@ model:
embed_style: embed2
embed_norm: 0
img_cond_steps: ${img_cond_steps}
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
ft_denoising_steps: ${ft_denoising_steps}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml
index 8262daa..4c550ea 100644
--- a/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_qsm_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 81
n_critic_warmup_itr: 5
n_steps: 300
gamma: 0.999
diff --git a/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml
index fa6b4ca..f32ef8d 100644
--- a/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_rwr_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 81
n_critic_warmup_itr: 2
n_steps: 300
gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/calql_mlp_online.yaml b/cfg/robomimic/finetune/square/calql_mlp_online.yaml
index 22ebae4..de333e6 100644
--- a/cfg/robomimic/finetune/square/calql_mlp_online.yaml
+++ b/cfg/robomimic/finetune/square/calql_mlp_online.yaml
@@ -7,7 +7,7 @@ _target_: agent.finetune.train_calql_agent.TrainCalQLAgent
name: ${env_name}_calql_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_calql_mlp_ta1/2024-10-25_22-44-12_42/checkpoint/state_999.pt
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
@@ -97,7 +97,7 @@ model:
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
diff --git a/cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml b/cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml
new file mode 100644
index 0000000..3332780
--- /dev/null
+++ b/cfg/robomimic/finetune/square/calql_mlp_online_ph.yaml
@@ -0,0 +1,122 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ best_reward_threshold_for_success: 1
+ max_episode_steps: 400
+ reset_at_iteration: False
+ save_video: False
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: calql-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 10000
+ n_steps: 1 # not used
+ n_episode_per_epoch: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 3e-4
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 3e-4
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ log_freq: 1
+ # CalQL specific
+ train_online: True
+ batch_size: 256
+ n_random_actions: 4
+ target_ema_rate: 0.005
+ scale_reward_factor: 1.0
+ num_update: 1000
+ buffer_size: 1000000
+ online_utd_ratio: 1
+ n_eval_episode: 40
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+ automatic_entropy_tuning: True
+
+model:
+ _target_: model.rl.gaussian_calql.CalQL_Gaussian
+ randn_clip_value: 3
+ cql_min_q_weight: 5.0
+ tanh_output: True
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ discount_factor: ${train.gamma}
+ get_mc_return: True
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml
index c5b2e39..13dfbb4 100644
--- a/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_awr_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 201
n_critic_warmup_itr: 2
n_steps: 400
gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml
index 350bfe6..e143e5e 100644
--- a/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_dql_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 201
n_critic_warmup_itr: 5
n_steps: 400
gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml
index 87f1e5b..0c5fee8 100644
--- a/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_idql_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 201
n_critic_warmup_itr: 5
n_steps: 400
gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
index 47c539e..edbe296 100644
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
@@ -47,16 +47,16 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 500
+ n_train_itr: 201
n_critic_warmup_itr: 2
n_steps: 400
gamma: 0.999
- actor_lr: 1e-5
+ actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-5
+ min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
@@ -100,10 +100,10 @@ model:
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
ft_denoising_steps: ${ft_denoising_steps}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
index 51d3e3a..84355d6 100644
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
@@ -1,7 +1,7 @@
defaults:
- _self_
hydra:
- run:
+ run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
@@ -60,7 +60,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 500
+ n_train_itr: 301
n_critic_warmup_itr: 2
n_steps: 400
gamma: 0.999
@@ -69,13 +69,13 @@ train:
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
- first_cycle_steps: 500
+ first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
- first_cycle_steps: 500
+ first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
@@ -96,7 +96,7 @@ train:
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
- gamma_denoising: 0.9
+ gamma_denoising: 0.99
clip_ploss_coef: 0.01
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
@@ -158,10 +158,10 @@ model:
embed_style: embed2
embed_norm: 0
img_cond_steps: ${img_cond_steps}
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
ft_denoising_steps: ${ft_denoising_steps}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml
new file mode 100644
index 0000000..156154c
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1.yaml
@@ -0,0 +1,112 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_diffusion_mlp_ta1_td20/2024-09-29_02-14-14_42/checkpoint/state_8000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 50
+ name: ${env_name}
+ best_reward_threshold_for_success: 1
+ max_episode_steps: 400
+ save_video: false
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: robomimic-${env_name}-finetune
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 301
+ n_critic_warmup_itr: 2
+ n_steps: 400
+ gamma: 0.999
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # PPO specific
+ reward_scale_running: True
+ reward_scale_const: 1.0
+ gae_lambda: 0.95
+ batch_size: 20000
+ update_epochs: 10
+ vf_coef: 0.5
+ target_kl: 1
+
+model:
+ _target_: model.diffusion.diffusion_ppo.PPODiffusion
+ # HP to tune
+ gamma_denoising: 0.99
+ clip_ploss_coef: 0.01
+ clip_ploss_coef_base: 0.001
+ clip_ploss_coef_rate: 3
+ randn_clip_value: 3
+ min_sampling_denoising_std: 0.1
+ min_logprob_denoising_std: 0.1
+ #
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 32
+ mlp_dims: [1024, 1024, 1024]
+ cond_mlp_dims: [512, 64]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObs
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ ft_denoising_steps: ${ft_denoising_steps}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml
new file mode 100644
index 0000000..c0d8d37
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_ta1_ph.yaml
@@ -0,0 +1,112 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
+
+name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+ft_denoising_steps: 10
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 40
+ name: ${env_name}
+ best_reward_threshold_for_success: 1
+ max_episode_steps: 400
+ save_video: false
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: robomimic-${env_name}-finetune
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 301
+ n_critic_warmup_itr: 2
+ n_steps: 400
+ gamma: 0.999
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-3
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-3
+ save_model_freq: 100
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ # PPO specific
+ reward_scale_running: True
+ reward_scale_const: 1.0
+ gae_lambda: 0.95
+ batch_size: 8000
+ update_epochs: 10
+ vf_coef: 0.5
+ target_kl: 1
+
+model:
+ _target_: model.diffusion.diffusion_ppo.PPODiffusion
+ # HP to tune
+ gamma_denoising: 0.9
+ clip_ploss_coef: 0.01
+ clip_ploss_coef_base: 0.001
+ clip_ploss_coef_rate: 3
+ randn_clip_value: 3
+ min_sampling_denoising_std: 0.1
+ min_logprob_denoising_std: 0.1
+ #
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 32
+ mlp_dims: [1024, 1024, 1024]
+ cond_mlp_dims: [512, 64]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObs
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ mlp_dims: [256, 256, 256]
+ activation_type: Mish
+ residual_style: True
+ ft_denoising_steps: ${ft_denoising_steps}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml
index 1ad16d7..6b17bc5 100644
--- a/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_qsm_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 201
n_critic_warmup_itr: 5
n_steps: 400
gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml
index 2d34101..c27381f 100644
--- a/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_rwr_diffusion_mlp.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 300
+ n_train_itr: 201
n_critic_warmup_itr: 2
n_steps: 400
gamma: 0.999
diff --git a/cfg/robomimic/finetune/square/ibrl_mlp.yaml b/cfg/robomimic/finetune/square/ibrl_mlp.yaml
index 6e34653..fba5969 100644
--- a/cfg/robomimic/finetune/square/ibrl_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ibrl_mlp.yaml
@@ -7,7 +7,7 @@ _target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
name: ${env_name}_ibrl_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path:
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_gaussian_mlp_ta1/2024-09-28_13-42-43_42/checkpoint/state_5000.pt
robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz
@@ -93,7 +93,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObsAct
mlp_dims: [1024, 1024, 1024]
diff --git a/cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml b/cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml
new file mode 100644
index 0000000..f65c1dd
--- /dev/null
+++ b/cfg/robomimic/finetune/square/ibrl_mlp_ph.yaml
@@ -0,0 +1,115 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_ibrl_agent.TrainIBRLAgent
+
+name: ${env_name}_ibrl_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square_pre_gaussian_mlp_ta1/2024-10-08_20-52-42_0/checkpoint/state_5000.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 350 # IBRL uses 300
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 1
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: ibrl-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000000
+ n_steps: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-4
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ save_model_freq: 100000
+ val_freq: 10000
+ render:
+ freq: 10000
+ num: 0
+ log_freq: 200
+ # IBRL specific
+ batch_size: 256
+ target_ema_rate: 0.01
+ scale_reward_factor: 1
+ critic_num_update: 3
+ buffer_size: 400000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ update_freq: 2
+
+model:
+ _target_: model.rl.gaussian_ibrl.IBRL_Gaussian
+ randn_clip_value: 3
+ n_critics: 5
+ soft_action_sample: True
+ soft_action_sample_beta: 10
+ network_path: ${base_policy_path}
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [1024, 1024, 1024]
+ activation_type: ReLU
+ dropout: 0.5
+ fixed_std: 0.1
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [1024, 1024, 1024]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: False # use ensemble
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ max_n_episodes: 100
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml
index 61d7dec..8ab3b3b 100644
--- a/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_awr_diffusion_mlp.yaml
@@ -26,7 +26,7 @@ env:
name: ${env_name}
best_reward_threshold_for_success: 1
max_episode_steps: 800
- save_video: false
+ save_video: False
wrappers:
robomimic_lowdim:
normalization_path: ${normalization_path}
@@ -49,7 +49,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 1000
+ n_train_itr: 201
n_critic_warmup_itr: 2
n_steps: 400
gamma: 0.999
@@ -58,7 +58,7 @@ train:
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-6
+ min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
@@ -82,7 +82,7 @@ train:
model:
_target_: model.diffusion.diffusion_awr.AWRDiffusion
# Sampling HPs
- min_sampling_denoising_std: 0.08
+ min_sampling_denoising_std: 0.1
randn_clip_value: 3
#
network_path: ${base_policy_path}
diff --git a/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml
index ec30a80..1a99f3d 100644
--- a/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_dipo_diffusion_mlp.yaml
@@ -49,7 +49,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 1000
+ n_train_itr: 201
n_critic_warmup_itr: 2
n_steps: 400
gamma: 0.999
@@ -58,7 +58,7 @@ train:
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-6
+ min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
@@ -82,7 +82,7 @@ train:
model:
_target_: model.diffusion.diffusion_dipo.DIPODiffusion
# HP to tune
- min_sampling_denoising_std: 0.08
+ min_sampling_denoising_std: 0.1
randn_clip_value: 3
#
network_path: ${base_policy_path}
@@ -96,12 +96,12 @@ model:
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObsAct
- action_dim: ${action_dim}
- action_steps: ${act_steps}
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
action_dim: ${action_dim}
diff --git a/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml
index 825e9d6..21a760e 100644
--- a/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_dql_diffusion_mlp.yaml
@@ -26,7 +26,7 @@ env:
name: ${env_name}
best_reward_threshold_for_success: 1
max_episode_steps: 800
- save_video: false
+ save_video: False
wrappers:
robomimic_lowdim:
normalization_path: ${normalization_path}
@@ -49,8 +49,8 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 1000
- n_critic_warmup_itr: 2
+ n_train_itr: 201
+ n_critic_warmup_itr: 5
n_steps: 400
gamma: 0.999
actor_lr: 1e-5
@@ -58,7 +58,7 @@ train:
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-6
+ min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
@@ -81,7 +81,7 @@ train:
model:
_target_: model.diffusion.diffusion_dql.DQLDiffusion
# Sampling HPs
- min_sampling_denoising_std: 0.08
+ min_sampling_denoising_std: 0.1
randn_clip_value: 3
#
network_path: ${base_policy_path}
diff --git a/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml
index db690f9..140a39f 100644
--- a/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_idql_diffusion_mlp.yaml
@@ -49,7 +49,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 1000
+ n_train_itr: 201
n_critic_warmup_itr: 5
n_steps: 400
gamma: 0.999
@@ -58,7 +58,7 @@ train:
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-6
+ min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
@@ -83,7 +83,7 @@ train:
model:
_target_: model.diffusion.diffusion_idql.IDQLDiffusion
# Sampling HPs
- min_sampling_denoising_std: 0.08
+ min_sampling_denoising_std: 0.1
randn_clip_value: 3
#
network_path: ${base_policy_path}
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml
index f0418c9..198855b 100644
--- a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp.yaml
@@ -50,16 +50,16 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 1000
+ n_train_itr: 201
n_critic_warmup_itr: 2
n_steps: 400
gamma: 0.999
- actor_lr: 1e-5
+ actor_lr: 1e-4
actor_weight_decay: 0
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-6
+ min_lr: 1e-4
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
@@ -76,7 +76,7 @@ train:
reward_scale_const: 1.0
gae_lambda: 0.95
batch_size: 10000
- update_epochs: 8
+ update_epochs: 5
vf_coef: 0.5
target_kl: 1
@@ -88,7 +88,7 @@ model:
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
- min_sampling_denoising_std: 0.08
+ min_sampling_denoising_std: 0.1
min_logprob_denoising_std: 0.1
#
network_path: ${base_policy_path}
@@ -102,10 +102,10 @@ model:
action_dim: ${action_dim}
critic:
_target_: model.common.critic.CriticObs
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
ft_denoising_steps: ${ft_denoising_steps}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml
index ad22b83..b826e06 100644
--- a/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/transport/ft_ppo_diffusion_mlp_img.yaml
@@ -1,7 +1,7 @@
defaults:
- _self_
hydra:
- run:
+ run:
dir: ${logdir}
_target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
@@ -64,7 +64,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 500
+ n_train_itr: 201
n_critic_warmup_itr: 2
n_steps: 400
gamma: 0.999
@@ -73,13 +73,13 @@ train:
actor_lr: 1e-5
actor_weight_decay: 0
actor_lr_scheduler:
- first_cycle_steps: 500
+ first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-6
+ min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
- first_cycle_steps: 500
+ first_cycle_steps: 1000
warmup_steps: 10
min_lr: 1e-3
save_model_freq: 100
@@ -93,19 +93,19 @@ train:
gae_lambda: 0.95
batch_size: 500
logprob_batch_size: 1000
- update_epochs: 8
+ update_epochs: 10
vf_coef: 0.5
target_kl: 1
model:
_target_: model.diffusion.diffusion_ppo.PPODiffusion
# HP to tune
- gamma_denoising: 0.9
+ gamma_denoising: 0.99
clip_ploss_coef: 0.01
clip_ploss_coef_base: 0.001
clip_ploss_coef_rate: 3
randn_clip_value: 3
- min_sampling_denoising_std: 0.08
+ min_sampling_denoising_std: 0.1
min_logprob_denoising_std: 0.1
#
use_ddim: ${use_ddim}
@@ -164,10 +164,10 @@ model:
embed_style: embed2
embed_norm: 0
img_cond_steps: ${img_cond_steps}
- cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
mlp_dims: [256, 256, 256]
activation_type: Mish
residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
ft_denoising_steps: ${ft_denoising_steps}
horizon_steps: ${horizon_steps}
obs_dim: ${obs_dim}
diff --git a/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml
index 4072238..f116ef5 100644
--- a/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_qsm_diffusion_mlp.yaml
@@ -49,7 +49,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 1000
+ n_train_itr: 201
n_critic_warmup_itr: 5
n_steps: 400
gamma: 0.999
@@ -58,7 +58,7 @@ train:
actor_lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-6
+ min_lr: 1e-5
critic_lr: 1e-3
critic_weight_decay: 0
critic_lr_scheduler:
@@ -81,7 +81,7 @@ train:
model:
_target_: model.diffusion.diffusion_qsm.QSMDiffusion
# Sampling HPs
- min_sampling_denoising_std: 0.08
+ min_sampling_denoising_std: 0.1
randn_clip_value: 3
#
network_path: ${base_policy_path}
diff --git a/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml b/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml
index af9e9cb..40cd186 100644
--- a/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/transport/ft_rwr_diffusion_mlp.yaml
@@ -49,7 +49,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 1000
+ n_train_itr: 201
n_critic_warmup_itr: 2
n_steps: 400
gamma: 0.999
@@ -58,7 +58,7 @@ train:
lr_scheduler:
first_cycle_steps: 1000
warmup_steps: 10
- min_lr: 1e-6
+ min_lr: 1e-5
save_model_freq: 100
val_freq: 10
render:
@@ -73,7 +73,7 @@ train:
model:
_target_: model.diffusion.diffusion_rwr.RWRDiffusion
# Sampling HPs
- min_sampling_denoising_std: 0.08
+ min_sampling_denoising_std: 0.1
randn_clip_value: 3
#
network_path: ${base_policy_path}
diff --git a/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml b/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml
index 3c610a1..0fd05ae 100644
--- a/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml
+++ b/cfg/robomimic/pretrain/can/calql_mlp_offline.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 100
+ n_train_itr: 1000
n_steps: 1
gamma: 0.99
actor_lr: 1e-4
@@ -61,8 +61,8 @@ train:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 3e-4
- save_model_freq: 10
- val_freq: 10
+ save_model_freq: 100
+ val_freq: 20
render:
freq: 1
num: 0
@@ -70,7 +70,7 @@ train:
# CalQL specific
train_online: False
batch_size: 256
- n_random_actions: 4
+ n_random_actions: 10
target_ema_rate: 0.005
scale_reward_factor: 1.0
num_update: 1000
@@ -93,7 +93,7 @@ model:
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
diff --git a/cfg/robomimic/pretrain/can/calql_mlp_offline_ph.yaml b/cfg/robomimic/pretrain/can/calql_mlp_offline_ph.yaml
new file mode 100644
index 0000000..a70d4aa
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/calql_mlp_offline_ph.yaml
@@ -0,0 +1,118 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ best_reward_threshold_for_success: 1
+ max_episode_steps: 300
+ reset_at_iteration: False
+ save_video: False
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: calql-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000
+ n_steps: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 3e-4
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 3e-4
+ save_model_freq: 10
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ log_freq: 1
+ # CalQL specific
+ train_online: False
+ batch_size: 256
+ n_random_actions: 4
+ target_ema_rate: 0.005
+ scale_reward_factor: 1.0
+ num_update: 1000
+ buffer_size: 1000000
+ n_eval_episode: 10
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+ automatic_entropy_tuning: True
+
+model:
+ _target_: model.rl.gaussian_calql.CalQL_Gaussian
+ randn_clip_value: 3
+ cql_min_q_weight: 5.0
+ tanh_output: True
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ discount_factor: ${train.gamma}
+ get_mc_return: True
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1.yaml
new file mode 100644
index 0000000..62a09e8
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1.yaml
@@ -0,0 +1,65 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+horizon_steps: 1
+cond_steps: 1
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: robomimic-${env}-pretrain
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_epochs: 8000
+ batch_size: 256
+ learning_rate: 1e-4
+ weight_decay: 1e-6
+ lr_scheduler:
+ first_cycle_steps: 10000
+ warmup_steps: 100
+ min_lr: 1e-5
+ epoch_start_ema: 20
+ update_ema_freq: 10
+ save_model_freq: 1000
+
+model:
+ _target_: model.diffusion.diffusion.DiffusionModel
+ predict_epsilon: True
+ denoised_clip_value: 1.0
+ network:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [512, 512, 512]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
+
+ema:
+ decay: 0.995
+
+train_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceDataset
+ dataset_path: ${train_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1_ph.yaml
new file mode 100644
index 0000000..46593c6
--- /dev/null
+++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ta1_ph.yaml
@@ -0,0 +1,65 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env: can
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+horizon_steps: 1
+cond_steps: 1
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: robomimic-${env}-pretrain
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_epochs: 8000
+ batch_size: 256
+ learning_rate: 1e-4
+ weight_decay: 1e-6
+ lr_scheduler:
+ first_cycle_steps: 10000
+ warmup_steps: 100
+ min_lr: 1e-5
+ epoch_start_ema: 20
+ update_ema_freq: 10
+ save_model_freq: 1000
+
+model:
+ _target_: model.diffusion.diffusion.DiffusionModel
+ predict_epsilon: True
+ denoised_clip_value: 1.0
+ network:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 16
+ mlp_dims: [512, 512, 512]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
+
+ema:
+ decay: 0.995
+
+train_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceDataset
+ dataset_path: ${train_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ta1_ph.yaml
similarity index 85%
rename from cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml
rename to cfg/robomimic/pretrain/can/pre_gaussian_mlp_ta1_ph.yaml
index c7e0d9c..1bb170d 100644
--- a/cfg/robomimic/pretrain/transport/pre_gaussian_mlp_ibrl.yaml
+++ b/cfg/robomimic/pretrain/can/pre_gaussian_mlp_ta1_ph.yaml
@@ -7,13 +7,13 @@ _target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz
seed: 42
device: cuda:0
-env: transport
-obs_dim: 59
-action_dim: 14
+env: can
+obs_dim: 23
+action_dim: 7
horizon_steps: 1
cond_steps: 1
@@ -26,11 +26,11 @@ train:
n_epochs: 5000
batch_size: 256
learning_rate: 1e-4
- weight_decay: 0
+ weight_decay: 1e-6
lr_scheduler:
first_cycle_steps: 5000
warmup_steps: 100
- min_lr: 1e-4
+ min_lr: 1e-5
epoch_start_ema: 20
update_ema_freq: 10
save_model_freq: 1000
@@ -45,7 +45,7 @@ model:
fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
- action_dim: ${action_dim}
+ action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}
diff --git a/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml b/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml
index 1cf5527..cb52740 100644
--- a/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml
+++ b/cfg/robomimic/pretrain/square/calql_mlp_offline.yaml
@@ -46,7 +46,7 @@ wandb:
run: ${now:%H-%M-%S}_${name}
train:
- n_train_itr: 100
+ n_train_itr: 1000
n_steps: 1
gamma: 0.99
actor_lr: 1e-4
@@ -61,8 +61,8 @@ train:
first_cycle_steps: 1000
warmup_steps: 10
min_lr: 3e-4
- save_model_freq: 10
- val_freq: 10
+ save_model_freq: 100
+ val_freq: 20
render:
freq: 1
num: 0
@@ -70,7 +70,7 @@ train:
# CalQL specific
train_online: False
batch_size: 256
- n_random_actions: 4
+ n_random_actions: 10
target_ema_rate: 0.005
scale_reward_factor: 1.0
num_update: 1000
@@ -93,7 +93,7 @@ model:
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
diff --git a/cfg/robomimic/pretrain/square/calql_mlp_offline_ph.yaml b/cfg/robomimic/pretrain/square/calql_mlp_offline_ph.yaml
new file mode 100644
index 0000000..5e541a4
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/calql_mlp_offline_ph.yaml
@@ -0,0 +1,118 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_calql_agent.TrainCalQLAgent
+
+name: ${env_name}_calql_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ best_reward_threshold_for_success: 1
+ max_episode_steps: 400
+ reset_at_iteration: False
+ save_video: False
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: calql-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000
+ n_steps: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 3e-4
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 3e-4
+ save_model_freq: 10
+ val_freq: 10
+ render:
+ freq: 1
+ num: 0
+ log_freq: 1
+ # CalQL specific
+ train_online: False
+ batch_size: 256
+ n_random_actions: 4
+ target_ema_rate: 0.005
+ scale_reward_factor: 1.0
+ num_update: 1000
+ buffer_size: 1000000
+ n_eval_episode: 10
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+ automatic_entropy_tuning: True
+
+model:
+ _target_: model.rl.gaussian_calql.CalQL_Gaussian
+ randn_clip_value: 3
+ cql_min_q_weight: 5.0
+ tanh_output: True
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
+ discount_factor: ${train.gamma}
+ get_mc_return: True
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1.yaml
new file mode 100644
index 0000000..53e572e
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1.yaml
@@ -0,0 +1,66 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+horizon_steps: 1
+cond_steps: 1
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: robomimic-${env}-pretrain
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_epochs: 8000
+ batch_size: 256
+ learning_rate: 1e-4
+ weight_decay: 1e-6
+ lr_scheduler:
+ first_cycle_steps: 10000
+ warmup_steps: 100
+ min_lr: 1e-5
+ epoch_start_ema: 20
+ update_ema_freq: 10
+ save_model_freq: 1000
+
+model:
+ _target_: model.diffusion.diffusion.DiffusionModel
+ predict_epsilon: True
+ denoised_clip_value: 1.0
+ network:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 32
+ mlp_dims: [1024, 1024, 1024]
+ cond_mlp_dims: [512, 64]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
+
+ema:
+ decay: 0.995
+
+train_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceDataset
+ dataset_path: ${train_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1_ph.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1_ph.yaml
new file mode 100644
index 0000000..7bffecd
--- /dev/null
+++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ta1_ph.yaml
@@ -0,0 +1,66 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent
+
+name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env: square
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+horizon_steps: 1
+cond_steps: 1
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: robomimic-${env}-pretrain
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_epochs: 8000
+ batch_size: 256
+ learning_rate: 1e-4
+ weight_decay: 1e-6
+ lr_scheduler:
+ first_cycle_steps: 10000
+ warmup_steps: 100
+ min_lr: 1e-5
+ epoch_start_ema: 20
+ update_ema_freq: 10
+ save_model_freq: 1000
+
+model:
+ _target_: model.diffusion.diffusion.DiffusionModel
+ predict_epsilon: True
+ denoised_clip_value: 1.0
+ network:
+ _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+ time_dim: 32
+ mlp_dims: [1024, 1024, 1024]
+ cond_mlp_dims: [512, 64]
+ residual_style: True
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ horizon_steps: ${horizon_steps}
+ obs_dim: ${obs_dim}
+ action_dim: ${action_dim}
+ denoising_steps: ${denoising_steps}
+ device: ${device}
+
+ema:
+ decay: 0.995
+
+train_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceDataset
+ dataset_path: ${train_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ta1_ph.yaml
similarity index 84%
rename from cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml
rename to cfg/robomimic/pretrain/square/pre_gaussian_mlp_ta1_ph.yaml
index 11d3f08..0cc2204 100644
--- a/cfg/robomimic/pretrain/lift/pre_gaussian_mlp_ibrl.yaml
+++ b/cfg/robomimic/pretrain/square/pre_gaussian_mlp_ta1_ph.yaml
@@ -7,12 +7,12 @@ _target_: agent.pretrain.train_gaussian_agent.TrainGaussianAgent
name: ${env}_pre_gaussian_mlp_ta${horizon_steps}
logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}/train.npz
+train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz
seed: 42
device: cuda:0
-env: lift
-obs_dim: 19
+env: square
+obs_dim: 23
action_dim: 7
horizon_steps: 1
cond_steps: 1
@@ -40,14 +40,15 @@ model:
network:
_target_: model.common.mlp_gaussian.Gaussian_MLP
mlp_dims: [1024, 1024, 1024]
- residual_style: False
+ activation_type: ReLU
+ dropout: 0.5
+ fixed_std: 0.1
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
- action_dim: ${action_dim}
+ action_dim: ${action_dim}
horizon_steps: ${horizon_steps}
device: ${device}
-
ema:
decay: 0.995
diff --git a/cfg/robomimic/finetune/can/rlpd_mlp.yaml b/cfg/robomimic/scratch/can/rlpd_mlp.yaml
similarity index 98%
rename from cfg/robomimic/finetune/can/rlpd_mlp.yaml
rename to cfg/robomimic/scratch/can/rlpd_mlp.yaml
index 4f5a948..8b66075 100644
--- a/cfg/robomimic/finetune/can/rlpd_mlp.yaml
+++ b/cfg/robomimic/scratch/can/rlpd_mlp.yaml
@@ -91,7 +91,7 @@ model:
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
diff --git a/cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml b/cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml
new file mode 100644
index 0000000..d574d5a
--- /dev/null
+++ b/cfg/robomimic/scratch/can/rlpd_mlp_ph.yaml
@@ -0,0 +1,114 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
+
+name: ${env_name}_rlpd_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 300
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 1
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: rlpd-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000000
+ n_steps: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-4
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ save_model_freq: 100000
+ val_freq: 10000
+ render:
+ freq: 10000
+ num: 0
+ log_freq: 200
+ # RLPD specific
+ batch_size: 256
+ target_ema_rate: 0.01
+ scale_reward_factor: 1
+ critic_num_update: 3
+ buffer_size: 400000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+
+model:
+ _target_: model.rl.gaussian_rlpd.RLPD_Gaussian
+ randn_clip_value: 10
+ backup_entropy: True
+ n_critics: 5
+ tanh_output: True
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: False # use ensemble
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/cfg/robomimic/finetune/square/rlpd_mlp.yaml b/cfg/robomimic/scratch/square/rlpd_mlp.yaml
similarity index 98%
rename from cfg/robomimic/finetune/square/rlpd_mlp.yaml
rename to cfg/robomimic/scratch/square/rlpd_mlp.yaml
index d62a41d..46730a7 100644
--- a/cfg/robomimic/finetune/square/rlpd_mlp.yaml
+++ b/cfg/robomimic/scratch/square/rlpd_mlp.yaml
@@ -91,7 +91,7 @@ model:
tanh_output: False # squash after sampling instead
cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
horizon_steps: ${horizon_steps}
-
+ action_dim: ${action_dim}
std_max: 7.3891
std_min: 0.0067
critic:
diff --git a/cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml b/cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml
new file mode 100644
index 0000000..cb8a8b4
--- /dev/null
+++ b/cfg/robomimic/scratch/square/rlpd_mlp_ph.yaml
@@ -0,0 +1,114 @@
+defaults:
+ - _self_
+hydra:
+ run:
+ dir: ${logdir}
+_target_: agent.finetune.train_rlpd_agent.TrainRLPDAgent
+
+name: ${env_name}_rlpd_mlp_ta${horizon_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz
+offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+cond_steps: 1
+horizon_steps: 1
+act_steps: 1
+
+env:
+ n_envs: 1
+ name: ${env_name}
+ max_episode_steps: 400
+ reset_at_iteration: False
+ save_video: False
+ best_reward_threshold_for_success: 1
+ wrappers:
+ robomimic_lowdim:
+ normalization_path: ${normalization_path}
+ low_dim_keys: ['robot0_eef_pos',
+ 'robot0_eef_quat',
+ 'robot0_gripper_qpos',
+ 'object'] # same order of preprocessed observations
+ multi_step:
+ n_obs_steps: ${cond_steps}
+ n_action_steps: ${act_steps}
+ max_episode_steps: ${env.max_episode_steps}
+ reset_within_step: True
+
+wandb:
+ entity: ${oc.env:DPPO_WANDB_ENTITY}
+ project: rlpd-${env_name}
+ run: ${now:%H-%M-%S}_${name}
+
+train:
+ n_train_itr: 1000000
+ n_steps: 1
+ gamma: 0.99
+ actor_lr: 1e-4
+ actor_weight_decay: 0
+ actor_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ critic_lr: 1e-4
+ critic_weight_decay: 0
+ critic_lr_scheduler:
+ first_cycle_steps: 1000
+ warmup_steps: 10
+ min_lr: 1e-4
+ save_model_freq: 100000
+ val_freq: 10000
+ render:
+ freq: 10000
+ num: 0
+ log_freq: 200
+ # RLPD specific
+ batch_size: 256
+ target_ema_rate: 0.01
+ scale_reward_factor: 1
+ critic_num_update: 3
+ buffer_size: 400000
+ n_eval_episode: 40
+ n_explore_steps: 0
+ target_entropy: ${eval:'- ${action_dim} * ${act_steps}'}
+ init_temperature: 1
+
+model:
+ _target_: model.rl.gaussian_rlpd.RLPD_Gaussian
+ randn_clip_value: 10
+ backup_entropy: True
+ n_critics: 5
+ tanh_output: True
+ actor:
+ _target_: model.common.mlp_gaussian.Gaussian_MLP
+ mlp_dims: [512, 512, 512]
+ activation_type: ReLU
+ tanh_output: False # squash after sampling instead
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ horizon_steps: ${horizon_steps}
+ action_dim: ${action_dim}
+ std_max: 7.3891
+ std_min: 0.0067
+ critic:
+ _target_: model.common.critic.CriticObsAct
+ mlp_dims: [256, 256, 256]
+ activation_type: ReLU
+ use_layernorm: True
+ double_q: False # use ensemble
+ cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+ action_dim: ${action_dim}
+ action_steps: ${act_steps}
+ horizon_steps: ${horizon_steps}
+ device: ${device}
+
+offline_dataset:
+ _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset
+ dataset_path: ${offline_dataset_path}
+ horizon_steps: ${horizon_steps}
+ cond_steps: ${cond_steps}
+ device: ${device}
\ No newline at end of file
diff --git a/env/gym_utils/__init__.py b/env/gym_utils/__init__.py
index cea639c..e70870a 100644
--- a/env/gym_utils/__init__.py
+++ b/env/gym_utils/__init__.py
@@ -165,7 +165,9 @@ def make_async(
# https://github.com/ARISE-Initiative/robosuite/blob/92abf5595eddb3a845cd1093703e5a3ccd01e77e/robosuite/environments/base.py#L247-L248
env.env.hard_reset = False
else: # d3il, gym
- env = make_(id, render=render, **kwargs)
+ if "kitchen" not in id: # d4rl kitchen does not support rendering!
+ kwargs["render"] = render
+ env = make_(id, **kwargs)
# add wrappers
if wrappers is not None:
diff --git a/env/gym_utils/wrapper/furniture.py b/env/gym_utils/wrapper/furniture.py
index 3c02895..3ca67e1 100644
--- a/env/gym_utils/wrapper/furniture.py
+++ b/env/gym_utils/wrapper/furniture.py
@@ -132,9 +132,10 @@ class FurnitureRLSimEnvMultiStepWrapper(gym.Wrapper):
nobs: np.ndarray = self.process_obs(obs)
truncated: np.ndarray = truncated.squeeze().cpu().numpy()
- terminated: np.ndarray = np.zeros_like(truncated, dtype=bool)
+ # terminated: np.ndarray = np.zeros_like(truncated, dtype=bool)
- return {"state": nobs}, reward, terminated, truncated, info
+ # Reward is only assigned at the timestep where a stage is finished and does not accumulate, so the final step of the episode is treated as terminal: `truncated` is returned as both the terminated and the truncated flag.
+ return {"state": nobs}, reward, truncated, truncated, info
def _inner_step(self, action_chunk: torch.Tensor):
dense_reward = torch.zeros(action_chunk.shape[0], device=action_chunk.device)
diff --git a/model/common/mlp.py b/model/common/mlp.py
index 3322af9..4ab137c 100644
--- a/model/common/mlp.py
+++ b/model/common/mlp.py
@@ -96,6 +96,7 @@ class ResidualMLP(nn.Module):
out_activation_type="Identity",
use_layernorm=False,
use_layernorm_final=False,
+ dropout=0,
):
super(ResidualMLP, self).__init__()
hidden_dim = dim_list[1]
@@ -108,6 +109,7 @@ class ResidualMLP(nn.Module):
hidden_dim=hidden_dim,
activation_type=activation_type,
use_layernorm=use_layernorm,
+ dropout=dropout,
)
for _ in range(1, num_hidden_layers, 2)
]
@@ -129,6 +131,7 @@ class TwoLayerPreActivationResNetLinear(nn.Module):
hidden_dim,
activation_type="Mish",
use_layernorm=False,
+ dropout=0,
):
super().__init__()
self.l1 = nn.Linear(hidden_dim, hidden_dim)
@@ -137,6 +140,8 @@ class TwoLayerPreActivationResNetLinear(nn.Module):
if use_layernorm:
self.norm1 = nn.LayerNorm(hidden_dim, eps=1e-06)
self.norm2 = nn.LayerNorm(hidden_dim, eps=1e-06)
+ if dropout > 0:
+ raise NotImplementedError("Dropout not implemented for residual MLP!")
def forward(self, x):
x_input = x
diff --git a/model/common/mlp_gaussian.py b/model/common/mlp_gaussian.py
index e05dbed..dbd10cf 100644
--- a/model/common/mlp_gaussian.py
+++ b/model/common/mlp_gaussian.py
@@ -212,6 +212,7 @@ class Gaussian_MLP(nn.Module):
out_activation_type=activation_type,
use_layernorm=use_layernorm,
use_layernorm_final=use_layernorm,
+ dropout=dropout,
)
self.mlp_mean = MLP(
mlp_dims[-1:] + [output_dim],
@@ -233,9 +234,7 @@ class Gaussian_MLP(nn.Module):
if learn_fixed_std:
# initialize to fixed_std
self.logvar = torch.nn.Parameter(
- torch.log(
- torch.tensor([fixed_std**2 for _ in range(action_dim)])
- ),
+ torch.log(torch.tensor([fixed_std**2 for _ in range(action_dim)])),
requires_grad=True,
)
self.logvar_min = torch.nn.Parameter(
diff --git a/model/diffusion/diffusion_ppo.py b/model/diffusion/diffusion_ppo.py
index 9c13863..1c574a3 100644
--- a/model/diffusion/diffusion_ppo.py
+++ b/model/diffusion/diffusion_ppo.py
@@ -22,7 +22,6 @@ from model.diffusion.diffusion_vpg import VPGDiffusion
class PPODiffusion(VPGDiffusion):
-
def __init__(
self,
gamma_denoising: float,
@@ -58,7 +57,9 @@ class PPODiffusion(VPGDiffusion):
def loss(
self,
obs,
- chains,
+ chains_prev,
+ chains_next,
+ denoising_inds,
returns,
oldvalues,
advantages,
@@ -81,9 +82,11 @@ class PPODiffusion(VPGDiffusion):
reward_horizon: action horizon that backpropagates gradient
"""
# Get new logprobs for denoising steps from T-1 to 0 - entropy is fixed fod diffusion
- newlogprobs, eta = self.get_logprobs(
+ newlogprobs, eta = self.get_logprobs_subsample(
obs,
- chains,
+ chains_prev,
+ chains_next,
+ denoising_inds,
get_ent=True,
)
entropy_loss = -eta.mean()
@@ -92,7 +95,7 @@ class PPODiffusion(VPGDiffusion):
# only backpropagate through the earlier steps (e.g., ones actually executed in the environment)
newlogprobs = newlogprobs[:, :reward_horizon, :]
- oldlogprobs = oldlogprobs[:, :, :reward_horizon, :]
+ oldlogprobs = oldlogprobs[:, :reward_horizon, :]
# Get the logprobs - batch over B and denoising steps
newlogprobs = newlogprobs.mean(dim=(-1, -2)).view(-1)
@@ -106,9 +109,7 @@ class PPODiffusion(VPGDiffusion):
# Get counterfactual teacher actions
samples = self.forward(
- cond=obs.float()
- .unsqueeze(1)
- .to(self.device), # B x horizon=1 x obs_dim
+ cond=obs,
deterministic=False,
return_chain=True,
use_base_policy=True,
@@ -116,7 +117,7 @@ class PPODiffusion(VPGDiffusion):
# Get logprobs of teacher actions under this policy
bc_logprobs = self.get_logprobs(
obs,
- samples.chains, # n_env x denoising x horizon x act
+ samples.chains,
get_ent=False,
use_base_policy=False,
)
@@ -133,14 +134,13 @@ class PPODiffusion(VPGDiffusion):
advantage_max = torch.quantile(advantages, self.clip_advantage_upper_quantile)
advantages = advantages.clamp(min=advantage_min, max=advantage_max)
- # repeat advantages for denoising steps and horizon steps
- advantages = advantages.repeat_interleave(self.ft_denoising_steps)
-
# denoising discount
discount = torch.tensor(
- [self.gamma_denoising**i for i in reversed(range(self.ft_denoising_steps))]
+ [
+ self.gamma_denoising ** (self.ft_denoising_steps - i - 1)
+ for i in denoising_inds
+ ]
).to(self.device)
- discount = discount.repeat(len(advantages) // self.ft_denoising_steps)
advantages *= discount
# get ratio
@@ -148,9 +148,7 @@ class PPODiffusion(VPGDiffusion):
ratio = logratio.exp()
# exponentially interpolate between the base and the current clipping value over denoising steps and repeat
- t = torch.arange(self.ft_denoising_steps).float().to(self.device) / (
- self.ft_denoising_steps - 1
- ) # 0 to 1
+ t = (denoising_inds.float() / (self.ft_denoising_steps - 1)).to(self.device)
if self.ft_denoising_steps > 1:
clip_ploss_coef = self.clip_ploss_coef_base + (
self.clip_ploss_coef - self.clip_ploss_coef_base
@@ -158,10 +156,7 @@ class PPODiffusion(VPGDiffusion):
math.exp(self.clip_ploss_coef_rate) - 1
)
else:
- clip_ploss_coef = torch.tensor([self.clip_ploss_coef]).to(self.device)
- clip_ploss_coef = clip_ploss_coef.repeat(
- len(advantages) // self.ft_denoising_steps
- )
+ clip_ploss_coef = torch.full_like(t, self.clip_ploss_coef) # single denoising step: t is 0/0 (NaN), so use the constant coefficient as before
# get kl difference and whether value clipped
with torch.no_grad():
diff --git a/model/diffusion/diffusion_vpg.py b/model/diffusion/diffusion_vpg.py
index e61b716..cfa9a5a 100644
--- a/model/diffusion/diffusion_vpg.py
+++ b/model/diffusion/diffusion_vpg.py
@@ -395,6 +395,71 @@ class VPGDiffusion(DiffusionModel):
return log_prob, eta
return log_prob
+ def get_logprobs_subsample(
+ self,
+ cond,
+ chains_prev,
+ chains_next,
+ denoising_inds,
+ get_ent: bool = False,
+ use_base_policy: bool = False,
+ ):
+ """
+ Calculating the logprobs of randomly sampled single denoising transitions.
+
+ Args:
+ cond: dict with key state/rgb; more recent obs at the end
+ state: (B, To, Do)
+ rgb: (B, To, C, H, W)
+ chains_prev: input of the denoising step, fed to p_mean_var
+ chains_next: outcome of the denoising step, scored under the predicted Gaussian
+ denoising_inds: indices (presumably (B,)) into the fine-tuned denoising steps
+ get_ent: flag for returning entropy
+ use_base_policy: flag for using base policy
+
+ Returns:
+ logprobs: (B, Ta, Da); if get_ent=True, also eta from p_mean_var (entropy, (B, Ta))
+ """
+ # Sample t for batch dim, keep it 1-dim
+ if self.use_ddim:
+ t_single = self.ddim_t[-self.ft_denoising_steps :]
+ else:
+ t_single = torch.arange(
+ start=self.ft_denoising_steps - 1,
+ end=-1,
+ step=-1,
+ device=self.device,
+ )
+ # look up the diffusion timestep for each sampled denoising index
+ t_all = t_single[denoising_inds]
+ if self.use_ddim:
+ ddim_indices_single = torch.arange(
+ start=self.ddim_steps - self.ft_denoising_steps,
+ end=self.ddim_steps,
+ device=self.device,
+ ) # only used for DDIM
+ ddim_indices = ddim_indices_single[denoising_inds]
+ else:
+ ddim_indices = None
+
+ # Forward pass with previous chains
+ next_mean, logvar, eta = self.p_mean_var(
+ chains_prev,
+ t_all,
+ cond=cond,
+ index=ddim_indices,
+ use_base_policy=use_base_policy,
+ )
+ std = torch.exp(0.5 * logvar)
+ std = torch.clip(std, min=self.min_logprob_denoising_std)
+ dist = Normal(next_mean, std)
+
+ # Get logprobs with gaussian
+ log_prob = dist.log_prob(chains_next)
+ if get_ent:
+ return log_prob, eta
+ return log_prob
+
def loss(self, cond, chains, reward):
"""
REINFORCE loss. Not used right now.
diff --git a/model/rl/gaussian_calql.py b/model/rl/gaussian_calql.py
index 14d87f0..0ea9ddb 100644
--- a/model/rl/gaussian_calql.py
+++ b/model/rl/gaussian_calql.py
@@ -63,7 +63,6 @@ class CalQL_Gaussian(GaussianModel):
returns,
terminated,
gamma,
- alpha,
):
B = len(actions)
@@ -71,17 +70,17 @@ class CalQL_Gaussian(GaussianModel):
q_data1, q_data2 = self.critic(obs, actions)
with torch.no_grad():
# repeat for action samples
- next_obs["state"] = next_obs["state"].repeat_interleave(
+ next_obs_repeated = {"state": next_obs["state"].repeat_interleave(
self.cql_n_actions, dim=0
- )
+ )}
# Get the next actions and logprobs
next_actions, next_logprobs = self.forward(
- next_obs,
+ next_obs_repeated,
deterministic=False,
get_logprob=True,
)
- next_q1, next_q2 = self.target_critic(next_obs, next_actions)
+ next_q1, next_q2 = self.target_critic(next_obs_repeated, next_actions)
next_q = torch.min(next_q1, next_q2)
# Reshape the next_q to match the number of samples
@@ -96,9 +95,6 @@ class CalQL_Gaussian(GaussianModel):
# Get the target Q values
target_q = rewards + gamma * (1 - terminated) * next_q
- # Subtract the entropy bonus
- target_q = target_q - alpha * next_logprobs
-
# TD loss
td_loss_1 = nn.functional.mse_loss(q_data1, target_q)
td_loss_2 = nn.functional.mse_loss(q_data2, target_q)
@@ -111,6 +107,12 @@ class CalQL_Gaussian(GaussianModel):
reparameterize=False,
get_logprob=True,
) # no gradient
+ pi_next_actions, log_pi_next = self.forward(
+ next_obs,
+ deterministic=False,
+ reparameterize=False,
+ get_logprob=True,
+ ) # no gradient
# Random action Q values
n_random_actions = random_actions.shape[1]
@@ -130,17 +132,26 @@ class CalQL_Gaussian(GaussianModel):
# Policy action Q values
q_pi_1, q_pi_2 = self.critic(obs, pi_actions)
- q_pi_1 = q_pi_1 - log_pi
- q_pi_2 = q_pi_2 - log_pi
+ q_pi_next_1, q_pi_next_2 = self.critic(next_obs, pi_next_actions)
# Ensure calibration w.r.t. value function estimate
q_pi_1 = torch.max(q_pi_1, returns)[:, None] # (B, 1)
q_pi_2 = torch.max(q_pi_2, returns)[:, None] # (B, 1)
- cat_q_1 = torch.cat([q_rand_1, q_pi_1], dim=-1) # (B, num_samples+1)
+ q_pi_next_1 = torch.max(q_pi_next_1, returns)[:, None] # (B, 1)
+ q_pi_next_2 = torch.max(q_pi_next_2, returns)[:, None] # (B, 1)
+
+ # cql_importance_sample
+ q_pi_1 = q_pi_1 - log_pi
+ q_pi_2 = q_pi_2 - log_pi
+ q_pi_next_1 = q_pi_next_1 - log_pi_next
+ q_pi_next_2 = q_pi_next_2 - log_pi_next
+ cat_q_1 = torch.cat([q_rand_1, q_pi_1, q_pi_next_1], dim=-1) # (B, num_samples+2)
cql_qf1_ood = torch.logsumexp(cat_q_1, dim=-1) # max over num_samples
- cat_q_2 = torch.cat([q_rand_2, q_pi_2], dim=-1) # (B, num_samples+1)
+ cat_q_2 = torch.cat([q_rand_2, q_pi_2, q_pi_next_2], dim=-1) # (B, num_samples+2)
cql_qf2_ood = torch.logsumexp(cat_q_2, dim=-1) # sum over num_samples
+ # skip cal_lagrange since the paper shows cql_target_action_gap not used in kitchen
+
# Subtract the log likelihood of the data
cql_qf1_diff = torch.clamp(
cql_qf1_ood - q_data1,
diff --git a/model/rl/gaussian_ibrl.py b/model/rl/gaussian_ibrl.py
index ce96232..4a87f2d 100644
--- a/model/rl/gaussian_ibrl.py
+++ b/model/rl/gaussian_ibrl.py
@@ -20,7 +20,7 @@ class IBRL_Gaussian(GaussianModel):
critic,
n_critics,
soft_action_sample=False,
- soft_action_sample_beta=0.1,
+ soft_action_sample_beta=10,
**kwargs,
):
super().__init__(network=actor, **kwargs)
diff --git a/model/rl/gaussian_ppo.py b/model/rl/gaussian_ppo.py
index a7e3be8..05c047d 100644
--- a/model/rl/gaussian_ppo.py
+++ b/model/rl/gaussian_ppo.py
@@ -63,6 +63,23 @@ class PPO_Gaussian(VPG_Gaussian):
oldlogprobs = oldlogprobs.clamp(min=-5, max=2)
entropy_loss = -entropy
+ bc_loss = 0.0
+ if use_bc_loss:
+ # See Eqn. 2 of https://arxiv.org/pdf/2403.03949.pdf
+ # Give a reward for maximizing probability of teacher policy's action with current policy.
+ # Actions are chosen along trajectory induced by current policy.
+
+ # Get counterfactual teacher actions
+ samples = self.forward(
+ cond=obs,
+ deterministic=False,
+ use_base_policy=True,
+ )
+ # Get logprobs of teacher actions under this policy
+ bc_logprobs, _, _ = self.get_logprobs(obs, samples, use_base_policy=False)
+ bc_logprobs = bc_logprobs.clamp(min=-5, max=2)
+ bc_loss = -bc_logprobs.mean()
+
# get ratio
logratio = newlogprobs - oldlogprobs
ratio = logratio.exp()
@@ -99,25 +116,6 @@ class PPO_Gaussian(VPG_Gaussian):
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((newvalues - returns) ** 2).mean()
-
- bc_loss = 0.0
- if use_bc_loss:
- # See Eqn. 2 of https://arxiv.org/pdf/2403.03949.pdf
- # Give a reward for maximizing probability of teacher policy's action with current policy.
- # Actions are chosen along trajectory induced by current policy.
-
- # Get counterfactual teacher actions
- samples = self.forward(
- cond=obs.float()
- .unsqueeze(1)
- .to(self.device), # B x horizon=1 x obs_dim
- deterministic=False,
- use_base_policy=True,
- )
- # Get logprobs of teacher actions under this policy
- bc_logprobs, _, _ = self.get_logprobs(obs, samples, use_base_policy=False)
- bc_logprobs = bc_logprobs.clamp(min=-5, max=2)
- bc_loss = -bc_logprobs.mean()
return (
pg_loss,
entropy_loss,
diff --git a/pyproject.toml b/pyproject.toml
index 0191c91..b1dbffe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "dppo"
-version = "0.5.0"
+version = "0.6.0"
description = "Fine-tuning diffusion policies with PPO."
readme = "README.md"
requires-python = ">=3.8"
@@ -32,6 +32,13 @@ gym = [
"d4rl",
"patchelf",
]
+kitchen = [
+ "cython<3",
+ "d4rl",
+ "dm_control==1.0.16",
+ "mujoco==3.1.6",
+ "patchelf",
+]
robomimic = [
"cython<3",
"d4rl",
diff --git a/script/download_url.py b/script/download_url.py
index 9838088..1fa2069 100644
--- a/script/download_url.py
+++ b/script/download_url.py
@@ -7,6 +7,12 @@ def get_dataset_download_url(cfg):
return "https://drive.google.com/drive/u/1/folders/1BJu8NklriunDHsDrLT6fEpcro3_2IPFf"
elif env == "halfcheetah-medium-v2":
return "https://drive.google.com/drive/u/1/folders/1Drel26tiuQ9oD3YNf1eyy0UVaf5SQj-U"
+ elif env == "kitchen-complete-v0":
+ return "https://drive.google.com/drive/u/1/folders/18aqg7KIv-YNXohTsRR7Zmg-RyDtdhkLc"
+ elif env == "kitchen-partial-v0":
+ return "https://drive.google.com/drive/u/1/folders/1zLOx1q4FbJK1ZWLui_vhM2x1fMEkBC2D"
+ elif env == "kitchen-mixed-v0":
+ return "https://drive.google.com/drive/u/1/folders/1HRMM16UC10A00oBqjYOL1E8hS5icwtvo"
# D3IL
elif env == "avoid" and cfg.mode == "d56_r12": # M1
return "https://drive.google.com/drive/u/1/folders/1ZAPvLQwv2y4Q98UDVKXFT4fvGF5yhD_o"
@@ -14,7 +20,20 @@ def get_dataset_download_url(cfg):
return "https://drive.google.com/drive/u/1/folders/1wyJi1Zbnd6JNy4WGszHBH40A0bbl-vkd"
elif env == "avoid" and cfg.mode == "d58_r12": # M3
return "https://drive.google.com/drive/u/1/folders/1mNXCIPnCO_FDBlEj95InA9eWJM2XcEEj"
- # Robomimic
+ # Robomimic-PH
+ elif (
+ env == "can"
+ and "ph" in cfg.train_dataset_path
+ and "img" not in cfg.train_dataset_path
+ ):
+ return "https://drive.google.com/drive/folders/1rpVsdpqWPygL89E-t4SLQmZgwQ3mpNnY?usp=drive_link"
+ elif (
+ env == "square"
+ and "ph" in cfg.train_dataset_path
+ and "img" not in cfg.train_dataset_path
+ ):
+ return "https://drive.google.com/drive/folders/1wqqjT9JZ9LX11l2Sz_vGxfcT3BfcNrGk?usp=drive_link"
+ # Robomimic-MH
elif env == "lift" and "img" not in cfg.train_dataset_path: # state
return "https://drive.google.com/drive/u/1/folders/1lbXgMKBTAiFdJqPZqWXpwjEyrVW16MBu"
elif env == "lift" and "img" in cfg.train_dataset_path: # img
@@ -58,6 +77,12 @@ def get_normalization_download_url(cfg):
return "https://drive.google.com/file/d/1NSX7t3DFKaBj5HNpv91Oo5h6oXTk0zoo/view?usp=drive_link"
elif env == "halfcheetah-medium-v2":
return "https://drive.google.com/file/d/1LlwCMfy1b5e8jSx99CV3lWhcrQWrI2Jm/view?usp=drive_link"
+ elif env == "kitchen-complete-v0":
+ return "https://drive.google.com/file/d/1tBATWLoP1E5s08vr5fiUZBzn8EEsjEZh/view?usp=drive_link"
+ elif env == "kitchen-partial-v0":
+ return "https://drive.google.com/file/d/1Ptt0cwQwmb5_HGNM-zggRaDKfkqqNO5e/view?usp=drive_link"
+ elif env == "kitchen-mixed-v0":
+ return "https://drive.google.com/file/d/11gj846QTYFPeV14nhcL5Z9OA5RHIGVt1/view?usp=drive_link"
# D3IL
elif env == "avoiding-m5" and cfg.mode == "d56_r12": # M1
return "https://drive.google.com/file/d/1PubKaPabbiSdWYpGmouDhYfXp4QwNHFG/view?usp=drive_link"
@@ -65,7 +90,20 @@ def get_normalization_download_url(cfg):
return "https://drive.google.com/file/d/1Hoohw8buhsLzXoqivMA6IzKS5Izlj07_/view?usp=drive_link"
elif env == "avoiding-m5" and cfg.mode == "d58_r12": # M3
return "https://drive.google.com/file/d/1qt7apV52C9Tflsc-A55J6uDMHzaFa1wN/view?usp=drive_link"
- # Robomimic
+ # Robomimic-PH
+ elif (
+ env == "can"
+ and "ph" in cfg.normalization_path
+ and "img" not in cfg.normalization_path
+ ):
+ return "https://drive.google.com/file/d/1y04FAEXgK6UlZuDiQzTumS9lz-Ufn47B/view?usp=drive_link"
+ elif (
+ env == "square"
+ and "ph" in cfg.normalization_path
+ and "img" not in cfg.normalization_path
+ ):
+ return "https://drive.google.com/file/d/1_75UM0frCZVtcROgfWsdJ0FstToZd1b5/view?usp=drive_link"
+ # Robomimic-MH
elif env == "lift" and "img" not in cfg.normalization_path: # state
return "https://drive.google.com/file/d/1d3WjwRds-7I5bBFpZuY27OT9ycb8r_QM/view?usp=drive_link"
elif env == "lift" and "img" in cfg.normalization_path: # img
@@ -120,6 +158,71 @@ def get_checkpoint_download_url(cfg):
in path
):
return "https://drive.google.com/file/d/1o9ryyeZQAsaB4ffUTCJkIaGCi0frL3G4/view?usp=drive_link"
+ # Demo-RL
+ elif (
+ "halfcheetah-medium-v2_pre_diffusion_mlp_ta1_td20/2024-09-29_02-13-10_42/checkpoint/state_1000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1Oi5JhsU45ScHdlrtn5AX8Ji7InLBVj4D/view?usp=drive_link"
+ elif (
+ "halfcheetah-medium-v2_pre_gaussian_mlp_ta1/2024-09-28_18-48-54_42/checkpoint/state_500.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/14rbYGaCxvj1PtELKVfdXNHJ1Od2G6FLw/view?usp=drive_link"
+ elif (
+ "halfcheetah-medium-v2_calql_mlp_ta1/2024-09-29_22-59-08_42/checkpoint/state_49.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1Xf758xzsAqpFwV955OVUNL6Za90XPo1K/view?usp=drive_link"
+
+ elif (
+ "kitchen-complete-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-47-42_42/checkpoint/state_8000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1YBwyNd30a4_inu2sZzNSNLJQsj8fN3ZX/view?usp=drive_link"
+ elif (
+ "kitchen-complete-v0_calql_mlp_ta1/2024-10-26_01-01-33_42/checkpoint/state_999.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1K4V59iXNQbpOvu3u5y6C9R5piMU9idYm/view?usp=drive_link"
+ elif (
+ "kitchen-complete-v0_pre_gaussian_mlp_ta1/2024-10-25_14-48-43_42/checkpoint/state_5000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1tQYgnkdhR5wnuXC4Ha_mKHuIdg6J627s/view?usp=drive_link"
+
+ elif (
+ "kitchen-partial-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-29_42/checkpoint/state_8000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1oSupKkUjCFQVWBIJV5Seh-CclWhgpopS/view?usp=drive_link"
+ elif (
+ "kitchen-partial-v0_calql_mlp_ta1/2024-10-25_21-26-51_42/checkpoint/state_980.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/17HUDp3l8mJsMIW-DRraKPhUkH44KGTbA/view?usp=drive_link"
+ elif (
+ "kitchen-partial-v0_pre_gaussian_mlp_ta1/2024-10-25_01-45-52_42/checkpoint/state_5000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1-ZmGRPi4jMS7HfqHPvWrSPxNSoTwih6q/view?usp=drive_link"
+
+ elif (
+ "kitchen-mixed-v0_pre_diffusion_mlp_ta4_td20/2024-10-20_16-48-28_42/checkpoint/state_8000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1X24Hqbn4b4xyLK_1A3D6zhSgsN7frVCG/view?usp=drive_link"
+ elif (
+ "kitchen-mixed-v0_calql_mlp_ta1/2024-10-25_21-36-13_42/checkpoint/state_999.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1AP7bbzAwwfuSLmV1HkQLfmd76MXQn2Za/view?usp=drive_link"
+ elif (
+ "kitchen-mixed-v0_pre_gaussian_mlp_ta1/2024-10-25_01-39-44_42/checkpoint/state_5000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1LEzGhMOqL3YZFXMGn1mTcOh-tm4Lh1SH/view?usp=drive_link"
+
######################################
#### D3IL
######################################
@@ -246,6 +349,32 @@ def get_checkpoint_download_url(cfg):
in path
):
return "https://drive.google.com/file/d/1xSgwGG40zdoO2DDSM79l0rMHeNmaifnq/view?usp=drive_link"
+ # demo-PH
+ elif (
+ "can_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1Ze86hw2E0jJinn3Vx683JQ10Gq5FIJad/view?usp=drive_link"
+ elif (
+ "can_pre_gaussian_mlp_ta1/2024-10-08_20-52-04_0/checkpoint/state_5000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1jP3mEOhZojWiTXCMZ0ajFRMkAAmonGxV/view?usp=drive_link"
+ elif "can_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt" in path:
+ return "https://drive.google.com/file/d/1ERaZKTXmL-vdyU8PZ2X9GjFIMVKJjA2N/view?usp=drive_link"
+ # demo-MH
+ elif (
+ "can_pre_diffusion_mlp_ta1_td20/2024-09-29_15-43-07_42/checkpoint/state_8000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1pEs1cK1x5obAtJA9pFSN1CWG79gNhH24/view?usp=drive_link"
+ elif (
+ "can_pre_gaussian_mlp_ta1/2024-09-28_13-43-59_42/checkpoint/state_5000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1Fa3yflkvYSAy6PKT646U1VAqUJ0YHqsj/view?usp=drive_link"
+ elif "can_calql_mlp_ta1/2024-10-25_22-30-16_42/checkpoint/state_999.pt" in path:
+ return "https://drive.google.com/file/d/1AA94uEaK_SzG2mTpaKqZIwNMh6omL_g0/view?usp=drive_link"
######################################
#### Robomimic-Square
######################################
@@ -286,6 +415,32 @@ def get_checkpoint_download_url(cfg):
in path
):
return "https://drive.google.com/file/d/1LczXhgeNtQfqySsfGNbbviPrlLwyh-E3/view?usp=drive_link"
+ # demo-PH
+ elif (
+ "square_pre_diffusion_mlp_ta1_td20/2024-10-14_10-54-33_0/checkpoint/state_5000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1_Jnz14ySxqbtZa9IIEWkXqy5_-EwJLBw/view?usp=drive_link"
+ elif (
+ "square_pre_gaussian_mlp_ta1/2024-10-08_20-52-42_0/checkpoint/state_5000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1ZPWKUoZ93OqqVX3ephQMkpeBZoYrceM5/view?usp=drive_link"
+ elif "square_calql_mlp_ta1/2024-10-09_11-05-07_0/checkpoint/state_999.pt" in path:
+ return "https://drive.google.com/file/d/1_7YtUwRd_U5tuOvhHogJDhkEsE-4D24V/view?usp=drive_link"
+ # demo-MH
+ elif (
+ "square_pre_diffusion_mlp_ta1_td20/2024-09-29_02-14-14_42/checkpoint/state_8000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1ks1PnUBvFVWPnpGnYL8_eIfLNeGZbv1p/view?usp=drive_link"
+ elif (
+ "square_pre_gaussian_mlp_ta1/2024-09-28_13-42-43_42/checkpoint/state_5000.pt"
+ in path
+ ):
+ return "https://drive.google.com/file/d/1uIOn8QUkGRbhZLkQ9ziOkP7yGQnpYdk7/view?usp=drive_link"
+ elif "square_calql_mlp_ta1/2024-10-25_22-44-12_42/checkpoint/state_999.pt" in path:
+ return "https://drive.google.com/file/d/1zgzG6bx6ugAEaq72z9WpXX6iewClcKTV/view?usp=drive_link"
######################################
#### Robomimic-Transport
######################################