v0.7 (#26)

* update from scratch configs * update gym pretraining configs - use fewer epochs * update robomimic pretraining configs - use fewer epochs * allow trajectory plotting in eval agent * add simple vit unet * update avoid pretraining configs - use fewer epochs * update furniture pretraining configs - use same amount of epochs as before * add robomimic diffusion unet pretraining configs * update robomimic finetuning configs - higher lr * add vit unet checkpoint urls * update pretraining and finetuning instructions as configs are updated
2024-11-20 15:47:52 -05:00 · 2024-11-20 15:47:52 -05:00 · 1d04211666
commit 1d04211666
parent d2929f65e1
158 changed files with 3350 additions and 410 deletions
--- a/agent/eval/eval_agent.py
+++ b/agent/eval/eval_agent.py
@ -57,6 +57,7 @@ class EvalAgent:
        self.horizon_steps = cfg.horizon_steps
        self.max_episode_steps = cfg.env.max_episode_steps
        self.reset_at_iteration = cfg.env.get("reset_at_iteration", True)
+        self.save_full_observations = cfg.env.get("save_full_observations", False)
        self.furniture_sparse_reward = (
            cfg.env.specific.get("sparse_reward", False)
            if "specific" in cfg.env
@ -85,6 +86,10 @@ class EvalAgent:
        assert not (
            self.n_render <= 0 and self.render_video
        ), "Need to set n_render > 0 if saving video"
+        self.traj_plotter = (
+            hydra.utils.instantiate(cfg.plotter)
+            if "plotter" in cfg else None
+        )

    def run(self):
        pass
--- a/agent/eval/eval_diffusion_agent.py
+++ b/agent/eval/eval_diffusion_agent.py
@ -37,6 +37,11 @@ class EvalDiffusionAgent(EvalAgent):
        prev_obs_venv = self.reset_env_all(options_venv=options_venv)
        firsts_trajs[0] = 1
        reward_trajs = np.zeros((self.n_steps, self.n_envs))
+        if self.save_full_observations:  # state-only
+            obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
+            obs_full_trajs = np.vstack(
+                (obs_full_trajs, prev_obs_venv["state"][:, -1][None])
+            )

        # Collect a set of trajectories from env
        for step in range(self.n_steps):
@ -62,6 +67,13 @@ class EvalDiffusionAgent(EvalAgent):
            )
            reward_trajs[step] = reward_venv
            firsts_trajs[step + 1] = terminated_venv | truncated_venv
+            if self.save_full_observations:  # state-only
+                obs_full_venv = np.array(
+                    [info["full_obs"]["state"] for info in info_venv]
+                )  # n_envs x act_steps x obs_dim
+                obs_full_trajs = np.vstack(
+                    (obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
+                )

            # update for next step
            prev_obs_venv = obs_venv
@ -108,6 +120,16 @@ class EvalDiffusionAgent(EvalAgent):
            success_rate = 0
            log.info("[WARNING] No episode completed within the iteration!")

+        # Plot state trajectories (only in D3IL)
+        if self.traj_plotter is not None:
+            self.traj_plotter(
+                obs_full_trajs=obs_full_trajs,
+                n_render=self.n_render,
+                max_episode_steps=self.max_episode_steps,
+                render_dir=self.render_dir,
+                itr=0,
+            )
+
        # Log loss and save metrics
        time = timer()
        log.info(
--- a/cfg/d3il/eval/avoid_m1/eval_diffusion_mlp.yaml
+++ b/cfg/d3il/eval/avoid_m1/eval_diffusion_mlp.yaml
@ -0,0 +1,68 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/d3il-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: avoiding-m5
+obs_dim: 4
+action_dim: 2
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+n_steps: 25
+render_num: 40
+
+plotter:
+  _target_: env.plot_traj.TrajPlotter
+  env_type: avoid
+  normalization_path: ${normalization_path}
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 100
+  reset_at_iteration: True
+  save_video: False
+  best_reward_threshold_for_success: 2
+  save_full_observations: True
+  wrappers:
+    d3il_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      pass_full_observations: ${env.save_full_observations}
+      reset_within_step: False
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 15000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 15000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 10000
-  batch_size: 32
+  n_epochs: 5000
+  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 15000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 15000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 10000
-  batch_size: 32
+  n_epochs: 5000
+  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 15000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 15000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 32
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/finetuning.md
+++ b/cfg/finetuning.md
@ -1,5 +1,7 @@
 ## Fine-tuning experiments

+**Update, Nov 20 2024**: In v0.7 we updated the fine-tuning configs, as we found that sample efficiency can be improved with a higher actor learning rate and other hyperparameter changes. If you would like to replicate the original experimental results from the paper, please use the configs from v0.6. Otherwise, we recommend starting with the configs from v0.7 for your applications.
+
 ### Comparing diffusion-based RL algorithms (Sec. 5.1)
 Gym configs are under `cfg/gym/finetune/<env_name>/`, and the naming follows `ft_<alg_name>_diffusion_mlp`, e.g., `ft_awr_diffusion_mlp`. `alg_name` is one of `rwr`, `awr`, `dipo`, `idql`, `dql`, `qsm`, `ppo` (DPPO), `ppo_exact` (exact likelihood). They share the same pre-trained checkpoint in each env.

--- a/cfg/furniture/eval/lamp_low/eval_diffusion_mlp.yaml
+++ b/cfg/furniture/eval/lamp_low/eval_diffusion_mlp.yaml
@ -0,0 +1,66 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+denoising_steps: 100
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+use_ddim: True
+ddim_steps: 5
+
+n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+render_num: 0
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: lamp
+    randomness: low
+    normalization_path: ${normalization_path}
+    obs_steps: ${cond_steps}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/furniture/eval/lamp_low/eval_diffusion_unet.yaml
+++ b/cfg/furniture/eval/lamp_low/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+denoising_steps: 100
+cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+use_ddim: True
+ddim_steps: 5
+
+n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+render_num: 0
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: lamp
+    randomness: low
+    normalization_path: ${normalization_path}
+    obs_steps: ${cond_steps}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    groupnorm_eps: 1e-4 # not important
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/furniture/eval/one_leg_low/eval_diffusion_mlp.yaml
+++ b/cfg/furniture/eval/one_leg_low/eval_diffusion_mlp.yaml
@ -7,7 +7,7 @@ _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent

 name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_low_dim_pre_diffusion_mlp_ta8_td100/2024-07-22_20-01-16/checkpoint/state_8000.pt
+base_policy_path:
 normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth

 seed: 42
--- a/cfg/furniture/eval/one_leg_low/eval_diffusion_unet.yaml
+++ b/cfg/furniture/eval/one_leg_low/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 58
+action_dim: 10
+denoising_steps: 100
+cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+use_ddim: True
+ddim_steps: 5
+
+n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+render_num: 0
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 700
+  best_reward_threshold_for_success: 1
+  specific:
+    headless: true
+    furniture: one_leg
+    randomness: low
+    normalization_path: ${normalization_path}
+    obs_steps: ${cond_steps}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    groupnorm_eps: 1e-4 # not important
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/furniture/eval/round_table_low/eval_diffusion_mlp.yaml
+++ b/cfg/furniture/eval/round_table_low/eval_diffusion_mlp.yaml
@ -0,0 +1,66 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+denoising_steps: 100
+cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+use_ddim: True
+ddim_steps: 5
+
+n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+render_num: 0
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: round_table
+    randomness: low
+    normalization_path: ${normalization_path}
+    obs_steps: ${cond_steps}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
+    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
+    cond_mlp_dims: [512, 64]
+    use_layernorm: True # needed for larger MLP
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/furniture/eval/round_table_low/eval_diffusion_unet.yaml
+++ b/cfg/furniture/eval/round_table_low/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
+
+seed: 42
+device: cuda:0
+env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
+obs_dim: 44
+action_dim: 10
+denoising_steps: 100
+cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+use_ddim: True
+ddim_steps: 5
+
+n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
+render_num: 0
+
+env:
+  n_envs: 1000
+  name: ${env_name}
+  env_type: furniture
+  max_episode_steps: 1000
+  best_reward_threshold_for_success: 2
+  specific:
+    headless: true
+    furniture: round_table
+    randomness: low
+    normalization_path: ${normalization_path}
+    obs_steps: ${cond_steps}
+    act_steps: ${act_steps}
+    sparse_reward: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2, 4]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    groupnorm_eps: 1e-4 # not important
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/furniture/pretrain/lamp_low/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/lamp_low/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/lamp_low/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/lamp_low/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/lamp_low/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/lamp_low/pre_gaussian_mlp.yaml
@ -30,7 +30,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/lamp_med/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/lamp_med/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/lamp_med/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/lamp_med/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/lamp_med/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/lamp_med/pre_gaussian_mlp.yaml
@ -30,7 +30,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_low/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_low/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_low/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/one_leg_low/pre_gaussian_mlp.yaml
@ -30,7 +30,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_med/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_med/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_med/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/one_leg_med/pre_gaussian_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 10000
+  n_epochs: 3000
  batch_size: 256
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_low/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/round_table_low/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_low/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/round_table_low/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_low/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/round_table_low/pre_gaussian_mlp.yaml
@ -30,7 +30,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_med/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/round_table_med/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_med/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/round_table_med/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_med/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/round_table_med/pre_gaussian_mlp.yaml
@ -30,7 +30,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/gym/eval/halfcheetah-v2/eval_diffusion_mlp.yaml
+++ b/cfg/gym/eval/halfcheetah-v2/eval_diffusion_mlp.yaml
@ -17,10 +17,10 @@ obs_dim: 17
 action_dim: 6
 denoising_steps: 20
 cond_steps: 1
-horizon_steps: 1
-act_steps: 1
+horizon_steps: 4
+act_steps: 4

-n_steps: 1000  # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
+n_steps: 250  # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
 render_num: 0

 env:
--- a/cfg/gym/eval/hopper-v2/eval_diffusion_mlp.yaml
+++ b/cfg/gym/eval/hopper-v2/eval_diffusion_mlp.yaml
@ -20,7 +20,7 @@ cond_steps: 1
 horizon_steps: 4
 act_steps: 4

-n_steps: 500  # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
+n_steps: 250  # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
 render_num: 0

 env:
--- a/cfg/gym/eval/walker2d-v2/eval_diffusion_mlp.yaml
+++ b/cfg/gym/eval/walker2d-v2/eval_diffusion_mlp.yaml
@ -0,0 +1,61 @@
+defaults:
+  - _self_
+hydra:
+  run:  
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: walker2d-medium-v2
+obs_dim: 17
+action_dim: 6
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+n_steps: 250  # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
+render_num: 0
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  max_episode_steps: 1000
+  reset_at_iteration: False
+  save_video: False
+  best_reward_threshold_for_success: 3  # success rate not relevant for gym tasks
+  wrappers:
+    mujoco_locomotion_lowdim:
+      normalization_path: ${normalization_path}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    activation_type: ReLU
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/gym/pretrain/halfcheetah-medium-v2/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 3000
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 3000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 100
--- a/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml
@ -23,15 +23,14 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 500
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
- 
  save_model_freq: 100

 model:
--- a/cfg/gym/pretrain/hopper-medium-v2/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/hopper-medium-v2/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 3000
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 3000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 100
--- a/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml
@ -23,12 +23,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 500
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 100
--- a/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 8000
+  n_epochs: 3000
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 8000
+    first_cycle_steps: 3000
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml
@ -23,12 +23,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 5000
+  n_epochs: 3000
  batch_size: 256
  learning_rate: 1e-4
  weight_decay: 0
  lr_scheduler:
-    first_cycle_steps: 5000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 8000
+  n_epochs: 3000
  batch_size: 256
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 8000
+    first_cycle_steps: 3000
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml
@ -23,12 +23,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 5000
+  n_epochs: 3000
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 5000
+    first_cycle_steps: 3000
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 8000
+  n_epochs: 3000
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-5
  lr_scheduler:
-    first_cycle_steps: 8000
+    first_cycle_steps: 3000
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml
@ -23,12 +23,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 5000
+  n_epochs: 3000
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 5000
+    first_cycle_steps: 3000
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/walker2d-medium-v2/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/walker2d-medium-v2/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 3000
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 3000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 100
--- a/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml
@ -23,12 +23,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_epochs: 3000
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 3000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 100
--- a/cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml
@ -42,7 +42,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 1000
+  n_train_itr: 501
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -55,7 +55,7 @@ train:
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 1000
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -67,7 +67,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 10000
+  batch_size: 5000
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
@ -75,7 +75,7 @@ train:
 model:
  _target_: model.diffusion.diffusion_ppo.PPODiffusion
  # HP to tune
-  gamma_denoising: 0.99
+  gamma_denoising: 1
  clip_ploss_coef: 0.1
  clip_ploss_coef_base: 0.1
  clip_ploss_coef_rate: 3
@ -94,10 +94,10 @@ model:
    residual_style: True
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  ft_denoising_steps: ${ft_denoising_steps}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
--- a/cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml
@ -40,7 +40,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 1000
+  n_train_itr: 501
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -65,7 +65,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 1000
+  batch_size: 500
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
--- a/cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml
@ -42,7 +42,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 1000
+  n_train_itr: 301
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -67,7 +67,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 10000
+  batch_size: 5000
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
@ -75,7 +75,7 @@ train:
 model:
  _target_: model.diffusion.diffusion_ppo.PPODiffusion
  # HP to tune
-  gamma_denoising: 0.99
+  gamma_denoising: 1
  clip_ploss_coef: 0.1
  clip_ploss_coef_base: 0.1
  clip_ploss_coef_rate: 3
--- a/cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml
@ -40,7 +40,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 1000
+  n_train_itr: 301
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -65,7 +65,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 1000
+  batch_size: 500
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
--- a/cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml
@ -42,7 +42,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 1000
+  n_train_itr: 501
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -55,7 +55,7 @@ train:
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 1000
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -67,7 +67,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 10000
+  batch_size: 5000
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
@ -75,7 +75,7 @@ train:
 model:
  _target_: model.diffusion.diffusion_ppo.PPODiffusion
  # HP to tune
-  gamma_denoising: 0.99
+  gamma_denoising: 1
  clip_ploss_coef: 0.1
  clip_ploss_coef_base: 0.1
  clip_ploss_coef_rate: 3
@ -94,10 +94,10 @@ model:
    residual_style: True
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  ft_denoising_steps: ${ft_denoising_steps}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
--- a/cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml
@ -40,7 +40,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 1000
+  n_train_itr: 301
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -65,7 +65,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 1000
+  batch_size: 500
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
--- a/cfg/pretraining.md
+++ b/cfg/pretraining.md
@ -1,6 +1,6 @@
 ## Pre-training experiments

-**Update, Nov 6 2024**: we fixed the issue of EMA update being too infrequent causing slow pre-training. Now the number of epochs needed for pre-training can be much slower than those used in the configs. We recommend training with fewer epochs and testing the early checkpoints.
+**Update, Nov 20 2024**: We fixed an issue where the EMA update was too infrequent, which caused slow pre-training ([commit](https://github.com/irom-princeton/dppo/commit/e1ef4ca1cfbff85e5ae6c49f5e57debd70174616)). Now the number of epochs needed for pre-training can be much lower than that used in the configs (e.g., 3000 for robomimic state and 1000 for robomimic pixel), and we have updated the pre-training configs in v0.7. If you would like to replicate the original experimental results from the paper, please use v0.6.

 ### Comparing diffusion-based RL algorithms (Sec. 5.1)
 Gym configs are under `cfg/gym/pretrain/<env_name>/`, and the config name is `pre_diffusion_mlp`. Robomimic configs are under `cfg/robomimic/pretrain/<env_name>/`, and the name is also `pre_diffusion_mlp`.
--- a/cfg/robomimic/eval/can/eval_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/eval/can/eval_diffusion_mlp_img.yaml
@ -7,7 +7,7 @@ _target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent

 name: ${env_name}_eval_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_img_ta4_td100/2024-07-30_22-23-55/checkpoint/state_5000.pt
+base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz

@ -28,7 +28,7 @@ n_steps: 300  # each episode takes max_episode_steps / act_steps steps
 render_num: 0

 env:
-  n_envs: 50
+  n_envs: 20  # reduce gpu usage
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
--- a/cfg/robomimic/eval/can/eval_diffusion_unet.yaml
+++ b/cfg/robomimic/eval/can/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+n_steps: 75  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order as the preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 40
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/eval/can/eval_diffusion_unet_img.yaml
+++ b/cfg/robomimic/eval/can/eval_diffusion_unet_img.yaml
@ -0,0 +1,102 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
+
+name: ${env_name}_eval_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 9
+action_dim: 7
+denoising_steps: 100
+cond_steps: 1
+img_cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+use_ddim: True
+ddim_steps: 5
+
+n_steps: 300  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 20  # reduce gpu usage
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['robot0_eye_in_hand_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.unet.VisionUnet1D
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # each image patch is history concatenated
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    img_cond_steps: ${img_cond_steps}
+    augment: False
+    spatial_emb: 128
+    diffusion_step_embed_dim: 32
+    dim: 40
+    dim_mults:
+    - 1
+    - 2
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: false
+    cond_predict_scale: true
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/eval/can/eval_gaussian_mlp.yaml
+++ b/cfg/robomimic/eval/can/eval_gaussian_mlp.yaml
@ -7,7 +7,7 @@ _target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent

 name: ${env_name}_eval_gaussian_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_gaussian_mlp_ta4/2024-06-28_13-31-00/checkpoint/state_5000.pt
+base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz

--- a/cfg/robomimic/eval/can/eval_gaussian_mlp_img.yaml
+++ b/cfg/robomimic/eval/can/eval_gaussian_mlp_img.yaml
@ -7,7 +7,7 @@ _target_: agent.eval.eval_gaussian_img_agent.EvalImgGaussianAgent

 name: ${env_name}_eval_gaussian_mlp_img_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_gaussian_mlp_img_ta4/2024-07-28_21-54-40/checkpoint/state_1000.pt
+base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz

--- a/cfg/robomimic/eval/lift/eval_diffusion_mlp.yaml
+++ b/cfg/robomimic/eval/lift/eval_diffusion_mlp.yaml
@ -0,0 +1,65 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+n_steps: 300  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order as the preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 16
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/eval/lift/eval_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/eval/lift/eval_diffusion_mlp_img.yaml
@ -0,0 +1,97 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 9
+action_dim: 7
+denoising_steps: 100
+cond_steps: 1
+img_cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+use_ddim: True
+ddim_steps: 5
+
+n_steps: 300  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 20  # reduce gpu usage
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['robot0_eye_in_hand_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # each image patch is history concatenated
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: False
+    spatial_emb: 128
+    time_dim: 32
+    mlp_dims: [512, 512, 512]
+    residual_style: True
+    img_cond_steps: ${img_cond_steps}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/eval/lift/eval_diffusion_unet.yaml
+++ b/cfg/robomimic/eval/lift/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 19
+action_dim: 7
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+n_steps: 75  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 40
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order as the preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 40
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/eval/lift/eval_diffusion_unet_img.yaml
+++ b/cfg/robomimic/eval/lift/eval_diffusion_unet_img.yaml
@ -0,0 +1,100 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
+
+name: ${env_name}_eval_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 9
+action_dim: 7
+denoising_steps: 100
+cond_steps: 1
+img_cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+use_ddim: True
+ddim_steps: 5
+
+n_steps: 300  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 20  # reduce gpu usage
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['robot0_eye_in_hand_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.unet.VisionUnet1D
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # each image patch is history concatenated
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    img_cond_steps: ${img_cond_steps}
+    augment: False
+    spatial_emb: 128
+    diffusion_step_embed_dim: 32
+    dim: 40
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/eval/square/eval_diffusion_mlp.yaml
+++ b/cfg/robomimic/eval/square/eval_diffusion_mlp.yaml
@ -18,8 +18,8 @@ obs_dim: 23
 action_dim: 7
 denoising_steps: 20
 cond_steps: 1
-horizon_steps: 1
-act_steps: 1
+horizon_steps: 4
+act_steps: 4

 n_steps: 400  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
--- a/cfg/robomimic/eval/square/eval_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/eval/square/eval_diffusion_mlp_img.yaml
@ -0,0 +1,97 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 9
+action_dim: 7
+denoising_steps: 100
+cond_steps: 1
+img_cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+use_ddim: True
+ddim_steps: 5
+
+n_steps: 400  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 20  # reduce gpu usage
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['agentview_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # each image patch is history concatenated
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: False
+    spatial_emb: 128
+    time_dim: 32
+    mlp_dims: [768, 768, 768]
+    residual_style: True
+    img_cond_steps: ${img_cond_steps}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/eval/square/eval_diffusion_unet.yaml
+++ b/cfg/robomimic/eval/square/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 23
+action_dim: 7
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+
+n_steps: 100  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    'object'] # same order as the preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/eval/square/eval_diffusion_unet_img.yaml
+++ b/cfg/robomimic/eval/square/eval_diffusion_unet_img.yaml
@ -0,0 +1,102 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
+
+name: ${env_name}_eval_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 9
+action_dim: 7
+denoising_steps: 100
+cond_steps: 1
+img_cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+use_ddim: True
+ddim_steps: 5
+
+n_steps: 400  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 30  # reduce gpu usage
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['agentview_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.unet.VisionUnet1D
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image, times the number of history frames concatenated together
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    img_cond_steps: ${img_cond_steps}
+    augment: False
+    spatial_emb: 128
+    diffusion_step_embed_dim: 32
+    dim: 64
+    dim_mults:
+    - 1
+    - 2
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: false
+    cond_predict_scale: true
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/eval/transport/eval_diffusion_mlp.yaml
+++ b/cfg/robomimic/eval/transport/eval_diffusion_mlp.yaml
@ -3,9 +3,9 @@ defaults:
 hydra:
  run:
    dir: ${logdir}
-_target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent

-name: ${env_name}_eval_gaussian_mlp_ta${horizon_steps}
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
@ -13,12 +13,13 @@ normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.

 seed: 42
 device: cuda:0
-env_name: square
-obs_dim: 23
-action_dim: 7
+env_name: transport
+obs_dim: 59
+action_dim: 14
+denoising_steps: 20
 cond_steps: 1
-horizon_steps: 1
-act_steps: 1
+horizon_steps: 8
+act_steps: 8

 n_steps: 400  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
@ -27,7 +28,7 @@ env:
  n_envs: 50
  name: ${env_name}
  best_reward_threshold_for_success: 1
-  max_episode_steps: 400
+  max_episode_steps: 800
  save_video: False
  wrappers:
    robomimic_lowdim:
@ -35,6 +36,9 @@ env:
      low_dim_keys: ['robot0_eef_pos',
                    'robot0_eef_quat',
                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
                    'object'] # same order of preprocessed observations
    multi_step:
      n_obs_steps: ${cond_steps}
@ -42,19 +46,24 @@ env:
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True

+
 model:
-  _target_: model.common.gaussian.GaussianModel
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  network_path: ${base_policy_path}
  network:
-    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
+    time_dim: 32
    mlp_dims: [1024, 1024, 1024]
-    activation_type: ReLU
-    use_layernorm: true
-    fixed_std: 0.1
+    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/transport/eval_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/eval/transport/eval_diffusion_mlp_img.yaml
@ -0,0 +1,102 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
+
+name: ${env_name}_eval_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 18
+action_dim: 14
+denoising_steps: 100
+cond_steps: 1
+img_cond_steps: 1
+horizon_steps: 8
+act_steps: 8
+use_ddim: True
+ddim_steps: 5
+
+n_steps: 200  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 30  # reduce gpu usage
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos',
+                     "robot1_eef_pos",
+                     "robot1_eef_quat",
+                     "robot1_gripper_qpos"]
+      image_keys: ['shouldercamera0_image', 
+                   'shouldercamera1_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [6, 96, 96]
+    state:
+      shape: [18]
+  action: 
+    shape: [14]
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image, times the number of history frames concatenated together
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    augment: False
+    num_img: 2
+    spatial_emb: 128
+    time_dim: 32
+    mlp_dims: [768, 768, 768]
+    residual_style: True
+    img_cond_steps: ${img_cond_steps}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    horizon_steps: ${horizon_steps}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/eval/transport/eval_diffusion_unet.yaml
+++ b/cfg/robomimic/eval/transport/eval_diffusion_unet.yaml
@ -0,0 +1,71 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
+
+name: ${env_name}_eval_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 59
+action_dim: 14
+denoising_steps: 20
+cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+
+n_steps: 100  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: False
+  wrappers:
+    robomimic_lowdim:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                    'robot0_eef_quat',
+                    'robot0_gripper_qpos',
+                    "robot1_eef_pos",
+                    "robot1_eef_quat",
+                    "robot1_gripper_qpos",
+                    'object'] # same order as the preprocessed observations
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.unet.Unet1D
+    diffusion_step_embed_dim: 16
+    dim: 64
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+    action_dim: ${action_dim}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/eval/transport/eval_diffusion_unet_img.yaml
+++ b/cfg/robomimic/eval/transport/eval_diffusion_unet_img.yaml
@ -0,0 +1,107 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
+
+name: ${env_name}_eval_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path:
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: transport
+obs_dim: 18
+action_dim: 14
+denoising_steps: 100
+cond_steps: 1
+img_cond_steps: 1
+horizon_steps: 16
+act_steps: 8
+use_ddim: True
+ddim_steps: 5
+
+n_steps: 400  # each episode takes max_episode_steps / act_steps steps
+render_num: 0
+
+env:
+  n_envs: 30  # reduce gpu usage
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 800
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos',
+                     "robot1_eef_pos",
+                     "robot1_eef_quat",
+                     "robot1_gripper_qpos"]
+      image_keys: ['shouldercamera0_image', 
+                   'shouldercamera1_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [6, 96, 96]
+    state:
+      shape: [18]
+  action: 
+    shape: [14]
+
+model:
+  _target_: model.diffusion.diffusion.DiffusionModel
+  predict_epsilon: True
+  denoised_clip_value: 1.0
+  randn_clip_value: 3
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ddim_steps}
+  network_path: ${base_policy_path}
+  network:
+    _target_: model.diffusion.unet.VisionUnet1D
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image, times the number of history frames concatenated together
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    img_cond_steps: ${img_cond_steps}
+    augment: False
+    num_img: 2
+    spatial_emb: 128
+    diffusion_step_embed_dim: 32
+    dim: 64
+    dim_mults:
+    - 1
+    - 2
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: false
+    cond_predict_scale: true
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
@ -7,7 +7,8 @@ _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent

 name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt  # use 8000 for comparing policy parameterizations
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt  # use 5000 for comparing diffusion rl algorithms
+# base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_8000.pt  # use 8000 for comparing policy parameterizations
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz

@ -54,13 +55,13 @@ train:
  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
@ -66,16 +66,16 @@ train:
  gamma: 0.999
  augment: True
  grad_accumulate: 15
-  actor_lr: 1e-4
+  actor_lr: 5e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-4
+    min_lr: 5e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet.yaml
@ -27,7 +27,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -47,20 +47,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 300
+  n_train_itr: 151
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet_img.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet_img.yaml
@ -0,0 +1,173 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_unet_img_ta4_td100/2024-11-15_17-34-05_42/checkpoint/state_500.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: can
+obs_dim: 9
+action_dim: 7
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+img_cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+use_ddim: True
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['robot0_eye_in_hand_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 151
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  augment: True
+  grad_accumulate: 15
+  actor_lr: 5e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 10
+    min_lr: 5e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 500
+  logprob_batch_size: 500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # hyperparameters to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.VisionUnet1D
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image, times the number of history frames concatenated together
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    img_cond_steps: ${img_cond_steps}
+    augment: False
+    spatial_emb: 128
+    diffusion_step_embed_dim: 32
+    dim: 40
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  critic:
+    _target_: model.common.critic.ViTCritic
+    spatial_emb: 128
+    augment: False
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image, times the number of history frames concatenated together
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    img_cond_steps: ${img_cond_steps}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  ft_denoising_steps: ${ft_denoising_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp.yaml
@ -45,20 +45,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 300
+  n_train_itr: 151
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp_img.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp_img.yaml
@ -57,22 +57,22 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 200
+  n_train_itr: 151
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
  augment: True
  grad_accumulate: 5
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -140,9 +140,9 @@ model:
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/can/ft_ppo_gaussian_transformer.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_transformer.yaml
@ -45,20 +45,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 300
+  n_train_itr: 151
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/can/ft_ppo_gmm_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_gmm_mlp.yaml
@ -46,20 +46,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 300
+  n_train_itr: 151
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
@ -7,7 +7,8 @@ _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent

 name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt # use 8000 for comparing policy parameterizations
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt # use 5000 for comparing diffusion rl algorithms
+# base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_8000.pt # use 8000 for comparing policy parameterizations
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz

@ -54,13 +55,13 @@ train:
  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
@ -60,22 +60,22 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 151
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
  augment: True
  grad_accumulate: 15
-  actor_lr: 1e-4
+  actor_lr: 5e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-4
+    min_lr: 5e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet.yaml
@ -27,7 +27,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -47,20 +47,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 300
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -102,10 +102,10 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  ft_denoising_steps: ${ft_denoising_steps}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet_img.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet_img.yaml
@ -0,0 +1,173 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_unet_img_ta4_td100/2024-11-15_17-35-19_42/checkpoint/state_500.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: lift
+obs_dim: 9
+action_dim: 7
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+img_cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+use_ddim: True
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 300
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['robot0_eye_in_hand_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 81
+  n_critic_warmup_itr: 2
+  n_steps: 300
+  gamma: 0.999
+  augment: True
+  grad_accumulate: 15
+  actor_lr: 5e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 10
+    min_lr: 5e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 500
+  logprob_batch_size: 500
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # hyperparameters to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.VisionUnet1D
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image, times the number of history frames concatenated together
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    img_cond_steps: ${img_cond_steps}
+    augment: False
+    spatial_emb: 128
+    diffusion_step_embed_dim: 32
+    dim: 40
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  critic:
+    _target_: model.common.critic.ViTCritic
+    spatial_emb: 128
+    augment: False
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image, times the number of history frames concatenated together
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    img_cond_steps: ${img_cond_steps}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  ft_denoising_steps: ${ft_denoising_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp.yaml
@ -25,7 +25,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -45,20 +45,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 300
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -93,9 +93,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp_img.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp_img.yaml
@ -57,22 +57,22 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 200
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
  augment: True
  grad_accumulate: 5
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -140,9 +140,9 @@ model:
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_transformer.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_transformer.yaml
@ -25,7 +25,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -45,20 +45,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 300
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -94,9 +94,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/lift/ft_ppo_gmm_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gmm_mlp.yaml
@ -26,7 +26,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -46,20 +46,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 300
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -94,9 +94,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/lift/ft_ppo_gmm_transformer.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gmm_transformer.yaml
@ -26,7 +26,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -46,20 +46,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 300
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -95,9 +95,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
@ -27,7 +27,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -54,14 +54,14 @@ train:
  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
-    warmup_steps: 10
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 0
    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
-    warmup_steps: 10
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 0
    min_lr: 1e-3
  save_model_freq: 100
  val_freq: 10
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
@ -69,13 +69,13 @@ train:
  actor_lr: 1e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet.yaml
@ -27,7 +27,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -47,21 +47,21 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 1000
+  n_train_itr: 201
  n_critic_warmup_itr: 2
  n_steps: 400
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 2e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
-    warmup_steps: 10
-    min_lr: 1e-5
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 0
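+    min_lr: 1e-4 # NOTE(review): larger than actor_lr (2e-5) above, whereas the other configs in this diff keep min_lr equal to actor_lr - confirm this is intended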
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
-    warmup_steps: 10
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 0
    min_lr: 1e-3
  save_model_freq: 100
  val_freq: 10
@ -102,10 +102,10 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  ft_denoising_steps: ${ft_denoising_steps}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet_img.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet_img.yaml
@ -0,0 +1,173 @@
+defaults:
+  - _self_
+hydra:
+  run:
+    dir: ${logdir}
+_target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
+
+name: ${env_name}_ft_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
+logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_diffusion_unet_img_ta4_td100/2024-11-15_17-36-37_42/checkpoint/state_500.pt
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
+normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
+
+seed: 42
+device: cuda:0
+env_name: square
+obs_dim: 9
+action_dim: 7
+denoising_steps: 100
+ft_denoising_steps: 5
+cond_steps: 1
+img_cond_steps: 1
+horizon_steps: 4
+act_steps: 4
+use_ddim: True
+
+env:
+  n_envs: 50
+  name: ${env_name}
+  best_reward_threshold_for_success: 1
+  max_episode_steps: 400
+  save_video: False
+  use_image_obs: True
+  wrappers:
+    robomimic_image:
+      normalization_path: ${normalization_path}
+      low_dim_keys: ['robot0_eef_pos',
+                     'robot0_eef_quat',
+                     'robot0_gripper_qpos']
+      image_keys: ['agentview_image']
+      shape_meta: ${shape_meta}
+    multi_step:
+      n_obs_steps: ${cond_steps}
+      n_action_steps: ${act_steps}
+      max_episode_steps: ${env.max_episode_steps}
+      reset_within_step: True
+
+shape_meta:
+  obs:
+    rgb:
+      shape: [3, 96, 96]
+    state:
+      shape: [9]
+  action: 
+    shape: [7]
+
+wandb:
+  entity: ${oc.env:DPPO_WANDB_ENTITY}
+  project: robomimic-${env_name}-finetune
+  run: ${now:%H-%M-%S}_${name}
+
+train:
+  n_train_itr: 301
+  n_critic_warmup_itr: 2
+  n_steps: 400
+  gamma: 0.999
+  augment: True
+  grad_accumulate: 20
+  actor_lr: 1e-5
+  actor_weight_decay: 0
+  actor_lr_scheduler:
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 10
+    min_lr: 1e-5
+  critic_lr: 1e-3
+  critic_weight_decay: 0
+  critic_lr_scheduler:
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 10
+    min_lr: 1e-3
+  save_model_freq: 100
+  val_freq: 10
+  render:
+    freq: 1
+    num: 0
+  # PPO specific
+  reward_scale_running: True
+  reward_scale_const: 1.0
+  gae_lambda: 0.95
+  batch_size: 500
+  logprob_batch_size: 1000
+  update_epochs: 10
+  vf_coef: 0.5
+  target_kl: 1
+
+model:
+  _target_: model.diffusion.diffusion_ppo.PPODiffusion
+  # Hyperparameters to tune
+  gamma_denoising: 0.99
+  clip_ploss_coef: 0.01
+  clip_ploss_coef_base: 0.001
+  clip_ploss_coef_rate: 3
+  randn_clip_value: 3
+  min_sampling_denoising_std: 0.1
+  min_logprob_denoising_std: 0.1
+  #
+  use_ddim: ${use_ddim}
+  ddim_steps: ${ft_denoising_steps}
+  learn_eta: False
+  eta:
+    base_eta: 1
+    input_dim: ${obs_dim}
+    mlp_dims: [256, 256]
+    action_dim: ${action_dim}
+    min_eta: 0.1
+    max_eta: 1.0
+    _target_: model.diffusion.eta.EtaFixed
+  network_path: ${base_policy_path}
+  actor:
+    _target_: model.diffusion.unet.VisionUnet1D
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per frame x img_cond_steps frames: the image history is concatenated channel-wise, so each patch spans all frames
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    img_cond_steps: ${img_cond_steps}
+    augment: False
+    spatial_emb: 128
+    diffusion_step_embed_dim: 32
+    dim: 64
+    dim_mults: [1, 2]
+    kernel_size: 5
+    n_groups: 8
+    smaller_encoder: False
+    cond_predict_scale: True
+    action_dim: ${action_dim}
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  critic:
+    _target_: model.common.critic.ViTCritic
+    spatial_emb: 128
+    augment: False
+    backbone:
+      _target_: model.common.vit.VitEncoder
+      obs_shape: ${shape_meta.obs.rgb.shape}
+      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per frame x img_cond_steps frames: the image history is concatenated channel-wise, so each patch spans all frames
+      img_h: ${shape_meta.obs.rgb.shape[1]}
+      img_w: ${shape_meta.obs.rgb.shape[2]}
+      cfg:
+        patch_size: 8
+        depth: 1
+        embed_dim: 128
+        num_heads: 4
+        embed_style: embed2
+        embed_norm: 0
+    img_cond_steps: ${img_cond_steps}
+    mlp_dims: [256, 256, 256]
+    activation_type: Mish
+    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
+  ft_denoising_steps: ${ft_denoising_steps}
+  horizon_steps: ${horizon_steps}
+  obs_dim: ${obs_dim}
+  action_dim: ${action_dim}
+  denoising_steps: ${denoising_steps}
+  device: ${device}
--- a/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp.yaml
@ -25,7 +25,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -45,21 +45,21 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 1000
+  n_train_itr: 201
  n_critic_warmup_itr: 2
  n_steps: 400
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
-    warmup_steps: 10
-    min_lr: 1e-5
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 0
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
-    warmup_steps: 10
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 0
    min_lr: 1e-3
  save_model_freq: 100
  val_freq: 10
@ -93,9 +93,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp_img.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp_img.yaml
@ -57,7 +57,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 500
+  n_train_itr: 301
  n_critic_warmup_itr: 2
  n_steps: 400
  gamma: 0.999
@ -66,13 +66,13 @@ train:
  actor_lr: 1e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 500
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 500
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -140,9 +140,9 @@ model:
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/square/ft_ppo_gaussian_transformer.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_transformer.yaml
@ -25,7 +25,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -45,21 +45,21 @@ wandb:
  run: ${now:%H-%M-%S}_${name}

 train:
-  n_train_itr: 1000
+  n_train_itr: 201
  n_critic_warmup_itr: 2
  n_steps: 400
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
-    warmup_steps: 10
-    min_lr: 1e-5
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 0
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
-    warmup_steps: 10
+    first_cycle_steps: ${train.n_train_itr}
+    warmup_steps: 0
    min_lr: 1e-3
  save_model_freq: 100
  val_freq: 10
@ -94,9 +94,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/Show More
+++ b/Show More