v0.7 (#26)

* update from scratch configs * update gym pretraining configs - use fewer epochs * update robomimic pretraining configs - use fewer epochs * allow trajectory plotting in eval agent * add simple vit unet * update avoid pretraining configs - use fewer epochs * update furniture pretraining configs - use same amount of epochs as before * add robomimic diffusion unet pretraining configs * update robomimic finetuning configs - higher lr * add vit unet checkpoint urls * update pretraining and finetuning instructions as configs are updated
2024-11-20 15:47:52 -05:00 · 2024-11-20 15:47:52 -05:00 · 1d04211666
commit 1d04211666
parent d2929f65e1
158 changed files with 3350 additions and 410 deletions
--- a/agent/eval/eval_agent.py
+++ b/agent/eval/eval_agent.py
@ -57,6 +57,7 @@ class EvalAgent:
        self.horizon_steps = cfg.horizon_steps
        self.max_episode_steps = cfg.env.max_episode_steps
        self.reset_at_iteration = cfg.env.get("reset_at_iteration", True)
        self.save_full_observations = cfg.env.get("save_full_observations", False)
        self.furniture_sparse_reward = (
            cfg.env.specific.get("sparse_reward", False)
            if "specific" in cfg.env
@ -85,6 +86,10 @@ class EvalAgent:
        assert not (
            self.n_render <= 0 and self.render_video
        ), "Need to set n_render > 0 if saving video"
        self.traj_plotter = (
            hydra.utils.instantiate(cfg.plotter)
            if "plotter" in cfg else None
        )
    def run(self):
        # Intentionally a no-op here. EvalDiffusionAgent subclasses EvalAgent, so the
        # actual evaluation loop is presumably provided by subclasses — TODO confirm.
        pass
--- a/agent/eval/eval_diffusion_agent.py
+++ b/agent/eval/eval_diffusion_agent.py
@ -37,6 +37,11 @@ class EvalDiffusionAgent(EvalAgent):
        prev_obs_venv = self.reset_env_all(options_venv=options_venv)
        firsts_trajs[0] = 1
        reward_trajs = np.zeros((self.n_steps, self.n_envs))
        if self.save_full_observations:  # state-only
            obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim))
            obs_full_trajs = np.vstack(
                (obs_full_trajs, prev_obs_venv["state"][:, -1][None])
            )
        # Collect a set of trajectories from env
        for step in range(self.n_steps):
@ -62,6 +67,13 @@ class EvalDiffusionAgent(EvalAgent):
            )
            reward_trajs[step] = reward_venv
            firsts_trajs[step + 1] = terminated_venv | truncated_venv
            if self.save_full_observations:  # state-only
                obs_full_venv = np.array(
                    [info["full_obs"]["state"] for info in info_venv]
                )  # n_envs x act_steps x obs_dim
                obs_full_trajs = np.vstack(
                    (obs_full_trajs, obs_full_venv.transpose(1, 0, 2))
                )
            # update for next step
            prev_obs_venv = obs_venv
@ -108,6 +120,16 @@ class EvalDiffusionAgent(EvalAgent):
            success_rate = 0
            log.info("[WARNING] No episode completed within the iteration!")
        # Plot state trajectories (only in D3IL)
        # `obs_full_trajs` is only collected when `save_full_observations` is enabled,
        # so skip plotting (with a warning) rather than fail on an unbound local after
        # the whole evaluation rollout has already been run.
        if self.traj_plotter is not None:
            if self.save_full_observations:
                self.traj_plotter(
                    obs_full_trajs=obs_full_trajs,
                    n_render=self.n_render,
                    max_episode_steps=self.max_episode_steps,
                    render_dir=self.render_dir,
                    itr=0,
                )
            else:
                log.info(
                    "[WARNING] Plotter is configured but save_full_observations is False; skipping trajectory plot!"
                )
        # Log loss and save metrics
        time = timer()
        log.info(
--- a/cfg/d3il/eval/avoid_m1/eval_diffusion_mlp.yaml
+++ b/cfg/d3il/eval/avoid_m1/eval_diffusion_mlp.yaml
@ -0,0 +1,68 @@
 defaults:
  - _self_
 hydra:
  run:  
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/d3il-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 normalization_path: ${oc.env:DPPO_DATA_DIR}/d3il/avoid_m1/normalization.npz
 seed: 42
 device: cuda:0
 env_name: avoiding-m5
 obs_dim: 4
 action_dim: 2
 denoising_steps: 20
 cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 n_steps: 25
 render_num: 40
 plotter:
  _target_: env.plot_traj.TrajPlotter
  env_type: avoid
  normalization_path: ${normalization_path}
 env:
  n_envs: 40
  name: ${env_name}
  max_episode_steps: 100
  reset_at_iteration: True
  save_video: False
  best_reward_threshold_for_success: 2
  save_full_observations: True
  wrappers:
    d3il_lowdim:
      normalization_path: ${normalization_path}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      pass_full_observations: ${env.save_full_observations}
      reset_within_step: False
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  #
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
    time_dim: 16
    mlp_dims: [512, 512, 512]
    activation_type: ReLU
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m1/pre_diffusion_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 15000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 15000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m1/pre_gaussian_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m1/pre_gmm_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 10000
+  n_epochs: 5000
-  batch_size: 32
+  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m2/pre_diffusion_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 15000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 15000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m2/pre_gaussian_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m2/pre_gmm_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 10000
+  n_epochs: 5000
-  batch_size: 32
+  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m3/pre_diffusion_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 15000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 15000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m3/pre_gaussian_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 16
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml
+++ b/cfg/d3il/pretrain/avoid_m3/pre_gmm_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 10000
+  n_epochs: 5000
  batch_size: 32
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 5000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/finetuning.md
+++ b/cfg/finetuning.md
@ -1,5 +1,7 @@
 ## Fine-tuning experiments
 **Update, Nov 20 2024**: In v0.7 we updated the fine-tuning configs, as we find that sample efficiency can be improved with a higher actor learning rate and other hyperparameter changes. If you would like to replicate the original experimental results from the paper, please use the configs from v0.6. Otherwise, we recommend starting with the configs from v0.7 for your applications.
 ### Comparing diffusion-based RL algorithms (Sec. 5.1)
 Gym configs are under `cfg/gym/finetune/<env_name>/`, and the naming follows `ft_<alg_name>_diffusion_mlp`, e.g., `ft_awr_diffusion_mlp`. `alg_name` is one of `rwr`, `awr`, `dipo`, `idql`, `dql`, `qsm`, `ppo` (DPPO), `ppo_exact` (exact likelihood). They share the same pre-trained checkpoint in each env.
--- a/cfg/furniture/eval/lamp_low/eval_diffusion_mlp.yaml
+++ b/cfg/furniture/eval/lamp_low/eval_diffusion_mlp.yaml
@ -0,0 +1,66 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
 seed: 42
 device: cuda:0
 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
 obs_dim: 44
 action_dim: 10
 denoising_steps: 100
 cond_steps: 1
 horizon_steps: 8
 act_steps: 8
 use_ddim: True
 ddim_steps: 5
 n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
 render_num: 0
 env:
  n_envs: 1000
  name: ${env_name}
  env_type: furniture
  max_episode_steps: 1000
  best_reward_threshold_for_success: 2
  specific:
    headless: true
    furniture: lamp
    randomness: low
    normalization_path: ${normalization_path}
    obs_steps: ${cond_steps}
    act_steps: ${act_steps}
    sparse_reward: True
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
    time_dim: 32
    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
    cond_mlp_dims: [512, 64]
    use_layernorm: True # needed for larger MLP
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/furniture/eval/lamp_low/eval_diffusion_unet.yaml
+++ b/cfg/furniture/eval/lamp_low/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_unet_ta${horizon_steps}_td${denoising_steps}  # UNet config: label the run (and its log dir) as unet, not mlp
 logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
 seed: 42
 device: cuda:0
 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
 obs_dim: 44
 action_dim: 10
 denoising_steps: 100
 cond_steps: 1
 horizon_steps: 16
 act_steps: 8
 use_ddim: True
 ddim_steps: 5
 n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
 render_num: 0
 env:
  n_envs: 1000
  name: ${env_name}
  env_type: furniture
  max_episode_steps: 1000
  best_reward_threshold_for_success: 2
  specific:
    headless: true
    furniture: lamp
    randomness: low
    normalization_path: ${normalization_path}
    obs_steps: ${cond_steps}
    act_steps: ${act_steps}
    sparse_reward: True
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.unet.Unet1D
    diffusion_step_embed_dim: 16
    dim: 64
    dim_mults: [1, 2, 4]
    kernel_size: 5
    n_groups: 8
    smaller_encoder: False
    cond_predict_scale: True
    groupnorm_eps: 1e-4 # not important
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/furniture/eval/one_leg_low/eval_diffusion_mlp.yaml
+++ b/cfg/furniture/eval/one_leg_low/eval_diffusion_mlp.yaml
@ -7,7 +7,7 @@ _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/furniture-pretrain/one_leg/one_leg_low_dim_pre_diffusion_mlp_ta8_td100/2024-07-22_20-01-16/checkpoint/state_8000.pt
+base_policy_path:
 normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
 seed: 42
--- a/cfg/furniture/eval/one_leg_low/eval_diffusion_unet.yaml
+++ b/cfg/furniture/eval/one_leg_low/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_unet_ta${horizon_steps}_td${denoising_steps}  # UNet config: label the run (and its log dir) as unet, not mlp
 logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
 seed: 42
 device: cuda:0
 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
 obs_dim: 58
 action_dim: 10
 denoising_steps: 100
 cond_steps: 1
 horizon_steps: 16
 act_steps: 8
 use_ddim: True
 ddim_steps: 5
 n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
 render_num: 0
 env:
  n_envs: 1000
  name: ${env_name}
  env_type: furniture
  max_episode_steps: 700
  best_reward_threshold_for_success: 1
  specific:
    headless: true
    furniture: one_leg
    randomness: low
    normalization_path: ${normalization_path}
    obs_steps: ${cond_steps}
    act_steps: ${act_steps}
    sparse_reward: True
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.unet.Unet1D
    diffusion_step_embed_dim: 16
    dim: 64
    dim_mults: [1, 2, 4]
    kernel_size: 5
    n_groups: 8
    smaller_encoder: False
    cond_predict_scale: True
    groupnorm_eps: 1e-4 # not important
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/furniture/eval/round_table_low/eval_diffusion_mlp.yaml
+++ b/cfg/furniture/eval/round_table_low/eval_diffusion_mlp.yaml
@ -0,0 +1,66 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
 seed: 42
 device: cuda:0
 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
 obs_dim: 44
 action_dim: 10
 denoising_steps: 100
 cond_steps: 1
 horizon_steps: 8
 act_steps: 8
 use_ddim: True
 ddim_steps: 5
 n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
 render_num: 0
 env:
  n_envs: 1000
  name: ${env_name}
  env_type: furniture
  max_episode_steps: 1000
  best_reward_threshold_for_success: 2
  specific:
    headless: true
    furniture: round_table
    randomness: low
    normalization_path: ${normalization_path}
    obs_steps: ${cond_steps}
    act_steps: ${act_steps}
    sparse_reward: True
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
    time_dim: 32
    mlp_dims: [1024, 1024, 1024, 1024, 1024, 1024, 1024]
    cond_mlp_dims: [512, 64]
    use_layernorm: True # needed for larger MLP
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/furniture/eval/round_table_low/eval_diffusion_unet.yaml
+++ b/cfg/furniture/eval/round_table_low/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_unet_ta${horizon_steps}_td${denoising_steps}  # UNet config: label the run (and its log dir) as unet, not mlp
 logdir: ${oc.env:DPPO_LOG_DIR}/furniture-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 normalization_path: ${oc.env:DPPO_DATA_DIR}/furniture/${env.specific.furniture}_${env.specific.randomness}/normalization.pth
 seed: 42
 device: cuda:0
 env_name: ${env.specific.furniture}_${env.specific.randomness}_dim
 obs_dim: 44
 action_dim: 10
 denoising_steps: 100
 cond_steps: 1
 horizon_steps: 16
 act_steps: 8
 use_ddim: True
 ddim_steps: 5
 n_steps: ${eval:'round(${env.max_episode_steps} / ${act_steps})'}
 render_num: 0
 env:
  n_envs: 1000
  name: ${env_name}
  env_type: furniture
  max_episode_steps: 1000
  best_reward_threshold_for_success: 2
  specific:
    headless: true
    furniture: round_table
    randomness: low
    normalization_path: ${normalization_path}
    obs_steps: ${cond_steps}
    act_steps: ${act_steps}
    sparse_reward: True
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.unet.Unet1D
    diffusion_step_embed_dim: 16
    dim: 64
    dim_mults: [1, 2, 4]
    kernel_size: 5
    n_groups: 8
    smaller_encoder: False
    cond_predict_scale: True
    groupnorm_eps: 1e-4 # not important
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/furniture/pretrain/lamp_low/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/lamp_low/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/lamp_low/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/lamp_low/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/lamp_low/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/lamp_low/pre_gaussian_mlp.yaml
@ -30,7 +30,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/lamp_med/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/lamp_med/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/lamp_med/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/lamp_med/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/lamp_med/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/lamp_med/pre_gaussian_mlp.yaml
@ -30,7 +30,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_low/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_low/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/one_leg_low/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_low/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/one_leg_low/pre_gaussian_mlp.yaml
@ -30,7 +30,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_med/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_med/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/one_leg_med/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/one_leg_med/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/one_leg_med/pre_gaussian_mlp.yaml
@ -25,12 +25,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 10000
+  n_epochs: 3000
  batch_size: 256
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_low/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/round_table_low/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_low/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/round_table_low/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_low/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/round_table_low/pre_gaussian_mlp.yaml
@ -30,7 +30,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_med/pre_diffusion_mlp.yaml
+++ b/cfg/furniture/pretrain/round_table_med/pre_diffusion_mlp.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_med/pre_diffusion_unet.yaml
+++ b/cfg/furniture/pretrain/round_table_med/pre_diffusion_unet.yaml
@ -31,7 +31,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 8000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/furniture/pretrain/round_table_med/pre_gaussian_mlp.yaml
+++ b/cfg/furniture/pretrain/round_table_med/pre_gaussian_mlp.yaml
@ -30,7 +30,7 @@ train:
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-5
  save_model_freq: 500
--- a/cfg/gym/eval/halfcheetah-v2/eval_diffusion_mlp.yaml
+++ b/cfg/gym/eval/halfcheetah-v2/eval_diffusion_mlp.yaml
@ -17,10 +17,10 @@ obs_dim: 17
 action_dim: 6
 denoising_steps: 20
 cond_steps: 1
-horizon_steps: 1
+horizon_steps: 4
-act_steps: 1
+act_steps: 4
-n_steps: 1000  # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
+n_steps: 250  # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
 render_num: 0
 env:
--- a/cfg/gym/eval/hopper-v2/eval_diffusion_mlp.yaml
+++ b/cfg/gym/eval/hopper-v2/eval_diffusion_mlp.yaml
@ -20,7 +20,7 @@ cond_steps: 1
 horizon_steps: 4
 act_steps: 4
-n_steps: 500  # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
+n_steps: 250  # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
 render_num: 0
 env:
--- a/cfg/gym/eval/walker2d-v2/eval_diffusion_mlp.yaml
+++ b/cfg/gym/eval/walker2d-v2/eval_diffusion_mlp.yaml
@ -0,0 +1,61 @@
 defaults:
  - _self_
 hydra:
  run:  
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/gym-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 normalization_path: ${oc.env:DPPO_DATA_DIR}/gym/${env_name}/normalization.npz
 seed: 42
 device: cuda:0
 env_name: walker2d-medium-v2
 obs_dim: 17
 action_dim: 6
 denoising_steps: 20
 cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 n_steps: 250  # each episode can take maximum (max_episode_steps / act_steps, =250 right now) steps but may finish earlier in gym. We only count episodes finished within n_steps for evaluation.
 render_num: 0
 env:
  n_envs: 40
  name: ${env_name}
  max_episode_steps: 1000
  reset_at_iteration: False
  save_video: False
  best_reward_threshold_for_success: 3  # success rate not relevant for gym tasks
  wrappers:
    mujoco_locomotion_lowdim:
      normalization_path: ${normalization_path}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  #
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
    time_dim: 16
    mlp_dims: [512, 512, 512]
    activation_type: ReLU
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/gym/pretrain/halfcheetah-medium-v2/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 3000
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 3000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 100
--- a/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/halfcheetah-medium-v2/pre_gaussian_mlp.yaml
@ -23,15 +23,14 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 500
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 100
 model:
--- a/cfg/gym/pretrain/hopper-medium-v2/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/hopper-medium-v2/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 3000
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 3000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 100
--- a/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/hopper-medium-v2/pre_gaussian_mlp.yaml
@ -23,12 +23,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 500
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 100
--- a/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-complete-v0/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 8000
+  n_epochs: 3000
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 8000
+    first_cycle_steps: 3000
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-complete-v0/pre_gaussian_mlp.yaml
@ -23,12 +23,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 5000
+  n_epochs: 3000
  batch_size: 256
  learning_rate: 1e-4
  weight_decay: 0
  lr_scheduler:
-    first_cycle_steps: 5000
+    first_cycle_steps: 3000
    warmup_steps: 100
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-mixed-v0/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 8000
+  n_epochs: 3000
  batch_size: 256
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 8000
+    first_cycle_steps: 3000
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-mixed-v0/pre_gaussian_mlp.yaml
@ -23,12 +23,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 5000
+  n_epochs: 3000
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 5000
+    first_cycle_steps: 3000
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-partial-v0/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 8000
+  n_epochs: 3000
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-5
  lr_scheduler:
-    first_cycle_steps: 8000
+    first_cycle_steps: 3000
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/kitchen-partial-v0/pre_gaussian_mlp.yaml
@ -23,12 +23,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 5000
+  n_epochs: 3000
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 5000
+    first_cycle_steps: 3000
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 500
--- a/cfg/gym/pretrain/walker2d-medium-v2/pre_diffusion_mlp.yaml
+++ b/cfg/gym/pretrain/walker2d-medium-v2/pre_diffusion_mlp.yaml
@ -24,12 +24,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 3000
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-3
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 3000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 100
--- a/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml
+++ b/cfg/gym/pretrain/walker2d-medium-v2/pre_gaussian_mlp.yaml
@ -23,12 +23,12 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_epochs: 3000
+  n_epochs: 200
  batch_size: 128
  learning_rate: 1e-4
  weight_decay: 1e-6
  lr_scheduler:
-    first_cycle_steps: 3000
+    first_cycle_steps: 200
    warmup_steps: 1
    min_lr: 1e-4
  save_model_freq: 100
--- a/cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/ppo_diffusion_mlp.yaml
@ -1,7 +1,7 @@
 defaults:
  - _self_
 hydra:
-  run:  
+  run:
    dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
@ -42,7 +42,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 1000
+  n_train_itr: 501
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -55,7 +55,7 @@ train:
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 1000
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -67,7 +67,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 10000
+  batch_size: 5000
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
@ -75,7 +75,7 @@ train:
 model:
  _target_: model.diffusion.diffusion_ppo.PPODiffusion
  # HP to tune
-  gamma_denoising: 0.99
+  gamma_denoising: 1
  clip_ploss_coef: 0.1
  clip_ploss_coef_base: 0.1
  clip_ploss_coef_rate: 3
@ -94,10 +94,10 @@ model:
    residual_style: True
  critic:
    _target_: model.common.critic.CriticObs
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  ft_denoising_steps: ${ft_denoising_steps}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
--- a/cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/halfcheetah-v2/ppo_gaussian_mlp.yaml
@ -40,7 +40,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 1000
+  n_train_itr: 501
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -65,7 +65,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 1000
+  batch_size: 500
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
--- a/cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/hopper-v2/ppo_diffusion_mlp.yaml
@ -42,7 +42,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 1000
+  n_train_itr: 301
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -67,7 +67,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 10000
+  batch_size: 5000
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
@ -75,7 +75,7 @@ train:
 model:
  _target_: model.diffusion.diffusion_ppo.PPODiffusion
  # HP to tune
-  gamma_denoising: 0.99
+  gamma_denoising: 1
  clip_ploss_coef: 0.1
  clip_ploss_coef_base: 0.1
  clip_ploss_coef_rate: 3
--- a/cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/hopper-v2/ppo_gaussian_mlp.yaml
@ -40,7 +40,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 1000
+  n_train_itr: 301
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -65,7 +65,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 1000
+  batch_size: 500
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
--- a/cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml
+++ b/cfg/gym/scratch/walker2d-v2/ppo_diffusion_mlp.yaml
@ -42,7 +42,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 1000
+  n_train_itr: 501
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -55,7 +55,7 @@ train:
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 10000
+    first_cycle_steps: 1000
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -67,7 +67,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 10000
+  batch_size: 5000
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
@ -75,7 +75,7 @@ train:
 model:
  _target_: model.diffusion.diffusion_ppo.PPODiffusion
  # HP to tune
-  gamma_denoising: 0.99
+  gamma_denoising: 1
  clip_ploss_coef: 0.1
  clip_ploss_coef_base: 0.1
  clip_ploss_coef_rate: 3
@ -94,10 +94,10 @@ model:
    residual_style: True
  critic:
    _target_: model.common.critic.CriticObs
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  ft_denoising_steps: ${ft_denoising_steps}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
--- a/cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml
+++ b/cfg/gym/scratch/walker2d-v2/ppo_gaussian_mlp.yaml
@ -40,7 +40,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 1000
+  n_train_itr: 301
  n_critic_warmup_itr: 0
  n_steps: 1000
  gamma: 0.99
@ -65,7 +65,7 @@ train:
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
-  batch_size: 1000
+  batch_size: 500
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
--- a/cfg/pretraining.md
+++ b/cfg/pretraining.md
@ -1,6 +1,6 @@
 ## Pre-training experiments
-**Update, Nov 6 2024**: we fixed the issue of EMA update being too infrequent causing slow pre-training. Now the number of epochs needed for pre-training can be much slower than those used in the configs. We recommend training with fewer epochs and testing the early checkpoints.
+**Update, Nov 20 2024**: We fixed the issue of the EMA update being too infrequent, which caused slow pre-training ([commit](https://github.com/irom-princeton/dppo/commit/e1ef4ca1cfbff85e5ae6c49f5e57debd70174616)). The number of epochs needed for pre-training can now be much lower than that used in the configs (e.g., 3000 for robomimic state and 1000 for robomimic pixel), and we have updated the pre-training configs in v0.7. If you would like to replicate the original experimental results from the paper, please use v0.6.
 ### Comparing diffusion-based RL algorithms (Sec. 5.1)
 Gym configs are under `cfg/gym/pretrain/<env_name>/`, and the config name is `pre_diffusion_mlp`. Robomimic configs are under `cfg/robomimic/pretrain/<env_name>/`, and the name is also `pre_diffusion_mlp`.
--- a/cfg/robomimic/eval/can/eval_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/eval/can/eval_diffusion_mlp_img.yaml
@ -7,7 +7,7 @@ _target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
 name: ${env_name}_eval_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_img_ta4_td100/2024-07-30_22-23-55/checkpoint/state_5000.pt
+base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
@ -28,7 +28,7 @@ n_steps: 300  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
-  n_envs: 50
+  n_envs: 20  # reduce gpu usage
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
--- a/cfg/robomimic/eval/can/eval_diffusion_unet.yaml
+++ b/cfg/robomimic/eval/can/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
 seed: 42
 device: cuda:0
 env_name: can
 obs_dim: 23
 action_dim: 7
 denoising_steps: 20
 cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 n_steps: 75  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 40
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                    'robot0_eef_quat',
                    'robot0_gripper_qpos',
                    'object'] # same order of preprocessed observations
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.unet.Unet1D
    diffusion_step_embed_dim: 16
    dim: 40
    dim_mults: [1, 2]
    kernel_size: 5
    n_groups: 8
    smaller_encoder: False
    cond_predict_scale: True
    action_dim: ${action_dim}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/can/eval_diffusion_unet_img.yaml
+++ b/cfg/robomimic/eval/can/eval_diffusion_unet_img.yaml
@ -0,0 +1,102 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
 name: ${env_name}_eval_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
 seed: 42
 device: cuda:0
 env_name: can
 obs_dim: 9
 action_dim: 7
 denoising_steps: 100
 cond_steps: 1
 img_cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 use_ddim: True
 ddim_steps: 5
 n_steps: 300  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 20  # reduce gpu usage
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
  save_video: False
  use_image_obs: True
  wrappers:
    robomimic_image:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                     'robot0_eef_quat',
                     'robot0_gripper_qpos']
      image_keys: ['robot0_eye_in_hand_image']
      shape_meta: ${shape_meta}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 shape_meta:
  obs:
    rgb:
      shape: [3, 96, 96]
    state:
      shape: [9]
  action: 
    shape: [7]
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.unet.VisionUnet1D
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image x img_cond_steps history images, concatenated
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
    augment: False
    spatial_emb: 128
    diffusion_step_embed_dim: 32
    dim: 40
    dim_mults:
    - 1
    - 2
    kernel_size: 5
    n_groups: 8
    smaller_encoder: false
    cond_predict_scale: true
    action_dim: ${action_dim}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/can/eval_gaussian_mlp.yaml
+++ b/cfg/robomimic/eval/can/eval_gaussian_mlp.yaml
@ -7,7 +7,7 @@ _target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent
 name: ${env_name}_eval_gaussian_mlp_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_gaussian_mlp_ta4/2024-06-28_13-31-00/checkpoint/state_5000.pt
+base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
--- a/cfg/robomimic/eval/can/eval_gaussian_mlp_img.yaml
+++ b/cfg/robomimic/eval/can/eval_gaussian_mlp_img.yaml
@ -7,7 +7,7 @@ _target_: agent.eval.eval_gaussian_img_agent.EvalImgGaussianAgent
 name: ${env_name}_eval_gaussian_mlp_img_ta${horizon_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_gaussian_mlp_img_ta4/2024-07-28_21-54-40/checkpoint/state_1000.pt
+base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
--- a/cfg/robomimic/eval/lift/eval_diffusion_mlp.yaml
+++ b/cfg/robomimic/eval/lift/eval_diffusion_mlp.yaml
@ -0,0 +1,65 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
 seed: 42
 device: cuda:0
 env_name: lift
 obs_dim: 19
 action_dim: 7
 denoising_steps: 20
 cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 n_steps: 300  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 50
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                    'robot0_eef_quat',
                    'robot0_gripper_qpos',
                    'object'] # same order of preprocessed observations
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
    time_dim: 16
    mlp_dims: [512, 512, 512]
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/lift/eval_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/eval/lift/eval_diffusion_mlp_img.yaml
@ -0,0 +1,97 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
 name: ${env_name}_eval_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
 seed: 42
 device: cuda:0
 env_name: lift
 obs_dim: 9
 action_dim: 7
 denoising_steps: 100
 cond_steps: 1
 img_cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 use_ddim: True
 ddim_steps: 5
 n_steps: 300  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 20  # reduce gpu usage
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
  save_video: False
  use_image_obs: True
  wrappers:
    robomimic_image:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                     'robot0_eef_quat',
                     'robot0_gripper_qpos']
      image_keys: ['robot0_eye_in_hand_image']
      shape_meta: ${shape_meta}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 shape_meta:
  obs:
    rgb:
      shape: [3, 96, 96]
    state:
      shape: [9]
  action: 
    shape: [7]
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image x img_cond_steps history images, concatenated
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    augment: False
    spatial_emb: 128
    time_dim: 32
    mlp_dims: [512, 512, 512]
    residual_style: True
    img_cond_steps: ${img_cond_steps}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/lift/eval_diffusion_unet.yaml
+++ b/cfg/robomimic/eval/lift/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
 seed: 42
 device: cuda:0
 env_name: lift
 obs_dim: 19
 action_dim: 7
 denoising_steps: 20
 cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 n_steps: 75  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 40
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                    'robot0_eef_quat',
                    'robot0_gripper_qpos',
                    'object'] # same order of preprocessed observations
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.unet.Unet1D
    diffusion_step_embed_dim: 16
    dim: 40
    dim_mults: [1, 2]
    kernel_size: 5
    n_groups: 8
    smaller_encoder: False
    cond_predict_scale: True
    action_dim: ${action_dim}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/lift/eval_diffusion_unet_img.yaml
+++ b/cfg/robomimic/eval/lift/eval_diffusion_unet_img.yaml
@ -0,0 +1,100 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
 name: ${env_name}_eval_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
 seed: 42
 device: cuda:0
 env_name: lift
 obs_dim: 9
 action_dim: 7
 denoising_steps: 100
 cond_steps: 1
 img_cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 use_ddim: True
 ddim_steps: 5
 n_steps: 300  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 20  # reduce gpu usage
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
  save_video: False
  use_image_obs: True
  wrappers:
    robomimic_image:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                     'robot0_eef_quat',
                     'robot0_gripper_qpos']
      image_keys: ['robot0_eye_in_hand_image']
      shape_meta: ${shape_meta}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 shape_meta:
  obs:
    rgb:
      shape: [3, 96, 96]
    state:
      shape: [9]
  action: 
    shape: [7]
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.unet.VisionUnet1D
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image x img_cond_steps history images, concatenated
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
    augment: False
    spatial_emb: 128
    diffusion_step_embed_dim: 32
    dim: 40
    dim_mults: [1, 2]
    kernel_size: 5
    n_groups: 8
    smaller_encoder: False
    cond_predict_scale: True
    action_dim: ${action_dim}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/square/eval_diffusion_mlp.yaml
+++ b/cfg/robomimic/eval/square/eval_diffusion_mlp.yaml
@ -18,8 +18,8 @@ obs_dim: 23
 action_dim: 7
 denoising_steps: 20
 cond_steps: 1
-horizon_steps: 1
+horizon_steps: 4
-act_steps: 1
+act_steps: 4
 n_steps: 400  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
--- a/cfg/robomimic/eval/square/eval_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/eval/square/eval_diffusion_mlp_img.yaml
@ -0,0 +1,97 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
 name: ${env_name}_eval_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
 seed: 42
 device: cuda:0
 env_name: square
 obs_dim: 9
 action_dim: 7
 denoising_steps: 100
 cond_steps: 1
 img_cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 use_ddim: True
 ddim_steps: 5
 n_steps: 400  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 20  # reduce gpu usage
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
  save_video: False
  use_image_obs: True
  wrappers:
    robomimic_image:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                     'robot0_eef_quat',
                     'robot0_gripper_qpos']
      image_keys: ['agentview_image']
      shape_meta: ${shape_meta}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 shape_meta:
  obs:
    rgb:
      shape: [3, 96, 96]
    state:
      shape: [9]
  action: 
    shape: [7]
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image x img_cond_steps history images, concatenated
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    augment: False
    spatial_emb: 128
    time_dim: 32
    mlp_dims: [768, 768, 768]
    residual_style: True
    img_cond_steps: ${img_cond_steps}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/square/eval_diffusion_unet.yaml
+++ b/cfg/robomimic/eval/square/eval_diffusion_unet.yaml
@ -0,0 +1,68 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
 seed: 42
 device: cuda:0
 env_name: square
 obs_dim: 23
 action_dim: 7
 denoising_steps: 20
 cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 n_steps: 100  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 50
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                    'robot0_eef_quat',
                    'robot0_gripper_qpos',
                    'object'] # same order of preprocessed observations
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.unet.Unet1D
    diffusion_step_embed_dim: 16
    dim: 64
    dim_mults: [1, 2]
    kernel_size: 5
    n_groups: 8
    smaller_encoder: False
    cond_predict_scale: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/square/eval_diffusion_unet_img.yaml
+++ b/cfg/robomimic/eval/square/eval_diffusion_unet_img.yaml
@ -0,0 +1,102 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
 name: ${env_name}_eval_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
 seed: 42
 device: cuda:0
 env_name: square
 obs_dim: 9
 action_dim: 7
 denoising_steps: 100
 cond_steps: 1
 img_cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 use_ddim: True
 ddim_steps: 5
 n_steps: 400  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 30  # reduce gpu usage
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
  save_video: False
  use_image_obs: True
  wrappers:
    robomimic_image:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                     'robot0_eef_quat',
                     'robot0_gripper_qpos']
      image_keys: ['agentview_image']
      shape_meta: ${shape_meta}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 shape_meta:
  obs:
    rgb:
      shape: [3, 96, 96]
    state:
      shape: [9]
  action: 
    shape: [7]
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.unet.VisionUnet1D
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # 3 channels per image x img_cond_steps history images, concatenated
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
    augment: False
    spatial_emb: 128
    diffusion_step_embed_dim: 32
    dim: 64
    dim_mults:
    - 1
    - 2
    kernel_size: 5
    n_groups: 8
    smaller_encoder: false
    cond_predict_scale: true
    action_dim: ${action_dim}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/transport/eval_diffusion_mlp.yaml
+++ b/cfg/robomimic/eval/transport/eval_diffusion_mlp.yaml
@ -3,9 +3,9 @@ defaults:
 hydra:
  run:
    dir: ${logdir}
-_target_: agent.eval.eval_gaussian_agent.EvalGaussianAgent
+_target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
-name: ${env_name}_eval_gaussian_mlp_ta${horizon_steps}
+name: ${env_name}_eval_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
@ -13,12 +13,13 @@ normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.
 seed: 42
 device: cuda:0
-env_name: square
+env_name: transport
-obs_dim: 23
+obs_dim: 59
-action_dim: 7
+action_dim: 14
 denoising_steps: 20
 cond_steps: 1
-horizon_steps: 1
+horizon_steps: 8
-act_steps: 1
+act_steps: 8
 n_steps: 400  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
@ -27,7 +28,7 @@ env:
  n_envs: 50
  name: ${env_name}
  best_reward_threshold_for_success: 1
-  max_episode_steps: 400
+  max_episode_steps: 800
  save_video: False
  wrappers:
    robomimic_lowdim:
@ -35,6 +36,9 @@ env:
      low_dim_keys: ['robot0_eef_pos',
                    'robot0_eef_quat',
                    'robot0_gripper_qpos',
                    "robot1_eef_pos",
                    "robot1_eef_quat",
                    "robot1_gripper_qpos",
                    'object'] # same order as the preprocessed observations
    multi_step:
      n_obs_steps: ${cond_steps}
@ -42,19 +46,24 @@ env:
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 model:
-  _target_: model.common.gaussian.GaussianModel
+  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  network_path: ${base_policy_path}
  network:
-    _target_: model.common.mlp_gaussian.Gaussian_MLP
+    _target_: model.diffusion.mlp_diffusion.DiffusionMLP
    time_dim: 32
    mlp_dims: [1024, 1024, 1024]
-    activation_type: ReLU
+    residual_style: True
    use_layernorm: true
    fixed_std: 0.1
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
-    
+    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/transport/eval_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/eval/transport/eval_diffusion_mlp_img.yaml
@ -0,0 +1,102 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
 name: ${env_name}_eval_diffusion_mlp_img_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
 seed: 42
 device: cuda:0
 env_name: transport
 obs_dim: 18
 action_dim: 14
 denoising_steps: 100
 cond_steps: 1
 img_cond_steps: 1
 horizon_steps: 8
 act_steps: 8
 use_ddim: True
 ddim_steps: 5
 n_steps: 200  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 30  # reduce gpu usage
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 800
  save_video: False
  use_image_obs: True
  wrappers:
    robomimic_image:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                     'robot0_eef_quat',
                     'robot0_gripper_qpos',
                     "robot1_eef_pos",
                     "robot1_eef_quat",
                     "robot1_gripper_qpos"]
      image_keys: ['shouldercamera0_image', 
                   'shouldercamera1_image']
      shape_meta: ${shape_meta}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 shape_meta:
  obs:
    rgb:
      shape: [6, 96, 96]
    state:
      shape: [18]
  action: 
    shape: [14]
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.mlp_diffusion.VisionDiffusionMLP
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # image history is concatenated along the channel axis (3 channels per frame)
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    augment: False
    num_img: 2
    spatial_emb: 128
    time_dim: 32
    mlp_dims: [768, 768, 768]
    residual_style: True
    img_cond_steps: ${img_cond_steps}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    horizon_steps: ${horizon_steps}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/transport/eval_diffusion_unet.yaml
+++ b/cfg/robomimic/eval/transport/eval_diffusion_unet.yaml
@ -0,0 +1,71 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_agent.EvalDiffusionAgent
 name: ${env_name}_eval_diffusion_unet_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
 seed: 42
 device: cuda:0
 env_name: transport
 obs_dim: 59
 action_dim: 14
 denoising_steps: 20
 cond_steps: 1
 horizon_steps: 16
 act_steps: 8
 n_steps: 100  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 50
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 800
  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                    'robot0_eef_quat',
                    'robot0_gripper_qpos',
                    "robot1_eef_pos",
                    "robot1_eef_quat",
                    "robot1_gripper_qpos",
                    'object'] # same order as the preprocessed observations
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.unet.Unet1D
    diffusion_step_embed_dim: 16
    dim: 64
    dim_mults: [1, 2]
    kernel_size: 5
    n_groups: 8
    smaller_encoder: False
    cond_predict_scale: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    action_dim: ${action_dim}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/eval/transport/eval_diffusion_unet_img.yaml
+++ b/cfg/robomimic/eval/transport/eval_diffusion_unet_img.yaml
@ -0,0 +1,107 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.eval.eval_diffusion_img_agent.EvalImgDiffusionAgent
 name: ${env_name}_eval_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-eval/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path:
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
 seed: 42
 device: cuda:0
 env_name: transport
 obs_dim: 18
 action_dim: 14
 denoising_steps: 100
 cond_steps: 1
 img_cond_steps: 1
 horizon_steps: 16
 act_steps: 8
 use_ddim: True
 ddim_steps: 5
 n_steps: 400  # each episode takes max_episode_steps / act_steps steps
 render_num: 0
 env:
  n_envs: 30  # reduce gpu usage
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 800
  save_video: False
  use_image_obs: True
  wrappers:
    robomimic_image:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                     'robot0_eef_quat',
                     'robot0_gripper_qpos',
                     "robot1_eef_pos",
                     "robot1_eef_quat",
                     "robot1_gripper_qpos"]
      image_keys: ['shouldercamera0_image', 
                   'shouldercamera1_image']
      shape_meta: ${shape_meta}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 shape_meta:
  obs:
    rgb:
      shape: [6, 96, 96]
    state:
      shape: [18]
  action: 
    shape: [14]
 model:
  _target_: model.diffusion.diffusion.DiffusionModel
  predict_epsilon: True
  denoised_clip_value: 1.0
  randn_clip_value: 3
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ddim_steps}
  network_path: ${base_policy_path}
  network:
    _target_: model.diffusion.unet.VisionUnet1D
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # image history is concatenated along the channel axis (3 channels per frame)
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
    augment: False
    num_img: 2
    spatial_emb: 128
    diffusion_step_embed_dim: 32
    dim: 64
    dim_mults:
    - 1
    - 2
    kernel_size: 5
    n_groups: 8
    smaller_encoder: false
    cond_predict_scale: true
    action_dim: ${action_dim}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp.yaml
@ -7,7 +7,8 @@ _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
 name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt  # use 8000 for comparing policy parameterizations
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt  # use 5000 for comparing diffusion rl algorithms
 # base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_8000.pt  # use 8000 for comparing policy parameterizations
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
@ -54,13 +55,13 @@ train:
  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_img.yaml
@ -66,16 +66,16 @@ train:
  gamma: 0.999
  augment: True
  grad_accumulate: 15
-  actor_lr: 1e-4
+  actor_lr: 5e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-4
+    min_lr: 5e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet.yaml
@ -27,7 +27,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -47,20 +47,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 300
+  n_train_itr: 151
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet_img.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_unet_img.yaml
@ -0,0 +1,173 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
 name: ${env_name}_ft_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_unet_img_ta4_td100/2024-11-15_17-34-05_42/checkpoint/state_500.pt
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
 seed: 42
 device: cuda:0
 env_name: can
 obs_dim: 9
 action_dim: 7
 denoising_steps: 100
 ft_denoising_steps: 5
 cond_steps: 1
 img_cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 use_ddim: True
 env:
  n_envs: 50
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
  save_video: False
  use_image_obs: True
  wrappers:
    robomimic_image:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                     'robot0_eef_quat',
                     'robot0_gripper_qpos']
      image_keys: ['robot0_eye_in_hand_image']
      shape_meta: ${shape_meta}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 shape_meta:
  obs:
    rgb:
      shape: [3, 96, 96]
    state:
      shape: [9]
  action: 
    shape: [7]
 wandb:
  entity: ${oc.env:DPPO_WANDB_ENTITY}
  project: robomimic-${env_name}-finetune
  run: ${now:%H-%M-%S}_${name}
 train:
  n_train_itr: 151
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
  augment: True
  grad_accumulate: 15
  actor_lr: 5e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 5e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
  val_freq: 10
  render:
    freq: 1
    num: 0
  # PPO specific
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
  batch_size: 500
  logprob_batch_size: 500
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
 model:
  _target_: model.diffusion.diffusion_ppo.PPODiffusion
  # HP to tune
  gamma_denoising: 0.99
  clip_ploss_coef: 0.01
  clip_ploss_coef_base: 0.001
  clip_ploss_coef_rate: 3
  randn_clip_value: 3
  min_sampling_denoising_std: 0.1
  min_logprob_denoising_std: 0.1
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ft_denoising_steps}
  learn_eta: False
  eta:
    base_eta: 1
    input_dim: ${obs_dim}
    mlp_dims: [256, 256]
    action_dim: ${action_dim}
    min_eta: 0.1
    max_eta: 1.0
    _target_: model.diffusion.eta.EtaFixed
  network_path: ${base_policy_path}
  actor:
    _target_: model.diffusion.unet.VisionUnet1D
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # image history is concatenated along the channel axis (3 channels per frame)
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
    augment: False
    spatial_emb: 128
    diffusion_step_embed_dim: 32
    dim: 40
    dim_mults: [1, 2]
    kernel_size: 5
    n_groups: 8
    smaller_encoder: False
    cond_predict_scale: True
    action_dim: ${action_dim}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  critic:
    _target_: model.common.critic.ViTCritic
    spatial_emb: 128
    augment: False
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # image history is concatenated along the channel axis (3 channels per frame)
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  ft_denoising_steps: ${ft_denoising_steps}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp.yaml
@ -45,20 +45,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 300
+  n_train_itr: 151
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp_img.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_mlp_img.yaml
@ -1,7 +1,7 @@
 defaults:
  - _self_
 hydra:
-  run:  
+  run:
    dir: ${logdir}
 _target_: agent.finetune.train_ppo_gaussian_img_agent.TrainPPOImgGaussianAgent
@ -57,22 +57,22 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 200
+  n_train_itr: 151
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
  augment: True
  grad_accumulate: 5
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -140,9 +140,9 @@ model:
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/can/ft_ppo_gaussian_transformer.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_gaussian_transformer.yaml
@ -45,20 +45,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 300
+  n_train_itr: 151
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/can/ft_ppo_gmm_mlp.yaml
+++ b/cfg/robomimic/finetune/can/ft_ppo_gmm_mlp.yaml
@ -46,20 +46,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 300
+  n_train_itr: 151
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp.yaml
@ -1,13 +1,14 @@
 defaults:
  - _self_
 hydra:
-  run:  
+  run:
    dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
 name: ${env_name}_ft_diffusion_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
-base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt # use 8000 for comparing policy parameterizations
+base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_5000.pt # use 5000 for comparing diffusion rl algorithms
 # base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_mlp_ta4_td20/2024-06-28_14-47-58/checkpoint/state_8000.pt # use 8000 for comparing policy parameterizations
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz
@ -54,13 +55,13 @@ train:
  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_mlp_img.yaml
@ -60,22 +60,22 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 151
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
  augment: True
  grad_accumulate: 15
-  actor_lr: 1e-4
+  actor_lr: 5e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-4
+    min_lr: 5e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet.yaml
@ -27,7 +27,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -47,20 +47,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 300
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -102,10 +102,10 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  ft_denoising_steps: ${ft_denoising_steps}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
--- a/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet_img.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_diffusion_unet_img.yaml
@ -0,0 +1,173 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
 name: ${env_name}_ft_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/lift/lift_pre_diffusion_unet_img_ta4_td100/2024-11-15_17-35-19_42/checkpoint/state_500.pt
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
 seed: 42
 device: cuda:0
 env_name: lift
 obs_dim: 9
 action_dim: 7
 denoising_steps: 100
 ft_denoising_steps: 5
 cond_steps: 1
 img_cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 use_ddim: True
 env:
  n_envs: 50
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
  save_video: False
  use_image_obs: True
  wrappers:
    robomimic_image:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                     'robot0_eef_quat',
                     'robot0_gripper_qpos']
      image_keys: ['robot0_eye_in_hand_image']
      shape_meta: ${shape_meta}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 shape_meta:
  obs:
    rgb:
      shape: [3, 96, 96]
    state:
      shape: [9]
  action: 
    shape: [7]
 wandb:
  entity: ${oc.env:DPPO_WANDB_ENTITY}
  project: robomimic-${env_name}-finetune
  run: ${now:%H-%M-%S}_${name}
 train:
  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
  augment: True
  grad_accumulate: 15
  actor_lr: 5e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 5e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
  val_freq: 10
  render:
    freq: 1
    num: 0
  # PPO specific
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
  batch_size: 500
  logprob_batch_size: 500
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
 model:
  _target_: model.diffusion.diffusion_ppo.PPODiffusion
  # HP to tune
  gamma_denoising: 0.99
  clip_ploss_coef: 0.01
  clip_ploss_coef_base: 0.001
  clip_ploss_coef_rate: 3
  randn_clip_value: 3
  min_sampling_denoising_std: 0.1
  min_logprob_denoising_std: 0.1
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ft_denoising_steps}
  learn_eta: False
  eta:
    base_eta: 1
    input_dim: ${obs_dim}
    mlp_dims: [256, 256]
    action_dim: ${action_dim}
    min_eta: 0.1
    max_eta: 1.0
    _target_: model.diffusion.eta.EtaFixed
  network_path: ${base_policy_path}
  actor:
    _target_: model.diffusion.unet.VisionUnet1D
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # image history is concatenated along the channel axis (3 channels per frame)
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
    augment: False
    spatial_emb: 128
    diffusion_step_embed_dim: 32
    dim: 40
    dim_mults: [1, 2]
    kernel_size: 5
    n_groups: 8
    smaller_encoder: False
    cond_predict_scale: True
    action_dim: ${action_dim}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  critic:
    _target_: model.common.critic.ViTCritic
    spatial_emb: 128
    augment: False
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # image history is concatenated along the channel axis (3 channels per frame)
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  ft_denoising_steps: ${ft_denoising_steps}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp.yaml
@ -25,7 +25,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -45,20 +45,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 300
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -93,9 +93,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp_img.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_mlp_img.yaml
@ -1,7 +1,7 @@
 defaults:
  - _self_
 hydra:
-  run:  
+  run:
    dir: ${logdir}
 _target_: agent.finetune.train_ppo_gaussian_img_agent.TrainPPOImgGaussianAgent
@ -57,22 +57,22 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 200
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
  augment: True
  grad_accumulate: 5
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 200
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -140,9 +140,9 @@ model:
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/lift/ft_ppo_gaussian_transformer.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gaussian_transformer.yaml
@ -25,7 +25,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -45,20 +45,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 300
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -94,9 +94,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/lift/ft_ppo_gmm_mlp.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gmm_mlp.yaml
@ -26,7 +26,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -46,20 +46,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 300
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -94,9 +94,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/lift/ft_ppo_gmm_transformer.yaml
+++ b/cfg/robomimic/finetune/lift/ft_ppo_gmm_transformer.yaml
@ -26,7 +26,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 300
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -46,20 +46,20 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 300
+  n_train_itr: 81
  n_critic_warmup_itr: 2
  n_steps: 300
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -95,9 +95,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp.yaml
@ -1,7 +1,7 @@
 defaults:
  - _self_
 hydra:
-  run:  
+  run:
    dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
@ -27,7 +27,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -54,14 +54,14 @@ train:
  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
-    warmup_steps: 10
+    warmup_steps: 0
    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
-    warmup_steps: 10
+    warmup_steps: 0
    min_lr: 1e-3
  save_model_freq: 100
  val_freq: 10
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_mlp_img.yaml
@ -69,13 +69,13 @@ train:
  actor_lr: 1e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet.yaml
@ -1,7 +1,7 @@
 defaults:
  - _self_
 hydra:
-  run:  
+  run:
    dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_agent.TrainPPODiffusionAgent
@ -27,7 +27,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -47,21 +47,21 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 1000
+  n_train_itr: 201
  n_critic_warmup_itr: 2
  n_steps: 400
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 2e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
-    warmup_steps: 10
+    warmup_steps: 0
-    min_lr: 1e-5
+    min_lr: 2e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
-    warmup_steps: 10
+    warmup_steps: 0
    min_lr: 1e-3
  save_model_freq: 100
  val_freq: 10
@ -102,10 +102,10 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  ft_denoising_steps: ${ft_denoising_steps}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
--- a/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet_img.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_diffusion_unet_img.yaml
@ -0,0 +1,173 @@
 defaults:
  - _self_
 hydra:
  run:
    dir: ${logdir}
 _target_: agent.finetune.train_ppo_diffusion_img_agent.TrainPPOImgDiffusionAgent
 name: ${env_name}_ft_diffusion_unet_img_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps}
 logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed}
 base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/square/square_pre_diffusion_unet_img_ta4_td100/2024-11-15_17-36-37_42/checkpoint/state_500.pt
 robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}-img.json
 normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-img/normalization.npz
 seed: 42
 device: cuda:0
 env_name: square
 obs_dim: 9
 action_dim: 7
 denoising_steps: 100
 ft_denoising_steps: 5
 cond_steps: 1
 img_cond_steps: 1
 horizon_steps: 4
 act_steps: 4
 use_ddim: True
 env:
  n_envs: 50
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
  save_video: False
  use_image_obs: True
  wrappers:
    robomimic_image:
      normalization_path: ${normalization_path}
      low_dim_keys: ['robot0_eef_pos',
                     'robot0_eef_quat',
                     'robot0_gripper_qpos']
      image_keys: ['agentview_image']
      shape_meta: ${shape_meta}
    multi_step:
      n_obs_steps: ${cond_steps}
      n_action_steps: ${act_steps}
      max_episode_steps: ${env.max_episode_steps}
      reset_within_step: True
 shape_meta:
  obs:
    rgb:
      shape: [3, 96, 96]
    state:
      shape: [9]
  action:
    shape: [7]
 wandb:
  entity: ${oc.env:DPPO_WANDB_ENTITY}
  project: robomimic-${env_name}-finetune
  run: ${now:%H-%M-%S}_${name}
 train:
  n_train_itr: 301
  n_critic_warmup_itr: 2
  n_steps: 400
  gamma: 0.999
  augment: True
  grad_accumulate: 20
  actor_lr: 1e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
  val_freq: 10
  render:
    freq: 1
    num: 0
  # PPO specific
  reward_scale_running: True
  reward_scale_const: 1.0
  gae_lambda: 0.95
  batch_size: 500
  logprob_batch_size: 1000
  update_epochs: 10
  vf_coef: 0.5
  target_kl: 1
 model:
  _target_: model.diffusion.diffusion_ppo.PPODiffusion
  # HP to tune
  gamma_denoising: 0.99
  clip_ploss_coef: 0.01
  clip_ploss_coef_base: 0.001
  clip_ploss_coef_rate: 3
  randn_clip_value: 3
  min_sampling_denoising_std: 0.1
  min_logprob_denoising_std: 0.1
  #
  use_ddim: ${use_ddim}
  ddim_steps: ${ft_denoising_steps}
  learn_eta: False
  eta:
    base_eta: 1
    input_dim: ${obs_dim}
    mlp_dims: [256, 256]
    action_dim: ${action_dim}
    min_eta: 0.1
    max_eta: 1.0
    _target_: model.diffusion.eta.EtaFixed
  network_path: ${base_policy_path}
  actor:
    _target_: model.diffusion.unet.VisionUnet1D
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # each image patch is history concatenated
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
    augment: False
    spatial_emb: 128
    diffusion_step_embed_dim: 32
    dim: 64
    dim_mults: [1, 2]
    kernel_size: 5
    n_groups: 8
    smaller_encoder: False
    cond_predict_scale: True
    action_dim: ${action_dim}
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  critic:
    _target_: model.common.critic.ViTCritic
    spatial_emb: 128
    augment: False
    backbone:
      _target_: model.common.vit.VitEncoder
      obs_shape: ${shape_meta.obs.rgb.shape}
      num_channel: ${eval:'3 * ${img_cond_steps}'} # each image patch is history concatenated
      img_h: ${shape_meta.obs.rgb.shape[1]}
      img_w: ${shape_meta.obs.rgb.shape[2]}
      cfg:
        patch_size: 8
        depth: 1
        embed_dim: 128
        num_heads: 4
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  ft_denoising_steps: ${ft_denoising_steps}
  horizon_steps: ${horizon_steps}
  obs_dim: ${obs_dim}
  action_dim: ${action_dim}
  denoising_steps: ${denoising_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp.yaml
@ -25,7 +25,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -45,21 +45,21 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 1000
+  n_train_itr: 201
  n_critic_warmup_itr: 2
  n_steps: 400
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
-    warmup_steps: 10
+    warmup_steps: 0
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
-    warmup_steps: 10
+    warmup_steps: 0
    min_lr: 1e-3
  save_model_freq: 100
  val_freq: 10
@ -93,9 +93,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp_img.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_mlp_img.yaml
@ -57,7 +57,7 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 500
+  n_train_itr: 301
  n_critic_warmup_itr: 2
  n_steps: 400
  gamma: 0.999
@ -66,13 +66,13 @@ train:
  actor_lr: 1e-5
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 500
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-5
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 500
+    first_cycle_steps: ${train.n_train_itr}
    warmup_steps: 10
    min_lr: 1e-3
  save_model_freq: 100
@ -140,9 +140,9 @@ model:
        embed_style: embed2
        embed_norm: 0
    img_cond_steps: ${img_cond_steps}
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/cfg/robomimic/finetune/square/ft_ppo_gaussian_transformer.yaml
+++ b/cfg/robomimic/finetune/square/ft_ppo_gaussian_transformer.yaml
@ -25,7 +25,7 @@ env:
  name: ${env_name}
  best_reward_threshold_for_success: 1
  max_episode_steps: 400
-  save_video: false
+  save_video: False
  wrappers:
    robomimic_lowdim:
      normalization_path: ${normalization_path}
@ -45,21 +45,21 @@ wandb:
  run: ${now:%H-%M-%S}_${name}
 train:
-  n_train_itr: 1000
+  n_train_itr: 201
  n_critic_warmup_itr: 2
  n_steps: 400
  gamma: 0.999
-  actor_lr: 1e-5
+  actor_lr: 1e-4
  actor_weight_decay: 0
  actor_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
-    warmup_steps: 10
+    warmup_steps: 0
-    min_lr: 1e-5
+    min_lr: 1e-4
  critic_lr: 1e-3
  critic_weight_decay: 0
  critic_lr_scheduler:
-    first_cycle_steps: 1000
+    first_cycle_steps: ${train.n_train_itr}
-    warmup_steps: 10
+    warmup_steps: 0
    min_lr: 1e-3
  save_model_freq: 100
  val_freq: 10
@ -94,9 +94,9 @@ model:
    action_dim: ${action_dim}
  critic:
    _target_: model.common.critic.CriticObs
-    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
    mlp_dims: [256, 256, 256]
    activation_type: Mish
    residual_style: True
+    cond_dim: ${eval:'${obs_dim} * ${cond_steps}'}
  horizon_steps: ${horizon_steps}
  device: ${device}
--- a/Show More
+++ b/Show More