diff --git a/agent/finetune/train_ppo_diffusion_agent.py b/agent/finetune/train_ppo_diffusion_agent.py index 3677b91..1897933 100644 --- a/agent/finetune/train_ppo_diffusion_agent.py +++ b/agent/finetune/train_ppo_diffusion_agent.py @@ -68,7 +68,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent): last_itr_eval = eval_mode # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode - dones_trajs = np.empty((0, self.n_envs)) + dones_trajs = np.zeros((self.n_steps, self.n_envs)) firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) if self.reset_at_iteration or eval_mode or last_itr_eval: prev_obs_venv = self.reset_env_all(options_venv=options_venv) @@ -138,7 +138,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent): ) chains_trajs = np.vstack((chains_trajs, chains_venv[None])) reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - dones_trajs = np.vstack((dones_trajs, done_venv[None])) + dones_trajs[step] = done_venv firsts_trajs[step + 1] = done_venv prev_obs_venv = obs_venv @@ -252,24 +252,20 @@ class TrainPPODiffusionAgent(TrainPPOAgent): lastgaelam = 0 for t in reversed(range(self.n_steps)): if t == self.n_steps - 1: - nextnonterminal = 1.0 - done_venv nextvalues = next_value else: - nextnonterminal = 1.0 - dones_trajs[t + 1] nextvalues = values_trajs[t + 1] + nonterminal = 1.0 - dones_trajs[t] # delta = r + gamma*V(st+1) - V(st) delta = ( reward_trajs[t] * self.reward_scale_const - + self.gamma * nextvalues * nextnonterminal + + self.gamma * nextvalues * nonterminal - values_trajs[t] ) # A = delta_t + gamma*lamdba*delta_{t+1} + ... advantages_trajs[t] = lastgaelam = ( delta - + self.gamma - * self.gae_lambda - * nextnonterminal - * lastgaelam + + self.gamma * self.gae_lambda * nonterminal * lastgaelam ) returns_trajs = advantages_trajs + values_trajs diff --git a/agent/finetune/train_ppo_diffusion_img_agent.py b/agent/finetune/train_ppo_diffusion_img_agent.py index 75bb101..ed85b60 100644 --- a/agent/finetune/train_ppo_diffusion_img_agent.py +++ b/agent/finetune/train_ppo_diffusion_img_agent.py @@ -58,7 +58,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): last_itr_eval = eval_mode # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode - dones_trajs = np.empty((0, self.n_envs)) + dones_trajs = np.zeros((self.n_steps, self.n_envs)) firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) if self.reset_at_iteration or eval_mode or last_itr_eval: prev_obs_venv = self.reset_env_all(options_venv=options_venv) @@ -118,7 +118,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None])) chains_trajs = np.vstack((chains_trajs, chains_venv[None])) reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - dones_trajs = np.vstack((dones_trajs, done_venv[None])) + dones_trajs[step] = done_venv firsts_trajs[step + 1] = done_venv prev_obs_venv = obs_venv @@ -251,24 +251,20 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent): lastgaelam = 0 for t in reversed(range(self.n_steps)): if t == self.n_steps - 1: - nextnonterminal = 1.0 - done_venv nextvalues = next_value else: - nextnonterminal = 1.0 - dones_trajs[t + 1] nextvalues = values_trajs[t + 1] + nonterminal = 1.0 - dones_trajs[t] # delta = r + gamma*V(st+1) - V(st) delta = ( reward_trajs[t] * self.reward_scale_const - + self.gamma * nextvalues * nextnonterminal + + self.gamma * nextvalues * nonterminal - values_trajs[t] ) # A = delta_t + gamma*lamdba*delta_{t+1} + ... advantages_trajs[t] = lastgaelam = ( delta - + self.gamma - * self.gae_lambda - * nextnonterminal - * lastgaelam + + self.gamma * self.gae_lambda * nonterminal * lastgaelam ) returns_trajs = advantages_trajs + values_trajs diff --git a/agent/finetune/train_ppo_exact_diffusion_agent.py b/agent/finetune/train_ppo_exact_diffusion_agent.py index e71c00f..9661ec9 100644 --- a/agent/finetune/train_ppo_exact_diffusion_agent.py +++ b/agent/finetune/train_ppo_exact_diffusion_agent.py @@ -46,12 +46,11 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): # Define train or eval - all envs restart eval_mode = self.itr % self.val_freq == 0 and not self.force_train - eval_mode = False self.model.eval() if eval_mode else self.model.train() last_itr_eval = eval_mode # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode - dones_trajs = np.empty((0, self.n_envs)) + dones_trajs = np.zeros((self.n_steps, self.n_envs)) firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) if self.reset_at_iteration or eval_mode or last_itr_eval: prev_obs_venv = self.reset_env_all(options_venv=options_venv) @@ -130,7 +129,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): ) chains_trajs = np.vstack((chains_trajs, chains_venv[None])) reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - dones_trajs = np.vstack((dones_trajs, done_venv[None])) + dones_trajs[step] = done_venv firsts_trajs[step + 1] = done_venv prev_obs_venv = obs_venv @@ -229,15 +228,14 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): lastgaelam = 0 for t in reversed(range(self.n_steps)): if t == self.n_steps - 1: - nextnonterminal = 1.0 - done_venv nextvalues = next_value else: - nextnonterminal = 1.0 - dones_trajs[t + 1] nextvalues = values_trajs[t + 1] + nonterminal = 1.0 - dones_trajs[t] # delta = r + gamma*V(st+1) - V(st) delta = ( reward_trajs[t] * self.reward_scale_const - + self.gamma * nextvalues * nextnonterminal + + self.gamma * nextvalues * nonterminal - values_trajs[t] ) # A = delta_t + gamma*lamdba*delta_{t+1} + ... @@ -245,7 +243,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent): delta + self.gamma * self.gae_lambda - * nextnonterminal + * nonterminal * lastgaelam ) returns_trajs = advantages_trajs + values_trajs diff --git a/agent/finetune/train_ppo_gaussian_agent.py b/agent/finetune/train_ppo_gaussian_agent.py index cb37c0d..f871b6b 100644 --- a/agent/finetune/train_ppo_gaussian_agent.py +++ b/agent/finetune/train_ppo_gaussian_agent.py @@ -45,7 +45,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent): last_itr_eval = eval_mode # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode - dones_trajs = np.empty((0, self.n_envs)) + dones_trajs = np.zeros((self.n_steps, self.n_envs)) firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) if self.reset_at_iteration or eval_mode or last_itr_eval: prev_obs_venv = self.reset_env_all(options_venv=options_venv) @@ -108,7 +108,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent): ) samples_trajs = np.vstack((samples_trajs, output_venv[None])) reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - dones_trajs = np.vstack((dones_trajs, done_venv[None])) + dones_trajs[step] = done_venv firsts_trajs[step + 1] = done_venv prev_obs_venv = obs_venv @@ -217,24 +217,20 @@ class TrainPPOGaussianAgent(TrainPPOAgent): lastgaelam = 0 for t in reversed(range(self.n_steps)): if t == self.n_steps - 1: - nextnonterminal = 1.0 - done_venv nextvalues = next_value else: - nextnonterminal = 1.0 - dones_trajs[t + 1] nextvalues = values_trajs[t + 1] + nonterminal = 1.0 - dones_trajs[t] # delta = r + gamma*V(st+1) - V(st) delta = ( reward_trajs[t] * self.reward_scale_const - + self.gamma * nextvalues * nextnonterminal + + self.gamma * nextvalues * nonterminal - values_trajs[t] ) # A = delta_t + gamma*lamdba*delta_{t+1} + ... advantages_trajs[t] = lastgaelam = ( delta - + self.gamma - * self.gae_lambda - * nextnonterminal - * lastgaelam + + self.gamma * self.gae_lambda * nonterminal * lastgaelam ) returns_trajs = advantages_trajs + values_trajs diff --git a/agent/finetune/train_ppo_gaussian_img_agent.py b/agent/finetune/train_ppo_gaussian_img_agent.py index 37a2ed7..964031d 100644 --- a/agent/finetune/train_ppo_gaussian_img_agent.py +++ b/agent/finetune/train_ppo_gaussian_img_agent.py @@ -58,7 +58,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): last_itr_eval = eval_mode # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode - dones_trajs = np.empty((0, self.n_envs)) + dones_trajs = np.zeros((self.n_steps, self.n_envs)) firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) if self.reset_at_iteration or eval_mode or last_itr_eval: prev_obs_venv = self.reset_env_all(options_venv=options_venv) @@ -111,7 +111,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None])) samples_trajs = np.vstack((samples_trajs, output_venv[None])) reward_trajs = np.vstack((reward_trajs, reward_venv[None])) - dones_trajs = np.vstack((dones_trajs, done_venv[None])) + dones_trajs[step] = done_venv firsts_trajs[step + 1] = done_venv prev_obs_venv = obs_venv @@ -239,24 +239,20 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent): lastgaelam = 0 for t in reversed(range(self.n_steps)): if t == self.n_steps - 1: - nextnonterminal = 1.0 - done_venv nextvalues = next_value else: - nextnonterminal = 1.0 - dones_trajs[t + 1] nextvalues = values_trajs[t + 1] + nonterminal = 1.0 - dones_trajs[t] # delta = r + gamma*V(st+1) - V(st) delta = ( reward_trajs[t] * self.reward_scale_const - + self.gamma * nextvalues * nextnonterminal + + self.gamma * nextvalues * nonterminal - values_trajs[t] ) # A = delta_t + gamma*lamdba*delta_{t+1} + ... advantages_trajs[t] = lastgaelam = ( delta - + self.gamma - * self.gae_lambda - * nextnonterminal - * lastgaelam + + self.gamma * self.gae_lambda * nonterminal * lastgaelam ) returns_trajs = advantages_trajs + values_trajs