more intuitive handling of done
in GAE
This commit is contained in:
parent
f5a8da5719
commit
64595baca9
@ -68,7 +68,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
|||||||
last_itr_eval = eval_mode
|
last_itr_eval = eval_mode
|
||||||
|
|
||||||
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
||||||
dones_trajs = np.empty((0, self.n_envs))
|
dones_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||||
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
||||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||||
@ -138,7 +138,7 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
|||||||
)
|
)
|
||||||
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
|
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
|
||||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||||
dones_trajs = np.vstack((dones_trajs, done_venv[None]))
|
dones_trajs[step] = done_venv
|
||||||
firsts_trajs[step + 1] = done_venv
|
firsts_trajs[step + 1] = done_venv
|
||||||
prev_obs_venv = obs_venv
|
prev_obs_venv = obs_venv
|
||||||
|
|
||||||
@ -252,24 +252,20 @@ class TrainPPODiffusionAgent(TrainPPOAgent):
|
|||||||
lastgaelam = 0
|
lastgaelam = 0
|
||||||
for t in reversed(range(self.n_steps)):
|
for t in reversed(range(self.n_steps)):
|
||||||
if t == self.n_steps - 1:
|
if t == self.n_steps - 1:
|
||||||
nextnonterminal = 1.0 - done_venv
|
|
||||||
nextvalues = next_value
|
nextvalues = next_value
|
||||||
else:
|
else:
|
||||||
nextnonterminal = 1.0 - dones_trajs[t + 1]
|
|
||||||
nextvalues = values_trajs[t + 1]
|
nextvalues = values_trajs[t + 1]
|
||||||
|
nonterminal = 1.0 - dones_trajs[t]
|
||||||
# delta = r + gamma*V(st+1) - V(st)
|
# delta = r + gamma*V(st+1) - V(st)
|
||||||
delta = (
|
delta = (
|
||||||
reward_trajs[t] * self.reward_scale_const
|
reward_trajs[t] * self.reward_scale_const
|
||||||
+ self.gamma * nextvalues * nextnonterminal
|
+ self.gamma * nextvalues * nonterminal
|
||||||
- values_trajs[t]
|
- values_trajs[t]
|
||||||
)
|
)
|
||||||
# A = delta_t + gamma*lambda*delta_{t+1} + ...
|
# A = delta_t + gamma*lambda*delta_{t+1} + ...
|
||||||
advantages_trajs[t] = lastgaelam = (
|
advantages_trajs[t] = lastgaelam = (
|
||||||
delta
|
delta
|
||||||
+ self.gamma
|
+ self.gamma * self.gae_lambda * nonterminal * lastgaelam
|
||||||
* self.gae_lambda
|
|
||||||
* nextnonterminal
|
|
||||||
* lastgaelam
|
|
||||||
)
|
)
|
||||||
returns_trajs = advantages_trajs + values_trajs
|
returns_trajs = advantages_trajs + values_trajs
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
|||||||
last_itr_eval = eval_mode
|
last_itr_eval = eval_mode
|
||||||
|
|
||||||
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
||||||
dones_trajs = np.empty((0, self.n_envs))
|
dones_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||||
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
||||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||||
@ -118,7 +118,7 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
|||||||
obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
|
obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
|
||||||
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
|
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
|
||||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||||
dones_trajs = np.vstack((dones_trajs, done_venv[None]))
|
dones_trajs[step] = done_venv
|
||||||
firsts_trajs[step + 1] = done_venv
|
firsts_trajs[step + 1] = done_venv
|
||||||
prev_obs_venv = obs_venv
|
prev_obs_venv = obs_venv
|
||||||
|
|
||||||
@ -251,24 +251,20 @@ class TrainPPOImgDiffusionAgent(TrainPPODiffusionAgent):
|
|||||||
lastgaelam = 0
|
lastgaelam = 0
|
||||||
for t in reversed(range(self.n_steps)):
|
for t in reversed(range(self.n_steps)):
|
||||||
if t == self.n_steps - 1:
|
if t == self.n_steps - 1:
|
||||||
nextnonterminal = 1.0 - done_venv
|
|
||||||
nextvalues = next_value
|
nextvalues = next_value
|
||||||
else:
|
else:
|
||||||
nextnonterminal = 1.0 - dones_trajs[t + 1]
|
|
||||||
nextvalues = values_trajs[t + 1]
|
nextvalues = values_trajs[t + 1]
|
||||||
|
nonterminal = 1.0 - dones_trajs[t]
|
||||||
# delta = r + gamma*V(st+1) - V(st)
|
# delta = r + gamma*V(st+1) - V(st)
|
||||||
delta = (
|
delta = (
|
||||||
reward_trajs[t] * self.reward_scale_const
|
reward_trajs[t] * self.reward_scale_const
|
||||||
+ self.gamma * nextvalues * nextnonterminal
|
+ self.gamma * nextvalues * nonterminal
|
||||||
- values_trajs[t]
|
- values_trajs[t]
|
||||||
)
|
)
|
||||||
# A = delta_t + gamma*lambda*delta_{t+1} + ...
|
# A = delta_t + gamma*lambda*delta_{t+1} + ...
|
||||||
advantages_trajs[t] = lastgaelam = (
|
advantages_trajs[t] = lastgaelam = (
|
||||||
delta
|
delta
|
||||||
+ self.gamma
|
+ self.gamma * self.gae_lambda * nonterminal * lastgaelam
|
||||||
* self.gae_lambda
|
|
||||||
* nextnonterminal
|
|
||||||
* lastgaelam
|
|
||||||
)
|
)
|
||||||
returns_trajs = advantages_trajs + values_trajs
|
returns_trajs = advantages_trajs + values_trajs
|
||||||
|
|
||||||
|
@ -46,12 +46,11 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
|||||||
|
|
||||||
# Define train or eval - all envs restart
|
# Define train or eval - all envs restart
|
||||||
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
|
eval_mode = self.itr % self.val_freq == 0 and not self.force_train
|
||||||
eval_mode = False
|
|
||||||
self.model.eval() if eval_mode else self.model.train()
|
self.model.eval() if eval_mode else self.model.train()
|
||||||
last_itr_eval = eval_mode
|
last_itr_eval = eval_mode
|
||||||
|
|
||||||
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
||||||
dones_trajs = np.empty((0, self.n_envs))
|
dones_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||||
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
||||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||||
@ -130,7 +129,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
|||||||
)
|
)
|
||||||
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
|
chains_trajs = np.vstack((chains_trajs, chains_venv[None]))
|
||||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||||
dones_trajs = np.vstack((dones_trajs, done_venv[None]))
|
dones_trajs[step] = done_venv
|
||||||
firsts_trajs[step + 1] = done_venv
|
firsts_trajs[step + 1] = done_venv
|
||||||
prev_obs_venv = obs_venv
|
prev_obs_venv = obs_venv
|
||||||
|
|
||||||
@ -229,15 +228,14 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
|||||||
lastgaelam = 0
|
lastgaelam = 0
|
||||||
for t in reversed(range(self.n_steps)):
|
for t in reversed(range(self.n_steps)):
|
||||||
if t == self.n_steps - 1:
|
if t == self.n_steps - 1:
|
||||||
nextnonterminal = 1.0 - done_venv
|
|
||||||
nextvalues = next_value
|
nextvalues = next_value
|
||||||
else:
|
else:
|
||||||
nextnonterminal = 1.0 - dones_trajs[t + 1]
|
|
||||||
nextvalues = values_trajs[t + 1]
|
nextvalues = values_trajs[t + 1]
|
||||||
|
nonterminal = 1.0 - dones_trajs[t]
|
||||||
# delta = r + gamma*V(st+1) - V(st)
|
# delta = r + gamma*V(st+1) - V(st)
|
||||||
delta = (
|
delta = (
|
||||||
reward_trajs[t] * self.reward_scale_const
|
reward_trajs[t] * self.reward_scale_const
|
||||||
+ self.gamma * nextvalues * nextnonterminal
|
+ self.gamma * nextvalues * nonterminal
|
||||||
- values_trajs[t]
|
- values_trajs[t]
|
||||||
)
|
)
|
||||||
# A = delta_t + gamma*lambda*delta_{t+1} + ...
|
# A = delta_t + gamma*lambda*delta_{t+1} + ...
|
||||||
@ -245,7 +243,7 @@ class TrainPPOExactDiffusionAgent(TrainPPODiffusionAgent):
|
|||||||
delta
|
delta
|
||||||
+ self.gamma
|
+ self.gamma
|
||||||
* self.gae_lambda
|
* self.gae_lambda
|
||||||
* nextnonterminal
|
* nonterminal
|
||||||
* lastgaelam
|
* lastgaelam
|
||||||
)
|
)
|
||||||
returns_trajs = advantages_trajs + values_trajs
|
returns_trajs = advantages_trajs + values_trajs
|
||||||
|
@ -45,7 +45,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
|||||||
last_itr_eval = eval_mode
|
last_itr_eval = eval_mode
|
||||||
|
|
||||||
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
||||||
dones_trajs = np.empty((0, self.n_envs))
|
dones_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||||
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
||||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||||
@ -108,7 +108,7 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
|||||||
)
|
)
|
||||||
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
|
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
|
||||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||||
dones_trajs = np.vstack((dones_trajs, done_venv[None]))
|
dones_trajs[step] = done_venv
|
||||||
firsts_trajs[step + 1] = done_venv
|
firsts_trajs[step + 1] = done_venv
|
||||||
prev_obs_venv = obs_venv
|
prev_obs_venv = obs_venv
|
||||||
|
|
||||||
@ -217,24 +217,20 @@ class TrainPPOGaussianAgent(TrainPPOAgent):
|
|||||||
lastgaelam = 0
|
lastgaelam = 0
|
||||||
for t in reversed(range(self.n_steps)):
|
for t in reversed(range(self.n_steps)):
|
||||||
if t == self.n_steps - 1:
|
if t == self.n_steps - 1:
|
||||||
nextnonterminal = 1.0 - done_venv
|
|
||||||
nextvalues = next_value
|
nextvalues = next_value
|
||||||
else:
|
else:
|
||||||
nextnonterminal = 1.0 - dones_trajs[t + 1]
|
|
||||||
nextvalues = values_trajs[t + 1]
|
nextvalues = values_trajs[t + 1]
|
||||||
|
nonterminal = 1.0 - dones_trajs[t]
|
||||||
# delta = r + gamma*V(st+1) - V(st)
|
# delta = r + gamma*V(st+1) - V(st)
|
||||||
delta = (
|
delta = (
|
||||||
reward_trajs[t] * self.reward_scale_const
|
reward_trajs[t] * self.reward_scale_const
|
||||||
+ self.gamma * nextvalues * nextnonterminal
|
+ self.gamma * nextvalues * nonterminal
|
||||||
- values_trajs[t]
|
- values_trajs[t]
|
||||||
)
|
)
|
||||||
# A = delta_t + gamma*lambda*delta_{t+1} + ...
|
# A = delta_t + gamma*lambda*delta_{t+1} + ...
|
||||||
advantages_trajs[t] = lastgaelam = (
|
advantages_trajs[t] = lastgaelam = (
|
||||||
delta
|
delta
|
||||||
+ self.gamma
|
+ self.gamma * self.gae_lambda * nonterminal * lastgaelam
|
||||||
* self.gae_lambda
|
|
||||||
* nextnonterminal
|
|
||||||
* lastgaelam
|
|
||||||
)
|
)
|
||||||
returns_trajs = advantages_trajs + values_trajs
|
returns_trajs = advantages_trajs + values_trajs
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
|||||||
last_itr_eval = eval_mode
|
last_itr_eval = eval_mode
|
||||||
|
|
||||||
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
# Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode
|
||||||
dones_trajs = np.empty((0, self.n_envs))
|
dones_trajs = np.zeros((self.n_steps, self.n_envs))
|
||||||
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs))
|
||||||
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
if self.reset_at_iteration or eval_mode or last_itr_eval:
|
||||||
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
prev_obs_venv = self.reset_env_all(options_venv=options_venv)
|
||||||
@ -111,7 +111,7 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
|||||||
obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
|
obs_trajs[k] = np.vstack((obs_trajs[k], prev_obs_venv[k][None]))
|
||||||
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
|
samples_trajs = np.vstack((samples_trajs, output_venv[None]))
|
||||||
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
reward_trajs = np.vstack((reward_trajs, reward_venv[None]))
|
||||||
dones_trajs = np.vstack((dones_trajs, done_venv[None]))
|
dones_trajs[step] = done_venv
|
||||||
firsts_trajs[step + 1] = done_venv
|
firsts_trajs[step + 1] = done_venv
|
||||||
prev_obs_venv = obs_venv
|
prev_obs_venv = obs_venv
|
||||||
|
|
||||||
@ -239,24 +239,20 @@ class TrainPPOImgGaussianAgent(TrainPPOGaussianAgent):
|
|||||||
lastgaelam = 0
|
lastgaelam = 0
|
||||||
for t in reversed(range(self.n_steps)):
|
for t in reversed(range(self.n_steps)):
|
||||||
if t == self.n_steps - 1:
|
if t == self.n_steps - 1:
|
||||||
nextnonterminal = 1.0 - done_venv
|
|
||||||
nextvalues = next_value
|
nextvalues = next_value
|
||||||
else:
|
else:
|
||||||
nextnonterminal = 1.0 - dones_trajs[t + 1]
|
|
||||||
nextvalues = values_trajs[t + 1]
|
nextvalues = values_trajs[t + 1]
|
||||||
|
nonterminal = 1.0 - dones_trajs[t]
|
||||||
# delta = r + gamma*V(st+1) - V(st)
|
# delta = r + gamma*V(st+1) - V(st)
|
||||||
delta = (
|
delta = (
|
||||||
reward_trajs[t] * self.reward_scale_const
|
reward_trajs[t] * self.reward_scale_const
|
||||||
+ self.gamma * nextvalues * nextnonterminal
|
+ self.gamma * nextvalues * nonterminal
|
||||||
- values_trajs[t]
|
- values_trajs[t]
|
||||||
)
|
)
|
||||||
# A = delta_t + gamma*lambda*delta_{t+1} + ...
|
# A = delta_t + gamma*lambda*delta_{t+1} + ...
|
||||||
advantages_trajs[t] = lastgaelam = (
|
advantages_trajs[t] = lastgaelam = (
|
||||||
delta
|
delta
|
||||||
+ self.gamma
|
+ self.gamma * self.gae_lambda * nonterminal * lastgaelam
|
||||||
* self.gae_lambda
|
|
||||||
* nextnonterminal
|
|
||||||
* lastgaelam
|
|
||||||
)
|
)
|
||||||
returns_trajs = advantages_trajs + values_trajs
|
returns_trajs = advantages_trajs + values_trajs
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user