commit 8075655301
parent 3876478b96

    update
@@ -113,7 +113,11 @@ class ALRBallInACupEnv(alr_mujoco_env.AlrMujocoEnv, utils.EzPickle):
         return ob, reward, done, dict(reward_dist=reward_dist,
                                       reward_ctrl=reward_ctrl,
                                       velocity=angular_vel,
-                                      traj=self._q_pos, is_success=success,
+                                      # traj=self._q_pos,
+                                      action=a,
+                                      q_pos=self.sim.data.qpos[0:7].ravel().copy(),
+                                      q_vel=self.sim.data.qvel[0:7].ravel().copy(),
+                                      is_success=success,
                                       is_collided=is_collided, sim_crash=crash)

     def check_traj_in_joint_limits(self):

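With this change the info dict returned from step() also carries the applied action and the first seven joint positions and velocities at every step. A minimal sketch of a rollout loop collecting these new keys (the env and policy objects are placeholders, not code from this repo):

    import numpy as np

    def collect_rollout(env, policy, horizon=500):
        # env is any gym-style environment whose step() info dict exposes the
        # keys added in this diff; policy maps an observation to an action.
        obs = env.reset()
        actions, q_pos_hist, q_vel_hist = [], [], []
        info = {}
        for _ in range(horizon):
            a = policy(obs)
            obs, reward, done, info = env.step(a)
            actions.append(info["action"])      # applied action (new key)
            q_pos_hist.append(info["q_pos"])    # first 7 joint positions (new key)
            q_vel_hist.append(info["q_vel"])    # first 7 joint velocities (new key)
            if done:
                break
        return (np.stack(actions), np.stack(q_pos_hist), np.stack(q_vel_hist),
                info.get("is_success", False))
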
@@ -81,11 +81,11 @@ class BallInACupReward(alr_reward_fct.AlrReward):
             # cost = 0.5 * dist_final + 0.05 * cost_angle # TODO: Increase cost_angle weight # 0.5 * min_dist +
             # reward = np.exp(-2 * cost) - 1e-2 * action_cost - self.collision_penalty * int(self._is_collided)
             # reward = - dist_final**2 - 1e-4 * cost_angle - 1e-5 * action_cost - self.collision_penalty * int(self._is_collided)
-            reward = - dist_final**2 - min_dist_final**2 - 1e-4 * cost_angle - 5e-4 * action_cost - self.collision_penalty * int(self._is_collided)
+            reward = - dist_final**2 - min_dist_final**2 - 1e-4 * cost_angle - 1e-3 * action_cost - self.collision_penalty * int(self._is_collided)
             success = dist_final < 0.05 and ball_in_cup and not self._is_collided
             crash = self._is_collided
         else:
-            reward = - 5e-4 * action_cost - 1e-4 * cost_angle # TODO: increase action_cost weight
+            reward = - 1e-3 * action_cost - 1e-4 * cost_angle # TODO: increase action_cost weight
             success = False
             crash = False

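For reference, the final-phase reward as it reads after this commit, written out as a standalone sketch; names follow the diff and the inputs are assumed to be precomputed distances and costs:

    def final_phase_reward(dist_final, min_dist_final, cost_angle, action_cost,
                           is_collided, ball_in_cup, collision_penalty):
        # Quadratic distance terms plus small angle and action penalties;
        # the action_cost weight was raised from 5e-4 to 1e-3 in this commit.
        reward = (- dist_final ** 2 - min_dist_final ** 2
                  - 1e-4 * cost_angle - 1e-3 * action_cost
                  - collision_penalty * int(is_collided))
        success = dist_final < 0.05 and ball_in_cup and not is_collided
        crash = is_collided
        return reward, success, crash
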
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from collections import defaultdict

 import gym
 import numpy as np
@@ -64,11 +65,11 @@ class MPWrapper(gym.Wrapper, ABC):
             trajectory = np.vstack([trajectory, np.tile(trajectory[-1, :], [self.post_traj_steps, 1])])
             velocity = np.vstack([velocity, np.zeros(shape=(self.post_traj_steps, self.mp.n_dof))])

-        # self._trajectory = trajectory
+        self._trajectory = trajectory
         # self._velocity = velocity

         rewards = 0
-        info = {}
+        infos = defaultdict(list)
         # create random obs as the reset function is called externally
         obs = self.env.observation_space.sample()

@@ -77,14 +78,14 @@ class MPWrapper(gym.Wrapper, ABC):
             obs, rew, done, info = self.env.step(ac)
             rewards += rew
             # TODO return all dicts?
-            # [infos[k].append(v) for k, v in info.items()]
+            [infos[k].append(v) for k, v in info.items()]
             if self.render_mode:
                 self.env.render(mode=self.render_mode, **self.render_kwargs)
             if done:
                 break

         done = True
-        return obs[self.env.active_obs], rewards, done, info
+        return obs[self.env.active_obs], rewards, done, infos

     def render(self, mode='human', **kwargs):
         """Only set render options here, such that they can be used during the rollout.
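The wrapper previously returned only the last step's info dict; with defaultdict(list) it now accumulates every step's entries into per-key lists. The aggregation pattern in isolation (the two example step dicts below are made up for illustration):

    from collections import defaultdict

    def aggregate_infos(per_step_infos):
        # Mirrors the [infos[k].append(v) for k, v in info.items()] line:
        # each key of every per-step info dict is collected into a list.
        infos = defaultdict(list)
        for info in per_step_infos:
            for k, v in info.items():
                infos[k].append(v)
        return infos

    steps = [dict(reward_dist=0.3, is_success=False),
             dict(reward_dist=0.1, is_success=True)]
    print(dict(aggregate_infos(steps)))  # {'reward_dist': [0.3, 0.1], 'is_success': [False, True]}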