From aa63f374feeb1b6f6739e97dcae56474689784c6 Mon Sep 17 00:00:00 2001
From: Dominik Roth
Date: Sat, 18 May 2024 15:27:06 +0200
Subject: [PATCH] .

---
 mujoco_maze/__init__.py      |  14 ++++
 mujoco_maze/ant.py           |   1 +
 mujoco_maze/assets/point.xml |   2 +-
 mujoco_maze/maze_env.py      |  18 +++++-
 mujoco_maze/point.py         |   2 +-
 mujoco_maze/pos_point.py     | 121 +++++++++++++++++++++++++++++++++++
 mujoco_maze/reacher.py       |   1 +
 mujoco_maze/swimmer.py       |   1 +
 8 files changed, 157 insertions(+), 3 deletions(-)
 create mode 100644 mujoco_maze/pos_point.py

diff --git a/mujoco_maze/__init__.py b/mujoco_maze/__init__.py
index 3309bc9..d5324dd 100644
--- a/mujoco_maze/__init__.py
+++ b/mujoco_maze/__init__.py
@@ -13,6 +13,7 @@ from mujoco_maze.maze_task import TaskRegistry
 from mujoco_maze.point import PointEnv
 from mujoco_maze.reacher import ReacherEnv
 from mujoco_maze.swimmer import SwimmerEnv
+from mujoco_maze.pos_point import PosPointEnv
 
 for maze_id in TaskRegistry.keys():
     for i, task_cls in enumerate(TaskRegistry.tasks(maze_id)):
@@ -31,6 +32,19 @@ for maze_id in TaskRegistry.keys():
                 max_episode_steps=1000,
                 reward_threshold=task_cls.REWARD_THRESHOLD,
             )
+            # PosPoint
+            gym.envs.register(
+                id=f"PosPoint{maze_id}-v{i}",
+                entry_point="mujoco_maze.maze_env:MazeEnv",
+                kwargs=dict(
+                    model_cls=PosPointEnv,
+                    maze_task=task_cls,
+                    maze_size_scaling=point_scale,
+                    inner_reward_scaling=task_cls.INNER_REWARD_SCALING,
+                ),
+                max_episode_steps=1000,
+                reward_threshold=task_cls.REWARD_THRESHOLD,
+            )
 
         ant_scale = task_cls.MAZE_SIZE_SCALING.ant
         if ant_scale is not None:
diff --git a/mujoco_maze/ant.py b/mujoco_maze/ant.py
index bb8448f..b5c2026 100644
--- a/mujoco_maze/ant.py
+++ b/mujoco_maze/ant.py
@@ -57,6 +57,7 @@ class AntEnv(AgentModel):
         forward_reward_weight: float = 1.0,
         ctrl_cost_weight: float = 1e-4,
         forward_reward_fn: ForwardRewardFn = forward_reward_vnorm,
+        maze_env=None,
     ) -> None:
         self._forward_reward_weight = forward_reward_weight
         self._ctrl_cost_weight = ctrl_cost_weight
diff --git a/mujoco_maze/assets/point.xml b/mujoco_maze/assets/point.xml
index 4c06cb1..f7dc3d7 100644
--- a/mujoco_maze/assets/point.xml
+++ b/mujoco_maze/assets/point.xml
@@ -14,7 +14,7 @@
-
+
diff --git a/mujoco_maze/maze_env.py b/mujoco_maze/maze_env.py
index d1725bd..c56d3ad 100644
--- a/mujoco_maze/maze_env.py
+++ b/mujoco_maze/maze_env.py
@@ -212,10 +212,13 @@ class MazeEnv(gym.Env):
                 rgba=goal.rgb.rgba_str(),
             )
 
+        # Add the target site for visualization
+        self._add_target_site(worldbody)
+
         _, file_path = tempfile.mkstemp(text=True, suffix=".xml")
         tree.write(file_path)
         self.world_tree = tree
-        self.wrapped_env = model_cls(file_path=file_path, **kwargs)
+        self.wrapped_env = model_cls(file_path=file_path, maze_env=self, **kwargs)
         self.observation_space = self._get_obs_space()
         self._websock_port = websock_port
         self._camera_move_x = camera_move_x
@@ -225,6 +228,18 @@ class MazeEnv(gym.Env):
         self._mj_offscreen_viewer = None
         self._websock_server_pipe = None
 
+    def _add_target_site(self, worldbody):
+        """Add visualization for the target position if the agent has one."""
+        target_size = self._maze_size_scaling * 0.1
+        ET.SubElement(
+            worldbody,
+            "site",
+            name="target_site",
+            pos="2 0 0",  # Initial position will be updated during the simulation
+            size=f"{target_size} {target_size} {target_size}",
+            rgba="0 0 1 0.5",  # Blue and semi-transparent
+        )
+
     @property
     def has_extended_obs(self) -> bool:
         return self._top_down_view or self._observe_blocks or self._observe_balls
@@ -473,6 +488,7 @@ class MazeEnv(gym.Env):
                         self.wrapped_env.data.xipos[idx][:2] = pos
         else:
             inner_next_obs, inner_reward, _, info = self.wrapped_env.step(action)
+        next_obs = self._get_obs()
         inner_reward = self._inner_reward_scaling * inner_reward
         outer_reward = self._task.reward(next_obs)
diff --git a/mujoco_maze/point.py b/mujoco_maze/point.py
index 9745c1f..d026917 100644
--- a/mujoco_maze/point.py
+++ b/mujoco_maze/point.py
@@ -33,7 +33,7 @@ class PointEnv(AgentModel):
 
     VELOCITY_LIMITS: float = 10.0
 
-    def __init__(self, file_path: Optional[str] = None) -> None:
+    def __init__(self, file_path: Optional[str] = None, maze_env=None) -> None:
         high = np.inf * np.ones(6, dtype=np.float32)
         high[3:] = self.VELOCITY_LIMITS * 1.2
         high[self.ORI_IND] = np.pi
diff --git a/mujoco_maze/pos_point.py b/mujoco_maze/pos_point.py
new file mode 100644
index 0000000..6def29d
--- /dev/null
+++ b/mujoco_maze/pos_point.py
@@ -0,0 +1,121 @@
+from typing import Optional, Tuple
+import gym
+import mujoco
+import numpy as np
+import xml.etree.ElementTree as ET
+import os
+import tempfile
+from mujoco_maze.agent_model import AgentModel
+
+class PosPointEnv(AgentModel):
+    metadata = {
+        "render_modes": [
+            "human",
+            "rgb_array",
+            "depth_array",
+        ],
+        "render_fps": 50,
+    }
+
+    FILE: str = "point.xml"
+    MANUAL_COLLISION: bool = True
+    RADIUS: float = 0.4
+    OBJBALL_TYPE: str = "hinge"
+    VELOCITY_LIMITS: float = 0.333
+    ACCELERATION_LIMITS: float = 0.1
+
+    def __init__(self, file_path: Optional[str] = None, maze_env=None) -> None:
+        self.maze_env = maze_env
+        high = np.inf * np.ones(6, dtype=np.float32)  # [current_x, current_y, velocity_x, velocity_y, target_x, target_y]
+        low = -high
+        observation_space = gym.spaces.Box(low, high)
+        action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)  # Normalized [-1, 1] action space
+        self.target_pos = np.zeros(2)
+        self.action_penalty = 0.1
+        super().__init__(file_path, 1, observation_space)
+        self.action_space = action_space
+
+    def step(self, action: np.ndarray) -> Tuple[np.ndarray, float, bool, dict]:
+        # Calculate the maze size based on structure and scaling
+        maze_size = self._calculate_maze_size()
+
+        # Scale the action to the maze size
+        self.target_pos = (action + 0.5) / 1.9 * maze_size
+
+        # Get current position and velocity
+        current_pos = self.get_xy()
+        current_vel = self.data.qvel[:2]
+
+        # Compute desired velocity towards the target position
+        direction = self.target_pos - current_pos
+        distance = np.linalg.norm(direction)
+        if distance > 0:
+            direction /= distance  # Normalize the direction
+
+        desired_velocity = direction * min(distance, self.VELOCITY_LIMITS)
+
+        # Compute acceleration required to reach the desired velocity
+        acceleration = desired_velocity - current_vel
+        acceleration = np.clip(acceleration, -self.ACCELERATION_LIMITS, self.ACCELERATION_LIMITS)
+
+        # Update velocity and position
+        new_vel = current_vel + acceleration
+        new_vel = np.clip(new_vel, -self.VELOCITY_LIMITS, self.VELOCITY_LIMITS)
+
+        qpos = self.data.qpos.copy()
+        qpos[:2] += new_vel  # Update position with new velocity
+
+        # Calculate the orientation to face the target
+        angle_to_target = np.arctan2(direction[1], direction[0])
+        qpos[2] = angle_to_target  # Set the orientation to face the target
+
+        qpos = np.concatenate([qpos[:3], self.data.qpos[3:]])  # Ensure qpos has the correct shape
+        qvel = np.concatenate([new_vel, self.data.qvel[2:]])  # Ensure qvel has the correct shape
+        self.set_state(qpos, qvel)
+
+        for _ in range(0, self.frame_skip):
+            mujoco.mj_step(self.model, self.data)
+
+        # Update the position of the target site
+        self.data.site_xpos[-1][:2] = self.target_pos  # The target site is the last site added to the model
+        self.data.site_xpos[-1][2] = 0.1  # Slightly higher in space
+
+        next_obs = self._get_obs()
+        reward = -1 * min(max(0, distance - 3), 100) * self.action_penalty
+        return next_obs, reward, False, {}
+
+    def _calculate_maze_size(self) -> float:
+        """Calculate the effective size of the maze."""
+        maze_structure = self.maze_env._maze_structure
+        num_rows = len(maze_structure)
+        num_cols = len(maze_structure[0]) if num_rows > 0 else 0
+        maze_size_scaling = self.maze_env._maze_size_scaling
+        maze_size = max(num_rows, num_cols) * maze_size_scaling
+        return maze_size
+
+    def _get_obs(self):
+        return np.concatenate(
+            [
+                self.data.qpos.flat[:2],  # current position
+                self.data.qvel.flat[:2],  # current velocity
+                self.target_pos,  # target position
+            ]
+        )
+
+    def reset_model(self):
+        qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-0.1, high=0.1)
+        qvel = self.init_qvel + self.np_random.random(self.model.nv) * 0.1
+
+        # Set everything other than point to original position and 0 velocity.
+        qpos[2:] = self.init_qpos[2:]
+        qvel[2:] = 0.0
+        self.set_state(qpos, qvel)
+        return self._get_obs()
+
+    def get_xy(self):
+        return self.data.qpos[:2].copy()
+
+    def set_xy(self, xy: np.ndarray) -> None:
+        qpos = self.data.qpos.copy()
+        qpos[:2] = xy
+        self.set_state(qpos, self.data.qvel)
diff --git a/mujoco_maze/reacher.py b/mujoco_maze/reacher.py
index 9026e40..c80915a 100644
--- a/mujoco_maze/reacher.py
+++ b/mujoco_maze/reacher.py
@@ -31,6 +31,7 @@ class ReacherEnv(AgentModel):
         forward_reward_weight: float = 1.0,
         ctrl_cost_weight: float = 1e-4,
         forward_reward_fn: ForwardRewardFn = forward_reward_vnorm,
+        maze_env=None,
     ) -> None:
         self._forward_reward_weight = forward_reward_weight
         self._ctrl_cost_weight = ctrl_cost_weight
diff --git a/mujoco_maze/swimmer.py b/mujoco_maze/swimmer.py
index 30938a4..9062f09 100644
--- a/mujoco_maze/swimmer.py
+++ b/mujoco_maze/swimmer.py
@@ -33,6 +33,7 @@ class SwimmerEnv(AgentModel):
         forward_reward_weight: float = 1.0,
         ctrl_cost_weight: float = 1e-4,
         forward_reward_fn: ForwardRewardFn = forward_reward_vnorm,
+        maze_env=None,
     ) -> None:
         self._forward_reward_weight = forward_reward_weight
         self._ctrl_cost_weight = ctrl_cost_weight
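A minimal usage sketch of the environments this patch registers. It assumes gym and mujoco_maze are installed, that "UMaze" is one of the maze ids in TaskRegistry, and that task index 0 exists; the exact id suffix may differ per task.

    import gym
    import mujoco_maze  # noqa: F401 -- importing runs the gym.envs.register() calls patched above

    # Hypothetical id built from the f"PosPoint{maze_id}-v{i}" pattern in __init__.py.
    env = gym.make("PosPointUMaze-v0")
    obs = env.reset()
    for _ in range(100):
        # PosPointEnv actions are normalized 2D target positions in [-1, 1]; step()
        # maps them to an absolute target inside the maze and drives the point
        # toward it under the velocity and acceleration limits.
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()
    env.close()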