import os
from typing import Any, Dict, List, Tuple

from gymnasium import spaces
import mujoco
import numpy as np

import robocasa
from robocasa.utils.gym_utils.gymnasium_basic import (
    RoboCasaEnv,
    create_env_robosuite,
)
from robocasa.wrappers.ik_wrapper import IKWrapper
from robosuite.controllers import load_composite_controller_config
from robosuite.utils.log_utils import ROBOSUITE_DEFAULT_LOGGER

from decoupled_wbc.control.envs.robocasa.utils.cam_key_converter import CameraKeyMapper
from decoupled_wbc.control.envs.robocasa.utils.robot_key_converter import Gr00tObsActionConverter
from decoupled_wbc.control.robot_model.robot_model import RobotModel

ALLOWED_LANGUAGE_CHARSET = (
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ,.\n\t[]{}()!?'_:"
)


class Gr00tLocomanipRoboCasaEnv(RoboCasaEnv):
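    """Gymnasium-style RoboCasa environment exposing GR00T-format observations and actions.

    Wraps a robosuite/RoboCasa env and uses one ``Gr00tObsActionConverter`` per robot to
    translate between RoboCasa's joint ordering and the GR00T robot-model ordering.
    """
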
    def __init__(
        self,
        env_name: str,
        robots_name: str,
        robot_model: RobotModel,  # gr00t robot model
        input_space: str = "JOINT_SPACE",  # either "JOINT_SPACE" or "EEF_SPACE"
        camera_names: List[str] = ["egoview"],
        camera_heights: List[int] | None = None,
        camera_widths: List[int] | None = None,
        onscreen: bool = False,
        offscreen: bool = False,
        dump_rollout_dataset_dir: str | None = None,
        rollout_hdf5: str | None = None,
        rollout_trainset: int | None = None,
        controller_configs: str | None = None,
        ik_indicator: bool = False,
        **kwargs,
    ):
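        """Create the underlying robosuite env and the GR00T <-> RoboCasa converters.

        ``robots_name`` may encode several robots joined by ``_`` (its first token also
        selects the controller config); ``input_space`` switches between joint-space
        ("BASIC") control and the default IK end-effector control.
        """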
        # ========= Create env =========
        if controller_configs is None:
            if "G1" in robots_name:
                controller_configs = (
                    "robocasa/examples/third_party_controller/default_mink_ik_g1_wbc.json"
                )
            elif "GR1" in robots_name:
                controller_configs = (
                    "robocasa/examples/third_party_controller/default_mink_ik_gr1_smallkd.json"
                )
            else:
                raise ValueError(f"Unsupported robot name: {robots_name}")
            # The default configs above are relative to the robocasa package root
            controller_configs = os.path.join(
                os.path.dirname(robocasa.__file__),
                "../",
                controller_configs,
            )
        controller_configs = load_composite_controller_config(
            controller=controller_configs,
            robot=robots_name.split("_")[0],
        )
        if input_space == "JOINT_SPACE":
            controller_configs["type"] = "BASIC"
            controller_configs["composite_controller_specific_configs"] = {}
            controller_configs["control_delta"] = False

        self.camera_key_mapper = CameraKeyMapper()
        self.camera_names = camera_names
        if camera_widths is None:
            self.camera_widths = [
                self.camera_key_mapper.get_camera_config(name)[1] for name in camera_names
            ]
        else:
            self.camera_widths = camera_widths
        if camera_heights is None:
            self.camera_heights = [
                self.camera_key_mapper.get_camera_config(name)[2] for name in camera_names
            ]
        else:
            self.camera_heights = camera_heights

        self.env, self.env_kwargs = create_env_robosuite(
            env_name=env_name,
            robots=robots_name.split("_"),
            controller_configs=controller_configs,
            camera_names=camera_names,
            camera_widths=self.camera_widths,
            camera_heights=self.camera_heights,
            enable_render=offscreen,
            onscreen=onscreen,
            **kwargs,  # forward remaining kwargs to create_env_robosuite
        )
        if ik_indicator:
            self.env = IKWrapper(self.env, ik_indicator=True)

        # ========= Create converters first to get total DOFs =========
        # For now, assume a single robot (multi-robot support can be added later)
        self.obs_action_converter: List[Gr00tObsActionConverter] = [
            Gr00tObsActionConverter(
                robot_model=robot_model,
                robosuite_robot_model=self.env.robots[i],
            )
            for i in range(len(self.env.robots))
        ]
        self.body_dofs = sum(converter.body_dof for converter in self.obs_action_converter)
        self.gripper_dofs = sum(converter.gripper_dof for converter in self.obs_action_converter)
        self.total_dofs = self.body_dofs + self.gripper_dofs
        self.body_nu = sum(converter.body_nu for converter in self.obs_action_converter)
        self.gripper_nu = sum(converter.gripper_nu for converter in self.obs_action_converter)
        self.total_nu = self.body_nu + self.gripper_nu

        # ========= Create spaces to match total DOFs =========
        self.get_observation_space()
        self.get_action_space()

        self.enable_render = offscreen
        self.render_obs_key = f"{camera_names[0]}_image"
        self.render_cache = None
        self.dump_rollout_dataset_dir = dump_rollout_dataset_dir
        self.gr00t_exporter = None
        self.np_exporter = None
        self.rollout_hdf5 = rollout_hdf5
        self.rollout_trainset = rollout_trainset
        self.rollout_initial_state = {}
        self.verbose = False
        if self.verbose:
            for k, v in self.observation_space.items():
                print("{OBS}", k, v)
            for k, v in self.action_space.items():
                print("{ACTION}", k, v)
        self.overridden_floating_base_action = None

    def get_observation_space(self):
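        """Build ``self.observation_space`` from the converter DOF/actuator counts.

        Keys cover the floating base, body and hand joint states, estimated torques,
        the language instruction, per-camera RGB images, and any privileged
        observations the wrapped env advertises via ``get_privileged_obs_keys``.
        """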
        self.observation_space = spaces.Dict({})
        # Add all the observation spaces
        self.observation_space["time"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(1,), dtype=np.float32
        )
        self.observation_space["floating_base_pose"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(7,), dtype=np.float32
        )
        self.observation_space["floating_base_vel"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(6,), dtype=np.float32
        )
        self.observation_space["floating_base_acc"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(6,), dtype=np.float32
        )
        self.observation_space["body_q"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.body_dofs,), dtype=np.float32
        )
        self.observation_space["body_dq"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.body_dofs,), dtype=np.float32
        )
        self.observation_space["body_ddq"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.body_dofs,), dtype=np.float32
        )
        self.observation_space["body_tau_est"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.body_nu,), dtype=np.float32
        )
        # Hand spaces assume two symmetric grippers splitting the total gripper DOFs
        self.observation_space["left_hand_q"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.gripper_dofs // 2,), dtype=np.float32
        )
        self.observation_space["left_hand_dq"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.gripper_dofs // 2,), dtype=np.float32
        )
        self.observation_space["left_hand_ddq"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.gripper_dofs // 2,), dtype=np.float32
        )
        self.observation_space["left_hand_tau_est"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.gripper_nu // 2,), dtype=np.float32
        )
        self.observation_space["right_hand_q"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.gripper_dofs // 2,), dtype=np.float32
        )
        self.observation_space["right_hand_dq"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.gripper_dofs // 2,), dtype=np.float32
        )
        self.observation_space["right_hand_ddq"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.gripper_dofs // 2,), dtype=np.float32
        )
        self.observation_space["right_hand_tau_est"] = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.gripper_nu // 2,), dtype=np.float32
        )
        self.observation_space["language.language_instruction"] = spaces.Text(
            max_length=256, charset=ALLOWED_LANGUAGE_CHARSET
        )
        # Add camera observation spaces
        for camera_name, w, h in zip(self.camera_names, self.camera_widths, self.camera_heights):
            k = self.camera_key_mapper.get_camera_config(camera_name)[0]
            self.observation_space[f"{k}_image"] = spaces.Box(
                low=0, high=255, shape=(h, w, 3), dtype=np.uint8
            )
        # Add extra privileged observation spaces
        if hasattr(self.env, "get_privileged_obs_keys"):
            for key, shape in self.env.get_privileged_obs_keys().items():
                self.observation_space[key] = spaces.Box(
                    low=-np.inf, high=np.inf, shape=shape, dtype=np.float32
                )
        # Add robot-specific observation spaces
        if hasattr(self.env.robots[0].robot_model, "torso_body"):
            self.observation_space["secondary_imu_quat"] = spaces.Box(
                low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32
            )
            self.observation_space["secondary_imu_vel"] = spaces.Box(
                low=-np.inf, high=np.inf, shape=(6,), dtype=np.float32
            )

    def get_action_space(self):
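        """One flat Box action ``q`` covering all body and gripper DOFs."""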
        self.action_space = spaces.Dict(
            {"q": spaces.Box(low=-np.inf, high=np.inf, shape=(self.total_dofs,), dtype=np.float32)}
        )

    def reset(self, seed=None, options=None):
        raw_obs, info = super().reset(seed=seed, options=options)
        obs = self.get_gr00t_observation(raw_obs)
        lang = self.env.get_ep_meta().get("lang", "")
        ROBOSUITE_DEFAULT_LOGGER.info(f"Instruction: {lang}")
        return obs, info

    def step(
        self, action: Dict[str, Any]
    ) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
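        """Convert a GR00T action dict (``{"q": ..., "tau": ...}``) into per-robot
        RoboCasa actions, step the wrapped env, and return GR00T-format observations."""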
        # action = {"q": xxx, "tau": xxx}
        if self.verbose:
            for k, v in action.items():
                print("<ACTION>", k, v)
        joint_action_vec = action["q"]
        action_dict = {}
        for ii, robot in enumerate(self.env.robots):
            pf = robot.robot_model.naming_prefix
            _action_dict = self.obs_action_converter[ii].gr00t_to_robocasa_action_dict(
                joint_action_vec
            )
            action_dict.update({f"{pf}{k}": v for k, v in _action_dict.items()})
            if action.get("tau", None) is not None:
                _torque_dict = self.obs_action_converter[ii].gr00t_to_robocasa_action_dict(
                    action["tau"]
                )
                action_dict.update({f"{pf}{k}_tau": v for k, v in _torque_dict.items()})
        if self.overridden_floating_base_action is not None:
            action_dict["robot0_base"] = self.overridden_floating_base_action
        raw_obs, reward, terminated, truncated, info = super().step(action_dict)
        obs = self.get_gr00t_observation(raw_obs)
        if self.verbose:
            for k, v in obs.items():
                print("<OBS>", k, v.shape if k.startswith("video.") else v)
            self.verbose = False
        return obs, reward, terminated, truncated, info

    def step_only_kinematics(
        self, action: Dict[str, Any]
    ) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
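        """Kinematics-only step: write the commanded joint positions straight into
        ``qpos`` and run ``mj_forward`` (no dynamics, no controller, no reward)."""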
        joint_action_vec = action["q"]
        for ii, robot in enumerate(self.env.robots):
            joint_names = np.array(self.env.sim.model.joint_names)[robot._ref_joint_indexes]
            body_q = self.obs_action_converter[ii].gr00t_to_robocasa_joint_order(
                joint_names, joint_action_vec
            )
            self.env.sim.data.qpos[robot._ref_joint_pos_indexes] = body_q
            for side in ["left", "right"]:
                joint_names = np.array(self.env.sim.model.joint_names)[
                    robot._ref_joints_indexes_dict[side + "_gripper"]
                ]
                gripper_q = self.obs_action_converter[ii].gr00t_to_robocasa_joint_order(
                    joint_names, joint_action_vec
                )
                self.env.sim.data.qpos[robot._ref_gripper_joint_pos_indexes[side]] = gripper_q
        mujoco.mj_forward(self.env.sim.model._model, self.env.sim.data._data)
        obs = self.force_update_observation()
        return obs, 0, False, False, {"success": False}

    def force_update_observation(self, timestep=0):
        raw_obs = self.env._get_observations(force_update=True, timestep=timestep)
        obs = self.get_basic_observation(raw_obs)
        obs = self.get_gr00t_observation(obs)
        return obs

    def get_basic_observation(self, raw_obs):
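        """Normalize raw robosuite observations: flip images, cast arrays to float32,
        blank out images when rendering is disabled, and attach the language string."""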
        # gather_robot_observations takes a lot of time, so it is disabled for now
        # raw_obs.update(gather_robot_observations(self.env, format_gripper_space=False))

        # Images are in (H, W, C); flip them upside down
        def process_img(img):
            return np.copy(img[::-1, :, :])

        for obs_name, obs_value in raw_obs.items():
            if obs_name.endswith("_image"):
                # image observations
                raw_obs[obs_name] = process_img(obs_value)
            else:
                # non-image observations
                raw_obs[obs_name] = obs_value.astype(np.float32)
        # Return black images if rendering is disabled
        if not self.enable_render:
            for ii, name in enumerate(self.camera_names):
                raw_obs[f"{name}_image"] = np.zeros(
                    (self.camera_heights[ii], self.camera_widths[ii], 3), dtype=np.uint8
                )
        self.render_cache = raw_obs[self.render_obs_key]
        raw_obs["language"] = self.env.get_ep_meta().get("lang", "")
        return raw_obs

    def convert_body_q(self, q: np.ndarray) -> np.ndarray:
        # q is in the order of the joints
        robot = self.env.robots[0]
        joint_names = np.array(self.env.sim.model.joint_names)[robot._ref_joint_indexes]
        # these joint names are in the order of the obs_vec
        actuated_q = self.obs_action_converter[0].robocasa_to_gr00t_actuated_order(
            joint_names, q, "body"
        )
        return actuated_q

    def convert_gripper_q(self, q: np.ndarray, side: str = "left") -> np.ndarray:
        # q is in the order of the joints
        robot = self.env.robots[0]
        joint_names = np.array(self.env.sim.model.joint_names)[
            robot._ref_joints_indexes_dict[side + "_gripper"]
        ]
        actuated_q = self.obs_action_converter[0].robocasa_to_gr00t_actuated_order(
            joint_names, q, side + "_gripper"
        )
        return actuated_q

    def convert_gripper_tau(self, tau: np.ndarray, side: str = "left") -> np.ndarray:
        # tau is in the order of the actuators
        robot = self.env.robots[0]
        actuator_idx = robot._ref_actuators_indexes_dict[side + "_gripper"]
        actuated_joint_names = [
            self.env.sim.model.joint_id2name(self.env.sim.model.actuator_trnid[i][0])
            for i in actuator_idx
        ]
        actuated_tau = self.obs_action_converter[0].robocasa_to_gr00t_actuated_order(
            actuated_joint_names, tau, side + "_gripper"
        )
        return actuated_tau

    def get_gr00t_observation(self, raw_obs: Dict[str, Any]) -> Dict[str, Any]:
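        """Assemble the GR00T observation dict (floating base, joints, hands, torques,
        time, cameras, privileged keys, language) from a raw RoboCasa observation."""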
        obs = {}
        if self.env.sim.model.jnt_type[0] == mujoco.mjtJoint.mjJNT_FREE:
            # The first joint is a free joint: read the floating base state directly
            obs["floating_base_pose"] = self.env.sim.data.qpos[:7]
            obs["floating_base_vel"] = self.env.sim.data.qvel[:6]
            obs["floating_base_acc"] = self.env.sim.data.qacc[:6]
        else:
            # Otherwise, fetch the floating base pose from the base body state
            root_body_id = self.env.sim.model.body_name2id("robot0_base")
            root_pos = self.env.sim.data.body_xpos[root_body_id]
            root_quat = self.env.sim.data.body_xquat[root_body_id]  # quaternion in wxyz format
            # Combine position and quaternion to form the 7-DOF pose
            obs["floating_base_pose"] = np.concatenate([root_pos, root_quat])
            # Velocity and acceleration are unavailable here; report zeros
            obs["floating_base_vel"] = np.zeros(6)
            obs["floating_base_acc"] = np.zeros(6)
        obs["body_q"] = self.convert_body_q(raw_obs["robot0_joint_pos"])
        obs["body_dq"] = self.convert_body_q(raw_obs["robot0_joint_vel"])
        obs["body_ddq"] = self.convert_body_q(raw_obs["robot0_joint_acc"])
        obs["left_hand_q"] = self.convert_gripper_q(raw_obs["robot0_left_gripper_qpos"], "left")
        obs["left_hand_dq"] = self.convert_gripper_q(raw_obs["robot0_left_gripper_qvel"], "left")
        obs["left_hand_ddq"] = self.convert_gripper_q(raw_obs["robot0_left_gripper_qacc"], "left")
        obs["right_hand_q"] = self.convert_gripper_q(raw_obs["robot0_right_gripper_qpos"], "right")
        obs["right_hand_dq"] = self.convert_gripper_q(raw_obs["robot0_right_gripper_qvel"], "right")
        obs["right_hand_ddq"] = self.convert_gripper_q(
            raw_obs["robot0_right_gripper_qacc"], "right"
        )
        # Split actuator torques into body / left gripper / right gripper groups
        robot = self.env.robots[0]
        body_tau_idx_list = []
        left_gripper_tau_idx_list = []
        right_gripper_tau_idx_list = []
        for part_name, actuator_idx in robot._ref_actuators_indexes_dict.items():
            if "left_gripper" in part_name:
                left_gripper_tau_idx_list.extend(actuator_idx)
            elif "right_gripper" in part_name:
                right_gripper_tau_idx_list.extend(actuator_idx)
            elif "base" in part_name:
                assert (
                    len(actuator_idx) == 0 or robot.robot_model.default_base == "FloatingLeggedBase"
                )
            else:
                body_tau_idx_list.extend(actuator_idx)
        body_tau_idx_list = sorted(body_tau_idx_list)
        left_gripper_tau_idx_list = sorted(left_gripper_tau_idx_list)
        right_gripper_tau_idx_list = sorted(right_gripper_tau_idx_list)
        obs["body_tau_est"] = self.convert_body_q(
            self.env.sim.data.actuator_force[body_tau_idx_list]
        )
        obs["right_hand_tau_est"] = self.convert_gripper_tau(
            self.env.sim.data.actuator_force[right_gripper_tau_idx_list], "right"
        )
        obs["left_hand_tau_est"] = self.convert_gripper_tau(
            self.env.sim.data.actuator_force[left_gripper_tau_idx_list], "left"
        )
        obs["time"] = self.env.sim.data.time
        # Add camera images
        for ii, camera_name in enumerate(self.camera_names):
            mapped_camera_name = self.camera_key_mapper.get_camera_config(camera_name)[0]
            obs[f"{mapped_camera_name}_image"] = raw_obs[f"{camera_name}_image"]
        # Add privileged observations
        if hasattr(self.env, "get_privileged_obs_keys"):
            for key in self.env.get_privileged_obs_keys():
                obs[key] = raw_obs[key]
        # Add robot-specific observations
        if hasattr(self.env.robots[0].robot_model, "torso_body"):
            obs["secondary_imu_quat"] = raw_obs["robot0_torso_link_imu_quat"]
            obs["secondary_imu_vel"] = raw_obs["robot0_torso_link_imu_vel"]
        obs["language.language_instruction"] = raw_obs["language"]
        return obs
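

# Minimal usage sketch (illustrative only). The env name, robot string, RobotModel
# construction, and the [body, left hand, right hand] layout of the action vector
# are assumptions, not values defined in this file; check Gr00tObsActionConverter
# and your robot model for the actual ordering and constructor arguments.
if __name__ == "__main__":
    robot_model = RobotModel()  # hypothetical: build your GR00T robot model here
    env = Gr00tLocomanipRoboCasaEnv(
        env_name="PnPCounterToCab",  # any RoboCasa task name
        robots_name="GR1FloatingBody",  # hypothetical identifier containing "GR1"
        robot_model=robot_model,
        input_space="JOINT_SPACE",
        offscreen=True,
    )
    obs, info = env.reset()
    # Hold the current pose: feed the observed joint positions back as the action,
    # assuming the converter lays out q as [body, left hand, right hand]
    action = {"q": np.concatenate([obs["body_q"], obs["left_hand_q"], obs["right_hand_q"]])}
    obs, reward, terminated, truncated, info = env.step(action)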