Refactor mujoco envs to support dynamic arguments #1304

Merged
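The practical upshot of this refactor is that each new -v3 environment exposes its reward, termination, and reset parameters through its constructor, so they can be set per experiment instead of being hard-coded. A minimal sketch of the intended usage, assuming gym.make forwards keyword arguments to the registered entry point (the values below are illustrative):

import gym

# Hypothetical overrides of the new constructor arguments.
cheetah = gym.make('HalfCheetah-v3', ctrl_cost_weight=0.05)
ant = gym.make('Ant-v3', healthy_z_range=(0.3, 1.0), terminate_when_unhealthy=False)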
41 changes: 40 additions & 1 deletion gym/envs/__init__.py
@@ -252,39 +252,79 @@
    reward_threshold=4800.0,
)

register(
    id='HalfCheetah-v3',
    entry_point='gym.envs.mujoco.half_cheetah_v3:HalfCheetahEnv',
    max_episode_steps=1000,
    reward_threshold=4800.0,
)

register(
    id='Hopper-v2',
    entry_point='gym.envs.mujoco:HopperEnv',
    max_episode_steps=1000,
    reward_threshold=3800.0,
)

register(
    id='Hopper-v3',
    entry_point='gym.envs.mujoco.hopper_v3:HopperEnv',
    max_episode_steps=1000,
    reward_threshold=3800.0,
)

register(
    id='Swimmer-v2',
    entry_point='gym.envs.mujoco:SwimmerEnv',
    max_episode_steps=1000,
    reward_threshold=360.0,
)

register(
    id='Swimmer-v3',
    entry_point='gym.envs.mujoco.swimmer_v3:SwimmerEnv',
    max_episode_steps=1000,
    reward_threshold=360.0,
)

register(
    id='Walker2d-v2',
    max_episode_steps=1000,
    entry_point='gym.envs.mujoco:Walker2dEnv',
)

register(
    id='Walker2d-v3',
    max_episode_steps=1000,
    entry_point='gym.envs.mujoco.walker2d_v3:Walker2dEnv',
)

register(
    id='Ant-v2',
    entry_point='gym.envs.mujoco:AntEnv',
    max_episode_steps=1000,
    reward_threshold=6000.0,
)

register(
    id='Ant-v3',
    entry_point='gym.envs.mujoco.ant_v3:AntEnv',
    max_episode_steps=1000,
    reward_threshold=6000.0,
)

register(
    id='Humanoid-v2',
    entry_point='gym.envs.mujoco:HumanoidEnv',
    max_episode_steps=1000,
)

register(
    id='Humanoid-v3',
    entry_point='gym.envs.mujoco.humanoid_v3:HumanoidEnv',
    max_episode_steps=1000,
)

register(
    id='HumanoidStandup-v2',
    entry_point='gym.envs.mujoco:HumanoidStandupEnv',
@@ -551,4 +591,3 @@ def _merge(a, b):
    entry_point='gym.envs.unittest:MemorizeDigits',
    reward_threshold=20,
)
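If a variant should always use non-default parameters, it can also be registered under its own ID with preset keyword arguments. A sketch under the assumption that register() accepts a kwargs dict that is passed through to the entry point (the ID and values below are made up for illustration):

from gym.envs.registration import register

# Hypothetical example: bake a non-default parameter into a new ID.
register(
    id='HalfCheetahLowCtrlCost-v3',
    entry_point='gym.envs.mujoco.half_cheetah_v3:HalfCheetahEnv',
    max_episode_steps=1000,
    kwargs={'ctrl_cost_weight': 0.05},
)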

146 changes: 146 additions & 0 deletions gym/envs/mujoco/ant_v3.py
@@ -0,0 +1,146 @@
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env


DEFAULT_CAMERA_CONFIG = {
    'distance': 4.0,
}


class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    def __init__(self,
                 xml_file='ant.xml',
                 ctrl_cost_weight=0.5,
                 contact_cost_weight=5e-4,
                 healthy_reward=1.0,
                 terminate_when_unhealthy=True,
                 healthy_z_range=(0.2, 1.0),
                 contact_force_range=(-1.0, 1.0),
                 reset_noise_scale=0.1,
                 exclude_current_positions_from_observation=True):
        utils.EzPickle.__init__(**locals())

        self._ctrl_cost_weight = ctrl_cost_weight
        self._contact_cost_weight = contact_cost_weight

        self._healthy_reward = healthy_reward
        self._terminate_when_unhealthy = terminate_when_unhealthy
        self._healthy_z_range = healthy_z_range

        self._contact_force_range = contact_force_range

        self._reset_noise_scale = reset_noise_scale

        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation)

        mujoco_env.MujocoEnv.__init__(self, xml_file, 5)

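    # Survival bonus: with terminate_when_unhealthy=True the bonus is paid on
    # every step (the episode simply ends once the ant becomes unhealthy);
    # with terminate_when_unhealthy=False it is paid only while is_healthy.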
    @property
    def healthy_reward(self):
        return float(
            self.is_healthy
            or self._terminate_when_unhealthy
        ) * self._healthy_reward

    def control_cost(self, action):
        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
        return control_cost

    @property
    def contact_forces(self):
        raw_contact_forces = self.sim.data.cfrc_ext
        min_value, max_value = self._contact_force_range
        contact_forces = np.clip(raw_contact_forces, min_value, max_value)
        return contact_forces

    @property
    def contact_cost(self):
        contact_cost = self._contact_cost_weight * np.sum(
            np.square(self.contact_forces))
        return contact_cost

    @property
    def is_healthy(self):
        state = self.state_vector()
        min_z, max_z = self._healthy_z_range
        is_healthy = (np.isfinite(state).all() and min_z <= state[2] <= max_z)
        return is_healthy

    @property
    def done(self):
        done = (not self.is_healthy
                if self._terminate_when_unhealthy
                else False)
        return done

    def step(self, action):
        xy_position_before = self.get_body_com("torso")[:2].copy()
        self.do_simulation(action, self.frame_skip)
        xy_position_after = self.get_body_com("torso")[:2].copy()

        xy_velocity = (xy_position_after - xy_position_before) / self.dt
        x_velocity, y_velocity = xy_velocity

        ctrl_cost = self.control_cost(action)
        contact_cost = self.contact_cost

        forward_reward = x_velocity
        healthy_reward = self.healthy_reward

        rewards = forward_reward + healthy_reward
        costs = ctrl_cost + contact_cost

        reward = rewards - costs
        done = self.done
        observation = self._get_obs()
        info = {
            'reward_forward': forward_reward,
            'reward_ctrl': -ctrl_cost,
            'reward_contact': -contact_cost,
            'reward_survive': healthy_reward,

            'x_position': xy_position_after[0],
            'y_position': xy_position_after[1],
            'distance_from_origin': np.linalg.norm(xy_position_after, ord=2),

            'x_velocity': x_velocity,
            'y_velocity': y_velocity,
            'forward_reward': forward_reward,
        }

        return observation, reward, done, info

    def _get_obs(self):
        position = self.sim.data.qpos.flat.copy()
        velocity = self.sim.data.qvel.flat.copy()
        contact_force = self.contact_forces.flat.copy()

        if self._exclude_current_positions_from_observation:
            position = position[2:]

        observations = np.concatenate((position, velocity, contact_force))

        return observations

    def reset_model(self):
        noise_low = -self._reset_noise_scale
        noise_high = self._reset_noise_scale

        qpos = self.init_qpos + self.np_random.uniform(
            low=noise_low, high=noise_high, size=self.model.nq)
        qvel = self.init_qvel + self._reset_noise_scale * self.np_random.randn(
            self.model.nv)
        self.set_state(qpos, qvel)

        observation = self._get_obs()

        return observation

    def viewer_setup(self):
        for key, value in DEFAULT_CAMERA_CONFIG.items():
            if isinstance(value, np.ndarray):
                getattr(self.viewer.cam, key)[:] = value
            else:
                setattr(self.viewer.cam, key, value)
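A short usage sketch for the new constructor arguments, building the environment class directly rather than through the registry (assumes a working MuJoCo / mujoco-py installation; the parameter values are illustrative):

from gym.envs.mujoco.ant_v3 import AntEnv

# Hypothetical settings: cheaper control, no contact penalty, no early termination.
env = AntEnv(ctrl_cost_weight=0.25, contact_cost_weight=0.0,
             terminate_when_unhealthy=False)
observation = env.reset()
observation, reward, done, info = env.step(env.action_space.sample())
# step() exposes the reward decomposition through the info dict.
print(info['reward_forward'], info['reward_ctrl'], info['reward_survive'])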
88 changes: 88 additions & 0 deletions gym/envs/mujoco/half_cheetah_v3.py
@@ -0,0 +1,88 @@
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env


DEFAULT_CAMERA_CONFIG = {
    'distance': 4.0,
}


class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    def __init__(self,
                 xml_file='half_cheetah.xml',
                 forward_reward_weight=1.0,
                 ctrl_cost_weight=0.1,
                 reset_noise_scale=0.1,
                 exclude_current_positions_from_observation=True):
        utils.EzPickle.__init__(**locals())

        self._forward_reward_weight = forward_reward_weight

        self._ctrl_cost_weight = ctrl_cost_weight

        self._reset_noise_scale = reset_noise_scale

        self._exclude_current_positions_from_observation = (
            exclude_current_positions_from_observation)

        mujoco_env.MujocoEnv.__init__(self, xml_file, 5)

    def control_cost(self, action):
        control_cost = self._ctrl_cost_weight * np.sum(np.square(action))
        return control_cost

    def step(self, action):
        x_position_before = self.sim.data.qpos[0]
        self.do_simulation(action, self.frame_skip)
        x_position_after = self.sim.data.qpos[0]
        x_velocity = ((x_position_after - x_position_before)
                      / self.dt)

        ctrl_cost = self.control_cost(action)

        forward_reward = self._forward_reward_weight * x_velocity

        observation = self._get_obs()
        reward = forward_reward - ctrl_cost
        done = False
        info = {
            'x_position': x_position_after,
            'x_velocity': x_velocity,

            'reward_run': forward_reward,
            'reward_ctrl': -ctrl_cost
        }

        return observation, reward, done, info

    def _get_obs(self):
        position = self.sim.data.qpos.flat.copy()
        velocity = self.sim.data.qvel.flat.copy()

        if self._exclude_current_positions_from_observation:
            position = position[1:]

        observation = np.concatenate((position, velocity)).ravel()
        return observation

    def reset_model(self):
        noise_low = -self._reset_noise_scale
        noise_high = self._reset_noise_scale

        qpos = self.init_qpos + self.np_random.uniform(
            low=noise_low, high=noise_high, size=self.model.nq)
        qvel = self.init_qvel + self._reset_noise_scale * self.np_random.randn(
            self.model.nv)

        self.set_state(qpos, qvel)

        observation = self._get_obs()
        return observation

    def viewer_setup(self):
        for key, value in DEFAULT_CAMERA_CONFIG.items():
            if isinstance(value, np.ndarray):
                getattr(self.viewer.cam, key)[:] = value
            else:
                setattr(self.viewer.cam, key, value)
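As with the ant, the half-cheetah parameters can be overridden at creation time; a sketch assuming gym.make passes keyword arguments through to this constructor (values are illustrative). With exclude_current_positions_from_observation=False the root x position is kept, so the observation gains one extra dimension:

import gym

env = gym.make('HalfCheetah-v3',
               forward_reward_weight=2.0,
               exclude_current_positions_from_observation=False)
observation = env.reset()
observation, reward, done, info = env.step(env.action_space.sample())
# info carries the per-term reward breakdown computed in step().
print(observation.shape, info['reward_run'], info['reward_ctrl'])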