From dd6554fed3022bc226a434c1dade39e064045a9d Mon Sep 17 00:00:00 2001
From: "Xuanlin (Simon) Li" <35481101+xuanlinli17@users.noreply.github.com>
Date: Thu, 22 Sep 2022 14:57:06 -0700
Subject: [PATCH] Simplify pick single reward to be more friendly to RL (#27)

* For PickSingle, replace the original complex reward with a simpler reward,
  since the original reward can be unfriendly for RL, even though MPC can
  solve many objects with the original reward. The old reward is kept as
  compute_dense_reward_legacy.
---
 .../envs/pick_and_place/pick_single.py | 41 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/mani_skill2/envs/pick_and_place/pick_single.py b/mani_skill2/envs/pick_and_place/pick_single.py
index 17ac063e6..42872127e 100644
--- a/mani_skill2/envs/pick_and_place/pick_single.py
+++ b/mani_skill2/envs/pick_and_place/pick_single.py
@@ -218,6 +218,45 @@ def evaluate(self, **kwargs):
         )
 
     def compute_dense_reward(self, info, **kwargs):
+
+        # Sep. 14, 2022:
+        # We changed the original complex reward to a simpler reward,
+        # since the original reward can be unfriendly for RL,
+        # even though MPC can solve many objects with the original reward.
+
+        reward = 0.0
+
+        if info["success"]:
+            reward = 10.0
+        else:
+            obj_pose = self.obj_pose
+
+            # reaching reward
+            tcp_wrt_obj_pose = obj_pose.inv() * self.tcp.pose
+            tcp_to_obj_dist = np.linalg.norm(tcp_wrt_obj_pose.p)
+            reaching_reward = 1 - np.tanh(
+                3.0 * np.maximum(tcp_to_obj_dist - np.linalg.norm(self.model_bbox_size), 0.0)
+            )
+            reward = reward + reaching_reward
+
+            # grasp reward
+            is_grasped = self.agent.check_grasp(self.obj, max_angle=30)
+            reward += 3.0 if is_grasped else 0.0
+
+            # reaching-goal reward
+            if is_grasped:
+                obj_to_goal_pos = self.goal_pos - obj_pose.p
+                obj_to_goal_dist = np.linalg.norm(obj_to_goal_pos)
+                reaching_goal_reward = 3 * (1 - np.tanh(3.0 * obj_to_goal_dist))
+                reward += reaching_goal_reward
+
+        return reward
+
+    def compute_dense_reward_legacy(self, info, **kwargs):
+        # original complex reward that is geometry-independent,
+        # which ensures that MPC can successfully pick up most objects,
+        # but can be unfriendly for RL.
+
         reward = 0.0
         # hard code gripper info
         finger_length = 0.025
@@ -324,7 +363,7 @@ def compute_dense_reward(self, info, **kwargs):
             gripper_width / 2 - self.agent.robot.get_qpos()[-2:]
         )  # ensures that gripper is open
 
-        return reward
+        return reward
 
     def render(self, mode="human"):
        if mode in ["human", "rgb_array"]:
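
For reference, below is a minimal self-contained sketch of the staged reward introduced by this patch, using plain numpy. The function name and its scalar inputs are hypothetical stand-ins for the environment state that compute_dense_reward reads from self.tcp.pose, self.obj_pose, self.goal_pos, self.model_bbox_size, and info["success"]; it is an illustration of the reward shaping rather than code from the patch.

import numpy as np

def simplified_pick_reward(
    tcp_to_obj_dist: float,   # distance from the gripper TCP to the object
    obj_bbox_norm: float,     # norm of the object's bounding-box size
    is_grasped: bool,         # result of a grasp check (e.g. agent.check_grasp)
    obj_to_goal_dist: float,  # distance from the object to the goal position
    success: bool,            # True once the task is evaluated as solved
) -> float:
    # Staged shaping: success >> grasp + reach-goal >> reach.
    if success:
        return 10.0
    reward = 0.0
    # Reaching reward saturates via tanh; distances already inside the
    # object's bounding-box extent receive the full bonus of 1.
    reward += 1.0 - np.tanh(3.0 * max(tcp_to_obj_dist - obj_bbox_norm, 0.0))
    if is_grasped:
        reward += 3.0  # flat grasp bonus
        # Reach-goal bonus of up to 3 as the grasped object nears the goal.
        reward += 3.0 * (1.0 - np.tanh(3.0 * obj_to_goal_dist))
    return reward

# Example: object grasped and 5 cm from the goal -> about 1.0 + 3.0 + 2.55 = 6.55.
print(simplified_pick_reward(0.0, 0.1, True, 0.05, False))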