From b229e7a7c80b63d139acafc0c3f78b421a5ad060 Mon Sep 17 00:00:00 2001 From: JulioJerez Date: Wed, 18 Sep 2024 12:19:50 -0700 Subject: [PATCH] it seems polynomial reward functions are better than gaussian --- .../demos/ndAdvancedIndustrialRobot.cpp | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/newton-4.00/applications/ndSandbox/demos/ndAdvancedIndustrialRobot.cpp b/newton-4.00/applications/ndSandbox/demos/ndAdvancedIndustrialRobot.cpp index e49db4d96..cbdbf06f9 100644 --- a/newton-4.00/applications/ndSandbox/demos/ndAdvancedIndustrialRobot.cpp +++ b/newton-4.00/applications/ndSandbox/demos/ndAdvancedIndustrialRobot.cpp @@ -539,18 +539,24 @@ namespace ndAdvancedRobot return ND_DEAD_PENALTY; } - ndFloat32 invRewardSigma2 = 500.0f; - ndFloat32 rewardWeigh = 1.0f / 6.0f; - const ndMatrix effectorMatrix(m_effectorLocalTarget * m_arm_4->GetBody0()->GetMatrix()); const ndMatrix baseMatrix(m_effectorLocalBase * m_base_rotator->GetBody1()->GetMatrix()); const ndMatrix currentEffectorMatrix(effectorMatrix * baseMatrix.OrthoInverse()); const ndVector positError(CalculateDeltaTargetPosit(currentEffectorMatrix)); const ndVector positError2 = positError * positError; + auto ScalarReward = [](ndFloat32 param2) + { + ndFloat32 rewardWeigh = 1.0f / 6.0f; + //ndFloat32 invRewardSigma2 = 500.0f; + //return rewardWeigh* ndExp(-invRewardSigma2 * positError2.m_z); + return rewardWeigh * ndClamp(ndFloat32(1.0f - param2), ndFloat32(0.0f), ndFloat32(1.0f)); + }; + //ndFloat32 azimuthReward = rewardWeigh * ndExp(-invRewardSigma2 * positError2.m_z); //ndFloat32 azimuthReward = rewardWeigh * ndExp(-50.0f * positError2.m_z); - ndFloat32 azimuthReward = ndClamp (ndFloat32 (1.0f - positError2.m_z), 0.0f, 1.0f); + //ndFloat32 azimuthReward = ndClamp (ndFloat32 (1.0f - positError2.m_z), 0.0f, 1.0f); + ndFloat32 azimuthReward = ScalarReward(positError2.m_z); ndFloat32 reward = azimuthReward; if (azimuthReward > 0.5f) @@ -558,14 +564,14 @@ namespace 
ndAdvancedRobot const ndVector rotationError(CalculateDeltaTargetRotation(currentEffectorMatrix)); const ndVector rotationError2 = rotationError * rotationError; - ndFloat32 omega_xReward = rewardWeigh * ndExp(-invRewardSigma2 * rotationError2.m_x); - ndFloat32 omega_yReward = rewardWeigh * ndExp(-invRewardSigma2 * rotationError2.m_y); - ndFloat32 omega_zReward = rewardWeigh * ndExp(-invRewardSigma2 * rotationError2.m_z); + ndFloat32 omega_xReward = ScalarReward(rotationError2.m_x); + ndFloat32 omega_yReward = ScalarReward(rotationError2.m_y); + ndFloat32 omega_zReward = ScalarReward(rotationError2.m_z); reward += (omega_xReward + omega_yReward + omega_zReward); if ((omega_xReward > 1.0e-3f) || (omega_yReward > 1.0e-3f) || (omega_zReward > 1.0e-3f)) { - ndFloat32 posit_xReward = rewardWeigh * ndExp(-invRewardSigma2 * positError2.m_x); - ndFloat32 posit_yReward = rewardWeigh * ndExp(-invRewardSigma2 * positError2.m_y); + ndFloat32 posit_xReward = ScalarReward(positError2.m_x); + ndFloat32 posit_yReward = ScalarReward(positError2.m_y); reward += (posit_xReward + posit_yReward); } }