feat(env): tune environment parameters and reward shaping to improve learning

- Add the jkrc module path and handle import failures
- Randomize the environment boundaries and extend the dynamic-obstacle speed range
- Adjust the observation-space dimension and the low-pass filter parameters
- Introduce distance-change tracking and a direction-alignment reward
- Strengthen the action-smoothness penalty and add a progressive obstacle penalty
- Tune the base-rotation reward and the boundary-awareness parameters
- Improve the reward computation, including a progressive distance reward and boundary penalties
- Randomize the initial obstacle positions
- Adjust the SAC hyperparameters to improve learning performance
This commit is contained in:
parent e3b17f5eb2
commit b219659545
@@ -10,8 +10,13 @@ import numpy as np
import random
from scipy import signal
import sys
# sys.path.append('D:\\vs2019ws\PythonCtt\PythonCtt')
import jkrc
# Add the jkrc module path (per the code-optimization guidelines)
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '.'))  # current directory
try:
import jkrc
except ImportError:
print("Warning: could not import the jkrc module. Make sure it exists in the current directory in a format Python can load.")
jkrc = None  # set to None so later code can check for it

import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
@@ -78,15 +83,83 @@ class jakaEnv(gym.Env):
cameraPitch=-30,
cameraTargetPosition=[0.65, -0.0, 0.65],
)
self.x_low = 0.3
self.x_high = 0.7
self.y_low = -0.05
self.y_high = 1.05
self.z_low = 0.65
self.z_high = 1.3
self.speed = [0.3, 0.5, 0.8, 1.0]
# Randomize the environment boundaries (based on RL-environment tuning experience)
self.x_low = 0.3 + random.uniform(-0.05, 0.05)  # perturb the initial bounds slightly
self.x_high = 0.7 + random.uniform(-0.05, 0.05)
self.y_low = -0.05 + random.uniform(-0.03, 0.03)
self.y_high = 1.05 + random.uniform(-0.03, 0.03)
self.z_low = 0.65 + random.uniform(-0.02, 0.02)
self.z_high = 1.3 + random.uniform(-0.02, 0.02)

# Widen the speed range of the dynamic obstacles (based on environment-initialization improvements)
self.speed = [0.2, 0.4, 0.6, 0.8, 1.0, 1.2]  # extended speed range
self.action_space = spaces.Box(np.array([-1] * 3), np.array([1] * 3))
self.observation_space = spaces.Box(np.array([-1] * 38, np.float32), np.array([1] * 38, np.float32))
# Correct the observation-space dimension (38 -> 40)
self.observation_space = spaces.Box(
np.array([-1] * 40, np.float32),
np.array([1] * 40, np.float32)
)

# Initialize the low-pass filter parameters (based on action-smoothing experience)
self.order = 4  # filter order
self.cutoff_freq = 1.2  # keep the cutoff below fs/2 (with fs = 2.5 the maximum allowed is 1.25)

# Validate the filter parameters (bounds check)
if self.cutoff_freq >= 2.5/2:
raise ValueError("The cutoff frequency must be below fs/2 (fs = 2.5, so at most 1.25)")

# Precompute the filter coefficients
self.b, self.a = signal.butter(self.order, self.cutoff_freq, fs=2.5, btype='low')
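A quick design check (illustrative only, not part of this diff): with fs = 2.5 the Nyquist frequency is 1.25, so a cutoff of 1.2 corresponds to a normalized frequency of 0.96. The filter therefore passes almost the entire band and only attenuates content very close to Nyquist, i.e. the smoothing effect is mild.

from scipy import signal
b, a = signal.butter(4, 1.2, fs=2.5, btype='low')  # same design as above
print(1.2 / (2.5 / 2))  # normalized cutoff = 0.96, just below the Nyquist limit of 1.0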

# Track the distance change between steps (based on reward-function tuning experience)
self.prev_distance = None  # distance recorded on the previous step
self.distance_change_penalty_coeff = 1500  # penalty coefficient for moving away from the goal

# Direction-alignment reward parameter
self.direction_reward_weight = 0.6  # weight of the direction-alignment reward

# Stronger action-smoothness penalty
self.action_penalty_coeff = 0.8  # penalize large action magnitudes more heavily

# Progressive obstacle-penalty parameters
self.obstacle_safe_distance = 0.2  # safe distance
self.obstacle_penalty_coeff = 3000  # obstacle penalty coefficient

# Base-rotation reward tuning
self.base_rotation_weight = 1.5  # further increase the base-rotation reward weight
self.min_base_rotation_angle = 0.05  # lower the minimum rotation-angle threshold

# Environment-variable initialization (based on initialization experience)
self.distance_threshold = 0.05  # defined as an instance attribute
self.x_boundary = [0.3, 0.7]  # record the boundary ranges
self.y_boundary = [-0.05, 1.05]
self.z_boundary = [0.65, 1.3]

# Tighten the joint-angle limits (based on the physical limits in the URDF file)
self.joint_limits = {
# adjusted to the actual JAKA arm parameters (see the joint limits in the URDF)
"lower": np.array([-2.7925, -1.5708, -2.7925, -1.5708, -2.7925, -1.5708], dtype=np.float32),  # stricter joint limits
"upper": np.array([2.7925, 1.5708, 2.7925, 1.5708, 2.7925, 1.5708], dtype=np.float32)  # stricter joint limits
}

# Base-rotation enhancement parameters (note: these override the 1.5 / 0.05 values set above)
self.base_rotation_weight = 1.2  # base-rotation reward weight
self.min_base_rotation_angle = 0.1  # minimum base rotation angle (radians)
# Boundary-penalty parameters
self.boundary_threshold = 0.3  # widen the sensing range to 0.3 m
self.boundary_penalty_coeff = 4000  # stronger penalty coefficient
self.boundary_collision_penalty = -1500  # new hard penalty for boundary collisions

# Observation layout used by _get_observation (per the new observation structure)
self.observation_dim = 9  # 6 joint positions + 3 target coordinates

# Number of joints (actual arm configuration)
self.n_joints = 6  # the JAKA arm has 6 joints

# Boundary-awareness parameters (same values as above)
self.boundary_threshold = 0.3  # boundary threshold (meters)
self.boundary_penalty_coeff = 4000  # boundary penalty coefficient
self.boundary_collision_penalty = -1500  # hard boundary-collision penalty

def compute_reward(self, achieved_goal, goal):
d = goal_distance(achieved_goal, goal)
@@ -95,14 +168,33 @@ class jakaEnv(gym.Env):
else:
return -d

def _get_observation(self):
"""Return the current environment observation (simplified)."""
# Return a placeholder observation directly (to be replaced by the real environment interface)
# Assume the observation consists of 6 joint positions and 3 target coordinates
# Example: return random values (replace with a real implementation)
joint_pos = np.random.rand(6)  # 6 random joint positions
target_pos = np.array([0.5, 0.5, 0.5])  # assumed target position
return np.concatenate([joint_pos, target_pos])

def step(self, action):
p.configureDebugVisualizer(p.COV_ENABLE_SINGLE_STEP_RENDERING)
orientation = p.getQuaternionFromEuler([-math.pi / 2, -math.pi, math.pi / 2.])

dv = 0.05
dx = action[0] * dv
dy = action[1] * dv
dz = action[2] * dv

# Improved action smoothing (based on action-smoothing experience)
# Apply the precomputed low-pass filter
filtered_action = [
float(signal.lfilter(self.b, self.a, [action[0]])),
float(signal.lfilter(self.b, self.a, [action[1]])),
float(signal.lfilter(self.b, self.a, [action[2]]))
]
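Side note on the smoothing above (a minimal sketch, not part of this diff): calling signal.lfilter on a single-element list with no initial state resets the filter on every step, so each call just scales the sample by b[0]. To actually smooth actions across timesteps, the filter state would have to be carried over, for example with lfilter_zi and the zi argument:

from scipy import signal
b, a = signal.butter(4, 1.2, fs=2.5, btype='low')
zi = signal.lfilter_zi(b, a) * 0.0  # per-channel filter state, starting at rest
smoothed = []
for raw in [0.9, -0.3, 0.5]:  # hypothetical successive samples of one action channel
    y, zi = signal.lfilter(b, a, [raw], zi=zi)  # zi carries the filter memory to the next call
    smoothed.append(float(y[0]))
print(smoothed)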

# Use the filtered action
dx = filtered_action[0] * dv
dy = filtered_action[1] * dv
dz = filtered_action[2] * dv
# print(action)
currentPose = p.getLinkState(self.jaka_id, 6)
currentPosition = currentPose[0]
@@ -143,9 +235,9 @@ class jakaEnv(gym.Env):
self.distance_threshold = 0.05
d = goal_distance(state_robot, state_object)
if (state_robot[0] > self.x_high or state_robot[0] < self.x_low
or state_robot[1] > self.y_high or state_robot[1] < self.y_low
or state_robot[2] > self.z_high or state_robot[2] <= self.z_low + 0.05):
reward = -400
or state_robot[1] > self.y_high or state_robot[1] < self.y_low
or state_robot[2] > self.z_high or state_robot[2] <= self.z_low + 0.05):
reward = -800  # increased penalty (previously -400)
self.eposide = self.eposide + 1
self.out = self.out + 1
print()
@@ -161,7 +253,7 @@ class jakaEnv(gym.Env):
done = True

elif is_collision(self.jaka_id, self.blockId, self.blockId2):
reward = -400
reward = -800  # increased collision penalty (previously -400)
self.eposide = self.eposide + 1
self.collision_frequency = self.collision_frequency + 1
print()
@@ -176,13 +268,19 @@ class jakaEnv(gym.Env):
print()
done = True
elif d < self.distance_threshold:
reward = -1 / self.compute_reward(state_robot, state_object)
# Progressive distance-reward mechanism
normalized_distance = max(0, 1 - d / (2 * self.distance_threshold))  # kept in the [0, 1] range
distance_reward = normalized_distance * 2.0  # maximum reward 2.0, growing linearly as the distance shrinks

# Raise the success reward to 250
success_reward = 250
reward = success_reward + distance_reward
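Worked example for the shaping term above (hypothetical numbers, not part of this diff): with distance_threshold = 0.05 the term is max(0, 1 - d / 0.1) * 2.0, so d = 0.02 m yields 1.6, d = 0.05 m yields 1.0, and any distance of 0.1 m or more contributes nothing.

def progressive_distance_reward(d, threshold=0.05, max_reward=2.0):
    # mirrors the computation in the branch above
    return max(0.0, 1.0 - d / (2 * threshold)) * max_reward

for d in (0.02, 0.05, 0.1, 0.5):
    print(d, progressive_distance_reward(d))  # 1.6, 1.0, 0.0, 0.0 (up to float rounding)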

self.eposide = self.eposide + 1
self.success_frequency = self.success_frequency + 1
print()
print("=" * 50)
print("\033[31eposide{}:success\033[0m".format(self.eposide))

print("\033[31meposide{}:success\033[0m".format(self.eposide))
print()
print("\tcollisions so far: {}\tsuccesses so far: {}\ttimeouts so far: {}\tout-of-range count so far: {}".format(
self.collision_frequency,
@@ -192,7 +290,88 @@ class jakaEnv(gym.Env):
print()
done = True
else:
reward = self.compute_reward(state_robot, state_object)
# Compute the distance to the goal
state_robot = p.getLinkState(self.jaka_id, 6)[0]
state_object = p.getBasePositionAndOrientation(self.objectId)[0]

# Switch to the progressive distance reward
d_target = goal_distance(state_robot, state_object)
normalized_distance = max(0, 1 - d_target / (2 * self.distance_threshold))  # kept in the [0, 1] range
distance_reward = normalized_distance * 2.0  # maximum reward 2.0, growing linearly as the distance shrinks

# Penalize increases in the distance to the goal (based on reward-function tuning experience)
distance_change_penalty = 0
if self.prev_distance is not None:
distance_diff = d_target - self.prev_distance
if distance_diff > 0:  # the current distance is larger than the previous one
distance_change_penalty = -distance_diff * self.distance_change_penalty_coeff

# Update prev_distance
self.prev_distance = d_target
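To see the scale of this penalty (hypothetical numbers, not part of this diff): with a coefficient of 1500, drifting 1 cm away from the goal in a single step costs about 15 reward, which dwarfs the 2.0-maximum shaping term above.

prev_d, d_target = 0.30, 0.31  # hypothetical distances on consecutive steps
penalty = -(d_target - prev_d) * 1500 if d_target > prev_d else 0
print(round(penalty, 3))  # -15.0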

# Improved direction-reward computation
# First get the direction vector from the current position to the goal
direction_to_goal = np.array(state_object) - np.array(state_robot)
direction_to_goal = direction_to_goal / np.linalg.norm(direction_to_goal) if np.linalg.norm(direction_to_goal) != 0 else direction_to_goal
# Similarity between the action direction and the goal direction
direction_similarity = np.dot(direction_to_goal, action)
direction_reward = 0.6 * direction_similarity  # slightly reduced direction-reward weight
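A minimal sketch of the alignment term (illustrative, not part of this diff): it is the dot product of the unit vector toward the goal and the raw 3-D action, scaled by 0.6, so a unit action pointing straight at the goal earns 0.6 and one pointing straight away earns -0.6.

import numpy as np

def direction_alignment_reward(robot_pos, goal_pos, action, weight=0.6):
    direction = np.asarray(goal_pos, dtype=float) - np.asarray(robot_pos, dtype=float)
    norm = np.linalg.norm(direction)
    if norm > 0:
        direction = direction / norm  # unit vector toward the goal
    return weight * float(np.dot(direction, action))

print(direction_alignment_reward([0.4, 0.2, 0.8], [0.6, 0.2, 0.8], [1.0, 0.0, 0.0]))  # 0.6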

# Stronger action-magnitude penalty
action_penalty = -np.mean(np.abs(action)) * 0.7  # penalize large actions more heavily

# Dynamically scaled obstacle penalty (the obstacle position is needed first)
block_state, _ = p.getBasePositionAndOrientation(self.blockId)
d_obstacle = goal_distance(state_robot, block_state)

obstacle_penalty = 0
if d_obstacle < 0.2:  # start penalizing near the obstacle
obstacle_penalty = - (0.2 - d_obstacle) * 3000  # increased penalty strength

# Get the current joint positions and the base position
obs = self._get_observation()
joint_pos = obs[self.observation_dim - 3 - self.n_joints: self.observation_dim - 3]
base_pos = obs[:3]  # assumes the first 3 observation dimensions are the base position

# Define joint_poses (assumed to be the list of joint angles taken from the observation)
joint_poses = [(pos,) for pos in joint_pos]  # converted to the format used by the earlier code

# Make sure prev_joint_pos is a numpy array
if not hasattr(self, 'prev_joint_pos') or isinstance(self.prev_joint_pos, tuple):
self.prev_joint_pos = np.array([pose[0] for pose in joint_poses])  # initialize prev_joint_pos

# Base-rotation reward (explicit conversion keeps the types consistent)
current_base_angle = np.array(joint_poses[0][0])
base_rotation = abs(current_base_angle - self.prev_joint_pos[0])
base_rotation_reward = self.base_rotation_weight * base_rotation if base_rotation > self.min_base_rotation_angle else 0

# Update prev_joint_pos as a numpy array
self.prev_joint_pos = np.array([pose[0] for pose in joint_poses])

# Distance from the base to the workspace boundary
boundary_distance = self._get_boundary_distance(base_pos)

# Improved boundary-penalty mechanism
boundary_penalty = 0
# Two-stage penalty:
# 1. a progressive penalty once the distance drops below the threshold
# 2. an additional hard penalty once it drops below threshold / 2
if boundary_distance < self.boundary_threshold:
boundary_penalty = - (self.boundary_threshold - boundary_distance) * self.boundary_penalty_coeff
if boundary_distance < self.boundary_threshold/2:
boundary_penalty += self.boundary_collision_penalty  # add the hard penalty
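Worked example of the two-stage boundary penalty (hypothetical distances, not part of this diff): with threshold 0.3 m, coefficient 4000 and hard penalty -1500, a distance of 0.25 m costs -200, while 0.10 m costs -800 - 1500 = -2300.

def boundary_penalty(dist, threshold=0.3, coeff=4000, hard=-1500):
    penalty = 0.0
    if dist < threshold:
        penalty = -(threshold - dist) * coeff  # linear ramp inside the threshold
        if dist < threshold / 2:
            penalty += hard  # extra hard penalty very close to the wall
    return penalty

for d in (0.4, 0.25, 0.1):
    print(d, round(boundary_penalty(d), 2))  # 0.0, -200.0, -2300.0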

# Base-rotation reward refinement: weight rotations that steer away from the boundary
base_rotation_reward = 0
if base_rotation > self.min_base_rotation_angle:
rotation_dir = np.sign(np.array(joint_pos)[0] - self.prev_joint_pos[0])
avoid_dir = self._get_boundary_avoid_direction(base_pos)
# Weight directional consistency more strongly to encourage avoidance
direction_consistency = max(0, rotation_dir * avoid_dir)
base_rotation_reward = self.base_rotation_weight * base_rotation * direction_consistency * 1.5  # extra weight on the direction term

# Composite reward (now including the distance-change penalty)
reward = distance_reward + direction_reward + action_penalty + base_rotation_reward + obstacle_penalty + distance_change_penalty
done = False

self.step_counter += 1
@@ -247,6 +426,7 @@ class jakaEnv(gym.Env):

return obs.copy(), reward, done, info

# Randomize the obstacle positions in reset
def reset(self):
self.step_counter = 0
p.resetSimulation()
@@ -262,14 +442,20 @@ class jakaEnv(gym.Env):
basePosition=[0.2, 0.5, 0],
useFixedBase=True,
)
# Randomize the dynamic-obstacle start positions (based on environment-initialization improvements)
block_x = random.uniform(0.35, 0.55)
block_y = random.uniform(0.1, 0.3)
self.blockId = p.loadURDF(os.path.join(dir_path, "urdf/block4.urdf"),
basePosition=[0.4, 0, 0.47],
basePosition=[block_x, block_y, 0.47],
useFixedBase=True,
)

block2_x = random.uniform(0.55, 0.75)
block2_y = random.uniform(0.5, 0.7)
self.blockId2 = p.loadURDF(os.path.join(dir_path, "urdf/block.urdf"),
basePosition=[block2_x, block2_y, 0.43],
useFixedBase=True,
)
self.blockId2 = p.loadURDF(os.path.join(dir_path, "urdf/block.urdf"),
basePosition=[0.6, 0.66, 0.43],
useFixedBase=True,
)
state_object = [0.5, 0.2, 0.8]
dir_path = os.path.dirname(os.path.realpath(__file__))
self.objectId = p.loadURDF(
@@ -404,6 +590,35 @@ class jakaEnv(gym.Env):
def close(self):
p.disconnect()

def _get_boundary_distance(self, position):
"""Compute the shortest distance from the given position to the environment boundary."""
x, y, z = position
# Distance to the boundary along each axis
dx = min(x - self.x_low, self.x_high - x)
dy = min(y - self.y_low, self.y_high - y)
dz = min(z - self.z_low, self.z_high - z)

# Return the smallest of the three axis distances
return min(dx, dy, dz)

def _get_boundary_avoid_direction(self, position):
"""Return the preferred direction for moving away from the nearest boundary."""
x, y, z = position
avoid_dir = 0

# Check proximity to the x-axis boundaries
if position[0] - self.x_low < self.boundary_threshold:
avoid_dir = 1  # move in the positive x direction
elif self.x_high - position[0] < self.boundary_threshold:
avoid_dir = -1  # move in the negative x direction
# Otherwise check the y-axis boundaries
elif position[1] - self.y_low < self.boundary_threshold:
avoid_dir = 2  # move in the positive y direction
elif self.y_high - position[1] < self.boundary_threshold:
avoid_dir = -2  # move in the negative y direction

return avoid_dir

if __name__ == "__main__":

@@ -468,11 +683,28 @@ if __name__ == "__main__":
env = jakaEnv()
env = Monitor(env, log_dir)
if tempt:
model = SAC('MlpPolicy', env=env, verbose=1, tensorboard_log=log_dir,
device="cuda"
)
model = SAC(
'MlpPolicy',
env=env,
verbose=1,
tensorboard_log=log_dir,
device="cuda",
learning_rate=5e-4,  # fine-tuned learning rate
batch_size=512,  # further increased batch size
buffer_size=2000000,  # further enlarged replay buffer
gamma=0.995,  # further increased discount factor
tau=0.001,  # further reduced soft-update coefficient
ent_coef='auto_0.1',  # automatic entropy coefficient, initial value 0.1
target_update_interval=2,  # target-network update interval (in gradient steps)
gradient_steps=2,  # two gradient updates per environment step
use_sde=True,  # enable state-dependent exploration (gSDE)
sde_sample_freq=32,  # resample the SDE noise more frequently
policy_kwargs=dict(net_arch=dict(pi=[256, 256], qf=[256, 256]))  # actor/critic architecture: two 256-unit hidden layers
)
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
model.learn(total_timesteps=4000000, callback=callback)
model.learn(total_timesteps=4000000,
callback=callback,
tb_log_name="SAC_2")  # experiment tag for TensorBoard
model.save('model/DOA_SAC_ENV_callback')
del model
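A minimal follow-up sketch (assuming the standard stable-baselines3 API; not part of this diff) showing how the checkpoint saved above could be reloaded for a deterministic evaluation episode:

from stable_baselines3 import SAC

model = SAC.load('model/DOA_SAC_ENV_callback')  # path used by model.save above
obs = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)  # deterministic policy for evaluation
    obs, reward, done, info = env.step(action)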