feat(DOA_SAC_sim2real): implement an advanced reward mechanism and optimize model training

- Add reward-function weight parameters to tune the distance, action, collision, and boundary terms
- Implement a normalized distance reward, an action-magnitude penalty, a collision penalty, and a boundary penalty (a standalone sketch follows the commit metadata below)
- Update the training configuration: larger replay buffer, adjusted learning rate, and related hyperparameters
- Add multi-GPU support and data-parallel training (see the configuration sketch after the diff)
- Improve logging and the model-saving strategy
parent 686164f670
commit 1fc489e188
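The reward described above is assembled from four weighted terms. Below is a minimal, standalone sketch of that composition; the function name, the weights, the workspace bounds, and the collision flag are illustrative placeholders rather than values read from the jakaEnv environment.

```python
import numpy as np

def composed_reward(achieved_goal, goal, action=None, collided=False,
                    low=(-0.5, -0.5, 0.0), high=(0.5, 0.5, 0.5),
                    alpha=1.0, beta=0.1, gamma=200.0, delta=100.0):
    """Weighted sum of distance, action, collision and boundary terms (illustrative)."""
    d = np.linalg.norm(np.asarray(achieved_goal) - np.asarray(goal))

    # Normalized distance reward: tanh squashes the raw distance into (0, 1)
    distance_reward = -np.tanh(d)

    # Action-magnitude penalty: L2 norm of the commanded action
    action_penalty = np.linalg.norm(action) if action is not None else 0.0

    # Collision penalty: flat cost whenever a collision is reported
    collision_penalty = gamma if collided else 0.0

    # Boundary penalty: grows exponentially with the distance to the bounds
    boundary_penalty = 0.0
    for x, lo, hi in zip(achieved_goal, low, high):
        if x < lo or x > hi:
            boundary_penalty += delta * np.exp(abs(x - hi) + abs(x - lo))

    return alpha * distance_reward - beta * action_penalty \
        - collision_penalty - boundary_penalty

# Example: a point 0.3 m from the goal, small action, no collision, inside the bounds
print(composed_reward([0.1, 0.2, 0.4], [0.1, 0.2, 0.1], action=[0.05, 0.0, 0.0]))
```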
@@ -71,6 +71,11 @@ class jakaEnv(gym.Env):
         self.overstep_frequency = 0
         self.out = 0
         self.reward_type = "dense"
+        # Reward-function weight parameters (tunable)
+        self.alpha = 1.0  # distance reward weight
+        self.beta = 0.1   # action penalty weight
+        self.gamma = 200  # collision penalty weight
+        self.delta = 100  # boundary penalty weight
         p.connect(
             p.GUI,
             # options="--background_color_red=0.0 --background_color_green=0.93 --background_color_blue=0.54",
@@ -95,12 +100,36 @@ class jakaEnv(gym.Env):
             np.array([1] * 38, np.float32)
         )
 
-    def compute_reward(self, achieved_goal, goal):
+    def compute_reward(self, achieved_goal, goal, action=None):
         d = goal_distance(achieved_goal, goal)
-        if self.reward_type == "sparse":
-            return -(d > self.distance_threshold).astype(np.float32)
-        else:
-            return -d
+
+        # Normalized distance reward (squashed with tanh)
+        distance_reward = -np.tanh(d)
+
+        # Action-magnitude penalty (L2 norm)
+        action_penalty = 0
+        if action is not None:
+            action_penalty = np.linalg.norm(action)
+
+        # Collision penalty (hard constraint)
+        collision_penalty = self.gamma if is_collision(self.jaka_id, self.blockId, self.blockId2) else 0
+
+        # Boundary penalty (grows exponentially with the violation)
+        boundary_penalty = 0
+        if achieved_goal[0] > self.x_high or achieved_goal[0] < self.x_low:
+            boundary_penalty += self.delta * np.exp(abs(achieved_goal[0] - self.x_high) + abs(achieved_goal[0] - self.x_low))
+        if achieved_goal[1] > self.y_high or achieved_goal[1] < self.y_low:
+            boundary_penalty += self.delta * np.exp(abs(achieved_goal[1] - self.y_high) + abs(achieved_goal[1] - self.y_low))
+        if achieved_goal[2] > self.z_high or achieved_goal[2] <= self.z_low + 0.05:
+            boundary_penalty += self.delta * np.exp(abs(achieved_goal[2] - self.z_high) + abs(achieved_goal[2] - self.z_low))
+
+        # Combine the reward terms
+        reward = self.alpha * distance_reward \
+            - self.beta * action_penalty \
+            - collision_penalty \
+            - boundary_penalty
+
+        return reward
 
     def step(self, action):
         p.configureDebugVisualizer(p.COV_ENABLE_SINGLE_STEP_RENDERING)
@@ -149,6 +178,9 @@ class jakaEnv(gym.Env):
 
         self.distance_threshold = 0.05
         d = goal_distance(state_robot, state_object)
+        # Use the new reward function
+        reward = self.compute_reward(state_robot, state_object, action)
+
         if (state_robot[0] > self.x_high or state_robot[0] < self.x_low
                 or state_robot[1] > self.y_high or state_robot[1] < self.y_low
                 or state_robot[2] > self.z_high or state_robot[2] <= self.z_low + 0.05):
@@ -183,13 +215,12 @@ class jakaEnv(gym.Env):
             print()
             done = True
         elif d < self.distance_threshold:
-            reward = -1 / self.compute_reward(state_robot, state_object)
+            reward = -1 / self.compute_reward(state_robot, state_object, action)
             self.eposide = self.eposide + 1
             self.success_frequency = self.success_frequency + 1
             print()
             print("=" * 50)
-            print("\033[31eposide{}:success\033[0m".format(self.eposide))
+            print("\033[31meposide{}:success\033[0m".format(self.eposide))
             print()
             print("\tCurrent collisions: {}\tCurrent successes: {}\tCurrent timeouts: {}\tCurrent out-of-range count: {}".format(
                 self.collision_frequency,
@@ -199,7 +230,7 @@ class jakaEnv(gym.Env):
             print()
             done = True
         else:
-            reward = self.compute_reward(state_robot, state_object)
+            reward = self.compute_reward(state_robot, state_object, action)
             done = False
 
         self.step_counter += 1
@@ -506,9 +537,31 @@ if __name__ == "__main__":
     env = jakaEnv()
     env = Monitor(env, log_dir)
     if tempt:
+        # Check CUDA availability and select the device automatically
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Using device: {device}")
+
+        # Automatically detect the number of available GPUs
+        num_gpus = torch.cuda.device_count()
+        print(f"Number of available GPUs: {num_gpus}")
+
         model = SAC('MlpPolicy', env=env, verbose=1, tensorboard_log=log_dir,
-                    device="cuda"
+                    device=device,                      # automatically selected device
+                    buffer_size=200000,                 # enlarge the replay buffer to 200k transitions
+                    batch_size=512 * max(1, num_gpus),  # scale the batch size with the number of GPUs
+                    gamma=0.995,                        # raise the discount factor to 0.995
+                    tau=0.001,                          # reduce the soft-update coefficient to 0.001
+                    ent_coef='auto_0.1',                # auto-tune the entropy coefficient, starting from 0.1
+                    learning_rate=5e-4,                 # raise the learning rate to 5e-4
+                    target_update_interval=1,           # update the target network every training step
+                    gradient_steps=4                    # 4 gradient updates per environment step
                     )
+
+        # Enable data parallelism when more than one GPU is available
+        if num_gpus > 1:
+            print("Using DataParallel for multi-GPU training")
+            model.policy = nn.DataParallel(model.policy)
+
         # Build the callback list: the existing best-model-saving callback plus the new periodic checkpoint callback
         callback_list = [
             SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir),
@@ -518,7 +571,8 @@ if __name__ == "__main__":
         model.learn(
             total_timesteps=4000000,
             callback=callback_list,
-            tb_log_name="SAC_2"
+            tb_log_name="SAC_2",
+            log_interval=50  # log at this interval to monitor training progress
         )
         model.save('model/DOA_SAC_ENV_callback')
        del model
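For reference, the tuned training configuration above can be reproduced in isolation. The sketch below assumes stable-baselines3; Pendulum-v1 stands in for jakaEnv, and the CheckpointCallback, paths, and step counts are illustrative stand-ins for the repository's own periodic checkpoint callback, not its actual code.

```python
import torch
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CheckpointCallback

device = "cuda" if torch.cuda.is_available() else "cpu"

# Same hyperparameters as in this commit; Pendulum-v1 is only a placeholder task
model = SAC(
    "MlpPolicy",
    "Pendulum-v1",
    device=device,
    buffer_size=200_000,
    batch_size=512,
    gamma=0.995,
    tau=0.001,
    ent_coef="auto_0.1",
    learning_rate=5e-4,
    target_update_interval=1,
    gradient_steps=4,
    verbose=1,
)

# Periodic checkpoints in addition to any best-model callback
checkpoint_cb = CheckpointCallback(save_freq=50_000, save_path="./checkpoints/",
                                   name_prefix="sac_doa")

model.learn(total_timesteps=200_000, callback=checkpoint_cb, log_interval=50)
model.save("sac_doa_final")
```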