diff --git a/DOA_SAC_sim2real.py b/DOA_SAC_sim2real.py
index b643887..cdd16f6 100644
--- a/DOA_SAC_sim2real.py
+++ b/DOA_SAC_sim2real.py
@@ -71,6 +71,11 @@ class jakaEnv(gym.Env):
         self.overstep_frequency = 0
         self.out = 0
         self.reward_type = "dense"
+        # Reward-function weights (tunable)
+        self.alpha = 1.0   # weight of the distance reward
+        self.beta = 0.1    # weight of the action penalty
+        self.gamma = 200   # weight of the collision penalty
+        self.delta = 100   # weight of the boundary penalty
         p.connect(
             p.GUI,
             # options="--background_color_red=0.0 --background_color_green=0.93--background_color_blue=0.54",
@@ -95,12 +100,36 @@ class jakaEnv(gym.Env):
             np.array([1] * 38, np.float32)
         )
 
-    def compute_reward(self, achieved_goal, goal):
+    def compute_reward(self, achieved_goal, goal, action=None):
         d = goal_distance(achieved_goal, goal)
-        if self.reward_type == "sparse":
-            return -(d > self.distance_threshold).astyp(np.float32)
-        else:
-            return -d
+
+        # Normalized distance reward (squashed with tanh)
+        distance_reward = -np.tanh(d)
+
+        # Action-magnitude penalty (L2 norm)
+        action_penalty = 0
+        if action is not None:
+            action_penalty = np.linalg.norm(action)
+
+        # Collision penalty (hard constraint)
+        collision_penalty = self.gamma if is_collision(self.jaka_id, self.blockId, self.blockId2) else 0
+
+        # Boundary penalty (grows exponentially with the violation)
+        boundary_penalty = 0
+        if achieved_goal[0] > self.x_high or achieved_goal[0] < self.x_low:
+            boundary_penalty += self.delta * np.exp(abs(achieved_goal[0] - self.x_high) + abs(achieved_goal[0] - self.x_low))
+        if achieved_goal[1] > self.y_high or achieved_goal[1] < self.y_low:
+            boundary_penalty += self.delta * np.exp(abs(achieved_goal[1] - self.y_high) + abs(achieved_goal[1] - self.y_low))
+        if achieved_goal[2] > self.z_high or achieved_goal[2] <= self.z_low + 0.05:
+            boundary_penalty += self.delta * np.exp(abs(achieved_goal[2] - self.z_high) + abs(achieved_goal[2] - self.z_low))
+
+        # Combine the reward terms
+        reward = self.alpha * distance_reward \
+                 - self.beta * action_penalty \
+                 - collision_penalty \
+                 - boundary_penalty
+
+        return reward
 
     def step(self, action):
         p.configureDebugVisualizer(p.COV_ENABLE_SINGLE_STEP_RENDERING)
@@ -149,6 +178,9 @@ class jakaEnv(gym.Env):
         self.distance_threshold = 0.05
         d = goal_distance(state_robot, state_object)
 
+        # Use the new shaped reward
+        reward = self.compute_reward(state_robot, state_object, action)
+
         if (state_robot[0] > self.x_high or state_robot[0] < self.x_low
                 or state_robot[1] > self.y_high or state_robot[1] < self.y_low
                 or state_robot[2] > self.z_high or state_robot[2] <= self.z_low + 0.05):
@@ -183,13 +215,12 @@ class jakaEnv(gym.Env):
             print()
             done = True
         elif d < self.distance_threshold:
-            reward = -1 / self.compute_reward(state_robot, state_object)
+            reward = -1 / self.compute_reward(state_robot, state_object, action)
             self.eposide = self.eposide + 1
             self.success_frequency = self.success_frequency + 1
             print()
             print("=" * 50)
-            print("\033[31eposide{}:success\033[0m".format(self.eposide))
-
+            print("\033[31meposide{}:success\033[0m".format(self.eposide))
             print()
             print("\tCollisions so far: {}\tSuccesses so far: {}\tTimeouts so far: {}\tOut-of-bounds so far: {}".format(
                 self.collision_frequency,
                 self.success_frequency,
@@ -199,7 +230,7 @@ class jakaEnv(gym.Env):
             print()
             done = True
         else:
-            reward = self.compute_reward(state_robot, state_object)
+            reward = self.compute_reward(state_robot, state_object, action)
             done = False
 
         self.step_counter += 1
@@ -506,9 +537,31 @@ if __name__ == "__main__":
     env = jakaEnv()
     env = Monitor(env, log_dir)
     if tempt:
+        # Check CUDA availability and select the device automatically
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Using device: {device}")
+
+        # Detect the number of available GPUs
+        num_gpus = torch.cuda.device_count()
+        print(f"Number of available GPUs: {num_gpus}")
+
         model = SAC('MlpPolicy', env=env, verbose=1, tensorboard_log=log_dir,
-                    device="cuda"
+                    device=device,                      # automatically selected device
+                    buffer_size=200000,                 # enlarge the replay buffer to 200k transitions
+                    batch_size=512 * max(1, num_gpus),  # scale the batch size with the number of GPUs
+                    gamma=0.995,                        # raise the discount factor to 0.995
+                    tau=0.001,                          # lower the soft-update coefficient to 0.001
+                    ent_coef='auto_0.1',                # auto-tune the entropy coefficient, starting from 0.1
+                    learning_rate=5e-4,                 # raise the learning rate to 5e-4
+                    target_update_interval=1,           # update the target network at every update
+                    gradient_steps=4                    # 4 gradient updates per environment step
                     )
+
+        # If more than one GPU is available, enable data parallelism
+        if num_gpus > 1:
+            print("Using DataParallel for multi-GPU training")
+            model.policy = nn.DataParallel(model.policy)
+
         # Build the callback list: the existing best-model callback plus the new periodic checkpoint callback
         callback_list = [
             SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir),
@@ -518,7 +571,8 @@ if __name__ == "__main__":
         model.learn(
             total_timesteps=4000000,
            callback=callback_list,
-            tb_log_name="SAC_2"
+            tb_log_name="SAC_2",
+            log_interval=50  # log every 50 episodes to monitor training progress
        )
         model.save('model/DOA_SAC_ENV_callback')
         del model
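For a quick sanity check of the shaping terms outside PyBullet, the same scheme can be exercised standalone. The sketch below mirrors the compute_reward logic under assumed values; the weights, workspace limits, and the shaped_reward helper are illustrative placeholders rather than names or numbers taken from DOA_SAC_sim2real.py, and only NumPy is required.

import numpy as np

# Illustrative weights and workspace limits (placeholders, not taken from the project).
ALPHA, BETA, GAMMA, DELTA = 1.0, 0.1, 200.0, 100.0
LIMITS = [(-0.5, 0.5), (-0.5, 0.5), (0.05, 0.8)]  # (low, high) per axis; z low already offset by 0.05


def shaped_reward(achieved, goal, action=None, collided=False):
    """Tanh-squashed distance term, L2 action penalty, hard collision and boundary penalties."""
    d = np.linalg.norm(np.asarray(achieved) - np.asarray(goal))
    distance_reward = -np.tanh(d)

    action_penalty = np.linalg.norm(action) if action is not None else 0.0
    collision_penalty = GAMMA if collided else 0.0

    boundary_penalty = 0.0
    for coord, (low, high) in zip(achieved, LIMITS):
        if coord > high or coord < low:
            # Exponential growth with the distance to both limits, as in the diff.
            boundary_penalty += DELTA * np.exp(abs(coord - high) + abs(coord - low))

    return ALPHA * distance_reward - BETA * action_penalty - collision_penalty - boundary_penalty


if __name__ == "__main__":
    # In-bounds step: the tanh distance term dominates (roughly -0.2 here).
    print(shaped_reward([0.1, 0.0, 0.3], [0.2, 0.1, 0.4], action=np.array([0.2, -0.1, 0.05])))
    # Out-of-bounds step: the exponential boundary penalty takes over (on the order of -400).
    print(shaped_reward([0.7, 0.0, 0.3], [0.2, 0.1, 0.4], action=np.zeros(3)))

Printing both cases makes the scale mismatch between the bounded distance term and the unbounded boundary term easy to see before committing to a long training run.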
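The training-side changes can likewise be smoke-tested in isolation. The following is a minimal sketch, assuming stable-baselines3 with the standard Pendulum-v1 task as a lightweight stand-in for the PyBullet scene; it only confirms that the hyperparameter combination from the diff constructs and trains, not that the values are well tuned for the robot environment.

import torch
from stable_baselines3 import SAC

# Pick the device the same way the patch does.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hyperparameters mirrored from the diff; Pendulum-v1 is a stand-in environment.
model = SAC(
    "MlpPolicy",
    "Pendulum-v1",
    device=device,
    buffer_size=200_000,
    batch_size=512,
    gamma=0.995,
    tau=0.001,
    ent_coef="auto_0.1",          # automatic tuning, entropy coefficient initialised at 0.1
    learning_rate=5e-4,
    target_update_interval=1,
    gradient_steps=4,
    verbose=0,
)

# A very short run is enough to confirm the configuration is consistent.
model.learn(total_timesteps=1_000, log_interval=50)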