Compare commits
7 commits: main...multi-step

Author | SHA1 | Date
---|---|---
 | 8381839069 |
 | 1fc489e188 |
 | 686164f670 |
 | 3a111b67e2 |
 | c76dab54b0 |
 | 456ed76e47 |
 | b219659545 |
@@ -1,3 +1,6 @@
+import torch
+import torch.nn as nn
+from torch.multiprocessing import Process, Queue, set_start_method
 import time
 import gym
 from gym import error, spaces, utils
@@ -10,13 +13,13 @@ import numpy as np
 import random
 from scipy import signal
 import sys
-# sys.path.append('D:\\vs2019ws\PythonCtt\PythonCtt')
-import jkrc
-import matplotlib as mpl
-from mpl_toolkits.mplot3d import Axes3D
-import matplotlib.pyplot as plt
-import pandas as pd
+# Dynamic path configuration: handle the hardware module (jkrc) so it loads correctly in different environments.
+try:
+    import jkrc
+    JAKA_AVAILABLE = True
+except ImportError:
+    jkrc = None
+    JAKA_AVAILABLE = False

 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

@@ -68,6 +71,11 @@ class jakaEnv(gym.Env):
         self.overstep_frequency = 0
         self.out = 0
         self.reward_type = "dense"
+        # Reward-function weight parameters (tunable)
+        self.alpha = 1.0  # distance-reward weight
+        self.beta = 0.1   # action-penalty weight
+        self.gamma = 200  # collision-penalty weight
+        self.delta = 100  # boundary-penalty weight
         p.connect(
             p.GUI,
             # options="--background_color_red=0.0 --background_color_green=0.93--background_color_blue=0.54",
@@ -86,14 +94,42 @@ class jakaEnv(gym.Env):
         self.z_high = 1.3
         self.speed = [0.3, 0.5, 0.8, 1.0]
         self.action_space = spaces.Box(np.array([-1] * 3), np.array([1] * 3))
-        self.observation_space = spaces.Box(np.array([-1] * 38, np.float32), np.array([1] * 38, np.float32))
+        # Changed to a 38-dimensional observation space
+        self.observation_space = spaces.Box(
+            np.array([-1] * 38, np.float32),
+            np.array([1] * 38, np.float32)
+        )

-    def compute_reward(self, achieved_goal, goal):
+    def compute_reward(self, achieved_goal, goal, action=None):
         d = goal_distance(achieved_goal, goal)
-        if self.reward_type == "sparse":
-            return -(d > self.distance_threshold).astyp(np.float32)
-        else:
-            return -d
+
+        # Normalized distance reward (tanh)
+        distance_reward = -np.tanh(d)
+
+        # Action-magnitude penalty (L2 regularization)
+        action_penalty = 0
+        if action is not None:
+            action_penalty = np.linalg.norm(action)
+
+        # Collision penalty (hard constraint)
+        collision_penalty = self.gamma if is_collision(self.jaka_id, self.blockId, self.blockId2) else 0
+
+        # Boundary penalty (grows exponentially)
+        boundary_penalty = 0
+        if achieved_goal[0] > self.x_high or achieved_goal[0] < self.x_low:
+            boundary_penalty += self.delta * np.exp(abs(achieved_goal[0] - self.x_high) + abs(achieved_goal[0] - self.x_low))
+        if achieved_goal[1] > self.y_high or achieved_goal[1] < self.y_low:
+            boundary_penalty += self.delta * np.exp(abs(achieved_goal[1] - self.y_high) + abs(achieved_goal[1] - self.y_low))
+        if achieved_goal[2] > self.z_high or achieved_goal[2] <= self.z_low + 0.05:
+            boundary_penalty += self.delta * np.exp(abs(achieved_goal[2] - self.z_high) + abs(achieved_goal[2] - self.z_low))
+
+        # Combine the reward terms
+        reward = self.alpha * distance_reward \
+                 - self.beta * action_penalty \
+                 - collision_penalty \
+                 - boundary_penalty
+
+        return reward
+
     def step(self, action):
         p.configureDebugVisualizer(p.COV_ENABLE_SINGLE_STEP_RENDERING)
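Restated compactly, the dense reward assembled in compute_reward above is (α, β, γ, δ are the weights set in __init__; d is the end-effector-to-goal distance, a the action, p the achieved position):

$$ r = -\alpha\,\tanh(d) \;-\; \beta\,\lVert a\rVert_2 \;-\; \gamma\,\mathbf{1}[\text{collision}] \;-\; \delta \sum_{i\in\{x,y,z\}} \mathbf{1}[p_i \text{ out of bounds}]\; e^{\,|p_i-\text{high}_i|+|p_i-\text{low}_i|} $$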
@@ -142,6 +178,9 @@ class jakaEnv(gym.Env):

         self.distance_threshold = 0.05
         d = goal_distance(state_robot, state_object)
+        # Use the new reward function
+        reward = self.compute_reward(state_robot, state_object, action)
+
         if (state_robot[0] > self.x_high or state_robot[0] < self.x_low
                 or state_robot[1] > self.y_high or state_robot[1] < self.y_low
                 or state_robot[2] > self.z_high or state_robot[2] <= self.z_low + 0.05):
@@ -176,13 +215,12 @@ class jakaEnv(gym.Env):
             print()
             done = True
         elif d < self.distance_threshold:
-            reward = -1 / self.compute_reward(state_robot, state_object)
+            reward = -1 / self.compute_reward(state_robot, state_object, action)
             self.eposide = self.eposide + 1
             self.success_frequency = self.success_frequency + 1
             print()
             print("=" * 50)
-            print("\033[31eposide{}:success\033[0m".format(self.eposide))
-
+            print("\033[31meposide{}:success\033[0m".format(self.eposide))
             print()
             print("\t当前碰撞次数:{}\t当前成功次数:{}\t当前超时次数:{}\t当前超出范围次数:{}".format(
                 self.collision_frequency,
@@ -192,7 +230,7 @@ class jakaEnv(gym.Env):
             print()
             done = True
         else:
-            reward = self.compute_reward(state_robot, state_object)
+            reward = self.compute_reward(state_robot, state_object, action)
             done = False

         self.step_counter += 1
@@ -412,7 +450,7 @@ if __name__ == "__main__":
     from stable_baselines3.common.monitor import Monitor
     from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
     from stable_baselines3.common.noise import NormalActionNoise
-    from stable_baselines3.common.callbacks import BaseCallback
+    from stable_baselines3.common.callbacks import BaseCallback, EvalCallback


     class SaveOnBestTrainingRewardCallback(BaseCallback):
@@ -462,24 +500,87 @@ if __name__ == "__main__":
             return True


-    tempt = 1
+    class PeriodicModelCheckpointCallback(BaseCallback):
+        """
+        Callback for periodically saving the model at specified intervals,
+        storing each checkpoint in its own subdirectory.
+        """
+        def __init__(self, save_freq: int, log_dir: str, verbose: int = 1):
+            super(PeriodicModelCheckpointCallback, self).__init__(verbose)
+            self.save_freq = save_freq
+            self.log_dir = log_dir
+            self.checkpoint_count = 0
+
+        def _init_callback(self) -> None:
+            # Create the base directory for checkpoints
+            self.checkpoint_dir = os.path.join(self.log_dir, 'checkpoints')
+            os.makedirs(self.checkpoint_dir, exist_ok=True)
+
+        def _on_step(self) -> bool:
+            if self.n_calls % self.save_freq == 0:
+                # Create a new checkpoint directory
+                checkpoint_path = os.path.join(self.checkpoint_dir, f'checkpoint_{self.n_calls}')
+                os.makedirs(checkpoint_path, exist_ok=True)
+
+                # Save the model
+                if self.verbose > 0:
+                    print(f'Saving model checkpoint to {checkpoint_path}')
+                self.model.save(os.path.join(checkpoint_path, 'model'))
+
+                self.checkpoint_count += 1
+
+            return True
+
+    tempt = 0
     log_dir = './tensorboard/DOA_SAC_callback/'
     os.makedirs(log_dir, exist_ok=True)
     env = jakaEnv()
     env = Monitor(env, log_dir)
     if tempt:
+        # Check CUDA availability and select the device automatically
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Using device: {device}")
+
+        # Automatically detect the number of available GPUs
+        num_gpus = torch.cuda.device_count()
+        print(f"Number of available GPUs: {num_gpus}")
+
         model = SAC('MlpPolicy', env=env, verbose=1, tensorboard_log=log_dir,
-                    device="cuda"
+                    device=device,  # automatically selected device
+                    buffer_size=200000,  # enlarge the experience replay buffer (200,000 transitions)
+                    batch_size=512 * max(1, num_gpus),  # scale the batch size with the number of GPUs
+                    gamma=0.995,  # raise the discount factor to 0.995
+                    tau=0.001,  # lower the soft-update coefficient to 0.001
+                    ent_coef='auto_0.1',  # auto-tuned entropy coefficient with a target of 0.1
+                    learning_rate=5e-4,  # raise the learning rate to 5e-4
+                    target_update_interval=1,  # update the target network more frequently
+                    gradient_steps=4  # 4 gradient updates per environment step
                     )
-        callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
-        model.learn(total_timesteps=4000000, callback=callback)
+
+        # If more than one GPU is available, enable data parallelism
+        if num_gpus > 1:
+            print("Using DataParallel for multi-GPU training")
+            model.policy = nn.DataParallel(model.policy)
+
+        # Build the callback list: the original best-model callback plus the new periodic checkpoint callback
+        callback_list = [
+            SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir),
+            PeriodicModelCheckpointCallback(save_freq=5000, log_dir=log_dir)
+        ]
+
+        model.learn(
+            total_timesteps=4000000,
+            callback=callback_list,
+            tb_log_name="SAC_2",
+            log_interval=50  # log interval added to monitor training progress
+        )
         model.save('model/DOA_SAC_ENV_callback')
         del model

     else:
         obs = env.reset()
         # Change this path to the path where you saved the model
-        model = SAC.load(r'best_model.zip', env=env)
+        model = SAC.load(r'tensorboard/DOA_SAC_callback/best_model.zip', env=env)

         for j in range(50):
             for i in range(2000):
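A minimal sketch of resuming training from one of the checkpoints written by PeriodicModelCheckpointCallback above; the checkpoint step (5000) and the continued timestep budget are assumed example values, and jakaEnv refers to the environment class defined earlier in this file:

```python
import os
from stable_baselines3 import SAC

log_dir = './tensorboard/DOA_SAC_callback/'
# Layout produced by the callback above: <log_dir>/checkpoints/checkpoint_<n_calls>/model.zip
ckpt = os.path.join(log_dir, 'checkpoints', 'checkpoint_5000', 'model')  # example step count

env = jakaEnv()                   # environment class defined earlier in this file
model = SAC.load(ckpt, env=env)   # SB3 resolves 'model' to 'model.zip'
# Note: the replay buffer is not stored in model.zip; save_replay_buffer() /
# load_replay_buffer() can be used separately if a full resume is needed.
model.learn(total_timesteps=1_000_000, reset_num_timesteps=False)
```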
@@ -530,4 +631,55 @@ if __name__ == "__main__":
                 # plt.show()


+    def train_parallel(num_processes):
+        """Multi-process parallel training entry point."""
+        set_start_method('spawn')
+
+        # Create the shared model (Stable Baselines3 SAC)
+        env = jakaEnv()  # create an environment instance
+        model = SAC('MlpPolicy', env=env, verbose=1, device="cuda")  # use CUDA acceleration
+        shared_model = model.policy.to(torch.device('cuda'))  # explicitly place the model on the GPU
+        shared_model.share_memory()  # share the model parameters
+
+        # Create the worker processes
+        processes = []
+        for rank in range(num_processes):
+            p = Process(target=train_process, args=(rank, shared_model))
+            p.start()
+            processes.append(p)
+
+        # Wait for all processes to finish
+        for p in processes:
+            p.join()
+
+
+    def train_process(rank, shared_model):
+        """A single training process."""
+        # Assign each process its own GPU
+        device = torch.device(f'cuda:{rank % torch.cuda.device_count()}')
+
+        # Create an independent environment instance
+        env = create_arm_environment()  # replace with the actual environment-creation function
+
+        # Create a local copy of the model
+        local_model = SAC().to(device)
+        local_model.load_state_dict(shared_model.state_dict())
+
+        # Replace the original training loop with a parallel version here
+        while True:
+            # train the local model ...
+
+            # Synchronize parameters back to the shared model
+            with torch.no_grad():
+                for param, shared_param in zip(local_model.parameters(), shared_model.parameters()):
+                    shared_param.copy_(param)
+
+
+    def create_arm_environment():
+        """Create a robot-arm environment instance."""
+        return jakaEnv()  # return the arm environment instance
+
+
+    if __name__ == '__main__':
+        # Start parallel training (4 processes as an example)
+        train_parallel(4)
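The train_parallel/train_process skeleton above shares policy parameters across processes by hand and leaves the worker training loop unimplemented. If the aim is parallel experience collection, a commonly used alternative (not what this branch implements) is Stable Baselines3's SubprocVecEnv; a minimal sketch, assuming an SB3 version with multi-environment support for off-policy algorithms and that jakaEnv can run headless (p.DIRECT) in worker processes:

```python
import os
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import SubprocVecEnv

def make_env(rank: int, log_dir: str):
    """Factory returning a thunk that builds one monitored jakaEnv worker."""
    def _init():
        env = jakaEnv()  # assumes the env connects with p.DIRECT when run headless
        return Monitor(env, os.path.join(log_dir, f"worker_{rank}"))
    return _init

if __name__ == '__main__':
    log_dir = './tensorboard/DOA_SAC_vec/'   # hypothetical log directory
    os.makedirs(log_dir, exist_ok=True)
    vec_env = SubprocVecEnv([make_env(i, log_dir) for i in range(4)])
    model = SAC('MlpPolicy', vec_env, verbose=1, device="cuda")
    model.learn(total_timesteps=4_000_000)
```

Each worker then writes its own worker_<rank>.monitor.csv under log_dir, so the existing results-plotting imports still apply.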
BIN tensorboard/DOA_SAC_callback/best_model.zip (new file)
Binary file not shown.