first commit
This commit is contained in:
105
Seg_All_In_One_SegModel/train.sh
Normal file
105
Seg_All_In_One_SegModel/train.sh
Normal file
@@ -0,0 +1,105 @@
|
||||
#!/bin/bash

# --- 1. Conda environment setup ---
# CONDA_BASE_PATH is the root of the Conda installation; edit it to match
# your machine. CONDA_ENV_NAME defaults to "seg_smp" and can be overridden
# for a single run, e.g.:  SEG_CONDA_ENV=SMP bash train.sh
CONDA_BASE_PATH="/home/wkmgc/miniconda3" # <--- change this to your own path
CONDA_ENV_NAME="${SEG_CONDA_ENV:-seg_smp}"

# `conda activate` only works after conda.sh has been sourced into this
# shell, so bail out early when the hook script is missing.
if [ ! -f "${CONDA_BASE_PATH}/etc/profile.d/conda.sh" ]; then
  echo "错误: 找不到 conda.sh 脚本。请检查您的 CONDA_BASE_PATH 设置是否正确。" >&2
  exit 1
fi
source "${CONDA_BASE_PATH}/etc/profile.d/conda.sh"

# Test the command directly instead of inspecting $? afterwards, so no
# statement can slip in between and clobber the status.
if ! conda activate "${CONDA_ENV_NAME}"; then
  echo "错误: 激活 Conda 环境 '${CONDA_ENV_NAME}' 失败!" >&2
  exit 1
fi
echo "Conda 环境 '${CONDA_ENV_NAME}' 已成功激活。"

# --- 2. Model and GPU configuration ---
# GPUS_GROUP_n is the CUDA_VISIBLE_DEVICES value used for every job launched
# from the matching GROUP_n_ARCHS list further down in this script.
GPUS_GROUP_1="3"
GPUS_GROUP_2="2"
GPUS_GROUP_3="1"
GPUS_GROUP_4="4"

# Architecture names handed to `train.py -a`, one list per GPU group.
# The trailing notes are the author's scratch records of other groupings.
GROUP_1_ARCHS=("PSPNet" "Unet" "UnetPlusPlus") # G3 # "PSPNet" # "Unet" "UnetPlusPlus" "FPN"
GROUP_2_ARCHS=("Linknet" "MAnet" "DeepLabV3")  # G3 "Linknet" "MAnet" # "DeepLabV3" # "DeepLabV3Plus"
GROUP_3_ARCHS=("UPerNet" "Segformer" "DPT")    # G3 # "UPerNet" "Segformer" "DPT" # "PAN"
GROUP_4_ARCHS=("FPN" "DeepLabV3Plus" "PAN")

# 1. Read OUTPUTS_DIR from config.py. `python -c` resolves `config` through
#    the normal module search path (which includes the current directory),
#    so run this script from the directory that contains config.py.
OUTPUTS_DIR=$(python -c "from config import OUTPUTS_DIR; print(OUTPUTS_DIR)")

# An empty value means the import failed or OUTPUTS_DIR is blank; stop here
# and send the diagnostics to stderr.
if [ -z "$OUTPUTS_DIR" ]; then
  {
    echo "OUTPUTS_DIR: $OUTPUTS_DIR"
    echo "Error 1: Could not read OUTPUTS_DIR from config.py. Exiting."
    echo "Error 2: Or the directory specified by OUTPUTS_DIR does not exist. Please create it first."
  } >&2
  exit 1
fi

# 2. Timestamped name for this run's log directory.
# NOTE(review): the prefix says "predict" although this script runs
# train.py — possibly copied from a prediction script; confirm before
# renaming, since other tooling may look for this prefix.
LOG_DIR_NAME="predict_logs_parallel_$(date +%Y-%m-%d_%H-%M-%S)"

# 3. Full path of the log directory.
LOG_DIR="$OUTPUTS_DIR/$LOG_DIR_NAME"

# Every job redirects its output into LOG_DIR; if the directory cannot be
# created, each redirection would fail and no training would start, so
# treat that as fatal here.
if ! mkdir -p "${LOG_DIR}"; then
  echo "Error: Could not create log directory '${LOG_DIR}'. Exiting." >&2
  exit 1
fi
echo "所有模型的日志将保存在 ./${LOG_DIR}/ 目录中。"
echo "----------------------------------------------------"

# --- 3. Launch all training jobs ---
# Every model is started as a background job, so all jobs run concurrently.
# Launches are staggered by a fixed delay; the script does NOT wait for one
# model to finish before starting the next.

# Seconds to pause after each launch.
# NOTE(review): presumably staggers start-up so jobs sharing a GPU do not
# initialize at the same moment — confirm.
LAUNCH_DELAY_SECONDS=50

# PIDs and architecture names of the launched jobs, index-aligned, so that
# each job's exit status can be collected in section 4.
TRAIN_PIDS=()
TRAIN_ARCHS=()

#######################################
# Start one background training job per architecture on the given GPUs.
# Globals:
#   LOG_DIR (read), LAUNCH_DELAY_SECONDS (read),
#   TRAIN_PIDS (appended), TRAIN_ARCHS (appended)
# Arguments:
#   $1   - group label used in progress messages (e.g. "第一组")
#   $2   - value assigned to CUDA_VISIBLE_DEVICES for every job in the group
#   $3.. - architecture names passed to `train.py -a`
# Outputs:
#   Progress messages on stdout; each job's stdout and stderr go to
#   ${LOG_DIR}/<arch>.log
#######################################
launch_group() {
  local label="$1"
  local gpus="$2"
  shift 2
  local arch

  echo ">>> 准备启动${label}训练任务 (后台运行)..."
  for arch in "$@"; do
    echo " -> 正在后台启动模型: ${arch} on GPUs: ${gpus}"
    # The trailing '&' puts the job in the background.
    CUDA_VISIBLE_DEVICES="${gpus}" python train.py -a "${arch}" > "${LOG_DIR}/${arch}.log" 2>&1 &
    TRAIN_PIDS+=("$!")
    TRAIN_ARCHS+=("${arch}")
    echo " - 模型 ${arch} 已在后台启动。日志文件: ${LOG_DIR}/${arch}.log"
    echo " - 等待 ${LAUNCH_DELAY_SECONDS} 秒..."
    sleep "${LAUNCH_DELAY_SECONDS}"
  done
  echo ">>> ${label}所有模型均已启动。"
  echo "----------------------------------------------------"
}

launch_group "第一组" "${GPUS_GROUP_1}" "${GROUP_1_ARCHS[@]}"
launch_group "第二组" "${GPUS_GROUP_2}" "${GROUP_2_ARCHS[@]}"
launch_group "第三组" "${GPUS_GROUP_3}" "${GROUP_3_ARCHS[@]}"
launch_group "第四组" "${GPUS_GROUP_4}" "${GROUP_4_ARCHS[@]}"

# --- 4. Wait for all background jobs ---
echo ""
echo "--- 所有模型均已在后台启动。现在等待所有训练任务完成... ---"

# Wait on each PID individually: a bare `wait` discards the jobs' exit
# statuses, which would let failed runs be reported as success.
FAILED_ARCHS=()
for job_index in "${!TRAIN_PIDS[@]}"; do
  if ! wait "${TRAIN_PIDS[$job_index]}"; then
    FAILED_ARCHS+=("${TRAIN_ARCHS[$job_index]}")
  fi
done
echo "--- 所有后台训练任务已全部完成! ---"

if [ "${#FAILED_ARCHS[@]}" -gt 0 ]; then
  echo "错误: 以下模型训练失败 (请查看 ${LOG_DIR} 中对应的日志): ${FAILED_ARCHS[*]}" >&2
fi

# Deactivate the Conda environment before exiting.
conda deactivate

# Report failure to the caller if any training job exited non-zero.
if [ "${#FAILED_ARCHS[@]}" -gt 0 ]; then
  exit 1
fi
Reference in New Issue
Block a user