This commit is contained in:
hailin 2025-09-25 13:47:11 +08:00
parent 43111064cc
commit 2d06f0ab90
1 changed files with 14 additions and 8 deletions

View File

@ -1,6 +1,13 @@
#!/usr/bin/env bash
set -euo pipefail

# ==== 0) Log file ====
# Each launch gets its own log: the name carries a date/time stamp and this
# shell's PID ($$).
LOG_DIR="/home/test/logs"
LOG_FILE="${LOG_DIR}/train_$(date +%F_%H%M%S)_$$.log"
mkdir -p "${LOG_DIR}"
# Ask Python not to buffer its output streams.
export PYTHONUNBUFFERED=1
printf '%s\n' ">> logging to ${LOG_FILE}"

# 1) Optional: if the env file is present (it may hold an API key), make it
#    readable and writable by the owner only.
if [[ -f .deepspeed_env ]]; then
  chmod 600 .deepspeed_env
fi
@ -9,12 +16,6 @@ set -a
# Load the environment file from the current working directory into this shell.
# NOTE(review): presumably paired with a preceding `set -a`, so that every
# assignment in the file is exported automatically — confirm.
. ./.deepspeed_env
# Turn the allexport option back off; later assignments are not auto-exported.
set +a
# unset PYTHONNOUSERSITE
# USER_SITE=$(python3 -c 'import site;print(site.getusersitepackages())')
# export PATH="$HOME/.local/bin:$PATH"
# export PYTHONPATH="$USER_SITE:/home/test/jd_train:${PYTHONPATH:-}"
# Unified environment (inherited by every node through deepspeed's ssh launch).
# Drop these toggles so values left over in the caller's environment do not apply.
unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE
# USER may be unset (cron, containers, some ssh sessions). With `set -u` active,
# a bare $USER would abort the script, so fall back to the name from `id -un`.
# Assignment and export are separate so a failing `id` is not masked by `export`.
TORCH_EXTENSIONS_DIR="/tmp/${USER:-$(id -un)}/torch_ext"
export TORCH_EXTENSIONS_DIR
@ -25,6 +26,8 @@ export OMP_NUM_THREADS=8
# Set the MKL and OpenBLAS thread-count variables to 8 and export both.
MKL_NUM_THREADS=8
OPENBLAS_NUM_THREADS=8
export MKL_NUM_THREADS OPENBLAS_NUM_THREADS
# ==== 3) Run, mirroring output to both a file and the console (exit code preserved) ====
# errexit is switched off for the launch: with `pipefail` enabled, a failing
# pipeline would otherwise end the script before its exit status could be read.
set +e
FORCE_COLOR=1 deepspeed --hostfile hostfile \
--num_nodes 6 --num_gpus 4 \
/home/test/jd_train/train_sft_ds.py \
@ -48,5 +51,8 @@ FORCE_COLOR=1 deepspeed --hostfile hostfile \
--early_stopping_patience 5 \
--early_stopping_threshold 0.0 \
--metric_for_best_model eval_loss \
--eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
--eval_data_glob "/home/test/datasets/my_corpus/test.jsonl" \
2>&1 | tee -a "$LOG_FILE"
# Status of the first stage of the preceding pipeline (the deepspeed command
# feeding tee), not of tee itself. It must be read immediately, because the next
# command executed overwrites PIPESTATUS.
DS_RC=${PIPESTATUS[0]}
# Re-enable errexit now that the status has been captured.
set -e
# Make the launcher's status the script's own exit code.
exit "$DS_RC"