mm-zero3.sh: log each training run to a timestamped file

* Create /home/test/logs and build a per-run log path of the form
  train_<date +%F_%H%M%S>_<shell PID>.log inside it.
* Export PYTHONUNBUFFERED=1.
* Send the deepspeed command's stdout and stderr through `tee -a` so the
  output goes to both the console and the log file. The script runs under
  `set -euo pipefail`, so the pipeline is bracketed with `set +e` / `set -e`
  and the script exits with deepspeed's own status, read from PIPESTATUS[0].
* Remove the commented-out user-site / PATH / PYTHONPATH setup lines.

English glosses for the non-English comments carried in the hunks (the
hunk text itself is left untouched so context lines still match the file):
  "0) 日志文件"                      -> "0) log file"
  "1) 可选:若含 API Key,保护权限"   -> "1) optional: if it holds an API key,
                                         restrict its permissions"
  "统一环境(...)"                   -> "common environment (inherited by every
                                         node through deepspeed's ssh)"
  "3) 运行并同步到文件 + 控制台(...)" -> "3) run and mirror output to file +
                                         console (keep the exit code)"

NOTE(review): this patch was recovered from a copy whose line breaks and
runs of whitespace had been collapsed. Line boundaries were rebuilt from the
hunk header counts; the indentation of the deepspeed continuation arguments
is assumed to be two spaces -- confirm against the target file, or apply
with `git apply --ignore-whitespace`.

diff --git a/mm-zero3.sh b/mm-zero3.sh
index a0ae104..f3be15d 100755
--- a/mm-zero3.sh
+++ b/mm-zero3.sh
@@ -1,6 +1,13 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
+# ==== 0) 日志文件 ====
+LOG_DIR="/home/test/logs"
+mkdir -p "$LOG_DIR"
+LOG_FILE="${LOG_DIR}/train_$(date +%F_%H%M%S)_$$.log"
+export PYTHONUNBUFFERED=1
+echo ">> logging to ${LOG_FILE}"
+
 # 1) 可选:若含 API Key,保护权限
 [ -f .deepspeed_env ] && chmod 600 .deepspeed_env
 
@@ -9,12 +16,6 @@ set -a
 . ./.deepspeed_env
 set +a
 
-# unset PYTHONNOUSERSITE
-# USER_SITE=$(python3 -c 'import site;print(site.getusersitepackages())')
-# export PATH="$HOME/.local/bin:$PATH"
-# export PYTHONPATH="$USER_SITE:/home/test/jd_train:${PYTHONPATH:-}"
-
-
 # 统一环境(会被 deepspeed 的 ssh 继承到各节点)
 unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE
 export TORCH_EXTENSIONS_DIR=/tmp/$USER/torch_ext
@@ -25,6 +26,8 @@ export OMP_NUM_THREADS=8
 export MKL_NUM_THREADS=8
 export OPENBLAS_NUM_THREADS=8
 
+# ==== 3) 运行并同步到文件 + 控制台(保留退出码)====
+set +e
 FORCE_COLOR=1 deepspeed --hostfile hostfile \
   --num_nodes 6 --num_gpus 4 \
   /home/test/jd_train/train_sft_ds.py \
@@ -48,5 +51,8 @@ FORCE_COLOR=1 deepspeed --hostfile hostfile \
   --early_stopping_patience 5 \
   --early_stopping_threshold 0.0 \
   --metric_for_best_model eval_loss \
-  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
-
+  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl" \
+  2>&1 | tee -a "$LOG_FILE"
+DS_RC=${PIPESTATUS[0]}
+set -e
+exit "$DS_RC"