This commit is contained in:
parent
43111064cc
commit
2d06f0ab90
22
mm-zero3.sh
22
mm-zero3.sh
|
|
@@ -1,6 +1,13 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
# ==== 0) 日志文件 ====
|
||||||
|
LOG_DIR="/home/test/logs"
|
||||||
|
mkdir -p "$LOG_DIR"
|
||||||
|
LOG_FILE="${LOG_DIR}/train_$(date +%F_%H%M%S)_$$.log"
|
||||||
|
export PYTHONUNBUFFERED=1
|
||||||
|
echo ">> logging to ${LOG_FILE}"
|
||||||
|
|
||||||
# 1) 可选:若含 API Key,保护权限
|
# 1) 可选:若含 API Key,保护权限
|
||||||
[ -f .deepspeed_env ] && chmod 600 .deepspeed_env
|
[ -f .deepspeed_env ] && chmod 600 .deepspeed_env
|
||||||
|
|
||||||
|
|
@@ -9,12 +16,6 @@ set -a
|
||||||
. ./.deepspeed_env
|
. ./.deepspeed_env
|
||||||
set +a
|
set +a
|
||||||
|
|
||||||
# unset PYTHONNOUSERSITE
|
|
||||||
# USER_SITE=$(python3 -c 'import site;print(site.getusersitepackages())')
|
|
||||||
# export PATH="$HOME/.local/bin:$PATH"
|
|
||||||
# export PYTHONPATH="$USER_SITE:/home/test/jd_train:${PYTHONPATH:-}"
|
|
||||||
|
|
||||||
|
|
||||||
# 统一环境(会被 deepspeed 的 ssh 继承到各节点)
|
# 统一环境(会被 deepspeed 的 ssh 继承到各节点)
|
||||||
unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE
|
unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE
|
||||||
export TORCH_EXTENSIONS_DIR=/tmp/$USER/torch_ext
|
export TORCH_EXTENSIONS_DIR=/tmp/$USER/torch_ext
|
||||||
|
|
@@ -25,6 +26,8 @@ export OMP_NUM_THREADS=8
|
||||||
export MKL_NUM_THREADS=8
|
export MKL_NUM_THREADS=8
|
||||||
export OPENBLAS_NUM_THREADS=8
|
export OPENBLAS_NUM_THREADS=8
|
||||||
|
|
||||||
|
# ==== 3) 运行并同步到文件 + 控制台(保留退出码)====
|
||||||
|
set +e
|
||||||
FORCE_COLOR=1 deepspeed --hostfile hostfile \
|
FORCE_COLOR=1 deepspeed --hostfile hostfile \
|
||||||
--num_nodes 6 --num_gpus 4 \
|
--num_nodes 6 --num_gpus 4 \
|
||||||
/home/test/jd_train/train_sft_ds.py \
|
/home/test/jd_train/train_sft_ds.py \
|
||||||
|
|
@@ -48,5 +51,8 @@ FORCE_COLOR=1 deepspeed --hostfile hostfile \
|
||||||
--early_stopping_patience 5 \
|
--early_stopping_patience 5 \
|
||||||
--early_stopping_threshold 0.0 \
|
--early_stopping_threshold 0.0 \
|
||||||
--metric_for_best_model eval_loss \
|
--metric_for_best_model eval_loss \
|
||||||
--eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
|
--eval_data_glob "/home/test/datasets/my_corpus/test.jsonl" \
|
||||||
|
2>&1 | tee -a "$LOG_FILE"
|
||||||
|
DS_RC=${PIPESTATUS[0]}
|
||||||
|
set -e
|
||||||
|
exit "$DS_RC"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue