#!/usr/bin/env bash
# Launch multi-node SFT training of Qwen3-32B via deepspeed, mirroring all
# output to a timestamped log file while preserving the launcher's exit code.
#
# Required files (relative to CWD): .deepspeed_env, hostfile
set -euo pipefail

# ==== 0) Log file ====
LOG_DIR="/home/test/logs"
mkdir -p "$LOG_DIR"
# PID suffix ($$) keeps concurrent runs from clobbering each other's log.
LOG_FILE="${LOG_DIR}/train_$(date +%F_%H%M%S)_$$.log"
export PYTHONUNBUFFERED=1   # line-buffered Python output so tee streams live
echo ">> logging to ${LOG_FILE}"

# ==== 1) Shared environment file (inherited by every rank) ====
# Fail loudly if it is missing: under `set -e` an unconditional `source`
# would abort here with no explanation. It may contain API keys, so tighten
# permissions before exporting its contents.
if [ ! -f .deepspeed_env ]; then
  echo "ERROR: .deepspeed_env not found in $(pwd)" >&2
  exit 1
fi
chmod 600 .deepspeed_env
set -a          # auto-export everything the file assigns
. ./.deepspeed_env
set +a

# ==== 2) Unified environment (propagated to all nodes via deepspeed's ssh) ====
unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE
export TORCH_EXTENSIONS_DIR="/tmp/${USER}/torch_ext"
mkdir -p "$TORCH_EXTENSIONS_DIR"   # fix: JIT-extension cache dir must exist
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"
export OMP_NUM_THREADS=8
export MKL_NUM_THREADS=8
export OPENBLAS_NUM_THREADS=8

# ==== 3) Run; tee to file + console, keep deepspeed's own exit code ====
# `set +e` so a training failure reaches the PIPESTATUS capture below
# instead of aborting the pipeline mid-flight.
set +e
FORCE_COLOR=1 deepspeed --hostfile hostfile \
  --num_nodes 6 --num_gpus 4 \
  /home/test/jd_train/train_sft_ds.py \
  --model_name_or_path /home/test/Qwen3-32B \
  --data_glob "/home/test/datasets/my_corpus/train.jsonl" \
  --output_dir /home/test/checkpoints/q3-32b-ds4 \
  --seq_len 512 \
  --per_device_train_batch_size 1 \
  --gradient_accumulation_steps 1 \
  --learning_rate 2e-5 --weight_decay 0.1 --warmup_ratio 0.02 \
  --max_steps 300 \
  --log_interval 1 \
  --gradient_checkpointing \
  --bf16 \
  --deepspeed /home/test/jd_train/ds_config_zero3.json \
  --report_to wandb \
  --wandb_project ds-qwen3 \
  --eval_steps 10 \
  --save_steps 10 \
  --load_best_model_at_end \
  --early_stopping_patience 5 \
  --early_stopping_threshold 0.0 \
  --metric_for_best_model eval_loss \
  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl" \
  2>&1 | tee -a "$LOG_FILE"
# PIPESTATUS[0] = deepspeed's status (not tee's), so CI sees the real result.
DS_RC=${PIPESTATUS[0]}
set -e
exit "$DS_RC"