#!/usr/bin/env bash
#
# Launch the DeepSpeed SFT job for Qwen3-32B across 6 nodes x 4 GPUs.
#
# Usage: run from the directory that contains both `hostfile` and
# `.deepspeed_env`; both are resolved relative to the current directory.
#
# Required files:
#   ./.deepspeed_env  KEY=VALUE lines; sourced here with auto-export.
#   ./hostfile        DeepSpeed hostfile passed via --hostfile.
set -euo pipefail

# Print an error message to stderr and abort with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

main() {
  local -r env_file=./.deepspeed_env

  # The env file is sourced unconditionally below, so fail early with a
  # readable message instead of an opaque "No such file" from `.`.
  [[ -f "$env_file" ]] || die "missing ${env_file} in $(pwd)"

  # 1) The file may contain an API key: restrict it to the owner.
  chmod 600 "$env_file"

  # 2) Load the shared environment (meant to be inherited by every rank).
  #    `set -a` auto-exports each variable the file assigns.
  set -a
  # shellcheck disable=SC1090,SC1091 -- runtime file, not available to lint
  . "$env_file"
  set +a

  # Kept for reference (disabled user-site setup):
  # unset PYTHONNOUSERSITE
  # USER_SITE=$(python3 -c 'import site;print(site.getusersitepackages())')
  # export PATH="$HOME/.local/bin:$PATH"
  # export PYTHONPATH="$USER_SITE:/home/test/jd_train:${PYTHONPATH:-}"

  # Shared environment for the run.
  # NOTE(review): the original comment states these are inherited by all
  # nodes through DeepSpeed's ssh launch. DeepSpeed appears to forward only
  # selected variables plus the entries of .deepspeed_env -- confirm that
  # the exports below actually reach remote ranks, or move them to the file.
  unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE

  # USER is not guaranteed to be exported (containers, cron); under
  # `set -u` a bare $USER would abort the script, so fall back to id(1).
  local user_name
  user_name=${USER:-$(id -un)}
  export TORCH_EXTENSIONS_DIR="/tmp/${user_name}/torch_ext"

  export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"

  # Cap CPU math-library thread pools per process.
  export OMP_NUM_THREADS=8
  export MKL_NUM_THREADS=8
  export OPENBLAS_NUM_THREADS=8

  # Launcher options (consumed by `deepspeed` itself).
  local -a launcher_args=(
    --hostfile hostfile
    --num_nodes 6
    --num_gpus 4
  )

  # Training-script options (consumed by train_sft_ds.py).
  local -a train_args=(
    --model_name_or_path /home/test/Qwen3-32B
    --data_glob "/home/test/datasets/my_corpus/train.jsonl"
    --output_dir /home/test/checkpoints/q3-32b-ds4
    --seq_len 512
    --per_device_train_batch_size 1
    --gradient_accumulation_steps 1
    --learning_rate 2e-5
    --weight_decay 0.1
    --warmup_ratio 0.02
    --max_steps 20
    --log_interval 1
    --gradient_checkpointing
    --bf16
    --deepspeed /home/test/jd_train/ds_config_zero3.json
    --report_to wandb
    --eval_steps 10
    --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
  )

  deepspeed "${launcher_args[@]}" \
    /home/test/jd_train/train_sft_ds.py \
    "${train_args[@]}"
}

main "$@"