48 lines
1.5 KiB
Bash
Executable File
48 lines
1.5 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Launch a multi-node DeepSpeed SFT run of train_sft_ds.py
# (--num_nodes 6 --num_gpus 4, ZeRO-3 config, bf16).
#
# Files read relative to the current working directory:
#   .deepspeed_env  shell-style assignments, exported into this shell
#   hostfile        node list handed to `deepspeed --hostfile`
#
# Exit status: that of `deepspeed`, or non-zero if setup fails.
set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

env_file=./.deepspeed_env

# 1) The env file is sourced below, so it has to exist; fail with a clear
#    message instead of the bare "No such file or directory" from `.`.
[[ -f "$env_file" ]] \
  || die "$env_file not found (run this script from the directory containing it)"

# The file may contain API keys: restrict it to the owner.
chmod 600 -- "$env_file"

# 2) Export every variable the env file assigns (set -a marks all
#    assignments for export while it is active).
set -a
# shellcheck source=/dev/null
. "$env_file"
set +a

# Disabled user-site setup, kept for reference:
# unset PYTHONNOUSERSITE
# USER_SITE=$(python3 -c 'import site;print(site.getusersitepackages())')
# export PATH="$HOME/.local/bin:$PATH"
# export PYTHONPATH="$USER_SITE:/home/test/jd_train:${PYTHONPATH:-}"

# Common environment for the run.
# NOTE(review): the original comment says these exports are inherited by every
# node through DeepSpeed's ssh launch. DeepSpeed documents forwarding only
# selected variables plus the entries of .deepspeed_env -- confirm that the
# exports below reach remote ranks, or move them into .deepspeed_env.

# Drop these if they were set by the caller's shell or by the env file.
unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE

# USER can be unset (containers, cron); under `set -u` a bare $USER would
# abort the script, so fall back to the effective user name.
user_name=${USER:-$(id -un)}
export TORCH_EXTENSIONS_DIR="/tmp/${user_name}/torch_ext"

# CUDA caching-allocator settings.
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"

# Thread-count limits for OpenMP, MKL and OpenBLAS.
export OMP_NUM_THREADS=8
export MKL_NUM_THREADS=8
export OPENBLAS_NUM_THREADS=8

# Arguments passed through to the training script.
train_args=(
  --model_name_or_path /home/test/Qwen3-32B
  --data_glob "/home/test/datasets/my_corpus/train.jsonl"
  --output_dir /home/test/checkpoints/q3-32b-ds4
  --seq_len 512
  --per_device_train_batch_size 1
  --gradient_accumulation_steps 1
  --learning_rate 2e-5 --weight_decay 0.1 --warmup_ratio 0.02
  --max_steps 20
  --log_interval 1
  --gradient_checkpointing
  --bf16
  --deepspeed /home/test/jd_train/ds_config_zero3.json
  --report_to wandb
  --wandb_project ds-qwen3
  --eval_steps 10
  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
)

deepspeed --hostfile hostfile \
  --num_nodes 6 --num_gpus 4 \
  /home/test/jd_train/train_sft_ds.py \
  "${train_args[@]}"