#!/usr/bin/env bash
#
# launch_ds.sh - launch train_sft_ds.py through the DeepSpeed launcher using
# the hosts listed in ./hostfile.
#
# Usage:
#   ./launch_ds.sh
#   VAR=value ./launch_ds.sh      # override any variable listed below
#
# Overridable environment variables (defaults are in the assignments below):
#   MODEL_NAME_OR_PATH  DATA_GLOB  OUTDIR  SEQ_LEN  LR  GAS
#   LOG_STEPS  SAVE_STEPS  MAX_STEPS
#
# `hostfile`, `train_sft_ds.py` and `ds_config_zero3.json` are passed as bare
# relative names, so run this script from the directory that contains them.
set -euo pipefail

export NCCL_DEBUG=INFO
# If running over IB/RoCE, enable these to match the actual NICs:
# export NCCL_IB_HCA="mlx5_0,mlx5_1"
# export NCCL_SOCKET_IFNAME="ib0"
# Ethernet only:
# export NCCL_SOCKET_IFNAME="eth0"

# ==== Hyperparameters (local paths; override with VAR=xxx ./launch_ds.sh) ====
MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-/home/test/Qwen3-8B}"
# Place the data at the same path on every machine.
DATA_GLOB="${DATA_GLOB:-/data/datasets/my_corpus/*.jsonl}"
# Each machine writes to its own local output directory.
OUTDIR="${OUTDIR:-/data/checkpoints/run-qwen3-8b}"
SEQ_LEN="${SEQ_LEN:-4096}"
LR="${LR:-2e-5}"
GAS="${GAS:-64}" # forwarded as --gradient_accumulation_steps
LOG_STEPS="${LOG_STEPS:-10}"
SAVE_STEPS="${SAVE_STEPS:-500}"
MAX_STEPS="${MAX_STEPS:-10000}"

mkdir -p "${OUTDIR}"

# ==== Multi-node DeepSpeed ====
# Arguments for train_sft_ds.py. DATA_GLOB stays quoted so the pattern is
# handed to the training script unexpanded rather than globbed by this shell.
train_args=(
  --model_name_or_path "${MODEL_NAME_OR_PATH}"
  --data_glob "${DATA_GLOB}"
  --output_dir "${OUTDIR}"
  --seq_len "${SEQ_LEN}"
  --learning_rate "${LR}"
  --gradient_accumulation_steps "${GAS}"
  --per_device_train_batch_size 1
  --warmup_ratio 0.02
  --weight_decay 0.1
  --max_steps "${MAX_STEPS}"
  --log_interval "${LOG_STEPS}"
  --save_steps "${SAVE_STEPS}"
  --deepspeed ds_config_zero3.json
  --gradient_checkpointing
  --bf16
)

deepspeed --hostfile hostfile train_sft_ds.py "${train_args[@]}"