#!/usr/bin/env bash
#
# Launch multi-node DeepSpeed fine-tuning with train_sft_ds.py.
#
# Usage:
#   ./run_ds.sh
#   VAR=value ./run_ds.sh        # override any setting listed below
#
# Overridable environment variables (defaults are set further down):
#   DS_CONFIG, MODEL_NAME_OR_PATH, DATA_GLOB, EVAL_DATA_GLOB, OUTDIR,
#   SEQ_LEN, LR, GAS, LOG_STEPS, SAVE_STEPS, MAX_STEPS
#
# The hostfile and the training script are expected to sit next to this
# script (see SCRIPT_DIR).
set -euo pipefail
# Uncomment to trace every command when DEBUG=1:
# [ "${DEBUG:-0}" = "1" ] && set -x

# Print an error message to stderr and abort with exit status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print a warning message to stderr and continue.
warn() {
  printf '%s\n' "$*" >&2
}

export NCCL_DEBUG=INFO
# When running over IB/RoCE, enable the interfaces that match your NICs
# (examples):
# export NCCL_IB_HCA="mlx5_0,mlx5_1"
# export NCCL_SOCKET_IFNAME="ib0"
# Plain Ethernet:
# export NCCL_SOCKET_IFNAME="eth0"

# Resolve this script's directory so sibling files are found regardless of
# the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DS_CONFIG="${DS_CONFIG:-$SCRIPT_DIR/ds_config_zero3.json}"

# ==== Hyper-parameters (local paths; override with VAR=xxx ./run_ds.sh) ====
MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-/home/test/Qwen3-1.7B}"

# Train and eval files are kept separate (either may be a glob pattern).
DATA_GLOB="${DATA_GLOB:-$HOME/datasets/my_corpus/train.jsonl}"
EVAL_DATA_GLOB="${EVAL_DATA_GLOB:-$HOME/datasets/my_corpus/test.jsonl}"

OUTDIR="${OUTDIR:-$HOME/checkpoints/run-qwen3-1.7b}"
SEQ_LEN="${SEQ_LEN:-512}"
LR="${LR:-2e-5}"
GAS="${GAS:-1}"
LOG_STEPS="${LOG_STEPS:-10}"
SAVE_STEPS="${SAVE_STEPS:-10}"
MAX_STEPS="${MAX_STEPS:-62}"

# Lightweight sanity checks. These only run on the node that launches the job.
# NOTE(review): the original comment says each rank also creates the output
# directory inside the training script -- confirm in train_sft_ds.py.
[[ -d "$MODEL_NAME_OR_PATH" ]] \
  || die "ERR: model not found at $MODEL_NAME_OR_PATH"
[[ -f "$DS_CONFIG" ]] \
  || die "ERR: deepspeed config not found at $DS_CONFIG"

# With nullglob, a pattern that matches nothing expands to zero words instead
# of staying as the literal pattern, so the array length tells us whether any
# file exists on this node.
shopt -s nullglob

# Training set: a missing match is only a warning, not a hard failure.
# shellcheck disable=SC2206 # unquoted on purpose: the pattern must be globbed
train_matches=( $DATA_GLOB )
if (( ${#train_matches[@]} == 0 )); then
  warn "WARN: no files matched by DATA_GLOB=$DATA_GLOB on this node (确保每台机器该路径下有数据)"
fi

# Eval set: optional. If it is empty or matches nothing, --eval_data_glob is
# simply not passed to the training script.
eval_matches=()
if [[ -n "${EVAL_DATA_GLOB:-}" ]]; then
  # shellcheck disable=SC2206 # unquoted on purpose: the pattern must be globbed
  eval_matches=( $EVAL_DATA_GLOB )
  if (( ${#eval_matches[@]} == 0 )); then
    warn "WARN: no files matched by EVAL_DATA_GLOB=$EVAL_DATA_GLOB on this node (将不进行评测)"
  fi
fi

shopt -u nullglob

mkdir -p "${OUTDIR}"

# Build the training-script argument list as an array so values containing
# spaces survive intact. The glob patterns are passed through unexpanded.
args=(
  --model_name_or_path "${MODEL_NAME_OR_PATH}"
  --data_glob "${DATA_GLOB}"
  --output_dir "${OUTDIR}"
  --seq_len "${SEQ_LEN}"
  --learning_rate "${LR}"
  --gradient_accumulation_steps "${GAS}"
  --per_device_train_batch_size 1
  --warmup_ratio 0.02
  --weight_decay 0.1
  --max_steps "${MAX_STEPS}"
  --log_interval "${LOG_STEPS}"
  --save_steps "${SAVE_STEPS}"
  --deepspeed "${DS_CONFIG}"
  --gradient_checkpointing
  --bf16
)

# Only request evaluation when at least one eval file was found.
if (( ${#eval_matches[@]} > 0 )); then
  args+=( --eval_data_glob "${EVAL_DATA_GLOB}" )
fi

# ==== Multi-node DeepSpeed launch ====
deepspeed --hostfile "$SCRIPT_DIR/hostfile" \
  "$SCRIPT_DIR/train_sft_ds.py" "${args[@]}"