jd_train/run_ds.sh

78 lines
2.5 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Strict mode: stop on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
# [ "${DEBUG:-0}" = "1" ] && set -x

# Verbose NCCL logging for the launched job.
NCCL_DEBUG=INFO
export NCCL_DEBUG
# When running over IB/RoCE, enable the lines below to match the actual NICs
# (the values are examples):
# export NCCL_IB_HCA="mlx5_0,mlx5_1"
# export NCCL_SOCKET_IFNAME="ib0"
# Plain Ethernet:
# export NCCL_SOCKET_IFNAME="eth0"

# Resolve the directory holding this script so that later paths do not depend
# on the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# DeepSpeed config: keep a non-empty value from the environment, otherwise use
# the ZeRO-3 config that sits next to this script.
: "${DS_CONFIG:=${SCRIPT_DIR}/ds_config_zero3.json}"
# ==== Hyperparameters (local paths; override with VAR=xxx ./run_ds.sh) ====
# Every setting below keeps a non-empty value supplied through the environment
# and falls back to the default shown otherwise.
MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-/home/test/Qwen3-8B}"
# Training and evaluation files are kept separate (either may be changed to a
# glob pattern as needed).
DATA_GLOB="${DATA_GLOB:-$HOME/datasets/my_corpus/train.jsonl}"
# Uses `-` rather than `:-`: only an UNSET variable gets the default, so an
# explicitly empty value (EVAL_DATA_GLOB= ./run_ds.sh) stays empty and the
# later non-empty check skips evaluation. With `:-` the empty value was
# replaced by the default and evaluation could never be switched off.
EVAL_DATA_GLOB="${EVAL_DATA_GLOB-$HOME/datasets/my_corpus/test.jsonl}"
OUTDIR="${OUTDIR:-$HOME/checkpoints/run-qwen3-8b}"
SEQ_LEN="${SEQ_LEN:-4096}"
LR="${LR:-2e-5}"
# Gradient accumulation steps.
GAS="${GAS:-1}"
LOG_STEPS="${LOG_STEPS:-10}"
SAVE_STEPS="${SAVE_STEPS:-10}"
MAX_STEPS="${MAX_STEPS:-62}"
# Lightweight pre-flight validation. It runs only on the launching node (the
# original note adds that each rank also does its own mkdir inside the
# training script). Diagnostics go to stderr so they are not mixed into
# captured stdout.
if [[ ! -d "$MODEL_NAME_OR_PATH" ]]; then
  echo "ERR: model not found at $MODEL_NAME_OR_PATH" >&2
  exit 1
fi
if [[ ! -f "$DS_CONFIG" ]]; then
  echo "ERR: deepspeed config not found at $DS_CONFIG" >&2
  exit 1
fi
# Check on the launching node that the dataset patterns match something.
#
# Each pattern is handed to the trainer as ONE argument, so it is expanded
# here as one pattern too: IFS is emptied for the duration so the unquoted
# expansion performs filename expansion only, with no word splitting. Without
# this, a path containing whitespace was split into pieces that never matched,
# giving a spurious warning and silently disabling evaluation.
# NOTE(review): this assumes a single pattern per variable. If the trainer
# accepts several whitespace-separated patterns in one value, revisit.
shopt -s nullglob # an unmatched pattern expands to nothing, not to itself
saved_ifs=$IFS
IFS=

# Training set: a miss is only a warning, since this check sees just this node.
# shellcheck disable=SC2206  # globbing is intentional
train_matches=( $DATA_GLOB )
if [ ${#train_matches[@]} -eq 0 ]; then
  echo "WARN: no files matched by DATA_GLOB=$DATA_GLOB on this node (确保每台机器该路径下有数据)" >&2
fi

# Evaluation set: optional. An empty variable or a pattern with no match
# leaves eval_matches empty, which later means --eval_data_glob is not passed.
eval_matches=()
if [ -n "${EVAL_DATA_GLOB:-}" ]; then
  # shellcheck disable=SC2206  # globbing is intentional
  eval_matches=( $EVAL_DATA_GLOB )
  if [ ${#eval_matches[@]} -eq 0 ]; then
    echo "WARN: no files matched by EVAL_DATA_GLOB=$EVAL_DATA_GLOB on this node (将不进行评测)" >&2
  fi
fi

# Put the shell back the way it was before expanding anything else.
IFS=$saved_ifs
shopt -u nullglob

mkdir -p "${OUTDIR}"
# Build the trainer's argument list as an array so each value stays a single
# word. The evaluation flag is appended only when the evaluation pattern
# matched at least one file.
args=()

# Model, data and output locations.
args+=( --model_name_or_path "${MODEL_NAME_OR_PATH}" )
args+=( --data_glob "${DATA_GLOB}" )
args+=( --output_dir "${OUTDIR}" )

# Sequence length and optimisation settings.
args+=( --seq_len "${SEQ_LEN}" )
args+=( --learning_rate "${LR}" )
args+=( --gradient_accumulation_steps "${GAS}" )
args+=( --per_device_train_batch_size 1 --warmup_ratio 0.02 --weight_decay 0.1 )

# Run length, logging and checkpoint cadence.
args+=( --max_steps "${MAX_STEPS}" )
args+=( --log_interval "${LOG_STEPS}" )
args+=( --save_steps "${SAVE_STEPS}" )

# DeepSpeed config plus the fixed flags.
args+=( --deepspeed "${DS_CONFIG}" --gradient_checkpointing --bf16 )

if (( ${#eval_matches[@]} > 0 )); then
  args+=( --eval_data_glob "${EVAL_DATA_GLOB}" )
fi
# ==== Multi-node DeepSpeed launch ====
# The hostfile defaults to the one next to this script. Like the other
# settings it can be overridden: DS_HOSTFILE=/path/to/hostfile ./run_ds.sh
DS_HOSTFILE="${DS_HOSTFILE:-$SCRIPT_DIR/hostfile}"
if [ ! -f "$DS_HOSTFILE" ]; then
  # Warn rather than abort: the DeepSpeed launcher is documented to fall back
  # to the local machine's resources when no hostfile is found, so a missing
  # file would otherwise quietly turn a multi-node run into a single-node one.
  echo "WARN: hostfile not found at $DS_HOSTFILE; deepspeed will fall back to local resources only" >&2
fi
deepspeed --hostfile "$DS_HOSTFILE" "$SCRIPT_DIR/train_sft_ds.py" "${args[@]}"