#!/usr/bin/env bash
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail
# Uncomment to trace every command when DEBUG=1:
# [ "${DEBUG:-0}" = "1" ] && set -x

# NCCL log level. Defaults to INFO; like the other settings in this script it
# can be overridden from the caller's environment (e.g. NCCL_DEBUG=WARN).
export NCCL_DEBUG="${NCCL_DEBUG:-INFO}"
# When running over IB/RoCE, enable the settings matching the real NICs (example):
# export NCCL_IB_HCA="mlx5_0,mlx5_1"
# export NCCL_SOCKET_IFNAME="ib0"
# Plain Ethernet:
# export NCCL_SOCKET_IFNAME="eth0"

# Resolve the directory that contains this script so the paths derived from it
# do not depend on the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# DeepSpeed config file; defaults to ds_config_zero3.json next to this script.
DS_CONFIG="${DS_CONFIG:-$SCRIPT_DIR/ds_config_zero3.json}"

# ==== Hyperparameters (local paths; override with VAR=xxx ./run_ds.sh) ====
MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-/home/test/Qwen3-1.7B}"

# Training and evaluation files are kept separate (either may be replaced by a
# glob pattern as needed).
DATA_GLOB="${DATA_GLOB:-$HOME/datasets/my_corpus/train.jsonl}"
# Uses "-" rather than ":-" on purpose: the default applies only when the
# variable is unset, so `EVAL_DATA_GLOB= ./run_ds.sh` yields an empty value
# instead of silently falling back to the default path.
EVAL_DATA_GLOB="${EVAL_DATA_GLOB-$HOME/datasets/my_corpus/test.jsonl}"

OUTDIR="${OUTDIR:-$HOME/checkpoints/run-qwen3-1.7b}"
SEQ_LEN="${SEQ_LEN:-512}"
LR="${LR:-2e-5}"
GAS="${GAS:-1}"                  # gradient accumulation steps
LOG_STEPS="${LOG_STEPS:-10}"
SAVE_STEPS="${SAVE_STEPS:-10}"
MAX_STEPS="${MAX_STEPS:-62}"

# Lightweight sanity checks. They run only on the node that launches the job
# (original author's note: each rank also creates its directories inside the
# training script). Diagnostics go to stderr so they are not mixed into
# captured stdout.
if [[ ! -d "$MODEL_NAME_OR_PATH" ]]; then
  echo "ERR: model not found at $MODEL_NAME_OR_PATH" >&2
  exit 1
fi
if [[ ! -f "$DS_CONFIG" ]]; then
  echo "ERR: deepspeed config not found at $DS_CONFIG" >&2
  exit 1
fi

#######################################
# Print, one per line, the existing paths matched by a glob pattern.
# A pattern without wildcard characters is treated as a literal path and is
# printed only if that path exists, so a missing plain file counts as
# "no match" (nullglob alone would leave such a word untouched).
# Arguments: $1 - glob pattern or literal path (may contain spaces)
# Outputs:   matching paths on stdout, one per line
# Returns:   0 always
#######################################
expand_existing_paths() {
  local pattern=$1 candidate
  # Empty IFS disables word splitting, so the unquoted expansion below performs
  # pathname expansion only and paths containing spaces stay intact.
  local IFS=''
  # shellcheck disable=SC2086 # unquoted on purpose: we want glob expansion
  for candidate in $pattern; do
    # An unmatched pattern is left as the literal pattern text; -e drops it.
    if [[ -e "$candidate" ]]; then
      printf '%s\n' "$candidate"
    fi
  done
  return 0
}

# Check that the training set matches at least one file on this node.
mapfile -t train_matches < <(expand_existing_paths "${DATA_GLOB:-}")
if [[ ${#train_matches[@]} -eq 0 ]]; then
  echo "WARN: no files matched by DATA_GLOB=${DATA_GLOB:-} on this node (确保每台机器该路径下有数据)" >&2
fi

# Check the evaluation set. It may be empty; when nothing matches, eval_matches
# stays empty so that --eval_data_glob is not passed to the trainer.
eval_matches=()
if [[ -n "${EVAL_DATA_GLOB:-}" ]]; then
  mapfile -t eval_matches < <(expand_existing_paths "${EVAL_DATA_GLOB}")
  if [[ ${#eval_matches[@]} -eq 0 ]]; then
    echo "WARN: no files matched by EVAL_DATA_GLOB=$EVAL_DATA_GLOB on this node (将不进行评测)" >&2
  fi
fi

# Make sure the output directory exists on the launching node.
mkdir -p -- "${OUTDIR}"

# Assemble the trainer's command-line arguments as an array so that values
# containing spaces are passed through as single words.
args=(
  --model_name_or_path "${MODEL_NAME_OR_PATH}"
  --data_glob "${DATA_GLOB}"
  --output_dir "${OUTDIR}"
  --seq_len "${SEQ_LEN}"
  --learning_rate "${LR}"
  --gradient_accumulation_steps "${GAS}"
  --per_device_train_batch_size 1
  --warmup_ratio 0.02
  --weight_decay 0.1
  --max_steps "${MAX_STEPS}"
  --log_interval "${LOG_STEPS}"
  --save_steps "${SAVE_STEPS}"
  --deepspeed "${DS_CONFIG}"
  --gradient_checkpointing
  --bf16
)
# Pass the evaluation glob only when the earlier check found evaluation files.
# The pattern itself (not the expanded list) is forwarded to the trainer.
if [[ ${#eval_matches[@]} -gt 0 ]]; then
  args+=(--eval_data_glob "${EVAL_DATA_GLOB}")
fi

# ==== Multi-node DeepSpeed launch ====
# The hostfile and the training script are looked up next to this script.
deepspeed --hostfile "$SCRIPT_DIR/hostfile" "$SCRIPT_DIR/train_sft_ds.py" "${args[@]}"