#!/usr/bin/env bash
#
# Launch a multi-node DeepSpeed run of train_sft_ds.py.
#
# Usage:
#   ./launch_ds.sh
#   VAR=xxx ./launch_ds.sh      # override any of the settings below
#
# `hostfile`, `train_sft_ds.py` and `ds_config_zero3.json` are relative
# paths, so run this script from the directory that contains them.
set -euo pipefail

# Verbose NCCL logging, useful when diagnosing inter-node communication.
export NCCL_DEBUG=INFO
# When using IB/RoCE, enable these to match the actual NICs on your hosts:
# export NCCL_IB_HCA="mlx5_0,mlx5_1"
# export NCCL_SOCKET_IFNAME="ib0"
# Plain Ethernet:
# export NCCL_SOCKET_IFNAME="eth0"

# ==== Hyperparameters (local paths; override with VAR=xxx ./launch_ds.sh) ====
MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-/home/test/Qwen3-8B}"
DATA_GLOB="${DATA_GLOB:-/data/datasets/my_corpus/*.jsonl}" # must exist at the same path on every machine
OUTDIR="${OUTDIR:-/data/checkpoints/run-qwen3-8b}"         # each machine writes to its own local copy of this path
SEQ_LEN="${SEQ_LEN:-4096}"
LR="${LR:-2e-5}"
GAS="${GAS:-64}"
LOG_STEPS="${LOG_STEPS:-10}"
SAVE_STEPS="${SAVE_STEPS:-500}"
MAX_STEPS="${MAX_STEPS:-10000}"

# NOTE(review): this creates the directory on the launching node only;
# presumably the training script creates it on the other nodes - confirm.
mkdir -p "${OUTDIR}"

# ==== Multi-node DeepSpeed ====
# DATA_GLOB is quoted on purpose: the pattern is passed through literally
# instead of being expanded by this shell.
deepspeed --hostfile hostfile train_sft_ds.py \
  --model_name_or_path "${MODEL_NAME_OR_PATH}" \
  --data_glob "${DATA_GLOB}" \
  --output_dir "${OUTDIR}" \
  --seq_len "${SEQ_LEN}" \
  --learning_rate "${LR}" \
  --gradient_accumulation_steps "${GAS}" \
  --per_device_train_batch_size 1 \
  --warmup_ratio 0.02 \
  --weight_decay 0.1 \
  --max_steps "${MAX_STEPS}" \
  --log_interval "${LOG_STEPS}" \
  --save_steps "${SAVE_STEPS}" \
  --deepspeed ds_config_zero3.json \
  --gradient_checkpointing \
  --bf16