parent f82c5831ee
commit 56a60e0e5b

run_ds.sh (26 changed lines)
run_ds.sh

@@ -1,6 +1,9 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
+# Optional: debug output
+# [ "${DEBUG:-0}" = "1" ] && set -x
+
 export NCCL_DEBUG=INFO
 # If you use IB/RoCE, enable it for the actual NICs:
 # export NCCL_IB_HCA="mlx5_0,mlx5_1"
@@ -8,10 +11,14 @@ export NCCL_DEBUG=INFO
 # Plain Ethernet:
 # export NCCL_SOCKET_IFNAME="eth0"
 
+# Resolve the script directory to avoid relative-path issues
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DS_CONFIG="${DS_CONFIG:-$SCRIPT_DIR/ds_config_zero3.json}"
+
 # ==== Hyperparameters (local paths; override with VAR=xxx ./launch_ds.sh) ====
 MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-/home/test/Qwen3-8B}"
-DATA_GLOB="${DATA_GLOB:-/data/datasets/my_corpus/*.jsonl}"    # same path on every machine
-OUTDIR="${OUTDIR:-/data/checkpoints/run-qwen3-8b}"            # each machine writes its own local output
+DATA_GLOB="${DATA_GLOB:-$HOME/datasets/my_corpus/*.jsonl}"    # same path on every machine
+OUTDIR="${OUTDIR:-$HOME/checkpoints/run-qwen3-8b}"            # each machine writes its own local output
 SEQ_LEN="${SEQ_LEN:-4096}"
 LR="${LR:-2e-5}"
 GAS="${GAS:-64}"
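As the comment above says, every hyperparameter is an environment variable with a default, so a run can be reconfigured without editing the script. A minimal usage sketch (the model path and step count here are placeholders, not values from this repo):

# Hypothetical one-off override before launching
MODEL_NAME_OR_PATH=/models/Qwen3-8B \
DATA_GLOB="$HOME/datasets/my_corpus/*.jsonl" \
MAX_STEPS=2000 \
./run_ds.sh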
@@ -19,10 +26,21 @@ LOG_STEPS="${LOG_STEPS:-10}"
 SAVE_STEPS="${SAVE_STEPS:-500}"
 MAX_STEPS="${MAX_STEPS:-10000}"
 
+# Lightweight validation (launching node only; remote ranks also mkdir in their own scripts)
+[ -d "$MODEL_NAME_OR_PATH" ] || { echo "ERR: model not found at $MODEL_NAME_OR_PATH"; exit 1; }
+[ -f "$DS_CONFIG" ] || { echo "ERR: deepspeed config not found at $DS_CONFIG"; exit 1; }
+# DATA_GLOB is a wildcard; check only once here (launching node) that it matches at least one file; every node must ensure the same path exists locally
+shopt -s nullglob
+matches=( $DATA_GLOB )
+if [ ${#matches[@]} -eq 0 ]; then
+  echo "WARN: no files matched by DATA_GLOB=$DATA_GLOB on this node (make sure every machine has data under this path)"
+fi
+shopt -u nullglob
+
 mkdir -p "${OUTDIR}"
 
 # ==== Multi-node DeepSpeed ====
-deepspeed --hostfile hostfile train_sft_ds.py \
+deepspeed --hostfile "$SCRIPT_DIR/hostfile" "$SCRIPT_DIR/train_sft_ds.py" \
   --model_name_or_path "${MODEL_NAME_OR_PATH}" \
   --data_glob "${DATA_GLOB}" \
   --output_dir "${OUTDIR}" \
@@ -35,6 +53,6 @@ deepspeed --hostfile hostfile train_sft_ds.py \
   --max_steps "${MAX_STEPS}" \
   --log_interval "${LOG_STEPS}" \
   --save_steps "${SAVE_STEPS}" \
-  --deepspeed ds_config_zero3.json \
+  --deepspeed "${DS_CONFIG}" \
   --gradient_checkpointing \
   --bf16
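The launch line above expects a DeepSpeed hostfile next to the script ($SCRIPT_DIR/hostfile), one node per line with its GPU slot count. A minimal sketch with placeholder hostnames:

# Hypothetical hostfile: two nodes with 8 GPUs each (hostnames are placeholders)
cat > "$SCRIPT_DIR/hostfile" <<'EOF'
node-a slots=8
node-b slots=8
EOF

DeepSpeed then starts the workers over passwordless SSH, so each listed node needs the script, model, and data at the same local paths, which is exactly what the per-node checks in this commit assume.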
train_sft_ds.py

@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import os
 import glob
+import socket
 import argparse
 from typing import Dict, List, Iterable, Iterator
@@ -131,11 +132,21 @@ def main():
     if args.gradient_checkpointing:
         model.gradient_checkpointing_enable()
 
-    # Local data (each machine reads from the same local path)
+    # ===== Data robustness checks (run independently on every node) =====
+    host = socket.gethostname()
+    rank = int(os.environ.get("RANK", "0"))
+
     files = sorted(glob.glob(args.data_glob))
     if len(files) == 0:
-        raise FileNotFoundError(f"No files match: {args.data_glob}")
+        raise FileNotFoundError(
+            f"[host={host} rank={rank}] No files matched DATA_GLOB={args.data_glob}\n"
+            "Every machine must place the data at the same local path; "
+            "override via DATA_GLOB=<your_glob> ./launch_ds.sh."
+        )
+    if is_main_process():
+        print(f"[data] matched {len(files)} files on host={host}, example[0]={files[0]}", flush=True)
 
+    # Streaming, line-by-line read; the field name is 'text'
     dataset_iter = load_dataset(
         "json",
         data_files={"train": files},
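The checks above assume JSON-lines input where each line is an object whose training text sits in a 'text' field. A sketch of a matching corpus file under the default DATA_GLOB (the contents are illustrative):

# Hypothetical sample data file matching $HOME/datasets/my_corpus/*.jsonl
mkdir -p "$HOME/datasets/my_corpus"
cat > "$HOME/datasets/my_corpus/sample.jsonl" <<'EOF'
{"text": "First training document ..."}
{"text": "Second training document ..."}
EOF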
@@ -149,38 +160,59 @@ def main():
         if isinstance(txt, str) and len(txt.strip()) > 0:
             yield txt
 
-    train_stream = ConstantLengthDataset(texts_iter=text_iter(), tokenizer=tokenizer, seq_len=args.seq_len)
+    # Build the stream once as a non-empty probe
+    train_stream_probe = ConstantLengthDataset(texts_iter=text_iter(), tokenizer=tokenizer, seq_len=args.seq_len)
+    _probe = iter(train_stream_probe)
+    try:
+        _ = next(_probe)  # pull one chunk to make sure training samples can actually be produced
+    except StopIteration:
+        raise RuntimeError(
+            f"[host={host} rank={rank}] Data files were matched, but no trainable samples were produced.\n"
+            "Common causes: the jsonl lacks a 'text' field, all contents are empty/blank lines, or --seq_len is too large.\n"
+            "Inspect a few sample lines, or retry with a smaller --seq_len."
+        )
+
+    # The probe consumed the stream; rebuild a clean training stream
+    dataset_iter2 = load_dataset("json", data_files={"train": files}, split="train", streaming=True)
+    def text_iter2():
+        for ex in dataset_iter2:
+            txt = ex.get("text", None)
+            if isinstance(txt, str) and len(txt.strip()) > 0:
+                yield txt
+    train_stream = ConstantLengthDataset(texts_iter=text_iter2(), tokenizer=tokenizer, seq_len=args.seq_len)
 
+    # Optional eval (sampled from the head of the stream)
     eval_dataset = None
     if args.eval_ratio and args.eval_ratio > 0:
-        # Simple eval: take a few batches from the head of the training stream (disable if not needed)
         desired_eval_batches = 200
         gen = iter(train_stream)
         eval_samples = []
         for _ in range(desired_eval_batches):
-            try: eval_samples.append(next(gen))
-            except StopIteration: break
+            try:
+                eval_samples.append(next(gen))
+            except StopIteration:
+                break
         class ListDataset(torch.utils.data.Dataset):
             def __init__(self, items): self.items = items
             def __len__(self): return len(self.items)
             def __getitem__(self, idx): return self.items[idx]
         eval_dataset = ListDataset(eval_samples)
-        # Rebuild the training stream
-        dataset_iter2 = load_dataset("json", data_files={"train": files}, split="train", streaming=True)
-        def text_iter2():
-            for ex in dataset_iter2:
+        # Rebuild the training stream after sampling so its head is not consumed
+        dataset_iter3 = load_dataset("json", data_files={"train": files}, split="train", streaming=True)
+        def text_iter3():
+            for ex in dataset_iter3:
                 txt = ex.get("text", None)
                 if isinstance(txt, str) and len(txt.strip()) > 0:
                     yield txt
-        train_stream = ConstantLengthDataset(texts_iter=text_iter2(), tokenizer=tokenizer, seq_len=args.seq_len)
+        train_stream = ConstantLengthDataset(texts_iter=text_iter3(), tokenizer=tokenizer, seq_len=args.seq_len)
 
     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
 
     os.makedirs(args.output_dir, exist_ok=True)
     logging_dir = os.path.join(args.output_dir, "logs")
 
-    # Key point: with no shared disk, the HF/DS sharded checkpoints are written under each rank's local output_dir
-    # On resume, each rank reads its own shard from the same local path (just keep the same world_size)
+    # No shared disk: each rank writes its own shard under its own local output_dir
     training_args = TrainingArguments(
         output_dir=args.output_dir,
         logging_dir=logging_dir,
@@ -208,7 +240,7 @@ def main():
         gradient_checkpointing=args.gradient_checkpointing,
         remove_unused_columns=False,
         torch_compile=False,
-        save_on_each_node=False  # keep the default; DeepSpeed's sharded checkpoints are written locally by each rank anyway
+        save_on_each_node=False
     )
 
     trainer = Trainer(
@@ -226,10 +258,10 @@ def main():
                    and any(n.startswith("checkpoint-") for n in os.listdir(args.output_dir)))
     resume_flag = True if ckpt_exists else None
 
-    print_once(f"Resume = {resume_flag is True}")
+    print_once(f"[host={host}] Resume = {resume_flag is True}")
     print_once("***** Starting training *****")
     train_result = trainer.train(resume_from_checkpoint=resume_flag)
-    trainer.save_model()  # with the gather option in the DS config, the full 16-bit model is gathered and saved on global rank0
+    trainer.save_model()  # with stage3_gather_16bit_weights_on_model_save=true in the DS config, the full model is gathered and saved only on global rank 0
 
     metrics = train_result.metrics
     trainer.log_metrics("train", metrics)
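The save_model() comment relies on the gather flag being set in ds_config_zero3.json. A minimal ZeRO-3 config along those lines; only the gather flag is confirmed by this diff, the remaining keys are common defaults assumed for illustration and may differ from the actual file:

# Hypothetical ds_config_zero3.json sketch (the real file may set more options)
cat > "$SCRIPT_DIR/ds_config_zero3.json" <<'EOF'
{
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "bf16": { "enabled": true },
  "zero_optimization": {
    "stage": 3,
    "stage3_gather_16bit_weights_on_model_save": true
  }
}
EOF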