From 43111064cc724b49f4099bf91df7e5eaebbf63d9 Mon Sep 17 00:00:00 2001 From: hailin Date: Wed, 24 Sep 2025 20:25:25 +0800 Subject: [PATCH] . --- train_sft_ds.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/train_sft_ds.py b/train_sft_ds.py index 45f944d..5399433 100644 --- a/train_sft_ds.py +++ b/train_sft_ds.py @@ -853,15 +853,17 @@ def main(): # ====== 正式训练流(不做任何手动分片,交给 Accelerate/Trainer)====== - ds_stream2 = load_dataset("json", data_files={"train": files}, split="train", streaming=True)\ - .shuffle(buffer_size=50000, seed=args.seed) + # ds_stream2 = load_dataset("json", data_files={"train": files}, split="train", streaming=True)\ + # .shuffle(buffer_size=50000, seed=args.seed) + + ex_iter = endless_examples(files, args.seed, buf=50000) # 先尝试 datasets 的无限流;没有就用我们自己的无限生成器 - try: - ds_stream2 = ds_stream2.repeat() # ★ 若可用:官方无限流 - ex_iter = (ex for ex in ds_stream2) # ★ 统一用 ex_iter 作为上游 - except AttributeError: - ex_iter = endless_examples(files, args.seed, buf=50000) # ★ 兜底:自制无限流 + # try: + # ds_stream2 = ds_stream2.repeat() # ★ 若可用:官方无限流 + # ex_iter = (ex for ex in ds_stream2) # ★ 统一用 ex_iter 作为上游 + # except AttributeError: + # ex_iter = endless_examples(files, args.seed, buf=50000) # ★ 兜底:自制无限流 # 关键:这里一定要用 ex_iter,而不是重新从 ds_stream2 取一次 train_stream = QwenChatSFTDataset(ex_iter, tokenizer, seq_len=args.seq_len)