.

2025-08-29 22:06:04 +08:00 · 2025-08-29 22:06:04 +08:00 · 35f5c85446
parent a87f1212cc
commit 35f5c85446
3 changed files with 41 additions and 21 deletions
--- a/ds_config_zero3.json
+++ b/ds_config_zero3.json
@@ -17,6 +17,7 @@
    "stage3_gather_16bit_weights_on_model_save": false,
    "offload_param":     { "device": "cpu", "pin_memory": true }
  },
  "wall_clock_breakdown": false
--- a/mm-zero3.sh
+++ b/mm-zero3.sh
@@ -1,3 +1,5 @@
 export DS_BUILD_OPS=0
 export DS_SKIP_CUDA_BUILD=1
 export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"
 deepspeed --hostfile hostfile \
--- a/train_sft_ds.py
+++ b/train_sft_ds.py
@@ -24,6 +24,8 @@ from transformers import (
 )
 from transformers.trainer_callback import TrainerCallback
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.deepspeed import HfDeepSpeedConfig
 # ----------------- 进程工具 -----------------
 def is_main_process():
@@ -483,32 +485,42 @@ def main():
    dtype = (torch.bfloat16 if use_bf16 else
            (torch.float16 if torch.cuda.is_available() else torch.float32))
    dschf = None
    if args.deepspeed and os.path.isfile(args.deepspeed):
        dschf = HfDeepSpeedConfig(args.deepspeed)   # ← 关键：提前启用插件
        dbg("HfDeepSpeedConfig loaded")
    # try:
    #     import deepspeed
    #     zero_init_ctx = deepspeed.zero.Init(
    #         remote_device="cpu",  # 参数最终托管在 CPU（可结合 offload）
    #         device="cpu",         # ← 关键：不要用 meta
    #         pin_memory=True,
    #         dtype=dtype,
    #         config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
    #     )
    # except Exception:
    #     zero_init_ctx = nullcontext()  # 没装 DS 时也能单机跑
-    try:
+    # with zero_init_ctx:
-        import deepspeed
+    #     model = AutoModelForCausalLM.from_pretrained(
-        zero_init_ctx = deepspeed.zero.Init(
+    #         args.model_name_or_path,
-            remote_device="cpu",  # 参数最终托管在 CPU（可结合 offload）
+    #         torch_dtype=dtype,
-            device="cpu",         # ← 关键：不要用 meta
+    #         low_cpu_mem_usage=False,
-            pin_memory=True,
+    #         trust_remote_code=True,
-            dtype=dtype,
+    #         attn_implementation="sdpa"
-            config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
+    #     )
        )
    except Exception:
        zero_init_ctx = nullcontext()  # 没装 DS 时也能单机跑
    with zero_init_ctx:
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name_or_path,
            torch_dtype=dtype,
            low_cpu_mem_usage=False,
            trust_remote_code=True,
            attn_implementation="sdpa"
        )
    # 交给插件做 ZeRO-Init/分片加载
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
        attn_implementation="sdpa",
    )
    # model = AutoModelForCausalLM.from_pretrained(
@@ -853,6 +865,11 @@ def main():
    print_once(f"[resume] final = {resume_flag if resume_flag else 'None (fresh start)'}")
    print_once("***** Starting training *****")
    dbg(f"allocated={torch.cuda.memory_allocated()/1024**2:.1f} MB, "
        f"reserved={torch.cuda.memory_reserved()/1024**2:.1f} MB")
    train_result = trainer.train(resume_from_checkpoint=resume_flag)
    trainer.save_model()  # DeepSpeed stage3_gather_16bit_weights_on_model_save=true 时，在 rank0 聚合整模型