.

2025-08-29 22:06:04 +08:00 · 2025-08-29 22:06:04 +08:00 · 35f5c85446
parent a87f1212cc
commit 35f5c85446
3 changed files with 41 additions and 21 deletions
--- a/ds_config_zero3.json
+++ b/ds_config_zero3.json
@ -17,6 +17,7 @@
    "stage3_gather_16bit_weights_on_model_save": false,
  
    "offload_param":     { "device": "cpu", "pin_memory": true }
+    

  },
  "wall_clock_breakdown": false
--- a/mm-zero3.sh
+++ b/mm-zero3.sh
@ -1,3 +1,5 @@
+export DS_BUILD_OPS=0  # NOTE(review): DS_BUILD_OPS is normally consumed when DeepSpeed is installed - confirm it has an effect at launch time
+export DS_SKIP_CUDA_BUILD=1  # NOTE(review): confirm DeepSpeed actually reads this variable (DS_SKIP_CUDA_CHECK is the documented one)
 export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"

 deepspeed --hostfile hostfile \
--- a/train_sft_ds.py
+++ b/train_sft_ds.py
@ -24,6 +24,8 @@ from transformers import (
 )
 from transformers.trainer_callback import TrainerCallback
 from transformers.trainer_utils import get_last_checkpoint
+from transformers.integrations import HfDeepSpeedConfig  # transformers.deepspeed is a deprecated re-export shim
+

 # ----------------- 进程工具 -----------------
 def is_main_process():
@ -483,32 +485,42 @@ def main():
    dtype = (torch.bfloat16 if use_bf16 else
            (torch.float16 if torch.cuda.is_available() else torch.float32))

+    dschf = None  # keep this name bound until training ends; NOTE(review): HF docs say the config object must stay alive
+    if args.deepspeed and os.path.isfile(args.deepspeed):
+        dschf = HfDeepSpeedConfig(args.deepspeed)   # key step: register the DeepSpeed config before the model is instantiated
+        dbg("HfDeepSpeedConfig loaded")


+    # try:
+    #     import deepspeed
+    #     zero_init_ctx = deepspeed.zero.Init(
+    #         remote_device="cpu",  # parameters ultimately live on the CPU (can be combined with offload)
+    #         device="cpu",         # ← key point: do not use meta
+    #         pin_memory=True,
+    #         dtype=dtype,
+    #         config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
+    #     )
+    # except Exception:
+    #     zero_init_ctx = nullcontext()  # still runs on a single machine when DeepSpeed is not installed

-    try:
-        import deepspeed
-        zero_init_ctx = deepspeed.zero.Init(
-            remote_device="cpu",  # 参数最终托管在 CPU（可结合 offload）
-            device="cpu",         # ← 关键：不要用 meta
-            pin_memory=True,
-            dtype=dtype,
-            config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
-        )
-    except Exception:
-        zero_init_ctx = nullcontext()  # 没装 DS 时也能单机跑
-
-    with zero_init_ctx:
-        model = AutoModelForCausalLM.from_pretrained(
-            args.model_name_or_path,
-            torch_dtype=dtype,
-            low_cpu_mem_usage=False,
-            trust_remote_code=True,
-            attn_implementation="sdpa"
-        )
-
+    # with zero_init_ctx:
+    #     model = AutoModelForCausalLM.from_pretrained(
+    #         args.model_name_or_path,
+    #         torch_dtype=dtype,
+    #         low_cpu_mem_usage=False,
+    #         trust_remote_code=True,
+    #         attn_implementation="sdpa"
+    #     )


+    # Let the HfDeepSpeedConfig plugin drive ZeRO-Init / sharded loading; transformers rejects low_cpu_mem_usage=True under ZeRO-3.
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_name_or_path,
+        torch_dtype=dtype,
+        low_cpu_mem_usage=(dschf is None),  # only when no DeepSpeed config is registered (the pre-commit value was False)
+        trust_remote_code=True,
+        attn_implementation="sdpa",
+    )


    # model = AutoModelForCausalLM.from_pretrained(
@ -853,6 +865,11 @@ def main():
    print_once(f"[resume] final = {resume_flag if resume_flag else 'None (fresh start)'}")
    print_once("***** Starting training *****")

+    # Log the CUDA memory torch reports as allocated / reserved (in MB) right before training starts.
+    dbg(f"allocated={torch.cuda.memory_allocated()/1024**2:.1f} MB, "
+        f"reserved={torch.cuda.memory_reserved()/1024**2:.1f} MB")
+
+
    train_result = trainer.train(resume_from_checkpoint=resume_flag)
    trainer.save_model()  # DeepSpeed stage3_gather_16bit_weights_on_model_save=true 时，在 rank0 聚合整模型