diff --git a/ds_config_zero3.json b/ds_config_zero3.json
index 563bd73..212ed33 100644
--- a/ds_config_zero3.json
+++ b/ds_config_zero3.json
@@ -17,6 +17,7 @@
         "stage3_gather_16bit_weights_on_model_save": false,
         "offload_param": { "device": "cpu", "pin_memory": true }
+    },
     "wall_clock_breakdown": false
diff --git a/mm-zero3.sh b/mm-zero3.sh
index 6319333..eaf2291 100755
--- a/mm-zero3.sh
+++ b/mm-zero3.sh
@@ -1,3 +1,5 @@
+export DS_BUILD_OPS=0
+export DS_SKIP_CUDA_BUILD=1
 export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"
 deepspeed --hostfile hostfile \
diff --git a/train_sft_ds.py b/train_sft_ds.py
index ee6a30c..99824c9 100644
--- a/train_sft_ds.py
+++ b/train_sft_ds.py
@@ -24,6 +24,8 @@ from transformers import (
 )
 from transformers.trainer_callback import TrainerCallback
 from transformers.trainer_utils import get_last_checkpoint
+from transformers.deepspeed import HfDeepSpeedConfig
+
 
 # ----------------- process utilities -----------------
 def is_main_process():
@@ -483,32 +485,42 @@ def main():
     dtype = (torch.bfloat16 if use_bf16 else (torch.float16 if torch.cuda.is_available() else torch.float32))
+    dschf = None
+    if args.deepspeed and os.path.isfile(args.deepspeed):
+        dschf = HfDeepSpeedConfig(args.deepspeed)  # ← key: enable the plugin early
+        dbg("HfDeepSpeedConfig loaded")
+    # try:
+    #     import deepspeed
+    #     zero_init_ctx = deepspeed.zero.Init(
+    #         remote_device="cpu",   # parameters end up hosted on CPU (can be combined with offload)
+    #         device="cpu",          # ← key: do not use meta
+    #         pin_memory=True,
+    #         dtype=dtype,
+    #         config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
+    #     )
+    # except Exception:
+    #     zero_init_ctx = nullcontext()  # still runs single-node when DeepSpeed is not installed
-    try:
-        import deepspeed
-        zero_init_ctx = deepspeed.zero.Init(
-            remote_device="cpu",   # parameters end up hosted on CPU (can be combined with offload)
-            device="cpu",          # ← key: do not use meta
-            pin_memory=True,
-            dtype=dtype,
-            config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
-        )
-    except Exception:
-        zero_init_ctx = nullcontext()  # still runs single-node when DeepSpeed is not installed
-
-    with zero_init_ctx:
-        model = AutoModelForCausalLM.from_pretrained(
-            args.model_name_or_path,
-            torch_dtype=dtype,
-            low_cpu_mem_usage=False,
-            trust_remote_code=True,
-            attn_implementation="sdpa"
-        )
-
+    # with zero_init_ctx:
+    #     model = AutoModelForCausalLM.from_pretrained(
+    #         args.model_name_or_path,
+    #         torch_dtype=dtype,
+    #         low_cpu_mem_usage=False,
+    #         trust_remote_code=True,
+    #         attn_implementation="sdpa"
+    #     )
+    # let the plugin handle ZeRO-Init / sharded loading
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_name_or_path,
+        torch_dtype=dtype,
+        low_cpu_mem_usage=True,
+        trust_remote_code=True,
+        attn_implementation="sdpa",
+    )
 
     # model = AutoModelForCausalLM.from_pretrained(
@@ -853,6 +865,11 @@ def main():
     print_once(f"[resume] final = {resume_flag if resume_flag else 'None (fresh start)'}")
     print_once("***** Starting training *****")
+
+    dbg(f"allocated={torch.cuda.memory_allocated()/1024**2:.1f} MB, "
+        f"reserved={torch.cuda.memory_reserved()/1024**2:.1f} MB")
+
+
     train_result = trainer.train(resume_from_checkpoint=resume_flag)
     trainer.save_model()  # with DeepSpeed stage3_gather_16bit_weights_on_model_save=true, the full model is gathered on rank0
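
For reference, a minimal standalone sketch of the ordering the train_sft_ds.py hunk depends on: the HfDeepSpeedConfig object must be constructed before from_pretrained and kept referenced, which is what the early `dschf = HfDeepSpeedConfig(args.deepspeed)` in the patch does. The sketch assumes DeepSpeed is installed, a transformers release that still ships transformers.deepspeed, and a ZeRO-3 JSON like ds_config_zero3.json above; the model id is a hypothetical placeholder, not part of this repo.

    # Sketch only; assumes deepspeed + transformers.deepspeed are available.
    import torch
    from transformers import AutoModelForCausalLM
    from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_zero3_enabled

    # Construct BEFORE from_pretrained and keep the reference alive for the whole run:
    # with a stage-3 config this sets the global flag that makes from_pretrained load
    # the checkpoint under deepspeed.zero.Init (sharded across ranks) instead of
    # materializing the full model on every rank.
    dschf = HfDeepSpeedConfig("ds_config_zero3.json")
    assert is_deepspeed_zero3_enabled()

    model = AutoModelForCausalLM.from_pretrained(
        "your-org/your-model",      # hypothetical stand-in for args.model_name_or_path
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )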