diff --git a/ds_config_zero3.json b/ds_config_zero3.json
index 563bd73..212ed33 100644
--- a/ds_config_zero3.json
+++ b/ds_config_zero3.json
@@ -17,6 +17,7 @@
         "stage3_gather_16bit_weights_on_model_save": false,
         "offload_param": { "device": "cpu", "pin_memory": true }
+    },
     "wall_clock_breakdown": false
diff --git a/mm-zero3.sh b/mm-zero3.sh
index 6319333..eaf2291 100755
--- a/mm-zero3.sh
+++ b/mm-zero3.sh
@@ -1,3 +1,5 @@
+export DS_BUILD_OPS=0
+export DS_SKIP_CUDA_BUILD=1
 export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"
 deepspeed --hostfile hostfile \
diff --git a/train_sft_ds.py b/train_sft_ds.py
index ee6a30c..99824c9 100644
--- a/train_sft_ds.py
+++ b/train_sft_ds.py
@@ -24,6 +24,8 @@ from transformers import (
 )
 from transformers.trainer_callback import TrainerCallback
 from transformers.trainer_utils import get_last_checkpoint
+from transformers.deepspeed import HfDeepSpeedConfig
+
 
 # ----------------- process utilities -----------------
 def is_main_process():
@@ -483,32 +485,42 @@ def main():
     dtype = (torch.bfloat16 if use_bf16 else (torch.float16 if torch.cuda.is_available() else torch.float32))
+    dschf = None
+    if args.deepspeed and os.path.isfile(args.deepspeed):
+        dschf = HfDeepSpeedConfig(args.deepspeed)  # ← key: enable the plugin early
+        dbg("HfDeepSpeedConfig loaded")
+    # try:
+    #     import deepspeed
+    #     zero_init_ctx = deepspeed.zero.Init(
+    #         remote_device="cpu",   # parameters end up hosted on CPU (can be combined with offload)
+    #         device="cpu",          # ← key: do not use meta
+    #         pin_memory=True,
+    #         dtype=dtype,
+    #         config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
+    #     )
+    # except Exception:
+    #     zero_init_ctx = nullcontext()  # still runs single-node when DeepSpeed is not installed
-    try:
-        import deepspeed
-        zero_init_ctx = deepspeed.zero.Init(
-            remote_device="cpu",   # parameters end up hosted on CPU (can be combined with offload)
-            device="cpu",          # ← key: do not use meta
-            pin_memory=True,
-            dtype=dtype,
-            config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
-        )
-    except Exception:
-        zero_init_ctx = nullcontext()  # still runs single-node when DeepSpeed is not installed
-
-    with zero_init_ctx:
-        model = AutoModelForCausalLM.from_pretrained(
-            args.model_name_or_path,
-            torch_dtype=dtype,
-            low_cpu_mem_usage=False,
-            trust_remote_code=True,
-            attn_implementation="sdpa"
-        )
-
+    # with zero_init_ctx:
+    #     model = AutoModelForCausalLM.from_pretrained(
+    #         args.model_name_or_path,
+    #         torch_dtype=dtype,
+    #         low_cpu_mem_usage=False,
+    #         trust_remote_code=True,
+    #         attn_implementation="sdpa"
+    #     )
+    # let the plugin handle ZeRO-Init / sharded loading
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_name_or_path,
+        torch_dtype=dtype,
+        low_cpu_mem_usage=True,
+        trust_remote_code=True,
+        attn_implementation="sdpa",
+    )
 
     # model = AutoModelForCausalLM.from_pretrained(
@@ -853,6 +865,11 @@ def main():
     print_once(f"[resume] final = {resume_flag if resume_flag else 'None (fresh start)'}")
     print_once("***** Starting training *****")
+
+    dbg(f"allocated={torch.cuda.memory_allocated()/1024**2:.1f} MB, "
+        f"reserved={torch.cuda.memory_reserved()/1024**2:.1f} MB")
+
+
     train_result = trainer.train(resume_from_checkpoint=resume_flag)
     trainer.save_model()  # with DeepSpeed stage3_gather_16bit_weights_on_model_save=true, the full model is gathered on rank0
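
For reference, a minimal standalone sketch of the ordering the train_sft_ds.py hunk depends on: the HfDeepSpeedConfig object must be constructed before from_pretrained and kept referenced, which is what the early `dschf = HfDeepSpeedConfig(args.deepspeed)` in the patch does. The sketch assumes DeepSpeed is installed, a transformers release that still ships transformers.deepspeed, and a ZeRO-3 JSON like ds_config_zero3.json above; the model id is a hypothetical placeholder, not part of this repo.

    # Sketch only; assumes deepspeed + transformers.deepspeed are available.
    import torch
    from transformers import AutoModelForCausalLM
    from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_zero3_enabled

    # Construct BEFORE from_pretrained and keep the reference alive for the whole run:
    # with a stage-3 config this sets the global flag that makes from_pretrained load
    # the checkpoint under deepspeed.zero.Init (sharded across ranks) instead of
    # materializing the full model on every rank.
    dschf = HfDeepSpeedConfig("ds_config_zero3.json")
    assert is_deepspeed_zero3_enabled()

    model = AutoModelForCausalLM.from_pretrained(
        "your-org/your-model",      # hypothetical stand-in for args.model_name_or_path
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )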