This commit is contained in:
hailin 2025-08-29 22:06:04 +08:00
parent a87f1212cc
commit 35f5c85446
3 changed files with 41 additions and 21 deletions

View File

@@ -18,6 +18,7 @@
"offload_param": { "device": "cpu", "pin_memory": true }
},
"wall_clock_breakdown": false
}
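
For context, the ZeRO-3 offload section above can also be built in code and handed straight to HfDeepSpeedConfig (the dict below is a minimal sketch with illustrative values, not the repo's full ds_config):

from transformers.deepspeed import HfDeepSpeedConfig  # transformers.integrations.deepspeed in newer releases

ds_config = {
    "train_micro_batch_size_per_gpu": "auto",
    "zero_optimization": {
        "stage": 3,
        "offload_param": {"device": "cpu", "pin_memory": True},
        "stage3_gather_16bit_weights_on_model_save": True,
    },
    "bf16": {"enabled": "auto"},
    "wall_clock_breakdown": False,
}

dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive so from_pretrained() sees ZeRO-3 as enabled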

View File

@@ -1,3 +1,5 @@
export DS_BUILD_OPS=0
export DS_SKIP_CUDA_BUILD=1
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"
deepspeed --hostfile hostfile \

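The export order matters: PyTorch's caching allocator reads PYTORCH_CUDA_ALLOC_CONF only when CUDA is first initialized, and DS_BUILD_OPS=0 / DS_SKIP_CUDA_BUILD=1 keep DeepSpeed from trying to build its optional C++/CUDA ops. A rough in-process equivalent, as a sketch rather than part of the launch script:

import os

os.environ.setdefault("DS_BUILD_OPS", "0")        # do not pre-build DeepSpeed's optional C++/CUDA ops
os.environ.setdefault("DS_SKIP_CUDA_BUILD", "1")  # skip the CUDA extension build step entirely
os.environ.setdefault(
    "PYTORCH_CUDA_ALLOC_CONF",
    "max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9",
)

import torch  # import only after the allocator knobs are in place
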
View File

@@ -24,6 +24,8 @@ from transformers import (
)
from transformers.trainer_callback import TrainerCallback
from transformers.trainer_utils import get_last_checkpoint
from transformers.deepspeed import HfDeepSpeedConfig
# ----------------- Process utilities -----------------
def is_main_process():
@@ -483,34 +485,44 @@ def main():
    dtype = (torch.bfloat16 if use_bf16 else
             (torch.float16 if torch.cuda.is_available() else torch.float32))
    dschf = None
    if args.deepspeed and os.path.isfile(args.deepspeed):
        dschf = HfDeepSpeedConfig(args.deepspeed)  # ← key: activate the HF DeepSpeed plugin before the model is loaded
        dbg("HfDeepSpeedConfig loaded")
    # try:
    #     import deepspeed
    #     zero_init_ctx = deepspeed.zero.Init(
    #         remote_device="cpu",  # parameters end up hosted on CPU; can be combined with offload
    #         device="cpu",         # ← key: do not use the meta device
    #         pin_memory=True,
    #         dtype=dtype,
    #         config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
    #     )
    # except Exception:
    #     zero_init_ctx = nullcontext()  # still runs single-node when DeepSpeed is not installed
    try:
        import deepspeed
        zero_init_ctx = deepspeed.zero.Init(
            remote_device="cpu",  # parameters end up hosted on CPU; can be combined with offload
            device="cpu",         # ← key: do not use the meta device
            pin_memory=True,
            dtype=dtype,
            config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
        )
    except Exception:
        zero_init_ctx = nullcontext()  # still runs single-node when DeepSpeed is not installed
    # with zero_init_ctx:
    #     model = AutoModelForCausalLM.from_pretrained(
    #         args.model_name_or_path,
    #         torch_dtype=dtype,
    #         low_cpu_mem_usage=False,
    #         trust_remote_code=True,
    #         attn_implementation="sdpa"
    #     )
    with zero_init_ctx:
        # hand off to the plugin for ZeRO-Init / sharded loading
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name_or_path,
            torch_dtype=dtype,
            low_cpu_mem_usage=False,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            attn_implementation="sdpa"
            attn_implementation="sdpa",
        )
    # model = AutoModelForCausalLM.from_pretrained(
    #     args.model_name_or_path,
    #     torch_dtype=dtype,
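
The sharded load above silently degrades to a plain full-size load on every rank if the plugin or ZeRO-Init is not active (the except branch falls back to nullcontext), so a small sanity check helps. A sketch; the helper lives in transformers.deepspeed on older releases and transformers.integrations.deepspeed on newer ones:

try:
    from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
except ImportError:
    from transformers.deepspeed import is_deepspeed_zero3_enabled

if is_deepspeed_zero3_enabled():
    # under ZeRO-3 every parameter carries DeepSpeed bookkeeping such as ds_status
    dbg(f"zero3 active, first param ds_status={getattr(next(model.parameters()), 'ds_status', None)}")
else:
    dbg("WARNING: ZeRO-3 not active; the full model was materialized on every rank")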
@@ -853,6 +865,11 @@ def main():
    print_once(f"[resume] final = {resume_flag if resume_flag else 'None (fresh start)'}")
    print_once("***** Starting training *****")
    dbg(f"allocated={torch.cuda.memory_allocated()/1024**2:.1f} MB, "
        f"reserved={torch.cuda.memory_reserved()/1024**2:.1f} MB")
    train_result = trainer.train(resume_from_checkpoint=resume_flag)
    trainer.save_model()  # with DeepSpeed stage3_gather_16bit_weights_on_model_save=true, rank 0 gathers the full model
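
If the 16-bit gather on save is ever turned off, or an fp32 copy is needed, the full weights can still be reconstructed from the ZeRO-3 shards with DeepSpeed's checkpoint utility. A sketch, run offline after training, with an illustrative checkpoint path:

import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# returns a consolidated fp32 state_dict on CPU; needs enough host RAM for the full model
state_dict = get_fp32_state_dict_from_zero_checkpoint("output/checkpoint-1000")
torch.save(state_dict, "output/checkpoint-1000/pytorch_model_fp32.bin")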