This commit is contained in:
parent
a87f1212cc
commit
35f5c85446
|
|
@ -17,6 +17,7 @@
|
|||
"stage3_gather_16bit_weights_on_model_save": false,
|
||||
|
||||
"offload_param": { "device": "cpu", "pin_memory": true }
|
||||
|
||||
|
||||
},
|
||||
"wall_clock_breakdown": false
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
export DS_BUILD_OPS=0
|
||||
export DS_SKIP_CUDA_BUILD=1
|
||||
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"
|
||||
|
||||
deepspeed --hostfile hostfile \
|
||||
|
|
|
|||
|
|
@ -24,6 +24,8 @@ from transformers import (
|
|||
)
|
||||
from transformers.trainer_callback import TrainerCallback
|
||||
from transformers.trainer_utils import get_last_checkpoint
|
||||
from transformers.deepspeed import HfDeepSpeedConfig
|
||||
|
||||
|
||||
# ----------------- 进程工具 -----------------
|
||||
def is_main_process():
|
||||
|
|
@ -483,32 +485,42 @@ def main():
|
|||
dtype = (torch.bfloat16 if use_bf16 else
|
||||
(torch.float16 if torch.cuda.is_available() else torch.float32))
|
||||
|
||||
dschf = None
|
||||
if args.deepspeed and os.path.isfile(args.deepspeed):
|
||||
dschf = HfDeepSpeedConfig(args.deepspeed) # ← 关键:提前启用插件
|
||||
dbg("HfDeepSpeedConfig loaded")
|
||||
|
||||
|
||||
# try:
|
||||
# import deepspeed
|
||||
# zero_init_ctx = deepspeed.zero.Init(
|
||||
# remote_device="cpu", # 参数最终托管在 CPU(可结合 offload)
|
||||
# device="cpu", # ← 关键:不要用 meta
|
||||
# pin_memory=True,
|
||||
# dtype=dtype,
|
||||
# config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
|
||||
# )
|
||||
# except Exception:
|
||||
# zero_init_ctx = nullcontext() # 没装 DS 时也能单机跑
|
||||
|
||||
try:
|
||||
import deepspeed
|
||||
zero_init_ctx = deepspeed.zero.Init(
|
||||
remote_device="cpu", # 参数最终托管在 CPU(可结合 offload)
|
||||
device="cpu", # ← 关键:不要用 meta
|
||||
pin_memory=True,
|
||||
dtype=dtype,
|
||||
config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
|
||||
)
|
||||
except Exception:
|
||||
zero_init_ctx = nullcontext() # 没装 DS 时也能单机跑
|
||||
|
||||
with zero_init_ctx:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
torch_dtype=dtype,
|
||||
low_cpu_mem_usage=False,
|
||||
trust_remote_code=True,
|
||||
attn_implementation="sdpa"
|
||||
)
|
||||
|
||||
# with zero_init_ctx:
|
||||
# model = AutoModelForCausalLM.from_pretrained(
|
||||
# args.model_name_or_path,
|
||||
# torch_dtype=dtype,
|
||||
# low_cpu_mem_usage=False,
|
||||
# trust_remote_code=True,
|
||||
# attn_implementation="sdpa"
|
||||
# )
|
||||
|
||||
|
||||
# 交给插件做 ZeRO-Init/分片加载
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
torch_dtype=dtype,
|
||||
low_cpu_mem_usage=True,
|
||||
trust_remote_code=True,
|
||||
attn_implementation="sdpa",
|
||||
)
|
||||
|
||||
|
||||
# model = AutoModelForCausalLM.from_pretrained(
|
||||
|
|
@ -853,6 +865,11 @@ def main():
|
|||
print_once(f"[resume] final = {resume_flag if resume_flag else 'None (fresh start)'}")
|
||||
print_once("***** Starting training *****")
|
||||
|
||||
|
||||
dbg(f"allocated={torch.cuda.memory_allocated()/1024**2:.1f} MB, "
|
||||
f"reserved={torch.cuda.memory_reserved()/1024**2:.1f} MB")
|
||||
|
||||
|
||||
train_result = trainer.train(resume_from_checkpoint=resume_flag)
|
||||
trainer.save_model() # DeepSpeed stage3_gather_16bit_weights_on_model_save=true 时,在 rank0 聚合整模型
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue