This commit is contained in:
parent
a87f1212cc
commit
35f5c85446
|
|
@ -17,6 +17,7 @@
|
||||||
"stage3_gather_16bit_weights_on_model_save": false,
|
"stage3_gather_16bit_weights_on_model_save": false,
|
||||||
|
|
||||||
"offload_param": { "device": "cpu", "pin_memory": true }
|
"offload_param": { "device": "cpu", "pin_memory": true }
|
||||||
|
|
||||||
|
|
||||||
},
|
},
|
||||||
"wall_clock_breakdown": false
|
"wall_clock_breakdown": false
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
export DS_BUILD_OPS=0
|
||||||
|
export DS_SKIP_CUDA_BUILD=1
|
||||||
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"
|
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"
|
||||||
|
|
||||||
deepspeed --hostfile hostfile \
|
deepspeed --hostfile hostfile \
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,8 @@ from transformers import (
|
||||||
)
|
)
|
||||||
from transformers.trainer_callback import TrainerCallback
|
from transformers.trainer_callback import TrainerCallback
|
||||||
from transformers.trainer_utils import get_last_checkpoint
|
from transformers.trainer_utils import get_last_checkpoint
|
||||||
|
from transformers.deepspeed import HfDeepSpeedConfig
|
||||||
|
|
||||||
|
|
||||||
# ----------------- 进程工具 -----------------
|
# ----------------- 进程工具 -----------------
|
||||||
def is_main_process():
|
def is_main_process():
|
||||||
|
|
@ -483,32 +485,42 @@ def main():
|
||||||
dtype = (torch.bfloat16 if use_bf16 else
|
dtype = (torch.bfloat16 if use_bf16 else
|
||||||
(torch.float16 if torch.cuda.is_available() else torch.float32))
|
(torch.float16 if torch.cuda.is_available() else torch.float32))
|
||||||
|
|
||||||
|
dschf = None
|
||||||
|
if args.deepspeed and os.path.isfile(args.deepspeed):
|
||||||
|
dschf = HfDeepSpeedConfig(args.deepspeed) # ← 关键:提前启用插件
|
||||||
|
dbg("HfDeepSpeedConfig loaded")
|
||||||
|
|
||||||
|
|
||||||
|
# try:
|
||||||
|
# import deepspeed
|
||||||
|
# zero_init_ctx = deepspeed.zero.Init(
|
||||||
|
# remote_device="cpu", # 参数最终托管在 CPU(可结合 offload)
|
||||||
|
# device="cpu", # ← 关键:不要用 meta
|
||||||
|
# pin_memory=True,
|
||||||
|
# dtype=dtype,
|
||||||
|
# config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
|
||||||
|
# )
|
||||||
|
# except Exception:
|
||||||
|
# zero_init_ctx = nullcontext() # 没装 DS 时也能单机跑
|
||||||
|
|
||||||
try:
|
# with zero_init_ctx:
|
||||||
import deepspeed
|
# model = AutoModelForCausalLM.from_pretrained(
|
||||||
zero_init_ctx = deepspeed.zero.Init(
|
# args.model_name_or_path,
|
||||||
remote_device="cpu", # 参数最终托管在 CPU(可结合 offload)
|
# torch_dtype=dtype,
|
||||||
device="cpu", # ← 关键:不要用 meta
|
# low_cpu_mem_usage=False,
|
||||||
pin_memory=True,
|
# trust_remote_code=True,
|
||||||
dtype=dtype,
|
# attn_implementation="sdpa"
|
||||||
config_dict_or_path=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
|
# )
|
||||||
)
|
|
||||||
except Exception:
|
|
||||||
zero_init_ctx = nullcontext() # 没装 DS 时也能单机跑
|
|
||||||
|
|
||||||
with zero_init_ctx:
|
|
||||||
model = AutoModelForCausalLM.from_pretrained(
|
|
||||||
args.model_name_or_path,
|
|
||||||
torch_dtype=dtype,
|
|
||||||
low_cpu_mem_usage=False,
|
|
||||||
trust_remote_code=True,
|
|
||||||
attn_implementation="sdpa"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# 交给插件做 ZeRO-Init/分片加载
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
args.model_name_or_path,
|
||||||
|
torch_dtype=dtype,
|
||||||
|
low_cpu_mem_usage=True,
|
||||||
|
trust_remote_code=True,
|
||||||
|
attn_implementation="sdpa",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# model = AutoModelForCausalLM.from_pretrained(
|
# model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
|
@ -853,6 +865,11 @@ def main():
|
||||||
print_once(f"[resume] final = {resume_flag if resume_flag else 'None (fresh start)'}")
|
print_once(f"[resume] final = {resume_flag if resume_flag else 'None (fresh start)'}")
|
||||||
print_once("***** Starting training *****")
|
print_once("***** Starting training *****")
|
||||||
|
|
||||||
|
|
||||||
|
dbg(f"allocated={torch.cuda.memory_allocated()/1024**2:.1f} MB, "
|
||||||
|
f"reserved={torch.cuda.memory_reserved()/1024**2:.1f} MB")
|
||||||
|
|
||||||
|
|
||||||
train_result = trainer.train(resume_from_checkpoint=resume_flag)
|
train_result = trainer.train(resume_from_checkpoint=resume_flag)
|
||||||
trainer.save_model() # DeepSpeed stage3_gather_16bit_weights_on_model_save=true 时,在 rank0 聚合整模型
|
trainer.save_model() # DeepSpeed stage3_gather_16bit_weights_on_model_save=true 时,在 rank0 聚合整模型
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue