From ddf438d728f5dab4defa6237ded74828746a56f0 Mon Sep 17 00:00:00 2001
From: hailin
Date: Fri, 5 Sep 2025 10:51:47 +0800
Subject: [PATCH] .

---
 convertfp32_to_bf16.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 convertfp32_to_bf16.py

diff --git a/convertfp32_to_bf16.py b/convertfp32_to_bf16.py
new file mode 100644
index 0000000..9697a8e
--- /dev/null
+++ b/convertfp32_to_bf16.py
@@ -0,0 +1,29 @@
+from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
+
+SRC = "/home/test/checkpoints/q3-32b-ds4/merged-global_step62"       # merged FP32 directory
+DST = "/home/test/checkpoints/q3-32b-ds4/merged-global_step62-bf16"  # target output directory
+
+# 1) Load in bfloat16 (optimizer state is not loaded)
+model = AutoModelForCausalLM.from_pretrained(
+    SRC,
+    torch_dtype="bfloat16",
+    low_cpu_mem_usage=True,  # reduce peak memory usage
+    device_map=None          # keep everything on the CPU
+)
+
+# 2) Optional: keep some layers in FP32 (more stable), typically LayerNorm/Embedding.
+# Skip this if you want to match training exactly; for extra stability, uncomment below:
+# for name, module in model.named_modules():
+#     if "norm" in name.lower():
+#         module.float()  # force these modules back to FP32
+# (transformers saves tensors in the weight dtype by default, i.e. bf16 here.)
+
+# 3) Save as bf16 + safetensors shards (split at 5GB)
+model.save_pretrained(DST, safe_serialization=True, max_shard_size="5GB")
+
+# 4) Sync tokenizer/config (in case the previous step did not copy them)
+tok = AutoTokenizer.from_pretrained(SRC, use_fast=True)
+tok.save_pretrained(DST)
+cfg = AutoConfig.from_pretrained(SRC)
+cfg.save_pretrained(DST)
+print("✅ Saved to:", DST)
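
Note (not part of the patch): as a quick sanity check after running convertfp32_to_bf16.py, a sketch along these lines can reload the converted directory and confirm every parameter tensor is bfloat16. The DST path is copied from the script above; torch_dtype="auto" simply picks up the dtype recorded in the saved checkpoint.

import torch
from transformers import AutoModelForCausalLM

DST = "/home/test/checkpoints/q3-32b-ds4/merged-global_step62-bf16"  # output dir from the patch

# Reload with the dtype stored in the checkpoint (should be bf16 after conversion).
model = AutoModelForCausalLM.from_pretrained(
    DST,
    torch_dtype="auto",
    low_cpu_mem_usage=True,
    device_map=None,
)

# Collect the distinct parameter dtypes; a clean conversion yields only torch.bfloat16.
dtypes = {p.dtype for p in model.parameters()}
print("parameter dtypes:", dtypes)
assert dtypes == {torch.bfloat16}, f"unexpected dtypes: {dtypes}"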