tools/extract_chat_template.py

51 lines
1.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# extract_chat_template.py -- final version
#
# Usage examples:
#   # Export only the original template
#   python extract_chat_template.py Qwen3-32B/tokenizer_config.json \
#       -o Qwen3-32B/chat_template.jinja
#
#   # Also export a version without <think> blocks
#   python extract_chat_template.py Qwen3-32B/tokenizer_config.json \
#       -o Qwen3-32B/chat_template.jinja \
#       --no-think \
#       -n Qwen3-32B/chat_template_nothink.jinja
#
# If -o / -n are omitted, the files are written to the current directory as
# chat_template.jinja / chat_template_nothink.jinja (the latter is only
# written when --no-think is given).
import argparse, json, re, sys
from pathlib import Path
def _write_template(path, text):
    """Write *text* to *path* as UTF-8 and return the resolved path.

    Missing parent directories are created first, so an output location such
    as ``new_dir/chat_template.jinja`` does not fail with FileNotFoundError.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text, encoding="utf-8")
    return path.resolve()


def main(argv=None):
    """Extract the ``chat_template`` string from a tokenizer config file.

    Reads the JSON file given as the positional ``config`` argument, writes
    the value of its ``"chat_template"`` key to the ``-o/--output`` file and,
    when ``--no-think`` is passed, additionally writes a copy with every
    ``<think>...</think>`` span removed to the ``-n/--no-think-out`` file.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` (the default) makes argparse read ``sys.argv``,
            which is the original behavior.

    Raises:
        SystemExit: With an error message when the config cannot be read,
            is not valid JSON, has no ``"chat_template"`` key, or that value
            is not a string. argparse also exits on invalid arguments.
    """
    p = argparse.ArgumentParser()
    p.add_argument("config", type=Path, help="tokenizer_config.json 路径")
    p.add_argument("-o", "--output", type=Path,
                   default=Path("chat_template.jinja"),
                   help="原始模板输出文件(默认 chat_template.jinja")
    p.add_argument("--no-think", action="store_true",
                   help="额外生成去掉 <think> 块的模板")
    p.add_argument("-n", "--no-think-out", type=Path,
                   help="无 <think> 模板输出文件(默认 chat_template_nothink.jinja")
    args = p.parse_args(argv)

    try:
        config = json.loads(args.config.read_text(encoding="utf-8"))
        tpl = config["chat_template"]
        # Validate here so a non-string value (e.g. a list of named
        # templates) is reported through the same error path instead of
        # surfacing later as a TypeError traceback from write_text().
        if not isinstance(tpl, str):
            raise TypeError(
                f"chat_template must be a string, got {type(tpl).__name__}"
            )
    except (OSError, ValueError, KeyError, TypeError) as e:
        # OSError: unreadable file; ValueError: bad encoding or invalid JSON;
        # KeyError/TypeError: key missing or JSON root is not an object.
        sys.exit(f"❌ 读取失败:{e}")

    # Write the original template.
    print(f"✅ 原模板 → {_write_template(args.output, tpl)}")

    # Optionally write the template without <think> blocks.
    if args.no_think:
        nt_path = args.no_think_out or Path("chat_template_nothink.jinja")
        # Purely textual, non-greedy removal on the template source; re.S
        # lets a span cross line breaks.
        stripped = re.sub(r"<think>.*?</think>", "", tpl, flags=re.S)
        print(f"✅ 无 <think> 模板 → {_write_template(nt_path, stripped)}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()