From bd9a294e421c1dd5fce4d91a8a29444498a87601 Mon Sep 17 00:00:00 2001 From: hailin Date: Thu, 4 Sep 2025 11:20:00 +0800 Subject: [PATCH] . --- .deepspeed_env | 6 ++++++ mm-zero3.sh | 14 +++++++++++++- train_sft_ds.py | 8 ++++++-- 3 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 .deepspeed_env diff --git a/.deepspeed_env b/.deepspeed_env new file mode 100644 index 0000000..cd44121 --- /dev/null +++ b/.deepspeed_env @@ -0,0 +1,6 @@ +WANDB_BASE_URL=https://wandb.szaiai.com +WANDB_API_KEY=local-701636f51b4741d3862007df5cf7f12cca53d8d1 +WANDB_PROJECT=ds-qwen3 +WANDB_GROUP=q3-32b-ds4-2025-09-04 +WANDB_RUN_ID=q3-32b-lr2e-5-train1 +WANDB_RESUME=allow diff --git a/mm-zero3.sh b/mm-zero3.sh index 0d80e67..39ab346 100755 --- a/mm-zero3.sh +++ b/mm-zero3.sh @@ -1,3 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +# 1) 可选:若含 API Key,保护权限 +[ -f .deepspeed_env ] && chmod 600 .deepspeed_env + +# 2) 注入统一环境(供所有 rank 继承) +set -a +. ./.deepspeed_env +set +a + + # 统一环境(会被 deepspeed 的 ssh 继承到各节点) unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE export TORCH_EXTENSIONS_DIR=/tmp/$USER/torch_ext @@ -23,6 +35,6 @@ deepspeed --hostfile hostfile \ --gradient_checkpointing \ --bf16 \ --deepspeed /home/test/jd_train/ds_config_zero3.json \ - --report_to none \ + --report_to wandb \ --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl" diff --git a/train_sft_ds.py b/train_sft_ds.py index d62c649..5198ac7 100644 --- a/train_sft_ds.py +++ b/train_sft_ds.py @@ -1,6 +1,12 @@ #!/usr/bin/env python3 import os os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") +if os.environ.get("RANK","0") != "0": + os.environ["WANDB_DISABLED"] = "true" + +os.environ.setdefault("WANDB_START_METHOD", "thread") +os.environ.setdefault("WANDB_DIR", f"/tmp/{os.environ.get('USER','user')}/wandb") + import glob import socket import argparse @@ -591,8 +597,6 @@ def main(): pass # ===== 数据鲁棒性检查(多机各自执行)===== - # host = socket.gethostname() - files = sorted(glob.glob(args.data_glob)) if len(files) == 0: raise FileNotFoundError(