This commit is contained in:
hailin 2025-09-04 11:20:00 +08:00
parent b054b9e805
commit bd9a294e42
3 changed files with 25 additions and 3 deletions

6
.deepspeed_env Normal file
View File

@ -0,0 +1,6 @@
WANDB_BASE_URL=https://wandb.szaiai.com
WANDB_API_KEY=local-701636f51b4741d3862007df5cf7f12cca53d8d1
WANDB_PROJECT=ds-qwen3
WANDB_GROUP=q3-32b-ds4-2025-09-04
WANDB_RUN_ID=q3-32b-lr2e-5-train1
WANDB_RESUME=allow

View File

@ -1,3 +1,15 @@
#!/usr/bin/env bash
set -euo pipefail
# 1) Optional hardening: the env file may contain an API key, so restrict it to the owner.
[ -f .deepspeed_env ] && chmod 600 .deepspeed_env
# 2) Inject the shared environment (meant to be inherited by every rank).
#    `set -a` auto-exports each variable assigned while the file is sourced.
set -a
. ./.deepspeed_env
set +a
# Shared environment (per the original note: inherited by each node through deepspeed's ssh launch).
unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE
# USER is not always exported (containers, some non-interactive shells); under
# `set -u` a bare $USER would abort the script, so fall back to a fixed name.
export TORCH_EXTENSIONS_DIR="/tmp/${USER:-user}/torch_ext"
@ -23,6 +35,6 @@ deepspeed --hostfile hostfile \
--gradient_checkpointing \
--bf16 \
--deepspeed /home/test/jd_train/ds_config_zero3.json \
--report_to none \
--report_to wandb \
--eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"

View File

@ -1,6 +1,12 @@
#!/usr/bin/env python3
import os

# These variables are set before the remaining imports, presumably so that
# libraries imported later see them when they initialise.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Only the process with RANK "0" (or with RANK unset) keeps W&B enabled;
# every other rank disables it.
if os.environ.get("RANK", "0") != "0":
    os.environ["WANDB_DISABLED"] = "true"

os.environ.setdefault("WANDB_START_METHOD", "thread")
os.environ.setdefault("WANDB_DIR", f"/tmp/{os.environ.get('USER', 'user')}/wandb")

# W&B expects WANDB_DIR to exist and be writable, so create it up front.
# This is best-effort: if creation fails, do not crash at import time and
# leave it to wandb to deal with an unusable directory.
try:
    os.makedirs(os.environ["WANDB_DIR"], exist_ok=True)
except OSError:
    pass
import glob
import socket
import argparse
@ -591,8 +597,6 @@ def main():
pass
# ===== 数据鲁棒性检查(多机各自执行)=====
# host = socket.gethostname()
files = sorted(glob.glob(args.data_glob))
if len(files) == 0:
raise FileNotFoundError(