This commit is contained in:
parent
b054b9e805
commit
bd9a294e42
|
|
@ -0,0 +1,6 @@
|
|||
WANDB_BASE_URL=https://wandb.szaiai.com
|
||||
WANDB_API_KEY=local-701636f51b4741d3862007df5cf7f12cca53d8d1
|
||||
WANDB_PROJECT=ds-qwen3
|
||||
WANDB_GROUP=q3-32b-ds4-2025-09-04
|
||||
WANDB_RUN_ID=q3-32b-lr2e-5-train1
|
||||
WANDB_RESUME=allow
|
||||
14
mm-zero3.sh
14
mm-zero3.sh
|
|
@ -1,3 +1,15 @@
|
|||
#!/usr/bin/env bash
# Launcher preamble: loads the shared environment file and normalises
# build-related variables before the `deepspeed` command further down runs.
set -euo pipefail

# 1) Optional hardening: the env file may contain an API key, so restrict it
#    to owner read/write when it exists. Under `set -e` a failing left-hand
#    side of `&&` does not abort the script; the file is still required by
#    step 2 below.
[ -f .deepspeed_env ] && chmod 600 .deepspeed_env

# 2) Inject the unified environment (meant to be inherited by every rank).
#    `set -a` marks every variable assigned while sourcing for export;
#    `set +a` switches that behaviour off again.
set -a
. ./.deepspeed_env
set +a


# Unified environment (intended to be carried to each node by deepspeed's ssh).
unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE
# USER may be unset (e.g. in minimal containers); with `set -u` a bare $USER
# would abort the script, so fall back to a fixed directory name instead.
export TORCH_EXTENSIONS_DIR="/tmp/${USER:-user}/torch_ext"
|
||||
|
|
@ -23,6 +35,6 @@ deepspeed --hostfile hostfile \
|
|||
--gradient_checkpointing \
|
||||
--bf16 \
|
||||
--deepspeed /home/test/jd_train/ds_config_zero3.json \
|
||||
--report_to none \
|
||||
--report_to wandb \
|
||||
--eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,12 @@
|
|||
#!/usr/bin/env python3
|
||||
import os

# Environment defaults are applied here, ahead of the remaining imports.
# NOTE(review): presumably so that libraries imported later observe them - confirm.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Disable W&B on every process whose RANK is not "0"; a missing RANK is
# treated as rank 0, so a single-process run keeps W&B enabled.
if os.environ.get("RANK", "0") != "0":
    os.environ["WANDB_DISABLED"] = "true"

# setdefault keeps any value already exported by the launcher or the user.
os.environ.setdefault("WANDB_START_METHOD", "thread")
# Per-user W&B directory under /tmp; falls back to "user" when USER is unset.
os.environ.setdefault("WANDB_DIR", f"/tmp/{os.environ.get('USER', 'user')}/wandb")
|
||||
|
||||
import glob
|
||||
import socket
|
||||
import argparse
|
||||
|
|
@ -591,8 +597,6 @@ def main():
|
|||
pass
|
||||
|
||||
# ===== 数据鲁棒性检查(多机各自执行)=====
|
||||
# host = socket.gethostname()
|
||||
|
||||
files = sorted(glob.glob(args.data_glob))
|
||||
if len(files) == 0:
|
||||
raise FileNotFoundError(
|
||||
|
|
|
|||
Loading…
Reference in New Issue