This commit is contained in:
parent
817f135417
commit
c473527297
|
|
@ -27,6 +27,42 @@ from transformers.trainer_utils import get_last_checkpoint
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ==== Make the user site-packages visible and pin the torch-extensions dir ====
# This section runs at import time, before any JIT-compiled op is requested,
# so a broken build environment is reported immediately.
import os
import site
import sys

# 1) Make sure user site-packages are not masked (ninja is installed under
#    ~/.local), and drop the DeepSpeed build switches from the environment.
#    NOTE(review): PYTHONNOUSERSITE is presumably consulted at interpreter
#    startup, so removing it here mainly matters for child processes; step 2
#    handles the current process explicitly -- confirm.
os.environ.pop("PYTHONNOUSERSITE", None)
os.environ.pop("DS_BUILD_OPS", None)
os.environ.pop("DS_SKIP_CUDA_BUILD", None)

# 2) Put the user site directory at the front of sys.path
#    (e.g. /home/test/.local/lib/python3.10/site-packages).
try:
    user_site = site.getusersitepackages()
except Exception as exc:
    # Best effort only: keep going, but leave a trace instead of failing
    # silently. The JIT check in step 4 will surface a missing ninja anyway.
    print(f"[env] could not resolve user site-packages: {exc}", flush=True)
else:
    if user_site and user_site not in sys.path:
        sys.path.insert(0, user_site)

# 3) Use a single JIT cache directory (optional, but more stable; according
#    to the logs the directory currently in use is ~/.cache/torch_extensions).
#    setdefault keeps any value the launcher already exported.
os.environ.setdefault("TORCH_EXTENSIONS_DIR", f"/tmp/{os.environ.get('USER', 'user')}/torch_ext")
# NOTE(review): MAX_JOBS is presumably read by the extension build to cap
# parallel compile jobs -- confirm against the torch version in use.
os.environ.setdefault("MAX_JOBS", "8")

# 4) Verify ninja and the CPUAdam JIT build right away. If this fails, the log
#    immediately shows which host / which rank has a broken environment.
try:
    import ninja

    print(f"[env] ninja {getattr(ninja, '__version__', '?')} @ {getattr(ninja, '__file__', '?')}", flush=True)
    from deepspeed.ops.op_builder import CPUAdamBuilder

    CPUAdamBuilder().load()
    print("[env] CPUAdamBuilder JIT OK", flush=True)
except Exception as e:
    # Only needed on the failure path, so it is imported here.
    import socket

    print(f"[env][host={socket.gethostname()} RANK={os.environ.get('RANK', '?')}] PRE-JIT FAILED: {e}", flush=True)
    # Re-raise unchanged so the original traceback aborts startup.
    raise
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------- Process utilities -----------------
def is_main_process() -> bool:
    """Return True when this process is the rank-0 process.

    The rank is read from the ``RANK`` environment variable. An unset, empty
    or whitespace-only value is treated as rank 0.

    Returns:
        True if the parsed rank equals 0, otherwise False.

    Raises:
        ValueError: If ``RANK`` is set to a value that is not an integer.
    """
    raw_rank = os.environ.get("RANK", "")
    # An exported-but-empty RANK (e.g. `export RANK=$UNSET_VAR`) would make
    # int() raise; fall back to "0" exactly as if the variable were missing.
    return int(raw_rank.strip() or "0") == 0
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue