This commit is contained in:
hailin 2025-08-30 10:14:24 +08:00
parent 04a0717d53
commit 817f135417
1 changed files with 4 additions and 2 deletions

View File

@@ -1,5 +1,7 @@
# NOTE(review): this span appears to be a diff hunk whose +/- markers were lost;
# the two exports below are presumably the pre-change lines that the `unset`
# further down replaces — confirm against the actual script before relying on
# the combined order shown here.
export DS_BUILD_OPS=0
export DS_SKIP_CUDA_BUILD=1
# Unified environment (will be inherited by every node through deepspeed's ssh).
# Remove the DeepSpeed build switches and PYTHONNOUSERSITE from this shell's
# environment so the launcher below starts without them.
unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE
# Point TORCH_EXTENSIONS_DIR at a per-user directory under /tmp
# (presumably where PyTorch caches JIT-built extensions — TODO confirm).
export TORCH_EXTENSIONS_DIR=/tmp/$USER/torch_ext
# Allocator options handed to PyTorch through PYTORCH_CUDA_ALLOC_CONF:
# max_split_size_mb=128, expandable_segments=True,
# garbage_collection_threshold=0.9 (see the PyTorch CUDA memory-management
# docs for the exact meaning of each key).
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"
deepspeed --hostfile hostfile \