diff --git a/mm-zero3.sh b/mm-zero3.sh
index eaf2291..7c05e40 100755
--- a/mm-zero3.sh
+++ b/mm-zero3.sh
@@ -1,5 +1,13 @@
-export DS_BUILD_OPS=0
-export DS_SKIP_CUDA_BUILD=1
+# Shared environment for the launch below. Clear build and site overrides
+# that may be left over in the calling shell, and point JIT-built torch
+# extensions at a per-user cache directory.
+# NOTE(review): the original comment assumed these settings are inherited by
+# the remote nodes through the deepspeed ssh launch. DeepSpeed documents that
+# only NCCL and PYTHON related variables are forwarded by default, with extra
+# ones listed in ~/.deepspeed_env -- confirm before relying on it.
+unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE
+export TORCH_EXTENSIONS_DIR="/tmp/${USER:-$(id -un)}/torch_ext"
+
 export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"
 
 deepspeed --hostfile hostfile \