From 817f1354170dacfff3f6df3366c0de4ceebe44b7 Mon Sep 17 00:00:00 2001
From: hailin
Date: Sat, 30 Aug 2025 10:14:24 +0800
Subject: [PATCH] .

---
 mm-zero3.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm-zero3.sh b/mm-zero3.sh
index eaf2291..7c05e40 100755
--- a/mm-zero3.sh
+++ b/mm-zero3.sh
@@ -1,5 +1,7 @@
-export DS_BUILD_OPS=0
-export DS_SKIP_CUDA_BUILD=1
+# 统一环境(会被 deepspeed 的 ssh 继承到各节点)
+unset DS_BUILD_OPS DS_SKIP_CUDA_BUILD PYTHONNOUSERSITE
+export TORCH_EXTENSIONS_DIR=/tmp/$USER/torch_ext
+
 export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9"
 
 deepspeed --hostfile hostfile \