Patch for DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh

Summary of what the hunks below change:
  1. Next to the existing CUDA_HOME export, also export CUDA_PATH,
     TORCH_CUDA_HOME (both set to $CUDA_HOME) and CUDACXX ($CUDA_HOME/bin/nvcc).
  2. Run `hash -r` after PATH is modified (the added comment says it refreshes
     the shell's command cache) and print `which nvcc` before `nvcc --version`.
  3. Remove the `build` directory right after `pip uninstall -y deepspeed`.
  4. Shorten five section comments; those hunks change comment text only.

NOTE(review): this patch was restored from a copy whose line breaks and
leading whitespace had been collapsed. Line boundaries and blank context
lines were rebuilt from the hunk line counts. The indentation of the two
context lines inside the if/else (the "Installed cuda-toolkit" echo and
"cuda-profiler-api=11.8") is a best guess; confirm it against the target
file, or apply with `git apply --ignore-whitespace`.

diff --git a/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh b/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh
index 8b61681..0558872 100644
--- a/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh
+++ b/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh
@@ -10,7 +10,7 @@ print("Torch CUDA tag:", torch.version.cuda)
 print("CUDA available:", torch.cuda.is_available())
 PY
 
-# 尝试用 nvidia 官方 11.8 频道;失败则走 conda-forge 逐组件
+# 安装 CUDA 11.8 工具链
 echo "==> Installing CUDA 11.8 toolchain into current env..."
 if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then
   echo "Installed cuda-toolkit (nvidia channel)."
@@ -24,23 +24,28 @@ else
     cuda-profiler-api=11.8
 fi
 
-# 强制当前会话使用 env 里的 11.8 nvcc/库
+# 强制用当前环境里的 CUDA 11.8
 export CUDA_HOME="$CONDA_PREFIX"
+export CUDA_PATH="$CUDA_HOME"
+export TORCH_CUDA_HOME="$CUDA_HOME"
+export CUDACXX="$CUDA_HOME/bin/nvcc"
 export PATH="$CUDA_HOME/bin:$PATH"
 export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
+hash -r  # 刷新 shell 命令缓存
 
 echo "==> nvcc should now be 11.8:"
+which nvcc
 nvcc --version
 
 # 架构:3090 = sm_86
 export TORCH_CUDA_ARCH_LIST="8.6"
 
-# 编译依赖(用 mamba 提速)
+# 编译依赖
 echo "==> Build deps"
 mamba install -y -c conda-forge cmake ninja pybind11 libaio git
 pip install -U pip setuptools wheel
 
-# 获取 DeepSpeed 源码(固定较稳 tag)
+# 获取 DeepSpeed 源码
 echo "==> Clone DeepSpeed (if not exists)"
 cd "${HOME}/train/new"
 [ -d DeepSpeed ] || git clone --branch v0.14.3 https://github.com/microsoft/DeepSpeed.git
@@ -48,13 +53,14 @@ cd DeepSpeed
 
 # 清理旧安装
 pip uninstall -y deepspeed || true
+rm -rf build
 
-# 仅启用训练相关内核
+# 启用训练相关内核
 export DS_BUILD_OPS=1
 export DS_BUILD_AIO=1
 export DS_BUILD_FUSED_ADAM=1
 export DS_BUILD_CPU_ADAM=1
-# 推理/transformer 内核先关,减少兼容风险
+# 推理/transformer 内核先关
 # export DS_BUILD_TRANSFORMER=1
 
 echo "==> Build & install DeepSpeed"