This commit is contained in:
hailin 2025-08-08 20:37:13 +08:00
parent 2e06ad0ccf
commit d437e96505
1 changed files with 12 additions and 6 deletions

View File

@ -10,7 +10,7 @@ print("Torch CUDA tag:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
PY
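# Install the CUDA 11.8 toolchain: try the official nvidia 11.8 channel first;
# if that fails, fall back to installing the components one by one from conda-forge.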
echo "==> Installing CUDA 11.8 toolchain into current env..."
if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then
echo "Installed cuda-toolkit (nvidia channel)."
@ -24,23 +24,28 @@ else
cuda-profiler-api=11.8
fi
# Force the current session to use the CUDA 11.8 nvcc/libraries from the current env.
export CUDA_HOME="$CONDA_PREFIX"
export CUDA_PATH="$CUDA_HOME"
export TORCH_CUDA_HOME="$CUDA_HOME"
export CUDACXX="$CUDA_HOME/bin/nvcc"
export PATH="$CUDA_HOME/bin:$PATH"
# Append the previous LD_LIBRARY_PATH only when it is non-empty: a trailing ':'
# is an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
hash -r  # flush the shell's remembered command locations so the new PATH is used
echo "==> nvcc should now be 11.8:"
# 'command -v' is a shell builtin, so it works even where 'which' is not installed.
command -v nvcc
nvcc --version
# Architecture: 3090 = sm_86
export TORCH_CUDA_ARCH_LIST="8.6"
# Build dependencies (installed with mamba).
echo "==> Build deps"
mamba install -y -c conda-forge cmake ninja pybind11 libaio git
pip install -U pip setuptools wheel
# Fetch the DeepSpeed sources, pinned to tag v0.14.3.
echo "==> Clone DeepSpeed (if not exists)"
# Abort if the workspace directory is unavailable; otherwise the clone and the
# steps that follow would run in whatever directory the script started in.
cd "${HOME}/train/new" || { echo "ERROR: cannot cd to ${HOME}/train/new" >&2; exit 1; }
[ -d DeepSpeed ] || git clone --branch v0.14.3 https://github.com/microsoft/DeepSpeed.git
@ -48,13 +53,14 @@ cd DeepSpeed
# Remove any previous DeepSpeed installation and the old build directory.
# The uninstall is best-effort: a failure here must not stop the script.
pip uninstall -y deepspeed || :
rm -rf ./build
# Enable the training-related kernels.
export DS_BUILD_OPS=1 \
       DS_BUILD_AIO=1 \
       DS_BUILD_FUSED_ADAM=1 \
       DS_BUILD_CPU_ADAM=1
# Inference/transformer kernels stay off for now to reduce compatibility risk.
# export DS_BUILD_TRANSFORMER=1
echo "==> Build & install DeepSpeed"