This commit is contained in:
hailin 2025-08-08 20:37:13 +08:00
parent 2e06ad0ccf
commit d437e96505
1 changed file with 12 additions and 6 deletions

View File

@@ -10,7 +10,7 @@ print("Torch CUDA tag:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available()) print("CUDA available:", torch.cuda.is_available())
PY PY
# 尝试用 nvidia 官方 11.8 频道;失败则走 conda-forge 逐组件 # 安装 CUDA 11.8 工具链
echo "==> Installing CUDA 11.8 toolchain into current env..." echo "==> Installing CUDA 11.8 toolchain into current env..."
if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then
echo "Installed cuda-toolkit (nvidia channel)." echo "Installed cuda-toolkit (nvidia channel)."
@@ -24,23 +24,28 @@ else
cuda-profiler-api=11.8 cuda-profiler-api=11.8
fi fi
# 强制当前会话使用 env 里的 11.8 nvcc/库 # 强制用当前环境里的 CUDA 11.8
export CUDA_HOME="$CONDA_PREFIX" export CUDA_HOME="$CONDA_PREFIX"
export CUDA_PATH="$CUDA_HOME"
export TORCH_CUDA_HOME="$CUDA_HOME"
export CUDACXX="$CUDA_HOME/bin/nvcc"
export PATH="$CUDA_HOME/bin:$PATH" export PATH="$CUDA_HOME/bin:$PATH"
export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}" export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
hash -r # 刷新 shell 命令缓存
echo "==> nvcc should now be 11.8:" echo "==> nvcc should now be 11.8:"
which nvcc
nvcc --version nvcc --version
# 架构3090 = sm_86 # 架构3090 = sm_86
export TORCH_CUDA_ARCH_LIST="8.6" export TORCH_CUDA_ARCH_LIST="8.6"
# 编译依赖(用 mamba 提速) # 编译依赖
echo "==> Build deps" echo "==> Build deps"
mamba install -y -c conda-forge cmake ninja pybind11 libaio git mamba install -y -c conda-forge cmake ninja pybind11 libaio git
pip install -U pip setuptools wheel pip install -U pip setuptools wheel
# 获取 DeepSpeed 源码(固定较稳 tag # 获取 DeepSpeed 源码
echo "==> Clone DeepSpeed (if not exists)" echo "==> Clone DeepSpeed (if not exists)"
cd "${HOME}/train/new" cd "${HOME}/train/new"
[ -d DeepSpeed ] || git clone --branch v0.14.3 https://github.com/microsoft/DeepSpeed.git [ -d DeepSpeed ] || git clone --branch v0.14.3 https://github.com/microsoft/DeepSpeed.git
@@ -48,13 +53,14 @@ cd DeepSpeed
# 清理旧安装 # 清理旧安装
pip uninstall -y deepspeed || true pip uninstall -y deepspeed || true
rm -rf build
# 启用训练相关内核 # 启用训练相关内核
export DS_BUILD_OPS=1 export DS_BUILD_OPS=1
export DS_BUILD_AIO=1 export DS_BUILD_AIO=1
export DS_BUILD_FUSED_ADAM=1 export DS_BUILD_FUSED_ADAM=1
export DS_BUILD_CPU_ADAM=1 export DS_BUILD_CPU_ADAM=1
# 推理/transformer 内核先关,减少兼容风险 # 推理/transformer 内核先关
# export DS_BUILD_TRANSFORMER=1 # export DS_BUILD_TRANSFORMER=1
echo "==> Build & install DeepSpeed" echo "==> Build & install DeepSpeed"