This commit is contained in:
parent
2e06ad0ccf
commit
d437e96505
|
|
@ -10,7 +10,7 @@ print("Torch CUDA tag:", torch.version.cuda)
|
|||
print("CUDA available:", torch.cuda.is_available())
|
||||
PY
|
||||
|
||||
# 尝试用 nvidia 官方 11.8 频道;失败则走 conda-forge 逐组件
|
||||
# 安装 CUDA 11.8 工具链
|
||||
# Progress banner printed before the CUDA 11.8 toolchain install is attempted.
echo "==> Installing CUDA 11.8 toolchain into current env..."
|
||||
if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then
|
||||
echo "Installed cuda-toolkit (nvidia channel)."
|
||||
|
|
@ -24,23 +24,28 @@ else
|
|||
cuda-profiler-api=11.8
|
||||
fi
|
||||
|
||||
# 强制当前会话使用 env 里的 11.8 nvcc/库
|
||||
# 强制用当前环境里的 CUDA 11.8
|
||||
# Point the CUDA-related variables at the active conda environment prefix.
# NOTE(review): assumes a conda env is activated so CONDA_PREFIX is set and
# non-empty — confirm; otherwise the paths below collapse to /bin, /lib, ...
export CUDA_HOME="$CONDA_PREFIX"
|
||||
export CUDA_PATH="$CUDA_HOME"
|
||||
export TORCH_CUDA_HOME="$CUDA_HOME"
|
||||
# Full path of the nvcc binary inside that prefix.
export CUDACXX="$CUDA_HOME/bin/nvcc"
|
||||
# Prepend, so the prefix's bin directory is searched before the rest of PATH.
export PATH="$CUDA_HOME/bin:$PATH"
|
||||
# Prepend the prefix's library directories. The previous value is appended
# only when it is non-empty: a trailing ':' would create an empty entry, which
# the dynamic loader interprets as the current working directory.
export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
|
||||
hash -r # forget the shell's remembered command locations so the new PATH is used
|
||||
|
||||
echo "==> nvcc should now be 11.8:"
|
||||
# Print the nvcc that this shell resolves. `command -v` is a builtin, so it
# needs no external `which` program and reflects the shell's own lookup.
command -v nvcc
|
||||
nvcc --version
|
||||
|
||||
# 架构:3090 = sm_86
|
||||
# Architecture list handed to the build: 8.6 (3090 = sm_86).
TORCH_CUDA_ARCH_LIST="8.6"
export TORCH_CUDA_ARCH_LIST
|
||||
|
||||
# 编译依赖(用 mamba 提速)
|
||||
# 编译依赖
|
||||
# Install the build prerequisites: native tools from the conda-forge channel,
# then the Python packaging tools.
echo "==> Build deps"
|
||||
mamba install -y -c conda-forge cmake ninja pybind11 libaio git
|
||||
# Upgrade pip itself together with setuptools and wheel.
pip install -U pip setuptools wheel
|
||||
|
||||
# 获取 DeepSpeed 源码(固定较稳 tag)
|
||||
# 获取 DeepSpeed 源码
|
||||
echo "==> Clone DeepSpeed (if not exists)"
|
||||
# Enter the directory that holds the DeepSpeed checkout. Stop if it cannot be
# entered: the later `git clone` and `rm -rf build` must not run somewhere else.
# `return` covers the case where this file is sourced; `exit` covers execution.
cd "${HOME}/train/new" || {
  echo "ERROR: cannot cd to ${HOME}/train/new" >&2
  return 1 2>/dev/null || exit 1
}
|
||||
# Clone the v0.14.3 tag only when no DeepSpeed directory exists yet; an
# existing directory is reused as-is, whatever revision it holds.
[ -d DeepSpeed ] || git clone --branch v0.14.3 https://github.com/microsoft/DeepSpeed.git
|
||||
|
|
@ -48,13 +53,14 @@ cd DeepSpeed
|
|||
|
||||
# 清理旧安装
|
||||
# Remove any installed deepspeed package without prompting; `|| true` lets the
# script continue even if the uninstall command exits non-zero.
pip uninstall -y deepspeed || true
|
||||
# Delete the local build directory, if there is one.
rm -rf build
|
||||
|
||||
# 仅启用训练相关内核
|
||||
# 启用训练相关内核
|
||||
# Build switches exported for the DeepSpeed build/install that follows.
# NOTE(review): DS_BUILD_OPS=1 presumably turns every op on by default, in
# which case leaving DS_BUILD_TRANSFORMER commented out does not keep the
# transformer kernels off — confirm against DeepSpeed's installation docs.
export DS_BUILD_OPS=1
|
||||
export DS_BUILD_AIO=1
|
||||
export DS_BUILD_FUSED_ADAM=1
|
||||
export DS_BUILD_CPU_ADAM=1
|
||||
# 推理/transformer 内核先关,减少兼容风险
|
||||
# 推理/transformer 内核先关
|
||||
# export DS_BUILD_TRANSFORMER=1
|
||||
|
||||
echo "==> Build & install DeepSpeed"
|
||||
|
|
|
|||
Loading…
Reference in New Issue