This commit is contained in:
parent
24aa1bc9bc
commit
5df186d033
|
|
@ -1,86 +1,210 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
echo "==> Python/Torch baseline"
|
# =============================
|
||||||
|
# Config(可用环境变量覆盖)
|
||||||
|
# =============================
|
||||||
|
REPO_URL="${REPO_URL:-https://github.com/microsoft/DeepSpeed.git}"
|
||||||
|
REPO_BRANCH="${REPO_BRANCH:-v0.14.3}"
|
||||||
|
REPO_DIR="${REPO_DIR:-DeepSpeed}" # 将在“当前目录/REPO_DIR”下操作
|
||||||
|
CUDA_VERSION_PIN="${CUDA_VERSION_PIN:-11.8}" # 期望 Torch 的 CUDA tag 和 nvcc 都是 11.8
|
||||||
|
ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-}" # 为空则自动探测
|
||||||
|
PY_MIN="${PY_MIN:-3.8}" # 最低 Python 版本
|
||||||
|
AIO_THREADS="${AIO_THREADS:-1}"
|
||||||
|
|
||||||
|
# =============================
|
||||||
|
# Utils
|
||||||
|
# =============================
|
||||||
|
# Print a bold-cyan "==> <message>" section banner on stdout.
# Arguments: $* - message words, joined into one string.
log() {
  local banner_text="$*"
  printf "\n\033[1;36m==> %s\033[0m\n" "${banner_text}"
}
|
||||||
|
# Print a red "[ERROR]" banner plus the message on stderr, then exit with status 1.
# Arguments: $* - message words, joined into one string and printed literally
#            (backslash sequences in the message are not interpreted).
# Diagnostics go to stderr so they are not swallowed by stdout redirection or $(...) capture.
die(){ printf '\n\033[1;31m[ERROR]\033[0m %s\n' "$*" >&2; exit 1; }
|
||||||
|
|
||||||
|
need_bin(){
|
||||||
|
command -v "$1" >/dev/null 2>&1 || die "缺少可执行程序: $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
# =============================
|
||||||
|
# 0. 前置检查
|
||||||
|
# =============================
|
||||||
|
need_bin python
|
||||||
|
need_bin pip
|
||||||
|
need_bin git
|
||||||
|
|
||||||
|
if command -v mamba >/dev/null 2>&1; then
|
||||||
|
CONDA_BIN="mamba"
|
||||||
|
elif command -v conda >/dev/null 2>&1; then
|
||||||
|
CONDA_BIN="conda"
|
||||||
|
else
|
||||||
|
die "没有找到 mamba/conda,无法安装 CUDA 工具链和构建依赖。"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Python/Torch 基线
|
||||||
|
log "Python/Torch baseline"
|
||||||
python - <<'PY'
|
python - <<'PY'
|
||||||
import sys, torch
|
import sys, torch
|
||||||
print("Python:", sys.version.split()[0])
|
print("Python:", sys.version.split()[0])
|
||||||
print("Torch:", torch.__version__)
|
print("Torch:", getattr(torch, "__version__", "n/a"))
|
||||||
print("Torch CUDA tag:", torch.version.cuda)
|
print("Torch CUDA tag:", getattr(torch.version, "cuda", "n/a"))
|
||||||
print("CUDA available:", torch.cuda.is_available())
|
print("CUDA available:", torch.cuda.is_available())
|
||||||
PY
|
PY
|
||||||
|
|
||||||
# 安装 CUDA 11.8 工具链
|
# 检查 Python 版本
|
||||||
echo "==> Installing CUDA 11.8 toolchain into current env..."
|
PY_CUR=$(python - <<'PY'
|
||||||
if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then
|
import sys
|
||||||
echo "Installed cuda-toolkit (nvidia channel)."
|
print(".".join(map(str, sys.version_info[:2])))
|
||||||
else
|
PY
|
||||||
echo "Fallback to conda-forge components..."
|
)
|
||||||
mamba install -y -c conda-forge \
|
python - <<PY
|
||||||
cuda-nvcc=11.8 cuda-version=11.8 \
|
from packaging.version import Version
|
||||||
cuda-cudart-dev=11.8 libnvjitlink=11.8 \
|
assert Version("${PY_CUR}") >= Version("${PY_MIN}"), "Python 过旧: ${PY_CUR} < ${PY_MIN}"
|
||||||
libcublas-dev=11.8 libcufft-dev=11.8 \
|
print("Python OK")
|
||||||
libcurand-dev=11.8 libcusolver-dev=11.8 libcusparse-dev=11.8 \
|
PY
|
||||||
cuda-profiler-api=11.8
|
|
||||||
|
# 如果安装的是 torch+cu118 以外的轮子,强制提醒
|
||||||
|
TORCH_CUDA_TAG=$(python - <<'PY'
|
||||||
|
import torch
|
||||||
|
print(getattr(torch.version, "cuda", ""))
|
||||||
|
PY
|
||||||
|
)
|
||||||
|
if [[ -n "${CUDA_VERSION_PIN}" && -n "${TORCH_CUDA_TAG}" && "${TORCH_CUDA_TAG}" != "${CUDA_VERSION_PIN}" ]]; then
|
||||||
|
die "当前 PyTorch CUDA tag=${TORCH_CUDA_TAG},与期望 ${CUDA_VERSION_PIN} 不一致。换成匹配的 torch 或修改 CUDA_VERSION_PIN。"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 强制用当前环境里的 CUDA 11.8
|
# =============================
|
||||||
export CUDA_HOME="$CONDA_PREFIX"
|
# 1. 安装 CUDA 11.8 工具链(进 conda 环境)
|
||||||
export CUDA_PATH="$CUDA_HOME"
|
# =============================
|
||||||
export TORCH_CUDA_HOME="$CUDA_HOME"
|
log "Installing CUDA ${CUDA_VERSION_PIN} toolchain into current conda env..."
|
||||||
export CUDACXX="$CUDA_HOME/bin/nvcc"
|
set +e
|
||||||
export PATH="$CUDA_HOME/bin:$PATH"
|
${CONDA_BIN} install -y -c "nvidia/label/cuda-${CUDA_VERSION_PIN}.0" cuda-toolkit
|
||||||
export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
|
RC=$?
|
||||||
hash -r # 刷新 shell 命令缓存
|
set -e
|
||||||
|
if [[ $RC -ne 0 ]]; then
|
||||||
|
log "nvidia channel 失败,回退 conda-forge 组件安装..."
|
||||||
|
${CONDA_BIN} install -y -c conda-forge \
|
||||||
|
cuda-nvcc="${CUDA_VERSION_PIN}" cuda-version="${CUDA_VERSION_PIN}" \
|
||||||
|
cuda-cudart-dev="${CUDA_VERSION_PIN}" libnvjitlink="${CUDA_VERSION_PIN}" \
|
||||||
|
libcublas-dev="${CUDA_VERSION_PIN}" libcufft-dev="${CUDA_VERSION_PIN}" \
|
||||||
|
libcurand-dev="${CUDA_VERSION_PIN}" libcusolver-dev="${CUDA_VERSION_PIN}" libcusparse-dev="${CUDA_VERSION_PIN}" \
|
||||||
|
cuda-profiler-api="${CUDA_VERSION_PIN}"
|
||||||
|
fi
|
||||||
|
|
||||||
echo "==> nvcc should now be 11.8:"
|
# =============================
|
||||||
which nvcc
|
# 2. 设置 CUDA 环境变量(指向当前 conda 前缀)
|
||||||
nvcc --version
|
# =============================
|
||||||
|
if [[ -z "${CONDA_PREFIX:-}" ]]; then
|
||||||
|
die "CONDA_PREFIX 未设置。请在目标 conda/mamba 环境里执行本脚本。"
|
||||||
|
fi
|
||||||
|
export CUDA_HOME="${CONDA_PREFIX}"
|
||||||
|
export CUDA_PATH="${CUDA_HOME}"
|
||||||
|
export TORCH_CUDA_HOME="${CUDA_HOME}"
|
||||||
|
export CUDACXX="${CUDA_HOME}/bin/nvcc"
|
||||||
|
export PATH="${CUDA_HOME}/bin:${PATH}"
|
||||||
|
export LD_LIBRARY_PATH="${CUDA_HOME}/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH:-}"
|
||||||
|
hash -r
|
||||||
|
|
||||||
# 架构:3090 = sm_86
|
log "nvcc version check"
|
||||||
export TORCH_CUDA_ARCH_LIST="8.6"
|
need_bin nvcc
|
||||||
|
nvcc_out=$(nvcc --version | tail -n1 || true)
|
||||||
|
echo "$nvcc_out"
|
||||||
|
# Fixed-string match: as a regex, the "." in a pin such as "11.8" would match any character.
grep -qF -- "${CUDA_VERSION_PIN}" <<<"${nvcc_out}" || die "nvcc 版本与期望 ${CUDA_VERSION_PIN} 不匹配。"
|
||||||
|
|
||||||
# 编译依赖
|
# =============================
|
||||||
echo "==> Build deps"
|
# 3. 选择/推断 TORCH_CUDA_ARCH_LIST
|
||||||
mamba install -y -c conda-forge cmake ninja pybind11 libaio git
|
# =============================
|
||||||
|
if [[ -z "${ARCH_LIST}" ]]; then
|
||||||
|
# 尝试用 torch 探测
|
||||||
|
ARCH_LIST=$(python - <<'PY'
|
||||||
|
import torch
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
cc=set()
|
||||||
|
for i in range(torch.cuda.device_count()):
|
||||||
|
cc.add("{}.{}".format(*torch.cuda.get_device_capability(i)))
|
||||||
|
print(";".join(sorted(cc)))
|
||||||
|
else:
|
||||||
|
print("")
|
||||||
|
PY
|
||||||
|
)
|
||||||
|
# 无法探测就默认 8.6(3090)
|
||||||
|
[[ -z "${ARCH_LIST}" ]] && ARCH_LIST="8.6"
|
||||||
|
fi
|
||||||
|
export TORCH_CUDA_ARCH_LIST="${ARCH_LIST}"
|
||||||
|
log "TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}"
|
||||||
|
|
||||||
|
# =============================
|
||||||
|
# 4. 构建依赖
|
||||||
|
# =============================
|
||||||
|
log "Build deps"
|
||||||
|
${CONDA_BIN} install -y -c conda-forge cmake ninja pybind11 libaio git
|
||||||
pip install -U pip setuptools wheel
|
pip install -U pip setuptools wheel
|
||||||
|
|
||||||
# 获取 DeepSpeed 源码
|
# =============================
|
||||||
echo "==> Clone DeepSpeed (if not exists)"
|
# 5. 获取/刷新 DeepSpeed 源码(在当前目录)
|
||||||
cd "${HOME}/train/new"
|
# =============================
|
||||||
[ -d DeepSpeed ] || git clone --branch v0.14.3 https://github.com/microsoft/DeepSpeed.git
|
log "Clone/refresh DeepSpeed repo"
|
||||||
cd DeepSpeed
|
WORKDIR="$(pwd)"
|
||||||
|
REPO_PATH="${WORKDIR}/${REPO_DIR}"
|
||||||
|
|
||||||
# 清理旧安装
|
if git -C "${REPO_PATH}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
|
||||||
pip uninstall -y deepspeed || true
|
# 已存在且是 git 仓库:校验 remote 和分支,强制到目标分支
|
||||||
|
cur_remote=$(git -C "${REPO_PATH}" remote get-url origin || echo "")
|
||||||
|
if [[ "${cur_remote}" != "${REPO_URL}" ]]; then
|
||||||
|
log "remote 不匹配,重建仓库"
|
||||||
|
rm -rf "${REPO_PATH}"
|
||||||
|
git clone --branch "${REPO_BRANCH}" --depth 1 "${REPO_URL}" "${REPO_PATH}"
|
||||||
|
else
|
||||||
|
log "更新已有仓库到 ${REPO_BRANCH}"
|
||||||
|
git -C "${REPO_PATH}" fetch --depth 1 origin "${REPO_BRANCH}"
|
||||||
|
git -C "${REPO_PATH}" checkout -f "${REPO_BRANCH}"
|
||||||
|
# Reset to the ref that was just fetched. FETCH_HEAD resolves for branches and tags alike,
# whereas "origin/<name>" only exists for branches (the default REPO_BRANCH is a tag).
git -C "${REPO_PATH}" reset --hard FETCH_HEAD
|
||||||
|
git -C "${REPO_PATH}" clean -fdx
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
# 不存在或不是仓库:重拉
|
||||||
|
rm -rf "${REPO_PATH}"
|
||||||
|
git clone --branch "${REPO_BRANCH}" --depth 1 "${REPO_URL}" "${REPO_PATH}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 基本健检
|
||||||
|
# Sanity check: the checkout must contain at least one Python build entry point.
[[ -f "${REPO_PATH}/pyproject.toml" || -f "${REPO_PATH}/setup.py" ]] || die "DeepSpeed 源码目录缺少构建文件。"
|
||||||
|
|
||||||
|
# =============================
|
||||||
|
# 6. 编译安装 DeepSpeed
|
||||||
|
# =============================
|
||||||
|
log "Build & install DeepSpeed"
|
||||||
|
cd "${REPO_PATH}"
|
||||||
|
pip uninstall -y deepspeed >/dev/null 2>&1 || true
|
||||||
rm -rf build
|
rm -rf build
|
||||||
|
|
||||||
# 启用训练相关内核
|
|
||||||
export DS_BUILD_OPS=1
|
export DS_BUILD_OPS=1
|
||||||
export DS_BUILD_AIO=1
|
export DS_BUILD_AIO=1
|
||||||
export DS_BUILD_FUSED_ADAM=1
|
export DS_BUILD_FUSED_ADAM=1
|
||||||
export DS_BUILD_CPU_ADAM=1
|
export DS_BUILD_CPU_ADAM=1
|
||||||
# 推理/transformer 内核先关
|
# 如需推理内核:export DS_BUILD_TRANSFORMER=1
|
||||||
# export DS_BUILD_TRANSFORMER=1
|
|
||||||
|
|
||||||
echo "==> Build & install DeepSpeed"
|
# 提供更详细日志便于排错
|
||||||
pip install .
|
pip install -v .
|
||||||
|
|
||||||
|
# =============================
|
||||||
|
# 7. 验证安装
|
||||||
|
# =============================
|
||||||
|
log "Verify DeepSpeed installation"
|
||||||
|
python -m deepspeed.env_report || die "deepspeed.env_report 失败"
|
||||||
|
|
||||||
echo "==> Verify DeepSpeed"
|
|
||||||
python -m deepspeed.env_report
|
|
||||||
python - <<'PY'
|
python - <<'PY'
|
||||||
import deepspeed, torch
|
import deepspeed, torch
|
||||||
from deepspeed.ops.adam import FusedAdam
|
from deepspeed.ops.adam import FusedAdam
|
||||||
print("DeepSpeed:", deepspeed.__version__)
|
print("DeepSpeed:", deepspeed.__version__)
|
||||||
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
|
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
|
||||||
print("GPU0:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
|
print("GPU count:", torch.cuda.device_count())
|
||||||
print("FusedAdam OK")
|
if torch.cuda.is_available():
|
||||||
|
print("GPU0:", torch.cuda.get_device_name(0))
|
||||||
|
print("FusedAdam import OK")
|
||||||
PY
|
PY
|
||||||
|
|
||||||
# 单机单卡最小训练验证
|
# =============================
|
||||||
echo "==> Prepare single-GPU minimal HF training"
|
# 8. 最小单卡训练验证(HF Trainer + ZeRO-2)
|
||||||
cd "${HOME}/train/new"
|
# =============================
|
||||||
cat > ds_config_stage2_single.json <<'JSON'
|
log "Prepare and run minimal single-GPU training"
|
||||||
|
cd "${WORKDIR}"
|
||||||
|
cat > ds_config_stage2_single.json <<JSON
|
||||||
{
|
{
|
||||||
"train_batch_size": 8,
|
"train_batch_size": 8,
|
||||||
"train_micro_batch_size_per_gpu": 1,
|
"train_micro_batch_size_per_gpu": 1,
|
||||||
|
|
@ -90,7 +214,7 @@ cat > ds_config_stage2_single.json <<'JSON'
|
||||||
"aio": {
|
"aio": {
|
||||||
"block_size": 1048576,
|
"block_size": 1048576,
|
||||||
"queue_depth": 16,
|
"queue_depth": 16,
|
||||||
"thread_count": 1,
|
"thread_count": ${AIO_THREADS},
|
||||||
"single_submit": false,
|
"single_submit": false,
|
||||||
"overlap_events": true,
|
"overlap_events": true,
|
||||||
"verbose": false
|
"verbose": false
|
||||||
|
|
@ -130,7 +254,6 @@ Trainer(model=model, args=args, tokenizer=tok, train_dataset=ds, data_collator=c
|
||||||
print("OK: single-GPU training finished.")
|
print("OK: single-GPU training finished.")
|
||||||
PY
|
PY
|
||||||
|
|
||||||
echo "==> Run single-GPU test (CUDA_VISIBLE_DEVICES=0)"
|
CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0} python train_single_gpu_min.py
|
||||||
CUDA_VISIBLE_DEVICES=0 python train_single_gpu_min.py
|
|
||||||
|
|
||||||
echo "=== DONE: DeepSpeed built with CUDA 11.8 and single-GPU test passed ==="
|
log "DONE: DeepSpeed built with CUDA ${CUDA_VERSION_PIN} and single-GPU test passed"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue