train_env_prepare/dspeed/install_deepspeed_src_mamba...

306 lines
10 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
set -euo pipefail
# =============================
# Config (every value can be overridden via environment variables)
# =============================
REPO_URL="${REPO_URL:-https://github.com/microsoft/DeepSpeed.git}"
REPO_BRANCH="${REPO_BRANCH:-v0.17.4}"
REPO_DIR="${REPO_DIR:-DeepSpeed}" # checkout lives under "<current dir>/REPO_DIR"
CUDA_VERSION_PIN="${CUDA_VERSION_PIN:-11.8}" # expect both Torch's CUDA tag and nvcc to be 11.8
ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-}" # empty => auto-detect from visible GPUs
PY_MIN="${PY_MIN:-3.8}" # minimum supported Python version
AIO_THREADS="${AIO_THREADS:-1}" # thread_count written into the DeepSpeed "aio" config
# =============================
# Utils
# =============================
# Print a bold-cyan "==> message" banner to stdout.
log() {
  printf '\n\033[1;36m==> %s\033[0m\n' "$*"
}
# Print a bold-red "[ERROR] message" to STDERR and abort with status 1.
# Fixes: the original used non-portable `echo -e` and wrote diagnostics to
# stdout, polluting any captured output; printf to stderr is the correct form.
die() {
  printf '\n\033[1;31m[ERROR]\033[0m %s\n' "$*" >&2
  exit 1
}
# Abort via die() when the named executable is not on PATH.
need_bin() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "缺少可执行程序: $1"
  fi
}
# =============================
# 0. Preflight checks
# =============================
need_bin python
need_bin pip
need_bin git
# Prefer mamba; fall back to conda. One of the two is required to install the
# CUDA toolchain and build dependencies into the active environment.
CONDA_BIN=""
for _pkg_mgr in mamba conda; do
  if command -v "$_pkg_mgr" >/dev/null 2>&1; then
    CONDA_BIN="$_pkg_mgr"
    break
  fi
done
[[ -n "$CONDA_BIN" ]] || die "没有找到 mamba/conda无法安装 CUDA 工具链和构建依赖。"
# Python/Torch baseline report (this also fails early if torch is not importable).
log "Python/Torch baseline"
python - <<'PY'
import sys, torch
print("Python:", sys.version.split()[0])
print("Torch:", getattr(torch, "__version__", "n/a"))
print("Torch CUDA tag:", getattr(torch.version, "cuda", "n/a"))
print("CUDA available:", torch.cuda.is_available())
PY
# Capture the interpreter's "major.minor" version for the PY_MIN check below.
PY_CUR=$(python - <<'PY'
import sys
print(".".join(map(str, sys.version_info[:2])))
PY
)
# NOTE(review): this requires the "packaging" package to be importable in the
# target env — TODO confirm (it usually ships alongside pip/setuptools).
python - <<PY
from packaging.version import Version
assert Version("${PY_CUR}") >= Version("${PY_MIN}"), "Python 过旧: ${PY_CUR} < ${PY_MIN}"
print("Python OK")
PY
# Hard-stop if the installed torch wheel was built for a CUDA version other
# than the pin (e.g. a non-cu118 wheel while CUDA_VERSION_PIN=11.8).
TORCH_CUDA_TAG=$(python - <<'PY'
import torch
print(getattr(torch.version, "cuda", ""))
PY
)
if [[ -n "${CUDA_VERSION_PIN}" && -n "${TORCH_CUDA_TAG}" && "${TORCH_CUDA_TAG}" != "${CUDA_VERSION_PIN}" ]]; then
die "当前 PyTorch CUDA tag=${TORCH_CUDA_TAG},与期望 ${CUDA_VERSION_PIN} 不一致。换成匹配的 torch 或修改 CUDA_VERSION_PIN。"
fi
# =============================
# 1. Install the CUDA toolchain into the active conda environment
# =============================
log "Installing CUDA ${CUDA_VERSION_PIN} toolchain into current conda env..."
# Temporarily disable -e so a failing nvidia-channel install can fall through
# to the conda-forge component path instead of aborting the script.
set +e
${CONDA_BIN} install -y -c "nvidia/label/cuda-${CUDA_VERSION_PIN}.0" cuda-toolkit
RC=$?
set -e
if [[ $RC -ne 0 ]]; then
log "nvidia channel 失败,回退 conda-forge 组件安装..."
# NOTE(review): conda 'pkg="11.8"' pins are fuzzy matches (11.8.*) — confirm
# conda-forge carries all of these package names for this CUDA series.
${CONDA_BIN} install -y -c conda-forge \
cuda-nvcc="${CUDA_VERSION_PIN}" cuda-version="${CUDA_VERSION_PIN}" \
cuda-cudart-dev="${CUDA_VERSION_PIN}" libnvjitlink="${CUDA_VERSION_PIN}" \
libcublas-dev="${CUDA_VERSION_PIN}" libcufft-dev="${CUDA_VERSION_PIN}" \
libcurand-dev="${CUDA_VERSION_PIN}" libcusolver-dev="${CUDA_VERSION_PIN}" libcusparse-dev="${CUDA_VERSION_PIN}" \
cuda-profiler-api="${CUDA_VERSION_PIN}"
fi
# =============================
# 2. Point the CUDA environment variables at the current conda prefix
# =============================
if [[ -z "${CONDA_PREFIX:-}" ]]; then
  die "CONDA_PREFIX 未设置。请在目标 conda/mamba 环境里执行本脚本。"
fi
# After step 1, nvcc, headers and libraries all live under the conda prefix.
export CUDA_HOME="${CONDA_PREFIX}"
export CUDA_PATH="${CUDA_HOME}"
export TORCH_CUDA_HOME="${CUDA_HOME}"
export CUDACXX="${CUDA_HOME}/bin/nvcc"
export PATH="${CUDA_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH:-}"
hash -r # drop bash's cached command locations so the env's nvcc is found
log "nvcc version check"
need_bin nvcc
nvcc_out=$(nvcc --version | tail -n1 || true)
printf '%s\n' "$nvcc_out"
# -F matches the pin as a fixed string: "11.8" contains '.', which plain grep
# treats as "any character" (so e.g. "1178" would spuriously match).
printf '%s\n' "$nvcc_out" | grep -qF -- "${CUDA_VERSION_PIN}" || die "nvcc 版本与期望 ${CUDA_VERSION_PIN} 不匹配。"
# =============================
# 3. Choose / auto-detect TORCH_CUDA_ARCH_LIST
# =============================
if [[ -z "${ARCH_LIST}" ]]; then
  # Probe the compute capability of every visible GPU via torch.
  # (The pasted heredoc had lost its Python indentation, which made the
  # embedded script an IndentationError at runtime; restored here.)
  ARCH_LIST=$(python - <<'PY'
import torch
if torch.cuda.is_available():
    cc = set()
    for i in range(torch.cuda.device_count()):
        cc.add("{}.{}".format(*torch.cuda.get_device_capability(i)))
    print(";".join(sorted(cc)))
else:
    print("")
PY
)
  # Fall back to 8.6 (e.g. RTX 3090 / A6000 class) when nothing can be probed.
  if [[ -z "${ARCH_LIST}" ]]; then
    ARCH_LIST="8.6"
  fi
fi
export TORCH_CUDA_ARCH_LIST="${ARCH_LIST}"
log "TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}"
# =============================
# 4. Build dependencies (compilers/helpers needed to compile DeepSpeed ops)
# =============================
log "Build deps"
${CONDA_BIN} install -y -c conda-forge cmake ninja pybind11 libaio git
pip install -U pip setuptools wheel
# =============================
# 5. Fetch / refresh the DeepSpeed source (under the current directory)
# =============================
log "Clone/refresh DeepSpeed repo"
WORKDIR="$(pwd)"
REPO_PATH="${WORKDIR}/${REPO_DIR}"
if git -C "${REPO_PATH}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  # Existing git repo: verify the remote, then force it onto the target ref.
  cur_remote=$(git -C "${REPO_PATH}" remote get-url origin || echo "")
  if [[ "${cur_remote}" != "${REPO_URL}" ]]; then
    log "remote 不匹配,重建仓库"
    rm -rf "${REPO_PATH}"
    git clone --branch "${REPO_BRANCH}" --depth 1 --recurse-submodules --shallow-submodules "${REPO_URL}" "${REPO_PATH}"
  else
    log "更新已有仓库到 ${REPO_BRANCH}"
    git -C "${REPO_PATH}" fetch --depth 1 origin "${REPO_BRANCH}"
    # Detach onto the fetched commit. FETCH_HEAD works for both branches and
    # tags (REPO_BRANCH defaults to the tag v0.17.4); the previous
    # `reset --hard "origin/${REPO_BRANCH}"` fails for tags because
    # "origin/<tag>" is not a remote-tracking ref.
    git -C "${REPO_PATH}" checkout -f --detach FETCH_HEAD
    git -C "${REPO_PATH}" clean -fdx
    # Sync and update submodules to the commits recorded at this ref.
    git -C "${REPO_PATH}" submodule sync --recursive
    git -C "${REPO_PATH}" submodule update --init --recursive --depth 1
  fi
else
  # Missing or not a git repo: re-clone from scratch (with submodules).
  rm -rf "${REPO_PATH}"
  git clone --branch "${REPO_BRANCH}" --depth 1 --recurse-submodules --shallow-submodules "${REPO_URL}" "${REPO_PATH}"
fi
# Belt and braces: make sure submodules are in place on every path above.
git -C "${REPO_PATH}" submodule sync --recursive
git -C "${REPO_PATH}" submodule update --init --recursive --depth 1
# Install ds-kernels (the CUTLASS kernel dependency used by DeepSpeed Inference)
DK_DIR=""
# Locate the submodule path from .gitmodules (the name differs across DeepSpeed
# versions: "ds-kernels" vs "dskernels").
if [[ -f "${REPO_PATH}/.gitmodules" ]]; then
DK_DIR=$(git -C "${REPO_PATH}" config --file .gitmodules --get-regexp path | awk '/ds[-_]?kernels/{print $2; exit}' || true)
fi
if [[ -n "${DK_DIR}" && -f "${REPO_PATH}/${DK_DIR}/setup.py" ]]; then
log "Install ds-kernels from submodule: ${DK_DIR}"
pip install -v "${REPO_PATH}/${DK_DIR}"
else
log "Submodule ds-kernels 不存在或未包含构建脚本,直接从仓库安装"
pip install -v "git+https://github.com/microsoft/ds-kernels.git"
fi
# Basic sanity check: the source tree must contain a build file.
test -f "${REPO_PATH}/pyproject.toml" -o -f "${REPO_PATH}/setup.py" || die "DeepSpeed 源码目录缺少构建文件。"
# =============================
# 6. Build & install DeepSpeed
# =============================
log "Build & install DeepSpeed"
cd "${REPO_PATH}"
pip uninstall -y deepspeed >/dev/null 2>&1 || true
rm -rf build
# -- Pre-build setup (strongly recommended) --
python - <<'PY'
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "pip", "setuptools", "wheel", "ninja"])
PY
# Parallel compilation for speed (tune to the CPU core count).
export MAX_JOBS=${MAX_JOBS:-$(nproc)}
# -- DeepSpeed feature switches --
# Pre-build the CUDA/C++ extensions at install time (required for this flow).
export DS_BUILD_OPS=1
# Commonly used training kernels.
# NOTE(review): DS_BUILD_SPARSE_ATTN needs a compatible triton version and
# DS_BUILD_FLASH_ATTN has its own GPU-arch constraints — confirm both actually
# build on this torch/CUDA stack before relying on them.
export DS_BUILD_TRANSFORMER=1
export DS_BUILD_SPARSE_ATTN=1
export DS_BUILD_FLASH_ATTN=1
# Fused/CPU optimizers (strongly recommended).
export DS_BUILD_FUSED_ADAM=1
export DS_BUILD_CPU_ADAM=1
# Optional: async I/O (Linux only; Windows does not support AIO/GDS).
export DS_BUILD_AIO=1
export DS_BUILD_INFERENCE=0
export DS_BUILD_CUTLASS=0
# Install with verbose build logs.
pip install -v .
# =============================
# 7. Verify the installation
# =============================
log "Verify DeepSpeed installation"
python -m deepspeed.env_report || die "deepspeed.env_report 失败"
# Import check + GPU report. (The pasted heredoc had lost its Python
# indentation — the `if` body was flush-left, an IndentationError at runtime —
# restored here.)
python - <<'PY'
import deepspeed, torch
from deepspeed.ops.adam import FusedAdam
print("DeepSpeed:", deepspeed.__version__)
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
print("GPU count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU0:", torch.cuda.get_device_name(0))
print("FusedAdam import OK")
PY
# =============================
# 8. Minimal single-GPU training check (HF Trainer + ZeRO-2)
# =============================
log "Prepare and run minimal single-GPU training"
cd "${WORKDIR}"
# Write the ZeRO stage-2 config. The heredoc delimiter is unquoted so the
# shell expands ${AIO_THREADS}. Batch invariant: 8 = 1 micro-batch x 8 accum
# steps x 1 GPU. (Indentation restored — the pasted JSON was flush-left.)
cat > ds_config_stage2_single.json <<JSON
{
  "train_batch_size": 8,
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 8,
  "zero_optimization": { "stage": 2, "overlap_comm": true, "contiguous_gradients": true },
  "fp16": { "enabled": true },
  "aio": {
    "block_size": 1048576,
    "queue_depth": 16,
    "thread_count": ${AIO_THREADS},
    "single_submit": false,
    "overlap_events": true,
    "verbose": false
  },
  "gradient_clipping": 1.0,
  "steps_per_print": 1000,
  "wall_clock_breakdown": false
}
JSON
# Generate a minimal HF Trainer script: tiny GPT-2 on a toy text dataset,
# driven by the DeepSpeed ZeRO-2 JSON config written above. The quoted
# delimiter ('PY') keeps the script content literal (no shell expansion).
cat > train_single_gpu_min.py <<'PY'
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
model_name = "sshleifer/tiny-gpt2"
tok = AutoTokenizer.from_pretrained(model_name)
if tok.pad_token is None: tok.pad_token = tok.eos_token
texts = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
ds = Dataset.from_dict({"text": texts}).map(lambda e: tok(e["text"], truncation=True, max_length=128))
collator = DataCollatorForLanguageModeling(tok, mlm=False)
model = AutoModelForCausalLM.from_pretrained(model_name)
args = TrainingArguments(
output_dir="out-ds-single",
per_device_train_batch_size=1,
gradient_accumulation_steps=8,
learning_rate=5e-4,
num_train_epochs=1,
logging_steps=10,
save_steps=0,
fp16=True,
deepspeed="ds_config_stage2_single.json"
)
Trainer(model=model, args=args, tokenizer=tok, train_dataset=ds, data_collator=collator).train()
print("OK: single-GPU training finished.")
PY
# NOTE(review): some transformers releases reject TrainingArguments(save_steps=0)
# (must be > 0; save_strategy="no" is the modern way to disable saving) and the
# `tokenizer=` kwarg is deprecated in newer versions — confirm against the
# installed transformers version.
CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0} python train_single_gpu_min.py
log "DONE: DeepSpeed built with CUDA ${CUDA_VERSION_PIN} and single-GPU test passed"