#!/usr/bin/env bash
set -euo pipefail

# =============================
# Config (override via environment variables)
# =============================
REPO_URL="${REPO_URL:-https://github.com/microsoft/DeepSpeed.git}"
REPO_BRANCH="${REPO_BRANCH:-v0.17.4}"
REPO_DIR="${REPO_DIR:-DeepSpeed}"             # the repo is checked out under "<current dir>/REPO_DIR"
CUDA_VERSION_PIN="${CUDA_VERSION_PIN:-11.8}"  # both the Torch CUDA tag and nvcc are expected to be 11.8
ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-}"         # auto-detected when empty
PY_MIN="${PY_MIN:-3.8}"                       # minimum Python version
AIO_THREADS="${AIO_THREADS:-1}"
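
# Every setting above can be overridden from the caller's environment; a
# typical invocation (illustrative only; the filename is whatever you saved
# this script as):
#   CUDA_VERSION_PIN=11.8 TORCH_CUDA_ARCH_LIST="8.0;8.6" bash install_deepspeed_cu118.sh
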
# =============================
# Utils
# =============================
log(){ printf "\n\033[1;36m==> %s\033[0m\n" "$*"; }
die(){ printf "\n\033[1;31m[ERROR]\033[0m %s\n" "$*"; exit 1; }

need_bin(){
  command -v "$1" >/dev/null 2>&1 || die "missing executable: $1"
}

# =============================
# 0. Preflight checks
# =============================
need_bin python
need_bin pip
need_bin git

if command -v mamba >/dev/null 2>&1; then
  CONDA_BIN="mamba"
elif command -v conda >/dev/null 2>&1; then
  CONDA_BIN="conda"
else
  die "Neither mamba nor conda found; cannot install the CUDA toolchain and build dependencies."
fi

# Python/Torch baseline
log "Python/Torch baseline"
python - <<'PY'
import sys, torch
print("Python:", sys.version.split()[0])
print("Torch:", getattr(torch, "__version__", "n/a"))
print("Torch CUDA tag:", getattr(torch.version, "cuda", "n/a"))
print("CUDA available:", torch.cuda.is_available())
PY

# Check the Python version
PY_CUR=$(python - <<'PY'
import sys
print(".".join(map(str, sys.version_info[:2])))
PY
)
# Unquoted heredoc: the shell expands ${PY_CUR} and ${PY_MIN} before Python runs.
python - <<PY
from packaging.version import Version
assert Version("${PY_CUR}") >= Version("${PY_MIN}"), "Python too old: ${PY_CUR} < ${PY_MIN}"
print("Python OK")
PY

# Hard-stop if the installed wheel is anything other than torch+cu118
TORCH_CUDA_TAG=$(python - <<'PY'
import torch
print(getattr(torch.version, "cuda", ""))
PY
)
if [[ -n "${CUDA_VERSION_PIN}" && -n "${TORCH_CUDA_TAG}" && "${TORCH_CUDA_TAG}" != "${CUDA_VERSION_PIN}" ]]; then
  die "Current PyTorch CUDA tag=${TORCH_CUDA_TAG} does not match the expected ${CUDA_VERSION_PIN}. Install a matching torch or change CUDA_VERSION_PIN."
fi
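
# Why the pin matters: torch's C++ extension builder compares nvcc's CUDA
# version against the tag the torch wheel was built with and fails the build
# on a mismatch, so a cu118 wheel needs an 11.8 nvcc for the DeepSpeed ops.
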
# =============================
# 1. Install the CUDA 11.8 toolchain (into the current conda env)
# =============================
log "Installing CUDA ${CUDA_VERSION_PIN} toolchain into current conda env..."
set +e
${CONDA_BIN} install -y -c "nvidia/label/cuda-${CUDA_VERSION_PIN}.0" cuda-toolkit
RC=$?
set -e
if [[ $RC -ne 0 ]]; then
  log "nvidia channel failed; falling back to per-component install from conda-forge..."
  ${CONDA_BIN} install -y -c conda-forge \
    cuda-nvcc="${CUDA_VERSION_PIN}" cuda-version="${CUDA_VERSION_PIN}" \
    cuda-cudart-dev="${CUDA_VERSION_PIN}" libnvjitlink="${CUDA_VERSION_PIN}" \
    libcublas-dev="${CUDA_VERSION_PIN}" libcufft-dev="${CUDA_VERSION_PIN}" \
    libcurand-dev="${CUDA_VERSION_PIN}" libcusolver-dev="${CUDA_VERSION_PIN}" libcusparse-dev="${CUDA_VERSION_PIN}" \
    cuda-profiler-api="${CUDA_VERSION_PIN}"
fi
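
# Note: the nvidia/label/cuda-* channel ships a single monolithic
# cuda-toolkit metapackage, while the conda-forge fallback assembles the
# equivalent from individual packages (the names above are the conda-forge
# ones; availability of a given version pin can vary by platform).
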
# =============================
# 2. Point the CUDA environment variables at the current conda prefix
# =============================
if [[ -z "${CONDA_PREFIX:-}" ]]; then
  die "CONDA_PREFIX is not set. Run this script inside the target conda/mamba environment."
fi
export CUDA_HOME="${CONDA_PREFIX}"
export CUDA_PATH="${CUDA_HOME}"
export TORCH_CUDA_HOME="${CUDA_HOME}"
export CUDACXX="${CUDA_HOME}/bin/nvcc"
export PATH="${CUDA_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH:-}"
hash -r
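
# CUDA_HOME is what torch.utils.cpp_extension (and therefore DeepSpeed's
# op_builder) consults to locate nvcc and the CUDA headers/libraries, so
# from here on the whole toolchain resolves inside the conda prefix.
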
log "nvcc version check"
|
||
need_bin nvcc
|
||
nvcc_out=$(nvcc --version | tail -n1 || true)
|
||
echo "$nvcc_out"
|
||
echo "$nvcc_out" | grep -q "${CUDA_VERSION_PIN}" || die "nvcc 版本与期望 ${CUDA_VERSION_PIN} 不匹配。"
|
||
|
||
# =============================
# 3. Choose/infer TORCH_CUDA_ARCH_LIST
# =============================
if [[ -z "${ARCH_LIST}" ]]; then
  # Try to detect the compute capabilities via torch
  ARCH_LIST=$(python - <<'PY'
import torch
if torch.cuda.is_available():
    cc = set()
    for i in range(torch.cuda.device_count()):
        cc.add("{}.{}".format(*torch.cuda.get_device_capability(i)))
    print(";".join(sorted(cc)))
else:
    print("")
PY
)
  # Fall back to 8.6 (RTX 3090) when detection fails
  [[ -z "${ARCH_LIST}" ]] && ARCH_LIST="8.6"
fi
export TORCH_CUDA_ARCH_LIST="${ARCH_LIST}"
log "TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}"
# =============================
# 4. Build dependencies
# =============================
log "Build deps"
${CONDA_BIN} install -y -c conda-forge cmake ninja pybind11 libaio git
pip install -U pip setuptools wheel
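
# libaio is what DeepSpeed's async_io op links against (needed for the
# ZeRO-Offload/NVMe paths); cmake/ninja/pybind11 cover the C++/CUDA
# extension builds.
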
# =============================
# 5. Fetch/refresh the DeepSpeed source (in the current directory)
# =============================
log "Clone/refresh DeepSpeed repo"
WORKDIR="$(pwd)"
REPO_PATH="${WORKDIR}/${REPO_DIR}"

if git -C "${REPO_PATH}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  cur_remote=$(git -C "${REPO_PATH}" remote get-url origin || echo "")
  if [[ "${cur_remote}" != "${REPO_URL}" ]]; then
    log "origin does not match REPO_URL; recreating the repo"
    rm -rf "${REPO_PATH}"
    git clone --branch "${REPO_BRANCH}" --depth 1 "${REPO_URL}" "${REPO_PATH}"
  else
    log "Updating existing repo to ${REPO_BRANCH}"
    # REPO_BRANCH may be a tag (the default v0.17.4 is one), and origin/<name>
    # only exists for branches, so check out FETCH_HEAD instead.
    git -C "${REPO_PATH}" fetch --depth 1 origin "${REPO_BRANCH}"
    git -C "${REPO_PATH}" checkout -f FETCH_HEAD
    git -C "${REPO_PATH}" clean -fdx
  fi
else
  rm -rf "${REPO_PATH}"
  git clone --branch "${REPO_BRANCH}" --depth 1 "${REPO_URL}" "${REPO_PATH}"
fi

# =============================
# 6. Build & install DeepSpeed (training-only, no inference components)
# =============================
log "Build & install DeepSpeed (training only, no inference ops)"

# Install the libcurand dev package first (it provides the unversioned
# libcurand.so the linker looks for)
set +e
${CONDA_BIN} install -y -c "nvidia/label/cuda-${CUDA_VERSION_PIN}.0" libcurand-dev
RC=$?
set -e
if [[ $RC -ne 0 ]]; then
  log "libcurand-dev unavailable from the nvidia channel; falling back to conda-forge"
  ${CONDA_BIN} install -y -c conda-forge libcurand libcurand-dev
fi

# The linker needs to be able to find these libraries
export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib64:${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH:-}"
export LIBRARY_PATH="${CONDA_PREFIX}/lib64:${CONDA_PREFIX}/lib:${LIBRARY_PATH:-}"

# Skip all inference-only ops
export DS_BUILD_INFERENCE=0
export DS_BUILD_CUTLASS=0
export DS_BUILD_QUANTIZER=0
export DS_BUILD_FP_QUANTIZER=0
export DS_BUILD_EVOFORMER_ATTN=0
export DS_BUILD_GDS=0

# Keep the kernels needed for training
export DS_BUILD_TRANSFORMER=1
export DS_BUILD_SPARSE_ATTN=1
export DS_BUILD_FLASH_ATTN=1
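
# By default DeepSpeed JIT-compiles each op the first time it is used;
# DS_BUILD_* flags set at pip-install time prebuild (or exclude) ops in the
# wheel instead, and DS_BUILD_OPS=1 would prebuild everything.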
pip install -v "${REPO_PATH}"

# =============================
# 7. Verify the installation
# =============================
log "Verify DeepSpeed installation"
python -m deepspeed.env_report || die "deepspeed.env_report failed"
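# (The ds_report console script that DeepSpeed installs prints the same
# report, so `ds_report` works here too.)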

python - <<'PY'
import deepspeed, torch
from deepspeed.ops.adam import FusedAdam
print("DeepSpeed:", deepspeed.__version__)
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
print("GPU count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU0:", torch.cuda.get_device_name(0))
print("FusedAdam import OK")
PY

# =============================
# 8. Minimal single-GPU training check (HF Trainer + ZeRO-2)
# =============================
log "Prepare and run minimal single-GPU training"
cd "${WORKDIR}"
cat > ds_config_stage2_single.json <<JSON
{
  "train_batch_size": 8,
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 8,
  "zero_optimization": { "stage": 2, "overlap_comm": true, "contiguous_gradients": true },
  "fp16": { "enabled": true },
  "aio": {
    "block_size": 1048576,
    "queue_depth": 16,
    "thread_count": ${AIO_THREADS},
    "single_submit": false,
    "overlap_events": true,
    "verbose": false
  },
  "gradient_clipping": 1.0,
  "steps_per_print": 1000,
  "wall_clock_breakdown": false
}
JSON
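
# Sanity check on the batch math DeepSpeed enforces:
#   train_batch_size = micro_batch_per_gpu * gradient_accumulation_steps * world_size
#                    = 1 * 8 * 1 = 8 on a single GPU, matching the config above.
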
cat > train_single_gpu_min.py <<'PY'
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

model_name = "sshleifer/tiny-gpt2"
tok = AutoTokenizer.from_pretrained(model_name)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

texts = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
ds = Dataset.from_dict({"text": texts}).map(lambda e: tok(e["text"], truncation=True, max_length=128))
collator = DataCollatorForLanguageModeling(tok, mlm=False)

model = AutoModelForCausalLM.from_pretrained(model_name)
args = TrainingArguments(
    output_dir="out-ds-single",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=5e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="no",  # save_steps=0 would break the step-based save schedule
    fp16=True,
    deepspeed="ds_config_stage2_single.json",
)
Trainer(model=model, args=args, tokenizer=tok, train_dataset=ds, data_collator=collator).train()
print("OK: single-GPU training finished.")
PY

CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0} python train_single_gpu_min.py
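
# For a multi-GPU run you would launch through the DeepSpeed launcher
# instead, e.g. `deepspeed --num_gpus=2 train_single_gpu_min.py`; the batch
# math above then scales with world size, so train_batch_size would need to
# become 16 for 2 GPUs at the same micro-batch and accumulation settings.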

log "DONE: DeepSpeed built with CUDA ${CUDA_VERSION_PIN} and single-GPU test passed"