#!/usr/bin/env bash
#
# Build and install DeepSpeed from source inside the currently active
# conda/mamba environment, against a pinned CUDA toolchain.
#
# This part of the script covers: preflight checks, installing the CUDA
# toolchain, exporting the CUDA build environment, choosing the GPU
# architecture list, installing build dependencies, cloning/refreshing the
# DeepSpeed sources and building them with pip.
#
# Environment overrides (all optional):
#   REPO_URL              git URL of the DeepSpeed repository
#   REPO_BRANCH           branch OR tag to build (the default is a tag)
#   REPO_DIR              checkout directory, relative to the current directory
#   CUDA_VERSION_PIN      CUDA version expected from both torch and nvcc
#   TORCH_CUDA_ARCH_LIST  compute capabilities to build for; auto-detected
#                         through torch when empty
#   PY_MIN                minimum accepted Python version (major.minor)
#   AIO_THREADS           substituted into the DeepSpeed JSON config that is
#                         generated later in this file
set -euo pipefail

# =============================
# Config (overridable through environment variables)
# =============================
REPO_URL="${REPO_URL:-https://github.com/microsoft/DeepSpeed.git}"
REPO_BRANCH="${REPO_BRANCH:-v0.14.3}"
REPO_DIR="${REPO_DIR:-DeepSpeed}"
CUDA_VERSION_PIN="${CUDA_VERSION_PIN:-11.8}"
ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-}"
PY_MIN="${PY_MIN:-3.8}"
AIO_THREADS="${AIO_THREADS:-1}"

# =============================
# Utils
# =============================

# Print a highlighted progress banner on stdout.
log() {
  printf '\n\033[1;36m==> %s\033[0m\n' "$*"
}

# Print an error on stderr and abort the script with status 1.
# printf '%s' is used so backslashes in the message are never interpreted.
die() {
  printf '\n\033[1;31m[ERROR]\033[0m %s\n' "$*" >&2
  exit 1
}

# Abort unless the executable named by $1 can be found on PATH.
need_bin() {
  command -v "$1" >/dev/null 2>&1 || die "缺少可执行程序: $1"
}

# =============================
# 0. Preflight checks
# =============================
need_bin python
need_bin pip
need_bin git

if command -v mamba >/dev/null 2>&1; then
  CONDA_BIN="mamba"
elif command -v conda >/dev/null 2>&1; then
  CONDA_BIN="conda"
else
  die "没有找到 mamba/conda,无法安装 CUDA 工具链和构建依赖。"
fi

# CONDA_PREFIX becomes CUDA_HOME further down. Verify it before any package
# is installed so a run outside an activated environment changes nothing.
if [[ -z "${CONDA_PREFIX:-}" ]]; then
  die "CONDA_PREFIX 未设置。请在目标 conda/mamba 环境里执行本脚本。"
fi

# REPO_DIR is passed to `rm -rf` below; refuse values that would resolve to
# the working directory itself or climb out of it.
case "${REPO_DIR}" in
  . | ./ | .. | ../* | */.. | */../*)
    die "REPO_DIR 非法: ${REPO_DIR}"
    ;;
esac

# Python/Torch baseline report (also fails early when torch is not importable).
log "Python/Torch baseline"
python - <<'PY'
import sys, torch
print("Python:", sys.version.split()[0])
print("Torch:", getattr(torch, "__version__", "n/a"))
print("Torch CUDA tag:", getattr(torch.version, "cuda", "n/a"))
print("CUDA available:", torch.cuda.is_available())
PY

# Minimum Python version. Only the standard library is used, so the check
# cannot fail because of a missing third-party package.
PY_CUR="$(python -c 'import sys; print("%d.%d" % sys.version_info[:2])')"
python - "${PY_MIN}" <<'PY' || die "Python 过旧: ${PY_CUR} < ${PY_MIN}"
import sys
required = tuple(int(part) for part in sys.argv[1].split(".")[:2])
if sys.version_info[:2] < required:
    sys.exit(1)
print("Python OK")
PY

# Refuse to continue when the installed torch wheel targets another CUDA.
TORCH_CUDA_TAG="$(
  python - <<'PY'
import torch
print(getattr(torch.version, "cuda", ""))
PY
)"
if [[ -n "${CUDA_VERSION_PIN}" && -n "${TORCH_CUDA_TAG}" \
  && "${TORCH_CUDA_TAG}" != "${CUDA_VERSION_PIN}" ]]; then
  die "当前 PyTorch CUDA tag=${TORCH_CUDA_TAG},与期望 ${CUDA_VERSION_PIN} 不一致。换成匹配的 torch 或修改 CUDA_VERSION_PIN。"
fi

# =============================
# 1. Install the CUDA toolchain into the conda environment
# =============================
log "Installing CUDA ${CUDA_VERSION_PIN} toolchain into current conda env..."
# A failure of the nvidia channel is expected to be recoverable, so it is
# tested with `if !` instead of toggling `set -e` around the command.
if ! "${CONDA_BIN}" install -y \
  -c "nvidia/label/cuda-${CUDA_VERSION_PIN}.0" cuda-toolkit; then
  log "nvidia channel 失败,回退 conda-forge 组件安装..."
  "${CONDA_BIN}" install -y -c conda-forge \
    cuda-nvcc="${CUDA_VERSION_PIN}" \
    cuda-version="${CUDA_VERSION_PIN}" \
    cuda-cudart-dev="${CUDA_VERSION_PIN}" \
    libnvjitlink="${CUDA_VERSION_PIN}" \
    libcublas-dev="${CUDA_VERSION_PIN}" \
    libcufft-dev="${CUDA_VERSION_PIN}" \
    libcurand-dev="${CUDA_VERSION_PIN}" \
    libcusolver-dev="${CUDA_VERSION_PIN}" \
    libcusparse-dev="${CUDA_VERSION_PIN}" \
    cuda-profiler-api="${CUDA_VERSION_PIN}"
fi

# =============================
# 2. Point the CUDA build environment at the conda prefix
# =============================
export CUDA_HOME="${CONDA_PREFIX}"
export CUDA_PATH="${CUDA_HOME}"
export TORCH_CUDA_HOME="${CUDA_HOME}"
export CUDACXX="${CUDA_HOME}/bin/nvcc"
export PATH="${CUDA_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH:-}"
hash -r # forget cached command locations so the freshly installed nvcc wins

log "nvcc version check"
need_bin nvcc
nvcc_out="$(nvcc --version | tail -n1 || true)"
printf '%s\n' "${nvcc_out}"
# -F: compare the pin literally; as a regex its "." would match any character.
grep -qF -- "${CUDA_VERSION_PIN}" <<<"${nvcc_out}" \
  || die "nvcc 版本与期望 ${CUDA_VERSION_PIN} 不匹配。"

# =============================
# 3. Choose / detect TORCH_CUDA_ARCH_LIST
# =============================
if [[ -z "${ARCH_LIST}" ]]; then
  # Ask torch for the compute capability of every visible GPU.
  ARCH_LIST="$(
    python - <<'PY'
import torch
if torch.cuda.is_available():
    cc=set()
    for i in range(torch.cuda.device_count()):
        cc.add("{}.{}".format(*torch.cuda.get_device_capability(i)))
    print(";".join(sorted(cc)))
else:
    print("")
PY
  )"
  # Nothing detected: fall back to 8.6 (the original script targets an RTX 3090).
  if [[ -z "${ARCH_LIST}" ]]; then
    ARCH_LIST="8.6"
  fi
fi
export TORCH_CUDA_ARCH_LIST="${ARCH_LIST}"
log "TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}"

# =============================
# 4. Build dependencies
# =============================
log "Build deps"
"${CONDA_BIN}" install -y -c conda-forge cmake ninja pybind11 libaio git
pip install -U pip setuptools wheel

# =============================
# 5. Fetch / refresh the DeepSpeed sources (under the current directory)
# =============================
log "Clone/refresh DeepSpeed repo"
WORKDIR="$(pwd)"
REPO_PATH="${WORKDIR}/${REPO_DIR}"

# Remove whatever is at REPO_PATH and make a fresh shallow clone of the target.
clone_repo() {
  rm -rf -- "${REPO_PATH:?}"
  git clone --branch "${REPO_BRANCH}" --depth 1 "${REPO_URL}" "${REPO_PATH}"
}

# Requiring REPO_PATH/.git ensures REPO_PATH is itself the checkout. Without
# it, rev-parse would also succeed for a plain directory nested inside some
# other repository, and the reset below would then act on that outer repo.
if [[ -e "${REPO_PATH}/.git" ]] \
  && git -C "${REPO_PATH}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  cur_remote="$(git -C "${REPO_PATH}" remote get-url origin 2>/dev/null || true)"
  if [[ "${cur_remote}" != "${REPO_URL}" ]]; then
    log "remote 不匹配,重建仓库"
    clone_repo
  else
    log "更新已有仓库到 ${REPO_BRANCH}"
    git -C "${REPO_PATH}" fetch --depth 1 origin "${REPO_BRANCH}"
    # FETCH_HEAD is valid for branches and tags alike. "origin/<name>" only
    # exists for branches, so resetting to it failed for the default tag.
    git -C "${REPO_PATH}" checkout -f --detach FETCH_HEAD
    git -C "${REPO_PATH}" clean -fdx
  fi
else
  # Missing, or not a checkout of its own: clone from scratch.
  clone_repo
fi

# Sanity check: the tree must contain something pip can build.
[[ -f "${REPO_PATH}/pyproject.toml" || -f "${REPO_PATH}/setup.py" ]] \
  || die "DeepSpeed 源码目录缺少构建文件。"

# =============================
# 6. Build and install DeepSpeed
# =============================
log "Build & install DeepSpeed"
cd "${REPO_PATH}"
# Best effort: there may be no previous installation to remove.
pip uninstall -y deepspeed >/dev/null 2>&1 || true
rm -rf build
export DS_BUILD_OPS=1
export DS_BUILD_AIO=1
export DS_BUILD_FUSED_ADAM=1
export DS_BUILD_CPU_ADAM=1
# For the inference kernels as well: export DS_BUILD_TRANSFORMER=1

# Verbose build output makes compiler failures easier to diagnose.
pip install -v .
验证安装 +# ============================= +log "Verify DeepSpeed installation" +python -m deepspeed.env_report || die "deepspeed.env_report 失败" -echo "==> Verify DeepSpeed" -python -m deepspeed.env_report python - <<'PY' import deepspeed, torch from deepspeed.ops.adam import FusedAdam print("DeepSpeed:", deepspeed.__version__) print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda) -print("GPU0:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None) -print("FusedAdam OK") +print("GPU count:", torch.cuda.device_count()) +if torch.cuda.is_available(): + print("GPU0:", torch.cuda.get_device_name(0)) +print("FusedAdam import OK") PY -# 单机单卡最小训练验证 -echo "==> Prepare single-GPU minimal HF training" -cd "${HOME}/train/new" -cat > ds_config_stage2_single.json <<'JSON' +# ============================= +# 8. 最小单卡训练验证(HF Trainer + ZeRO-2) +# ============================= +log "Prepare and run minimal single-GPU training" +cd "${WORKDIR}" +cat > ds_config_stage2_single.json < ds_config_stage2_single.json <<'JSON' "aio": { "block_size": 1048576, "queue_depth": 16, - "thread_count": 1, + "thread_count": ${AIO_THREADS}, "single_submit": false, "overlap_events": true, "verbose": false @@ -130,7 +254,6 @@ Trainer(model=model, args=args, tokenizer=tok, train_dataset=ds, data_collator=c print("OK: single-GPU training finished.") PY -echo "==> Run single-GPU test (CUDA_VISIBLE_DEVICES=0)" -CUDA_VISIBLE_DEVICES=0 python train_single_gpu_min.py +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0} python train_single_gpu_min.py -echo "=== DONE: DeepSpeed built with CUDA 11.8 and single-GPU test passed ===" +log "DONE: DeepSpeed built with CUDA ${CUDA_VERSION_PIN} and single-GPU test passed"