From 2e06ad0ccf801ae7cedee28f235cdadd7a7b9231 Mon Sep 17 00:00:00 2001 From: hailin Date: Fri, 8 Aug 2025 20:34:33 +0800 Subject: [PATCH] . --- .../install_deepspeed_src_mamba_single_gpu.sh | 89 +++++++++---------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh b/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh index a6618ee..8b61681 100644 --- a/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh +++ b/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -euo pipefail -echo "==> Show Python / Torch / CUDA" +echo "==> Python/Torch baseline" python - <<'PY' import sys, torch print("Python:", sys.version.split()[0]) @@ -10,66 +10,70 @@ print("Torch CUDA tag:", torch.version.cuda) print("CUDA available:", torch.cuda.is_available()) PY -# 选择 CUDA 工具链:优先 /usr/local/cuda-11.8,其次现有的 /usr/local/cuda-12.8 -if [ -d /usr/local/cuda-11.8 ]; then - export CUDA_HOME=/usr/local/cuda-11.8 - echo "==> Using CUDA_HOME=${CUDA_HOME} (preferred for cu118)" -elif [ -d /usr/local/cuda-12.8 ]; then - export CUDA_HOME=/usr/local/cuda-12.8 - echo "==> Using CUDA_HOME=${CUDA_HOME} (nvcc 12.8)" +# 尝试用 nvidia 官方 11.8 频道;失败则走 conda-forge 逐组件 +echo "==> Installing CUDA 11.8 toolchain into current env..." +if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then + echo "Installed cuda-toolkit (nvidia channel)." else - echo "!! 未找到 /usr/local/cuda-11.8 或 /usr/local/cuda-12.8,请安装 CUDA toolkit (dev)。" - exit 1 + echo "Fallback to conda-forge components..." + mamba install -y -c conda-forge \ + cuda-nvcc=11.8 cuda-version=11.8 \ + cuda-cudart-dev=11.8 libnvjitlink=11.8 \ + libcublas-dev=11.8 libcufft-dev=11.8 \ + libcurand-dev=11.8 libcusolver-dev=11.8 libcusparse-dev=11.8 \ + cuda-profiler-api=11.8 fi -export PATH="${CUDA_HOME}/bin:${PATH}" -echo "==> nvcc version" -nvcc --version || { echo "nvcc not found via CUDA_HOME=${CUDA_HOME}"; exit 1; } +# 强制当前会话使用 env 里的 11.8 nvcc/库 +export CUDA_HOME="$CONDA_PREFIX" +export PATH="$CUDA_HOME/bin:$PATH" +export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}" -# 3090 = sm_86 +echo "==> nvcc should now be 11.8:" +nvcc --version + +# 架构:3090 = sm_86 export TORCH_CUDA_ARCH_LIST="8.6" -echo "==> Install build deps via mamba" +# 编译依赖(用 mamba 提速) +echo "==> Build deps" mamba install -y -c conda-forge cmake ninja pybind11 libaio git - -echo "==> Upgrade Python build tools" pip install -U pip setuptools wheel -# 固定到较稳的 DeepSpeed tag;需要最新版可改为 --branch master -DS_TAG="v0.14.3" -if [ ! -d DeepSpeed ]; then - git clone --branch ${DS_TAG} https://github.com/microsoft/DeepSpeed.git -fi +# 获取 DeepSpeed 源码(固定较稳 tag) +echo "==> Clone DeepSpeed (if not exists)" +cd "${HOME}/train/new" +[ -d DeepSpeed ] || git clone --branch v0.14.3 https://github.com/microsoft/DeepSpeed.git cd DeepSpeed -echo "==> Build & Install DeepSpeed (training kernels only)" +# 清理旧安装 +pip uninstall -y deepspeed || true + +# 仅启用训练相关内核 export DS_BUILD_OPS=1 export DS_BUILD_AIO=1 export DS_BUILD_FUSED_ADAM=1 export DS_BUILD_CPU_ADAM=1 -# 不启用推理/transformer内核,降低不必要的编译/兼容风险 +# 推理/transformer 内核先关,减少兼容风险 # export DS_BUILD_TRANSFORMER=1 -# 某些环境需要强制找 CUDA:export DS_FORCE_CUDA=1 -# export DS_FORCE_CUDA=1 - +echo "==> Build & install DeepSpeed" pip install . -echo "==> Verify DeepSpeed env" +echo "==> Verify DeepSpeed" python -m deepspeed.env_report - -echo "==> Smoke test: import FusedAdam" python - <<'PY' import deepspeed, torch from deepspeed.ops.adam import FusedAdam print("DeepSpeed:", deepspeed.__version__) print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda) -print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None) +print("GPU0:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None) print("FusedAdam OK") PY -echo "==> Create minimal HF Trainer single-GPU test (tiny model)" -cd .. +# 单机单卡最小训练验证 +echo "==> Prepare single-GPU minimal HF training" +cd "${HOME}/train/new" cat > ds_config_stage2_single.json <<'JSON' { "train_batch_size": 8, @@ -96,15 +100,12 @@ from datasets import Dataset from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling import torch -model_name = "sshleifer/tiny-gpt2" # 极小模型,快速验证 +model_name = "sshleifer/tiny-gpt2" tok = AutoTokenizer.from_pretrained(model_name) -if tok.pad_token is None: - tok.pad_token = tok.eos_token +if tok.pad_token is None: tok.pad_token = tok.eos_token -data = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200 -ds = Dataset.from_dict({"text": data}) -def enc(e): return tok(e["text"], truncation=True, max_length=128) -ds = ds.map(enc) +texts = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200 +ds = Dataset.from_dict({"text": texts}).map(lambda e: tok(e["text"], truncation=True, max_length=128)) collator = DataCollatorForLanguageModeling(tok, mlm=False) model = AutoModelForCausalLM.from_pretrained(model_name) @@ -119,13 +120,11 @@ args = TrainingArguments( fp16=True, deepspeed="ds_config_stage2_single.json" ) -trainer = Trainer(model=model, args=args, tokenizer=tok, - train_dataset=ds, data_collator=collator) -trainer.train() +Trainer(model=model, args=args, tokenizer=tok, train_dataset=ds, data_collator=collator).train() print("OK: single-GPU training finished.") PY -echo "==> Run minimal single-GPU training with DeepSpeed Stage-2" +echo "==> Run single-GPU test (CUDA_VISIBLE_DEVICES=0)" CUDA_VISIBLE_DEVICES=0 python train_single_gpu_min.py -echo "==> ALL DONE (single-GPU)." +echo "=== DONE: DeepSpeed built with CUDA 11.8 and single-GPU test passed ==="