From 2e06ad0ccf801ae7cedee28f235cdadd7a7b9231 Mon Sep 17 00:00:00 2001
From: hailin <hailin@gdzx.xyz>
Date: Fri, 8 Aug 2025 20:34:33 +0800
Subject: [PATCH] DeepSpeed: install CUDA 11.8 toolchain into the env via mamba for the single-GPU build

---
 .../install_deepspeed_src_mamba_single_gpu.sh | 89 +++++++++----------
 1 file changed, 44 insertions(+), 45 deletions(-)

diff --git a/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh b/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh
index a6618ee..8b61681 100644
--- a/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh
+++ b/DeepSpeed/install_deepspeed_src_mamba_single_gpu.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
-echo "==> Show Python / Torch / CUDA"
+echo "==> Python/Torch baseline"
 python - <<'PY'
 import sys, torch
 print("Python:", sys.version.split()[0])
@@ -10,66 +10,70 @@ print("Torch CUDA tag:", torch.version.cuda)
 print("CUDA available:", torch.cuda.is_available())
 PY
 
-# 选择 CUDA 工具链：优先 /usr/local/cuda-11.8，其次现有的 /usr/local/cuda-12.8
-if [ -d /usr/local/cuda-11.8 ]; then
-  export CUDA_HOME=/usr/local/cuda-11.8
-  echo "==> Using CUDA_HOME=${CUDA_HOME} (preferred for cu118)"
-elif [ -d /usr/local/cuda-12.8 ]; then
-  export CUDA_HOME=/usr/local/cuda-12.8
-  echo "==> Using CUDA_HOME=${CUDA_HOME} (nvcc 12.8)"
+# Try the official nvidia 11.8 channel first; if that fails, install the individual components from conda-forge
+echo "==> Installing CUDA 11.8 toolchain into current env..."
+if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then
+  echo "Installed cuda-toolkit (nvidia channel)."
 else
-  echo "!! 未找到 /usr/local/cuda-11.8 或 /usr/local/cuda-12.8，请安装 CUDA toolkit (dev)。"
-  exit 1
+  echo "Fallback to conda-forge components..."
+  mamba install -y -c conda-forge \
+    cuda-nvcc=11.8 cuda-version=11.8 \
+    cuda-cudart-dev=11.8 libnvjitlink=11.8 \
+    libcublas-dev=11.8 libcufft-dev=11.8 \
+    libcurand-dev=11.8 libcusolver-dev=11.8 libcusparse-dev=11.8 \
+    cuda-profiler-api=11.8
 fi
-export PATH="${CUDA_HOME}/bin:${PATH}"
 
-echo "==> nvcc version"
-nvcc --version || { echo "nvcc not found via CUDA_HOME=${CUDA_HOME}"; exit 1; }
+# Force the current session to use the 11.8 nvcc and libraries from the active env
+export CUDA_HOME="$CONDA_PREFIX"
+export PATH="$CUDA_HOME/bin:$PATH"
+export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
 
-# 3090 = sm_86
+echo "==> nvcc should now be 11.8:"
+nvcc --version
+
+# Architecture: 3090 = sm_86
 export TORCH_CUDA_ARCH_LIST="8.6"
 
-echo "==> Install build deps via mamba"
+# Build dependencies (installed with mamba for speed)
+echo "==> Build deps"
 mamba install -y -c conda-forge cmake ninja pybind11 libaio git
-
-echo "==> Upgrade Python build tools"
 pip install -U pip setuptools wheel
 
-# 固定到较稳的 DeepSpeed tag；需要最新版可改为 --branch master
-DS_TAG="v0.14.3"
-if [ ! -d DeepSpeed ]; then
-  git clone --branch ${DS_TAG} https://github.com/microsoft/DeepSpeed.git
-fi
+# Fetch the DeepSpeed source (pinned to a relatively stable tag)
+echo "==> Clone DeepSpeed (if not exists)"
+cd "${HOME}/train/new"
+[ -d DeepSpeed ] || git clone --branch v0.14.3 https://github.com/microsoft/DeepSpeed.git
 cd DeepSpeed
 
-echo "==> Build & Install DeepSpeed (training kernels only)"
+# Remove any previous installation
+pip uninstall -y deepspeed || true
+
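+# Intended: enable only training-related kernels. NOTE(review): DS_BUILD_OPS=1 appears to pre-build all compatible ops — confirm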
 export DS_BUILD_OPS=1
 export DS_BUILD_AIO=1
 export DS_BUILD_FUSED_ADAM=1
 export DS_BUILD_CPU_ADAM=1
-# 不启用推理/transformer内核，降低不必要的编译/兼容风险
+# Keep inference/transformer kernels off for now to reduce compatibility risk
 # export DS_BUILD_TRANSFORMER=1
 
-# 某些环境需要强制找 CUDA：export DS_FORCE_CUDA=1
-# export DS_FORCE_CUDA=1
-
+echo "==> Build & install DeepSpeed"
 pip install .
 
-echo "==> Verify DeepSpeed env"
+echo "==> Verify DeepSpeed"
 python -m deepspeed.env_report
-
-echo "==> Smoke test: import FusedAdam"
 python - <<'PY'
 import deepspeed, torch
 from deepspeed.ops.adam import FusedAdam
 print("DeepSpeed:", deepspeed.__version__)
 print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
-print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
+print("GPU0:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
 print("FusedAdam OK")
 PY
 
-echo "==> Create minimal HF Trainer single-GPU test (tiny model)"
-cd ..
+# Minimal single-node, single-GPU training check
+echo "==> Prepare single-GPU minimal HF training"
+cd "${HOME}/train/new"
 cat > ds_config_stage2_single.json <<'JSON'
 {
   "train_batch_size": 8,
@@ -96,15 +100,12 @@ from datasets import Dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
 import torch
 
-model_name = "sshleifer/tiny-gpt2"   # 极小模型，快速验证
+model_name = "sshleifer/tiny-gpt2"
 tok = AutoTokenizer.from_pretrained(model_name)
-if tok.pad_token is None:
-    tok.pad_token = tok.eos_token
+if tok.pad_token is None: tok.pad_token = tok.eos_token
 
-data = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
-ds = Dataset.from_dict({"text": data})
-def enc(e): return tok(e["text"], truncation=True, max_length=128)
-ds = ds.map(enc)
+texts = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
+ds = Dataset.from_dict({"text": texts}).map(lambda e: tok(e["text"], truncation=True, max_length=128))
 collator = DataCollatorForLanguageModeling(tok, mlm=False)
 
 model = AutoModelForCausalLM.from_pretrained(model_name)
@@ -119,13 +120,11 @@ args = TrainingArguments(
     fp16=True,
     deepspeed="ds_config_stage2_single.json"
 )
-trainer = Trainer(model=model, args=args, tokenizer=tok,
-                  train_dataset=ds, data_collator=collator)
-trainer.train()
+Trainer(model=model, args=args, tokenizer=tok, train_dataset=ds, data_collator=collator).train()
 print("OK: single-GPU training finished.")
 PY
 
-echo "==> Run minimal single-GPU training with DeepSpeed Stage-2"
+echo "==> Run single-GPU test (CUDA_VISIBLE_DEVICES=0)"
 CUDA_VISIBLE_DEVICES=0 python train_single_gpu_min.py
 
-echo "==> ALL DONE (single-GPU)."
+echo "=== DONE: DeepSpeed built with CUDA 11.8 and single-GPU test passed ==="