#!/usr/bin/env bash
#
# install_deepspeed_src_mamba_single_gpu.sh
#
# Builds DeepSpeed from source inside the currently active Python environment
# and then runs a tiny single-GPU Hugging Face Trainer job with ZeRO Stage-2
# as an end-to-end check.
#
# Steps:
#   1. Print the Python / PyTorch / CUDA versions of the active environment.
#   2. Pick a CUDA toolkit (/usr/local/cuda-11.8 first, then cuda-12.8).
#   3. Install build dependencies with mamba and upgrade the pip tooling.
#   4. Clone DeepSpeed at ${DS_TAG} (unless ./DeepSpeed already exists) and
#      build it with only the AIO, FusedAdam and CPUAdam ops pre-compiled.
#   5. Run the DeepSpeed environment report and import FusedAdam.
#   6. Write ds_config_stage2_single.json and train_single_gpu_min.py into the
#      current directory and run the training script on one GPU.
#
# Environment overrides:
#   DS_TAG   DeepSpeed git tag/branch to clone (default: v0.14.3).
#
# Prerequisites not installed by this script: a `python` with torch, plus the
# `datasets` and `transformers` packages imported by the training test.

set -euo pipefail

# Print a progress banner on stdout.
step() { printf '==> %s\n' "$*"; }

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

step "Show Python / Torch / CUDA"
python - <<'PY'
import sys, torch
print("Python:", sys.version.split()[0])
print("Torch:", torch.__version__)
print("Torch CUDA tag:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
PY

# Choose the CUDA toolchain: prefer /usr/local/cuda-11.8, otherwise fall back
# to an existing /usr/local/cuda-12.8.
# NOTE(review): if the toolkit's major version differs from the "Torch CUDA
# tag" printed above, the extension build may reject the combination - confirm
# on the target machine.
if [[ -d /usr/local/cuda-11.8 ]]; then
  export CUDA_HOME=/usr/local/cuda-11.8
  step "Using CUDA_HOME=${CUDA_HOME} (preferred for cu118)"
elif [[ -d /usr/local/cuda-12.8 ]]; then
  export CUDA_HOME=/usr/local/cuda-12.8
  step "Using CUDA_HOME=${CUDA_HOME} (nvcc 12.8)"
else
  die "!! 未找到 /usr/local/cuda-11.8 或 /usr/local/cuda-12.8,请安装 CUDA toolkit (dev)。"
fi
export PATH="${CUDA_HOME}/bin:${PATH}"

step "nvcc version"
nvcc --version || die "nvcc not found via CUDA_HOME=${CUDA_HOME}"

# RTX 3090 = sm_86; compile kernels for that architecture only.
export TORCH_CUDA_ARCH_LIST="8.6"

step "Install build deps via mamba"
mamba install -y -c conda-forge cmake ninja pybind11 libaio git

step "Upgrade Python build tools"
# `python -m pip` guarantees pip targets the same interpreter as the `python`
# invocations elsewhere in this script.
python -m pip install -U pip setuptools wheel

# Pinned to a reasonably stable DeepSpeed tag; run with DS_TAG=master to build
# the latest sources instead.
readonly DS_TAG="${DS_TAG:-v0.14.3}"
readonly DS_SRC_DIR="DeepSpeed"
if [[ ! -d "${DS_SRC_DIR}" ]]; then
  git clone --branch "${DS_TAG}" \
    https://github.com/microsoft/DeepSpeed.git "${DS_SRC_DIR}"
fi
# An existing directory is reused without cloning, so make sure it really is a
# DeepSpeed source tree before trying to build it.
[[ -f "${DS_SRC_DIR}/setup.py" ]] \
  || die "${DS_SRC_DIR}/ exists but has no setup.py; remove it or run this script from another directory."

step "Build & Install DeepSpeed (training kernels only)"
# DS_BUILD_OPS is the default for every DS_BUILD_<OP> switch. It must be 0 so
# that only the ops explicitly enabled below are pre-compiled; with 1, every
# op (including the inference/transformer kernels) would be built.
export DS_BUILD_OPS=0
export DS_BUILD_AIO=1
export DS_BUILD_FUSED_ADAM=1
export DS_BUILD_CPU_ADAM=1
# Inference/transformer kernels stay disabled to avoid unnecessary compile
# time and compatibility risk.
# export DS_BUILD_TRANSFORMER=1

# Some environments need CUDA detection to be forced.
# NOTE(review): confirm this variable is honoured by the pinned DeepSpeed
# version before relying on it.
# export DS_FORCE_CUDA=1

# Build inside a subshell so the working directory of the rest of the script
# is left untouched.
(cd "${DS_SRC_DIR}" && python -m pip install .)

step "Verify DeepSpeed env"
python -m deepspeed.env_report

step "Smoke test: import FusedAdam"
python - <<'PY'
import deepspeed, torch
from deepspeed.ops.adam import FusedAdam
print("DeepSpeed:", deepspeed.__version__)
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
print("FusedAdam OK")
PY

step "Create minimal HF Trainer single-GPU test (tiny model)"
# train_batch_size (8) = micro batch (1) x gradient accumulation (8) x 1 GPU;
# these values mirror the TrainingArguments in the Python script below.
cat > ds_config_stage2_single.json <<'JSON'
{
  "train_batch_size": 8,
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 8,
  "zero_optimization": { "stage": 2, "overlap_comm": true, "contiguous_gradients": true },
  "fp16": { "enabled": true },
  "aio": {
    "block_size": 1048576,
    "queue_depth": 16,
    "thread_count": 1,
    "single_submit": false,
    "overlap_events": true,
    "verbose": false
  },
  "gradient_clipping": 1.0,
  "steps_per_print": 1000,
  "wall_clock_breakdown": false
}
JSON

cat > train_single_gpu_min.py <<'PY'
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch

model_name = "sshleifer/tiny-gpt2" # 极小模型,快速验证
tok = AutoTokenizer.from_pretrained(model_name)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

data = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
ds = Dataset.from_dict({"text": data})
def enc(e): return tok(e["text"], truncation=True, max_length=128)
ds = ds.map(enc)
collator = DataCollatorForLanguageModeling(tok, mlm=False)

model = AutoModelForCausalLM.from_pretrained(model_name)
args = TrainingArguments(
    output_dir="out-ds-single",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=5e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=0,
    fp16=True,
    deepspeed="ds_config_stage2_single.json"
)
trainer = Trainer(model=model, args=args, tokenizer=tok,
                  train_dataset=ds, data_collator=collator)
trainer.train()
print("OK: single-GPU training finished.")
PY

step "Run minimal single-GPU training with DeepSpeed Stage-2"
# DeepSpeed needs a distributed environment (RANK, WORLD_SIZE, MASTER_ADDR,
# ...) even for a single process. The `deepspeed` launcher sets it up, whereas
# a plain `python` invocation does not.
deepspeed --num_gpus=1 train_single_gpu_min.py

step "ALL DONE (single-GPU)."