#!/usr/bin/env bash
#
# Builds DeepSpeed v0.14.3 from source against a CUDA 11.8 toolchain installed
# into the currently active conda environment, then runs a minimal single-GPU
# Hugging Face Trainer job as a smoke test.
#
# Requirements:
#   - An activated conda env (CONDA_PREFIX set) that already has PyTorch.
#   - `mamba`, `pip`, and network access.
#
# Side effects:
#   - Installs packages into the active env.
#   - Clones DeepSpeed into ${HOME}/train/new/DeepSpeed (if not present).
#   - Writes ds_config_stage2_single.json and train_single_gpu_min.py into
#     ${HOME}/train/new and runs the training script there.
set -euo pipefail

# Fail early with a clear message instead of aborting under `set -u` only after
# the (slow) toolkit install has already run.
: "${CONDA_PREFIX:?CONDA_PREFIX is not set; activate the target conda env first}"

readonly WORK_DIR="${HOME}/train/new"

echo "==> Python/Torch baseline"
python - <<'PY'
import sys, torch
print("Python:", sys.version.split()[0])
print("Torch:", torch.__version__)
print("Torch CUDA tag:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
PY

# Try the official NVIDIA 11.8 channel first; if that fails, fall back to
# installing the individual components from conda-forge.
echo "==> Installing CUDA 11.8 toolchain into current env..."
if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then
  echo "Installed cuda-toolkit (nvidia channel)."
else
  echo "Fallback to conda-forge components..."
  # NOTE(review): availability of every one of these 11.8 pins on conda-forge
  # has not been verified here -- confirm if this branch is ever taken.
  mamba install -y -c conda-forge \
    cuda-nvcc=11.8 cuda-version=11.8 \
    cuda-cudart-dev=11.8 libnvjitlink=11.8 \
    libcublas-dev=11.8 libcufft-dev=11.8 \
    libcurand-dev=11.8 libcusolver-dev=11.8 libcusparse-dev=11.8 \
    cuda-profiler-api=11.8
fi

# Force this session to use the env's 11.8 nvcc and libraries.
export CUDA_HOME="$CONDA_PREFIX"
export PATH="$CUDA_HOME/bin:$PATH"
# Only append the previous value (with its separating colon) when it is
# non-empty: a trailing ':' would add an empty entry, which the dynamic loader
# treats as the current working directory.
export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

echo "==> nvcc should now be 11.8:"
nvcc --version

# Target architecture: RTX 3090 = sm_86.
export TORCH_CUDA_ARCH_LIST="8.6"

# Build dependencies (installed via mamba for speed).
echo "==> Build deps"
mamba install -y -c conda-forge cmake ninja pybind11 libaio git
pip install -U pip setuptools wheel

# Fetch the DeepSpeed sources, pinned to a known tag.
echo "==> Clone DeepSpeed (if not exists)"
mkdir -p -- "$WORK_DIR"
cd "$WORK_DIR"
# An existing checkout is reused as-is; it is not re-pinned to the tag.
[[ -d DeepSpeed ]] || git clone --branch v0.14.3 https://github.com/microsoft/DeepSpeed.git
cd DeepSpeed

# Remove any previous install; a non-zero status here is deliberately ignored.
pip uninstall -y deepspeed || true

# Pre-build only the training-related kernels. DS_BUILD_OPS is the default for
# every op, so it must be 0 here: setting it to 1 would pre-build ALL ops
# (including the transformer/inference kernels this script wants to skip) and
# make the per-op flags below redundant.
export DS_BUILD_OPS=0
export DS_BUILD_AIO=1
export DS_BUILD_FUSED_ADAM=1
export DS_BUILD_CPU_ADAM=1
# Inference/transformer kernels stay off for now to reduce compatibility risk.
# export DS_BUILD_TRANSFORMER=1

echo "==> Build & install DeepSpeed"
pip install .

echo "==> Verify DeepSpeed"
python -m deepspeed.env_report
python - <<'PY'
import deepspeed, torch
from deepspeed.ops.adam import FusedAdam
print("DeepSpeed:", deepspeed.__version__)
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
print("GPU0:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
print("FusedAdam OK")
PY

# Minimal single-node, single-GPU training check.
echo "==> Prepare single-GPU minimal HF training"
cd "$WORK_DIR"

cat > ds_config_stage2_single.json <<'JSON'
{
  "train_batch_size": 8,
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 8,
  "zero_optimization": {
    "stage": 2,
    "overlap_comm": true,
    "contiguous_gradients": true
  },
  "fp16": { "enabled": true },
  "aio": {
    "block_size": 1048576,
    "queue_depth": 16,
    "thread_count": 1,
    "single_submit": false,
    "overlap_events": true,
    "verbose": false
  },
  "gradient_clipping": 1.0,
  "steps_per_print": 1000,
  "wall_clock_breakdown": false
}
JSON

cat > train_single_gpu_min.py <<'PY'
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch

model_name = "sshleifer/tiny-gpt2"
tok = AutoTokenizer.from_pretrained(model_name)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

texts = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
ds = Dataset.from_dict({"text": texts}).map(lambda e: tok(e["text"], truncation=True, max_length=128))
collator = DataCollatorForLanguageModeling(tok, mlm=False)

model = AutoModelForCausalLM.from_pretrained(model_name)

args = TrainingArguments(
    output_dir="out-ds-single",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=5e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=0,
    fp16=True,
    deepspeed="ds_config_stage2_single.json"
)

Trainer(model=model, args=args, tokenizer=tok, train_dataset=ds, data_collator=collator).train()
print("OK: single-GPU training finished.")
PY

echo "==> Run single-GPU test (CUDA_VISIBLE_DEVICES=0)"
# The script is started with plain `python`, not the deepspeed/torchrun
# launcher, so the rendezvous variables a launcher would normally export are
# supplied here for a single local process. Without them DeepSpeed attempts MPI
# discovery, which requires mpi4py. MASTER_ADDR/MASTER_PORT honor values
# already present in the environment.
MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" \
MASTER_PORT="${MASTER_PORT:-29500}" \
RANK=0 LOCAL_RANK=0 WORLD_SIZE=1 \
CUDA_VISIBLE_DEVICES=0 \
  python train_single_gpu_min.py

echo "=== DONE: DeepSpeed built with CUDA 11.8 and single-GPU test passed ==="