#!/usr/bin/env bash
#
# Builds DeepSpeed from source against a CUDA 11.8 toolchain installed into
# the currently active conda/mamba environment, then smoke-tests the result
# with a tiny single-GPU Hugging Face Trainer run.
#
# Tools invoked by this script: python (with torch importable), mamba, pip, git.
# Environment read by this script: CONDA_PREFIX, HOME.
set -euo pipefail

# Print the interpreter / PyTorch baseline before anything gets modified, so
# the CUDA tag PyTorch was built with is visible in the log.
echo "==> Python/Torch baseline"
python - <<'PY'
import sys, torch
print("Python:", sys.version.split()[0])
print("Torch:", torch.__version__)
print("Torch CUDA tag:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
PY

# Install the CUDA 11.8 toolchain into the active environment.
#
# CONDA_PREFIX is asserted up front: the script later points CUDA_HOME at it,
# so failing here avoids installing packages and only then aborting on an
# unset variable.
: "${CONDA_PREFIX:?activate the target conda/mamba environment first}"

echo "==> Installing CUDA 11.8 toolchain into current env..."
if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then
  echo "Installed cuda-toolkit (nvidia channel)."
else
  # The nvidia channel install failed; try assembling the toolchain from
  # individual conda-forge components instead.
  # NOTE(review): confirm that every package below is actually published on
  # conda-forge with an 11.8 build before relying on this fallback.
  echo "Fallback to conda-forge components..."
  mamba install -y -c conda-forge \
    cuda-nvcc=11.8 cuda-version=11.8 \
    cuda-cudart-dev=11.8 libnvjitlink=11.8 \
    libcublas-dev=11.8 libcufft-dev=11.8 \
    libcurand-dev=11.8 libcusolver-dev=11.8 libcusparse-dev=11.8 \
    cuda-profiler-api=11.8
fi

# Force every build step to use the CUDA 11.8 toolchain that lives inside the
# active environment rather than any system-wide CUDA installation.
: "${CONDA_PREFIX:?activate the target conda/mamba environment first}"
export CUDA_HOME="$CONDA_PREFIX"
export CUDA_PATH="$CUDA_HOME"
export TORCH_CUDA_HOME="$CUDA_HOME"
export CUDACXX="$CUDA_HOME/bin/nvcc"
export PATH="$CUDA_HOME/bin:$PATH"
# Append the previous value only when it is non-empty: a trailing ':' would
# add an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
hash -r # drop the shell's cached command locations so the new PATH wins

echo "==> nvcc should now be 11.8:"
command -v nvcc || {
  echo "ERROR: nvcc not found on PATH after toolchain install." >&2
  exit 1
}
nvcc_report="$(nvcc --version)"
printf '%s\n' "$nvcc_report"
# Stop here instead of discovering a toolchain mismatch halfway through the
# DeepSpeed build.
if [[ "$nvcc_report" != *"release 11.8"* ]]; then
  echo "ERROR: nvcc on PATH is not CUDA 11.8; aborting before the build." >&2
  exit 1
fi

# Target architecture: RTX 3090 = sm_86.
export TORCH_CUDA_ARCH_LIST="8.6"

# Build-time dependencies for compiling DeepSpeed's native extensions.
echo "==> Build deps"
mamba install -y -c conda-forge cmake ninja pybind11 libaio git
# Go through `python -m pip` so the packages land in the same interpreter the
# rest of this script runs, even if a different `pip` is first on PATH.
python -m pip install -U pip setuptools wheel

# Fetch the DeepSpeed sources (pinned tag) into the working directory.
echo "==> Clone DeepSpeed (if not exists)"
ds_tag="v0.14.3"
work_dir="${HOME}/train/new"
# Create the working directory on first run; `cd` would otherwise abort the
# script after the toolchain has already been installed.
mkdir -p -- "$work_dir"
cd "$work_dir"
if [[ -d DeepSpeed ]]; then
  # An existing checkout is reused as-is; surface a mismatch instead of
  # silently building a different version than the one pinned above.
  checked_out="$(git -C DeepSpeed describe --tags --always 2>/dev/null || true)"
  if [[ "$checked_out" != "$ds_tag" ]]; then
    echo "WARNING: reusing existing DeepSpeed checkout at '${checked_out:-unknown}' (expected ${ds_tag})." >&2
  fi
else
  git clone --branch "$ds_tag" https://github.com/microsoft/DeepSpeed.git
fi
cd DeepSpeed

# Remove any previous installation and stale build products.
# `|| true` is intentional: a failure to uninstall must not stop the rebuild.
python -m pip uninstall -y deepspeed || true
rm -rf -- build

# Pre-build only the training-related kernels.
# DS_BUILD_OPS is the default for every per-op DS_BUILD_* switch, so it must
# be 0 here: with DS_BUILD_OPS=1 every compatible op (including the
# transformer/inference kernels this script wants to leave out) is compiled.
export DS_BUILD_OPS=0
export DS_BUILD_AIO=1
export DS_BUILD_FUSED_ADAM=1
export DS_BUILD_CPU_ADAM=1
# Inference/transformer kernels stay disabled for now.
# export DS_BUILD_TRANSFORMER=1

# libaio was installed into the conda environment, which a system compiler
# does not search by default; expose its headers and libraries so the async
# I/O op can be detected and linked.
# NOTE(review): unnecessary if libaio-dev is also installed system-wide.
if [[ -n "${CONDA_PREFIX:-}" ]]; then
  export CFLAGS="-I${CONDA_PREFIX}/include${CFLAGS:+ ${CFLAGS}}"
  export LDFLAGS="-L${CONDA_PREFIX}/lib${LDFLAGS:+ ${LDFLAGS}}"
fi

echo "==> Build & install DeepSpeed"
python -m pip install .

echo "==> Verify DeepSpeed"
# Leave the source tree first: with the repository root as the working
# directory, `import deepspeed` would resolve to the checkout's ./deepspeed
# package instead of the copy that was just installed.
cd "${HOME}/train/new"
python -m deepspeed.env_report
python - <<'PY'
import deepspeed, torch
from deepspeed.git_version_info import installed_ops
from deepspeed.ops.adam import FusedAdam

print("DeepSpeed:", deepspeed.__version__)
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
print("GPU0:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)

# Importing FusedAdam only loads the Python wrapper; consult the record the
# build wrote to confirm the requested ops were really pre-compiled.
missing = [op for op in ("fused_adam", "cpu_adam", "async_io") if not installed_ops.get(op, False)]
if missing:
    raise SystemExit("Pre-built DeepSpeed ops missing: " + ", ".join(missing))
print("FusedAdam OK")
PY

# Minimal single-node, single-GPU training check.
# Writes a ZeRO stage-2 / fp16 DeepSpeed config next to the training script.
# The batch settings (micro batch 1 x 8 accumulation steps = 8) are the same
# values the generated training script passes to TrainingArguments.
echo "==> Prepare single-GPU minimal HF training"
cd "${HOME}/train/new"
cat > ds_config_stage2_single.json <<'JSON'
{
  "train_batch_size": 8,
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 8,
  "zero_optimization": { "stage": 2, "overlap_comm": true, "contiguous_gradients": true },
  "fp16": { "enabled": true },
  "aio": {
    "block_size": 1048576,
    "queue_depth": 16,
    "thread_count": 1,
    "single_submit": false,
    "overlap_events": true,
    "verbose": false
  },
  "gradient_clipping": 1.0,
  "steps_per_print": 1000,
  "wall_clock_breakdown": false
}
JSON

# Generate the smoke-test training script: fine-tunes a tiny GPT-2 checkpoint
# on a handful of repeated sentences through the HF Trainer, using the
# DeepSpeed config file written by this script.
cat > train_single_gpu_min.py <<'PY'
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch

model_name = "sshleifer/tiny-gpt2"
tok = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token when the tokenizer defines none.
if tok.pad_token is None: tok.pad_token = tok.eos_token

texts = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
ds = Dataset.from_dict({"text": texts}).map(lambda e: tok(e["text"], truncation=True, max_length=128))
collator = DataCollatorForLanguageModeling(tok, mlm=False)

model = AutoModelForCausalLM.from_pretrained(model_name)
args = TrainingArguments(
    output_dir="out-ds-single",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=5e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=0,
    fp16=True,
    deepspeed="ds_config_stage2_single.json"
)
Trainer(model=model, args=args, tokenizer=tok, train_dataset=ds, data_collator=collator).train()
print("OK: single-GPU training finished.")
PY

echo "==> Run single-GPU test (CUDA_VISIBLE_DEVICES=0)"
# The script is started with plain `python`, not a distributed launcher, so
# nothing provides the rendezvous variables; supply a one-process world
# explicitly. MASTER_ADDR / MASTER_PORT keep any value already exported.
MASTER_ADDR="${MASTER_ADDR:-localhost}" \
MASTER_PORT="${MASTER_PORT:-29500}" \
RANK=0 LOCAL_RANK=0 WORLD_SIZE=1 \
CUDA_VISIBLE_DEVICES=0 \
  python train_single_gpu_min.py

echo "=== DONE: DeepSpeed built with CUDA 11.8 and single-GPU test passed ==="