train_env_prepare/DeepSpeed/install_deepspeed_src_mamba...

137 lines
4.1 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Strict mode: stop on a failing command, an unset variable, or a failing
# pipeline stage.
set -o errexit -o nounset -o pipefail
printf '%s\n' '==> Python/Torch baseline'
# Print the interpreter and PyTorch versions seen by the active environment.
# The quoted heredoc delimiter keeps the shell from expanding anything inside.
python - <<'PY'
import sys
import torch

py_version = sys.version.split()[0]
print("Python:", py_version)
print("Torch:", torch.__version__)
cuda_tag = torch.version.cuda
print("Torch CUDA tag:", cuda_tag)
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
PY
# Install the CUDA 11.8 toolchain into the active environment.
echo "==> Installing CUDA 11.8 toolchain into current env..."
# Every package under this label belongs to the CUDA 11.8.0 release, so the
# label itself acts as the version pin.
cuda_channel="nvidia/label/cuda-11.8.0"
if mamba install -y -c "$cuda_channel" cuda-toolkit; then
  echo "Installed cuda-toolkit (nvidia channel)."
else
  # The full metapackage could not be installed; fall back to only the
  # compiler and the development libraries needed for the build.
  # No per-package "=11.8" pins: the math libraries carry their own version
  # numbers (they do not follow the toolkit version), and libnvjitlink is not
  # part of CUDA 11.x, so such pins cannot be satisfied.
  echo "Fallback to individual CUDA 11.8 components..."
  mamba install -y -c "$cuda_channel" \
    cuda-nvcc cuda-cudart-dev \
    libcublas-dev libcufft-dev \
    libcurand-dev libcusolver-dev libcusparse-dev \
    cuda-profiler-api
fi
# Force the build to use the CUDA 11.8 toolchain from the active environment.
# Fail early with a clear message when no environment is active (variable
# unset or empty); otherwise the paths below would resolve to "/bin", "/lib".
: "${CONDA_PREFIX:?CONDA_PREFIX is not set; activate the target conda/mamba env first}"
export CUDA_HOME="$CONDA_PREFIX"
export CUDA_PATH="$CUDA_HOME"
export TORCH_CUDA_HOME="$CUDA_HOME"
export CUDACXX="$CUDA_HOME/bin/nvcc"
export PATH="$CUDA_HOME/bin:$PATH"
# Append the previous value only when it is non-empty: a trailing ':' is an
# empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
hash -r # drop cached command locations so the new PATH takes effect
echo "==> nvcc should now be 11.8:"
# `command -v` is a shell builtin; `which` is an external tool that may be missing.
command -v nvcc
nvcc --version
# Target architecture: RTX 3090 = sm_86
export TORCH_CUDA_ARCH_LIST="8.6"
# Build-time dependencies: native tools from conda-forge, then refreshed
# Python packaging tools.
printf '%s\n' '==> Build deps'
conda_build_deps=(cmake ninja pybind11 libaio git)
mamba install -y -c conda-forge "${conda_build_deps[@]}"
pip_build_deps=(pip setuptools wheel)
pip install -U "${pip_build_deps[@]}"
# Fetch the DeepSpeed sources.
echo "==> Clone DeepSpeed (if not exists)"
ds_workdir="${HOME}/train/new"
ds_tag="v0.14.3"
# Create the working directory on first use so a fresh machine does not abort
# here after the toolchain has already been installed.
mkdir -p -- "$ds_workdir"
cd "$ds_workdir"
if [[ ! -d DeepSpeed ]]; then
  git clone --branch "$ds_tag" https://github.com/microsoft/DeepSpeed.git
fi
cd DeepSpeed
# An existing checkout is reused as-is; warn (without failing) when it is not
# sitting exactly on the expected tag, since that is what would get built.
ds_current="$(git describe --tags --exact-match 2>/dev/null || true)"
if [[ "$ds_current" != "$ds_tag" ]]; then
  echo "WARNING: DeepSpeed checkout is at '${ds_current:-unknown}', expected ${ds_tag}" >&2
fi
# Remove any previous install and stale build output.
pip uninstall -y deepspeed || true # best effort: may simply not be installed
rm -rf build
# Pre-compile only the training-related ops.
# DS_BUILD_OPS supplies the default for every per-op DS_BUILD_<OP> switch, so
# it is set to 0 and the wanted ops are enabled one by one. With it at 1 every
# op (including the transformer/inference kernels) would be pre-built.
export DS_BUILD_OPS=0
export DS_BUILD_AIO=1
export DS_BUILD_FUSED_ADAM=1
export DS_BUILD_CPU_ADAM=1
# Inference/transformer kernels stay off for now; uncomment to pre-build them.
# export DS_BUILD_TRANSFORMER=1
echo "==> Build & install DeepSpeed"
pip install .
printf '%s\n' '==> Verify DeepSpeed'
# Environment/op compatibility report shipped with DeepSpeed.
python -m deepspeed.env_report
# Import check: DeepSpeed, torch and the FusedAdam wrapper must be importable.
python - <<'PY'
import deepspeed
import torch
from deepspeed.ops.adam import FusedAdam

print("DeepSpeed:", deepspeed.__version__)
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
gpu0 = None
if torch.cuda.is_available():
    gpu0 = torch.cuda.get_device_name(0)
print("GPU0:", gpu0)
print("FusedAdam OK")
PY
# Minimal single-node, single-GPU training check.
printf '%s\n' '==> Prepare single-GPU minimal HF training'
cd "$HOME/train/new"
# Write the DeepSpeed config (ZeRO stage 2, fp16). The quoted delimiter keeps
# the JSON literal, with no shell expansion.
cat <<'JSON' > ds_config_stage2_single.json
{
"train_batch_size": 8,
"train_micro_batch_size_per_gpu": 1,
"gradient_accumulation_steps": 8,
"zero_optimization": { "stage": 2, "overlap_comm": true, "contiguous_gradients": true },
"fp16": { "enabled": true },
"aio": {
"block_size": 1048576,
"queue_depth": 16,
"thread_count": 1,
"single_submit": false,
"overlap_events": true,
"verbose": false
},
"gradient_clipping": 1.0,
"steps_per_print": 1000,
"wall_clock_breakdown": false
}
JSON
# Generate the smoke-test training script: a tiny GPT-2 fine-tuned for one
# epoch on a few repeated sentences through the HF Trainer with the DeepSpeed
# config. The quoted delimiter writes the Python source verbatim.
cat <<'PY' > train_single_gpu_min.py
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
model_name = "sshleifer/tiny-gpt2"
tok = AutoTokenizer.from_pretrained(model_name)
if tok.pad_token is None: tok.pad_token = tok.eos_token
texts = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
ds = Dataset.from_dict({"text": texts}).map(lambda e: tok(e["text"], truncation=True, max_length=128))
collator = DataCollatorForLanguageModeling(tok, mlm=False)
model = AutoModelForCausalLM.from_pretrained(model_name)
args = TrainingArguments(
output_dir="out-ds-single",
per_device_train_batch_size=1,
gradient_accumulation_steps=8,
learning_rate=5e-4,
num_train_epochs=1,
logging_steps=10,
save_steps=0,
fp16=True,
deepspeed="ds_config_stage2_single.json"
)
Trainer(model=model, args=args, tokenizer=tok, train_dataset=ds, data_collator=collator).train()
print("OK: single-GPU training finished.")
PY
echo "==> Run single-GPU test (CUDA_VISIBLE_DEVICES=0)"
# The script is started with plain `python`, not the `deepspeed` launcher, so
# nothing else sets the torch.distributed rendezvous variables. Supply
# single-process values here; address and port stay overridable from the
# caller's environment.
CUDA_VISIBLE_DEVICES=0 \
MASTER_ADDR="${MASTER_ADDR:-localhost}" \
MASTER_PORT="${MASTER_PORT:-29500}" \
RANK=0 LOCAL_RANK=0 WORLD_SIZE=1 \
python train_single_gpu_min.py
echo "=== DONE: DeepSpeed built with CUDA 11.8 and single-GPU test passed ==="