#!/usr/bin/env bash
set -euo pipefail
echo "==> Show Python / Torch / CUDA"
python - <<'PY'
import sys, torch
print("Python:", sys.version.split()[0])
print("Torch:", torch.__version__)
print("Torch CUDA tag:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
PY
# Select the CUDA toolchain: prefer /usr/local/cuda-11.8, otherwise fall back to the existing /usr/local/cuda-12.8
if [ -d /usr/local/cuda-11.8 ]; then
    export CUDA_HOME=/usr/local/cuda-11.8
    echo "==> Using CUDA_HOME=${CUDA_HOME} (preferred for cu118)"
elif [ -d /usr/local/cuda-12.8 ]; then
    export CUDA_HOME=/usr/local/cuda-12.8
    echo "==> Using CUDA_HOME=${CUDA_HOME} (nvcc 12.8)"
else
    echo "!! Neither /usr/local/cuda-11.8 nor /usr/local/cuda-12.8 found; please install the CUDA toolkit (dev)."
    exit 1
fi
export PATH="${CUDA_HOME}/bin:${PATH}"
echo "==> nvcc version"
nvcc --version || { echo "nvcc not found via CUDA_HOME=${CUDA_HOME}"; exit 1; }
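# Optional sanity check (a sketch; uncomment to use): warn if the nvcc release does not match
# the CUDA tag Torch was built against, since a mismatch is a common cause of DeepSpeed op
# build failures. Assumes GNU grep with -P support.
# NVCC_VER=$(nvcc --version | grep -oP 'release \K[0-9]+\.[0-9]+')
# TORCH_CUDA=$(python -c "import torch; print(torch.version.cuda)")
# if [ "${NVCC_VER}" != "${TORCH_CUDA}" ]; then
#     echo "!! nvcc ${NVCC_VER} vs torch CUDA tag ${TORCH_CUDA}: versions differ, ops may fail to build."
# fi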
# RTX 3090 = sm_86 (compute capability 8.6)
export TORCH_CUDA_ARCH_LIST="8.6"
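# If unsure of the GPU architecture, it can be read from torch instead of hard-coded,
# e.g. (a sketch; requires a visible CUDA device):
#   python -c "import torch; print('%d.%d' % torch.cuda.get_device_capability())"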
echo "==> Install build deps via mamba"
mamba install -y -c conda-forge cmake ninja pybind11 libaio git
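# If the later AIO build cannot find libaio, point the compiler at the active conda env
# (assumption: the conda-forge libaio package ships headers/libs under ${CONDA_PREFIX}):
# export CFLAGS="-I${CONDA_PREFIX}/include ${CFLAGS:-}"
# export LDFLAGS="-L${CONDA_PREFIX}/lib ${LDFLAGS:-}"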
echo "==> Upgrade Python build tools"
pip install -U pip setuptools wheel
# Pin to a relatively stable DeepSpeed tag; if you need the latest code, switch to --branch master
DS_TAG="v0.14.3"
if [ ! -d DeepSpeed ]; then
    git clone --branch "${DS_TAG}" https://github.com/microsoft/DeepSpeed.git
fi
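# If the checkout already exists from a previous run, make sure it sits on the pinned tag
# (a sketch; assumes no local changes worth keeping):
# git -C DeepSpeed fetch --tags && git -C DeepSpeed checkout "${DS_TAG}"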
cd DeepSpeed
echo "==> Build & Install DeepSpeed (training kernels only)"
# Prebuild only the training ops we need; DS_BUILD_OPS=1 would build every op
# (including inference/transformer kernels), adding compile time and compatibility risk.
export DS_BUILD_OPS=0
export DS_BUILD_AIO=1
export DS_BUILD_FUSED_ADAM=1
export DS_BUILD_CPU_ADAM=1
# Inference/transformer kernels are intentionally left disabled to avoid unnecessary compile/compatibility risk
# export DS_BUILD_TRANSFORMER=1
# Some environments need CUDA detection to be forced during the build:
# export DS_FORCE_CUDA=1
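# Alternative (no prebuilt ops): install the matching release from PyPI and let DeepSpeed
# JIT-compile each op on first use; a sketch, assuming the tag has a corresponding wheel on PyPI:
# pip install "deepspeed==${DS_TAG#v}"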
pip install .
echo "==> Verify DeepSpeed env"
python -m deepspeed.env_report
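# Optional deeper check (a sketch; the op_builder API can shift between DeepSpeed versions,
# so uncomment only if it matches your release): confirm the training ops report as compatible.
# python - <<'PY'
# from deepspeed.ops.op_builder import AsyncIOBuilder, CPUAdamBuilder, FusedAdamBuilder
# for builder in (FusedAdamBuilder(), CPUAdamBuilder(), AsyncIOBuilder()):
#     print(builder.NAME, "compatible:", builder.is_compatible())
# PY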
echo "==> Smoke test: import FusedAdam"
python - <<'PY'
import deepspeed, torch
from deepspeed.ops.adam import FusedAdam
print("DeepSpeed:", deepspeed.__version__)
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
print("FusedAdam OK")
PY
echo "==> Create minimal HF Trainer single-GPU test (tiny model)"
cd ..
cat > ds_config_stage2_single.json <<'JSON'
{
  "train_batch_size": 8,
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 8,
  "zero_optimization": { "stage": 2, "overlap_comm": true, "contiguous_gradients": true },
  "fp16": { "enabled": true },
  "aio": {
    "block_size": 1048576,
    "queue_depth": 16,
    "thread_count": 1,
    "single_submit": false,
    "overlap_events": true,
    "verbose": false
  },
  "gradient_clipping": 1.0,
  "steps_per_print": 1000,
  "wall_clock_breakdown": false
}
JSON
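# Note: DeepSpeed checks that train_batch_size equals
# train_micro_batch_size_per_gpu * gradient_accumulation_steps * world_size,
# i.e. 1 * 8 * 1 = 8 for this single-GPU run.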
cat > train_single_gpu_min.py <<'PY'
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
model_name = "sshleifer/tiny-gpt2"  # tiny model, for a fast sanity check
tok = AutoTokenizer.from_pretrained(model_name)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
data = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
ds = Dataset.from_dict({"text": data})
def enc(e): return tok(e["text"], truncation=True, max_length=128)
ds = ds.map(enc)
collator = DataCollatorForLanguageModeling(tok, mlm=False)
model = AutoModelForCausalLM.from_pretrained(model_name)
args = TrainingArguments(
    output_dir="out-ds-single",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=5e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="no",  # no checkpoints needed for a smoke test
    fp16=True,
    deepspeed="ds_config_stage2_single.json"
)
trainer = Trainer(model=model, args=args, tokenizer=tok,
                  train_dataset=ds, data_collator=collator)
trainer.train()
print("OK: single-GPU training finished.")
PY
echo "==> Run minimal single-GPU training with DeepSpeed Stage-2"
CUDA_VISIBLE_DEVICES=0 python train_single_gpu_min.py
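# For reference, a later multi-GPU run would go through the DeepSpeed launcher instead
# (a sketch: train_batch_size in the JSON must then equal micro_batch * grad_accum * num_gpus):
# deepspeed --num_gpus=2 train_single_gpu_min.py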
echo "==> ALL DONE (single-GPU)."