This commit is contained in:
parent
3e237e9c82
commit
0c8c8d48af
|
|
@ -0,0 +1,131 @@
|
|||
#!/usr/bin/env bash
# Strict mode: stop on a failing command, an unset variable, or a failing
# pipeline stage.
set -o errexit -o nounset -o pipefail

# Print the interpreter and framework versions that the rest of the script
# will build against.
printf '%s\n' "==> Show Python / Torch / CUDA"
python - <<'PY'
import sys
import torch

# One line per component; torch.version.cuda is the CUDA version PyTorch was
# built with, which may differ from the local toolkit.
print(f"Python: {sys.version.split()[0]}")
print(f"Torch: {torch.__version__}")
print(f"Torch CUDA tag: {torch.version.cuda}")
print(f"CUDA available: {torch.cuda.is_available()}")
PY
|
||||
|
||||
# Choose the CUDA toolchain: /usr/local/cuda-11.8 is preferred, and
# /usr/local/cuda-12.8 is the fallback. The first directory that exists wins.
cuda_root=""
for candidate in /usr/local/cuda-11.8 /usr/local/cuda-12.8; do
  if [[ -d "${candidate}" ]]; then
    cuda_root="${candidate}"
    break
  fi
done

case "${cuda_root}" in
  /usr/local/cuda-11.8)
    export CUDA_HOME="${cuda_root}"
    echo "==> Using CUDA_HOME=${CUDA_HOME} (preferred for cu118)"
    ;;
  /usr/local/cuda-12.8)
    export CUDA_HOME="${cuda_root}"
    echo "==> Using CUDA_HOME=${CUDA_HOME} (nvcc 12.8)"
    ;;
  *)
    # Neither toolkit directory exists: nothing to build with.
    echo "!! 未找到 /usr/local/cuda-11.8 或 /usr/local/cuda-12.8,请安装 CUDA toolkit (dev)。"
    exit 1
    ;;
esac

# Put the selected toolkit's bin directory first so its nvcc is the one found.
PATH="${CUDA_HOME}/bin:${PATH}"
export PATH

echo "==> nvcc version"
if ! nvcc --version; then
  echo "nvcc not found via CUDA_HOME=${CUDA_HOME}"
  exit 1
fi
|
||||
|
||||
# Build only for compute capability 8.6 (RTX 3090 = sm_86).
TORCH_CUDA_ARCH_LIST="8.6"
export TORCH_CUDA_ARCH_LIST

# Native build prerequisites come from conda-forge.
printf '%s\n' "==> Install build deps via mamba"
conda_build_deps=(cmake ninja pybind11 libaio git)
mamba install -y -c conda-forge "${conda_build_deps[@]}"

# Bring the Python packaging tools up to date before building from source.
printf '%s\n' "==> Upgrade Python build tools"
pip install --upgrade pip setuptools wheel
|
||||
|
||||
# Pin DeepSpeed to a known-stable tag. Export DS_TAG before running the script
# (for example DS_TAG=master) to build a different revision; the default is
# unchanged.
DS_TAG="${DS_TAG:-v0.14.3}"
DS_REPO_URL="https://github.com/microsoft/DeepSpeed.git"

if [[ ! -d DeepSpeed ]]; then
  git clone --branch "${DS_TAG}" "${DS_REPO_URL}"
else
  # A directory left over from an earlier run may sit on any revision.
  # Switch it to the pinned ref so the build below really uses DS_TAG, and stop
  # with a clear message if that is not possible instead of silently building
  # something else.
  echo "==> Reusing existing ./DeepSpeed; checking out ${DS_TAG}"
  if ! git -C DeepSpeed checkout "${DS_TAG}"; then
    echo "!! Cannot check out ${DS_TAG} in ./DeepSpeed; remove or repair that directory and re-run." >&2
    exit 1
  fi
fi
cd DeepSpeed
|
||||
|
||||
echo "==> Build & Install DeepSpeed (training kernels only)"
# DS_BUILD_OPS is the default applied to every op that has no DS_BUILD_<OP>
# setting of its own. Setting it to 1 would pre-compile all compatible ops,
# including the inference/transformer kernels this script wants to avoid.
# Keep the default off and opt in to the training ops individually.
export DS_BUILD_OPS=0
export DS_BUILD_AIO=1
export DS_BUILD_FUSED_ADAM=1
export DS_BUILD_CPU_ADAM=1
# Inference/transformer kernels are deliberately left disabled to avoid
# unnecessary compilation and compatibility risk.
# export DS_BUILD_TRANSFORMER=1

# Some environments need CUDA detection to be forced; uncomment if so.
# export DS_FORCE_CUDA=1

pip install .
|
||||
|
||||
# At this point the working directory is the DeepSpeed clone. Python puts the
# working directory at the front of sys.path for both `python -m` and
# `python -`, so running the checks here would import the ./deepspeed source
# tree instead of the package that was just installed. Run them from the
# parent directory inside a subshell; the subshell leaves this script's
# working directory untouched.
(
  cd ..

  echo "==> Verify DeepSpeed env"
  python -m deepspeed.env_report

  echo "==> Smoke test: import FusedAdam"
  python - <<'PY'
import deepspeed, torch
from deepspeed.ops.adam import FusedAdam
print("DeepSpeed:", deepspeed.__version__)
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
print("FusedAdam OK")
PY
)
|
||||
|
||||
printf '%s\n' "==> Create minimal HF Trainer single-GPU test (tiny model)"
# Step out of the DeepSpeed clone; the test files are written next to it.
cd ..

# DeepSpeed configuration for the smoke test: ZeRO stage 2 with fp16.
# train_batch_size (8) = micro batch (1) x gradient accumulation (8) x 1 GPU.
cat <<'JSON' > ds_config_stage2_single.json
{
  "train_batch_size": 8,
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 8,
  "zero_optimization": { "stage": 2, "overlap_comm": true, "contiguous_gradients": true },
  "fp16": { "enabled": true },
  "aio": {
    "block_size": 1048576,
    "queue_depth": 16,
    "thread_count": 1,
    "single_submit": false,
    "overlap_events": true,
    "verbose": false
  },
  "gradient_clipping": 1.0,
  "steps_per_print": 1000,
  "wall_clock_breakdown": false
}
JSON
|
||||
|
||||
# Generate the training script used by the smoke test. The heredoc delimiter is
# quoted, so the shell writes the Python source verbatim.
cat <<'PY' > train_single_gpu_min.py
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
import torch

# Very small model so the check finishes quickly.
MODEL_NAME = "sshleifer/tiny-gpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Fall back to the EOS token for padding when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Four short sentences repeated 200 times: 800 training rows.
samples = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
dataset = Dataset.from_dict({"text": samples})


def tokenize(example):
    """Tokenize one row's text, truncated to at most 128 tokens."""
    return tokenizer(example["text"], truncation=True, max_length=128)


dataset = dataset.map(tokenize)
# mlm=False selects causal-LM collation rather than masked-LM.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Batch size, accumulation and fp16 mirror ds_config_stage2_single.json.
training_args = TrainingArguments(
    output_dir="out-ds-single",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=5e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=0,
    fp16=True,
    deepspeed="ds_config_stage2_single.json",
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset,
    data_collator=collator,
)
trainer.train()
print("OK: single-GPU training finished.")
PY
|
||||
|
||||
echo "==> Run minimal single-GPU training with DeepSpeed Stage-2"
# The script is started with plain `python`, not the `deepspeed` launcher, so
# nothing sets the rendezvous variables that DeepSpeed's distributed
# initialisation reads. Supply them for a one-process world. MASTER_ADDR and
# MASTER_PORT honour values already present in the environment (change the
# port if 29500 is in use). CUDA_VISIBLE_DEVICES=0 still pins the run to the
# first GPU.
CUDA_VISIBLE_DEVICES=0 \
  MASTER_ADDR="${MASTER_ADDR:-localhost}" \
  MASTER_PORT="${MASTER_PORT:-29500}" \
  RANK=0 \
  LOCAL_RANK=0 \
  WORLD_SIZE=1 \
  python train_single_gpu_min.py

echo "==> ALL DONE (single-GPU)."
|
||||
Loading…
Reference in New Issue