This commit is contained in:
parent
0c8c8d48af
commit
2e06ad0ccf
|
|
@ -1,7 +1,7 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
echo "==> Show Python / Torch / CUDA"
|
echo "==> Python/Torch baseline"
|
||||||
python - <<'PY'
|
python - <<'PY'
|
||||||
import sys, torch
|
import sys, torch
|
||||||
print("Python:", sys.version.split()[0])
|
print("Python:", sys.version.split()[0])
|
||||||
|
|
@ -10,66 +10,70 @@ print("Torch CUDA tag:", torch.version.cuda)
|
||||||
print("CUDA available:", torch.cuda.is_available())
|
print("CUDA available:", torch.cuda.is_available())
|
||||||
PY
|
PY
|
||||||
|
|
||||||
# 选择 CUDA 工具链:优先 /usr/local/cuda-11.8,其次现有的 /usr/local/cuda-12.8
|
# 尝试用 nvidia 官方 11.8 频道;失败则走 conda-forge 逐组件
|
||||||
if [ -d /usr/local/cuda-11.8 ]; then
|
echo "==> Installing CUDA 11.8 toolchain into current env..."
|
||||||
export CUDA_HOME=/usr/local/cuda-11.8
|
if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then
|
||||||
echo "==> Using CUDA_HOME=${CUDA_HOME} (preferred for cu118)"
|
echo "Installed cuda-toolkit (nvidia channel)."
|
||||||
elif [ -d /usr/local/cuda-12.8 ]; then
|
|
||||||
export CUDA_HOME=/usr/local/cuda-12.8
|
|
||||||
echo "==> Using CUDA_HOME=${CUDA_HOME} (nvcc 12.8)"
|
|
||||||
else
|
else
|
||||||
echo "!! 未找到 /usr/local/cuda-11.8 或 /usr/local/cuda-12.8,请安装 CUDA toolkit (dev)。"
|
echo "Fallback to conda-forge components..."
|
||||||
exit 1
|
mamba install -y -c conda-forge \
|
||||||
|
cuda-nvcc=11.8 cuda-version=11.8 \
|
||||||
|
cuda-cudart-dev=11.8 libnvjitlink=11.8 \
|
||||||
|
libcublas-dev=11.8 libcufft-dev=11.8 \
|
||||||
|
libcurand-dev=11.8 libcusolver-dev=11.8 libcusparse-dev=11.8 \
|
||||||
|
cuda-profiler-api=11.8
|
||||||
fi
|
fi
|
||||||
export PATH="${CUDA_HOME}/bin:${PATH}"
|
|
||||||
|
|
||||||
echo "==> nvcc version"
|
# 强制当前会话使用 env 里的 11.8 nvcc/库
|
||||||
nvcc --version || { echo "nvcc not found via CUDA_HOME=${CUDA_HOME}"; exit 1; }
|
export CUDA_HOME="$CONDA_PREFIX"
|
||||||
|
export PATH="$CUDA_HOME/bin:$PATH"
|
||||||
|
export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
|
||||||
|
|
||||||
# 3090 = sm_86
|
echo "==> nvcc should now be 11.8:"
|
||||||
|
nvcc --version
|
||||||
|
|
||||||
|
# 架构:3090 = sm_86
|
||||||
export TORCH_CUDA_ARCH_LIST="8.6"
|
export TORCH_CUDA_ARCH_LIST="8.6"
|
||||||
|
|
||||||
echo "==> Install build deps via mamba"
|
# 编译依赖(用 mamba 提速)
|
||||||
|
echo "==> Build deps"
|
||||||
mamba install -y -c conda-forge cmake ninja pybind11 libaio git
|
mamba install -y -c conda-forge cmake ninja pybind11 libaio git
|
||||||
|
|
||||||
echo "==> Upgrade Python build tools"
|
|
||||||
pip install -U pip setuptools wheel
|
pip install -U pip setuptools wheel
|
||||||
|
|
||||||
# 固定到较稳的 DeepSpeed tag;需要最新版可改为 --branch master
|
# 获取 DeepSpeed 源码(固定较稳 tag)
|
||||||
DS_TAG="v0.14.3"
|
echo "==> Clone DeepSpeed (if not exists)"
|
||||||
if [ ! -d DeepSpeed ]; then
|
cd "${HOME}/train/new"
|
||||||
git clone --branch ${DS_TAG} https://github.com/microsoft/DeepSpeed.git
|
[ -d DeepSpeed ] || git clone --branch v0.14.3 https://github.com/microsoft/DeepSpeed.git
|
||||||
fi
|
|
||||||
cd DeepSpeed
|
cd DeepSpeed
|
||||||
|
|
||||||
echo "==> Build & Install DeepSpeed (training kernels only)"
|
# 清理旧安装
|
||||||
|
pip uninstall -y deepspeed || true
|
||||||
|
|
||||||
|
# 仅启用训练相关内核
|
||||||
export DS_BUILD_OPS=1
|
export DS_BUILD_OPS=1
|
||||||
export DS_BUILD_AIO=1
|
export DS_BUILD_AIO=1
|
||||||
export DS_BUILD_FUSED_ADAM=1
|
export DS_BUILD_FUSED_ADAM=1
|
||||||
export DS_BUILD_CPU_ADAM=1
|
export DS_BUILD_CPU_ADAM=1
|
||||||
# 不启用推理/transformer内核,降低不必要的编译/兼容风险
|
# 推理/transformer 内核先关,减少兼容风险
|
||||||
# export DS_BUILD_TRANSFORMER=1
|
# export DS_BUILD_TRANSFORMER=1
|
||||||
|
|
||||||
# 某些环境需要强制找 CUDA:export DS_FORCE_CUDA=1
|
echo "==> Build & install DeepSpeed"
|
||||||
# export DS_FORCE_CUDA=1
|
|
||||||
|
|
||||||
pip install .
|
pip install .
|
||||||
|
|
||||||
echo "==> Verify DeepSpeed env"
|
echo "==> Verify DeepSpeed"
|
||||||
python -m deepspeed.env_report
|
python -m deepspeed.env_report
|
||||||
|
|
||||||
echo "==> Smoke test: import FusedAdam"
|
|
||||||
python - <<'PY'
|
python - <<'PY'
|
||||||
import deepspeed, torch
|
import deepspeed, torch
|
||||||
from deepspeed.ops.adam import FusedAdam
|
from deepspeed.ops.adam import FusedAdam
|
||||||
print("DeepSpeed:", deepspeed.__version__)
|
print("DeepSpeed:", deepspeed.__version__)
|
||||||
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
|
print("Torch:", torch.__version__, "CUDA tag:", torch.version.cuda)
|
||||||
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
|
print("GPU0:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
|
||||||
print("FusedAdam OK")
|
print("FusedAdam OK")
|
||||||
PY
|
PY
|
||||||
|
|
||||||
echo "==> Create minimal HF Trainer single-GPU test (tiny model)"
|
# 单机单卡最小训练验证
|
||||||
cd ..
|
echo "==> Prepare single-GPU minimal HF training"
|
||||||
|
cd "${HOME}/train/new"
|
||||||
cat > ds_config_stage2_single.json <<'JSON'
|
cat > ds_config_stage2_single.json <<'JSON'
|
||||||
{
|
{
|
||||||
"train_batch_size": 8,
|
"train_batch_size": 8,
|
||||||
|
|
@ -96,15 +100,12 @@ from datasets import Dataset
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
|
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
model_name = "sshleifer/tiny-gpt2" # 极小模型,快速验证
|
model_name = "sshleifer/tiny-gpt2"
|
||||||
tok = AutoTokenizer.from_pretrained(model_name)
|
tok = AutoTokenizer.from_pretrained(model_name)
|
||||||
if tok.pad_token is None:
|
if tok.pad_token is None: tok.pad_token = tok.eos_token
|
||||||
tok.pad_token = tok.eos_token
|
|
||||||
|
|
||||||
data = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
|
texts = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
|
||||||
ds = Dataset.from_dict({"text": data})
|
ds = Dataset.from_dict({"text": texts}).map(lambda e: tok(e["text"], truncation=True, max_length=128))
|
||||||
def enc(e): return tok(e["text"], truncation=True, max_length=128)
|
|
||||||
ds = ds.map(enc)
|
|
||||||
collator = DataCollatorForLanguageModeling(tok, mlm=False)
|
collator = DataCollatorForLanguageModeling(tok, mlm=False)
|
||||||
|
|
||||||
model = AutoModelForCausalLM.from_pretrained(model_name)
|
model = AutoModelForCausalLM.from_pretrained(model_name)
|
||||||
|
|
@ -119,13 +120,11 @@ args = TrainingArguments(
|
||||||
fp16=True,
|
fp16=True,
|
||||||
deepspeed="ds_config_stage2_single.json"
|
deepspeed="ds_config_stage2_single.json"
|
||||||
)
|
)
|
||||||
trainer = Trainer(model=model, args=args, tokenizer=tok,
|
Trainer(model=model, args=args, tokenizer=tok, train_dataset=ds, data_collator=collator).train()
|
||||||
train_dataset=ds, data_collator=collator)
|
|
||||||
trainer.train()
|
|
||||||
print("OK: single-GPU training finished.")
|
print("OK: single-GPU training finished.")
|
||||||
PY
|
PY
|
||||||
|
|
||||||
echo "==> Run minimal single-GPU training with DeepSpeed Stage-2"
|
echo "==> Run single-GPU test (CUDA_VISIBLE_DEVICES=0)"
|
||||||
CUDA_VISIBLE_DEVICES=0 python train_single_gpu_min.py
|
CUDA_VISIBLE_DEVICES=0 python train_single_gpu_min.py
|
||||||
|
|
||||||
echo "==> ALL DONE (single-GPU)."
|
echo "=== DONE: DeepSpeed built with CUDA 11.8 and single-GPU test passed ==="
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue