#!/usr/bin/env bash
set -euo pipefail

# Report the interpreter and PyTorch build that the rest of the script will
# compile against (versions, the CUDA tag torch was built with, GPU visibility).
echo "==> Python/Torch baseline"
python - <<'PY'
import sys
import torch

py_version = sys.version.split()[0]
print(f"Python: {py_version}")
print(f"Torch: {torch.__version__}")
print(f"Torch CUDA tag: {torch.version.cuda}")
print(f"CUDA available: {torch.cuda.is_available()}")
PY

# Install a CUDA 11.8 toolchain into the active conda env: try NVIDIA's own
# 11.8 channel label first; if that fails, fall back to installing the
# individual components from conda-forge.
#
# Require an active env up front. Without CONDA_PREFIX there is no "current
# env" to install into, and the script would otherwise only stop later (when
# CONDA_PREFIX is dereferenced under `set -u`), after packages were changed.
if [[ -z "${CONDA_PREFIX:-}" ]]; then
  echo "ERROR: no active conda environment (CONDA_PREFIX is unset); activate one first." >&2
  exit 1
fi

echo "==> Installing CUDA 11.8 toolchain into current env..."
if mamba install -y -c "nvidia/label/cuda-11.8.0" cuda-toolkit; then
  echo "Installed cuda-toolkit (nvidia channel)."
else
  echo "Fallback to conda-forge components..."
  # NOTE(review): nvJitLink appears to have first shipped with CUDA 12, so the
  # libnvjitlink=11.8 pin (and possibly the =11.8 pins on the math libraries)
  # may not resolve on conda-forge — confirm this fallback actually solves.
  mamba install -y -c conda-forge \
    cuda-nvcc=11.8 cuda-version=11.8 \
    cuda-cudart-dev=11.8 libnvjitlink=11.8 \
    libcublas-dev=11.8 libcufft-dev=11.8 \
    libcurand-dev=11.8 libcusolver-dev=11.8 libcusparse-dev=11.8 \
    cuda-profiler-api=11.8
fi

# Point the current session at the nvcc and libraries installed in the env.
export CUDA_HOME="$CONDA_PREFIX"
export PATH="$CUDA_HOME/bin:$PATH"
# Append the previous LD_LIBRARY_PATH only when it is non-empty. A trailing ':'
# would leave an empty entry, which the dynamic loader interprets as the
# current working directory.
export LD_LIBRARY_PATH="$CUDA_HOME/lib:$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

echo "==> nvcc should now be 11.8:"
nvcc --version

# Target GPU architecture: RTX 3090 = sm_86.
export TORCH_CUDA_ARCH_LIST="8.6"

# Build prerequisites: native tools come from conda-forge via mamba (faster
# solver), then the Python packaging tools are upgraded with pip.
echo "==> Build deps"
conda_build_pkgs=(cmake ninja pybind11 libaio git)
mamba install --yes --channel conda-forge "${conda_build_pkgs[@]}"
pip_build_pkgs=(pip setuptools wheel)
pip install --upgrade "${pip_build_pkgs[@]}"

# Fetch the DeepSpeed sources, pinned to a fixed release tag.
echo "==> Clone DeepSpeed (if not exists)"
ds_workdir="${HOME}/train/new"
ds_tag="v0.14.3"
# Create the workspace on first use; a bare `cd` aborts the script when the
# directory does not exist yet.
mkdir -p -- "$ds_workdir"
cd "$ds_workdir"
if [[ -d DeepSpeed ]]; then
  # An existing checkout is reused as-is. Warn (without failing) when it is
  # not sitting exactly on the pinned tag, since the build would then use
  # different sources than intended.
  ds_current="$(git -C DeepSpeed describe --tags --always 2>/dev/null || true)"
  if [[ "$ds_current" != "$ds_tag" ]]; then
    echo "WARNING: existing DeepSpeed checkout is at '${ds_current:-unknown}', expected ${ds_tag}." >&2
  fi
else
  git clone --branch "$ds_tag" https://github.com/microsoft/DeepSpeed.git
fi
cd DeepSpeed

# Remove any previously installed DeepSpeed; a failure here is not fatal.
pip uninstall -y deepspeed || true

# Pre-compile only the training-related ops. DS_BUILD_OPS is the default for
# every per-op DS_BUILD_<OP> switch, so it must be 0 here: with 1, all ops
# (including the inference/transformer kernels meant to stay off) would be
# built. The ops wanted are then enabled one by one.
export DS_BUILD_OPS=0
export DS_BUILD_AIO=1
export DS_BUILD_FUSED_ADAM=1
export DS_BUILD_CPU_ADAM=1
# Inference/transformer kernels stay disabled for now to reduce compatibility risk.
# export DS_BUILD_TRANSFORMER=1

# Build and install DeepSpeed from the checkout in the current directory,
# then print its environment report and a short import/version summary.
echo "==> Build & install DeepSpeed"
pip install .

echo "==> Verify DeepSpeed"
python -m deepspeed.env_report
python - <<'PY'
import deepspeed
import torch
from deepspeed.ops.adam import FusedAdam  # import check only

print(f"DeepSpeed: {deepspeed.__version__}")
print(f"Torch: {torch.__version__} CUDA tag: {torch.version.cuda}")
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else None
print(f"GPU0: {gpu_name}")
print("FusedAdam OK")
PY

# Single-node, single-GPU smoke test: write the DeepSpeed config
# (ZeRO stage 2, fp16) that the training script will load.
echo "==> Prepare single-GPU minimal HF training"
cd "${HOME}/train/new"
ds_config_file="ds_config_stage2_single.json"
cat <<'JSON' >"$ds_config_file"
{
  "train_batch_size": 8,
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 8,
  "zero_optimization": { "stage": 2, "overlap_comm": true, "contiguous_gradients": true },
  "fp16": { "enabled": true },
  "aio": {
    "block_size": 1048576,
    "queue_depth": 16,
    "thread_count": 1,
    "single_submit": false,
    "overlap_events": true,
    "verbose": false
  },
  "gradient_clipping": 1.0,
  "steps_per_print": 1000,
  "wall_clock_breakdown": false
}
JSON

# Generate the minimal Hugging Face Trainer script used as the smoke test.
# It trains sshleifer/tiny-gpt2 for one epoch on 800 short strings
# (4 phrases x 200) with fp16, batch size 1 and 8 gradient-accumulation
# steps, loading ds_config_stage2_single.json as its DeepSpeed config.
# The heredoc delimiter is quoted, so the shell expands nothing inside it.
# NOTE(review): the script imports `datasets` and `transformers`, which the
# visible part of this installer never installs — presumably they are already
# present in the env; confirm.
cat > train_single_gpu_min.py <<'PY'
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch

model_name = "sshleifer/tiny-gpt2"
tok = AutoTokenizer.from_pretrained(model_name)
if tok.pad_token is None: tok.pad_token = tok.eos_token

texts = ["hello world", "deepspeed single gpu", "trainer test", "fast check"] * 200
ds = Dataset.from_dict({"text": texts}).map(lambda e: tok(e["text"], truncation=True, max_length=128))
collator = DataCollatorForLanguageModeling(tok, mlm=False)

model = AutoModelForCausalLM.from_pretrained(model_name)
args = TrainingArguments(
    output_dir="out-ds-single",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=5e-4,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=0,
    fp16=True,
    deepspeed="ds_config_stage2_single.json"
)
Trainer(model=model, args=args, tokenizer=tok, train_dataset=ds, data_collator=collator).train()
print("OK: single-GPU training finished.")
PY

# Run the smoke test through the DeepSpeed launcher rather than plain `python`.
# The launcher sets up the distributed environment (rank, world size, master
# address/port) that the Trainer's DeepSpeed integration expects even on one
# GPU. `--include localhost:0` pins the run to GPU 0; the launcher's own
# device selection is used instead of CUDA_VISIBLE_DEVICES.
echo "==> Run single-GPU test (deepspeed --include localhost:0)"
deepspeed --include localhost:0 train_single_gpu_min.py

echo "=== DONE: DeepSpeed built with CUDA 11.8 and single-GPU test passed ==="