From d0ad05f4fb893f7cb9ea9bb21a8adc8d0c4fe4c5 Mon Sep 17 00:00:00 2001
From: hailin
Date: Fri, 8 Aug 2025 17:20:00 +0800
Subject: [PATCH] first commit

---
Note (ignored by git am): adds a conda environment spec pinned to
PyTorch 2.1.2 / CUDA 11.8, matching pip requirements and constraints,
and three smoke-test scripts (CUDA check, HF import check, tiny training run).

 README.txt            |  8 ++++++++
 check_core_cuda.sh    | 10 ++++++++++
 check_hf.sh           | 11 +++++++++++
 check_train.sh        | 30 +++++++++++++++++++++++++++++
 constraints-cu118.txt |  3 +++
 hf-train-cu118.yaml   | 44 +++++++++++++++++++++++++++++++++++++++++++
 requirements-hf.txt   | 15 +++++++++++++++
 7 files changed, 121 insertions(+)
 create mode 100644 README.txt
 create mode 100644 check_core_cuda.sh
 create mode 100644 check_hf.sh
 create mode 100644 check_train.sh
 create mode 100644 constraints-cu118.txt
 create mode 100644 hf-train-cu118.yaml
 create mode 100644 requirements-hf.txt

diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..cb4374a
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,8 @@
+mamba env create -f hf-train-cu118.yaml --strict-channel-priority
+mamba activate hf-train-cu118
+
+pip install --upgrade pip
+pip install --no-deps -r requirements-hf.txt --constraint constraints-cu118.txt
+
+# 需要 deepspeed 时再装:
+DS_BUILD_OPS=0 pip install "deepspeed==0.14.*" # 先不编译 CUDA 内核
diff --git a/check_core_cuda.sh b/check_core_cuda.sh
new file mode 100644
index 0000000..4172861
--- /dev/null
+++ b/check_core_cuda.sh
@@ -0,0 +1,10 @@
+python - <<'PY'
+import torch
+print("PyTorch 版本:", torch.__version__)
+print("CUDA runtime 版本:", torch.version.cuda)
+print("GPU 可用:", torch.cuda.is_available())
+if torch.cuda.is_available():
+    print("GPU 数量:", torch.cuda.device_count())
+    for i in range(torch.cuda.device_count()):
+        print(f" GPU {i}:", torch.cuda.get_device_name(i))
+PY
diff --git a/check_hf.sh b/check_hf.sh
new file mode 100644
index 0000000..12833e7
--- /dev/null
+++ b/check_hf.sh
@@ -0,0 +1,11 @@
+python - <<'PY'
+import transformers, accelerate, datasets, safetensors, sentencepiece, peft, bitsandbytes
+print("Transformers:", transformers.__version__)
+print("Accelerate:", accelerate.__version__)
+print("Datasets:", datasets.__version__)
+print("Safetensors:", safetensors.__version__)
+print("SentencePiece:", sentencepiece.__version__)
+print("PEFT:", peft.__version__)
+print("BitsAndBytes:", bitsandbytes.__version__)
+PY
+
diff --git a/check_train.sh b/check_train.sh
new file mode 100644
index 0000000..96db7ba
--- /dev/null
+++ b/check_train.sh
@@ -0,0 +1,30 @@
+python - <<'PY'
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
+import torch
+
+model_id = "sshleifer/tiny-gpt2" # 极小模型
+tok = AutoTokenizer.from_pretrained(model_id)
+tok.pad_token = tok.eos_token
+
+ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
+def tok_fn(ex): return tok(ex["text"], truncation=True, padding="max_length", max_length=64)
+ds = ds.map(tok_fn, batched=True, remove_columns=["text"])
+
+mdl = AutoModelForCausalLM.from_pretrained(model_id)
+collator = DataCollatorForLanguageModeling(tok, mlm=False)
+args = TrainingArguments(
+    output_dir="out-mini",
+    per_device_train_batch_size=2,
+    num_train_epochs=1,
+    fp16=torch.cuda.is_available(),
+    logging_steps=2,
+    save_steps=10,
+    report_to="none",
+)
+
+trainer = Trainer(model=mdl, args=args, train_dataset=ds, data_collator=collator)
+trainer.train()
+print("✅ 训练链路 OK")
+PY
+
diff --git a/constraints-cu118.txt b/constraints-cu118.txt
new file mode 100644
index 0000000..53947a6
--- /dev/null
+++ b/constraints-cu118.txt
@@ -0,0 +1,3 @@
+torch==2.1.2
+torchvision==0.16.2
+torchaudio==2.1.2
diff --git a/hf-train-cu118.yaml b/hf-train-cu118.yaml
new file mode 100644
index 0000000..107cabe
--- /dev/null
+++ b/hf-train-cu118.yaml
@@ -0,0 +1,44 @@
+name: hf-train-cu118
+channels:
+  - pytorch
+  - nvidia
+  - conda-forge
+dependencies:
+  - python=3.10
+  - pip
+  # ---- Torch 栈:固定 2.1.2 + cu118 ----
+  - pytorch=2.1.2
+  - torchvision=0.16.2
+  - torchaudio=2.1.2
+  - pytorch-cuda=11.8
+  # ---- 避坑:Numpy 钉在 1.26.* ----
+  - numpy=1.26.*
+  # ---- 常用科学/系统库 ----
+  - pandas
+  - scipy
+  - pyarrow
+  - uvicorn
+  - git
+  # ---- HF 主栈 + 其运行时依赖(全部走 conda,不让 pip 动依赖)----
+  - transformers>=4.40
+  - accelerate>=0.30
+  - datasets>=2.18
+  - evaluate>=0.4
+  - safetensors>=0.4
+  - sentencepiece>=0.1.99
+  - tokenizers=0.19.*
+  - huggingface_hub>=0.23
+  - tqdm>=4.66
+  - scikit-learn>=1.4
+  - tensorboard>=2.16
+  - packaging
+  - regex
+  - pyyaml
+  - requests
+  - fsspec
+  - dill
+  - multiprocess
+  - xxhash
+  - aiohttp
+  - psutil
+
diff --git a/requirements-hf.txt b/requirements-hf.txt
new file mode 100644
index 0000000..da833f0
--- /dev/null
+++ b/requirements-hf.txt
@@ -0,0 +1,15 @@
+# requirements-hf.txt (HF 生态)
+transformers>=4.40,<5.0
+accelerate>=0.30
+datasets>=2.18
+evaluate>=0.4
+safetensors>=0.4
+sentencepiece>=0.1.99
+tokenizers>=0.19,<0.21
+huggingface_hub>=0.23
+tqdm>=4.66
+peft>=0.11
+bitsandbytes>=0.43
+tensorboard>=2.16
+scikit-learn>=1.4
+# deepspeed 单独装,别放进来