.

2025-08-08 17:43:49 +08:00 · 2025-08-08 17:43:49 +08:00 · 5868300787
parent d0ad05f4fb
commit 5868300787
4 changed files with 26 additions and 34 deletions
--- a/README.txt
+++ b/README.txt
@@ -1,8 +1,9 @@
-mamba env create -f hf-train-cu118.yaml --strict-channel-priority
+mamba env create -f env.yaml --strict-channel-priority
 mamba activate hf-train-cu118
 python -m pip install --no-deps -r requirements-hf.txt
-pip install --upgrade pip
+# deepspeed（选一）
-pip install --no-deps -r requirements-hf.txt --constraint constraints-cu118.txt
+DS_BUILD_OPS=0 python -m pip install "deepspeed==0.14.*"
-
+# 或
-# 需要 deepspeed 时再装：
+mamba install -y cuda-toolkit=11.8 cmake ninja
-DS_BUILD_OPS=0 pip install "deepspeed==0.14.*"  # 先不编译 CUDA 内核
+python -m pip install "deepspeed==0.14.*"
--- a/constraints-cu118.txt
+++ b/constraints-cu118.txt
@@ -1,3 +0,0 @@
-torch==2.1.2
-torchvision==0.16.2
-torchaudio==2.1.2
--- a/hf-train-cu118.yaml
+++ b/hf-train-cu118.yaml
@@ -1,32 +1,31 @@
 name: hf-train-cu118
-channels:
+channels: [pytorch, nvidia, conda-forge]
  - pytorch
  - nvidia
  - conda-forge
 dependencies:
  - python=3.10
  - pip
-  # ---- Torch 栈：固定 2.1.2 + cu118 ----
+  # ---- Torch 栈：2.1.2 + cu118 ----
  - pytorch=2.1.2
  - torchvision=0.16.2
  - torchaudio=2.1.2
  - pytorch-cuda=11.8
-  # ---- 避坑：Numpy 钉在 1.26.* ----
+
  # ---- 数值栈 ----
  - numpy=1.26.*
  # ---- 常用科学/系统库 ----
  - pandas
  - scipy
  - pyarrow
-  - uvicorn
+
-  - git
+  # ---- HF 主栈 ----
-  # ---- HF 主栈 + 其运行时依赖（全部走 conda，不让 pip 动依赖）----
+  - transformers>=4.40,<5
  - transformers>=4.40
  - accelerate>=0.30
  - datasets>=2.18
  - evaluate>=0.4
  - safetensors>=0.4
  - sentencepiece>=0.1.99
  - tokenizers=0.19.*
  # ---- 依赖/工具 ----
  - protobuf<5
  - huggingface_hub>=0.23
  - tqdm>=4.66
  - scikit-learn>=1.4
@@ -41,4 +40,9 @@ dependencies:
  - xxhash
  - aiohttp
  - psutil
-
+  # 可选（按需）
  # - einops
  # - ninja
  # - cmake
  # - xformers==0.0.22.post3  # 若真需要
  # - fastapi                # 若要配合 uvicorn 跑服务
--- a/requirements-hf.txt
+++ b/requirements-hf.txt
@@ -1,15 +1,5 @@
 # requirements-hf.txt  （HF 生态）
 transformers>=4.40,<5.0
 accelerate>=0.30
 datasets>=2.18
 evaluate>=0.4
 safetensors>=0.4
 sentencepiece>=0.1.99
 tokenizers>=0.19,<0.21
 huggingface_hub>=0.23
 tqdm>=4.66
 peft>=0.11
 bitsandbytes>=0.43
-tensorboard>=2.16
+# 可选：
-scikit-learn>=1.4
+# xformers==0.0.22.post3
-# deepspeed 单独装，别放进来
+# flash-attn