This commit is contained in:
hailin 2025-08-08 22:07:59 +08:00
parent 28c3ac0b0f
commit afca830e53
1 changed files with 12 additions and 48 deletions

View File

@ -144,77 +144,41 @@ WORKDIR="$(pwd)"
# Bring ${REPO_PATH} to a clean checkout of ${REPO_BRANCH} from ${REPO_URL}.
# An existing work tree whose origin matches is updated in place; anything
# else (missing directory, not a git repo, different origin) is deleted and
# re-cloned exactly once.
REPO_PATH="${WORKDIR}/${REPO_DIR}"
need_clone=1
if git -C "${REPO_PATH}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  # Already a git work tree: only reuse it when origin points at REPO_URL.
  # A missing origin yields an empty string, which never matches.
  cur_remote=$(git -C "${REPO_PATH}" remote get-url origin || true)
  if [[ "${cur_remote}" == "${REPO_URL}" ]]; then
    need_clone=0
  else
    log "remote 不匹配,重建仓库"
  fi
fi

if (( need_clone )); then
  # ${REPO_PATH:?} aborts instead of expanding to an empty path.
  rm -rf -- "${REPO_PATH:?}"
  # Clone once only: a second clone into the now-populated directory would
  # fail with "destination path already exists and is not an empty directory".
  git clone --branch "${REPO_BRANCH}" --depth 1 \
    --recurse-submodules --shallow-submodules \
    "${REPO_URL}" "${REPO_PATH}"
else
  log "更新已有仓库到 ${REPO_BRANCH}"
  # Explicit refspec: a --depth 1 clone is single-branch, so a plain
  # "fetch origin <branch>" would not create origin/<branch> when REPO_BRANCH
  # differs from the branch that was originally cloned.
  git -C "${REPO_PATH}" fetch --depth 1 origin \
    "+refs/heads/${REPO_BRANCH}:refs/remotes/origin/${REPO_BRANCH}"
  # -B creates the local branch if it is missing, or resets it if it exists.
  git -C "${REPO_PATH}" checkout -f -B "${REPO_BRANCH}" "origin/${REPO_BRANCH}"
  git -C "${REPO_PATH}" reset --hard "origin/${REPO_BRANCH}"
  git -C "${REPO_PATH}" clean -fdx
  # Sync submodule URLs and move submodules to the recorded commits.
  git -C "${REPO_PATH}" submodule sync --recursive
  git -C "${REPO_PATH}" submodule update --init --recursive --depth 1
fi
# Belt and braces: make sure submodules are initialised and at the recorded
# commits regardless of which path produced the checkout.
git -C "${REPO_PATH}" submodule sync --recursive
git -C "${REPO_PATH}" submodule update --init --recursive --depth 1

# ds-kernels is not installed: a training-only environment does not need the
# inference CUTLASS kernels.
log "跳过 ds-kernels 安装(推理相关组件已禁用)"

# Basic sanity check: the checkout must contain a Python build file.
# [[ … || … ]] replaces the obsolescent, ambiguous "test … -o …" form.
[[ -f "${REPO_PATH}/pyproject.toml" || -f "${REPO_PATH}/setup.py" ]] \
  || die "DeepSpeed 源码目录缺少构建文件。"
# =============================
# 6. Build and install DeepSpeed (training-only build, no inference components)
# =============================
log "Build & install DeepSpeed (training only, no inference ops)"
cd "${REPO_PATH}" || die "cannot cd to ${REPO_PATH}"

# Best effort on purpose: there may be no previous installation to remove.
pip uninstall -y deepspeed >/dev/null 2>&1 || true
# Drop stale build artifacts (path is relative to REPO_PATH after the cd).
rm -rf -- build

# Build prerequisites, installed into the interpreter that `python` resolves to.
python -m pip install -U pip setuptools wheel ninja

# Parallel compile jobs; respects a caller-provided MAX_JOBS.
export MAX_JOBS="${MAX_JOBS:-$(nproc)}"

# ---- DeepSpeed feature switches ----
# NOTE(review): confirm each DS_BUILD_* name below against the BUILD_VAR of the
# op builders in the checked-out DeepSpeed version; names that the build does
# not recognise are presumably ignored silently.

# Build the core CUDA/C++ extensions ahead of time.
export DS_BUILD_OPS=1

# Kernels kept for training.
export DS_BUILD_TRANSFORMER=1
export DS_BUILD_SPARSE_ATTN=1
export DS_BUILD_FLASH_ATTN=1

# Commonly used optimizers.
export DS_BUILD_FUSED_ADAM=1
export DS_BUILD_CPU_ADAM=1

# Optional: async I/O (Linux only; Windows does not support AIO/GDS).
export DS_BUILD_AIO=1

# Inference-related and other unneeded components stay disabled.
export DS_BUILD_INFERENCE=0
export DS_BUILD_CUTLASS=0
export DS_BUILD_QUANTIZER=0
export DS_BUILD_FP_QUANTIZER=0
export DS_BUILD_EVOFORMER_ATTN=0
export DS_BUILD_GDS=0

# Install once, with verbose build output. Running a second install of the
# same tree would repeat the whole native build for no benefit.
pip install -v "${REPO_PATH}"
# =============================