diff --git a/Dockerfile b/Dockerfile
index cf5009c..97cfbf9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,11 +11,11 @@ ENV USE_CUDA=1 \
     USE_NCCL=1 \
     USE_SYSTEM_NCCL=1 \
     BUILD_TEST=0
-
+
 ARG MAX_JOBS=90
 ENV DEBIAN_FRONTEND=noninteractive \
     PYTHONUNBUFFERED=1 LANG=C.UTF-8 LC_ALL=C.UTF-8 \
-    USE_CUDA=1 USE_DISTRIBUTED=0 BUILD_TEST=0 TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
+    TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0"
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
     python3 python3-dev python3-pip python3-distutils git cmake ninja-build \
@@ -30,7 +30,9 @@ RUN git clone --recursive -b v2.7.1 https://github.com/pytorch/pytorch.git
 WORKDIR /opt/pytorch
 ENV MAX_JOBS=${MAX_JOBS}
-RUN python3 setup.py bdist_wheel # ≈50‒60 min 首编
+RUN echo "Building PyTorch with USE_DISTRIBUTED=$USE_DISTRIBUTED" && \
+    python3 setup.py bdist_wheel && \
+    python3 -c "from torch.distributed import Backend; print('✅ Build success. GLOO =', Backend.GLOO)"
 
 ###############################################################################
 # Stage 1 ─ builder-extras:用自编 Torch 装 TV / flashinfer / sglang,并收集轮子
 ###############################################################################
@@ -105,7 +105,12 @@ COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-lin
 RUN ldconfig
 
 COPY --from=builder-extras /wheels /tmp/wheels
-RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
+#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
+# ✅ 优先装你自编的 torch,避免被 PyPI 上的覆盖
+RUN python3 -m pip install --no-cache-dir /tmp/wheels/torch*.whl && \
+    python3 -m pip install --no-cache-dir /tmp/wheels/* && \
+    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
+    rm -rf /tmp/wheels
 
 # 安装运行时漏掉的依赖
 RUN python3 -m pip install --no-cache-dir pydantic orjson psutil pyzmq pynvml transformers==4.48.3 uvicorn fastapi IPython aiohttp setproctitle uvloop sentencepiece triton