.

2025-07-26 22:19:16 +08:00 · 2025-07-26 22:19:16 +08:00 · d1a2b815b3
parent 49b8cae1bb
commit d1a2b815b3
2 changed files with 15 additions and 4 deletions
--- a/9
+++ b/9
@@ -135,17 +135,18 @@ COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-lin
 # 👇建议在后面补上
 RUN ldconfig

-COPY --from=builder-extras /wheels /tmp/wheels
-#COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
+# ---- 拷贝预调优的 MoE Triton kernel config ----------------------------
+COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
+
+
+COPY --from=builder-extras /wheels /tmp/wheels

-#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
 # ✅ 优先装你自编的 torch，避免被 PyPI 上的覆盖
 RUN ls -lh /tmp/wheels && \
    rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
    rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
-    #python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
    python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
    python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
--- a/moe_kernels/triton_3_3_1/E=128,N=384,device_name=NVIDIA_GeForce_RTX_3090.json
+++ b/moe_kernels/triton_3_3_1/E=128,N=384,device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,10 @@
+{
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    }
+}