diff --git a/Dockerfile b/Dockerfile
index 91ff789..9ffb35f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -135,17 +135,18 @@ COPY --from=builder-extras /usr/local/cuda/lib64/libcupti.so /usr/lib/x86_64-lin
 # 👇建议在后面补上
 RUN ldconfig
 
-COPY --from=builder-extras /wheels /tmp/wheels
-#COPY --from=builder-extras /tmp/sgl_kernel_wheel /tmp/sgl_kernel_wheel
+# ---- 拷贝预调优的 MoE Triton kernel config ----------------------------
+COPY moe_kernels /usr/local/lib/python3.10/dist-packages/sglang/srt/layers/moe/fused_moe_triton/configs
+
+
+COPY --from=builder-extras /wheels /tmp/wheels
 
-#RUN python3 -m pip install --no-cache-dir /tmp/wheels/* && rm -rf /tmp/wheels
 # ✅ 优先装你自编的 torch,避免被 PyPI 上的覆盖
 RUN ls -lh /tmp/wheels && \
     rm -f /tmp/wheels/torch-2.7.1a0+*.whl && \
     rm -f /tmp/wheels/huggingface_hub-0.33.4*.whl && \
     python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/torch*.whl && \
     python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/vllm-*.whl && \
-    #python3 -m pip install --no-cache-dir --no-deps /tmp/sgl_kernel_wheel/*.whl && \
     python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/sgl_kernel-*.whl && \
     python3 -m pip install --no-cache-dir --no-deps /tmp/wheels/* && \
     python3 -c "from torch.distributed import Backend; print('✅ Runtime torch distributed OK, GLOO =', Backend.GLOO)" && \
diff --git a/moe_kernels/triton_3_3_1/E=128,N=384,device_name=NVIDIA_GeForce_RTX_3090.json b/moe_kernels/triton_3_3_1/E=128,N=384,device_name=NVIDIA_GeForce_RTX_3090.json
new file mode 100644
index 0000000..661af2d
--- /dev/null
+++ b/moe_kernels/triton_3_3_1/E=128,N=384,device_name=NVIDIA_GeForce_RTX_3090.json
@@ -0,0 +1,10 @@
+{
+  "64": {
+    "BLOCK_SIZE_M": 16,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 32,
+    "num_warps": 8,
+    "num_stages": 2
+  }
+}