From dd1ca5728a2af60225b6d62001de825a9e1de3e1 Mon Sep 17 00:00:00 2001
From: hailin
Date: Tue, 5 Aug 2025 16:54:36 +0800
Subject: [PATCH] .

---
 Dockerfile  |  4 ++--
 app/main.py | 30 +++++++++++++++++++++---------
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6a4056d..672d237 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,8 +27,8 @@ EXPOSE 8001
 # 新增：给 PT 显存分段配置，减少碎片 (可选但推荐)
 ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32

-# 启动：Gunicorn + 4 worker，每个 worker 一个独立进程
+# 启动：Gunicorn + 1 worker，每个 worker 一个独立进程
 CMD ["gunicorn", "app.main:app", \
      "-k", "uvicorn.workers.UvicornWorker", \
-     "-w", "4", \
+     "-w", "1", \
      "-b", "0.0.0.0:8001"]
\ No newline at end of file
diff --git a/app/main.py b/app/main.py
index f86c632..88c4b15 100644
--- a/app/main.py
+++ b/app/main.py
@@ -27,9 +27,9 @@ from FlagEmbedding import BGEM3FlagModel
 # Config
 # -----------------------------------------------------------------------------#
 MODEL_PATH = "model/bge-m3"  # 按需改成你的权重路径
-MODEL_VRAM_MB = int(os.getenv("MODEL_VRAM_MB", "16384"))  # bge-m3-large fp16 ≈ 16 GiB
-POST_LOAD_GAP_MB = 512
-SAFE_MIN_FREE_MB = MODEL_VRAM_MB + POST_LOAD_GAP_MB  # == 16896 MB
+MODEL_VRAM_MB = int(os.getenv("MODEL_VRAM_MB", "8000"))  # bge-m3-large fp32 ≈ 8 GiB
+POST_LOAD_GAP_MB = 192
+SAFE_MIN_FREE_MB = MODEL_VRAM_MB + POST_LOAD_GAP_MB  # == 8192 MB

 # -----------------------------------------------------------------------------#
 # Logging
@@ -179,12 +179,24 @@ logger.info("Using SAFE_MIN_FREE_MB = %d MB", SAFE_MIN_FREE_MB)

 # ② -------- FastAPI 启动预热 --------
 @app.on_event("startup")
-async def warm_up():
-    logger.info("Warm-up on %s", DEVICE)
-    try:
-        _ = model.encode(["warmup"], return_dense=True, num_processes=1)
-    except Exception as e:
-        logger.warning("Warm-up failed: %s — 首条请求时再退避", e)
+def warm_up():
+    def _warm_worker(t, q):
+        try:
+            _ = model.encode(t, return_dense=True, num_processes=1)
+            q.put("ok")
+        except Exception as e:
+            q.put(str(e))
+
+    texts = ["warmup"]
+
+    q = mp.Queue()
+    p = mp.Process(target=_warm_worker, args=(texts, q))
+    p.start()
+    p.join(timeout=60)
+
+    if not q.empty() and q.get() == "ok":
+        logger.info("Warm-up complete.")
+    else:
+        logger.warning("Warm-up failed or timed out.")