From 94dad95fc21b46efff258d483d255de6dbfd2406 Mon Sep 17 00:00:00 2001
From: hailin
Date: Tue, 5 Aug 2025 11:35:36 +0800
Subject: [PATCH] .

---
 Dockerfile  | 12 +++++++--
 app/main.py | 71 ++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ea22c57..6a4056d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -21,6 +21,14 @@ COPY model/bge-m3 /app/model/bge-m3
 
 # Expose the service port
 EXPOSE 8001
 
-# Start the FastAPI service
-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8001"]
+# # Start the FastAPI service
+# CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8001"]
+# New: cap the PyTorch CUDA allocator split size to reduce memory fragmentation (optional but recommended)
+ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
+
+# Startup: Gunicorn with 4 workers, each worker in its own process
+CMD ["gunicorn", "app.main:app", \
+     "-k", "uvicorn.workers.UvicornWorker", \
+     "-w", "4", \
+     "-b", "0.0.0.0:8001"]
\ No newline at end of file
diff --git a/app/main.py b/app/main.py
index 7c399cc..0b686ba 100644
--- a/app/main.py
+++ b/app/main.py
@@ -15,6 +15,7 @@ import os
 import sys
 import time
 from typing import List, Union
+import multiprocessing as mp
 
 import torch
 from fastapi import FastAPI, HTTPException
@@ -175,31 +176,61 @@ class EmbeddingRequest(BaseModel):
     model: str = "text-embedding-bge-m3"
 
 
-fallback_done = False  # prevent endless downgrade loop
-
-
 def _encode(texts: List[str]):
-    """Encode with single downgrade to CPU on OOM / CUDA failure."""
-    global model, DEVICE, PRECISION, fallback_done
+    """
+    Per-request behavior:
+    1. Run GPU inference in a child process; on success, return the result.
+    2. If the child hits OOM / a CUDA error, fall back to CPU for this same request.
+    Global state is never modified, so other concurrent requests are unaffected.
+    """
+    def _worker(t, q):
+        try:
+            q.put(("ok", model.encode(t, return_dense=True)))
+        except Exception as e:
+            q.put(("err", str(e)))
 
-    try:
-        return model.encode(texts, return_dense=True)
+    q = mp.Queue()
+    p = mp.Process(target=_worker, args=(texts, q))
+    p.start()
+    p.join(timeout=60)
 
-    except RuntimeError as err:
-        is_oom = "out of memory" in str(err).lower()
-        is_cuda_fail = "cuda error" in str(err).lower() or "device-side assert" in str(
-            err
-        ).lower()
-
-        if (is_oom or is_cuda_fail) and not fallback_done:
-            logger.error("GPU failure (%s). Falling back to CPU…", err)
-            fallback_done = True
+    if not q.empty():
+        status, payload = q.get()
+        if status == "ok":
+            return payload
+        if "out of memory" in payload.lower() or "cuda error" in payload.lower():
+            logger.warning("GPU OOM → retrying this request on CPU: %s", payload)
             torch.cuda.empty_cache()
-            DEVICE = "cpu"
-            model, PRECISION = load_model(DEVICE)
-            return model.encode(texts, return_dense=True)
+            cpu_model, _ = load_model("cpu")
+            return cpu_model.encode(texts, return_dense=True)
+        raise RuntimeError(payload)
 
-        raise  # second failure → propagate
+    raise RuntimeError("Worker process exited abnormally without returning a result")
+
+# fallback_done = False  # prevent endless downgrade loop
+
+# def _encode(texts: List[str]):
+#     """Encode with single downgrade to CPU on OOM / CUDA failure."""
+#     global model, DEVICE, PRECISION, fallback_done
+
+#     try:
+#         return model.encode(texts, return_dense=True)
+
+#     except RuntimeError as err:
+#         is_oom = "out of memory" in str(err).lower()
+#         is_cuda_fail = "cuda error" in str(err).lower() or "device-side assert" in str(
+#             err
+#         ).lower()
+
+#         if (is_oom or is_cuda_fail) and not fallback_done:
+#             logger.error("GPU failure (%s). Falling back to CPU…", err)
+#             fallback_done = True
+#             torch.cuda.empty_cache()
+#             DEVICE = "cpu"
+#             model, PRECISION = load_model(DEVICE)
+#             return model.encode(texts, return_dense=True)
+
+#         raise  # second failure → propagate
 
 
 @app.post("/v1/embeddings")
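
Note on the new _encode() (not part of the patch): Python's multiprocessing documentation warns that joining a child process which has put a large object on a Queue can block until the payload is drained, because the child's feeder thread must flush the data before the process can exit; with p.join(timeout=60) placed before q.get(), a large batch of dense embeddings may therefore stall for the full timeout. The sketch below shows the same per-request isolation idea with the result consumed before the join and a hung worker terminated explicitly. It is an illustrative variant under assumptions, not the patch's implementation: it reuses the model, load_model, and logger objects defined elsewhere in app/main.py, relies on the default "fork" start method on Linux so the child inherits the already-loaded model, and the names _gpu_worker and encode_isolated are hypothetical.

import multiprocessing as mp
from queue import Empty
from typing import List

import torch

def _gpu_worker(texts: List[str], q) -> None:
    # Child process: encode on the inherited GPU model and report either
    # the result or the error text back to the parent.
    try:
        q.put(("ok", model.encode(texts, return_dense=True)))
    except Exception as exc:  # OOM / CUDA errors surface here as text
        q.put(("err", str(exc)))

def encode_isolated(texts: List[str], timeout: float = 60.0):
    q = mp.Queue()
    p = mp.Process(target=_gpu_worker, args=(texts, q))
    p.start()
    try:
        # Read the result *before* joining so a large payload cannot stall
        # the child's queue feeder thread until the join timeout expires.
        status, payload = q.get(timeout=timeout)
    except Empty:
        p.terminate()  # worker hung or died without reporting anything
        p.join()
        raise RuntimeError("GPU worker produced no result within the timeout")
    p.join()

    if status == "ok":
        return payload
    if "out of memory" in payload.lower() or "cuda error" in payload.lower():
        logger.warning("GPU failure, retrying this request on CPU: %s", payload)
        torch.cuda.empty_cache()
        cpu_model, _ = load_model("cpu")  # load_model() as defined in app/main.py
        return cpu_model.encode(texts, return_dense=True)
    raise RuntimeError(payload)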
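
For reference, a minimal client-side check of the Gunicorn deployment configured above (4 UvicornWorker processes bound to 0.0.0.0:8001). This is a sketch, not part of the patch: it assumes the service is reachable on localhost:8001, that the requests package is installed, and that EmbeddingRequest accepts an OpenAI-style "input" field next to the "model" field visible in the hunk context (the "input" field name is an assumption and is not shown in this diff).

import requests

resp = requests.post(
    "http://localhost:8001/v1/embeddings",
    json={
        "model": "text-embedding-bge-m3",  # default model name from app/main.py
        "input": ["hello world", "a second sentence"],  # field name assumed (OpenAI-style schema)
    },
    timeout=120,  # generous: the per-request CPU fallback can be slow
)
resp.raise_for_status()
print(resp.json())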