hailin 2025-08-05 17:17:45 +08:00
parent c6c629f376
commit 2dd0928d6e
1 changed file with 10 additions and 21 deletions


@@ -217,29 +217,18 @@ def _worker(t, q):
 def _encode(texts: List[str]):
-    """
-    Single-request flow:
-    1. Run GPU inference in a subprocess and return on success.
-    2. On subprocess OOM / CUDA error, fall back to CPU for this same request.
-    Never mutates global state, so other concurrent requests are unaffected.
-    """
-    q = mp.Queue()
-    p = mp.Process(target=_worker, args=(texts, q))
-    p.start()
-    p.join(timeout=60)
-    if not q.empty():
-        status, payload = q.get()
-        if status == "ok":
-            return payload
-        if "out of memory" in payload.lower() or "cuda error" in payload.lower():
-            logger.warning("GPU OOM → this request falls back to CPU: %s", payload)
+    try:
+        return model.encode(texts, return_dense=True)
+    except RuntimeError as e:
+        if "out of memory" in str(e).lower() or "cuda error" in str(e).lower():
+            logger.warning("GPU OOM → fallback to CPU: %s", str(e))
             torch.cuda.empty_cache()
-            cpu_model, _ = load_model("cpu")
-            return cpu_model.encode(texts, return_dense=True)
-        raise RuntimeError(payload)
-    raise RuntimeError("Subprocess exited abnormally with no result")
+            global CPU_MODEL_CACHE
+            if CPU_MODEL_CACHE is None:
+                CPU_MODEL_CACHE, _ = load_model("cpu")
+            return CPU_MODEL_CACHE.encode(texts, return_dense=True)
+        raise


 @app.post("/v1/embeddings")