hailin 2025-08-05 11:35:36 +08:00
parent 2de829db93
commit 94dad95fc2
2 changed files with 61 additions and 22 deletions

View File: Dockerfile

@@ -21,6 +21,14 @@ COPY model/bge-m3 /app/model/bge-m3

 # Expose the port
 EXPOSE 8001

-# Start the FastAPI service
-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8001"]
+# # Start the FastAPI service
+# CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8001"]
+
+# New: segment PyTorch's CUDA memory allocations to reduce fragmentation (optional but recommended)
+ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
+
+# Start Gunicorn with 4 workers, each worker in its own independent process
+CMD ["gunicorn", "app.main:app", \
+     "-k", "uvicorn.workers.UvicornWorker", \
+     "-w", "4", \
+     "-b", "0.0.0.0:8001"]

View File: app/main.py

@@ -15,6 +15,7 @@ import os
 import sys
 import time
 from typing import List, Union
+import multiprocessing as mp

 import torch
 from fastapi import FastAPI, HTTPException
@@ -175,31 +176,61 @@ class EmbeddingRequest(BaseModel):
     model: str = "text-embedding-bge-m3"


-fallback_done = False  # prevent endless downgrade loop
-
-
 def _encode(texts: List[str]):
-    """Encode with single downgrade to CPU on OOM / CUDA failure."""
-    global model, DEVICE, PRECISION, fallback_done
-
-    try:
-        return model.encode(texts, return_dense=True)
-
-    except RuntimeError as err:
-        is_oom = "out of memory" in str(err).lower()
-        is_cuda_fail = "cuda error" in str(err).lower() or "device-side assert" in str(
-            err
-        ).lower()
-
-        if (is_oom or is_cuda_fail) and not fallback_done:
-            logger.error("GPU failure (%s). Falling back to CPU…", err)
-            fallback_done = True
-            torch.cuda.empty_cache()
-            DEVICE = "cpu"
-            model, PRECISION = load_model(DEVICE)
-            return model.encode(texts, return_dense=True)
-
-        raise  # second failure → propagate
+    """
+    Per request:
+    1. Run GPU inference in a subprocess; return the result on success.
+    2. If the subprocess hits OOM / a CUDA error, fall back to CPU for this request only.
+    Global state is never modified, so other concurrent requests are unaffected.
+    """
+    def _worker(t, q):
+        try:
+            q.put(("ok", model.encode(t, return_dense=True)))
+        except Exception as e:
+            q.put(("err", str(e)))
+
+    q = mp.Queue()
+    p = mp.Process(target=_worker, args=(texts, q))
+    p.start()
+    p.join(timeout=60)
+
+    if not q.empty():
+        status, payload = q.get()
+        if status == "ok":
+            return payload
+        if "out of memory" in payload.lower() or "cuda error" in payload.lower():
+            logger.warning("GPU OOM → falling back to CPU for this request: %s", payload)
+            torch.cuda.empty_cache()
+            cpu_model, _ = load_model("cpu")
+            return cpu_model.encode(texts, return_dense=True)
+        raise RuntimeError(payload)
+
+    raise RuntimeError("Worker subprocess exited abnormally without returning a result")
+
+
+# fallback_done = False  # prevent endless downgrade loop
+# def _encode(texts: List[str]):
+#     """Encode with single downgrade to CPU on OOM / CUDA failure."""
+#     global model, DEVICE, PRECISION, fallback_done
+#     try:
+#         return model.encode(texts, return_dense=True)
+#     except RuntimeError as err:
+#         is_oom = "out of memory" in str(err).lower()
+#         is_cuda_fail = "cuda error" in str(err).lower() or "device-side assert" in str(
+#             err
+#         ).lower()
+#         if (is_oom or is_cuda_fail) and not fallback_done:
+#             logger.error("GPU failure (%s). Falling back to CPU…", err)
+#             fallback_done = True
+#             torch.cuda.empty_cache()
+#             DEVICE = "cpu"
+#             model, PRECISION = load_model(DEVICE)
+#             return model.encode(texts, return_dense=True)
+#         raise  # second failure → propagate


 @app.post("/v1/embeddings")