hailin 2025-08-05 11:35:36 +08:00
parent 2de829db93
commit 94dad95fc2
2 changed files with 61 additions and 22 deletions

View File

@@ -21,6 +21,14 @@ COPY model/bge-m3 /app/model/bge-m3
# Expose the service port
EXPOSE 8001
# # Start the FastAPI service
# CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8001"]
# New: split PyTorch CUDA allocations into smaller blocks to reduce memory fragmentation (optional but recommended)
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
# Start Gunicorn with 4 workers; each worker runs in its own process
CMD ["gunicorn", "app.main:app", \
     "-k", "uvicorn.workers.UvicornWorker", \
     "-w", "4", \
     "-b", "0.0.0.0:8001"]

View File

@@ -15,6 +15,7 @@ import os
import sys
import time
from typing import List, Union
import multiprocessing as mp
import torch
from fastapi import FastAPI, HTTPException
@@ -175,31 +176,61 @@ class EmbeddingRequest(BaseModel):
model: str = "text-embedding-bge-m3"
def _encode(texts: List[str]):
    """
    Per-request strategy:
    1. Run GPU inference in a child process; return the result on success.
    2. If the child process hits an OOM / CUDA error, fall back to CPU for this request only.
    Global state is never modified, so other concurrent requests are unaffected.
    """
    def _worker(t, q):
        try:
            q.put(("ok", model.encode(t, return_dense=True)))
        except Exception as e:
            q.put(("err", str(e)))

    q = mp.Queue()
    p = mp.Process(target=_worker, args=(texts, q))
    p.start()
    p.join(timeout=60)

    if not q.empty():
        status, payload = q.get()
        if status == "ok":
            return payload
        if "out of memory" in payload.lower() or "cuda error" in payload.lower():
            logger.warning("GPU OOM → falling back to CPU for this request: %s", payload)
            torch.cuda.empty_cache()
            cpu_model, _ = load_model("cpu")
            return cpu_model.encode(texts, return_dense=True)
        raise RuntimeError(payload)
    raise RuntimeError("Worker subprocess exited abnormally with no result")
# fallback_done = False  # prevent endless downgrade loop
# def _encode(texts: List[str]):
#     """Encode with single downgrade to CPU on OOM / CUDA failure."""
#     global model, DEVICE, PRECISION, fallback_done
#     try:
#         return model.encode(texts, return_dense=True)
#     except RuntimeError as err:
#         is_oom = "out of memory" in str(err).lower()
#         is_cuda_fail = "cuda error" in str(err).lower() or "device-side assert" in str(
#             err
#         ).lower()
#         if (is_oom or is_cuda_fail) and not fallback_done:
#             logger.error("GPU failure (%s). Falling back to CPU…", err)
#             fallback_done = True
#             torch.cuda.empty_cache()
#             DEVICE = "cpu"
#             model, PRECISION = load_model(DEVICE)
#             return model.encode(texts, return_dense=True)
#         raise  # second failure → propagate
@app.post("/v1/embeddings")