This commit is contained in:
parent
2de829db93
commit
94dad95fc2
12
Dockerfile
12
Dockerfile
|
|
@@ -21,6 +21,14 @@ COPY model/bge-m3 /app/model/bge-m3
|
||||||
# 暴露端口
|
# 暴露端口
|
||||||
EXPOSE 8001
|
EXPOSE 8001
|
||||||
|
|
||||||
# 启动 FastAPI 服务
|
# # 启动 FastAPI 服务
|
||||||
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8001"]
|
# CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8001"]
|
||||||
|
|
||||||
|
# 新增:给 PT 显存分段配置,减少碎片 (可选但推荐)
|
||||||
|
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
|
||||||
|
|
||||||
|
# 启动:Gunicorn + 4 worker,每个 worker 一个独立进程
|
||||||
|
CMD ["gunicorn", "app.main:app", \
|
||||||
|
"-k", "uvicorn.workers.UvicornWorker", \
|
||||||
|
"-w", "4", \
|
||||||
|
"-b", "0.0.0.0:8001"]
|
||||||
71
app/main.py
71
app/main.py
|
|
@@ -15,6 +15,7 @@ import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
|
import multiprocessing as mp
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from fastapi import FastAPI, HTTPException
|
from fastapi import FastAPI, HTTPException
|
||||||
|
|
@@ -175,31 +176,61 @@ class EmbeddingRequest(BaseModel):
|
||||||
model: str = "text-embedding-bge-m3"
|
model: str = "text-embedding-bge-m3"
|
||||||
|
|
||||||
|
|
||||||
fallback_done = False # prevent endless downgrade loop
|
|
||||||
|
|
||||||
|
|
||||||
def _encode(texts: List[str]):
|
def _encode(texts: List[str]):
|
||||||
"""Encode with single downgrade to CPU on OOM / CUDA failure."""
|
"""
|
||||||
global model, DEVICE, PRECISION, fallback_done
|
单次请求:
|
||||||
|
1. 子进程跑 GPU 推理;成功→返回
|
||||||
|
2. 若子进程 OOM / CUDA Error → 同一次请求 fallback 到 CPU
|
||||||
|
绝不改全局状态,其他并发请求不受影响
|
||||||
|
"""
|
||||||
|
def _worker(t, q):
|
||||||
|
try:
|
||||||
|
q.put(("ok", model.encode(t, return_dense=True)))
|
||||||
|
except Exception as e:
|
||||||
|
q.put(("err", str(e)))
|
||||||
|
|
||||||
try:
|
q = mp.Queue()
|
||||||
return model.encode(texts, return_dense=True)
|
p = mp.Process(target=_worker, args=(texts, q))
|
||||||
|
p.start()
|
||||||
|
p.join(timeout=60)
|
||||||
|
|
||||||
except RuntimeError as err:
|
if not q.empty():
|
||||||
is_oom = "out of memory" in str(err).lower()
|
status, payload = q.get()
|
||||||
is_cuda_fail = "cuda error" in str(err).lower() or "device-side assert" in str(
|
if status == "ok":
|
||||||
err
|
return payload
|
||||||
).lower()
|
if "out of memory" in payload.lower() or "cuda error" in payload.lower():
|
||||||
|
logger.warning("GPU OOM → 本次请求改走 CPU:%s", payload)
|
||||||
if (is_oom or is_cuda_fail) and not fallback_done:
|
|
||||||
logger.error("GPU failure (%s). Falling back to CPU…", err)
|
|
||||||
fallback_done = True
|
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
DEVICE = "cpu"
|
cpu_model, _ = load_model("cpu")
|
||||||
model, PRECISION = load_model(DEVICE)
|
return cpu_model.encode(texts, return_dense=True)
|
||||||
return model.encode(texts, return_dense=True)
|
raise RuntimeError(payload)
|
||||||
|
|
||||||
raise # second failure → propagate
|
raise RuntimeError("子进程异常退出,无返回")
|
||||||
|
|
||||||
|
# fallback_done = False # prevent endless downgrade loop
|
||||||
|
|
||||||
|
# def _encode(texts: List[str]):
|
||||||
|
# """Encode with single downgrade to CPU on OOM / CUDA failure."""
|
||||||
|
# global model, DEVICE, PRECISION, fallback_done
|
||||||
|
|
||||||
|
# try:
|
||||||
|
# return model.encode(texts, return_dense=True)
|
||||||
|
|
||||||
|
# except RuntimeError as err:
|
||||||
|
# is_oom = "out of memory" in str(err).lower()
|
||||||
|
# is_cuda_fail = "cuda error" in str(err).lower() or "device-side assert" in str(
|
||||||
|
# err
|
||||||
|
# ).lower()
|
||||||
|
|
||||||
|
# if (is_oom or is_cuda_fail) and not fallback_done:
|
||||||
|
# logger.error("GPU failure (%s). Falling back to CPU…", err)
|
||||||
|
# fallback_done = True
|
||||||
|
# torch.cuda.empty_cache()
|
||||||
|
# DEVICE = "cpu"
|
||||||
|
# model, PRECISION = load_model(DEVICE)
|
||||||
|
# return model.encode(texts, return_dense=True)
|
||||||
|
|
||||||
|
# raise # second failure → propagate
|
||||||
|
|
||||||
|
|
||||||
@app.post("/v1/embeddings")
|
@app.post("/v1/embeddings")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue