This commit is contained in:
hailin 2025-08-05 16:54:36 +08:00
parent 53285eecad
commit dd1ca5728a
2 changed files with 23 additions and 11 deletions

View File

@ -27,8 +27,8 @@ EXPOSE 8001
# 新增:给 PT 显存分段配置,减少碎片 (可选但推荐)
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:32
# 启动Gunicorn + 4 worker每个 worker 一个独立进程
# 启动Gunicorn + 1 worker每个 worker 一个独立进程
CMD ["gunicorn", "app.main:app", \
"-k", "uvicorn.workers.UvicornWorker", \
"-w", "4", \
"-w", "1", \
"-b", "0.0.0.0:8001"]

View File

@ -27,9 +27,9 @@ from FlagEmbedding import BGEM3FlagModel
# Config
# -----------------------------------------------------------------------------#
MODEL_PATH = "model/bge-m3" # 按需改成你的权重路径
MODEL_VRAM_MB = int(os.getenv("MODEL_VRAM_MB", "16384")) # bge-m3-large fp16 ≈ 16 GiB
POST_LOAD_GAP_MB = 512
SAFE_MIN_FREE_MB = MODEL_VRAM_MB + POST_LOAD_GAP_MB # == 16896 MB
MODEL_VRAM_MB = int(os.getenv("MODEL_VRAM_MB", "8000")) # bge-m3-large fp32 ≈ 8 GiB
POST_LOAD_GAP_MB = 192
SAFE_MIN_FREE_MB = MODEL_VRAM_MB + POST_LOAD_GAP_MB # == 8192 MB
# -----------------------------------------------------------------------------#
# Logging
@ -179,12 +179,24 @@ logger.info("Using SAFE_MIN_FREE_MB = %d MB", SAFE_MIN_FREE_MB)
# ② -------- FastAPI 启动预热 --------
@app.on_event("startup")
async def warm_up():
logger.info("Warm-up on %s", DEVICE)
try:
_ = model.encode(["warmup"], return_dense=True, num_processes=1)
except Exception as e:
logger.warning("Warm-up failed: %s — 首条请求时再退避", e)
def warm_up():
    """Warm the embedding model once at startup, isolated in a subprocess.

    Running the first encode in a child process means a CUDA/OOM crash
    during warm-up cannot kill the main server process.

    NOTE(review): with the default 'fork' start method on Linux a forked
    CUDA context is invalid in the child — confirm 'spawn' is configured
    if DEVICE is a GPU, and note that a child-process warm-up does not
    warm any lazy state in THIS process's copy of the model.
    """
    def _warm_worker(batch, result_q):
        # Child: run one encode, report "ok" on success or the error text.
        try:
            _ = model.encode(batch, return_dense=True, num_processes=1)
            result_q.put("ok")
        except Exception as exc:
            result_q.put(str(exc))

    result_q = mp.Queue()
    worker = mp.Process(target=_warm_worker, args=(["warmup"], result_q))
    worker.start()
    worker.join(timeout=60)

    if worker.is_alive():
        # Fix: the original left the worker running forever after a timeout.
        worker.terminate()
        worker.join()
        logger.warning("Warm-up failed or timed out.")
        return

    # Fix: surface the child's error text instead of discarding it.
    outcome = result_q.get() if not result_q.empty() else None
    if outcome == "ok":
        logger.info("Warm-up complete.")
    else:
        logger.warning("Warm-up failed or timed out.")
        if outcome is not None:
            logger.warning("Warm-up worker error: %s", outcome)