This commit is contained in:
parent
fe580f3615
commit
cea0402f14
26
app/main.py
26
app/main.py
|
|
@@ -89,6 +89,10 @@ def load_model(device: str):
|
|||
|
||||
mdl = BGEM3FlagModel(MODEL_PATH, use_fp16=use_fp16, device=device)
|
||||
|
||||
if device == "cpu":
|
||||
# 屏蔽 GPU,让后续 torch / BGEM3 都认不出 CUDA
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
||||
|
||||
# Simple DataParallel for multi-GPU inference
|
||||
if device.startswith("cuda") and torch.cuda.device_count() > 1:
|
||||
logger.info(
|
||||
|
|
@@ -173,25 +177,21 @@ logger.info("Using SAFE_MIN_FREE_MB = %d MB", SAFE_MIN_FREE_MB)
|
|||
|
||||
@app.on_event("startup")
|
||||
async def warm_up_mp_pool():
|
||||
"""
|
||||
GPU: 建立多进程池预热
|
||||
CPU: 单进程预热,但先临时把 torch 看到的 GPU 设为 0 张
|
||||
"""
|
||||
try:
|
||||
if DEVICE.startswith("cuda"):
|
||||
logger.info("Warm-up (GPU) → 预生成多进程池")
|
||||
_ = model.encode(["warmup"], return_dense=True) # 多 GPU
|
||||
logger.info("Warm-up (GPU) → 建多进程池")
|
||||
_ = model.encode(["warmup"], return_dense=True) # GPU
|
||||
else:
|
||||
logger.info("Warm-up (CPU) → 单进程初始化")
|
||||
# --- 关键 3 行 ---------------------------------------------------
|
||||
orig_cnt = torch.cuda.device_count # 保存原函数
|
||||
torch.cuda.device_count = lambda: 0 # 伪装无 GPU
|
||||
# 双保险:彻底把 BGEM3 的设备表改成只含 CPU
|
||||
if hasattr(model, "devices"):
|
||||
model.devices = ["cpu"]
|
||||
model.device = "cpu"
|
||||
_ = model.encode(["warmup"], return_dense=True,
|
||||
num_processes=1) # 单进程
|
||||
torch.cuda.device_count = orig_cnt # 恢复
|
||||
# ----------------------------------------------------------------
|
||||
num_processes=1) # 不再 fork
|
||||
except Exception as e:
|
||||
logger.warning("Warm-up failed: %s — 将在首条请求时再退避", e)
|
||||
logger.warning("Warm-up failed: %s —— 首条请求时再退避", e)
|
||||
|
||||
|
||||
|
||||
class EmbeddingRequest(BaseModel):
|
||||
|
|
|
|||
Loading…
Reference in New Issue