diff --git a/app/main.py b/app/main.py
index 17ada9d..2fd092f 100644
--- a/app/main.py
+++ b/app/main.py
@@ -99,7 +99,7 @@ def _choose_precision_by_idx(idx: int) -> str:
             return "fp16"
         return "fp32"
     except Exception:
-        return "fp16" if torch.cuda.is_available() else "fp32"
+        return "fp32"
 
 def load_model(device: str):
     """
@@ -225,6 +225,11 @@ elif args.device is not None:
 else:
     model, PRECISION, DEVICE = auto_select_and_load()
+
+# --- global tokenizer (needed, otherwise "name 'tokenizer' is not defined") ---
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
+
+
 
 # -----------------------------------------------------------------------------#
 # FastAPI
 # -----------------------------------------------------------------------------#
@@ -236,11 +241,11 @@ logger.info("Using SAFE_MIN_FREE_MB = %d MB, BATCH_SIZE = %d", SAFE_MIN_FREE_MB,
 def _warmup():
     global _READY
     try:
-        # Try warming up with batch_size; fall back if it's not supported
-        try:
-            model.encode(["warmup sentence"], return_dense=True, batch_size=BATCH_SIZE)
-        except TypeError:
-            _ = _encode_chunked(model, ["warmup sentence"], max(1, min(BATCH_SIZE, 8)))
+        with torch.inference_mode():
+            try:
+                model.encode(["warmup sentence"], return_dense=True, batch_size=BATCH_SIZE)
+            except TypeError:
+                _ = _encode_chunked(model, ["warmup sentence"], max(1, min(BATCH_SIZE, 8)))
         _READY = True
         logger.info("Warm-up complete.")
     except Exception as e: