diff --git a/app/main.py b/app/main.py
index 17ada9d..2fd092f 100644
--- a/app/main.py
+++ b/app/main.py
@@ -99,7 +99,7 @@ def _choose_precision_by_idx(idx: int) -> str:
             return "fp16"
         return "fp32"
     except Exception:
-        return "fp16" if torch.cuda.is_available() else "fp32"
+        return "fp32"
 
 def load_model(device: str):
     """
@@ -225,6 +225,11 @@ elif args.device is not None:
 else:
     model, PRECISION, DEVICE = auto_select_and_load()
+
+# --- global tokenizer (needed, otherwise "name 'tokenizer' is not defined") ---
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
+
+
 
 # -----------------------------------------------------------------------------#
 # FastAPI
 # -----------------------------------------------------------------------------#
@@ -236,11 +241,11 @@ logger.info("Using SAFE_MIN_FREE_MB = %d MB, BATCH_SIZE = %d", SAFE_MIN_FREE_MB,
 def _warmup():
     global _READY
     try:
-        # Try warming up with batch_size; fall back if it's not supported
-        try:
-            model.encode(["warmup sentence"], return_dense=True, batch_size=BATCH_SIZE)
-        except TypeError:
-            _ = _encode_chunked(model, ["warmup sentence"], max(1, min(BATCH_SIZE, 8)))
+        with torch.inference_mode():
+            try:
+                model.encode(["warmup sentence"], return_dense=True, batch_size=BATCH_SIZE)
+            except TypeError:
+                _ = _encode_chunked(model, ["warmup sentence"], max(1, min(BATCH_SIZE, 8)))
         _READY = True
         logger.info("Warm-up complete.")
     except Exception as e: