diff --git a/app/main.py b/app/main.py
index 9b2f0b0..17ada9d 100644
--- a/app/main.py
+++ b/app/main.py
@@ -119,8 +119,7 @@ def load_model(device: str):
     use_fp16 = (precision == "fp16")
 
     # Only this card is exposed; it maps to cuda:0 inside the process
-    os.environ["CUDA_VISIBLE_DEVICES"] = str(idx)
-    mapped = "cuda:0"
+    mapped = "cuda:0" if device.startswith("cuda") else "cpu"
     logger.info("Loading BGEM3 on %s (mapped=%s, %s)", device, mapped, precision)
     mdl = BGEM3FlagModel(MODEL_PATH, use_fp16=use_fp16, device=mapped)
 
@@ -131,48 +130,67 @@ def load_model(device: str):
 # -----------------------------------------------------------------------------#
 def auto_select_and_load() -> tuple:
     """
-    1. Filter out GPUs whose free VRAM < MODEL_VRAM_MB
-    2. Try loading in descending order of free VRAM
-    3. Re-check after loading: if the remaining VRAM < POST_LOAD_GAP_MB → treat as failure
-    4. If no GPU qualifies → CPU
+    Select the card with NVML only, and set CUDA_VISIBLE_DEVICES before the first CUDA call.
+    Selection rules:
+      - Filter out cards whose free VRAM < MODEL_VRAM_MB
+      - Try loading in descending order of free VRAM
+      - After loading, re-check with NVML; if the remaining VRAM < POST_LOAD_GAP_MB, move on to the next card
+      - If no card qualifies, fall back to CPU
     """
 
-    if not torch.cuda.is_available():
-        logger.info("No GPU detected → CPU")
+    # 1) No NVML: cannot filter by free VRAM safely → blindly try card 0 (masked up front), fall back to CPU on failure
+    if not _USE_NVML:
+        if "CUDA_VISIBLE_DEVICES" not in os.environ or os.environ["CUDA_VISIBLE_DEVICES"] == "":
+            os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+        try:
+            mdl, prec = load_model("cuda:0")  # the process only sees the single card 0
+            return mdl, prec, "cuda:0"
+        except Exception as e:
+            logger.warning("No NVML or CUDA unusable (%s) → CPU fallback", e)
+            mdl, prec = load_model("cpu")
+            return mdl, prec, "cpu"
+
+    # 2) NVML available: pick a card by free VRAM (never touching torch.cuda)
+    try:
+        gpu_count = pynvml.nvmlDeviceGetCount()
+    except Exception as e:
+        logger.warning("NVML getCount failed (%s) → CPU", e)
         mdl, prec = load_model("cpu")
         return mdl, prec, "cpu"
 
-    # Collect candidate cards as (free_MB, idx)
     candidates = []
-    for idx in range(torch.cuda.device_count()):
-        free_mb = _gpu_mem_info(idx)[0] // 2**20
-        if free_mb >= MODEL_VRAM_MB:
-            candidates.append((free_mb, idx))
+    for idx in range(gpu_count):
+        try:
+            free_b, total_b = _gpu_mem_info(idx)  # NVML path
+            free_mb = free_b // 2**20
+            if free_mb >= MODEL_VRAM_MB:
+                candidates.append((free_mb, idx))
+        except Exception as e:
+            logger.warning("NVML query gpu %d failed: %s", idx, e)
 
     if not candidates:
         logger.warning("All GPUs free_mem < %d MB → CPU", MODEL_VRAM_MB)
         mdl, prec = load_model("cpu")
         return mdl, prec, "cpu"
 
-    # Highest free VRAM first
+    # 3) Try loading from largest free VRAM to smallest; mask the card before each attempt
     for free_mb, idx in sorted(candidates, reverse=True):
-        dev = f"cuda:{idx}"
         try:
-            logger.info("Trying %s (free=%d MB)", dev, free_mb)
-            mdl, prec = load_model(dev)
-
-            # Post-load headroom check: with NVML use the global idx; without NVML use in-process device 0
-            if _USE_NVML:
-                remain_mb = _gpu_mem_info(idx)[0] // 2**20
-            else:
-                remain_mb = _gpu_mem_info(0)[0] // 2**20
+            os.environ["CUDA_VISIBLE_DEVICES"] = str(idx)  # **key: mask first, then touch torch**
+            dev_label = f"cuda:{idx}"  # the label reported externally uses the global index
+            mdl, prec = load_model("cuda:0")  # inside the process this is device 0
+            # Re-check the remaining VRAM with NVML after loading (still by global idx)
+            remain_mb = _gpu_mem_info(idx)[0] // 2**20
             if remain_mb < POST_LOAD_GAP_MB:
                 raise RuntimeError(f"post-load free {remain_mb} MB < {POST_LOAD_GAP_MB} MB")
-            return mdl, prec, dev
-        except RuntimeError as e:
-            logger.warning("%s unusable (%s) → next", dev, e)
-            torch.cuda.empty_cache()
+            return mdl, prec, dev_label
+        except Exception as e:
+            logger.warning("GPU %d unusable (%s) → next", idx, e)
+            # Do not call torch.cuda.empty_cache() here, to avoid inadvertently initializing other devices
+            continue
+
+    # 4) Nothing qualified → CPU
    logger.warning("No suitable GPU left → CPU fallback")
     mdl, prec = load_model("cpu")
     return mdl, prec, "cpu"
 
@@ -188,22 +206,25 @@
 args, _ = parser.parse_known_args()
 
 if FORCE_DEVICE is not None:
     if FORCE_DEVICE.lower() == "cpu":
+        os.environ["CUDA_VISIBLE_DEVICES"] = ""
         DEVICE = "cpu"
+        model, PRECISION = load_model("cpu")
     else:
-        DEVICE = f"cuda:{int(FORCE_DEVICE)}" if torch.cuda.is_available() else "cpu"
-        model, PRECISION = load_model(DEVICE)
+        os.environ["CUDA_VISIBLE_DEVICES"] = str(int(FORCE_DEVICE))  # mask first
+        DEVICE = f"cuda:{int(FORCE_DEVICE)}"  # shown externally with the global index
+        model, PRECISION = load_model("cuda:0")  # the process uses device 0
 elif args.device is not None:
     if args.device.lower() == "cpu":
+        os.environ["CUDA_VISIBLE_DEVICES"] = ""
         DEVICE = "cpu"
+        model, PRECISION = load_model("cpu")
     else:
-        DEVICE = f"cuda:{int(args.device)}" if torch.cuda.is_available() else "cpu"
-        model, PRECISION = load_model(DEVICE)
+        os.environ["CUDA_VISIBLE_DEVICES"] = str(int(args.device))  # mask first
+        DEVICE = f"cuda:{int(args.device)}"
+        model, PRECISION = load_model("cuda:0")
 else:
     model, PRECISION, DEVICE = auto_select_and_load()
-tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
-
-
 # -----------------------------------------------------------------------------#
 # FastAPI
 # -----------------------------------------------------------------------------#