commit 1bf58c86e1 (parent 0500e81f1c)

app/main.py: 83 changed lines
@@ -119,8 +119,7 @@ def load_model(device: str):
     use_fp16 = (precision == "fp16")

     # Expose only this card; inside the process it maps to cuda:0
-    os.environ["CUDA_VISIBLE_DEVICES"] = str(idx)
-    mapped = "cuda:0"
+    mapped = "cuda:0" if device.startswith("cuda") else "cpu"

     logger.info("Loading BGEM3 on %s (mapped=%s, %s)", device, mapped, precision)
     mdl = BGEM3FlagModel(MODEL_PATH, use_fp16=use_fp16, device=mapped)
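The new mapping relies on an ordering constraint: the CUDA runtime snapshots CUDA_VISIBLE_DEVICES once, when it is first initialized, so the mask must be in place before the process touches torch.cuda. A minimal sketch of the effect (device indices illustrative, not from this repo):

    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"   # expose only global GPU 1, before any CUDA call

    import torch
    if torch.cuda.is_available():
        print(torch.cuda.device_count())       # -> 1: the masked card now appears as cuda:0
        t = torch.zeros(8, device="cuda:0")    # actually allocates on global GPU 1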
@@ -131,48 +130,67 @@

 # -----------------------------------------------------------------------------#
 def auto_select_and_load() -> tuple:
     """
-    1. Skip GPUs whose free VRAM < MODEL_VRAM_MB
-    2. Try loading in descending order of free VRAM
-    3. Re-check after loading: if remaining < POST_LOAD_GAP_MB → count as failure
-    4. If no GPU qualifies → CPU
+    Pick the card via NVML only, and set CUDA_VISIBLE_DEVICES before the first CUDA call.
+    Selection rules:
+    - Skip cards whose free VRAM < MODEL_VRAM_MB
+    - Try loading in descending order of free VRAM
+    - Re-check free VRAM via NVML after loading; if < POST_LOAD_GAP_MB, move to the next card
+    - If none qualifies → CPU
     """
-    if not torch.cuda.is_available():
-        logger.info("No GPU detected → CPU")
+    # 1) No NVML: cannot filter by free VRAM safely → blind-pick card 0 (MASK up front); on failure, CPU
+    if not _USE_NVML:
+        if "CUDA_VISIBLE_DEVICES" not in os.environ or os.environ["CUDA_VISIBLE_DEVICES"] == "":
+            os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+        try:
+            mdl, prec = load_model("cuda:0")  # the process sees exactly one card, 0
+            return mdl, prec, "cuda:0"
+        except Exception as e:
+            logger.warning("No NVML or CUDA unusable (%s) → CPU fallback", e)
+            mdl, prec = load_model("cpu")
+            return mdl, prec, "cpu"
+
+    # 2) NVML available: pick a card by free VRAM (without touching torch.cuda at any point)
+    try:
+        gpu_count = pynvml.nvmlDeviceGetCount()
+    except Exception as e:
+        logger.warning("NVML getCount failed (%s) → CPU", e)
         mdl, prec = load_model("cpu")
         return mdl, prec, "cpu"

-    # Collect candidate cards (free_MB, idx)
     candidates = []
-    for idx in range(torch.cuda.device_count()):
-        free_mb = _gpu_mem_info(idx)[0] // 2**20
-        if free_mb >= MODEL_VRAM_MB:
-            candidates.append((free_mb, idx))
+    for idx in range(gpu_count):
+        try:
+            free_b, total_b = _gpu_mem_info(idx)  # NVML path
+            free_mb = free_b // 2**20
+            if free_mb >= MODEL_VRAM_MB:
+                candidates.append((free_mb, idx))
+        except Exception as e:
+            logger.warning("NVML query gpu %d failed: %s", idx, e)

     if not candidates:
         logger.warning("All GPUs free_mem < %d MB → CPU", MODEL_VRAM_MB)
         mdl, prec = load_model("cpu")
         return mdl, prec, "cpu"

-    # Free VRAM from high to low
+    # 3) Try from largest to smallest; MASK the card before each attempt
     for free_mb, idx in sorted(candidates, reverse=True):
-        dev = f"cuda:{idx}"
         try:
-            logger.info("Trying %s (free=%d MB)", dev, free_mb)
-            mdl, prec = load_model(dev)
+            os.environ["CUDA_VISIBLE_DEVICES"] = str(idx)  # **key: MASK first, then touch torch**
+            dev_label = f"cuda:{idx}"  # the externally reported label keeps the global index
+            mdl, prec = load_model("cuda:0")  # inside the process this really is card 0

-            # Post-load headroom check: global idx with NVML; in-process card 0 without NVML
-            if _USE_NVML:
-                remain_mb = _gpu_mem_info(idx)[0] // 2**20
-            else:
-                remain_mb = _gpu_mem_info(0)[0] // 2**20
+            # Re-check remaining VRAM via NVML after loading (still by global idx)
+            remain_mb = _gpu_mem_info(idx)[0] // 2**20

             if remain_mb < POST_LOAD_GAP_MB:
                 raise RuntimeError(f"post-load free {remain_mb} MB < {POST_LOAD_GAP_MB} MB")
-            return mdl, prec, dev
-        except RuntimeError as e:
-            logger.warning("%s unusable (%s) → next", dev, e)
-            torch.cuda.empty_cache()

+            return mdl, prec, dev_label
+        except Exception as e:
+            logger.warning("GPU %d unusable (%s) → next", idx, e)
+            # do not call torch.cuda.empty_cache() here; it could inadvertently initialize other devices
+            continue
+
+    # 4) Nothing qualifies → CPU
     logger.warning("No suitable GPU left → CPU fallback")
     mdl, prec = load_model("cpu")
     return mdl, prec, "cpu"
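_USE_NVML and _gpu_mem_info are referenced above but defined outside these hunks. One plausible shape consistent with their call sites, sketched under that assumption (standard pynvml calls; free/total are byte counts, hence the // 2**20 conversions above):

    import pynvml

    try:
        pynvml.nvmlInit()                      # NVML works without initializing CUDA
        _USE_NVML = True
    except Exception:
        _USE_NVML = False

    def _gpu_mem_info(idx: int) -> tuple:
        """Return (free_bytes, total_bytes) for global GPU `idx` via NVML."""
        handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return info.free, info.total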
@@ -188,22 +206,25 @@ args, _ = parser.parse_known_args()

 if FORCE_DEVICE is not None:
     if FORCE_DEVICE.lower() == "cpu":
+        os.environ["CUDA_VISIBLE_DEVICES"] = ""
         DEVICE = "cpu"
+        model, PRECISION = load_model("cpu")
     else:
-        DEVICE = f"cuda:{int(FORCE_DEVICE)}" if torch.cuda.is_available() else "cpu"
-    model, PRECISION = load_model(DEVICE)
+        os.environ["CUDA_VISIBLE_DEVICES"] = str(int(FORCE_DEVICE))  # mask first
+        DEVICE = f"cuda:{int(FORCE_DEVICE)}"  # shown externally with the global index
+        model, PRECISION = load_model("cuda:0")  # in-process card 0
 elif args.device is not None:
     if args.device.lower() == "cpu":
+        os.environ["CUDA_VISIBLE_DEVICES"] = ""
         DEVICE = "cpu"
+        model, PRECISION = load_model("cpu")
     else:
-        DEVICE = f"cuda:{int(args.device)}" if torch.cuda.is_available() else "cpu"
-    model, PRECISION = load_model(DEVICE)
+        os.environ["CUDA_VISIBLE_DEVICES"] = str(int(args.device))  # mask first
+        DEVICE = f"cuda:{int(args.device)}"
+        model, PRECISION = load_model("cuda:0")
 else:
     model, PRECISION, DEVICE = auto_select_and_load()

-tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

 # -----------------------------------------------------------------------------#
 # FastAPI
 # -----------------------------------------------------------------------------#
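The candidate policy in auto_select_and_load can be checked without a GPU; a CPU-only sketch with hypothetical free-memory readings (threshold name from the code, values assumed):

    MODEL_VRAM_MB = 6000                        # assumed value for illustration
    readings = {0: 3500, 1: 11200, 2: 8100}     # fake NVML free MB per global idx

    candidates = [(free, idx) for idx, free in readings.items() if free >= MODEL_VRAM_MB]
    for free_mb, idx in sorted(candidates, reverse=True):
        print(f"would try cuda:{idx} (free={free_mb} MB)")
    # -> cuda:1 first (11200 MB), then cuda:2 (8100 MB); cuda:0 is filtered out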