import os
import shutil
import signal
import subprocess
import unittest

from test_nightly_gsm8k_eval import parse_models, popen_launch_server_wrapper

from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
)

class TestNightlyHumanEval(CustomTestCase):
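    """Nightly HumanEval run: launch a server for each configured model and
    score it with evalplus."""
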
    @classmethod
    def setUpClass(cls):
        if is_in_ci():
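            # In CI, run only the single default model.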
            cls.model_groups = [([DEFAULT_MODEL_NAME_FOR_TEST], False, False)]
        else:
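            # Each tuple is (model_names, is_fp8, is_tp2), mirroring the flags
            # passed to popen_launch_server_wrapper when the server is launched.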
            cls.model_groups = [
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
                (
                    parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1),
                    True,
                    False,
                ),
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
            ]
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = None
        cls.eval_process = None

    @classmethod
    def tearDownClass(cls):
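        """Kill the server and any running evalplus process trees."""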
        if cls.process:
            kill_process_tree(cls.process.pid)
        if cls.eval_process:
            kill_process_tree(cls.eval_process.pid)

    def run_evalplus(self, model):
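        """Run evalplus on HumanEval against the locally launched server."""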
print("Delete evalplus results")
|
|
shutil.rmtree("evalplus_results", ignore_errors=True)
|
|
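        # evalplus drives the server through its OpenAI-compatible endpoint;
        # the hardcoded port below is assumed to match DEFAULT_URL_FOR_TEST.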
        cmd = [
            "evalplus.evaluate",
            "--model",
            model,
            "--dataset",
            "humaneval",
            "--backend",
            "openai",
            "--base-url",
            "http://localhost:6157/v1",
            "--greedy",
        ]

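        # preexec_fn=os.setsid starts the eval in its own session, so a hung
        # run can be killed as a whole with os.killpg in the handlers below.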
        try:
            self.eval_process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                preexec_fn=os.setsid,
            )

            stdout, stderr = self.eval_process.communicate(timeout=600)

            if self.eval_process.returncode != 0:
                print(f"Failed to run HumanEval for model={model} err={stderr}")

            print("=" * 42)
            print(stdout)
            print("=" * 42)
        except subprocess.TimeoutExpired:
            if self.eval_process:
                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)
            print(f"Timeout during evaluation for model={model}")
        except Exception as e:
            print(f"Error running evalplus for model={model}: {e}")
            if self.eval_process:
                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)

    def test_human_eval_all_models(self):
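        """Launch a server per model, run HumanEval, then tear the server down."""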
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                # NOTE: only Llama for now
                if "Llama" in model:
                    with self.subTest(model=model):
                        self.process = popen_launch_server_wrapper(
                            self.base_url, model, is_fp8, is_tp2
                        )
                        self.run_evalplus(model)
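                        # Shut down this server before the next model launches.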
                        self.tearDownClass()


if __name__ == "__main__":
|
|
unittest.main()
|