# evalscope/benchmarks/needle_haystack/utils.py

import matplotlib.pyplot as plt
import os
import re
import seaborn as sns
import string
from matplotlib.colors import LinearSegmentedColormap


def normalize_answer(s):
    """Lowercase, strip punctuation, drop English articles, and collapse
    whitespace so answers can be compared in a normalized form."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
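
# Illustrative example (not in the original file):
#   normalize_answer('The quick, brown Fox!')  ->  'quick brown fox'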


def parse_score(score_str: str) -> float:
    """Parse a judge reply and return a float score.

    The reply is expected to contain the score in the format [[score]],
    with the score on a 1-10 scale; it is rescaled to [0.1, 1.0].
    Returns 0.0 if no score marker is found.
    """
    score_match = re.search(r'\[\[(\d+)\]\]', score_str)
    if score_match:
        score = int(score_match.group(1))
        return score / 10.0
    else:
        return 0.0
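
# Illustrative examples (not in the original file):
#   parse_score('Score: [[7]]')  ->  0.7
#   parse_score('no marker here')  ->  0.0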


def draw_score_chat(pivot_table, outpath, show_score=False):
    # Create a custom colormap. Go to https://coolors.co/ and pick cool colors
    cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])
    # Create the heatmap with better aesthetics
    plt.figure(figsize=(17.5, 8))  # Can adjust these dimensions as needed
    sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=show_score, fmt='.1f', cmap=cmap, cbar_kws={'label': 'Score'})
    # More aesthetics
    plt.title('Fact Retrieval Across Context Lengths ("Needle In A HayStack")')  # Adds a title
    plt.xlabel('Token Limit')  # X-axis label
    plt.ylabel('Depth Percent')  # Y-axis label
    plt.xticks(rotation=45)  # Rotates the x-axis labels to prevent overlap
    plt.yticks(rotation=0)  # Ensures the y-axis labels are horizontal
    plt.tight_layout()  # Fits everything neatly into the figure area
    # Save the figure
    plt.savefig(outpath, dpi=300, bbox_inches='tight')
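

# Illustrative usage (not in the original file): `pivot_table` is expected to be
# a pandas DataFrame with depth percents as the index (y-axis) and context/token
# lengths as the columns (x-axis), e.g.
#   demo = pd.DataFrame([[1.0, 0.8], [0.6, 0.3]], index=[0, 100], columns=[1000, 32000])
#   draw_score_chat(demo, 'needle_haystack_demo.png', show_score=True)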

GENERAL_ORM_PROMPT = """You are an expert in verifying if the model answer is correct based on the reference answer.
Your input is a question, a reference answer, and a model answer. You need to check if the model answer is correct based on the reference answer.
You should focus on the correctness of the model answer compared to the reference answer, without attempting to solve the original question.
You must provide your final score in the form of a number from 1 to 10, where:
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference.
Only respond with a numerical score formatted as [[score]]."""  # noqa: E501
ORM_USER_TEMPLATE = """
Question: {question}
Reference Answer: {gold}
Model Answer: {pred}
"""