# evalscope/benchmarks/needle_haystack/utils.py

import matplotlib.pyplot as plt
import os
import re
import seaborn as sns
import string
from matplotlib.colors import LinearSegmentedColormap


def normalize_answer(s):
    """Lowercase, strip punctuation, drop English articles, and collapse
    whitespace so answers can be compared in a normalized form."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
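
# Illustrative example (not in the original file):
#   normalize_answer('The quick, brown Fox!')  ->  'quick brown fox'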


def parse_score(score_str: str) -> float:
    """Parse a judge reply and return a float score.

    The reply is expected to contain the score in the format [[score]],
    with the score on a 1-10 scale; it is rescaled to [0.1, 1.0].
    Returns 0.0 if no score marker is found.
    """
    score_match = re.search(r'\[\[(\d+)\]\]', score_str)
    if score_match:
        score = int(score_match.group(1))
        return score / 10.0
    else:
        return 0.0
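
# Illustrative examples (not in the original file):
#   parse_score('Score: [[7]]')  ->  0.7
#   parse_score('no marker here')  ->  0.0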


def draw_score_chat(pivot_table, outpath, show_score=False):
    # Create a custom colormap. Go to https://coolors.co/ and pick cool colors
    cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])
    # Create the heatmap with better aesthetics
    plt.figure(figsize=(17.5, 8))  # Can adjust these dimensions as needed
    sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=show_score, fmt='.1f', cmap=cmap, cbar_kws={'label': 'Score'})
    # More aesthetics
    plt.title('Fact Retrieval Across Context Lengths ("Needle In A HayStack")')  # Adds a title
    plt.xlabel('Token Limit')  # X-axis label
    plt.ylabel('Depth Percent')  # Y-axis label
    plt.xticks(rotation=45)  # Rotates the x-axis labels to prevent overlap
    plt.yticks(rotation=0)  # Ensures the y-axis labels are horizontal
    plt.tight_layout()  # Fits everything neatly into the figure area
    # Save the figure
    plt.savefig(outpath, dpi=300, bbox_inches='tight')
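

# Illustrative usage (not in the original file): `pivot_table` is expected to be
# a pandas DataFrame with depth percents as the index (y-axis) and context/token
# lengths as the columns (x-axis), e.g.
#   demo = pd.DataFrame([[1.0, 0.8], [0.6, 0.3]], index=[0, 100], columns=[1000, 32000])
#   draw_score_chat(demo, 'needle_haystack_demo.png', show_score=True)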

GENERAL_ORM_PROMPT = """You are an expert in verifying if the model answer is correct based on the reference answer.
Your input is a question, a reference answer, and a model answer. You need to check if the model answer is correct based on the reference answer.
You should focus on the correctness of the model answer compared to the reference answer, without attempting to solve the original question.
You must provide your final score in the form of a number from 1 to 10, where:
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference.
Only respond with a numerical score formatted as [[score]]."""  # noqa: E501
ORM_USER_TEMPLATE = """
Question: {question}
Reference Answer: {gold}
Model Answer: {pred}
"""