evalscope_v0.17.0/evalscope.0.17.0/evalscope/models/adapters/bfcl_adapter.py

import json
import time
import uuid
from typing import Any, List, Optional, Union

from evalscope.utils.logger import get_logger

from .server_adapter import ServerModelAdapter

logger = get_logger()


class BFCLAdapter(ServerModelAdapter):
    """
    BFCL model adapter that requests a remote API model and generates results for BFCL evaluation.
    Supports multi-turn and single-turn function calling tasks.
    """

    def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
        """
        Args:
            api_url: The URL of the remote API model.
            model_id: The ID of the remote API model.
            api_key: The API key of the remote API model.
        """
        super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)

    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
        """
        Model prediction function. For multi-turn evals, each input item carries a list of turns,
        where each turn is itself a list of messages appended to the running conversation.

        Args:
            inputs (List[dict]): The input data.
            infer_cfg (dict): Inference configuration.

        Returns:
            res (List[dict]): The model prediction results.
        """
        infer_cfg = infer_cfg or {}
        results = []

        for input_item in inputs:
            # This flag decides whether we pass tools to the API or attempt tool calling via prompting.
            # Passing tools to the API (is_fc_model=True) means we rely on the API to manage system
            # prompt specifics and expect parsed tool calls in the ChatCompletionMessage object.
            # Otherwise (is_fc_model=False), we manage tool calling via prompting and parse tool calls
            # from the standard text response. This matches how the two benchmark variants are designed.
            row = input_item.get('messages')
            is_fc_model = row.get('is_fc_model', False)
            if is_fc_model:
                response = self.generate_turn_with_tools(row, infer_cfg)
            else:
                response = self.generate_turn(row, infer_cfg)

            # wrap response with openai types
            res_d = {
                'choices': [{
                    'index': 0,
                    'message': {
                        'content': response,
                        'role': 'assistant'
                    }
                }],
                'created': time.time(),
                'model': self.model_id,
                'object': 'chat.completion',
                'usage': {
                    'completion_tokens': 0,
                    'prompt_tokens': 0,
                    'total_tokens': 0
                }
            }
            results.append(res_d)
        return results

    def generate_turn(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
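        """
        Prompting-based (is_fc_model=False) multi-turn loop: for each turn, query the model,
        decode tool calls from the plain-text response, execute them against the BFCL
        environment, and feed the outputs back until the model emits no more tool calls or the
        step limit is reached. If tool execution is disabled, a single response is collected per
        turn. Returns one list of raw model responses per turn.
        """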
        from bfcl_eval.constants.default_prompts import (
            DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING,
            MAXIMUM_STEP_LIMIT,
        )
        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
        from bfcl_eval.model_handler.utils import default_decode_execute_prompting

        all_model_responses = []
        current_messages = []
        turns = row['turns']
        for turn_idx, messages in enumerate(turns):
            n_steps = 0
            current_responses = []
            current_messages += messages.copy()
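
            # Holdout turns reveal previously missing functions: no user message is present,
            # so prompt the model with the newly available function definitions instead.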
            if str(turn_idx) in row['missing_functions']:
                assert len(messages) == 0, 'Holdout turn should not have user message.'
                new_turn = [{
                    'role': 'user',
                    'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_PROMPTING.format(
                        functions=row['missing_functions'][str(turn_idx)]),
                }]
                current_messages += new_turn
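
            # Inference loop: query the model, decode any tool calls from the text response,
            # execute them, append the outputs, and repeat until no tool calls remain or the
            # step limit is exceeded.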
            while True:
                input_item = {
                    'messages': current_messages,
                }
                responses = self.process_single_input(input_item, infer_cfg)
                result = responses['choices'][0]['message']['content']
                logger.debug(f'Turn:{turn_idx} Step:{n_steps} Result: {result}')

                current_messages.append({
                    'role': 'assistant',
                    'content': result,
                })
                current_responses.append(result)

                execute_tools = row.get('should_execute_tool_calls', False)
                if execute_tools:
                    try:
                        tool_calls = default_decode_execute_prompting(result)
                    except Exception:
                        tool_calls = None

                    if tool_calls is None:
                        break
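
                    # Execute the decoded calls against the stateful BFCL multi-turn environment
                    # for this test entry.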
                    tool_outputs, _ = execute_multi_turn_func_call(
                        tool_calls,
                        initial_config=row['initial_config'],
                        involved_classes=row['involved_classes'],
                        model_name='evaluator_loop',
                        test_entry_id=row['id'],
                        long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
                        is_evaL_run=False,
                    )

                    # Append tool outputs to the current messages
                    tool_results = []
                    for tool_output, tool_call in zip(tool_outputs, tool_calls):
                        tool_results.append({'role': 'tool', 'name': tool_call, 'content': tool_output})
                    current_messages.append({
                        'role': 'user',
                        'content': repr(tool_results),
                    })
                else:
                    break

                n_steps += 1
                if n_steps > MAXIMUM_STEP_LIMIT:
                    logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
                    break

            all_model_responses.append(current_responses)

        return all_model_responses

    def generate_turn_with_tools(self, row: dict[str, Any], infer_cfg: dict[str, Any]) -> list[str]:
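        """
        Native function-calling (is_fc_model=True) multi-turn loop: tools are passed to the API,
        tool calls are read from the parsed assistant message, executed against the BFCL
        environment, and their outputs are appended as 'tool' messages until the model stops
        calling tools or the step limit is reached. If tool execution is disabled, a single
        response is collected per turn. Returns one list of model responses per turn.
        """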
        from bfcl_eval.constants.default_prompts import (
            DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
            MAXIMUM_STEP_LIMIT,
        )
        from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import execute_multi_turn_func_call
        from bfcl_eval.model_handler.utils import convert_to_function_call

        all_model_responses = []
        current_messages = []
        turns = row['turns']
        for turn_idx, messages in enumerate(turns):
            n_steps = 0
            current_responses = []
            current_messages += messages.copy()
            tools = row['tools']
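
            # Holdout turns reveal previously missing functions: append them to the tool list
            # and prompt the model to continue with the newly available functions.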
            if str(turn_idx) in row['missing_functions']:
                assert len(messages) == 0, 'Holdout turn should not have user message.'
                # inject new functions on the fly
                new_tools = row['missing_functions'][str(turn_idx)]
                for new_tool in new_tools:
                    tools.append({
                        'type': 'function',
                        'function': new_tool[0],
                    })
                new_turn = [{
                    'role': 'user',
                    'content': DEFAULT_USER_PROMPT_FOR_ADDITIONAL_FUNCTION_FC,
                }]
                current_messages += new_turn
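
            # Inference loop: call the API with the current tool list, collect tool calls from the
            # parsed message, execute them, append 'tool' results, and repeat until the model stops
            # calling tools or the step limit is exceeded.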
            while True:
                input_item = {
                    'messages': current_messages,
                    'tools': tools,
                }
                responses = self.process_single_input(input_item, infer_cfg)
                message = responses['choices'][0]['message']
                current_messages.append(message)
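
                # The message may be a bare string, a parsed message with tool_calls, or a plain
                # content message; normalize each case into model_responses / tool_call_strs.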
                if isinstance(message, str):
                    model_responses = [message]
                    tool_call_strs = None
                elif message.get('tool_calls'):
                    model_responses = [{
                        tc['function']['name']: tc['function']['arguments']
                    } for tc in message['tool_calls']]
                    try:
                        tool_call_strs = convert_to_function_call(model_responses)
                    except Exception as e:
                        logger.error(f'Error converting tool calls to function call strings: {e}')
                        tool_call_strs = None
                else:
                    model_responses = [message['content']]
                    tool_call_strs = None

                current_responses.extend(model_responses)

                execute_tools = row.get('should_execute_tool_calls', False)
                if execute_tools and tool_call_strs is not None:
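                    # Execute the converted calls against the stateful BFCL multi-turn environment
                    # and return each tool's output to the model as a 'tool' message.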
                    tool_outputs, _ = execute_multi_turn_func_call(
                        tool_call_strs,
                        initial_config=row['initial_config'],
                        involved_classes=row['involved_classes'],
                        model_name='evaluator_loop',
                        test_entry_id=row['id'],
                        long_context=('long_context' in row['test_category'] or 'composite' in row['test_category']),
                        is_evaL_run=False,
                    )

                    for tc, tool_output in zip(message['tool_calls'], tool_outputs, strict=False):
                        current_messages.append({
                            'role': 'tool',
                            'tool_call_id': tc['id'],
                            'content': json.dumps({'response': tool_output}),
                        })
                else:
                    break

                n_steps += 1
                if n_steps > MAXIMUM_STEP_LIMIT:
                    logger.error(f'INFERENCE_ERROR: Exceeded max inference steps ({MAXIMUM_STEP_LIMIT})')
                    break

            all_model_responses.append(current_responses)

        return all_model_responses