131 lines
4.7 KiB
Python
131 lines
4.7 KiB
Python
"""Cross Encoder Finetuning Engine."""
|
|
from typing import Any, List, Optional, Union
|
|
|
|
from llama_index.finetuning.cross_encoders.dataset_gen import (
|
|
CrossEncoderFinetuningDatasetSample,
|
|
)
|
|
from llama_index.finetuning.types import BaseCrossEncoderFinetuningEngine
|
|
from llama_index.postprocessor import SentenceTransformerRerank
|
|
|
|
|
|
class CrossEncoderFinetuneEngine(BaseCrossEncoderFinetuningEngine):
    """Cross-Encoders Finetune Engine.

    Wraps a ``sentence_transformers`` ``CrossEncoder``: converts the given
    dataset samples into ``InputExample`` objects, batches them with a
    ``DataLoader`` and, on :meth:`finetune`, calls ``CrossEncoder.fit``.
    """

    def __init__(
        self,
        dataset: List[CrossEncoderFinetuningDatasetSample],
        model_id: str = "cross-encoder/ms-marco-MiniLM-L-12-v2",
        model_output_path: str = "exp_finetune",
        batch_size: int = 10,
        val_dataset: Union[List[CrossEncoderFinetuningDatasetSample], None] = None,
        loss: Union[Any, None] = None,
        epochs: int = 2,
        show_progress_bar: bool = True,
        evaluation_steps: int = 50,
    ) -> None:
        """
        Init params.

        :param dataset: Training samples; each sample's ``query``, ``context``
            and ``score`` become one ``InputExample``
        :param model_id: Model identifier handed to ``CrossEncoder``
            (loaded with ``num_labels=1``)
        :param model_output_path: Path the finetuned model is written to
        :param batch_size: Batch size of the training ``DataLoader``
        :param val_dataset: Optional validation samples; when given, a
            ``CEBinaryClassificationEvaluator`` is built from them
        :param loss: Optional loss object, forwarded to ``CrossEncoder.fit``
            as ``loss_fct`` when not ``None``
        :param epochs: Number of training epochs
        :param show_progress_bar: Forwarded to ``CrossEncoder.fit``
        :param evaluation_steps: Forwarded to ``CrossEncoder.fit``
        :raises ImportError: If sentence-transformers (or torch) is missing
        """
        try:
            from sentence_transformers import InputExample
            from sentence_transformers.cross_encoder import CrossEncoder
            from torch.utils.data import DataLoader
        except ImportError as err:
            # Build ONE message string: a comma between the two literals would
            # pass two arguments and make the exception carry a tuple.
            raise ImportError(
                "Cannot import sentence-transformers package, "
                "please `pip install sentence-transformers`"
            ) from err

        self.dataset = dataset

        self.model_id = model_id
        self.model_output_path = model_output_path
        self.model = CrossEncoder(self.model_id, num_labels=1)

        # One InputExample per sample: (query, context) text pair + score label.
        examples: Any = [
            InputExample(texts=[sample.query, sample.context], label=sample.score)
            for sample in dataset
        ]
        self.examples = examples

        self.loader: DataLoader = DataLoader(examples, batch_size=batch_size)

        # define evaluator
        from sentence_transformers.cross_encoder.evaluation import (
            CEBinaryClassificationEvaluator,
        )

        # TODO: also add support for CERerankingEvaluator
        evaluator: Optional[CEBinaryClassificationEvaluator] = None

        if val_dataset is not None:
            dev_samples = [
                InputExample(
                    texts=[val_sample.query, val_sample.context],
                    label=val_sample.score,
                )
                for val_sample in val_dataset
            ]
            evaluator = CEBinaryClassificationEvaluator.from_input_examples(dev_samples)

        self.evaluator = evaluator

        # define loss
        self.loss = loss

        self.epochs = epochs
        self.show_progress_bar = show_progress_bar
        self.evaluation_steps = evaluation_steps
        # Warm up over the first 10% of all training steps (batches * epochs).
        self.warmup_steps = int(len(self.loader) * epochs * 0.1)

    def finetune(self, **train_kwargs: Any) -> None:
        """
        Finetune model.

        :param train_kwargs: Extra keyword arguments forwarded to
            ``CrossEncoder.fit``; they override the values configured on
            this engine when the same key is given
        """
        fit_kwargs = {
            "train_dataloader": self.loader,
            "epochs": self.epochs,
            "warmup_steps": self.warmup_steps,
            "output_path": self.model_output_path,
            "show_progress_bar": self.show_progress_bar,
            "evaluator": self.evaluator,
            "evaluation_steps": self.evaluation_steps,
        }
        # Only pass the loss when one was configured so that the library's
        # own default loss is used otherwise.
        if self.loss is not None:
            fit_kwargs["loss_fct"] = self.loss
        # Caller-supplied options take precedence over the engine defaults.
        fit_kwargs.update(train_kwargs)

        self.model.fit(**fit_kwargs)

        # CrossEncoder library's fit function does not save model when evaluator is None
        # https://github.com/UKPLab/sentence-transformers/issues/2324
        if fit_kwargs["evaluator"] is None:
            self.model.save(fit_kwargs["output_path"])

    def push_to_hub(self, repo_id: Any = None) -> None:
        """
        Saves the model and tokenizer to HuggingFace hub.

        :param repo_id: Huggingface Hub repo to push the model and tokenizer to
        :raises ValueError: If ``repo_id`` is ``None`` or if pushing raises a
            ``ValueError`` (reported as an incomplete Hub login)
        """
        if repo_id is None:
            raise ValueError("No value provided for repo_id")

        try:
            self.model.model.push_to_hub(repo_id=repo_id)
            self.model.tokenizer.push_to_hub(repo_id=repo_id)
        except ValueError as err:
            # Chain the original error so the underlying cause stays visible.
            raise ValueError(
                "HuggingFace CLI/Hub login not "
                "completed provide token to login using "
                "huggingface_hub.login() see this "
                "https://huggingface.co/docs/transformers/model_sharing#share-a-model"
            ) from err

    def get_finetuned_model(
        self, model_name: str, top_n: int = 3
    ) -> SentenceTransformerRerank:
        """
        Loads the model from huggingface hub as re-ranker.

        :param model_name: Name of the model to load (e.g. the Huggingface Hub
            repo it was pushed to); passed to ``SentenceTransformerRerank``
        :param top_n: The value of nodes the re-ranker should filter
        :return: A ``SentenceTransformerRerank`` built from ``model_name``
        """
        return SentenceTransformerRerank(model=model_name, top_n=top_n)
|