faiss_rag_enterprise/llama_index/llama_dataset/base.py

323 lines
10 KiB
Python

"""Llama Dataset Class."""
import json
from abc import abstractmethod
from enum import Enum
from typing import Generator, Generic, List, Optional, Type, TypeVar, Union
import tqdm
from openai import RateLimitError
from pandas import DataFrame as PandasDataFrame
from llama_index.async_utils import asyncio_module
from llama_index.bridge.pydantic import BaseModel, Field, PrivateAttr
from llama_index.core.base_query_engine import BaseQueryEngine
from llama_index.evaluation import BaseEvaluator
PredictorType = Union[BaseQueryEngine, BaseEvaluator]
P = TypeVar("P", bound=PredictorType)
class CreatedByType(str, Enum):
"""The kinds of rag data examples."""
HUMAN = "human"
AI = "ai"
def __str__(self) -> str:
return self.value
class CreatedBy(BaseModel):
model_name: Optional[str] = Field(
default_factory=str, description="When CreatedByType.AI, specify model name."
)
type: CreatedByType
def __str__(self) -> str:
if self.type == "ai":
return f"{self.type!s} ({self.model_name})"
else:
return str(self.type)
class BaseLlamaExamplePrediction(BaseModel):
"""Base llama dataset example class."""
@property
@abstractmethod
def class_name(self) -> str:
"""Class name."""
return "BaseLlamaPrediction"
class BaseLlamaDataExample(BaseModel):
"""Base llama dataset example class."""
@property
@abstractmethod
def class_name(self) -> str:
"""Class name."""
return "BaseLlamaDataExample"
class BaseLlamaPredictionDataset(BaseModel):
_prediction_type: Type[BaseLlamaExamplePrediction] = BaseLlamaExamplePrediction # type: ignore[misc]
predictions: List[BaseLlamaExamplePrediction] = Field(
default=list, description="Predictions on train_examples."
)
def __getitem__(self, val: Union[slice, int]) -> List[BaseLlamaExamplePrediction]:
"""Enable slicing and indexing.
Returns the desired slice on `predictions`.
"""
return self.predictions[val]
@abstractmethod
def to_pandas(self) -> PandasDataFrame:
"""Create pandas dataframe."""
def save_json(self, path: str) -> None:
"""Save json."""
with open(path, "w") as f:
predictions = None
if self.predictions:
predictions = [
self._prediction_type.dict(el) for el in self.predictions
]
data = {
"predictions": predictions,
}
json.dump(data, f, indent=4)
@classmethod
def from_json(cls, path: str) -> "BaseLlamaPredictionDataset":
"""Load json."""
with open(path) as f:
data = json.load(f)
predictions = [cls._prediction_type.parse_obj(el) for el in data["predictions"]]
return cls(
predictions=predictions,
)
@property
@abstractmethod
def class_name(self) -> str:
"""Class name."""
return "BaseLlamaPredictionDataset"
class BaseLlamaDataset(BaseModel, Generic[P]):
_example_type: Type[BaseLlamaDataExample] = BaseLlamaDataExample # type: ignore[misc]
examples: List[BaseLlamaDataExample] = Field(
default=[], description="Data examples of this dataset."
)
_predictions_cache: List[BaseLlamaExamplePrediction] = PrivateAttr(
default_factory=list
)
def __getitem__(self, val: Union[slice, int]) -> List[BaseLlamaDataExample]:
"""Enable slicing and indexing.
Returns the desired slice on `examples`.
"""
return self.examples[val]
@abstractmethod
def to_pandas(self) -> PandasDataFrame:
"""Create pandas dataframe."""
def save_json(self, path: str) -> None:
"""Save json."""
with open(path, "w") as f:
examples = [self._example_type.dict(el) for el in self.examples]
data = {
"examples": examples,
}
json.dump(data, f, indent=4)
@classmethod
def from_json(cls, path: str) -> "BaseLlamaDataset":
"""Load json."""
with open(path) as f:
data = json.load(f)
examples = [cls._example_type.parse_obj(el) for el in data["examples"]]
return cls(
examples=examples,
)
@abstractmethod
def _construct_prediction_dataset(
self, predictions: List[BaseLlamaExamplePrediction]
) -> BaseLlamaPredictionDataset:
"""Construct the specific prediction dataset.
Args:
predictions (List[BaseLlamaExamplePrediction]): the list of predictions.
Returns:
BaseLlamaPredictionDataset: A dataset of predictions.
"""
@abstractmethod
def _predict_example(
self,
predictor: P,
example: BaseLlamaDataExample,
sleep_time_in_seconds: int = 0,
) -> BaseLlamaExamplePrediction:
"""Predict on a single example.
NOTE: Subclasses need to implement this.
Args:
predictor (PredictorType): The predictor to make the prediciton with.
example (BaseLlamaDataExample): The example to predict on.
Returns:
BaseLlamaExamplePrediction: The prediction.
"""
def make_predictions_with(
self,
predictor: P,
show_progress: bool = False,
batch_size: int = 20,
sleep_time_in_seconds: int = 0,
) -> BaseLlamaPredictionDataset:
"""Predict with a given query engine.
Args:
predictor (PredictorType): The predictor to make predictions with.
show_progress (bool, optional): Show progress of making predictions.
batch_size (int): Used to batch async calls, especially to reduce chances
of hitting RateLimitError from openai.
sleep_time_in_seconds (int): Amount of time to sleep between batch call
to reduce chance of hitting RateLimitError from openai.
Returns:
BaseLlamaPredictionDataset: A dataset of predictions.
"""
if self._predictions_cache:
start_example_position = len(self._predictions_cache)
else:
start_example_position = 0
for batch in self._batch_examples(
batch_size=batch_size, start_position=start_example_position
):
if show_progress:
example_iterator = tqdm.tqdm(batch)
else:
example_iterator = batch
for example in example_iterator:
self._predictions_cache.append(
self._predict_example(predictor, example, sleep_time_in_seconds)
)
return self._construct_prediction_dataset(predictions=self._predictions_cache)
# async methods
@abstractmethod
async def _apredict_example(
self,
predictor: P,
example: BaseLlamaDataExample,
sleep_time_in_seconds: int,
) -> BaseLlamaExamplePrediction:
"""Async predict on a single example.
NOTE: Subclasses need to implement this.
Args:
predictor (PredictorType): The predictor to make the prediciton with.
example (BaseLlamaDataExample): The example to predict on.
Returns:
BaseLlamaExamplePrediction: The prediction.
"""
def _batch_examples(
self,
batch_size: int = 20,
start_position: int = 0,
) -> Generator[List[BaseLlamaDataExample], None, None]:
"""Batches examples and predictions with a given batch_size."""
num_examples = len(self.examples)
for ndx in range(start_position, num_examples, batch_size):
yield self.examples[ndx : min(ndx + batch_size, num_examples)]
async def amake_predictions_with(
self,
predictor: P,
show_progress: bool = False,
batch_size: int = 20,
sleep_time_in_seconds: int = 1,
) -> BaseLlamaPredictionDataset:
"""Async predict with a given query engine.
Args:
predictor (PredictorType): The predictor to make predictions with.
show_progress (bool, optional): Show progress of making predictions.
batch_size (int): Used to batch async calls, especially to reduce chances
of hitting RateLimitError from openai.
sleep_time_in_seconds (int): Amount of time to sleep between batch call
to reduce chance of hitting RateLimitError from openai.
Returns:
BaseLlamaPredictionDataset: A dataset of predictions.
"""
if self._predictions_cache:
start_example_position = len(self._predictions_cache)
else:
start_example_position = 0
for batch in self._batch_examples(
batch_size=batch_size, start_position=start_example_position
):
tasks = []
for example in batch:
tasks.append(
self._apredict_example(predictor, example, sleep_time_in_seconds)
)
asyncio_mod = asyncio_module(show_progress=show_progress)
try:
if show_progress:
batch_predictions = await asyncio_mod.gather(
*tasks, desc="Batch processing of predictions"
)
else:
batch_predictions = await asyncio_mod.gather(*tasks)
except RateLimitError as err:
if show_progress:
asyncio_mod.close()
raise ValueError(
"You've hit rate limits on your OpenAI subscription. This"
" class caches previous predictions after each successful"
" batch execution. Based off this cache, when executing this"
" command again it will attempt to predict on only the examples "
"that have not yet been predicted. Try reducing your batch_size."
) from err
self._predictions_cache += batch_predictions
# time.sleep(sleep_time_in_seconds)
prediction_dataset = self._construct_prediction_dataset(
predictions=self._predictions_cache
)
self._predictions_cache = [] # clear cache
return prediction_dataset
@property
@abstractmethod
def class_name(self) -> str:
"""Class name."""
return "BaseLlamaDataset"