# faiss_rag_enterprise/llama_index/program/multi_modal_llm_program.py

from typing import Any, Dict, Optional, Sequence, Type, cast
from llama_index.bridge.pydantic import BaseModel
from llama_index.multi_modal_llms import MultiModalLLM, OpenAIMultiModal
from llama_index.output_parsers.pydantic import PydanticOutputParser
from llama_index.prompts.base import BasePromptTemplate, PromptTemplate
from llama_index.schema import ImageDocument
from llama_index.types import BasePydanticProgram
from llama_index.utils import print_text


class MultiModalLLMCompletionProgram(BasePydanticProgram[BaseModel]):
    """Multi Modal LLM Completion Program.

    Uses a generic Multi Modal LLM completion + an output parser to generate
    a structured output.
    """
    def __init__(
        self,
        output_parser: PydanticOutputParser,
        prompt: BasePromptTemplate,
        multi_modal_llm: MultiModalLLM,
        image_documents: Sequence[ImageDocument],
        verbose: bool = False,
    ) -> None:
        self._output_parser = output_parser
        self._multi_modal_llm = multi_modal_llm
        self._prompt = prompt
        self._image_documents = image_documents
        self._verbose = verbose

        self._prompt.output_parser = output_parser

    @classmethod
    def from_defaults(
        cls,
        output_parser: PydanticOutputParser,
        prompt_template_str: Optional[str] = None,
        prompt: Optional[PromptTemplate] = None,
        multi_modal_llm: Optional[MultiModalLLM] = None,
        image_documents: Optional[Sequence[ImageDocument]] = None,
        verbose: bool = False,
        **kwargs: Any,
    ) -> "MultiModalLLMCompletionProgram":
        multi_modal_llm = multi_modal_llm or OpenAIMultiModal(
            temperature=0, model="gpt-4-vision-preview"
        )
        if prompt is None and prompt_template_str is None:
            raise ValueError("Must provide either prompt or prompt_template_str.")
        if prompt is not None and prompt_template_str is not None:
            raise ValueError(
                "Must provide only one of prompt or prompt_template_str."
            )
        if prompt_template_str is not None:
            prompt = PromptTemplate(prompt_template_str)
        return cls(
            output_parser,
            prompt=cast(PromptTemplate, prompt),
            multi_modal_llm=multi_modal_llm,
            image_documents=image_documents or [],
            verbose=verbose,
        )

    @property
    def output_cls(self) -> Type[BaseModel]:
        return self._output_parser.output_cls

    @property
    def prompt(self) -> BasePromptTemplate:
        return self._prompt

    @prompt.setter
    def prompt(self, prompt: BasePromptTemplate) -> None:
        self._prompt = prompt

    def __call__(
        self,
        llm_kwargs: Optional[Dict[str, Any]] = None,
        *args: Any,
        **kwargs: Any,
    ) -> BaseModel:
        llm_kwargs = llm_kwargs or {}
        # Fill the prompt template (including the parser's format
        # instructions, attached in __init__) with the caller's kwargs.
        formatted_prompt = self._prompt.format(llm=self._multi_modal_llm, **kwargs)
        response = self._multi_modal_llm.complete(
            formatted_prompt,
            image_documents=self._image_documents,
            **llm_kwargs,
        )
        raw_output = response.text
        if self._verbose:
            print_text(f"> Raw output: {raw_output}\n", color="llama_blue")
        # Parse the raw completion text into the target Pydantic model.
        return self._output_parser.parse(raw_output)

    async def acall(
        self,
        llm_kwargs: Optional[Dict[str, Any]] = None,
        *args: Any,
        **kwargs: Any,
    ) -> BaseModel:
        # Async variant of __call__, using the LLM's acomplete endpoint.
        llm_kwargs = llm_kwargs or {}
        formatted_prompt = self._prompt.format(llm=self._multi_modal_llm, **kwargs)
        response = await self._multi_modal_llm.acomplete(
            formatted_prompt,
            image_documents=self._image_documents,
            **llm_kwargs,
        )
        raw_output = response.text
        if self._verbose:
            print_text(f"> Raw output: {raw_output}\n", color="llama_blue")
        return self._output_parser.parse(raw_output)
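

# A minimal usage sketch, not part of the original module: it shows how the
# program ties the pieces together. It assumes an OpenAI API key is set in the
# environment and that an image exists at the hypothetical path "receipt.png";
# the Receipt model and its fields are illustrative, not from the library.
if __name__ == "__main__":

    class Receipt(BaseModel):
        vendor: str
        total: float

    program = MultiModalLLMCompletionProgram.from_defaults(
        output_parser=PydanticOutputParser(output_cls=Receipt),
        prompt_template_str=(
            "Extract the vendor name and total amount from the attached receipt."
        ),
        image_documents=[ImageDocument(image_path="receipt.png")],
        verbose=True,
    )
    # Calling the program runs the multi-modal completion and parses the raw
    # text into a populated Receipt instance.
    receipt = program()
    print(receipt)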