faiss_rag_enterprise/llama_index/readers/elasticsearch.py

88 lines
2.7 KiB
Python

"""Elasticsearch (or Opensearch) reader over REST api.
This only uses the basic search api, so it will work with Elasticsearch and Opensearch.
"""
from typing import Any, List, Optional
from llama_index.bridge.pydantic import PrivateAttr
from llama_index.readers.base import BasePydanticReader
from llama_index.schema import Document
class ElasticsearchReader(BasePydanticReader):
"""
Read documents from an Elasticsearch/Opensearch index.
These documents can then be used in a downstream Llama Index data structure.
Args:
endpoint (str): URL (http/https) of cluster
index (str): Name of the index (required)
httpx_client_args (dict): Optional additional args to pass to the `httpx.Client`
"""
is_remote: bool = True
endpoint: str
index: str
httpx_client_args: Optional[dict] = None
_client: Any = PrivateAttr()
def __init__(
self, endpoint: str, index: str, httpx_client_args: Optional[dict] = None
):
"""Initialize with parameters."""
import_err_msg = """
`httpx` package not found. Install via `pip install httpx`
"""
try:
import httpx
except ImportError:
raise ImportError(import_err_msg)
self._client = httpx.Client(base_url=endpoint, **(httpx_client_args or {}))
super().__init__(
endpoint=endpoint, index=index, httpx_client_args=httpx_client_args
)
@classmethod
def class_name(cls) -> str:
return "ElasticsearchReader"
def load_data(
self,
field: str,
query: Optional[dict] = None,
embedding_field: Optional[str] = None,
) -> List[Document]:
"""Read data from the Elasticsearch index.
Args:
field (str): Field in the document to retrieve text from
query (Optional[dict]): Elasticsearch JSON query DSL object.
For example:
{"query": {"match": {"message": {"query": "this is a test"}}}}
embedding_field (Optional[str]): If there are embeddings stored in
this index, this field can be used
to set the embedding field on the returned Document list.
Returns:
List[Document]: A list of documents.
"""
res = self._client.post(f"{self.index}/_search", json=query).json()
documents = []
for hit in res["hits"]["hits"]:
doc_id = hit["_id"]
value = hit["_source"][field]
embedding = hit["_source"].get(embedding_field or "", None)
documents.append(
Document(
id_=doc_id, text=value, metadata=hit["_source"], embedding=embedding
)
)
return documents