88 lines
2.7 KiB
Python
88 lines
2.7 KiB
Python
"""Elasticsearch (or Opensearch) reader over REST api.
|
|
|
|
This only uses the basic search api, so it will work with Elasticsearch and Opensearch.
|
|
|
|
"""
|
|
|
|
|
|
from typing import Any, List, Optional
|
|
|
|
from llama_index.bridge.pydantic import PrivateAttr
|
|
from llama_index.readers.base import BasePydanticReader
|
|
from llama_index.schema import Document
|
|
|
|
|
|
class ElasticsearchReader(BasePydanticReader):
    """Read documents from an Elasticsearch/Opensearch index.

    These documents can then be used in a downstream Llama Index data structure.

    Args:
        endpoint (str): URL (http/https) of cluster
        index (str): Name of the index (required)
        httpx_client_args (dict): Optional additional args to pass to the
            `httpx.Client`
    """

    is_remote: bool = True
    endpoint: str
    index: str
    httpx_client_args: Optional[dict] = None

    # HTTP client bound to `endpoint`; built in __init__ and declared as a
    # private attribute so it is not treated as a model field.
    _client: Any = PrivateAttr()

    def __init__(
        self, endpoint: str, index: str, httpx_client_args: Optional[dict] = None
    ):
        """Initialize with parameters.

        Args:
            endpoint (str): URL (http/https) of cluster, used as the client's
                base URL.
            index (str): Name of the index to search.
            httpx_client_args (Optional[dict]): Extra keyword arguments
                forwarded to `httpx.Client`.

        Raises:
            ImportError: If the `httpx` package is not installed.
        """
        import_err_msg = """
            `httpx` package not found. Install via `pip install httpx`
        """
        try:
            import httpx
        except ImportError as err:
            raise ImportError(import_err_msg) from err

        # Initialise the model before touching private attributes: assigning
        # `_client` on a not-yet-initialised model only works on pydantic v1.
        super().__init__(
            endpoint=endpoint, index=index, httpx_client_args=httpx_client_args
        )
        self._client = httpx.Client(base_url=endpoint, **(httpx_client_args or {}))

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this reader class."""
        return "ElasticsearchReader"

    def load_data(
        self,
        field: str,
        query: Optional[dict] = None,
        embedding_field: Optional[str] = None,
    ) -> List[Document]:
        """Read data from the Elasticsearch index.

        Args:
            field (str): Field in the document to retrieve text from
            query (Optional[dict]): Elasticsearch JSON query DSL object.
                For example:
                {"query": {"match": {"message": {"query": "this is a test"}}}}
            embedding_field (Optional[str]): If there are embeddings stored in
                this index, this field can be used
                to set the embedding field on the returned Document list.

        Returns:
            List[Document]: A list of documents, one per search hit. Each
                document's metadata is the hit's full `_source` mapping.

        Raises:
            httpx.HTTPStatusError: If the search request returns a 4xx/5xx
                status.
            KeyError: If a hit's `_source` does not contain `field`.
        """
        response = self._client.post(f"{self.index}/_search", json=query)
        # Fail on the HTTP error itself rather than on a confusing
        # `KeyError: 'hits'` when the error payload is parsed below.
        response.raise_for_status()
        hits = response.json()["hits"]["hits"]

        documents = []
        for hit in hits:
            source = hit["_source"]
            text = source[field]
            # Only look up an embedding when a field name was actually given.
            embedding = source.get(embedding_field) if embedding_field else None
            documents.append(
                Document(
                    id_=hit["_id"], text=text, metadata=source, embedding=embedding
                )
            )
        return documents
|