"""JSON Reader."""

import json
import re
from typing import Any, Generator, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.schema import Document


def _depth_first_yield(
    json_data: Any,
    levels_back: int,
    collapse_length: Optional[int],
    path: List[str],
    ensure_ascii: bool = False,
) -> Generator[str, None, None]:
    """Do a depth-first yield of all of the leaf nodes of a JSON tree.

    Combines keys in the JSON tree using spaces.

    If levels_back is set to 0, the full path of keys is kept.
    If collapse_length is not None and the serialized json_data is at most
    that many characters, it is collapsed into a single line.

    """
    if isinstance(json_data, (dict, list)):
        # only try to collapse if we're not at a leaf node
        json_str = json.dumps(json_data, ensure_ascii=ensure_ascii)
        if collapse_length is not None and len(json_str) <= collapse_length:
            new_path = path[-levels_back:]
            new_path.append(json_str)
            yield " ".join(new_path)
            return
    if isinstance(json_data, dict):
        for key, value in json_data.items():
            new_path = path[:]
            new_path.append(key)
            yield from _depth_first_yield(
                value, levels_back, collapse_length, new_path, ensure_ascii
            )
    elif isinstance(json_data, list):
        for _, value in enumerate(json_data):
            yield from _depth_first_yield(
                value, levels_back, collapse_length, path, ensure_ascii
            )
    else:
        new_path = path[-levels_back:]
        new_path.append(str(json_data))
        yield " ".join(new_path)


class JSONReader(BaseReader):
    """JSON reader.

    Reads JSON documents with options to help suss out relationships between nodes.

    Args:
        levels_back (int): the number of levels to go back in the JSON tree, 0
            if you want all levels. If levels_back is None, the JSON is simply
            formatted and each line becomes an embedding.

        collapse_length (int): the maximum number of characters a JSON fragment
            can contain and still be collapsed into a single line in the output
            (levels_back must not be None for this to take effect).
            ex: if collapse_length = 10, and
            input is {a: [1, 2, 3], b: {"hello": "world", "foo": "bar"}},
            then a would be collapsed into one line, while b would not.
            Recommend starting around 100 and then adjusting from there.

        ensure_ascii (bool): if True, non-ASCII characters are escaped when
            serializing (passed through to json.dumps). Defaults to False.

        is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL
            format. Defaults to False.

    """

    def __init__(
        self,
        levels_back: Optional[int] = None,
        collapse_length: Optional[int] = None,
        ensure_ascii: bool = False,
        is_jsonl: Optional[bool] = False,
    ) -> None:
        """Initialize with arguments."""
        super().__init__()
        self.levels_back = levels_back
        self.collapse_length = collapse_length
        self.ensure_ascii = ensure_ascii
        self.is_jsonl = is_jsonl

    def load_data(self, input_file: str) -> List[Document]:
        """Load data from the input file."""
        with open(input_file, encoding="utf-8") as f:
            load_data = []
            if self.is_jsonl:
                for line in f:
                    load_data.append(json.loads(line.strip()))
            else:
                load_data = [json.load(f)]

            documents = []
            for data in load_data:
                if self.levels_back is None:
                    # If levels_back isn't set, we just format and make each
                    # line an embedding
                    json_output = json.dumps(
                        data, indent=0, ensure_ascii=self.ensure_ascii
                    )
                    lines = json_output.split("\n")
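                    # keep only lines that carry keys or values, dropping lines
                    # that consist solely of JSON punctuation ({, }, [, ], commas)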
                    useful_lines = [
                        line for line in lines if not re.match(r"^[{}\[\],]*$", line)
                    ]
                    documents.append(Document(text="\n".join(useful_lines)))
                else:
                    # If levels_back is set, we make the embeddings contain
                    # the labels from further up the JSON tree
                    lines = [
                        *_depth_first_yield(
                            data,
                            self.levels_back,
                            self.collapse_length,
                            [],
                            self.ensure_ascii,
                        )
                    ]
                    documents.append(Document(text="\n".join(lines)))
            return documents
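

# Usage sketch (illustrative only, not part of the library): writes a small
# sample document to a temporary file and shows the two reader modes, plain
# formatting (levels_back=None) versus key-path lines with collapsing.
if __name__ == "__main__":
    import tempfile

    sample = {"name": "llama", "tags": ["index", "json"], "meta": {"pages": 3}}
    with tempfile.NamedTemporaryFile(
        "w", suffix=".json", delete=False, encoding="utf-8"
    ) as tmp:
        json.dump(sample, tmp)
        sample_path = tmp.name

    # Mode 1: levels_back=None keeps each non-punctuation line of the
    # formatted JSON as-is.
    print(JSONReader().load_data(sample_path)[0].text)

    # Mode 2: levels_back=0 keeps the full key path for every leaf, and
    # collapse_length=30 collapses small subtrees into one JSON line.
    print(JSONReader(levels_back=0, collapse_length=30).load_data(sample_path)[0].text)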