faiss_rag_enterprise/llama_index/readers/file/epub_reader.py

44 lines
1.2 KiB
Python

"""Epub parser.
Contains a reader that extracts the text of EPUB files into documents.
"""
from pathlib import Path
from typing import Dict, List, Optional
from llama_index.readers.base import BaseReader
from llama_index.schema import Document
class EpubReader(BaseReader):
    """Reader that turns an EPUB file into a single text ``Document``."""

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse an EPUB file into one document.

        Every book item whose type is ``ebooklib.ITEM_DOCUMENT`` is decoded
        as UTF-8, converted from HTML to text with ``html2text``, and the
        resulting pieces are joined with newlines.

        Args:
            file: Path of the EPUB file to read.
            extra_info: Optional metadata attached to the returned document.
                An empty dict is used when it is omitted or falsy.

        Returns:
            A one-element list holding the document with the combined text.

        Raises:
            ImportError: If ``EbookLib`` or ``html2text`` cannot be imported.
        """
        # Imported inside the method so these extra dependencies are only
        # required when the reader is actually used.
        try:
            import ebooklib
            import html2text
            from ebooklib import epub
        except ImportError as err:
            # Chain explicitly so the original import failure stays attached
            # as the cause of the friendlier installation hint.
            raise ImportError(
                "Please install extra dependencies that are required for "
                "the EpubReader: "
                "`pip install EbookLib html2text`"
            ) from err

        # NOTE(review): "ignore_ncx" presumably makes EbookLib skip the NCX
        # table of contents -- confirm against the EbookLib documentation.
        book = epub.read_epub(file, options={"ignore_ncx": True})

        # Chapters are typically located in epub document items, so only
        # those are converted; other item types are skipped.
        text_list = [
            html2text.html2text(item.get_content().decode("utf-8"))
            for item in book.get_items()
            if item.get_type() == ebooklib.ITEM_DOCUMENT
        ]

        text = "\n".join(text_list)
        return [Document(text=text, metadata=extra_info or {})]