import { FileItemChunk } from "@/types"
import { encode } from "gpt-tokenizer"
import { PDFLoader } from "langchain/document_loaders/fs/pdf"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { CHUNK_OVERLAP, CHUNK_SIZE } from "."

/**
 * Extracts the text of a PDF and splits it into token-counted chunks.
 *
 * The PDF is loaded with `PDFLoader`, the `pageContent` of every loaded
 * document is joined into a single string, and that string is split with a
 * `RecursiveCharacterTextSplitter` configured from `CHUNK_SIZE` and
 * `CHUNK_OVERLAP`.
 *
 * No errors are caught here: a failure while loading or splitting surfaces as
 * a rejection of the returned promise.
 *
 * @param pdf - The PDF file contents.
 * @returns One entry per split chunk, in splitter order, holding the chunk
 *   text and the length of its `gpt-tokenizer` encoding.
 */
export const processPdf = async (pdf: Blob): Promise<FileItemChunk[]> => {
  const loader = new PDFLoader(pdf)
  const docs = await loader.load()

  // Join with a single space so the splitter works on one continuous text
  // instead of treating each loaded document separately.
  const completeText = docs.map(doc => doc.pageContent).join(" ")

  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: CHUNK_SIZE,
    chunkOverlap: CHUNK_OVERLAP
  })
  const splitDocs = await splitter.createDocuments([completeText])

  return splitDocs.map(doc => ({
    content: doc.pageContent,
    tokens: encode(doc.pageContent).length
  }))
}