31 lines
898 B
TypeScript
31 lines
898 B
TypeScript
import { FileItemChunk } from "@/types"
|
|
import { encode } from "gpt-tokenizer"
|
|
import { PDFLoader } from "langchain/document_loaders/fs/pdf"
|
|
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
|
|
import { CHUNK_OVERLAP, CHUNK_SIZE } from "."
|
|
|
|
/**
 * Extracts the text of a PDF and splits it into token-counted chunks.
 *
 * The blob is loaded through `PDFLoader`; the `pageContent` of every document
 * the loader returns is joined with a single space, and the combined text is
 * split by a `RecursiveCharacterTextSplitter` configured with `CHUNK_SIZE` and
 * `CHUNK_OVERLAP`.
 *
 * @param pdf - The PDF contents to process.
 * @returns One entry per split, in splitter order, carrying the split's text
 *   and the number of tokens `encode` produces for that text.
 */
export const processPdf = async (pdf: Blob): Promise<FileItemChunk[]> => {
  const loadedDocs = await new PDFLoader(pdf).load()

  // Merge everything into one string so the splitter, not the loader's
  // document boundaries, decides where chunks begin and end.
  const fullText = loadedDocs.map(loaded => loaded.pageContent).join(" ")

  const textSplitter = new RecursiveCharacterTextSplitter({
    chunkSize: CHUNK_SIZE,
    chunkOverlap: CHUNK_OVERLAP
  })
  const pieces = await textSplitter.createDocuments([fullText])

  return pieces.map(
    (piece): FileItemChunk => ({
      content: piece.pageContent,
      tokens: encode(piece.pageContent).length
    })
  )
}
|