UNPKG

@robypag/langchain-splitter

Version:

A small wrapper module that simplifies tokenizing files and buffers using LangChain

245 lines (235 loc) 9.01 kB
// src/lib/pdf.ts
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

// src/lib/utils/errors.ts
/** Raised by tokenizePDF when loading or splitting a PDF fails. */
var PdfLoadingError = class extends Error {
  constructor(message) {
    super(message);
    this.name = "PdfLoadingError";
  }
};
/** Raised by tokenizePlaintextFile when loading or splitting a text file fails. */
var TextLoadingError = class extends Error {
  constructor(message) {
    super(message);
    this.name = "TextLoadingError";
  }
};
/** Raised when loading or splitting a CSV file fails. */
var CsvLoadingError = class extends Error {
  constructor(message) {
    super(message);
    this.name = "CsvLoadingError";
  }
};
/** Raised when loading or splitting a Word document fails. */
var WordLoadingError = class extends Error {
  constructor(message) {
    super(message);
    this.name = "WordLoadingError";
  }
};
/** Raised when loading or splitting a Powerpoint document fails. */
var PowerpointLoadingError = class extends Error {
  constructor(message) {
    super(message);
    this.name = "PowerpointLoadingError";
  }
};
/** Raised when no mime type can be derived from a file path. */
var UnrecognizableFileType = class extends Error {
  constructor(message) {
    super(message);
    this.name = "UnrecognizableFileType";
  }
};
/** Raised when a mime type is recognized but no handler exists for it. */
var UnsupportedFileType = class extends Error {
  constructor(message) {
    super(message);
    this.name = "UnsupportedFileType";
  }
};

// src/lib/pdf.ts
/**
 * Loads a PDF from disk and splits its text into chunks.
 *
 * The page contents returned by the loader are joined with a blank line and
 * split as a single text, so chunks may span page boundaries.
 * NOTE(review): `options.splitByPage` is forwarded to the loader, but because
 * the pages are re-joined before splitting it does not appear to change the
 * result — confirm whether per-page splitting was intended.
 *
 * @param filePath Path of the PDF file to load.
 * @param chunkSize Chunk size handed to the splitter (default 1000).
 * @param chunkOverlap Chunk overlap handed to the splitter (default 200).
 * @param options Optional settings; only `splitByPage` is read here.
 * @returns The documents produced by the splitter.
 * @throws PdfLoadingError When loading or splitting fails.
 */
async function tokenizePDF(filePath, chunkSize = 1e3, chunkOverlap = 200, options) {
  try {
    const pdfLoader = new PDFLoader(filePath, {
      splitPages: options?.splitByPage || false
    });
    const document = await pdfLoader.load();
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize,
      chunkOverlap
    });
    const fullText = document.map((d) => d.pageContent).join("\n\n");
    // Diagnostic output only: a rough chunk-count estimate for the joined text.
    const totalChars = fullText.length;
    const estimatedChunks = Math.ceil((totalChars - chunkOverlap) / (chunkSize - chunkOverlap));
    console.log(`Total chars: ${totalChars}`);
    console.log(`Estimated chunks: ~${estimatedChunks}`);
    return await splitter.createDocuments([fullText]);
  } catch (error) {
    console.error("Error tokenizing PDF:", error);
    if (error instanceof Error) {
      throw new PdfLoadingError(error.message);
    } else throw new PdfLoadingError(`An error occurred while loading the PDF file at ${filePath}`);
  }
}

// src/lib/plaintext.ts
import { RecursiveCharacterTextSplitter as RecursiveCharacterTextSplitter2 } from "langchain/text_splitter";
import { TextLoader } from "langchain/document_loaders/fs/text";
/**
 * Loads a plain-text file from disk and splits its content into chunks.
 *
 * @param filePath Path of the text file to load.
 * @param chunkSize Chunk size handed to the splitter (default 1000).
 * @param chunkOverlap Chunk overlap handed to the splitter (default 200).
 * @param options Accepted for signature parity with the other handlers; not read here.
 * @returns The documents produced by the splitter.
 * @throws TextLoadingError When loading or splitting fails.
 */
async function tokenizePlaintextFile(filePath, chunkSize = 1e3, chunkOverlap = 200, options) {
  try {
    const textLoader = new TextLoader(filePath);
    const document = await textLoader.load();
    const splitter = new RecursiveCharacterTextSplitter2({
      chunkSize,
      chunkOverlap
    });
    const fullContent = document.map((d) => d.pageContent).join("\n");
    return await splitter.createDocuments([fullContent]);
  } catch (error) {
    console.error("Error tokenizing file:", error);
    if (error instanceof Error) {
      throw new TextLoadingError(error.message);
    } else throw new TextLoadingError(`An error occurred while loading the file at ${filePath}`);
  }
}

// src/lib/utils/index.ts
import * as fs from "fs";
import os from "os";
import path from "path";
import { Readable } from "stream";
import { v4 as uuidv4 } from "uuid";
import { pipeline } from "stream/promises";
/**
 * Writes `content` to a uniquely named file inside a freshly created
 * temporary directory and returns the file's path.
 *
 * @param content A string (written as UTF-8), a Buffer / Uint8Array, or a
 *   Readable stream (piped to the file).
 * @param extension File extension appended after a dot to the generated name.
 * @returns Path of the written temporary file.
 * @throws TypeError When `content` is none of the supported kinds. Previously
 *   this case silently returned a path to a file that was never created.
 */
async function writeTempFile(content, extension) {
  const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), "tokenize-"));
  const tempFilePath = path.join(tempDir, `temp-${uuidv4()}.${extension}`);
  try {
    if (typeof content === "string") {
      await fs.promises.writeFile(tempFilePath, content, "utf8");
    } else if (content instanceof Uint8Array) {
      // Buffer is a Uint8Array subclass, so this branch covers both.
      await fs.promises.writeFile(tempFilePath, content);
    } else if (content instanceof Readable) {
      const writeStream = fs.createWriteStream(tempFilePath);
      await pipeline(content, writeStream);
    } else {
      throw new TypeError(
        "Unsupported content: expected a string, Buffer, Uint8Array or Readable stream"
      );
    }
  } catch (error) {
    // The caller never receives the path on failure, so it cannot clean up:
    // remove the directory here (best effort) and surface the original error.
    await fs.promises.rm(tempDir, { recursive: true, force: true }).catch(() => undefined);
    throw error;
  }
  return tempFilePath;
}
/**
 * Best-effort removal of a temporary file and its parent directory.
 *
 * Failures are logged and never thrown. The directory removal is attempted
 * even when deleting the file fails, so a missing file no longer leaves the
 * temporary directory behind. `rmdir` only succeeds on an empty directory.
 *
 * @param filePath Path of the temporary file to delete.
 */
async function removeTempFile(filePath) {
  try {
    await fs.promises.unlink(filePath);
  } catch (cleanupError) {
    console.error("Error cleaning up temporary file:", cleanupError);
  }
  try {
    await fs.promises.rmdir(path.dirname(filePath));
  } catch (cleanupError) {
    console.error("Error cleaning up temporary directory:", cleanupError);
  }
}

// src/lib/word.ts
import { DocxLoader } from "@langchain/community/document_loaders/fs/docx";
import { RecursiveCharacterTextSplitter as RecursiveCharacterTextSplitter3 } from "langchain/text_splitter";
/**
 * Loads a Word document from disk and splits it into chunks.
 *
 * With `options.splitByPage` set, the documents returned by the loader are
 * split individually; otherwise their contents are joined with a blank line
 * and split as one text.
 *
 * @param filePath Path of the Word document to load.
 * @param chunkSize Chunk size handed to the splitter (default 1000).
 * @param chunkOverlap Chunk overlap handed to the splitter (default 200).
 * @param options Optional settings; only `splitByPage` is read here.
 * @returns The documents produced by the splitter.
 * @throws WordLoadingError When loading or splitting fails.
 */
async function tokenizeWordDocument(filePath, chunkSize = 1e3, chunkOverlap = 200, options) {
  try {
    const docxBuffer = new DocxLoader(filePath);
    const document = await docxBuffer.load();
    const splitter = new RecursiveCharacterTextSplitter3({
      chunkSize,
      chunkOverlap
    });
    return options?.splitByPage
      ? await splitter.splitDocuments(document)
      : await splitter.createDocuments([document.map((d) => d.pageContent).join("\n\n")]);
  } catch (error) {
    console.error("Error tokenizing Word document:", error);
    if (error instanceof Error) {
      throw new WordLoadingError(error.message);
    } else throw new WordLoadingError(`An error occurred while loading the Word file at ${filePath}`);
  }
}

// src/index.ts
import { lookup } from "mime-types";

// src/lib/csv.ts
import { RecursiveCharacterTextSplitter as RecursiveCharacterTextSplitter4 } from "langchain/text_splitter";
import { CSVLoader } from "@langchain/community/document_loaders/fs/csv";
/**
 * Loads a CSV file from disk and splits its content into chunks.
 *
 * The contents of the documents returned by the loader are joined with a
 * newline and split as one text.
 *
 * @param filePath Path of the CSV file to load.
 * @param chunkSize Chunk size handed to the splitter (default 1000).
 * @param chunkOverlap Chunk overlap handed to the splitter (default 200).
 * @param options Accepted for signature parity with the other handlers; not read here.
 * @returns The documents produced by the splitter.
 * @throws CsvLoadingError When loading or splitting fails.
 */
async function tokenizeCsvFile(filePath, chunkSize = 1e3, chunkOverlap = 200, options) {
  try {
    const textLoader = new CSVLoader(filePath);
    const document = await textLoader.load();
    const splitter = new RecursiveCharacterTextSplitter4({
      chunkSize,
      chunkOverlap
    });
    const fullContent = document.map((d) => d.pageContent).join("\n");
    return await splitter.createDocuments([fullContent]);
  } catch (error) {
    console.error("Error tokenizing file:", error);
    if (error instanceof Error) {
      throw new CsvLoadingError(error.message);
    } else throw new CsvLoadingError(`An error occurred while loading the file at ${filePath}`);
  }
}

// src/index.ts
import { Readable as Readable2 } from "node:stream";

// src/lib/powerpoint.ts
import { PPTXLoader } from "@langchain/community/document_loaders/fs/pptx";
import { RecursiveCharacterTextSplitter as RecursiveCharacterTextSplitter5 } from "langchain/text_splitter";
/**
 * Loads a Powerpoint document from disk and splits it into chunks.
 *
 * With `options.splitByPage` set, the documents returned by the loader are
 * split individually; otherwise their contents are joined with a blank line
 * and split as one text.
 *
 * @param filePath Path of the Powerpoint document to load.
 * @param chunkSize Chunk size handed to the splitter (default 1000).
 * @param chunkOverlap Chunk overlap handed to the splitter (default 200).
 * @param options Optional settings; only `splitByPage` is read here.
 * @returns The documents produced by the splitter.
 * @throws PowerpointLoadingError When loading or splitting fails.
 */
async function tokenizePowerpointDocument(filePath, chunkSize = 1e3, chunkOverlap = 200, options) {
  try {
    const pptxBuffer = new PPTXLoader(filePath);
    const document = await pptxBuffer.load();
    const splitter = new RecursiveCharacterTextSplitter5({
      chunkSize,
      chunkOverlap
    });
    return options?.splitByPage
      ? await splitter.splitDocuments(document)
      : await splitter.createDocuments([document.map((d) => d.pageContent).join("\n\n")]);
  } catch (error) {
    console.error("Error tokenizing Powerpoint document:", error);
    if (error instanceof Error) {
      throw new PowerpointLoadingError(error.message);
    } else throw new PowerpointLoadingError(`An error occurred while loading the Powerpoint file at ${filePath}`);
  }
}

// src/index.ts
// Mime type -> handler. Any other "text/*" mime type falls back to
// defaultHandler inside tokenizeFile.
// NOTE(review): the legacy "application/msword" and
// "application/vnd.ms-powerpoint" types are routed to the same loaders as
// their OOXML counterparts — confirm those loaders can parse the legacy formats.
var fileHandlers = {
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document": tokenizeWordDocument,
  "application/msword": tokenizeWordDocument,
  "application/vnd.ms-powerpoint": tokenizePowerpointDocument,
  "application/vnd.openxmlformats-officedocument.presentationml.presentation": tokenizePowerpointDocument,
  "application/pdf": tokenizePDF,
  "text/csv": tokenizeCsvFile
};
var defaultHandler = tokenizePlaintextFile;
/**
 * Drains a readable stream into a single Buffer.
 *
 * String chunks (emitted by streams with an encoding set, or by streams built
 * from string iterables) are converted to Buffers first; Buffer.concat
 * rejects strings and would otherwise throw.
 *
 * @param stream Async-iterable stream to consume.
 * @returns The concatenated stream content.
 */
var streamToBuffer = async (stream) => {
  const chunks = [];
  for await (const chunk of stream) {
    chunks.push(typeof chunk === "string" ? Buffer.from(chunk) : chunk);
  }
  return Buffer.concat(chunks);
};
/**
 * Splits the file at `filePath` into chunks, choosing the handler from the
 * mime type derived from the path.
 *
 * Note the parameter order: `chunkOverlap` comes BEFORE `chunkSize` here,
 * whereas the per-format handlers take `chunkSize` first.
 *
 * @param filePath Path of the file to tokenize.
 * @param chunkOverlap Chunk overlap forwarded to the handler (default 200).
 * @param chunkSize Chunk size forwarded to the handler (default 1000).
 * @param options Optional settings forwarded to the handler.
 * @returns One `{ id, metadata, content }` object per chunk; `id` falls back
 *   to `idx-<index>` when the chunk carries no id.
 * @throws UnrecognizableFileType When no mime type can be derived from the path.
 * @throws UnsupportedFileType When the mime type has no handler and is not `text/*`.
 */
async function tokenizeFile(filePath, chunkOverlap = 200, chunkSize = 1e3, options) {
  const mimeType = lookup(filePath);
  if (mimeType === false) {
    throw new UnrecognizableFileType(`The filetype provided at path ${filePath} is unrecognizable`);
  }
  const handler = fileHandlers[mimeType] || (mimeType.startsWith("text/") ? defaultHandler : null);
  if (!handler) {
    throw new UnsupportedFileType(
      `The filetype provided at path ${filePath} is not supported (mime type: ${mimeType})`
    );
  }
  const document = await handler(filePath, chunkSize, chunkOverlap, options);
  return document.map((doc, index) => {
    return {
      id: doc.id ?? `idx-${index}`,
      metadata: doc.metadata,
      content: doc.pageContent
    };
  });
}
/**
 * Tokenizes in-memory content by writing it to a temporary file, running
 * tokenizeFile on that file, and removing the file afterwards.
 *
 * @param content A string, a Buffer, or a Readable stream (streams are fully
 *   buffered before being written).
 * @param extension Extension given to the temporary file; it determines the
 *   mime type and therefore the handler used.
 * @param chunkOverlap Chunk overlap forwarded to tokenizeFile (default 200).
 * @param chunkSize Chunk size forwarded to tokenizeFile (default 1000).
 * @param options Optional settings forwarded to tokenizeFile.
 * @returns The chunks returned by tokenizeFile.
 * @throws Rethrows any error raised while buffering, writing or tokenizing,
 *   after logging it.
 */
async function tokenizeFromBufferOrString(content, extension, chunkOverlap = 200, chunkSize = 1e3, options) {
  let tempFilePath = null;
  try {
    const fileData = content instanceof Readable2 ? await streamToBuffer(content) : content;
    tempFilePath = await writeTempFile(fileData, extension);
    return await tokenizeFile(tempFilePath, chunkOverlap, chunkSize, options);
  } catch (error) {
    console.error("Error during tokenization:", error);
    throw error;
  } finally {
    // Only clean up when the temporary file path was actually obtained.
    if (tempFilePath) {
      await removeTempFile(tempFilePath);
    }
  }
}
export {
  tokenizeFile,
  tokenizeFromBufferOrString
};
//# sourceMappingURL=index.mjs.map