UNPKG

@robypag/langchain-splitter

Version:

A small wrapper module that simplifies tokenizing files and buffers with LangChain

283 lines (271 loc) 10.9 kB
"use strict"; var __create = Object.create; var __defProp = Object.defineProperty; var __getOwnPropDesc = Object.getOwnPropertyDescriptor; var __getOwnPropNames = Object.getOwnPropertyNames; var __getProtoOf = Object.getPrototypeOf; var __hasOwnProp = Object.prototype.hasOwnProperty; var __export = (target, all) => { for (var name in all) __defProp(target, name, { get: all[name], enumerable: true }); }; var __copyProps = (to, from, except, desc) => { if (from && typeof from === "object" || typeof from === "function") { for (let key of __getOwnPropNames(from)) if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); } return to; }; var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps( // If the importer is in node compatibility mode or this is not an ESM // file that has been converted to a CommonJS file using a Babel- // compatible transform (i.e. "__esModule" has not been set), then set // "default" to the CommonJS "module.exports" for node compatibility. isNodeMode || !mod || !mod.__esModule ? 
__defProp(target, "default", { value: mod, enumerable: true }) : target, mod )); var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // src/index.ts var src_exports = {}; __export(src_exports, { tokenizeFile: () => tokenizeFile, tokenizeFromBufferOrString: () => tokenizeFromBufferOrString }); module.exports = __toCommonJS(src_exports); // src/lib/pdf.ts var import_pdf = require("@langchain/community/document_loaders/fs/pdf"); var import_text_splitter = require("langchain/text_splitter"); // src/lib/utils/errors.ts var PdfLoadingError = class extends Error { constructor(message) { super(message); this.name = "PdfLoadingError"; } }; var TextLoadingError = class extends Error { constructor(message) { super(message); this.name = "TextLoadingError"; } }; var CsvLoadingError = class extends Error { constructor(message) { super(message); this.name = "CsvLoadingError"; } }; var WordLoadingError = class extends Error { constructor(message) { super(message); this.name = "WordLoadingError"; } }; var PowerpointLoadingError = class extends Error { constructor(message) { super(message); this.name = "PowerpointLoadingError"; } }; var UnrecognizableFileType = class extends Error { constructor(message) { super(message); this.name = "UnrecognizableFileType"; } }; var UnsupportedFileType = class extends Error { constructor(message) { super(message); this.name = "UnsupportedFileType"; } }; // src/lib/pdf.ts async function tokenizePDF(filePath, chunkSize = 1e3, chunkOverlap = 200, options) { try { const pdfLoader = new import_pdf.PDFLoader(filePath, { splitPages: options?.splitByPage || false }); const document = await pdfLoader.load(); const splitter = new import_text_splitter.RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap }); const fullText = document.map((d) => d.pageContent).join("\n\n"); const totalChars = fullText.length; const estimatedChunks = Math.ceil((totalChars - chunkOverlap) / (chunkSize - chunkOverlap)); 
console.log(`Total chars: ${totalChars}`); console.log(`Estimated chunks: ~${estimatedChunks}`); return await splitter.createDocuments([fullText]); } catch (error) { console.error("Error tokenizing PDF:", error); if (error instanceof Error) { throw new PdfLoadingError(error.message); } else throw new PdfLoadingError(`An error occurred while loading the PDF file at ${filePath}`); } } // src/lib/plaintext.ts var import_text_splitter2 = require("langchain/text_splitter"); var import_text = require("langchain/document_loaders/fs/text"); async function tokenizePlaintextFile(filePath, chunkSize = 1e3, chunkOverlap = 200, options) { try { const textLoader = new import_text.TextLoader(filePath); const document = await textLoader.load(); const splitter = new import_text_splitter2.RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap }); const fullContent = document.map((d) => d.pageContent).join("\n"); return await splitter.createDocuments([fullContent]); } catch (error) { console.error("Error tokenizing file:", error); if (error instanceof Error) { throw new TextLoadingError(error.message); } else throw new TextLoadingError(`An error occurred while loading the file at ${filePath}`); } } // src/lib/utils/index.ts var fs = __toESM(require("fs"), 1); var import_os = __toESM(require("os"), 1); var import_path = __toESM(require("path"), 1); var import_stream = require("stream"); var import_uuid = require("uuid"); var import_promises = require("stream/promises"); async function writeTempFile(content, extension) { const tempDir = await fs.promises.mkdtemp(import_path.default.join(import_os.default.tmpdir(), "tokenize-")); const tempFilePath = import_path.default.join(tempDir, `temp-${(0, import_uuid.v4)()}.${extension}`); if (typeof content === "string") { await fs.promises.writeFile(tempFilePath, content, "utf8"); } else if (content instanceof Buffer) { await fs.promises.writeFile(tempFilePath, content); } else if (content instanceof import_stream.Readable) { const 
writeStream = fs.createWriteStream(tempFilePath); await (0, import_promises.pipeline)(content, writeStream); } return tempFilePath; } async function removeTempFile(filePath) { try { await fs.promises.unlink(filePath); await fs.promises.rmdir(import_path.default.dirname(filePath)); } catch (cleanupError) { console.error("Error cleaning up temporary file:", cleanupError); } } // src/lib/word.ts var import_docx = require("@langchain/community/document_loaders/fs/docx"); var import_text_splitter3 = require("langchain/text_splitter"); async function tokenizeWordDocument(filePath, chunkSize = 1e3, chunkOverlap = 200, options) { try { const docxBuffer = new import_docx.DocxLoader(filePath); const document = await docxBuffer.load(); const splitter = new import_text_splitter3.RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap }); return options?.splitByPage ? await splitter.splitDocuments(document) : await splitter.createDocuments([document.map((d) => d.pageContent).join("\n\n")]); } catch (error) { console.error("Error tokenizing Word document:", error); if (error instanceof Error) { throw new WordLoadingError(error.message); } else throw new WordLoadingError(`An error occurred while loading the PDF file at ${filePath}`); } } // src/index.ts var import_mime_types = require("mime-types"); // src/lib/csv.ts var import_text_splitter4 = require("langchain/text_splitter"); var import_csv = require("@langchain/community/document_loaders/fs/csv"); async function tokenizeCsvFile(filePath, chunkSize = 1e3, chunkOverlap = 200, options) { try { const textLoader = new import_csv.CSVLoader(filePath); const document = await textLoader.load(); const splitter = new import_text_splitter4.RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap }); const fullContent = document.map((d) => d.pageContent).join("\n"); return await splitter.createDocuments([fullContent]); } catch (error) { console.error("Error tokenizing file:", error); if (error instanceof Error) { throw new 
CsvLoadingError(error.message); } else throw new CsvLoadingError(`An error occurred while loading the file at ${filePath}`); } } // src/index.ts var import_node_stream = require("stream"); // src/lib/powerpoint.ts var import_pptx = require("@langchain/community/document_loaders/fs/pptx"); var import_text_splitter5 = require("langchain/text_splitter"); async function tokenizePowerpointDocument(filePath, chunkSize = 1e3, chunkOverlap = 200, options) { try { const pptxBuffer = new import_pptx.PPTXLoader(filePath); const document = await pptxBuffer.load(); const splitter = new import_text_splitter5.RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap }); return options?.splitByPage ? await splitter.splitDocuments(document) : await splitter.createDocuments([document.map((d) => d.pageContent).join("\n\n")]); } catch (error) { console.error("Error tokenizing Powerpoint document:", error); if (error instanceof Error) { throw new PowerpointLoadingError(error.message); } else throw new PowerpointLoadingError(`An error occurred while loading the Powerpoint file at ${filePath}`); } } // src/index.ts var fileHandlers = { "application/vnd.openxmlformats-officedocument.wordprocessingml.document": tokenizeWordDocument, "application/msword": tokenizeWordDocument, "application/vnd.ms-powerpoint": tokenizePowerpointDocument, "application/vnd.openxmlformats-officedocument.presentationml.presentation": tokenizePowerpointDocument, "application/pdf": tokenizePDF, "text/csv": tokenizeCsvFile }; var defaultHandler = tokenizePlaintextFile; var streamToBuffer = async (stream) => { const chunks = []; for await (const chunk of stream) { chunks.push(chunk); } return Buffer.concat(chunks); }; async function tokenizeFile(filePath, chunkOverlap = 200, chunkSize = 1e3, options) { let mimeType = (0, import_mime_types.lookup)(filePath); if (mimeType === false) { throw new UnrecognizableFileType(`The filetype provided at path ${filePath} is unrecognizable`); } const handler = 
fileHandlers[mimeType] || (mimeType.startsWith("text/") ? defaultHandler : null); if (!handler) { throw new UnsupportedFileType( `The filetype provided at path ${filePath} is not supported (mime type: ${mimeType})` ); } const document = await handler(filePath, chunkSize, chunkOverlap, options); return document.map((doc, index) => { return { id: doc.id ?? `idx-${index}`, metadata: doc.metadata, content: doc.pageContent }; }); } async function tokenizeFromBufferOrString(content, extension, chunkOverlap = 200, chunkSize = 1e3, options) { let tempFilePath = null; try { const fileData = content instanceof import_node_stream.Readable ? await streamToBuffer(content) : content; tempFilePath = await writeTempFile(fileData, extension); return await tokenizeFile(tempFilePath, chunkOverlap, chunkSize, options); } catch (error) { console.error("Error during tokenization:", error); throw error; } finally { if (tempFilePath) { await removeTempFile(tempFilePath); } } } // Annotate the CommonJS export names for ESM import in node: 0 && (module.exports = { tokenizeFile, tokenizeFromBufferOrString }); //# sourceMappingURL=index.cjs.map