@langchain/community
Version:
Third-party integrations for LangChain.js
1 lines • 6.01 kB
Source Map (JSON)
{"version":3,"file":"docx.cjs","names":["BufferLoader","Document"],"sources":["../../../src/document_loaders/fs/docx.ts"],"sourcesContent":["import { Document } from \"@langchain/core/documents\";\nimport { BufferLoader } from \"@langchain/classic/document_loaders/fs/buffer\";\n\ntype DocxLoaderOptions = {\n type: \"docx\" | \"doc\";\n};\n/**\n * A class that extends the `BufferLoader` class. It represents a document\n * loader that loads documents from DOCX files.\n * It has a constructor that takes a `filePathOrBlob` parameter representing the path to the word\n * file or a Blob object, and an optional `options` parameter of type\n * `DocxLoaderOptions`\n */\nexport class DocxLoader extends BufferLoader {\n protected options: DocxLoaderOptions = { type: \"docx\" };\n\n constructor(filePathOrBlob: string | Blob, options?: DocxLoaderOptions) {\n super(filePathOrBlob);\n if (options) {\n this.options = {\n ...options,\n };\n }\n }\n\n /**\n * A method that takes a `raw` buffer and `metadata` as parameters and\n * returns a promise that resolves to an array of `Document` instances. It\n * uses the `extractRawText` function from the `mammoth` module or\n * `extract` method from the `word-extractor` module to extract\n * the raw text content from the buffer. If the extracted text content is\n * empty, it returns an empty array. Otherwise, it creates a new\n * `Document` instance with the extracted text content and the provided\n * metadata, and returns it as an array.\n * @param raw The raw buffer from which to extract text content.\n * @param metadata The metadata to be associated with the created `Document` instance.\n * @returns A promise that resolves to an array of `Document` instances.\n */\n public async parse(\n raw: Buffer,\n metadata: Document[\"metadata\"]\n ): Promise<Document[]> {\n if (this.options.type === \"doc\") {\n return this.parseDoc(raw, metadata);\n }\n return this.parseDocx(raw, metadata);\n }\n\n /**\n * A private method that takes a `raw` buffer and `metadata` as parameters and\n * returns a promise that resolves to an array of `Document` instances. It\n * uses the `extractRawText` function from the `mammoth` module to extract\n * the raw text content from the buffer. If the extracted text content is\n * empty, it returns an empty array. Otherwise, it creates a new\n * `Document` instance with the extracted text content and the provided\n * metadata, and returns it as an array.\n * @param raw The raw buffer from which to extract text content.\n * @param metadata The metadata to be associated with the created `Document` instance.\n * @returns A promise that resolves to an array of `Document` instances.\n */\n private async parseDocx(\n raw: Buffer,\n metadata: Document[\"metadata\"]\n ): Promise<Document[]> {\n if (this.options.type === \"doc\") {\n return this.parseDoc(raw, metadata);\n }\n const { extractRawText } = await DocxLoaderImports();\n const docx = await extractRawText({\n buffer: raw,\n });\n\n if (!docx.value) return [];\n\n return [\n new Document({\n pageContent: docx.value,\n metadata,\n }),\n ];\n }\n\n /**\n * A private method that takes a `raw` buffer and `metadata` as parameters and\n * returns a promise that resolves to an array of `Document` instances. It\n * uses the `extract` method from the `word-extractor` module to extract\n * the raw text content from the buffer. If the extracted text content is\n * empty, it returns an empty array. Otherwise, it creates a new\n * `Document` instance with the extracted text content and the provided\n * metadata, and returns it as an array.\n * @param raw The raw buffer from which to extract text content.\n * @param metadata The metadata to be associated with the created `Document` instance.\n * @returns A promise that resolves to an array of `Document` instances.\n */\n private async parseDoc(\n raw: Buffer,\n metadata: Document[\"metadata\"]\n ): Promise<Document[]> {\n const WordExtractor = await DocLoaderImports();\n const extractor = new WordExtractor();\n const doc = await extractor.extract(raw);\n return [\n new Document({\n pageContent: doc.getBody(),\n metadata,\n }),\n ];\n }\n}\n\nasync function DocxLoaderImports() {\n try {\n const { extractRawText } = await import(\"mammoth\");\n return { extractRawText };\n } catch (e) {\n console.error(e);\n throw new Error(\n \"Failed to load mammoth. Please install it with eg. `npm install mammoth`.\"\n );\n }\n}\n\nasync function DocLoaderImports() {\n try {\n const WordExtractor = await import(\"word-extractor\");\n return WordExtractor.default;\n } catch (e) {\n console.error(e);\n throw new Error(\n \"Failed to load word-extractor. Please install it with eg. `npm install word-extractor`.\"\n );\n }\n}\n"],"mappings":";;;;;;;;;;;;;AAaA,IAAa,aAAb,cAAgCA,8CAAAA,aAAa;CAC3C,UAAuC,EAAE,MAAM,QAAQ;CAEvD,YAAY,gBAA+B,SAA6B;AACtE,QAAM,eAAe;AACrB,MAAI,QACF,MAAK,UAAU,EACb,GAAG,SACJ;;;;;;;;;;;;;;;CAiBL,MAAa,MACX,KACA,UACqB;AACrB,MAAI,KAAK,QAAQ,SAAS,MACxB,QAAO,KAAK,SAAS,KAAK,SAAS;AAErC,SAAO,KAAK,UAAU,KAAK,SAAS;;;;;;;;;;;;;;CAetC,MAAc,UACZ,KACA,UACqB;AACrB,MAAI,KAAK,QAAQ,SAAS,MACxB,QAAO,KAAK,SAAS,KAAK,SAAS;EAErC,MAAM,EAAE,mBAAmB,MAAM,mBAAmB;EACpD,MAAM,OAAO,MAAM,eAAe,EAChC,QAAQ,KACT,CAAC;AAEF,MAAI,CAAC,KAAK,MAAO,QAAO,EAAE;AAE1B,SAAO,CACL,IAAIC,0BAAAA,SAAS;GACX,aAAa,KAAK;GAClB;GACD,CAAC,CACH;;;;;;;;;;;;;;CAeH,MAAc,SACZ,KACA,UACqB;AAIrB,SAAO,CACL,IAAIA,0BAAAA,SAAS;GACX,cAHQ,MADM,KADI,OAAM,kBAAkB,IACT,CACT,QAAQ,IAAI,EAGnB,SAAS;GAC1B;GACD,CAAC,CACH;;;AAIL,eAAe,oBAAoB;AACjC,KAAI;EACF,MAAM,EAAE,mBAAmB,MAAM,OAAO;AACxC,SAAO,EAAE,gBAAgB;UAClB,GAAG;AACV,UAAQ,MAAM,EAAE;AAChB,QAAM,IAAI,MACR,4EACD;;;AAIL,eAAe,mBAAmB;AAChC,KAAI;AAEF,UADsB,MAAM,OAAO,mBACd;UACd,GAAG;AACV,UAAQ,MAAM,EAAE;AAChB,QAAM,IAAI,MACR,0FACD"}