UNPKG

dtamind-components

Version:

Apps integration for Dtamind. Contains nodes and credentials.

98 lines 3.99 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.WordLoader = void 0;
const buffer_1 = require("langchain/document_loaders/fs/buffer");
const officeparser_1 = require("officeparser");

/**
 * Separator patterns tried, in this order, by WordLoader#splitIntoSections.
 * The first pattern that yields between 2 and 49 pieces is used.
 *
 * NOTE(review): String.prototype.split consumes the matched separator, so the
 * heading text itself ("Page 3", "CHAPTER ONE", ...) is dropped from the
 * resulting sections. Confirm this is acceptable before relying on headings.
 */
const SECTION_PATTERNS = Object.freeze([
    /\n\s*Page\s+\d+/gi,
    /\n\s*Section\s+\d+/gi,
    /\n\s*Chapter\s+\d+/gi,
    /\n\s*\d+\.\s+/gi, // Numbered sections like "1. ", "2. "
    /\n\s*[A-Z][A-Z\s]{2,}\n/g, // ALL CAPS headings
    /\n\s*_{5,}/g, // Long underscores as separators
    /\n\s*-{5,}/g // Long dashes as separators
]);

/**
 * Document loader that uses officeparser to load Word documents.
 *
 * The extracted plain text is split heuristically into sections; each
 * non-empty section becomes one entry ({ pageContent, metadata }) in the
 * array returned by parse().
 */
class WordLoader extends buffer_1.BufferLoader {
    /**
     * @param filePathOrBlob Forwarded unchanged to the BufferLoader constructor.
     */
    constructor(filePathOrBlob) {
        super(filePathOrBlob);
        // Populated with attribute descriptors on every parse() call.
        this.attributes = [];
    }

    /**
     * Parse Word document
     *
     * @param raw Raw data Buffer
     * @param metadata Document metadata; spread last into each entry's
     *   metadata, so its keys override `documentType` / `pageNumber`.
     * @returns Array of Documents, one per non-empty section. Empty when
     *   officeparser yields no text (non-string or whitespace-only result).
     * @throws {Error} "Failed to parse Word file: ..." when extraction or
     *   splitting throws; the original error is logged via console.error.
     */
    async parse(raw, metadata) {
        const result = [];
        // NOTE(review): `pageCount` is declared here but the emitted metadata
        // carries `pageNumber` instead - confirm which one consumers expect.
        this.attributes = [
            { name: 'documentType', description: 'Type of document', type: 'string' },
            { name: 'pageCount', description: 'Number of pages/sections', type: 'number' }
        ];
        try {
            // Use officeparser to extract text from Word document
            const data = await (0, officeparser_1.parseOfficeAsync)(raw);
            if (typeof data === 'string' && data.trim()) {
                // officeparser returns plain text, so page/section boundaries are guessed
                const sections = this.splitIntoSections(data);
                sections.forEach((sectionContent, index) => {
                    const text = sectionContent.trim();
                    if (!text) {
                        return;
                    }
                    result.push({
                        pageContent: text,
                        metadata: {
                            documentType: 'word',
                            pageNumber: index + 1,
                            ...metadata
                        }
                    });
                });
            }
        }
        catch (error) {
            console.error('Error parsing Word file:', error);
            throw new Error(`Failed to parse Word file: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
        return result;
    }

    /**
     * Split content into sections based on common patterns
     * This is a heuristic approach since officeparser returns plain text
     *
     * Strategy, in order:
     *   1. first heading/separator pattern giving 2-49 pieces;
     *   2. runs of four or more newlines (whitespace allowed in between);
     *   3. runs of three or more newlines;
     *   4. the whole content as a single section.
     *
     * @param content Plain text to split
     * @returns Sections that are non-empty after trimming (sections
     *   themselves are returned untrimmed)
     */
    splitIntoSections(content) {
        let sections = [];
        // Use the first pattern that produces a reasonable number of sections
        for (const pattern of SECTION_PATTERNS) {
            const potentialSections = content.split(pattern);
            if (potentialSections.length > 1 && potentialSections.length < 50) {
                sections = potentialSections;
                break;
            }
        }
        // If no good pattern found, split by multiple newlines as a fallback
        if (sections.length === 0) {
            sections = content.split(/\n\s*\n\s*\n\s*\n/);
        }
        // split() never returns an empty array, so "no split happened" shows up
        // as a single element; treat that (or only tiny fragments) as a miss and
        // fall through to the finer-grained blank-line split.
        if (sections.length <= 1 || sections.every((section) => section.trim().length < 20)) {
            sections = content.split(/\n\s*\n\s*\n/);
        }
        // If still no good split, treat entire content as one section
        if (sections.length <= 1 || sections.every((section) => section.trim().length < 10)) {
            sections = [content];
        }
        return sections.filter((section) => section.trim().length > 0);
    }
}
exports.WordLoader = WordLoader;
//# sourceMappingURL=WordLoader.js.map