UNPKG

dtamind-components

Version:

Apps integration for Dtamind. Contains nodes and credentials.

92 lines 3.65 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.PowerpointLoader = void 0;
const buffer_1 = require("langchain/document_loaders/fs/buffer");
const officeparser_1 = require("officeparser");

/**
 * Separator heuristics tried, in order, by `splitIntoSlides`.
 *
 * The patterns are only ever passed to `String.prototype.split`, which
 * ignores the `g` flag, so they are declared without it and can be shared
 * safely across calls.
 */
const SLIDE_SEPARATOR_PATTERNS = Object.freeze([
    /\n\s*Slide\s+\d+/i, // "Slide 3"
    /\n\s*Page\s+\d+/i, // "Page 3"
    /\n\s*\d+\s*\/\s*\d+/, // "3 / 12"
    /\n\s*_{3,}/, // Underscores as separators
    /\n\s*-{3,}/ // Dashes as separators
]);

/** Fallback separator: three newlines, optionally with whitespace between them. */
const BLANK_LINES_SEPARATOR = /\n\s*\n\s*\n/;

/** A separator pattern is rejected when it yields this many segments or more. */
const MAX_SEGMENT_COUNT = 100;

/** When every segment is shorter than this (after trimming), the split is discarded. */
const MIN_MEANINGFUL_SEGMENT_LENGTH = 10;

/**
 * Document loader that uses officeparser to load PowerPoint documents.
 *
 * officeparser returns the document as plain text, so slide boundaries are
 * recovered heuristically (see `splitIntoSlides`). Each recovered segment
 * becomes a separate Document whose metadata includes a 1-based slide number
 * and the document type.
 */
class PowerpointLoader extends buffer_1.BufferLoader {
    /**
     * @param filePathOrBlob Source of the document, forwarded unchanged to BufferLoader
     */
    constructor(filePathOrBlob) {
        super(filePathOrBlob);
        // Populated with the metadata attribute descriptors on each parse() call.
        this.attributes = [];
    }

    /**
     * Parse PowerPoint document
     *
     * @param raw Raw data Buffer
     * @param metadata Document metadata; spread last into each Document's
     *   metadata, so keys it shares with the generated ones take precedence
     * @returns Array of Documents (empty when no text could be extracted)
     * @throws {Error} When extraction or splitting fails; the original error
     *   is attached as `cause`
     */
    async parse(raw, metadata) {
        const result = [];
        this.attributes = [
            { name: 'slideNumber', description: 'Slide number', type: 'number' },
            { name: 'documentType', description: 'Type of document', type: 'string' }
        ];
        try {
            // Use officeparser to extract text from PowerPoint
            const data = await (0, officeparser_1.parseOfficeAsync)(raw);
            if (typeof data === 'string' && data.trim()) {
                const slides = this.splitIntoSlides(data);
                slides.forEach((slideContent, index) => {
                    const pageContent = slideContent.trim();
                    // Guard kept in case splitIntoSlides is overridden and returns blank entries.
                    if (pageContent) {
                        result.push({
                            pageContent,
                            metadata: {
                                slideNumber: index + 1,
                                documentType: 'powerpoint',
                                ...metadata
                            }
                        });
                    }
                });
            }
        }
        catch (error) {
            console.error('Error parsing PowerPoint file:', error);
            const reason = error instanceof Error ? error.message : 'Unknown error';
            // Attach the original error so its stack trace is not lost when wrapping.
            throw new Error(`Failed to parse PowerPoint file: ${reason}`, { cause: error });
        }
        return result;
    }

    /**
     * Split content into slides based on common patterns
     * This is a heuristic approach since officeparser returns plain text
     *
     * The first separator pattern that produces more than one and fewer than
     * 100 segments wins. If none qualifies, the text is split on runs of
     * blank lines. If every resulting segment is shorter than 10 characters
     * after trimming, the whole content is treated as a single slide.
     *
     * @param content Plain text extracted from the presentation
     * @returns Segments that are non-empty after trimming (not themselves trimmed)
     */
    splitIntoSlides(content) {
        let slides = null;
        for (const pattern of SLIDE_SEPARATOR_PATTERNS) {
            const candidate = content.split(pattern);
            if (candidate.length > 1 && candidate.length < MAX_SEGMENT_COUNT) {
                slides = candidate;
                break;
            }
        }
        if (slides === null) {
            // split() always returns at least one element, so no empty-array check is needed.
            slides = content.split(BLANK_LINES_SEPARATOR);
        }
        if (slides.every((slide) => slide.trim().length < MIN_MEANINGFUL_SEGMENT_LENGTH)) {
            slides = [content];
        }
        return slides.filter((slide) => slide.trim().length > 0);
    }
}
exports.PowerpointLoader = PowerpointLoader;
//# sourceMappingURL=PowerpointLoader.js.map