UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.

github.com/juspay/neurolink

juspay/neurolink

158 lines • 5.97 kB

JavaScript

/**
 * PowerPoint (PPTX) Processing Utility
 *
 * Extracts text content from PowerPoint (.pptx) files by treating them
 * as ZIP archives and parsing the slide XML files within.
 *
 * PPTX files are ZIP archives containing:
 * - ppt/slides/slide1.xml, slide2.xml, ... — slide content
 * - ppt/slideMasters/ — master slide templates
 * - ppt/slideLayouts/ — slide layout definitions
 *
 * Text is extracted from `<a:t>` elements in the slide XML files, with XML
 * character references (`&amp;`, `&#8217;`, ...) decoded to plain text.
 * Slides are sorted by the number in their file name.
 *
 * Uses `adm-zip` (already a project dependency) for ZIP extraction.
 *
 * @module processors/document/PptxProcessor
 *
 * @example
 * ```typescript
 * import { PptxProcessor } from "./PptxProcessor.js";
 *
 * const text = await PptxProcessor.extractText(buffer);
 * if (text) {
 *   console.log("Extracted text:", text);
 * }
 * ```
 */
import AdmZip from "adm-zip";

/**
 * Regex to match text content within PowerPoint XML `<a:t>` elements.
 * These elements contain the actual visible text on slides.
 *
 * The tag name must be exactly `a:t`: after `<a:t` only whitespace (starting
 * an attribute list) or `>` is accepted. A looser `<a:t[^>]*>` would also
 * match `<a:tbl>`, `<a:tr>`, `<a:tc>`, `<a:txBody>`, `<a:tint/>`, ... and
 * capture raw markup up to the next `</a:t>`. The `(?<!\/)` lookbehind
 * rejects self-closing `<a:t />`, which has no closing tag of its own.
 */
const TEXT_ELEMENT_REGEX = /<a:t(?:\s[^>]*)?(?<!\/)>([\s\S]*?)<\/a:t>/g;

/**
 * Regex to match slide filenames and extract slide number.
 * Matches entries like "ppt/slides/slide1.xml", "ppt/slides/slide12.xml".
 */
const SLIDE_ENTRY_REGEX = /^ppt\/slides\/slide(\d+)\.xml$/;

/** The five entities predefined by XML, keyed by entity name. */
const NAMED_XML_ENTITIES = Object.freeze({
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
});

/** Matches decimal (`&#65;`), hexadecimal (`&#x41;`) and predefined named references. */
const XML_ENTITY_REGEX = /&(?:#(\d+)|#x([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g;

/** Highest valid Unicode code point; larger numeric references are left untouched. */
const MAX_CODE_POINT = 0x10ffff;

/**
 * Decode XML character/entity references in a text node.
 *
 * All references are resolved in a single pass so that an escaped ampersand
 * is not decoded twice (`&amp;lt;` becomes the literal text `&lt;`).
 * Unknown or out-of-range references are returned unchanged.
 *
 * @param text - Raw character data taken from between XML tags
 * @returns The text with references replaced by the characters they denote
 */
function decodeXmlEntities(text) {
  if (!text.includes("&")) {
    return text;
  }
  return text.replace(XML_ENTITY_REGEX, (reference, decimal, hex, name) => {
    if (name !== undefined) {
      return NAMED_XML_ENTITIES[name];
    }
    const codePoint =
      decimal !== undefined
        ? Number.parseInt(decimal, 10)
        : Number.parseInt(hex, 16);
    return codePoint <= MAX_CODE_POINT
      ? String.fromCodePoint(codePoint)
      : reference;
  });
}

/**
 * Open a PPTX buffer and collect its slide XML documents.
 *
 * @param content - Raw PPTX file buffer
 * @param wanted - Optional Set of slide numbers to load; when omitted every
 *   slide is loaded. Slides outside the set are counted but not decompressed.
 * @returns `slides` (sorted ascending by slide number) and `totalSlides`
 *   (number of slide entries in the archive, regardless of `wanted`)
 */
function readSlides(content, wanted) {
  const zip = new AdmZip(content);
  const slides = [];
  let totalSlides = 0;
  for (const entry of zip.getEntries()) {
    const match = SLIDE_ENTRY_REGEX.exec(entry.entryName);
    if (!match) {
      continue;
    }
    totalSlides++;
    const slideNumber = Number.parseInt(match[1], 10);
    if (wanted !== undefined && !wanted.has(slideNumber)) {
      continue;
    }
    slides.push({ slideNumber, xml: entry.getData().toString("utf-8") });
  }
  // Sort slides by number (slide1, slide2, ...) — ZIP entry order is arbitrary.
  slides.sort((a, b) => a.slideNumber - b.slideNumber);
  return { slides, totalSlides };
}

/**
 * Static utility class for extracting text from PPTX files.
 *
 * Designed as a static class (not extending BaseFileProcessor) because
 * PPTX processing is straightforward ZIP+XML extraction and does not
 * need the full download/validate/process pipeline of BaseFileProcessor.
 */
export class PptxProcessor {
  /**
   * Extract all text content from a PPTX buffer.
   *
   * The output starts with a `Presentation: N slide(s)` header followed by a
   * `### Slide N` section for every slide that contains text; slides without
   * text are omitted.
   *
   * @param content - Raw PPTX file buffer
   * @returns Formatted text content with slide headers, or null when the
   *   archive contains no slide entries. When slides exist but none has text,
   *   only the header line is returned.
   * @throws Error if `adm-zip` cannot read the buffer as a ZIP archive
   */
  static async extractText(content) {
    const { slides } = readSlides(content);
    if (slides.length === 0) {
      return null;
    }
    const parts = [`Presentation: ${slides.length} slide(s)\n`];
    for (const slide of slides) {
      const texts = PptxProcessor.extractTextFromXml(slide.xml);
      if (texts.length > 0) {
        parts.push(`### Slide ${slide.slideNumber}`);
        parts.push(texts.join("\n"));
        parts.push(""); // blank line between slides
      }
    }
    // The header guarantees a non-empty result, so no null fallback is needed.
    return parts.join("\n").trim();
  }

  /**
   * Extract text strings from a slide XML document.
   * Finds all `<a:t>` elements and returns their decoded, trimmed text
   * content; elements that are empty after trimming are skipped.
   *
   * @param xml - Raw XML string from a slide file
   * @returns Array of text strings found in the slide, in document order
   */
  static extractTextFromXml(xml) {
    const texts = [];
    // matchAll iterates over a private copy of the regex, so the shared
    // global pattern never carries lastIndex state between calls.
    for (const [, rawText] of xml.matchAll(TEXT_ELEMENT_REGEX)) {
      const text = decodeXmlEntities(rawText).trim();
      if (text) {
        texts.push(text);
      }
    }
    return texts;
  }

  // ===========================================================================
  // TARGETED EXTRACTION API
  // ===========================================================================

  /**
   * Extract text from specific slides in a PPTX file.
   *
   * Called by the `extract_file_content` tool for targeted slide access.
   *
   * @param content - Raw PPTX file buffer
   * @param slideNumbers - Array of 1-indexed slide numbers to extract
   * @returns Formatted text from the requested slides that exist, or a
   *   "not found" message stating the total slide count when none exist
   */
  static async extractSlides(content, slideNumbers) {
    const { slides, totalSlides } = readSlides(content, new Set(slideNumbers));
    if (slides.length === 0) {
      // List total slides to help the LLM pick a valid slide number.
      return `Slides ${slideNumbers.join(", ")} not found. This presentation has ${totalSlides} slide(s).`;
    }
    const parts = [];
    for (const slide of slides) {
      const texts = PptxProcessor.extractTextFromXml(slide.xml);
      parts.push(`### Slide ${slide.slideNumber}`);
      parts.push(
        texts.length > 0 ? texts.join("\n") : "(No text content on this slide)",
      );
      parts.push("");
    }
    return parts.join("\n").trim();
  }
}
//# sourceMappingURL=PptxProcessor.js.map