@lobehub/chat

Version:

Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal interaction, and an extensible Function Call plugin system. It supports one-click, free deployment of your private ChatGPT/LLM web application.

github.com/lobehub/lobe-chat

lobehub/lobe-chat

218 lines (197 loc) • 9.26 kB

text/typescript

import debug from 'debug';
import path from 'node:path';

import type { DocumentPage, FileLoaderInterface } from '../../types';
import { type ExtractedFile, extractFiles, parseString } from '../../utils/parser-utils';

const log = debug('file-loaders:pptx');

/**
 * Matches the archive path of a slide part, e.g. `ppt/slides/slide12.xml`.
 *
 * Deliberately NOT declared with the `g` flag: a global regex keeps `lastIndex`
 * between `.test()` calls, so reusing it as a filter predicate makes a match on
 * one entry cause the next entry to be tested from a stale offset and skipped.
 */
const SLIDE_PATH_REGEX = /ppt\/slides\/slide\d+\.xml/;

/** Captures the numeric suffix of a slide file name (`slide7.xml` -> `7`). */
const SLIDE_NUMBER_REGEX = /slide(\d+)\.xml/;

/** Turns any thrown value into a printable message. */
const describeError = (error: unknown): string =>
  error instanceof Error ? error.message : String(error);

/** Returns the slide number encoded in an archive path, or `undefined` if there is none. */
const parseSlideNumber = (slidePath: string): number | undefined => {
  const match = SLIDE_NUMBER_REGEX.exec(slidePath);
  return match ? parseInt(match[1], 10) : undefined;
};

/**
 * Loader for PPTX files built on the shared parser utilities.
 *
 * It extracts the slide XML parts from the archive, pulls the text out of each
 * slide, and exposes every slide as one `DocumentPage`.
 */
export class PptxLoader implements FileLoaderInterface {
  /**
   * Loads one `DocumentPage` per slide from the given PPTX file.
   *
   * This method does not throw: every failure is reported as a page whose
   * `metadata.error` is set.
   *
   * @param filePath Path to the PPTX file.
   * @returns The slide pages ordered by slide number. If no slides are found,
   * if extraction fails, or if every slide fails to parse, the result is an
   * array holding a single error page instead. If only some slides fail, their
   * positions hold per-slide error pages.
   */
  async loadPages(filePath: string): Promise<DocumentPage[]> {
    log('Loading PPTX file:', filePath);
    const sourceFileName = path.basename(filePath);
    log('Source file name:', sourceFileName);

    try {
      // --- File Extraction Step ---
      log('Extracting slide XML files from PPTX');
      // Extract only slide XML files
      const slideFiles: ExtractedFile[] = await extractFiles(filePath, (fileName) =>
        SLIDE_PATH_REGEX.test(fileName),
      );
      log('Extracted slide files:', slideFiles.length);

      // --- Validation Step ---
      if (slideFiles.length === 0) {
        log('No slide XML files found in the PPTX file');
        console.warn(`No slide XML files found in ${sourceFileName}. May be corrupted or empty.`);
        return [
          this.createErrorPage(
            'No slides found. The PPTX file might be empty, corrupted, or does not contain standard slide XMLs.',
            sourceFileName,
          ),
        ];
      }

      // --- Sorting Step ---
      log('Sorting slide files by slide number');
      // Archive order is not presentation order; paths without a number sort last.
      slideFiles.sort(
        (a, b) => (parseSlideNumber(a.path) ?? Infinity) - (parseSlideNumber(b.path) ?? Infinity),
      );
      log('Slide files sorted');

      // --- Page Creation Step ---
      log('Creating document pages from slide files');
      const pages: DocumentPage[] = slideFiles.map((slideFile, index) =>
        this.createSlidePage(slideFile, index, slideFiles.length, sourceFileName),
      );
      log(`Created ${pages.length} document pages from slides`);

      // If every slide failed, collapse the individual errors into one summary page.
      const allErrored = pages.every((page) => page.metadata?.error);
      if (allErrored) {
        log('All slides failed to parse');
        console.warn(`All slides failed to parse for ${sourceFileName}`);
        return [this.createErrorPage('All slides failed to parse correctly.', sourceFileName)];
      }

      log('PPTX loading completed successfully');
      return pages;
    } catch (error) {
      // --- Error Handling Step ---
      // This catches errors from extractFiles or other unexpected issues
      log('Error loading or processing PPTX file');
      const errorMessage = `Failed to load or process PPTX file: ${describeError(error)}`;
      console.error(errorMessage, { filePath });
      return [this.createErrorPage(errorMessage, sourceFileName)];
    }
  }

  /**
   * Aggregates the content of all non-error pages (slides) into one string.
   *
   * Each slide is wrapped in a `<slide_page pageNumber="N">` … `</slide_page>`
   * element (a bare `<slide_page>` when the page carries no slide number), and
   * the wrapped slides are joined with a blank line.
   *
   * @param pages An array of `DocumentPage` objects obtained from `loadPages`.
   * @returns A Promise resolving to the aggregated content string. When no
   * page is free of errors, the `pageContent` of the first page is returned,
   * or an empty string if there is none.
   */
  async aggregateContent(pages: DocumentPage[]): Promise<string> {
    log('Aggregating content from', pages.length, 'PPTX pages');

    // Error pages carry no slide text, so they are left out of the aggregate.
    const validPages = pages.filter((page) => !page.metadata?.error);
    log(
      `Found ${validPages.length} valid pages for aggregation (${pages.length - validPages.length} error pages filtered out)`,
    );

    if (validPages.length === 0) {
      log('No valid pages found, returning content of first page (may be error page)');
      return pages[0]?.pageContent || '';
    }

    const result = validPages
      .map((page) => {
        const slideNumber = page.metadata?.slideNumber;
        // Tag the slide with its number when known; otherwise use an untagged wrapper.
        const header = slideNumber ? `<slide_page pageNumber="${slideNumber}">` : '<slide_page>';
        return `${header} ${page.pageContent} </slide_page>`;
      })
      .join('\n\n'); // Blank line between slides

    log('PPTX content aggregated successfully, length:', result.length);
    return result;
  }

  /**
   * Builds the page for a single slide; a parse failure yields an error page
   * for that slide rather than aborting the whole load.
   *
   * @param slideFile The extracted slide XML file.
   * @param index Zero-based position of the slide in the sorted list.
   * @param totalSlides Total number of slide files found in the archive.
   * @param sourceFileName Base name of the PPTX file, recorded in the metadata.
   * @returns The slide's `DocumentPage`, or an error page if its XML cannot be processed.
   */
  private createSlidePage(
    slideFile: ExtractedFile,
    index: number,
    totalSlides: number,
    sourceFileName: string,
  ): DocumentPage {
    try {
      log(`Processing slide ${index + 1}/${totalSlides}, path: ${slideFile.path}`);
      const xmlDoc = parseString(slideFile.content);

      const paragraphNodes = xmlDoc.getElementsByTagName('a:p');
      log(`Found ${paragraphNodes.length} paragraph nodes in slide ${index + 1}`);

      const slideText = Array.from(paragraphNodes)
        .map((pNode) => {
          const textNodes = pNode.getElementsByTagName('a:t');
          return Array.from(textNodes)
            .map((tNode) => (tNode.childNodes[0] ? tNode.childNodes[0].nodeValue : ''))
            .join(''); // Join text within a paragraph without spaces
        })
        .filter((text) => text.length > 0) // Filter out empty paragraphs
        .join('\n'); // Join paragraphs with newline

      // Counts are derived from the trimmed text so they describe exactly what
      // is stored in `pageContent`.
      const pageContent = slideText.trim();
      const lines = pageContent.split('\n');

      // Fall back to the position in the sorted list if the path has no number.
      const slideNumber = parseSlideNumber(slideFile.path) ?? index + 1;

      log(
        `Slide ${index + 1} text extracted, lines: ${lines.length}, characters: ${pageContent.length}`,
      );

      const metadata = {
        pageCount: totalSlides, // Total number of slides found
        slideNumber,
        sourceFileName,
      };

      return {
        charCount: pageContent.length,
        lineCount: lines.length,
        metadata,
        pageContent,
      };
    } catch (parseError) {
      log(`Error parsing slide ${slideFile.path}`);
      const reason = describeError(parseError);
      console.error(`Failed to parse XML for slide ${slideFile.path} in ${sourceFileName}: ${reason}`);
      // Keep the failure local to this slide so the remaining slides still load.
      return this.createErrorPage(
        `Error parsing slide ${slideFile.path}: ${reason}`,
        sourceFileName,
        slideFile.path,
      );
    }
  }

  /**
   * Helper method to create a standardized error page object.
   *
   * @param errorInfo A string describing the error.
   * @param sourceFileName The name of the file that caused the error.
   * @param sourceFilePath Optional: specific path within the archive that caused the error (e.g., slide path).
   * @returns A `DocumentPage` with empty content, zero counts, and the error recorded in its metadata.
   */
  private createErrorPage(
    errorInfo: string,
    sourceFileName: string,
    sourceFilePath?: string,
  ): DocumentPage {
    log('Creating error page:', errorInfo);
    return {
      charCount: 0,
      lineCount: 0,
      metadata: {
        error: errorInfo,
        pageCount: 0,
        sourceFileName: sourceFileName,
        ...(sourceFilePath && { sourceFilePath }), // Add specific path if available
      },
      pageContent: '', // Error pages have no content
    };
  }
}