@lobehub/chat
Version:
Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.
218 lines (197 loc) • 9.26 kB
text/typescript
import debug from 'debug';
import path from 'node:path';
import type { DocumentPage, FileLoaderInterface } from '../../types';
import { type ExtractedFile, extractFiles, parseString } from '../../utils/parser-utils';
const log = debug('file-loaders:pptx');
/**
 * Loader for PPTX files built on the shared archive/XML parser utilities.
 *
 * The loader extracts the `ppt/slides/slideN.xml` entries from the file,
 * orders them by slide number, pulls the text out of every slide and
 * represents each slide as one `DocumentPage`.
 */
export class PptxLoader implements FileLoaderInterface {
  /**
   * Matches the archive path of a slide XML entry.
   *
   * Deliberately NOT global: a regex with the `g` flag remembers `lastIndex`
   * between `test()` calls, so reusing it as a filter predicate makes it
   * reject a matching entry that is tested right after another match.
   */
  private static readonly SLIDE_PATH_REGEX = /ppt\/slides\/slide\d+\.xml/;

  /** Captures the numeric part of a slide entry name (`slide12.xml` -> `12`). */
  private static readonly SLIDE_NUMBER_REGEX = /slide(\d+)\.xml/;

  /**
   * Loads pages from the specified PPTX file path.
   *
   * This method never rejects because of a loading/parsing problem: failures
   * are reported as a `DocumentPage` whose `metadata.error` is set.
   *
   * @param filePath The path to the PPTX file.
   * @returns A Promise resolving to one `DocumentPage` per slide, ordered by
   * slide number. Slides that fail to parse are returned as error pages in
   * place. If no slide is found, every slide fails, or extraction throws, the
   * result is an array holding a single summary error page.
   */
  async loadPages(filePath: string): Promise<DocumentPage[]> {
    log('Loading PPTX file:', filePath);
    const sourceFileName = path.basename(filePath);
    log('Source file name:', sourceFileName);

    try {
      // --- File Extraction Step ---
      log('Extracting slide XML files from PPTX');
      // Extract only slide XML files
      const slideFiles: ExtractedFile[] = await extractFiles(filePath, (fileName) =>
        PptxLoader.SLIDE_PATH_REGEX.test(fileName),
      );
      log('Extracted slide files:', slideFiles.length);

      // --- Validation Step ---
      if (slideFiles.length === 0) {
        log('No slide XML files found in the PPTX file');
        console.warn(`No slide XML files found in ${sourceFileName}. May be corrupted or empty.`);
        return [
          this.createErrorPage(
            'No slides found. The PPTX file might be empty, corrupted, or does not contain standard slide XMLs.',
            sourceFileName,
          ),
        ];
      }

      // --- Sorting Step ---
      log('Sorting slide files by slide number');
      // Numeric sort, so that slide10 comes after slide2. Entries without a
      // recognizable number sort last. Explicit comparison instead of a
      // subtraction, because `Infinity - Infinity` would yield NaN.
      slideFiles.sort((a, b) => {
        const numA = PptxLoader.extractSlideNumber(a.path) ?? Infinity;
        const numB = PptxLoader.extractSlideNumber(b.path) ?? Infinity;
        if (numA === numB) return 0;
        return numA < numB ? -1 : 1;
      });
      log('Slide files sorted');

      // --- Page Creation Step ---
      log('Creating document pages from slide files');
      // One page per slide: `slideFiles` is non-empty here and `parseSlide`
      // always returns a page, so `pages` is never empty.
      const pages: DocumentPage[] = slideFiles.map((slideFile, index) =>
        this.parseSlide(slideFile, index, slideFiles.length, sourceFileName),
      );
      log(`Created ${pages.length} document pages from slides`);

      // If every slide failed, collapse the individual errors into one summary error page
      const allErrored = pages.every((page) => Boolean(page.metadata?.error));
      if (allErrored) {
        log('All slides failed to parse');
        console.warn(`All slides failed to parse for ${sourceFileName}`);
        return [this.createErrorPage('All slides failed to parse correctly.', sourceFileName)];
      }

      log('PPTX loading completed successfully');
      return pages;
    } catch (error) {
      // --- Error Handling Step ---
      // This catches errors from extractFiles or other unexpected issues
      log('Error loading or processing PPTX file');
      const errorMessage = `Failed to load or process PPTX file: ${PptxLoader.describeError(error)}`;
      console.error(errorMessage, { filePath });
      return [this.createErrorPage(errorMessage, sourceFileName)];
    }
  }

  /**
   * Aggregates the content from all DocumentPages (slides).
   *
   * Error pages are skipped. Each remaining slide is wrapped in a
   * `<slide_page pageNumber="N">…</slide_page>` element (without the attribute
   * when the slide number is missing) and the slides are joined by a blank line.
   *
   * @param pages An array of `DocumentPage` objects obtained from `loadPages`.
   * @returns A Promise resolving to the aggregated content string. When no
   * valid page exists, the content of the first page (or `''`) is returned.
   */
  async aggregateContent(pages: DocumentPage[]): Promise<string> {
    log('Aggregating content from', pages.length, 'PPTX pages');

    // Filter out error pages before aggregation
    const validPages = pages.filter((page) => !page.metadata?.error);
    log(
      `Found ${validPages.length} valid pages for aggregation (${pages.length - validPages.length} error pages filtered out)`,
    );

    if (validPages.length === 0) {
      log('No valid pages found, returning content of first page (may be error page)');
      // Error pages created by this loader carry empty content, so this is normally ''
      return pages[0]?.pageContent || '';
    }

    const result = validPages
      .map((page) => {
        const slideNumber = page.metadata?.slideNumber;
        const header = slideNumber ? `<slide_page pageNumber="${slideNumber}">` : '<slide_page>'; // Fallback header
        return [header, page.pageContent, '</slide_page>'].join('\n');
      })
      .join('\n\n'); // Blank line between slides

    log('PPTX content aggregated successfully, length:', result.length);
    return result;
  }

  /**
   * Converts one extracted slide XML file into a `DocumentPage`.
   *
   * Text is collected per `a:p` (paragraph) element: the `a:t` text runs of a
   * paragraph are concatenated without separator, empty paragraphs are
   * dropped, and paragraphs are joined with a newline.
   *
   * @param slideFile The extracted slide entry (archive path and XML content).
   * @param index Zero-based position of the slide in the sorted slide list.
   * @param totalSlides Total number of slide entries found in the file.
   * @param sourceFileName Name of the PPTX file, recorded in the metadata.
   * @returns The page for the slide, or an error page if the XML cannot be processed.
   */
  private parseSlide(
    slideFile: ExtractedFile,
    index: number,
    totalSlides: number,
    sourceFileName: string,
  ): DocumentPage {
    try {
      log(`Processing slide ${index + 1}/${totalSlides}, path: ${slideFile.path}`);
      const xmlDoc = parseString(slideFile.content);
      const paragraphNodes = xmlDoc.getElementsByTagName('a:p');
      log(`Found ${paragraphNodes.length} paragraph nodes in slide ${index + 1}`);

      const slideText = Array.from(paragraphNodes)
        .map((pNode) => {
          const textNodes = pNode.getElementsByTagName('a:t');
          return Array.from(textNodes)
            .map((tNode) => tNode.childNodes[0]?.nodeValue ?? '')
            .join(''); // Join text within a paragraph without spaces
        })
        .filter((text) => text.length > 0) // Filter out empty paragraphs
        .join('\n'); // Join paragraphs with newline

      // Compute the counters from the content that is actually returned, so
      // that charCount/lineCount always describe `pageContent`.
      const pageContent = slideText.trim();
      const lines = pageContent.split('\n');

      // Fallback to the position in the sorted list if the path has no number
      const slideNumber = PptxLoader.extractSlideNumber(slideFile.path) ?? index + 1;
      log(
        `Slide ${index + 1} text extracted, lines: ${lines.length}, characters: ${pageContent.length}`,
      );

      const metadata = {
        pageCount: totalSlides, // Total number of slides found
        slideNumber,
        sourceFileName,
      };

      return {
        charCount: pageContent.length,
        lineCount: lines.length,
        metadata,
        pageContent,
      };
    } catch (parseError) {
      log(`Error parsing slide ${slideFile.path}`);
      const reason = PptxLoader.describeError(parseError);
      console.error(
        `Failed to parse XML for slide ${slideFile.path} in ${sourceFileName}: ${reason}`,
      );
      // Keep the failure local to this slide so the other slides still load
      return this.createErrorPage(
        `Error parsing slide ${slideFile.path}: ${reason}`,
        sourceFileName,
        slideFile.path,
      );
    }
  }

  /**
   * Helper method to create a standardized error page object.
   *
   * @param errorInfo A string describing the error.
   * @param sourceFileName The name of the file that caused the error.
   * @param sourceFilePath Optional: Specific path within the archive that caused the error (e.g., slide path)
   * @returns A `DocumentPage` object representing the error state.
   */
  private createErrorPage(
    errorInfo: string,
    sourceFileName: string,
    sourceFilePath?: string,
  ): DocumentPage {
    log('Creating error page:', errorInfo);
    return {
      charCount: 0,
      lineCount: 0,
      metadata: {
        error: errorInfo,
        pageCount: 0,
        sourceFileName: sourceFileName,
        ...(sourceFilePath && { sourceFilePath }), // Add specific path if available
      },
      pageContent: '', // Error pages have no content
    };
  }

  /**
   * Reads the slide number out of a slide entry path.
   *
   * @param slidePath Archive path such as `ppt/slides/slide3.xml`.
   * @returns The slide number, or `undefined` when the path contains none.
   */
  private static extractSlideNumber(slidePath: string): number | undefined {
    const match = PptxLoader.SLIDE_NUMBER_REGEX.exec(slidePath);
    return match ? Number.parseInt(match[1], 10) : undefined;
  }

  /**
   * Produces a printable message for a caught value of unknown type.
   *
   * @param error The value received by a `catch` clause.
   * @returns `error.message` for `Error` instances, otherwise `String(error)`.
   */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}