// Source: @smythos/sdk — DOCXParser (file retrieved via UNPKG; ~303 lines / 11.4 kB in the published package)
import { DocParser, TDocumentParseSettings, TParsedDocument } from '../DocParser.class'; import { readFile } from 'fs/promises'; import * as mammoth from 'mammoth'; export class DOCXParser extends DocParser { protected supportedMimeTypes: string[] = ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']; protected supportedExtensions: string[] = ['docx']; async parse(source: string, params?: TDocumentParseSettings): Promise<TParsedDocument> { try { const dataBuffer = await readFile(source); // Parse document structure with enhanced page break detection const documentStructure = await this.extractDocumentStructure(dataBuffer); // Extract images using mammoth (better image handling) const images = await this.extractImages(dataBuffer); // Split content into pages based on page breaks const pages = this.createPagesFromStructure(documentStructure, images); console.log( `Successfully parsed DOCX: ${pages.length} pages with ${pages.reduce( (total, page) => total + page.content.length, 0 )} total content items` ); return { title: this.extractTitleFromPath(source), metadata: { uri: source, author: '', date: '', tags: [], }, pages: pages, }; } catch (error) { console.error('DOCX parsing error:', error); throw error; } } private async extractDocumentStructure(buffer: Buffer): Promise<any[]> { // Use mammoth to get structured content with enhanced page break detection const options = { includeDefaultStyleMap: true, includeEmbeddedStyleMap: true, // Custom style map to detect page breaks and preserve structure styleMap: [ "p[style-name='Normal'] => p:fresh", "p[style-name='Heading 1'] => h1:fresh", "p[style-name='Heading 2'] => h2:fresh", "p[style-name='Heading 3'] => h3:fresh", "br[type='page'] => hr.page-break", "p[style-name='Page Break'] => hr.page-break", // Add more page break style mappings "p[style-name='Page Break Before'] => hr.page-break", "p[style-name='Page Break After'] => hr.page-break", ], }; const result = await mammoth.convertToHtml({ 
buffer }, options); // Parse the HTML to identify content blocks and page breaks return this.parseHtmlToStructure(result.value); } private parseHtmlToStructure(html: string): any[] { const structure: any[] = []; // Enhanced page break detection patterns const pageBreakPatterns = [ /<hr[^>]*class="page-break"[^>]*>/gi, /<hr[^>]*>/gi, // Any hr tag could be a page break /<div[^>]*page-break[^>]*>/gi, /<p[^>]*page-break[^>]*>/gi, // Look for Word-specific page break indicators /<w:br[^>]*w:type="page"[^>]*>/gi, /<w:lastRenderedPageBreak[^>]*>/gi, ]; let content = html; let pageBreakFound = false; // Replace all page break patterns with a consistent marker pageBreakPatterns.forEach((pattern) => { if (pattern.test(content)) { pageBreakFound = true; content = content.replace(pattern, '<!-- PAGE_BREAK -->'); } }); // If no explicit page breaks found, look for other indicators if (!pageBreakFound) { // Look for form feed characters content = content.replace(/\f/g, '<!-- PAGE_BREAK -->'); // Look for multiple consecutive line breaks that might indicate page breaks content = content.replace(/(<br\s*\/?>){4,}/gi, '<!-- PAGE_BREAK -->'); // Look for large gaps in content (multiple empty paragraphs) content = content.replace(/(<p[^>]*>\s*<\/p>\s*){4,}/gi, '<!-- PAGE_BREAK -->'); // Look for heading patterns that might indicate new pages content = content.replace(/(<\/h[1-3]>\s*<h[1-2][^>]*>)/gi, '$1<!-- PAGE_BREAK -->'); } // Split by page break markers const sections = content.split('<!-- PAGE_BREAK -->'); sections.forEach((section, index) => { const cleanSection = section.trim(); if (cleanSection.length > 0) { structure.push({ type: 'page', pageNumber: index + 1, content: cleanSection, isPageBreak: index > 0, }); } }); // If we still only have one section and it's very long, split by content length if (structure.length === 1 && structure[0].content.length > 4000) { return this.splitLongContentIntoPages(structure[0].content); } // Ensure we have at least one page return 
structure.length > 0 ? structure : [ { type: 'page', pageNumber: 1, content: html, isPageBreak: false, }, ]; } private splitLongContentIntoPages(content: string): any[] { // Split very long content into reasonable page sizes const targetPageSize = 3000; // Increased target size const pages: any[] = []; // Split by paragraphs first to maintain content integrity const paragraphs = content.split(/(<\/p>)/gi); let currentPage = ''; let pageNumber = 1; for (let i = 0; i < paragraphs.length; i += 2) { const paragraph = paragraphs[i] || ''; const closingTag = paragraphs[i + 1] || ''; const fullParagraph = paragraph + closingTag; // Check if adding this paragraph would exceed the target size if (currentPage.length + fullParagraph.length > targetPageSize && currentPage.length > 500) { // Save current page and start new one pages.push({ type: 'page', pageNumber: pageNumber, content: currentPage.trim(), isPageBreak: pageNumber > 1, }); currentPage = fullParagraph; pageNumber++; } else { currentPage += fullParagraph; } } // Add the last page if it has content if (currentPage.trim().length > 0) { pages.push({ type: 'page', pageNumber: pageNumber, content: currentPage.trim(), isPageBreak: pageNumber > 1, }); } return pages.length > 0 ? 
pages : [ { type: 'page', pageNumber: 1, content: content, isPageBreak: false, }, ]; } private async extractImages(buffer: Buffer): Promise<Map<string, string>> { const images = new Map<string, string>(); try { const options = { convertImage: mammoth.images.imgElement((image: any) => { return image.read('base64').then((imageBuffer: Buffer) => { const base64 = imageBuffer.toString('base64'); const contentType = this.getImageContentType(image.contentType); const dataUrl = `data:${contentType};base64,${base64}`; // Store image with a key for later reference const imageKey = `img_${images.size}`; images.set(imageKey, dataUrl); return { src: dataUrl, alt: `Embedded image (${image.contentType})`, }; }); }), }; await mammoth.convertToHtml({ buffer }, options); } catch (error) { console.warn('Could not extract images:', error); } return images; } private createPagesFromStructure(structure: any[], images: Map<string, string>): any[] { return structure.map((pageData) => { const content = this.parseHtmlContent(pageData.content, images); return { content: content, metadata: { pageNumber: pageData.pageNumber, hasPageBreak: pageData.isPageBreak, }, }; }); } private parseHtmlContent(html: string, images: Map<string, string>): any[] { const content: any[] = []; // Split HTML into text and image sections const parts = html.split(/(<img[^>]*>)/g); for (const part of parts) { if (part.trim() === '') continue; if (part.startsWith('<img')) { // Extract image information const srcMatch = part.match(/src="([^"]+)"/); const altMatch = part.match(/alt="([^"]+)"/); if (srcMatch) { const src = srcMatch[1]; const alt = altMatch ? 
altMatch[1] : 'Embedded image'; content.push({ type: 'image' as const, data: src, text: `[${alt}]`, }); } } else { // Clean up HTML and extract text const cleanText = part .replace(/<[^>]*>/g, ' ') // Remove HTML tags .replace(/\s+/g, ' ') // Normalize whitespace .trim(); if (cleanText.length > 0) { content.push({ type: 'text' as const, data: cleanText, text: cleanText, }); } } } return content; } private getImageContentType(mammothContentType: string): string { const typeMap: { [key: string]: string } = { 'image/png': 'image/png', 'image/jpeg': 'image/jpeg', 'image/jpg': 'image/jpeg', 'image/gif': 'image/gif', 'image/bmp': 'image/bmp', 'image/webp': 'image/webp', 'image/svg+xml': 'image/svg+xml', }; return typeMap[mammothContentType] || 'image/png'; } private extractTitleFromPath(filePath: string): string { const fileName = filePath.split(/[/\\]/).pop() || ''; return fileName.replace(/\.[^.]*$/, ''); } }