UNPKG

file2md

Version:

A TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX, HWP, HWPX) into Markdown with image and layout preservation

github.com/ricky-clevi/file2md

ricky-clevi/file2md

1,165 lines • 50.1 kB

JavaScript

import JSZip from 'jszip'; import { XMLParser } from 'fast-xml-parser'; import { JSDOM } from 'jsdom'; import path from 'node:path'; import { Buffer } from 'node:buffer'; import { setupBrowserPolyfills } from '../utils/browser-polyfills.js'; import { ParseError } from '../types/errors.js'; /** * Build a relationship map for HWPX content files (rId -> target zip path) * HWPX follows OPC; relationships are stored alongside content files: * Contents/section0.xml -> Contents/_rels/section0.xml.rels */ async function buildRelationshipMap(zip, contentFileNames) { const relMap = {}; const parser = new XMLParser({ ignoreAttributes: false, attributeNamePrefix: '@_', trimValues: true }); for (const contentFileName of contentFileNames) { try { const dir = path.posix.dirname(contentFileName); const base = path.posix.basename(contentFileName); const relsPath = path.posix.join(dir, '_rels', `${base}.rels`); const relsFile = zip.file(relsPath); if (!relsFile) continue; const relsXml = await relsFile.async('string'); const rels = parser.parse(relsXml); const relationships = rels?.Relationships?.Relationship; if (!relationships) continue; const relArray = Array.isArray(relationships) ? 
relationships : [relationships]; for (const rel of relArray) { const relObj = rel; const id = relObj['@_Id'] || relObj['@_ID']; const targetRaw = relObj['@_Target'] || relObj['@_HRef']; if (!id || !targetRaw) continue; // Normalize target to a POSIX zip path and try to resolve to an existing entry const tryCandidates = []; let target = targetRaw.replace(/\\/g, '/'); if (target.startsWith('/')) { target = target.slice(1); // remove leading slash } // Candidate 1: resolve relative to the content file directory tryCandidates.push(path.posix.normalize(path.posix.join(dir, target))); // Candidate 2: as-is normalized (some rels already relative to root) tryCandidates.push(path.posix.normalize(target)); // Candidate 3: strip common prefixes (e.g., Contents/) if (target.includes('BinData/')) { const tail = target.split('BinData/').pop(); tryCandidates.push(`BinData/${tail}`); } const resolvedExisting = tryCandidates.find(c => !!zip.file(c)); // Store with r:id as key (common in content) relMap[id] = resolvedExisting || tryCandidates[0]; // Also store with potential 'r:id' prefix to increase hit rate in matching relMap[`r:${id}`] = relMap[id]; } } catch (e) { console.warn('Failed to parse relationship file for', contentFileName, e); } } return relMap; } /** * Augment relationship map using HWPX content manifest (Contents/content.hpf) * This file often maps binItem ids to actual BinData/* targets. 
/* (this marker closes the doc comment that is opened on the preceding source line)
 *
 * @param {object} zip - Loaded archive; only `zip.file(name)` and the entry's
 *   `async('string')` are used.
 * @param {Object<string, string>} relMap - Existing id -> target map. It is
 *   copied, never mutated.
 * @returns {Promise<Object<string, string>>} A copy of `relMap` extended with
 *   every id -> BinData target found in the manifest, or `relMap` itself when
 *   the manifest is missing or cannot be parsed.
 */
async function augmentRelationshipMapWithContentHpf(zip, relMap) {
    const contentHpf = zip.file('Contents/content.hpf');
    if (!contentHpf) {
        return relMap;
    }
    try {
        const xml = await contentHpf.async('string');
        const parser = new XMLParser({
            ignoreAttributes: false,
            attributeNamePrefix: '@_',
            trimValues: true
        });
        const parsed = parser.parse(xml);
        const idToTarget = { ...relMap };
        // Depth-limited walk over the parsed manifest; arrays and objects both
        // consume one level of depth per step.
        const visit = (node, depth = 0) => {
            if (!node || depth > 8 || typeof node !== 'object') {
                return;
            }
            if (Array.isArray(node)) {
                for (const item of node) {
                    visit(item, depth + 1);
                }
                return;
            }
            const id = node['@_id'] || node['@_ID'] || node['@_itemID'] || node['@_binItem'];
            const href = node['@_Target'] || node['@_HRef'] || node['@_href'] || node['@_path'] || node['@_src'];
            if (id && href && /BinData\//i.test(String(href))) {
                idToTarget[id] = normalizeBinDataTarget(href);
            }
            for (const value of Object.values(node)) {
                visit(value, depth + 1);
            }
        };
        visit(parsed, 0);
        return idToTarget;
    }
    catch (e) {
        console.warn('Failed to parse content.hpf for binItem mapping:', e);
        return relMap;
    }
}
/**
 * Turn a manifest href that mentions BinData/ (in any letter case) into a
 * normalized POSIX zip path.
 *
 * Backslashes become slashes and one leading slash is dropped. When the
 * exact-case segment `BinData/` is absent, everything up to and including the
 * last case-insensitive `bindata/` is replaced by `BinData/`.
 *
 * @param {*} href - Raw attribute value; coerced with String().
 * @returns {string} Normalized path.
 */
function normalizeBinDataTarget(href) {
    let target = String(href).replace(/\\/g, '/');
    if (target.startsWith('/')) {
        target = target.slice(1);
    }
    // Prefer explicit BinData prefix. The split must be case-insensitive like
    // the caller's test, otherwise the whole path ends up behind the prefix.
    if (!target.includes('BinData/')) {
        const tail = target.split(/BinData\//i).pop();
        if (tail) {
            target = `BinData/${tail}`;
        }
    }
    return path.posix.normalize(target);
}
/**
 * Parse HWP or HWPX buffer and convert to markdown.
 *
 * The format is chosen from the file signature (see detectHwpFormat).
 *
 * @param {Buffer} buffer - Raw file bytes.
 * @param {object} imageExtractor - Passed through to the format parser.
 * @param {object} chartExtractor - Passed through to the format parser.
 * @param {object} [options={}] - `extractImages: false` disables image extraction.
 * @returns {Promise<{markdown: string, images: Array, charts: Array, metadata: object}>}
 * @throws {ParseError} For an unrecognized signature or any parser failure;
 *   the original error is passed as the third constructor argument.
 */
export async function parseHwp(buffer, imageExtractor, chartExtractor, options = {}) {
    try {
        const format = detectHwpFormat(buffer);
        switch (format) {
            case 'hwp':
                return await parseHwpBinary(buffer, imageExtractor, chartExtractor, options);
            case 'hwpx':
                return await parseHwpxXml(buffer, imageExtractor, chartExtractor, options);
            default:
                throw new ParseError('HWP', 'Unsupported HWP format variant');
        }
    }
    catch (error) {
        const message = error instanceof Error ? error.message : 'Unknown error';
        throw new ParseError('HWP', message, error);
    }
}
/**
 * Detect HWP format based on file signature.
 *
 * @param {Buffer} buffer - Raw file bytes.
 * @returns {'hwp'|'hwpx'|'unknown'} 'hwp' for the 8-byte CFB/OLE2 signature,
 *   'hwpx' for the 4-byte ZIP local-file signature, otherwise 'unknown'.
 */
function detectHwpFormat(buffer) {
    if (buffer.length < 4) {
        return 'unknown';
    }
    // Check for CFB/OLE2 signature (HWP binary format)
    if (buffer.length >= 8) {
        const cfbSignature = buffer.subarray(0, 8);
        const expectedCfb = Buffer.from([0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1]);
        if (cfbSignature.equals(expectedCfb)) {
            return 'hwp';
        }
    }
    // Check for ZIP signature (HWPX format)
    const zipSignature = buffer.subarray(0, 4);
    const expectedZip = Buffer.from([0x50, 0x4B, 0x03, 0x04]);
    if (zipSignature.equals(expectedZip)) {
        return 'hwpx';
    }
    return 'unknown';
}
/**
 * Parse HWP binary format using hwp.js.
 *
 * hwp.js renders into a DOM, so a JSDOM document is installed as
 * `global.document` / `global.window` for the duration of the call. The
 * previous globals are restored and the JSDOM window is closed in `finally`.
 * Text is then scraped from the rendered container using three strategies of
 * increasing bluntness.
 *
 * @param {Buffer} buffer - Raw HWP bytes.
 * @param {object} imageExtractor - Only `imageDirectory` is read (by extractHwpImages).
 * @param {object} _chartExtractor - Unused.
 * @param {object} options - `extractImages: false` disables image extraction.
 * @returns {Promise<{markdown: string, images: Array, charts: Array, metadata: object}>}
 * @throws {ParseError} Wrapping any failure.
 */
async function parseHwpBinary(buffer, imageExtractor, _chartExtractor, options) {
    try {
        // Setup browser polyfills before importing hwp.js
        setupBrowserPolyfills();
        // Dynamic import of hwp.js to handle potential loading issues
        const { Viewer } = await import('hwp.js');
        // Convert Buffer to Uint8Array for hwp.js
        const uint8Array = new Uint8Array(buffer);
        // Create a virtual DOM environment for hwp.js
        const dom = new JSDOM('<!DOCTYPE html><html><body><div id="hwp-container"></div></body></html>', {
            pretendToBeVisual: true,
            resources: "usable"
        });
        const originalDocument = global.document;
        const originalWindow = global.window;
        const originalIntersectionObserver = global.IntersectionObserver;
        const originalResizeObserver = global.ResizeObserver;
        const originalMutationObserver = global.MutationObserver;
        // Set global DOM objects for hwp.js
        global.document = dom.window.document;
        global.window = dom.window;
        // Ensure our polyfills are available in the DOM window as well
        if (!dom.window.IntersectionObserver) {
            dom.window.IntersectionObserver = global.IntersectionObserver;
        }
        if (!dom.window.ResizeObserver) {
            dom.window.ResizeObserver = global.ResizeObserver;
        }
        if (!dom.window.MutationObserver) {
            dom.window.MutationObserver = global.MutationObserver;
        }
        try {
            const container = global.document.getElementById('hwp-container');
            if (!container) {
                throw new Error('Failed to create container element');
            }
            // Initialize hwp.js viewer with error handling
            let viewer;
            try {
                viewer = new Viewer(container, uint8Array);
                // Check if viewer was created successfully
                if (!viewer) {
                    throw new Error('Viewer instance is null or undefined');
                }
                // Verify viewer has expected properties
                const viewerObj = viewer;
                if (viewerObj && typeof viewerObj === 'object') {
                    console.log('Viewer created successfully');
                }
            }
            catch (viewerError) {
                console.warn('Failed to initialize hwp.js Viewer:', viewerError);
                throw new Error(`hwp.js Viewer initialization failed: ${viewerError instanceof Error ? viewerError.message : 'Unknown error'}`);
            }
            // Wait longer for viewer to process the document and render content
            await new Promise(resolve => setTimeout(resolve, 3000));
            // Try multiple approaches to extract actual content
            let textContent = [];
            // Attempt 1: Extract from viewer container
            textContent = extractTextFromViewer(container);
            // Attempt 2: If no meaningful content found, try direct viewer access
            if (textContent.length === 0 || isOnlyCopyrightMessage(textContent)) {
                textContent = extractFromViewerInstance(viewer, container);
            }
            // Attempt 3: If still no content, try broader DOM extraction
            if (textContent.length === 0 || isOnlyCopyrightMessage(textContent)) {
                textContent = extractFromEntireContainer(container);
            }
            // Parse images if requested
            const images = options.extractImages !== false
                ? await extractHwpImages(container, imageExtractor)
                : [];
            // Convert to markdown
            const markdown = convertHwpContentToMarkdown(textContent);
            return {
                markdown,
                images,
                charts: [], // hwp.js doesn't directly expose chart data
                metadata: {
                    format: 'hwp',
                    parser: 'hwp.js',
                    totalParagraphs: textContent.length
                }
            };
        }
        finally {
            // Restore global objects
            global.document = originalDocument;
            global.window = originalWindow;
            if (originalIntersectionObserver !== undefined) {
                global.IntersectionObserver = originalIntersectionObserver;
            }
            if (originalResizeObserver !== undefined) {
                global.ResizeObserver = originalResizeObserver;
            }
            if (originalMutationObserver !== undefined) {
                global.MutationObserver = originalMutationObserver;
            }
            // Shut the JSDOM window down so its timers and pending resource
            // loads do not outlive this call.
            dom.window.close();
        }
    }
    catch (error) {
        const message = error instanceof Error ? error.message : 'Unknown error';
        throw new ParseError('HWP', `Failed to parse HWP file with hwp.js: ${message}`, error);
    }
}
/**
 * Return a copy of the section file names ordered by their numeric index, so
 * that `section10.xml` follows `section9.xml` instead of `section1.xml`.
 * Names without a recognizable index sort last; ties fall back to name order.
 *
 * @param {string[]} sectionFiles - Zip paths such as `Contents/section3.xml`.
 * @returns {string[]} New, sorted array; the input is not mutated.
 */
function sortSectionFiles(sectionFiles) {
    const sectionIndex = (name) => {
        const match = /section(\d+)\.xml$/.exec(name);
        return match ? Number.parseInt(match[1], 10) : Number.MAX_SAFE_INTEGER;
    };
    return [...sectionFiles].sort((a, b) => sectionIndex(a) - sectionIndex(b) || a.localeCompare(b));
}
/**
 * Parse HWPX XML format using JSZip and fast-xml-parser.
 *
 * Content is taken from `Contents/section<N>.xml` files when any exist (all
 * of them, in numeric order); otherwise from the first match in a list of
 * known names, otherwise from the first `.xml` entry whose path contains
 * neither `_rels` nor `meta`.
 *
 * @param {Buffer} buffer - Raw HWPX (zip) bytes.
 * @param {object} imageExtractor - Must provide `saveImage(buffer, originalPath, basePath)`.
 * @param {object} _chartExtractor - Unused.
 * @param {object} options - `extractImages: false` disables image extraction.
 * @returns {Promise<{markdown: string, images: Array, charts: Array, metadata: object}>}
 * @throws {ParseError} When no content file is found or anything else fails.
 */
async function parseHwpxXml(buffer, imageExtractor, _chartExtractor, options) {
    try {
        const zip = await JSZip.loadAsync(buffer);
        // Log all files in the ZIP for debugging
        const allFiles = Object.keys(zip.files);
        console.log('HWPX archive contains files:', allFiles);
        // Find main content files in HWPX (OWPML format)
        // HWPX structure typically has sections in Contents/section0.xml, section1.xml, etc.
        const contentFiles = [
            'Contents/section0.xml',
            'Contents/section1.xml',
            'Contents/content.hpf',
            'Contents/header.xml',
            'Contents/content.xml',
            'content.xml',
            'Contents/document.xml',
            'document.xml',
            'Contents/body.xml',
            'body.xml',
            'version.xml',
            'mimetype'
        ];
        // Try to find any section files (end-anchored so e.g. "section0.xml.bak" is ignored)
        const sectionFiles = allFiles.filter(f => /Contents\/section\d+\.xml$/.test(f));
        if (sectionFiles.length > 0) {
            console.log('Found section files:', sectionFiles);
        }
        // Try to find XML files
        const xmlFiles = allFiles.filter(f => f.endsWith('.xml'));
        if (xmlFiles.length > 0) {
            console.log('Found XML files:', xmlFiles);
        }
        let contentFile = null;
        let contentFileName = '';
        // First try section files
        if (sectionFiles.length > 0) {
            contentFileName = sectionFiles[0];
            contentFile = zip.file(contentFileName);
        }
        // Then try our known content files
        if (!contentFile) {
            for (const fileName of contentFiles) {
                contentFile = zip.file(fileName);
                if (contentFile) {
                    contentFileName = fileName;
                    break;
                }
            }
        }
        // If still not found, try any XML file
        if (!contentFile && xmlFiles.length > 0) {
            for (const xmlFile of xmlFiles) {
                if (!xmlFile.includes('_rels') && !xmlFile.includes('meta')) {
                    contentFile = zip.file(xmlFile);
                    if (contentFile) {
                        contentFileName = xmlFile;
                        break;
                    }
                }
            }
        }
        if (!contentFile) {
            // Create a more informative error message
            const fileList = allFiles.slice(0, 10).join(', ');
            const moreFiles = allFiles.length > 10 ? ` ... and ${allFiles.length - 10} more files` : '';
            throw new ParseError('HWPX', `No content XML file found in HWPX archive. Files found: ${fileList}${moreFiles}`);
        }
        // Extract images from ZIP if requested (do this before parsing to pass images to parser)
        const images = options.extractImages !== false
            ? await extractHwpxImages(zip, imageExtractor)
            : [];
        // Sections in document order; a plain .sort() would put section10 before section2.
        const orderedSectionFiles = sortSectionFiles(sectionFiles);
        // Build relationships map for all content files we will parse
        const relContentFiles = orderedSectionFiles.length > 0 ? orderedSectionFiles : [contentFileName];
        let relationshipMap = await buildRelationshipMap(zip, relContentFiles);
        relationshipMap = await augmentRelationshipMapWithContentHpf(zip, relationshipMap);
        // Start the sequential image fallback from the first image for every
        // document; the counter lives on the function object and would
        // otherwise carry over from a previous conversion.
        findImageReference.__globalImageCounter = 0;
        const parser = new XMLParser({
            ignoreAttributes: false,
            attributeNamePrefix: '@_',
            textNodeName: '#text',
            parseAttributeValue: true,
            trimValues: true
        });
        // Parse all section files if multiple exist
        let allContent = '';
        if (orderedSectionFiles.length > 0) {
            // Process all section files in order
            for (const sectionFileName of orderedSectionFiles) {
                const sectionFile = zip.file(sectionFileName);
                if (sectionFile) {
                    const xmlContent = await sectionFile.async('string');
                    const parsedXml = parser.parse(xmlContent);
                    console.log(`Parsed HWPX section: ${sectionFileName}`);
                    // Convert each section to markdown and combine
                    const sectionMarkdown = convertOwpmlToMarkdown(parsedXml, images, relationshipMap);
                    if (sectionMarkdown && sectionMarkdown.trim()) {
                        allContent += `${sectionMarkdown}\n\n`;
                    }
                }
            }
        }
        else {
            // Parse single content file
            const xmlContent = await contentFile.async('string');
            const parsedXml = parser.parse(xmlContent);
            console.log(`Parsed HWPX XML from ${contentFileName}`);
            allContent = convertOwpmlToMarkdown(parsedXml, images, relationshipMap);
        }
        const markdown = allContent.trim() || '*No readable content found in HWPX file*';
        return {
            markdown,
            images,
            charts: [], // Basic implementation, can be enhanced later
            metadata: {
                format: 'hwpx',
                parser: 'jszip+fast-xml-parser',
                contentFile: contentFileName,
                zipEntries: Object.keys(zip.files).length
            }
        };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : 'Unknown error';
        throw new ParseError('HWPX', `Failed to parse HWPX file: ${message}`, error);
    }
}
/**
 * Check if text is the Korean copyright message from hwp.js.
 *
 * @param {string} text - Candidate text.
 * @returns {boolean} True when any of the known notice fragments occurs in it.
 */
function isCopyrightMessage(text) {
    const copyrightPatterns = [
        /본\s*제품은\s*한글과컴퓨터의\s*한\/글\s*문서\s*파일/,
        /Copyright\s*2020\s*Han\s*Lee/,
        /hanlee\.dev@gmail\.com/,
        /개발하였습니다/,
        /참고하여\s*개발/,
        /공개\s*문서를\s*참고/
    ];
    return copyrightPatterns.some(pattern => pattern.test(text));
}
/**
 * Check if the entire content array contains only copyright messages.
 *
 * @param {string[]} paragraphs - Extracted paragraphs.
 * @returns {boolean} True for an empty array or when every entry is blank or
 *   a copyright message.
 */
function isOnlyCopyrightMessage(paragraphs) {
    if (paragraphs.length === 0) {
        return true;
    }
    const nonCopyrightContent = paragraphs.filter(p => !isCopyrightMessage(p) && p.trim().length > 0);
    return nonCopyrightContent.length === 0;
}
/**
 * Attempt to extract content directly from hwp.js viewer instance.
 *
 * Reads `viewer.text` and, for each entry of `viewer.pages`, `page.text` and
 * `page.content`, splitting each on blank lines. Any error is logged and the
 * paragraphs collected so far are returned.
 *
 * @param {object} viewer - hwp.js viewer instance.
 * @param {object} _container - Unused.
 * @returns {string[]} Non-copyright paragraphs.
 */
function extractFromViewerInstance(viewer, _container) {
    const paragraphs = [];
    const splitParagraphs = (value) => value.split(/\n\s*\n/).filter(p => p.trim().length > 0);
    try {
        // Try to access viewer's internal content if available
        const viewerObj = viewer;
        if (viewerObj.text) {
            const text = viewerObj.text.trim();
            if (text && !isCopyrightMessage(text)) {
                paragraphs.push(...splitParagraphs(text));
            }
        }
        if (viewerObj.pages && Array.isArray(viewerObj.pages)) {
            for (const page of viewerObj.pages) {
                if (page.text && !isCopyrightMessage(page.text)) {
                    paragraphs.push(...splitParagraphs(page.text));
                }
                if (page.content && !isCopyrightMessage(page.content)) {
                    paragraphs.push(...splitParagraphs(page.content));
                }
            }
        }
    }
    catch (e) {
        console.warn('Failed to extract from viewer instance:', e);
    }
    return paragraphs.filter(p => !isCopyrightMessage(p));
}
/**
 * Extract all text from container, including from child elements.
 *
 * Walks every text node under `container`, keeps those longer than two
 * characters that are not copyright text, joins them with spaces and splits
 * the result on line breaks. If the walk fails, `container.textContent` is
 * split on line breaks instead.
 *
 * @param {object} container - DOM element (needs `ownerDocument` and `textContent`).
 * @returns {string[]} Non-copyright lines.
 */
function extractFromEntireContainer(container) {
    const paragraphs = [];
    try {
        // Get all text nodes recursively
        const ownerDocument = container.ownerDocument;
        if (!ownerDocument) {
            throw new Error('No owner document found');
        }
        // Node has no global NodeFilter; read the constant from the document's
        // own window and fall back to the DOM-standard value of SHOW_TEXT.
        const showText = ownerDocument.defaultView?.NodeFilter?.SHOW_TEXT ?? 0x4;
        const walker = ownerDocument.createTreeWalker(container, showText, null);
        const textNodes = [];
        let node = walker.nextNode();
        while (node !== null) {
            const text = node.textContent?.trim();
            if (text && text.length > 2 && !isCopyrightMessage(text)) {
                textNodes.push(text);
            }
            node = walker.nextNode();
        }
        // Combine adjacent text nodes and split by natural breaks
        const combinedText = textNodes.join(' ').trim();
        if (combinedText) {
            const lines = combinedText.split(/[\r\n]+/).filter(line => {
                const trimmed = line.trim();
                return trimmed.length > 0 && !isCopyrightMessage(trimmed);
            });
            paragraphs.push(...lines.map(line => line.trim()));
        }
    }
    catch (e) {
        console.warn('Failed to extract from entire container:', e);
        // Final fallback: just get textContent and clean it up
        const allText = container.textContent?.trim();
        if (allText && !isCopyrightMessage(allText)) {
            const cleaned = allText
                .split(/[\r\n]+/)
                .map(line => line.trim())
                .filter(line => line.length > 0 && !isCopyrightMessage(line));
            paragraphs.push(...cleaned);
        }
    }
    return paragraphs.filter(p => !isCopyrightMessage(p));
}
/**
 * Extract text content from hwp.js rendered DOM.
 *
 * Selectors are tried from most to least specific; the first selector that
 * yields any non-copyright text ends the search. If nothing is found, the
 * container's whole `textContent` is split into lines instead.
 *
 * @param {object} container - DOM element hwp.js rendered into.
 * @returns {string[]} Unique, non-copyright text fragments.
 */
function extractTextFromViewer(container) {
    const paragraphs = [];
    // Look for various text-containing elements, with improved hwp.js specific selectors
    const textSelectors = [
        // hwp.js specific selectors (v0.0.3 might use these)
        '.hwp-para',
        '.hwp-text',
        '.hwp-line',
        '.hwp-char',
        '[data-hwp-text]',
        '[data-text]',
        '[data-content]',
        // Generic selectors as fallback
        'p',
        'div[data-type="paragraph"]',
        'div[data-type="text"]',
        'span[data-type="text"]',
        'div[style*="text"]',
        'span',
        'div'
    ];
    for (const selector of textSelectors) {
        try {
            const elements = container.querySelectorAll(selector);
            elements.forEach(element => {
                const text = element.textContent?.trim();
                if (text && text.length > 0 && !paragraphs.includes(text) && !isCopyrightMessage(text)) {
                    paragraphs.push(text);
                }
            });
            // Only break if we found meaningful content (not just copyright)
            if (paragraphs.length > 0 && !isOnlyCopyrightMessage(paragraphs)) {
                break;
            }
        }
        catch {
            continue; // Try next selector
        }
    }
    // Fallback: get all text content and filter
    if (paragraphs.length === 0 || isOnlyCopyrightMessage(paragraphs)) {
        const allText = container.textContent?.trim();
        if (allText) {
            // Split by common paragraph separators
            const lines = allText.split(/\n\s*\n|\r\n\s*\r\n|\n/);
            const filteredLines = lines
                .filter(line => line.trim().length > 0)
                .filter(line => !isCopyrightMessage(line.trim()))
                .map(line => line.trim());
            if (filteredLines.length > 0) {
                paragraphs.length = 0; // Clear any copyright-only content
                paragraphs.push(...filteredLines);
            }
        }
    }
    return paragraphs.filter(p => !isCopyrightMessage(p));
}
/**
 * Extract images from HWP binary format.
 *
 * Only elements whose `src` is a base64 `data:image/...` URL are reported.
 * NOTE: the image bytes are decoded to measure their size but are NOT written
 * to disk here, so `savedPath` is the intended location only.
 *
 * @param {object} container - DOM element hwp.js rendered into.
 * @param {object} imageExtractor - Only `imageDirectory` is read.
 * @returns {Promise<Array<{originalPath: string, savedPath: string, format: string, size: number}>>}
 */
async function extractHwpImages(container, imageExtractor) {
    const images = [];
    try {
        // Look for image elements in the rendered content
        const imgElements = container.querySelectorAll('img, canvas, [style*="background-image"]');
        imgElements.forEach((element, index) => {
            const src = element.getAttribute('src');
            if (src && src.startsWith('data:')) {
                // Handle base64 embedded images
                try {
                    const matches = src.match(/data:image\/([^;]+);base64,(.+)/);
                    if (matches) {
                        const format = matches[1];
                        const base64Data = matches[2];
                        const imageBuffer = Buffer.from(base64Data, 'base64');
                        const filename = `hwp-image-${index + 1}.${format}`;
                        const imagePath = path.join(imageExtractor.imageDirectory, filename);
                        // Note: This is a simplified approach
                        // In a real implementation, you'd save the buffer to disk
                        images.push({
                            originalPath: `hwp-embedded-${index + 1}`,
                            savedPath: imagePath,
                            format,
                            size: imageBuffer.length
                        });
                    }
                }
                catch (e) {
                    console.warn(`Failed to process embedded image ${index + 1}:`, e);
                }
            }
        });
    }
    catch (e) {
        console.warn('Failed to extract images from HWP:', e);
    }
    return images;
}
/**
 * Extract images from HWPX ZIP archive.
 *
 * Every non-directory entry whose name ends in a known image extension is
 * read and handed to `imageExtractor.saveImage`. Entries that fail are logged
 * and skipped.
 *
 * @param {object} zip - Loaded archive (`zip.files` is iterated).
 * @param {object} imageExtractor - Must provide `saveImage(buffer, originalPath, basePath)`.
 * @returns {Promise<Array<{originalPath: string, savedPath: string, format: string, size: number}>>}
 */
async function extractHwpxImages(zip, imageExtractor) {
    const images = [];
    try {
        // Look for image files in the ZIP archive - typically in BinData folder
        const imageExtensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'];
        for (const [fileName, file] of Object.entries(zip.files)) {
            if (!file.dir && imageExtensions.some(ext => fileName.toLowerCase().endsWith(ext))) {
                try {
                    const imageBuffer = await file.async('nodebuffer');
                    const extension = path.extname(fileName).slice(1).toLowerCase();
                    // Actually save the image to disk using imageExtractor
                    // Pass originalPath and basePath (empty string for base)
                    const savedPath = await imageExtractor.saveImage(imageBuffer, fileName, '');
                    if (savedPath) {
                        images.push({
                            originalPath: fileName,
                            savedPath,
                            format: extension,
                            size: imageBuffer.length
                        });
                        console.log(`Extracted and saved image: ${fileName} -> ${savedPath}`);
                    }
                }
                catch (e) {
                    console.warn(`Failed to extract image ${fileName}:`, e);
                }
            }
        }
    }
    catch (e) {
        console.warn('Failed to extract images from HWPX:', e);
    }
    return images;
}
/**
 * Convert HWP text content to markdown.
 *
 * Heuristics: a first paragraph shorter than 50 characters that is not a
 * numbered item becomes an H1; numbered and bulleted items are followed by a
 * single newline; everything else by a blank line. Tildes are escaped.
 *
 * @param {string[]} textContent - Extracted paragraphs.
 * @returns {string} Markdown, or an italic explanatory message when nothing
 *   usable remains.
 */
function convertHwpContentToMarkdown(textContent) {
    // Filter out any remaining copyright messages
    const filteredContent = textContent.filter(p => !isCopyrightMessage(p) && p.trim().length > 0);
    if (filteredContent.length === 0) {
        return '*No readable content found in HWP file. The file may be corrupted, encrypted, or contain only images/graphics.*';
    }
    let markdown = '';
    filteredContent.forEach((paragraph, index) => {
        if (paragraph.trim().length === 0) {
            return;
        }
        // Escape tilde characters to prevent strikethrough formatting in markdown
        const escapedParagraph = paragraph.replace(/~/g, '\\~');
        // Simple heuristics for formatting
        if (escapedParagraph.length < 50 && index === 0 && !escapedParagraph.match(/^[0-9]+\./)) {
            // Likely a title
            markdown += `# ${escapedParagraph}\n\n`;
        }
        else if (escapedParagraph.match(/^[0-9]+\./)) {
            // Numbered list item
            markdown += `${escapedParagraph}\n`;
        }
        else if (escapedParagraph.match(/^[-•*]/)) {
            // Bullet list item
            markdown += `${escapedParagraph}\n`;
        }
        else {
            // Regular paragraph
            markdown += `${escapedParagraph}\n\n`;
        }
    });
    const result = markdown.trim();
    // Final check - if we only got copyright or very short content, provide helpful message
    if (result.length < 10 || isCopyrightMessage(result)) {
        return '*Unable to extract meaningful content from HWP file. This may be due to the limitations of the hwp.js library version 0.0.3 or the file format. Consider converting the file to HWPX format for better results.*';
    }
    return result;
}
/**
 * Smart join markdown parts with appropriate spacing.
 *
 * Blank parts are skipped. Between two consecutive non-blank parts either a
 * blank line (paragraph break) or a single space (sentence continuation) is
 * inserted, as decided by shouldAddParagraphBreakBetween. Tildes are escaped.
 *
 * @param {string[]} parts - Text and image fragments in document order.
 * @returns {string} Joined markdown.
 */
function smartJoinMarkdownParts(parts) {
    if (parts.length === 0) {
        return '';
    }
    const result = [];
    let previousPart = null;
    for (const rawPart of parts) {
        const part = rawPart.trim();
        if (!part) {
            continue;
        }
        if (previousPart !== null) {
            // A paragraph break must be a real blank line; pushing empty
            // strings into a ''-joined array would glue the parts together.
            result.push(shouldAddParagraphBreakBetween(previousPart, part) ? '\n\n' : ' ');
        }
        // Escape tilde characters to prevent strikethrough formatting in markdown
        result.push(part.replace(/~/g, '\\~'));
        previousPart = part;
    }
    return result.join('');
}
/**
 * Determine if we should add a paragraph break between two parts.
 *
 * @param {string} prevPart - Preceding fragment (trimmed).
 * @param {string} currentPart - Following fragment (trimmed).
 * @returns {boolean} True when a blank line should separate the two.
 */
function shouldAddParagraphBreakBetween(prevPart, currentPart) {
    // Images always get paragraph breaks
    if (prevPart.startsWith('![') || currentPart.startsWith('![')) {
        return true;
    }
    // Headings, list items get paragraph breaks
    if (currentPart.match(/^#{1,6}\s/) ||
        currentPart.match(/^[-*+]\s/) ||
        currentPart.match(/^\d+\.\s/)) {
        return true;
    }
    // If previous part ends with sentence terminator and current part starts a new sentence
    const prevEndsWithSentence = /[.!?]$/.test(prevPart);
    const currentStartsWithNewSentence = /^[A-Z가-힣]/.test(currentPart) && !currentPart.startsWith(',');
    // If previous part ends with a unit/counter word that suggests a continuation.
    // NOTE(review): this tests the very last characters, so it looks like it can
    // never be true together with prevEndsWithSentence - confirm intent.
    const prevEndsWithAbbreviation = /(약|등|명|년|월|일|시|분|초|km|m|%)$/.test(prevPart);
    // Add paragraph break if:
    // 1. Previous part ends with sentence terminator AND current part starts a new sentence
    // 2. BUT don't add break if it looks like an abbreviation or continuation
    if (prevEndsWithSentence && currentStartsWithNewSentence && !prevEndsWithAbbreviation) {
        return true;
    }
    // Special patterns that indicate paragraph breaks
    const paragraphStartPatterns = [
        /^ㅇ\s/, // Korean bullet point
        /^그리고/,
        /^또한/,
        /^한편/,
        /^아울러/,
        /^우선/,
        /^따라서/,
        /^그러나/,
        /^하지만/,
        /^또는/,
        /^및/,
        /^또/,
        /^이에/,
        /^여기에/,
        /^이로써/,
        /^결국/,
        /^마지막으로/
    ];
    const isParagraphStart = paragraphStartPatterns.some(pattern => pattern.test(currentPart));
    if (isParagraphStart) {
        return true;
    }
    // Statistical/data patterns that often start new paragraphs
    const dataPatterns = [
        /^\*\s/, // Bullet points with asterisk
        /^\d+%\s?$/, // Percentage alone
        /^\d+명\s?$/, // Number + "명" (people)
        /^\d+년\s?$/, // Year
        /^\d+월\s?$/, // Month
        /^\d+일\s?$/, // Day
        /^\d+시\s?$/, // Hour
        /^\d+분\s?$/, // Minute
        /^\d+초\s?$/, // Second
        /^평일\s/,
        /^주말\s/,
        /^금요일\s/,
        /^토요일\s/,
        /^일요일\s/,
        /^월요일\s/,
        /^화요일\s/,
        /^수요일\s/,
        /^목요일\s/
    ];
    const isDataStart = dataPatterns.some(pattern => pattern.test(currentPart));
    if (isDataStart && prevEndsWithSentence) {
        return true;
    }
    return false;
}
/**
 * Convert OWPML structure to markdown.
 *
 * @param {object} owpmlData - Parsed section/content XML.
 * @param {Array} [images=[]] - Extracted images (`originalPath`, `savedPath`).
 * @param {Object<string, string>} [relationshipMap={}] - id -> zip path map.
 * @returns {string} Markdown; a placeholder message when nothing readable was
 *   found or when processing threw.
 */
function convertOwpmlToMarkdown(owpmlData, images = [], relationshipMap = {}) {
    let markdown = '';
    try {
        // HWPX uses hp:p for paragraphs and hp:t for text
        // Navigate the structure to find text nodes and image references
        const contentItems = [];
        const positionCounter = { value: 0 };
        // Extract all text content and image references recursively
        extractContentNodes(owpmlData, contentItems, positionCounter, images, relationshipMap);
        // DEBUG: Log extracted content items
        console.log('[DEBUG] Total content items extracted:', contentItems.length);
        console.log('[DEBUG] Content items breakdown:');
        contentItems.forEach((item, index) => {
            console.log(`[DEBUG] Item ${index}: type=${item.type}, position=${item.position}, content="${item.content.substring(0, 50)}${item.content.length > 50 ? '...' : ''}"`);
        });
        // Sort by position to maintain document order
        contentItems.sort((a, b) => a.position - b.position);
        // Build markdown with text and image references
        if (contentItems.length > 0) {
            const markdownParts = [];
            for (const item of contentItems) {
                if (item.type === 'text' && item.content.trim().length > 0) {
                    markdownParts.push(item.content);
                }
                else if (item.type === 'image' && item.content.trim().length > 0) {
                    markdownParts.push(item.content);
                }
            }
            // DEBUG: Log how parts are being joined
            console.log('[DEBUG] Total markdown parts:', markdownParts.length);
            console.log('[DEBUG] Markdown parts before joining:');
            markdownParts.forEach((part, index) => {
                console.log(`[DEBUG] Part ${index}: "${part.substring(0, 50)}${part.length > 50 ? '...' : ''}"`);
            });
            console.log('[DEBUG] Joining with smart spacing - single line breaks between paragraphs');
            // Smart joining: blank line for paragraph separation, single space for flow
            markdown = smartJoinMarkdownParts(markdownParts);
        }
        if (!markdown.trim()) {
            markdown = '*No readable content found in HWPX file*';
        }
    }
    catch (e) {
        console.warn('Error processing OWPML structure:', e);
        markdown = '*Error processing HWPX content*';
    }
    return markdown;
}
/**
 * Extract content nodes (text and images) from OWPML structure.
 *
 * Appends `{type, content, position}` items to `contentItems`, taking each
 * position from `positionCounter.value` (post-incremented).
 *
 * NOTE(review): paragraphs under `hp:p` / `p` are handled explicitly AND the
 * generic recursion at the end visits the same keys again, so paragraph text
 * looks like it can be emitted twice - confirm against real documents.
 *
 * @param {*} obj - Current node of the parsed XML.
 * @param {Array} contentItems - Output list (mutated).
 * @param {{value: number}} positionCounter - Shared position counter (mutated).
 * @param {Array} images - Extracted images.
 * @param {Object<string, string>} relationshipMap - id -> zip path map.
 */
function extractContentNodes(obj, contentItems, positionCounter, images, relationshipMap) {
    if (!obj) {
        return;
    }
    // If it's a string and looks like actual text (not XML attribute values)
    if (typeof obj === 'string') {
        // Filter out numeric-only strings, single words that look like attribute values
        const trimmed = obj.trim();
        if (trimmed &&
            !trimmed.match(/^[0-9\s.-]+$/) && // Skip pure numbers
            !trimmed.match(/^[A-Z_]+$/) && // Skip constants like "BOTH", "LEFT_ONLY"
            trimmed.length > 2 && // Skip very short strings
            !trimmed.includes('pixel') && // Skip image metadata
            !trimmed.startsWith('그림입니다') && // Skip image placeholders
            !trimmed.includes('원본 그림')) { // Skip image descriptions
            contentItems.push({
                type: 'text',
                content: trimmed,
                position: positionCounter.value++
            });
        }
        return;
    }
    // If it's an array, process each item
    if (Array.isArray(obj)) {
        for (const item of obj) {
            extractContentNodes(item, contentItems, positionCounter, images, relationshipMap);
        }
        return;
    }
    // If it's an object, look for content
    if (typeof obj === 'object' && obj !== null) {
        // Check for image/drawing references in HWPX
        // HWPX can have hp:pic, PICTURE, IMAGE, or drawing objects
        if (obj['hp:pic'] || obj['PICTURE'] || obj['IMAGE'] || obj['hp:draw'] || obj['DRAWING']) {
            // Try to find a matching image and insert reference
            const imageRef = findImageReference(obj, images, relationshipMap);
            if (imageRef) {
                contentItems.push({
                    type: 'image',
                    content: imageRef,
                    position: positionCounter.value++
                });
            }
        }
        // HWPX specific text node handling
        // Look for hp:p (paragraphs) and hp:t (text) nodes
        if (obj['hp:p']) {
            const paragraphs = Array.isArray(obj['hp:p']) ? obj['hp:p'] : [obj['hp:p']];
            for (const para of paragraphs) {
                extractParagraphContent(para, contentItems, positionCounter, images, relationshipMap);
            }
        }
        // Also check for plain p nodes
        if (obj.p) {
            const paragraphs = Array.isArray(obj.p) ? obj.p : [obj.p];
            for (const para of paragraphs) {
                extractParagraphContent(para, contentItems, positionCounter, images, relationshipMap);
            }
        }
        // Check for TEXT nodes
        if (obj.TEXT) {
            const textNodes = Array.isArray(obj.TEXT) ? obj.TEXT : [obj.TEXT];
            for (const textNode of textNodes) {
                const rawText = textNode?.['#text'];
                if (rawText) {
                    // The XML parser may hand back a number for numeric text.
                    const text = String(rawText).trim();
                    if (text && !isMetadata(text)) {
                        contentItems.push({
                            type: 'text',
                            content: text,
                            position: positionCounter.value++
                        });
                    }
                }
            }
        }
        // Recursively process all properties
        for (const [key, value] of Object.entries(obj)) {
            // Skip attribute keys and known metadata keys
            if (!key.startsWith('@_') &&
                !key.startsWith('_') &&
                key !== 'SECDEF' &&
                key !== 'DOCSUMMARY' &&
                key !== 'MAPPINGTABLE' &&
                key !== 'COMPATIBLE_DOCUMENT' &&
                key !== 'LAYOUTCOMPATIBILITY') {
                extractContentNodes(value, contentItems, positionCounter, images, relationshipMap);
            }
        }
    }
}
/**
 * Find image reference based on drawing/picture object.
 *
 * Resolution order:
 *  1. an id found inside the object, resolved through `relationshipMap`
 *     (exact path, then suffix match), then by substring of the image paths;
 *  2. a literal `BinData/<name>.<ext>` string or a binItem id inside the object;
 *  3. the next image in sequence, using a counter stored on this function
 *     object (`__globalImageCounter`).
 *
 * @param {object} drawingObj - Parsed node that contains the picture/drawing.
 * @param {Array<{originalPath: string, savedPath: string}>} images - Extracted images.
 * @param {Object<string, string>} relationshipMap - id -> zip path map.
 * @returns {string|null} Markdown image reference, or null when there are no
 *   images or an error occurred.
 */
function findImageReference(drawingObj, images, relationshipMap) {
    // Guard first: nothing below is meaningful (or safe) without images.
    if (!images || images.length === 0) {
        return null;
    }
    // Reset global counter at the start of each conversion to ensure fresh sequence
    // This is a simple approach - in production you might want a more sophisticated reset mechanism
    if (!('__globalImageCounter' in findImageReference) ||
        findImageReference.__globalImageCounter >= images.length * 2) {
        findImageReference.__globalImageCounter = 0;
        console.log('[DEBUG] Reset global image counter for new conversion');
    }
    try {
        // Try to find an image ID or reference in the drawing object
        const obj = drawingObj;
        // Look for common image reference patterns
        let imageId = null;
        // Helper: recursively search for an image relationship id within the drawing object
        const findRidDeep = (o, depth = 0) => {
            if (!o || depth > 4) {
                return null;
            }
            if (typeof o !== 'object') {
                return null;
            }
            const r = o;
            const directId = r['@_id'] || r['@_refId'] || r['@_href'] || r['@_r:id'] || r['@_rId'] || r['@_rid'] || r['refId'] || r['r:id'];
            if (directId && typeof directId === 'string') {
                return directId;
            }
            for (const v of Object.values(r)) {
                const nested = findRidDeep(v, depth + 1);
                if (nested) {
                    return nested;
                }
            }
            return null;
        };
        imageId = findRidDeep(obj);
        // If we found an ID, try to match it with our extracted images
        if (imageId && typeof imageId === 'string') {
            // Resolve via relationships first (rId -> target path inside zip)
            // Normalize id to check variants (with/without r:)
            const variants = [imageId, imageId.startsWith('r:') ? imageId.slice(2) : `r:${imageId}`];
            const targetPath = variants.map(v => relationshipMap[v]).find(Boolean);
            let matchingImage;
            if (targetPath) {
                matchingImage = images.find(img => img.originalPath.replace(/\\/g, '/').toLowerCase() === targetPath.toLowerCase());
                // Also try endsWith for safety if some paths differ in prefixes
                if (!matchingImage) {
                    matchingImage = images.find(img => targetPath.toLowerCase().endsWith(img.originalPath.replace(/\\/g, '/').toLowerCase()) ||
                        img.originalPath.replace(/\\/g, '/').toLowerCase().endsWith(targetPath.toLowerCase()));
                }
            }
            // Fallback to substring match
            if (!matchingImage && typeof imageId === 'string') {
                matchingImage = images.find(img => img.originalPath.includes(imageId) ||
                    img.savedPath.includes(imageId));
            }
            if (matchingImage) {
                const imageName = path.basename(matchingImage.savedPath);
                const markdownRef = `![Image](images/${imageName})`;
                console.log(`[DEBUG] Found matching image: ${markdownRef} (originalPath: ${matchingImage.originalPath}, savedPath: ${matchingImage.savedPath})`);
                return markdownRef;
            }
        }
        // If no rId path, look for direct BinData references OR binItem id links
        const findDirectImageTarget = (o, depth = 0) => {
            if (!o || depth > 6) {
                return null;
            }
            if (typeof o === 'string') {
                const s = o.trim();
                const m = s.match(/BinData\/[\w.-]+\.(png|jpg|jpeg|gif|bmp|tiff)/i);
                if (m) {
                    return m[0];
                }
                return null;
            }
            if (typeof o === 'object') {
                const r = o;
                // Check common attributes
                const candidates = [r['@_Target'], r['@_HRef'], r['@_src'], r['@_href'], r['@_path'], r['@_file']];
                for (const c of candidates) {
                    const found = findDirectImageTarget(c, depth + 1);
                    if (found) {
                        return found;
                    }
                }
                // If binItem id present, map via relationshipMap as a second step
                const binId = r['@_binItemRef'] || r['@_binItem'] || r['@_idref'] || r['@_idRef'];
                if (binId && relationshipMap[binId]) {
                    return relationshipMap[binId];
                }
                for (const v of Object.values(r)) {
                    const found = findDirectImageTarget(v, depth + 1);
                    if (found) {
                        return found;
                    }
                }
            }
            return null;
        };
        const directTarget = findDirectImageTarget(obj);
        if (directTarget) {
            const targetLc = directTarget.replace(/\\/g, '/').toLowerCase();
            const matchingImage = images.find(img => {
                const origLc = img.originalPath.replace(/\\/g, '/').toLowerCase();
                return origLc === targetLc || origLc.endsWith(targetLc) || targetLc.endsWith(origLc);
            });
            if (matchingImage) {
                const imageName = path.basename(matchingImage.savedPath);
                const markdownRef = `![Image](images/${imageName})`;
                console.log(`[DEBUG] Found direct target match: ${markdownRef} (originalPath: ${matchingImage.originalPath}, savedPath: ${matchingImage.savedPath})`);
                return markdownRef;
            }
        }
        // If no specific match found, but we do have extracted images, use a global counter
        // to ensure each image reference gets a different image in sequence
        if (!('__globalImageCounter' in findImageReference)) {
            findImageReference.__globalImageCounter = 0;
        }
        const globalCounter = findImageReference.__globalImageCounter;
        const selected = images[globalCounter % images.length];
        findImageReference.__globalImageCounter = globalCounter + 1;
        if (selected) {
            const imageName = path.basename(selected.savedPath);
            const markdownRef = `![Image](images/${imageName})`;
            console.log(`[DEBUG] Using sequential image reference: ${markdownRef} (counter: ${globalCounter}, total images: ${images.length})`);
            console.log(`[DEBUG] Selected image: originalPath=${selected.originalPath}, savedPath=${selected.savedPath}`);
            return markdownRef;
        }
        console.log(`[DEBUG] No fallback image available (total images: ${images.length})`);
        return null;
    }
    catch (e) {
        console.warn('Error finding image reference:', e);
    }
    return null;
}
/**
 * Extract content from a paragraph node (both text and images)
 */
function extractParagraphContent(para, contentItems, positionCounter, images, relationshipMap) {
    // para:            parsed paragraph node (plain object produced by the XML parser)
    // contentItems:    output array; receives { type, content, position } entries
    // positionCounter: shared { value } counter, post-incremented for every pushed item
    // images / relationshipMap: forwarded untouched to findImageReference
    if (!para)
        return;
    console.log('[DEBUG] Processing new paragraph');
    // An image attached to the paragraph is emitted before the paragraph's text.
    const obj = para;
    if (obj['hp:pic'] || obj['PICTURE'] || obj['IMAGE'] || obj['hp:draw'] || obj['DRAWING']) {
        const imageRef = findImageReference(obj, images, relationshipMap);
        if (imageRef) {
            contentItems.push({
                type: 'image',
                content: imageRef,
                position: positionCounter.value++
            });
        }
    }
    // All text of the paragraph is merged into one content item.
    const combinedText = extractCombinedParagraphText(para);
    if (combinedText && combinedText.trim().length > 0) {
        console.log(`[DEBUG] Adding combined paragraph text: "${combinedText.substring(0, 50)}${combinedText.length > 50 ? '...' : ''}"`);
        contentItems.push({
            type: 'text',
            content: combinedText.trim(),
            position: positionCounter.value++
        });
    }
}
/**
 * Extract and combine all text content from a paragraph into a single string.
 *
 * Text is gathered, in this order, from:
 *   1. the runs (`hp:run` / `run` / `RUN`) and their text child
 *      (`hp:t` / `t` / `T` / `#text`),
 *   2. the paragraph's own `#text`,
 *   3. the paragraph's `TEXT` children.
 * Each text child may be a string, an object carrying a `#text` string, or an
 * array of those (repeated elements). Values that are not strings (for example
 * numbers) are skipped instead of being trimmed, so they cannot throw.
 * Segments rejected by isMetadata() are dropped.
 *
 * @param {object|null|undefined} para - Parsed paragraph node.
 * @returns {string} The accepted segments joined by single spaces ('' if none).
 */
function extractCombinedParagraphText(para) {
    if (!para)
        return '';
    const textSegments = [];
    // Trim, filter and store one raw value; `debugSource` selects the debug line
    // that is printed (no debug line when it is omitted).
    const pushSegment = (raw, debugSource) => {
        if (typeof raw !== 'string')
            return;
        const text = raw.trim();
        if (!text || isMetadata(text))
            return;
        if (debugSource) {
            console.log(`[DEBUG] Adding text segment from ${debugSource}: "${text.substring(0, 30)}${text.length > 30 ? '...' : ''}"`);
        }
        textSegments.push(text);
    };
    // Accept a text node in any of its shapes: string, { '#text': string },
    // or an array mixing both.
    const pushTextNode = (node, plainSource, objectSource) => {
        const nodes = Array.isArray(node) ? node : [node];
        for (const item of nodes) {
            if (typeof item === 'string') {
                pushSegment(item, plainSource);
            }
            else if (item && typeof item === 'object') {
                pushSegment(item['#text'], objectSource);
            }
        }
    };
    // Look for hp:run or run nodes
    const runs = para['hp:run'] || para['run'] || para['RUN'];
    if (runs) {
        const runArray = Array.isArray(runs) ? runs : [runs];
        console.log(`[DEBUG] Processing ${runArray.length} runs in paragraph for combination`);
        for (const run of runArray) {
            if (run == null)
                continue;
            // Look for hp:t or t nodes (text content)
            const textNode = run['hp:t'] || run['t'] || run['T'] || run['#text'];
            if (textNode) {
                pushTextNode(textNode, 'run', 'run.#text');
            }
        }
    }
    // Also check for direct text content
    pushSegment(para['#text']);
    // Check for TEXT child nodes
    if (para.TEXT) {
        pushTextNode(para.TEXT);
    }
    // Combine all text segments with single spaces
    const combinedText = textSegments.join(' ');
    console.log(`[DEBUG] Combined ${textSegments.length} segments into: "${combinedText.substring(0, 50)}${combinedText.length > 50 ? '...' : ''}"`);
    return combinedText;
}
/**
 * Check if a string looks like metadata rather than document content.
 *
 * This is a heuristic: a string is treated as metadata when it consists only of
 * uppercase letters/underscores, only of digits/whitespace/dots/dashes, mentions
 * "pixel", starts with or contains the Korean image placeholders, contains a
 * `.jpg` / `.png` / `.bmp` file extension, contains `http://`, or is shorter
 * than three characters.
 *
 * @param {string} text - Candidate text (callers pass an already trimmed string).
 * @returns {boolean} true when the text should be excluded from the output.
 */
function isMetadata(text) {
    return /^[A-Z_]+$/.test(text) || // Constants
        /^[0-9\s.-]+$/.test(text) || // Pure numbers
        text.includes('pixel') || // Image metadata
        text.startsWith('그림입니다') || // Korean "This is an image"
        text.includes('원본 그림') || // Korean "Original image"
        text.includes('.jpg') || // File names
        text.includes('.png') ||
        text.includes('.bmp') ||
        text.includes('http://') || // URLs in metadata
        text.length < 3; // Very short strings
}
//# sourceMappingURL=hwp-parser.js.map