file2md
Version:
A TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX, HWP, HWPX) into Markdown with image and layout preservation
282 lines • 11.1 kB
JavaScript
/** Minimum width (in characters) a cell is padded to when approximating alignment. */
const MIN_ALIGNED_CELL_WIDTH = 10;

/** Padding spaces appended per extra column covered by a horizontally merged cell. */
const SPACES_PER_MERGED_COLUMN = 3;

/** Longest column text (in UTF-16 code units) kept by `createColumns` before truncation. */
const MAX_COLUMN_CONTENT_LENGTH = 200;

/**
 * Make text safe to place inside a single Markdown table cell:
 * unescaped pipes are backslash-escaped and line breaks become `<br>`.
 */
function escapeTableCellText(text) {
    return text.replace(/(?<!\\)\|/g, '\\|').replace(/\r?\n/g, '<br>');
}

/** Return the separator-row fragment for one column given the header cell's alignment. */
function separatorFragment(alignment) {
    if (!alignment) {
        return ' --- ';
    }
    switch (alignment) {
        case 'center':
            return ':---:';
        case 'right':
            return ' ---:';
        default:
            // 'left' and any other explicit value (e.g. 'justify') are left-aligned.
            return ':--- ';
    }
}

/** Build the `| --- | --- |` separator row, one fragment per table column. */
function buildSeparatorRow(headerCells, colCount, preserveAlignment) {
    let separator = '|';
    for (let colIndex = 0; colIndex < colCount; colIndex++) {
        const alignment = preserveAlignment ? headerCells[colIndex]?.alignment : undefined;
        separator += `${separatorFragment(alignment)}|`;
    }
    return separator;
}

/** Apply emphasis, merge padding, alignment padding and colour note to one cell's text. */
function renderTableCell(cell, formattedText, preserveAlignment, preserveColors) {
    let content = escapeTableCellText(formattedText);
    // Decide both wrappers BEFORE wrapping: otherwise the `**` added for bold
    // makes the italic check see an asterisk and skip bold+italic cells.
    const hasText = content.trim() !== '';
    const wrapBold = hasText && Boolean(cell.bold) && !content.includes('**');
    const wrapItalic = hasText && Boolean(cell.italic) && !content.includes('*');
    if (wrapBold) {
        content = `**${content}**`;
    }
    if (wrapItalic) {
        content = `*${content}*`;
    }
    // Markdown has no colspan; widen the cell with spaces instead. This is done
    // after emphasis so the closing delimiter is not preceded by whitespace.
    if (cell.merged && cell.colSpan > 1) {
        content += ' '.repeat((cell.colSpan - 1) * SPACES_PER_MERGED_COLUMN);
    }
    if (preserveAlignment && cell.alignment) {
        const cellWidth = Math.max(content.length, MIN_ALIGNED_CELL_WIDTH);
        if (cell.alignment === 'center') {
            const side = ' '.repeat(Math.floor((cellWidth - content.length) / 2));
            content = `${side}${content}${side}`;
        }
        else if (cell.alignment === 'right') {
            content = content.padStart(cellWidth);
        }
        // 'left' and 'justify' keep the default formatting.
    }
    if (preserveColors && cell.backgroundColor) {
        content += ` <!-- bg:${cell.backgroundColor} -->`;
    }
    return content;
}

/**
 * Helpers that turn layout structures (tables, lists, text boxes, columns,
 * headers/footers, dividers) into Markdown text.
 */
export class LayoutParser {
    // NOTE(review): nothing in this class increments the counter, so
    // `currentTableCount` always reports 0 — confirm which operation should count.
    tableCounter = 0;

    /**
     * Parse an advanced table with merged cells and styling.
     *
     * The first row that has a `cells` array is treated as the header row and is
     * followed by the separator row. Rows without `cells` are skipped. Every row is
     * padded to the widest row's column count.
     *
     * @param {object} tableData - Object with a `rows` array; each row has a `cells`
     *   array whose entries may carry `text`, `merged`, `colSpan`, `bold`, `italic`,
     *   `alignment` and `backgroundColor`.
     * @param {object} [options]
     * @param {boolean} [options.preserveAlignment=true] - Pad cells and emit
     *   alignment markers in the separator row.
     * @param {boolean} [options.preserveColors=false] - Append an HTML comment with
     *   the cell background colour.
     * @returns {string} Markdown table, or '' when there is nothing to render.
     */
    parseAdvancedTable(tableData, options = {}) {
        const rows = tableData?.rows;
        if (!rows || rows.length === 0) {
            return '';
        }
        const { preserveAlignment = true, preserveColors = false } = options ?? {};
        const colCount = rows.reduce((widest, row) => Math.max(widest, row?.cells?.length ?? 0), 0);
        if (colCount === 0) {
            return '';
        }
        let markdown = '';
        let separatorEmitted = false;
        for (const row of rows) {
            if (!row?.cells) {
                continue;
            }
            let rowMarkdown = '|';
            for (let colIndex = 0; colIndex < colCount; colIndex++) {
                const cell = row.cells[colIndex];
                if (!cell) {
                    rowMarkdown += ' |';
                    continue;
                }
                const rawText = cell.text == null ? '' : String(cell.text);
                // Convert in-cell markdown first (it relies on real newlines),
                // then escape for the table.
                const formatted = this.processCellFormatting(rawText);
                rowMarkdown += ` ${renderTableCell(cell, formatted, preserveAlignment, preserveColors)} |`;
            }
            markdown += `${rowMarkdown}\n`;
            // The separator follows the first row actually rendered, so a leading
            // row without cells cannot suppress it.
            if (!separatorEmitted) {
                markdown += `${buildSeparatorRow(row.cells, colCount, preserveAlignment)}\n`;
                separatorEmitted = true;
            }
        }
        return markdown;
    }

    /**
     * Parse lists with proper nesting.
     *
     * Children are indented by the width of the parent's marker plus one space
     * (2 for `-`, 3 for `1.`) so that Markdown renders them as nested lists.
     *
     * @param {object} listData - Object with `items` (each with `text`, optional
     *   `bold`, `italic`, `children`) and an optional `isOrdered` flag.
     * @returns {string} Markdown list, or '' when there are no items.
     */
    parseList(listData) {
        const rootItems = listData?.items;
        if (!rootItems || rootItems.length === 0) {
            return '';
        }
        const marker = listData.isOrdered ? '1.' : '-';
        const indentUnit = ' '.repeat(marker.length + 1);
        const renderItems = (items, level) => {
            let result = '';
            const indent = indentUnit.repeat(level);
            for (const item of items) {
                let itemText = item.text || '';
                if (item.bold) {
                    itemText = `**${itemText}**`;
                }
                if (item.italic) {
                    itemText = `*${itemText}*`;
                }
                result += `${indent}${marker} ${itemText}\n`;
                if (item.children && item.children.length > 0) {
                    result += renderItems(item.children, level + 1);
                }
            }
            return result;
        };
        return renderItems(rootItems, 0);
    }

    /**
     * Create text box representation as a blockquote titled "Text Box".
     *
     * @param {string} content - Text box body; null/undefined is treated as ''.
     * @param {object} [position] - Optional `{ x, y }`; when either is truthy a
     *   position comment is emitted first.
     * @returns {string} Markdown blockquote followed by a blank line.
     */
    createTextBox(content, position) {
        let markdown = '';
        if (position && (position.x || position.y)) {
            markdown += `<!-- Position: x=${position.x || 0}, y=${position.y || 0} -->\n`;
        }
        markdown += '> **Text Box**\n';
        markdown += '> \n';
        const lines = String(content ?? '').split(/\r?\n/);
        for (const line of lines) {
            markdown += `> ${line}\n`;
        }
        return `${markdown}\n`;
    }

    /**
     * Create multi-column layout approximation - only for genuine multi-column content.
     *
     * A one-row table is produced only when 2-4 columns have more than 10
     * non-whitespace-trimmed characters; otherwise the contents are joined with
     * blank lines. Table cell text is truncated to 200 characters.
     *
     * @param {Array<{content?: string}>} columns
     * @returns {string} Markdown for the columns ('' when there is nothing).
     */
    createColumns(columns) {
        if (!columns || columns.length <= 1) {
            return columns?.[0]?.content || '';
        }
        const substantialColumns = columns.filter(col => col?.content && col.content.trim().length > 10);
        if (substantialColumns.length <= 1) {
            // Not really multi-column: just concatenate whatever has content.
            return columns.map(col => col?.content || '').filter(c => c.trim()).join('\n\n');
        }
        if (substantialColumns.length > 4) {
            return substantialColumns.map(col => col.content || '').join('\n\n');
        }
        const header = `|${substantialColumns.map((_col, i) => ` Column ${i + 1} |`).join('')}`;
        const separator = `|${' --- |'.repeat(substantialColumns.length)}`;
        let body = '|';
        for (const column of substantialColumns) {
            const content = column.content || '';
            // Limit content length to prevent excessive table width.
            const truncated = content.length > MAX_COLUMN_CONTENT_LENGTH
                ? `${content.substring(0, MAX_COLUMN_CONTENT_LENGTH)}...`
                : content;
            body += ` ${escapeTableCellText(truncated)} |`;
        }
        return `${header}\n${separator}\n${body}\n\n`;
    }

    /**
     * Parse headers and footers into a commented blockquote.
     *
     * @param {string} content - Header/footer text; falsy yields ''.
     * @param {string} [type='header'] - 'header' selects the header marker, any
     *   other value the footer marker.
     * @returns {string}
     */
    parseHeaderFooter(content, type = 'header') {
        if (!content) {
            return '';
        }
        // NOTE(review): both marker literals look mis-encoded (probably emoji
        // originally) — restore from the upstream source before relying on them.
        const marker = type === 'header' ? 'š' : 'š»';
        return `<!-- Document ${type} -->\n> ${marker} ${content}\n\n`;
    }

    /**
     * Create divider/separator.
     *
     * @param {string} [style='simple'] - 'thick', 'dashed', 'dotted'; anything else
     *   yields a plain horizontal rule.
     * @returns {string}
     */
    createDivider(style = 'simple') {
        // NOTE(review): the 'thick' and 'dotted' literals look mis-encoded —
        // restore the intended glyphs from the upstream source.
        switch (style) {
            case 'thick':
                return '\nāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n\n';
            case 'dotted':
                return '\n⢠⢠⢠⢠⢠⢠⢠⢠⢠⢠⢠⢠⢠⢠⢠⢠⢠⢠⢠⢠ā¢\n\n';
            case 'dashed':
            default:
                return '\n---\n\n';
        }
    }

    /**
     * Calculate relative positioning for layout elements with improved grouping.
     *
     * Elements are ordered top-to-bottom, grouped into rows, and ordered
     * left-to-right inside each row. A row starts at its top-most element and
     * takes every following element whose Y is less than `rowThreshold` below that
     * start. Grouping (instead of a pairwise "close enough" comparator) keeps the
     * ordering deterministic: a pairwise tolerance is not transitive and is not a
     * valid sort comparator. Missing coordinates count as 0; ties keep input order.
     *
     * @param {Array<object>} elements - Items with an optional `position: { x, y }`.
     * @param {number} [rowThreshold=200] - Vertical tolerance for "same row".
     * @returns {Array<object>} New array; the input is not mutated.
     */
    calculateRelativePosition(elements, rowThreshold = 200) {
        if (!elements) {
            return [];
        }
        const placed = Array.from(elements, (element, index) => ({
            element,
            index,
            x: element?.position?.x || 0,
            y: element?.position?.y || 0,
        }));
        placed.sort((a, b) => a.y - b.y || a.index - b.index);
        const bands = [];
        for (const entry of placed) {
            const currentBand = bands[bands.length - 1];
            if (currentBand && entry.y - currentBand[0].y < rowThreshold) {
                currentBand.push(entry);
            }
            else {
                bands.push([entry]);
            }
        }
        return bands.flatMap(band => band
            .sort((a, b) => a.x - b.x || a.index - b.index)
            .map(entry => entry.element));
    }

    /**
     * Format text with approximate font sizes using headers.
     *
     * @param {string} text
     * @param {number|string} [fontSize] - Numeric size or a string whose leading
     *   number is used (e.g. '18px'); falsy or 'normal' leaves the text untouched.
     * @returns {string} Text prefixed with `#`..`####` for sizes >= 14, wrapped in
     *   `<small>` for sizes <= 10, otherwise unchanged.
     */
    formatWithSize(text, fontSize) {
        if (!fontSize || fontSize === 'normal') {
            return text;
        }
        const size = typeof fontSize === 'string' ? Number.parseFloat(fontSize) : fontSize;
        // A non-numeric string parses to NaN, fails every comparison and falls through.
        if (size >= 24) {
            return `# ${text}`;
        }
        if (size >= 20) {
            return `## ${text}`;
        }
        if (size >= 16) {
            return `### ${text}`;
        }
        if (size >= 14) {
            return `#### ${text}`;
        }
        if (size <= 10) {
            return `<small>${text}</small>`;
        }
        return text;
    }

    /**
     * Process markdown formatting within table cells.
     *
     * Header lines become bold text (levels 1-2 also upper-cased) because headers
     * do not work inside table cells, and underscore emphasis is normalised to
     * asterisks. Underscores inside words (e.g. `snake_case_name`) are left alone.
     * Asterisk emphasis is already in its final form and passes through.
     *
     * @param {string} text
     * @returns {string} The converted text; falsy input is returned unchanged.
     */
    processCellFormatting(text) {
        if (!text) {
            return text;
        }
        // `[ \t]+` rather than `\s+`: the gap after the hashes must stay on the
        // same line, otherwise a bare `#` line would swallow the following line.
        let result = text.replace(/^(#{1,6})[ \t]+(.+)$/gm, (_match, hashes, content) => (hashes.length <= 2
            ? `**${content.toUpperCase()}**`
            : `**${content}**`));
        // Delimiters must sit at a word edge and hug their content.
        result = result.replace(/(?<![\p{L}\p{N}_])__(?!\s)([^_]+)(?<!\s)__(?![\p{L}\p{N}_])/gu, '**$1**');
        result = result.replace(/(?<![\p{L}\p{N}_])_(?!\s)([^_]+)(?<!\s)_(?![\p{L}\p{N}_])/gu, '*$1*');
        return result;
    }

    /**
     * Reset internal counters.
     */
    reset() {
        this.tableCounter = 0;
    }

    /**
     * Get current table counter.
     *
     * @returns {number}
     */
    get currentTableCount() {
        return this.tableCounter;
    }
}
//# sourceMappingURL=layout-parser.js.map