UNPKG

@knowcode/convert-to-markdown

Version:

Convert Excel, PDF, and Word documents to clean, AI-ready formats like Markdown and JSON

147 lines (127 loc) 4.5 kB
/** * PDF converter functions * Handles conversion of PDF files to Markdown format */ const pdfParse = require('pdf-parse'); const { cleanTextForMarkdown, createMetadataHeader, calculateStats } = require('../utils/common'); /** * Detects and formats tables in text * @param {string} text - Text containing potential tables * @returns {string} Text with formatted tables */ function detectAndFormatTables(text) { const lines = text.split('\n'); const formattedLines = []; let inTable = false; let tableData = []; let maxColumns = 0; for (let i = 0; i < lines.length; i++) { const line = lines[i].trim(); const cells = line.split(/\s{3,}/); // Split on 3 or more spaces // Detect if this might be a table row if (cells.length > 1 && cells.every(cell => cell.trim().length > 0)) { if (!inTable) { inTable = true; maxColumns = cells.length; } tableData.push(cells); } else if (inTable) { // End of table detected if (tableData.length > 0) { // Format the collected table data const markdownTable = formatMarkdownTable(tableData, maxColumns); formattedLines.push(markdownTable); formattedLines.push(''); // Add empty line after table } inTable = false; tableData = []; maxColumns = 0; formattedLines.push(line); } else { formattedLines.push(line); } } // Handle any remaining table if (inTable && tableData.length > 0) { formattedLines.push(formatMarkdownTable(tableData, maxColumns)); } return formattedLines.join('\n'); } /** * Formats table data as markdown * @param {Array<Array<string>>} tableData - Table data as 2D array * @param {number} columnCount - Number of columns * @returns {string} Formatted markdown table */ function formatMarkdownTable(tableData, columnCount) { // Ensure all rows have the same number of columns const normalizedData = tableData.map(row => { while (row.length < columnCount) { row.push(''); } return row; }); // Create markdown table const header = normalizedData[0].map(cell => cell.trim() || ' '); const separator = Array(columnCount).fill('---'); const body = normalizedData.slice(1); const markdownRows = [ `| ${header.join(' | ')} |`, `| ${separator.join(' | ')} |`, ...body.map(row => `| ${row.map(cell => cell.trim() || ' ').join(' | ')} |`) ]; return markdownRows.join('\n'); } /** * Converts PDF buffer to Markdown format * @param {Buffer} buffer - PDF file buffer * @param {Object} options - Conversion options * @param {string} options.filename - Original filename * @returns {Promise<Object>} Conversion result with markdown and statistics */ async function convertPdfToMarkdown(buffer, options = {}) { const { filename = 'document.pdf' } = options; // Parse PDF const data = await pdfParse(buffer); let content = data.text; // Process the content content = detectAndFormatTables(content); // Clean up the text and convert to markdown const markdownLines = content .split('\n') .map(line => { line = cleanTextForMarkdown(line); // Detect and format headings if (line.match(/^[A-Z\s]{5,}$/)) { return `\n## ${line}\n`; } return line; }) .filter(line => line.trim()); // Remove empty lines // Add metadata section const metadata = { title: filename.replace(/\.pdf$/i, ''), type: 'markdown', source: 'pdf' }; const markdownWithMetadata = createMetadataHeader(metadata) + markdownLines.join('\n'); // Calculate statistics const stats = calculateStats(markdownWithMetadata, { numberOfPages: data.numpages || 0, numberOfTables: (markdownWithMetadata.match(/\|.*\|/g) || []).length / 3, // Approximate table count estimatedTokensPerLine: (calculateStats(markdownWithMetadata).estimatedTokens / markdownWithMetadata.split('\n').length).toFixed(1), pdfInfo: data.info || {} }); return { document: markdownWithMetadata, markdown: markdownWithMetadata, stats }; } module.exports = { convertPdfToMarkdown };