/*
 * @knowcode/convert-to-markdown
 * Version: (not specified)
 * Convert Excel, PDF, and Word documents to clean, AI-ready formats like Markdown and JSON.
 * Source listing: 147 lines (127 loc), 4.5 kB, JavaScript.
 */
/**
* PDF converter functions
* Handles conversion of PDF files to Markdown format
*/
const pdfParse = require('pdf-parse');
const {
cleanTextForMarkdown,
createMetadataHeader,
calculateStats
} = require('../utils/common');
/**
 * Detects table-like runs of lines in plain text and rewrites them as
 * markdown tables.
 *
 * A line counts as a table row when, after trimming, splitting it on runs of
 * three or more whitespace characters yields at least two non-empty cells.
 * Consecutive table rows are collected and handed to `formatMarkdownTable`;
 * every other line is emitted trimmed. A blank line is inserted after a table
 * that is followed by more text.
 *
 * @param {string} text - Text containing potential tables (null/undefined is treated as empty)
 * @returns {string} Text with formatted tables
 */
function detectAndFormatTables(text) {
  const formattedLines = [];
  let tableData = [];
  let maxColumns = 0;

  // Emits the rows collected so far as one markdown table and resets the accumulator.
  const flushTable = addTrailingBlank => {
    if (tableData.length === 0) {
      return;
    }
    formattedLines.push(formatMarkdownTable(tableData, maxColumns));
    if (addTrailingBlank) {
      formattedLines.push(''); // Add empty line after table
    }
    tableData = [];
    maxColumns = 0;
  };

  const source = text == null ? '' : String(text);
  for (const rawLine of source.split('\n')) {
    const line = rawLine.trim();
    const cells = line.split(/\s{3,}/); // Split on 3 or more spaces

    const looksLikeTableRow =
      cells.length > 1 && cells.every(cell => cell.trim().length > 0);

    if (looksLikeTableRow) {
      tableData.push(cells);
      // Track the widest row, not just the first one: markdown renderers drop
      // body cells that extend past the header width.
      maxColumns = Math.max(maxColumns, cells.length);
    } else {
      // A non-table line ends any table in progress.
      flushTable(true);
      formattedLines.push(line);
    }
  }

  // Handle a table that runs to the end of the text (no trailing blank line).
  flushTable(false);

  return formattedLines.join('\n');
}
/**
 * Formats table data as a markdown (GitHub-flavored) table.
 *
 * The first row becomes the header, followed by a `---` separator row and the
 * remaining rows as the body. Rows shorter than the table width are padded
 * with blank cells, and literal `|` characters inside cells are escaped so
 * they cannot split a cell. The input arrays are left unmodified.
 *
 * @param {Array<Array<string>>} tableData - Table data as 2D array
 * @param {number} columnCount - Minimum number of columns; the table is
 *   widened when any row holds more cells than this
 * @returns {string} Formatted markdown table, or '' when there are no rows
 */
function formatMarkdownTable(tableData, columnCount) {
  if (!Array.isArray(tableData) || tableData.length === 0) {
    return '';
  }

  // Never render fewer columns than the widest row; cells beyond the header
  // width would otherwise be discarded by markdown renderers.
  const requestedWidth =
    Number.isInteger(columnCount) && columnCount > 0 ? columnCount : 0;
  const width = tableData.reduce(
    (widest, row) => Math.max(widest, row.length),
    requestedWidth
  );

  // Trim, escape pipes, and keep a single space for empty cells.
  const toCell = value => {
    const text = String(value == null ? '' : value)
      .trim()
      .replace(/\|/g, '\\|');
    return text || ' ';
  };
  const toRow = cells => `| ${cells.join(' | ')} |`;

  // Build fresh, equally sized rows instead of pushing into the caller's arrays.
  const rows = tableData.map(row =>
    Array.from({ length: width }, (_, index) => toCell(row[index]))
  );

  const [header, ...body] = rows;
  const separator = Array(width).fill('---');

  return [header, separator, ...body].map(toRow).join('\n');
}
/**
 * Converts PDF buffer to Markdown format.
 *
 * Steps: parse the buffer with `pdf-parse`, rewrite table-like text via
 * `detectAndFormatTables`, run every line through `cleanTextForMarkdown`,
 * promote lines made only of capitals/whitespace (5+ chars) to `##` headings,
 * drop empty lines (except one blank line directly after a table), prepend
 * the metadata header, and compute statistics.
 *
 * @param {Buffer} buffer - PDF file buffer
 * @param {Object} options - Conversion options
 * @param {string} options.filename - Original filename (defaults to 'document.pdf')
 * @returns {Promise<Object>} Conversion result: `document` and `markdown`
 *   (the same markdown string) plus `stats`
 */
async function convertPdfToMarkdown(buffer, options = {}) {
  const { filename = 'document.pdf' } = options;

  // Parse PDF
  const data = await pdfParse(buffer);

  // Process the content
  const content = detectAndFormatTables(data.text);

  const isTableRow = line => /^\|.*\|$/.test(line.trim());
  const isSeparatorRow = line => /^\|(?:\s*:?-{3,}:?\s*\|)+$/.test(line.trim());

  // Clean up the text and convert to markdown
  const markdownLines = [];
  for (const rawLine of content.split('\n')) {
    const line = cleanTextForMarkdown(rawLine);

    if (line.match(/^[A-Z\s]{5,}$/)) {
      // Detect and format headings
      markdownLines.push(`\n## ${line}\n`);
    } else if (line.trim()) {
      markdownLines.push(line);
    } else if (
      markdownLines.length > 0 &&
      isTableRow(markdownLines[markdownLines.length - 1])
    ) {
      // Empty lines are dropped, except the one that terminates a table:
      // without it the following text would be rendered as another table row.
      markdownLines.push('');
    }
  }
  // A table-terminating blank line is pointless at the very end of the document.
  while (markdownLines.length > 0 && markdownLines[markdownLines.length - 1] === '') {
    markdownLines.pop();
  }

  // Add metadata section
  const metadata = {
    title: filename.replace(/\.pdf$/i, ''),
    type: 'markdown',
    source: 'pdf'
  };
  const markdownWithMetadata = createMetadataHeader(metadata) + markdownLines.join('\n');

  // Each markdown table has exactly one separator row, so counting those gives
  // a whole-number table count that does not depend on how many rows a table has.
  const tableCount = markdownLines.filter(isSeparatorRow).length;

  // Calculate statistics
  // NOTE(review): relies on calculateStats returning `estimatedTokens` — confirm in ../utils/common.
  const baseStats = calculateStats(markdownWithMetadata);
  const lineCount = markdownWithMetadata.split('\n').length;
  const stats = calculateStats(markdownWithMetadata, {
    numberOfPages: data.numpages || 0,
    numberOfTables: tableCount,
    estimatedTokensPerLine: (baseStats.estimatedTokens / lineCount).toFixed(1),
    pdfInfo: data.info || {}
  });

  return {
    document: markdownWithMetadata,
    markdown: markdownWithMetadata,
    stats
  };
}
// Public API of this module: only convertPdfToMarkdown is exported;
// detectAndFormatTables and formatMarkdownTable are not part of the exports.
module.exports = {
convertPdfToMarkdown
};