doc-to-readable
Version:
Universal document-to-markdown and section splitter for HTML, URLs, and PDFs.
334 lines (308 loc) • 13.4 kB
JavaScript
// pdfjs-dist bindings, populated by initPdfjs() on first use.
let getDocument;
let pdfjsLib;
// False once an attempt to load pdfjs-dist has failed.
let pdfjsAvailable = true;
// True once a load attempt has finished, whether it succeeded or not.
let pdfjsInitialized = false;

/**
 * Loads pdfjs-dist on demand and configures its worker script.
 *
 * The first call performs the dynamic import; later calls return immediately
 * because `pdfjsInitialized` is set after the attempt regardless of outcome.
 * A failed import is not thrown: it is logged and recorded by setting
 * `pdfjsAvailable` to false so callers can decide how to react.
 *
 * @returns {Promise<void>}
 */
async function initPdfjs() {
  if (pdfjsInitialized) return;
  try {
    const pdfjsDist = await import('pdfjs-dist');
    getDocument = pdfjsDist.getDocument;
    pdfjsLib = pdfjsDist;
    // pdf.js refuses to run when the API and worker come from different
    // releases, so derive the worker URL from the loaded library and only
    // fall back to the previously pinned release if no version is exposed.
    // NOTE(review): the `.mjs` worker path is assumed to exist for the
    // installed release - confirm if older pdfjs-dist majors must be supported.
    const pdfjsVersion = pdfjsDist.version || '5.3.93';
    const workerPath = `https://cdn.jsdelivr.net/npm/pdfjs-dist@${pdfjsVersion}/build/pdf.worker.min.mjs`;
    if (typeof window !== 'undefined' && typeof document !== 'undefined') {
      pdfjsLib.GlobalWorkerOptions.workerSrc = workerPath;
    } else {
      console.debug('Running in a non-browser environment, workerSrc not set.');
    }
    pdfjsAvailable = true;
  } catch (e) {
    pdfjsAvailable = false;
    // Keep the cause in the log; the message alone hides why loading failed.
    console.debug('[pdf-to-html] pdfjs-dist not available or failed to load. PDF conversion is disabled in this environment.', e);
  }
  pdfjsInitialized = true;
}
/**
 * Escapes the characters that are significant in HTML text and attribute
 * values: `&`, `<`, `>`, `"` and `'`.
 *
 * @param {string} str - Text to escape; null/undefined are treated as ''.
 * @param {boolean} [isMath=false] - When true the input is returned untouched.
 * @returns {string} The escaped text (or `str` itself when `isMath` is true).
 */
function escapeHtml(str, isMath = false) {
  if (isMath) return str;
  // Each character must map to its entity; mapping it to itself (or to an
  // empty string) leaves markup unescaped and silently drops apostrophes.
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return String(str ?? '').replace(/[&<>"']/g, (ch) => entities[ch]);
}
/**
 * Groups text items whose x-coordinates (`transform[4]`) lie close together.
 *
 * An item joins the first existing group that already holds any member less
 * than `xTolerance` away on the x-axis; otherwise it starts a new group.
 * Groups keep their order of creation and each group is sorted by x.
 *
 * @param {Array<{transform: number[]}>} items - Text items to cluster.
 * @param {number} xTolerance - Exclusive x-distance limit for sharing a group.
 * @returns {Array<Array<object>>} One array of items per group.
 */
function clusterXCoordinates(items, xTolerance) {
  const groups = [];
  items.forEach((entry) => {
    const entryX = entry.transform[4];
    const isNear = (member) => Math.abs(member.transform[4] - entryX) < xTolerance;
    let home = null;
    for (const group of groups) {
      if (group.some(isNear)) {
        home = group;
        break;
      }
    }
    if (home === null) {
      home = [];
      groups.push(home);
    }
    home.push(entry);
  });
  const byX = (left, right) => left.transform[4] - right.transform[4];
  return groups.map((group) => group.sort(byX));
}
/**
 * Derives an x-axis clustering tolerance from the spacing of the items.
 *
 * The result is 1.5 times the mean positive gap between consecutive sorted
 * x-coordinates (`transform[4]`), never less than 10. When there is no
 * positive gap, a mean of 15 is assumed.
 *
 * @param {Array<{transform: number[]}>} items - Text items to measure.
 * @returns {number} The tolerance to use when clustering by x.
 */
function calculateXTolerance(items) {
  const sortedXs = items.map((entry) => entry.transform[4]);
  sortedXs.sort((left, right) => left - right);

  let gapTotal = 0;
  let gapCount = 0;
  for (let k = 1; k < sortedXs.length; k += 1) {
    const delta = sortedXs[k] - sortedXs[k - 1];
    if (delta > 0) {
      gapTotal += delta;
      gapCount += 1;
    }
  }

  const meanGap = gapCount ? gapTotal / gapCount : 15;
  return Math.max(10, meanGap * 1.5);
}
/**
 * Buckets text items into lines by their rounded y-coordinate.
 *
 * An item is added to the first existing line whose `y` differs from the
 * item's rounded `transform[5]` by less than `yTolerance`; otherwise it opens
 * a new line keyed by that rounded value. Lines are returned in descending
 * `y` order, with each line's items ordered by ascending x (`transform[4]`).
 *
 * @param {Array<{transform: number[]}>} items - Text items to group.
 * @param {number} [yTolerance=5] - Exclusive y-distance limit for one line.
 * @returns {Array<{y: number, items: object[]}>} The grouped lines.
 */
function groupTextByLine(items, yTolerance = 5) {
  const rows = [];
  items.forEach((entry) => {
    const roundedY = Math.round(entry.transform[5]);
    const existing = rows.find((row) => Math.abs(row.y - roundedY) < yTolerance);
    if (existing) {
      existing.items.push(entry);
    } else {
      rows.push({ y: roundedY, items: [entry] });
    }
  });

  rows.sort((upper, lower) => lower.y - upper.y);
  for (const row of rows) {
    row.items.sort((left, right) => left.transform[4] - right.transform[4]);
  }
  return rows;
}
/**
 * Splits an ordered list of lines into paragraphs at large vertical gaps.
 *
 * A new paragraph begins whenever the absolute `y` distance between a line
 * and the line before it exceeds `minGap`.
 *
 * @param {Array<{y: number}>} lines - Lines in reading order.
 * @param {number} [minGap=10] - Largest gap that still keeps lines together.
 * @returns {Array<Array<object>>} Paragraphs, each a non-empty array of lines.
 */
function groupLinesIntoParagraphs(lines, minGap = 10) {
  if (lines.length === 0) return [];

  const groups = [[lines[0]]];
  lines.slice(1).forEach((line, offset) => {
    // `offset` indexes the sliced array, so lines[offset] is the previous line.
    const previous = lines[offset];
    if (Math.abs(line.y - previous.y) > minGap) {
      groups.push([]);
    }
    groups[groups.length - 1].push(line);
  });
  return groups;
}
/**
 * Estimates whether a line is a heading and, if so, at which level.
 *
 * A line is rejected when its text matches a list/caption prefix or when a
 * neighbouring line is less than 5 units away vertically. Otherwise it is
 * level 1 or 2 when its largest item height is well above the median font
 * size, or level 2 when it is short, bold or all-caps, followed by a gap and
 * isolated from its neighbours.
 *
 * @param {{y: number, items: Array<{str: string, height?: number, fontName?: string}>}} line
 *   The line being classified.
 * @param {number[]} allFontSizes - Font sizes used to compute the median; not modified.
 * @param {Array<{y: number}>} lines - The lines surrounding `line`.
 * @param {number} index - Position of `line` within `lines`.
 * @returns {number} 1 or 2 for a heading, 0 otherwise.
 */
function headingLevel(line, allFontSizes, lines, index) {
  const fontSizes = line.items.map((item) => item.height || 0);
  const size = Math.max(...fontSizes);
  const text = line.items.map((item) => item.str).join('').trim();
  const textLength = text.length;
  const isBold = line.items.some((item) => /Bold/.test(item.fontName || ''));
  // Text without any cased letter ("42", "---") equals its own upper-case
  // form; require a difference from the lower-case form so such lines are
  // not promoted to headings.
  const isUppercase = text === text.toUpperCase() && text !== text.toLowerCase();
  // Sort a copy: the caller reuses this array for every line of the page.
  const sortedSizes = [...allFontSizes].sort((a, b) => a - b);
  const medianSize = sortedSizes[Math.floor(sortedSizes.length / 2)];

  const gapTo = (neighbour) => Math.abs(lines[neighbour].y - line.y);
  const isFollowedByGap = index < lines.length - 1 && gapTo(index + 1) > 15;
  const isIsolated = (index === 0 || gapTo(index - 1) > 10) &&
    (index === lines.length - 1 || gapTo(index + 1) > 10);
  const isSurrounded = (index > 0 && gapTo(index - 1) < 5) ||
    (index < lines.length - 1 && gapTo(index + 1) < 5);

  const blacklist = [/^\d+\./, /^Figure/, /^Table/, /^Note:/, /^•/, /^-/];
  if (blacklist.some((regex) => regex.test(text)) || isSurrounded) return 0;
  if (size >= 1.7 * medianSize && textLength <= 50 && isIsolated) return 1;
  if (size >= 1.4 * medianSize && textLength <= 75 && isIsolated) return 2;
  if ((isBold || isUppercase) && textLength <= 30 && isFollowedByGap && isIsolated) return 2;
  return 0;
}
/**
 * Heuristically decides whether a line of text looks like source code.
 *
 * Lines of 10 characters or fewer are never code. Longer lines count as code
 * when the first item's font name contains "Mono" or "Courier" (any case), or
 * when the text contains at least two keyword/punctuation matches.
 *
 * @param {{items: Array<{str: string, fontName?: string}>}} line - Line to inspect.
 * @returns {boolean} True when the line is treated as code.
 */
function isCodeLine(line) {
  const { items } = line;
  const joined = items.map(({ str }) => str).join('');
  const leadingFont = items[0].fontName || '';

  if (joined.length <= 10) return false;
  if (/Mono|Courier/i.test(leadingFont)) return true;

  const hits = joined.match(/\b(function|var|let|const|if|else|for|while|return|class|async|await)\b|[\{\}\(\)\[\];,=+\-*\/=>]|\/\/|#/g);
  return hits !== null && hits.length >= 2;
}
/**
 * Heuristically decides whether a line holds a mathematical expression.
 *
 * A line counts as math when it is wrapped in math delimiters, when it holds
 * a math character (Greek letter, operator symbol, `^`, `/` or a backslash)
 * and is not made up solely of ASCII letters and whitespace, or when it
 * contains an inline form such as `x^2`, `a_i` or `a/b`.
 *
 * @param {{items: Array<{str: string}>}} line - Line to inspect.
 * @returns {boolean} True when the line is treated as math.
 */
function isMathContent(line) {
  const text = line.items.map((item) => item.str).join('');
  // NOTE(review): the `(...)` and `[...]` alternatives match plain bracketed
  // text, and the first `$...$` alternative has no end anchor - confirm
  // whether LaTeX-style `\(...\)` / `\[...\]` delimiters were intended.
  const mathDelimiters = /^\$[\s\S]*\$|^\(.+\)$|^\[[\s\S]*\]$|^\${2}[\s\S]*\${2}$/;
  // Inside a character class `\\frac\\sum` means the single characters
  // \ f r a c s u m, which flagged ordinary prose as math. Matching the
  // backslash alone already covers `\frac`, `\sum` and other commands.
  const mathChars = /[\u03B1-\u03C9\u0391-\u03A9∫∑∏∞√∂\^\/\\]/;
  const inlineMath = /\b[\w]+[\^_][\w]+|[\w]+\/[\w]+/;
  return mathDelimiters.test(text) ||
    (mathChars.test(text) && !/^[a-zA-Z\s]+$/.test(text)) ||
    inlineMath.test(text);
}
/**
 * Renders one text item as HTML, wrapping it in a single styled span when
 * its font name indicates bold and/or italic text.
 *
 * The text goes through `escapeHtml` with the given `isMath` flag. Both
 * style classes share one span, so no spans are nested.
 *
 * @param {{str: string, fontName?: string}} item - Text item to render.
 * @param {boolean} [isMath=false] - Forwarded to `escapeHtml`.
 * @returns {string} The rendered markup.
 */
function spanify(item, isMath = false) {
  const escaped = escapeHtml(item.str, isMath);
  const fontName = item.fontName || '';
  const styleClasses = [
    /Bold/.test(fontName) && 'bold',
    /Italic|Oblique/.test(fontName) && 'italic',
  ].filter(Boolean);

  return styleClasses.length
    ? `<span class="${styleClasses.join(' ')}">${escaped}</span>`
    : escaped;
}
/**
 * Finds runs of consecutive paragraphs that look like table rows.
 *
 * Only the first line of each paragraph is examined. A paragraph qualifies
 * as a row when that line has at least two items, does not start with a
 * list marker, and its items form at least two x-clusters. A table is a
 * maximal run of such rows whose column counts differ by at most one, with
 * at least two rows and at least half of its cells non-empty.
 *
 * Earlier behaviour that this corrects:
 *  - a table was emitted as soon as two rows validated, so longer tables
 *    were split into fragments;
 *  - the paragraph right after a table was never considered as a start;
 *  - `start` pointed at the scan origin rather than the first row, so
 *    non-row paragraphs inside `start..end` were dropped by the caller.
 *
 * @param {Array<Array<{items: Array<{str: string, transform: number[]}>}>>} paragraphs
 *   Paragraphs, each an array of lines.
 * @param {number} xTolerance - Tolerance passed to `clusterXCoordinates`.
 * @returns {Array<{start: number, end: number, rows: Array<Array<Array<object>>>}>}
 *   Tables with inclusive paragraph indices; `rows[k]` belongs to paragraph
 *   `start + k`.
 */
function detectTable(paragraphs, xTolerance) {
  const blacklist = [/^\d+\./, /^•/, /^-/];

  // Returns the x-clusters of a paragraph's first line, or null when the
  // paragraph cannot be a table row.
  const toRow = (paragraph) => {
    if (!paragraph || paragraph.length === 0) return null;
    const { items } = paragraph[0];
    const text = items.map((item) => item.str).join('').trim();
    if (items.length < 2 || blacklist.some((regex) => regex.test(text))) return null;
    const clusters = clusterXCoordinates(items, xTolerance);
    return clusters.length >= 2 ? clusters : null;
  };

  // Cluster each paragraph once; scans below may revisit the same index.
  const candidateRows = paragraphs.map(toRow);

  const tables = [];
  let i = 0;
  while (i < paragraphs.length) {
    const rows = [];
    let minCols = Infinity;
    let maxCols = -Infinity;
    let j = i;
    for (; j < paragraphs.length; j += 1) {
      const row = candidateRows[j];
      if (row === null) break;
      const nextMin = Math.min(minCols, row.length);
      const nextMax = Math.max(maxCols, row.length);
      // Allow ±1 column variation across the whole table.
      if (nextMax - nextMin > 1) break;
      rows.push(row);
      minCols = nextMin;
      maxCols = nextMax;
    }

    if (rows.length >= 2) {
      const cells = rows.flat();
      const nonEmptyCells = cells.filter((cell) => cell.map((item) => item.str).join('').trim()).length;
      if (nonEmptyCells / cells.length >= 0.5) {
        tables.push({ start: i, end: j - 1, rows });
        // Resume directly after the table so no paragraph is skipped.
        i = j;
        continue;
      }
    }
    i += 1;
  }
  return tables;
}
/**
 * Converts a PDF held in memory into a standalone HTML document.
 *
 * Each page's text items are grouped into lines and paragraphs, then emitted
 * as tables, code blocks, math, headings or plain paragraphs. A page that
 * fails to process is logged and skipped. If the document as a whole cannot
 * be processed, a minimal error page is returned instead of throwing.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw PDF data passed to pdf.js.
 * @param {string} [customTitle=''] - Title used only when the PDF metadata
 *   has no Title.
 * @returns {Promise<string>} The generated HTML document.
 * @throws {Error} When pdfjs-dist could not be loaded.
 */
export async function pdfToHtmlFromBuffer(arrayBuffer, customTitle = '') {
  await initPdfjs();
  if (!pdfjsAvailable) {
    console.debug('[pdf-to-html] pdfjs-dist not available; cannot convert PDF to HTML.');
    throw new Error('pdfjs-dist not available; PDF conversion is disabled in this environment.');
  }

  // PDF text is untrusted. Math lines bypass spanify(), so they are escaped
  // here with numeric character references, which the browser decodes before
  // MathJax reads the text.
  const escapeMarkup = (value) =>
    String(value).replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

  let loadingTask;
  try {
    loadingTask = getDocument({ data: arrayBuffer });
    const pdfDocument = await loadingTask.promise;
    const meta = await pdfDocument.getMetadata().catch(() => ({ info: { Title: '' } }));
    const docTitle = meta?.info?.Title || customTitle;
    const titleDiv = docTitle ? `<title>${escapeHtml(docTitle)}</title>` : '';
    let html = '<article role="main">\n';
    const skippedText = [];

    for (let pageNum = 1; pageNum <= pdfDocument.numPages; ++pageNum) {
      try {
        const page = await pdfDocument.getPage(pageNum);
        const textContent = await page.getTextContent();
        const allFontSizes = textContent.items.map((item) => item.height || 0).filter((h) => h > 0);
        const xTolerance = calculateXTolerance(textContent.items);
        const lines = groupTextByLine(textContent.items);
        const paragraphs = groupLinesIntoParagraphs(lines);

        // Index tables by their first paragraph; the first table wins when
        // two share a start index.
        const tablesByStart = new Map();
        for (const table of detectTable(paragraphs, xTolerance)) {
          if (!tablesByStart.has(table.start)) tablesByStart.set(table.start, table);
        }

        for (let p = 0; p < paragraphs.length; ++p) {
          const table = tablesByStart.get(p);
          if (table) {
            const maxCols = Math.max(...table.rows.map((row) => row.length));
            const tableContent = table.rows
              .map((row) => row.map((cell) => cell.map((item) => item.str).join('')).join(''))
              .join('');
            if (tableContent.trim()) {
              html += `<table role="grid" aria-label="Table from page ${pageNum}">\n<caption>Table ${pageNum}</caption>\n<thead>\n<tr>` +
                table.rows[0].map((cell) => `<th scope="col">${cell.map((item) => escapeHtml(item.str)).join(' ') || ''}</th>`).join('') +
                '</tr>\n</thead>\n<tbody>\n';
              for (const row of table.rows.slice(1)) {
                html += '<tr>' +
                  row.map((cell) => `<td>${cell.map((item) => escapeHtml(item.str)).join(' ') || ''}</td>`).join('') +
                  (row.length < maxCols ? '<td></td>'.repeat(maxCols - row.length) : '') +
                  '</tr>\n';
              }
              html += '</tbody>\n</table>\n';
            } else {
              skippedText.push(`Table on page ${pageNum} skipped due to empty content`);
            }
            // Jump past every paragraph the table covers.
            p = table.end;
            continue;
          }

          const linesInPara = paragraphs[p];
          if (!linesInPara || !linesInPara.length) continue;

          // A multi-line paragraph with any code-like line becomes one block.
          if (linesInPara.length > 1 && linesInPara.some((line) => isCodeLine(line))) {
            const codeText = linesInPara
              .map((line) => line.items.map((item) => escapeHtml(item.str)).join(' '))
              .join('\n')
              .trim();
            if (codeText) {
              html += `<pre><code>${codeText}</code></pre>\n`;
            } else {
              skippedText.push(`Code block on page ${pageNum} skipped: ${codeText}`);
            }
            continue;
          }

          for (const [lineIndex, line] of linesInPara.entries()) {
            const text = line.items.map((item) => item.str).join('');
            if (!text.trim() && !text.match(/[\s•\-\–]/)) {
              skippedText.push(`Line on page ${pageNum} skipped: ${text}`);
              continue;
            }

            if (isMathContent(line)) {
              const mathMarkup = escapeMarkup(text);
              const ariaLabel = escapeMarkup(`Mathematical expression: ${text.replace(/[\$\[\]\(\)\\]/g, '')}`);
              // `$$...$$` is display math even though it starts and ends with `$`.
              const isInline =
                (text.startsWith('$') && text.endsWith('$') && !text.startsWith('$$')) ||
                (text.startsWith('\\(') && text.endsWith('\\)'));
              if (isInline) {
                html += `<span class="math-inline" aria-label="${ariaLabel}">${mathMarkup}</span>\n`;
              } else {
                html += `<div class="math-display" aria-label="${ariaLabel}">${mathMarkup}</div>\n`;
              }
              continue;
            }

            const level = headingLevel(line, allFontSizes, linesInPara, lineIndex);
            if (level > 0) {
              const headerText = line.items.map((item) => spanify(item)).join(' ');
              if (headerText.trim()) {
                html += `<h${level}>${headerText}</h${level}>\n`;
              } else {
                skippedText.push(`Header on page ${pageNum} skipped: ${headerText}`);
              }
              continue;
            }

            if (isCodeLine(line)) {
              const codeText = line.items.map((item) => escapeHtml(item.str)).join(' ');
              if (codeText.trim()) {
                html += `<pre><code>${codeText}</code></pre>\n`;
              } else {
                skippedText.push(`Code line on page ${pageNum} skipped: ${codeText}`);
              }
              continue;
            }

            const paraText = line.items.map((item) => spanify(item)).join(' ');
            if (paraText.trim() || text.match(/[\s•\-\–]/)) {
              html += `<p>${paraText}</p>\n`;
            } else {
              skippedText.push(`Paragraph on page ${pageNum} skipped: ${paraText}`);
            }
          }
        }
      } catch (error) {
        // Best effort: one failing page must not abort the whole document.
        console.debug(`Error processing page ${pageNum}:`, error);
        skippedText.push(`Page ${pageNum} processing failed: ${error.message}`);
      }
    }

    if (skippedText.length) {
      console.debug('Skipped content:', skippedText);
    }
    html += '</article>\n';
    return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
${titleDiv}
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js" async></script>
<style>
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
th, td { border: 1px solid #ddd; padding: 8px; text-align: left; vertical-align: top; }
pre { background: #f4f4f4; padding: 10px; border-radius: 4px; }
.math-inline, .math-display { font-family: 'Times New Roman', serif; }
h1, h2 { margin: 0.5em 0; }
p { margin: 0.5em 0; }
.bold { font-weight: bold; }
.italic { font-style: italic; }
</style>
</head>
<body>
${html}
</body>
</html>`;
  } catch (error) {
    console.debug('Error processing PDF:', error);
    return '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><title>Error</title></head><body><p>Error processing PDF</p></body></html>';
  } finally {
    // Release the document and its worker; a cleanup failure is only logged
    // so it cannot replace the result computed above.
    if (loadingTask) {
      try {
        await loadingTask.destroy();
      } catch (cleanupError) {
        console.debug('Error releasing PDF resources:', cleanupError);
      }
    }
  }
}