/*
 * defuddle
 * Version: (not captured)
 * Extract article content and metadata from web pages.
 * 613 lines • 29.7 kB
 * JavaScript
 */
;
// Module interop helper: reuse `this.__importDefault` when one is already
// present, otherwise wrap any module not flagged `__esModule` so that its
// value is reachable through a `default` property.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.isGenericElement = isGenericElement;
exports.asGenericElement = asGenericElement;
exports.createMarkdownContent = createMarkdownContent;
exports.toMarkdown = toMarkdown;
const turndown_1 = __importDefault(require("turndown"));
const utils_1 = require("./utils");
const dom_1 = require("./utils/dom");
/**
 * Duck-typed element check: true for any non-null object that exposes a
 * `getAttribute` property (own or inherited).
 *
 * @param {*} node - Value to test.
 * @returns {boolean} Whether `node` looks like an element.
 */
function isGenericElement(node) {
    if (node === null || typeof node !== 'object') {
        return false;
    }
    return 'getAttribute' in node;
}
/**
 * Identity helper: hands back the very same value it was given, with no
 * runtime check or conversion.
 *
 * @param {*} node - Any value.
 * @returns {*} The same `node` reference.
 */
function asGenericElement(node) {
    const element = node;
    return element;
}
/**
 * Convert HTML to Markdown with a Turndown service that is configured below
 * and extended with the custom rules registered in this function.
 *
 * @param {string} content - HTML to convert; handed to `turndownService.turndown`.
 * @param {string} [url] - Accepted from callers; not referenced anywhere in this function body.
 * @returns {string} The trimmed Markdown, or, if conversion throws, a fallback
 *   message that embeds the original `content`.
 */
function createMarkdownContent(content, url) {
// Footnote id -> text map that is appended to the output when non-empty.
// NOTE(review): nothing in this function writes to it, so it is always empty here.
const footnotes = {};
// Base Turndown options: ATX (#) headings, '---' rules, '-' bullets,
// fenced code blocks, '*' emphasis and preformatted-code handling enabled.
const turndownService = new turndown_1.default({
headingStyle: 'atx',
hr: '---',
bulletListMarker: '-',
codeBlockStyle: 'fenced',
emDelimiter: '*',
preformattedCode: true,
});
turndownService.addRule('table', {
    filter: 'table',
    /**
     * Render a <table> element.
     *
     * - ArXiv equation tables are delegated to `handleNestedEquations`.
     * - Single-column layout tables are flattened to their cell content.
     * - Tables using colspan/rowspan are emitted as cleaned-up HTML.
     * - Everything else becomes a pipe table whose first row is the header.
     *
     * @param {string} content - Markdown Turndown already produced for the children.
     * @param {Node} node - The table node.
     * @returns {string} Replacement text for the table.
     */
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        // Check if it's an ArXiv equation table
        if (node.classList?.contains('ltx_equation') || node.classList?.contains('ltx_eqn_table')) {
            return handleNestedEquations(node);
        }
        // Detect layout tables (used for styling/positioning, not data)
        const hasNestedTables = node.querySelector('table') !== null;
        const directCells = Array.from(node.querySelectorAll('td, th')).filter((el) => (0, dom_1.isDirectTableChild)(el, node));
        if (hasNestedTables || directCells.length <= 1) {
            const directRows = Array.from(node.querySelectorAll('tr')).filter((el) => (0, dom_1.isDirectTableChild)(el, node));
            const cellCounts = directRows.map((tr) => directCells.filter((cell) => cell.parentNode === tr).length);
            // Every direct row must carry the same number of cells, and at most one
            const isSingleColumn = directRows.length > 0
                && new Set(cellCounts).size === 1
                && cellCounts[0] <= 1;
            if (isSingleColumn) {
                // Layout table — extract content, don't convert to markdown table
                return '\n\n' + turndownService.turndown(directCells.map((cell) => (0, dom_1.serializeHTML)(cell)).join('')) + '\n\n';
            }
        }
        // Tables with colspan or rowspan cannot be expressed as a pipe table
        const cells = Array.from(node.querySelectorAll('td, th'));
        const hasComplexStructure = cells.some(cell => isGenericElement(asGenericElement(cell)) && (cell.hasAttribute('colspan') || cell.hasAttribute('rowspan')));
        if (hasComplexStructure) {
            // Clean up the table HTML and keep it as HTML
            const cleanedTable = cleanupTableHTML(node);
            return '\n\n' + cleanedTable + '\n\n';
        }
        // Use node.rows/row.cells when available (browser/JSDOM), fall back to
        // querySelectorAll for environments like linkedom that lack these properties
        const tableEl = node;
        const rowElements = tableEl.rows && tableEl.rows.length > 0
            ? Array.from(tableEl.rows)
            : Array.from(node.querySelectorAll('tr')).filter((tr) => (0, dom_1.isDirectTableChild)(tr, node));
        // Keep each row as an array of cell strings so the column count can be
        // read directly instead of being re-derived from the rendered text.
        const rows = rowElements.map((row) => {
            const cellElements = row.cells && row.cells.length > 0
                ? Array.from(row.cells)
                : Array.from(row.querySelectorAll('td, th')).filter((cell) => cell.parentNode === row);
            return cellElements.map((cell) => turndownService.turndown((0, dom_1.serializeHTML)(cell))
                // A table row must stay on one line
                .replace(/\n/g, ' ')
                .trim()
                // Escape pipe characters so they are not read as column breaks
                .replace(/\|/g, '\\|'));
        });
        if (!rows.length)
            return content;
        const formatRow = (cellContents) => `| ${cellContents.join(' | ')} |`;
        // Counting cells (rather than splitting the rendered header on '|')
        // keeps escaped pipes inside a header cell from adding phantom columns.
        // A header row without cells still gets a single separator column.
        const columnCount = Math.max(rows[0].length, 1);
        const separatorRow = formatRow(Array(columnCount).fill('---'));
        const tableContent = [formatRow(rows[0]), separatorRow, ...rows.slice(1).map(formatRow)].join('\n');
        return `\n\n${tableContent}\n\n`;
    }
});
// Drop <style> and <script> elements from the output
turndownService.remove(['style', 'script']);
// Register iframe, video, audio, sup, sub, svg and math with Turndown's
// `keep`, so these elements are retained rather than converted
// @ts-ignore
turndownService.keep(['iframe', 'video', 'audio', 'sup', 'sub', 'svg', 'math']);
// Drop <button> elements from the output
turndownService.remove(['button']);
turndownService.addRule('list', {
    filter: ['ul', 'ol'],
    /**
     * Emit the trimmed list body followed by a newline; lists that are not
     * directly inside another UL/OL additionally get a leading newline.
     */
    replacement: function (content, node) {
        const body = content.trim();
        const parentName = node.parentNode ? node.parentNode.nodeName : null;
        const isNested = parentName === 'UL' || parentName === 'OL';
        const lead = isNested ? '' : '\n';
        return lead + body + '\n';
    }
});
// Lists with tab indentation
turndownService.addRule('listItem', {
    filter: 'li',
    /**
     * Render a list item with a bullet or ordinal marker, tab indentation
     * according to nesting depth, and an optional task-list checkbox.
     *
     * @param {string} content - Markdown Turndown produced for the item's children.
     * @param {Node} node - The <li> node.
     * @param {object} options - Turndown options; `bulletListMarker` is read.
     * @returns {string} Markdown for the list item.
     */
    replacement: function (content, node, options) {
        if (!isGenericElement(node))
            return content;
        let body = content;
        // Handle task list items
        const isTaskListItem = node.classList?.contains('task-list-item');
        const checkbox = node.querySelector('input[type="checkbox"]');
        let taskListMarker = '';
        if (isTaskListItem && checkbox && isGenericElement(checkbox)) {
            // Remove the checkbox from content since we'll add markdown checkbox
            body = body.replace(/<input[^>]*>/, '');
            // `checked` is a boolean attribute: `<input checked>` reports the
            // empty string, so test for presence rather than truthiness.
            taskListMarker = checkbox.getAttribute('checked') != null ? '[x] ' : '[ ] ';
        }
        body = body
            // Remove trailing newlines
            .replace(/\n+$/, '')
            .split('\n')
            // Remove empty lines
            .filter(line => line.length > 0)
            // Add indentation to continued lines
            .join('\n\t');
        const parent = node.parentNode;
        // Calculate the nesting level: count UL/OL ancestors, walking only
        // through an unbroken chain of list and list-item elements.
        let level = 0;
        let currentParent = parent;
        while (currentParent && isGenericElement(currentParent)) {
            if (currentParent.nodeName === 'UL' || currentParent.nodeName === 'OL') {
                level++;
            }
            else if (currentParent.nodeName !== 'LI') {
                break;
            }
            currentParent = currentParent.parentNode;
        }
        // One tab per nesting level below the top; never negative, so
        // String.prototype.repeat cannot throw.
        const indent = '\t'.repeat(Math.max(0, level - 1));
        let marker = options.bulletListMarker + ' ';
        if (parent && isGenericElement(parent) && parent.nodeName === 'OL') {
            const start = parent.getAttribute('start');
            // 1-based position among the parent's element children (1 if not found)
            const position = Array.from(parent.children || []).indexOf(node);
            const index = position === -1 ? 1 : position + 1;
            marker = (start ? Number(start) + index - 1 : index) + '. ';
        }
        return indent + marker + taskListMarker + body.trim() + (node.nextSibling && !/\n$/.test(body) ? '\n' : '');
    }
});
turndownService.addRule('figure', {
    filter: 'figure',
    /**
     * Render a <figure> as a Markdown image followed by its caption.
     * Figures without an <img> fall back to the default converted content.
     *
     * @param {string} content - Markdown Turndown produced for the figure's children.
     * @param {Node} node - The <figure> node.
     * @returns {string} Image markdown plus caption.
     */
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        const img = node.querySelector('img');
        const figcaption = node.querySelector('figcaption');
        if (!img || !isGenericElement(img))
            return content;
        const alt = img.getAttribute('alt') || '';
        const src = img.getAttribute('src') || '';
        let caption = '';
        if (figcaption && isGenericElement(figcaption)) {
            const tagSpan = figcaption.querySelector('.ltx_tag_figure');
            // Fall back to '' so a missing textContent is not rendered as "undefined"
            const tagText = tagSpan && isGenericElement(tagSpan) ? tagSpan.textContent?.trim() ?? '' : '';
            // Process the caption content, including math elements
            let captionContent = (0, dom_1.serializeHTML)(figcaption);
            const ownerDoc = node.ownerDocument;
            captionContent = captionContent.replace(/<math.*?>(.*?)<\/math>/g, (match, _mathContent, offset, string) => {
                let latex = '';
                if (ownerDoc) {
                    // Re-parse the matched markup so the LaTeX can be read from attributes
                    const fragment = (0, dom_1.parseHTML)(ownerDoc, match);
                    const mathElement = fragment.querySelector('math');
                    latex = mathElement && isGenericElement(mathElement) ? extractLatex(mathElement) : '';
                }
                // Pad with a space only where the formula touches a character
                // that is neither whitespace nor '$'
                const prevChar = string[offset - 1] || '';
                const nextChar = string[offset + match.length] || '';
                const isStartOfLine = offset === 0 || /\s/.test(prevChar);
                const isEndOfLine = offset + match.length === string.length || /\s/.test(nextChar);
                const leftSpace = (!isStartOfLine && !/[\s$]/.test(prevChar)) ? ' ' : '';
                const rightSpace = (!isEndOfLine && !/[\s$]/.test(nextChar)) ? ' ' : '';
                return `${leftSpace}$${latex}$${rightSpace}`;
            });
            // Convert the processed caption content to markdown
            const captionMarkdown = turndownService.turndown(captionContent);
            // Combine tag and processed caption
            caption = `${tagText} ${captionMarkdown}`.trim();
        }
        // Emit the image itself; without it the figure collapses to a bare caption
        return `![${alt}](${src})\n\n${caption}\n\n`;
    }
});
// Use Obsidian format for YouTube embeds and tweets
turndownService.addRule('embedToMarkdown', {
    /**
     * Match any element whose `src` points at YouTube or Twitter/X.
     */
    filter: function (node) {
        if (!isGenericElement(node))
            return false;
        const src = node.getAttribute('src');
        return !!src && (!!src.match(/(?:youtube\.com|youtube-nocookie\.com|youtu\.be)/) ||
            !!src.match(/(?:twitter\.com|x\.com)/));
    },
    /**
     * Replace the embed with an image-style link (`![](url)`) to the
     * canonical video or tweet URL. Sources that match the filter but none
     * of the URL shapes below fall back to the default converted content.
     *
     * @param {string} content - Markdown Turndown produced for the children.
     * @param {Node} node - The embed element.
     * @returns {string} Embed markdown, or `content` when no id can be extracted.
     */
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        const src = node.getAttribute('src');
        if (!src)
            return content;
        const youtubeMatch = src.match(/(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtube-nocookie\.com|youtu\.be)\/(?:embed\/|watch\?v=)?([a-zA-Z0-9_-]+)/);
        if (youtubeMatch && youtubeMatch[1]) {
            return `\n![](https://www.youtube.com/watch?v=${youtubeMatch[1]})\n`;
        }
        // Direct URL: /user/status/id
        const tweetDirectMatch = src.match(/(?:https?:\/\/)?(?:www\.)?(?:twitter\.com|x\.com)\/([^/]+)\/status\/([0-9]+)/);
        if (tweetDirectMatch) {
            return `\n![](https://x.com/${tweetDirectMatch[1]}/status/${tweetDirectMatch[2]})\n`;
        }
        // Platform embed: ?id= (no user name available, so use the /i/ form)
        const tweetEmbedMatch = src.match(/(?:https?:\/\/)?(?:platform\.)?twitter\.com\/embed\/Tweet\.html\?.*?id=([0-9]+)/);
        if (tweetEmbedMatch) {
            return `\n![](https://x.com/i/status/${tweetEmbedMatch[1]})\n`;
        }
        return content;
    }
});
// <mark> becomes ==highlighted== text
turndownService.addRule('highlight', {
    filter: 'mark',
    replacement: (content) => `==${content}==`
});
// <del>, <s> and <strike> become ~~struck~~ text
turndownService.addRule('strikethrough', {
    filter: (node) => ['DEL', 'S', 'STRIKE'].includes(node.nodeName),
    replacement: (content) => `~~${content}~~`
});
// Links that wrap a heading plus other content cannot be written as a
// single inline markdown link, so they are unpacked into separate pieces.
turndownService.addRule('complexLinkStructure', {
    // Matches <a> elements with more than one child node, at least one of
    // which is a direct H1-H6 child.
    filter: function (node, options) {
        if (node.nodeName !== 'A' || node.childNodes.length <= 1) {
            return false;
        }
        const headingNames = ['H1', 'H2', 'H3', 'H4', 'H5', 'H6'];
        return Array.from(node.childNodes).some(child => headingNames.includes(child.nodeName));
    },
    // Output: heading, blank line, remaining content, blank line, then a
    // "View original" link (followed by the quoted title when present).
    replacement: function (content, node, options) {
        if (!isGenericElement(node))
            return content;
        const href = node.getAttribute('href');
        const title = node.getAttribute('title');
        const headingNode = node.querySelector('h1, h2, h3, h4, h5, h6');
        let headingContent = '';
        if (headingNode) {
            // Convert via outerHTML so the heading tag itself is preserved,
            // then detach it so it is not converted a second time below
            headingContent = turndownService.turndown(headingNode.outerHTML);
            headingNode.remove();
        }
        const remainingContent = turndownService.turndown((0, dom_1.serializeHTML)(node));
        const pieces = [`${headingContent}\n\n${remainingContent}\n\n`];
        if (href) {
            pieces.push(`[View original](${href})`);
            if (title) {
                pieces.push(` "${title}"`);
            }
        }
        return pieces.join('');
    }
});
// Ordered lists carrying the `ltx_enumerate` class are renumbered from 1
turndownService.addRule('arXivEnumerate', {
    filter: (node) => {
        if (node.nodeName !== 'OL' || !isGenericElement(node)) {
            return false;
        }
        return node.classList?.contains('ltx_enumerate') ?? false;
    },
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        const items = [];
        Array.from(node.children || []).forEach((item, index) => {
            if (!isGenericElement(item)) {
                // Keep a placeholder so the numbering of later items is unchanged
                items.push('');
                return;
            }
            const html = (0, dom_1.serializeHTML)(item) || '';
            // Strip the leading item tag span; the number is regenerated here
            const withoutTag = html.replace(/^<span class="ltx_tag ltx_tag_item">\d+\.<\/span>\s*/, '');
            items.push(`${index + 1}. ${turndownService.turndown(withoutTag)}`);
        });
        return '\n\n' + items.join('\n\n') + '\n\n';
    }
});
// <sup id="fnref:N..."> references become [^N] footnote markers
turndownService.addRule('citations', {
    filter: (node) => {
        if (!isGenericElement(node) || node.nodeName !== 'SUP') {
            return false;
        }
        const id = node.getAttribute('id');
        return id !== null && id.startsWith('fnref:');
    },
    replacement: (content, node) => {
        if (!isGenericElement(node) || node.nodeName !== 'SUP') {
            return content;
        }
        const id = node.getAttribute('id');
        if (id === null || !id.startsWith('fnref:')) {
            return content;
        }
        // Only the part before the first '-' is used as the footnote label
        const [primaryNumber] = id.replace('fnref:', '').split('-');
        return `[^${primaryNumber}]`;
    }
});
// Footnotes list: an <ol> whose parent element has id="footnotes"
turndownService.addRule('footnotesList', {
    filter: (node) => {
        if (!isGenericElement(node) || node.nodeName !== 'OL') {
            return false;
        }
        const container = node.parentNode;
        return container !== null &&
            isGenericElement(container) &&
            container.getAttribute('id') === 'footnotes';
    },
    replacement: (content, node) => {
        if (!isGenericElement(node))
            return content;
        // Derive the footnote label from the item's id attribute:
        // "fn:X" -> X, ".../cite_note-X" -> X, any other id is used as is.
        const resolveId = (li) => {
            const liId = li.getAttribute('id');
            if (liId === null) {
                return undefined;
            }
            if (liId.startsWith('fn:')) {
                return liId.replace('fn:', '');
            }
            const match = liId.split('/').pop()?.match(/cite_note-(.+)/);
            return match ? match[1] : liId;
        };
        const references = [];
        for (const li of Array.from(node.children || [])) {
            if (!isGenericElement(li)) {
                references.push('');
                continue;
            }
            const id = resolveId(li);
            // Remove the leading sup element if its content matches the footnote id
            const supElement = li.querySelector('sup');
            if (supElement && isGenericElement(supElement) && supElement.textContent?.trim() === id) {
                supElement.remove();
            }
            const referenceContent = turndownService.turndown((0, dom_1.serializeHTML)(li));
            // Remove the backlink from the footnote content
            const cleanedContent = referenceContent.replace(/\s*↩︎$/, '').trim();
            references.push(`[^${id?.toLowerCase()}]: ${cleanedContent}`);
        }
        return '\n\n' + references.join('\n\n') + '\n\n';
    }
});
// General removal rules for various website elements
turndownService.addRule('removals', {
    filter: function (node) {
        if (!isGenericElement(node))
            return false;
        // Footnote backlinks: an href containing "#fnref" ...
        const pointsBackToReference = node.getAttribute('href')?.includes('#fnref');
        if (pointsBackToReference) {
            return true;
        }
        // ... or the footnote-backref class
        return Boolean(node.classList?.contains('footnote-backref'));
    },
    // Matching nodes contribute nothing to the output
    replacement: function (content, node) {
        const nothing = '';
        return nothing;
    }
});
// Text nodes that sit directly inside a <td> are passed through unchanged
turndownService.addRule('handleTextNodesInTables', {
    filter: function (node) {
        if (!(0, utils_1.isTextNode)(node)) {
            return false;
        }
        const parent = node.parentNode;
        return parent !== null && parent.nodeName === 'TD';
    },
    replacement: (content) => content
});
turndownService.addRule('preformattedCode', {
    filter: (node) => {
        return node.nodeName === 'PRE';
    },
    /**
     * Render <pre><code> as a fenced code block with an optional language tag.
     * A <pre> without a <code> child falls back to the default content.
     *
     * @param {string} content - Markdown Turndown produced for the children.
     * @param {Node} node - The <pre> node.
     * @returns {string} Fenced code block.
     */
    replacement: (content, node) => {
        if (!isGenericElement(node))
            return content;
        const codeElement = node.querySelector('code');
        if (!codeElement || !isGenericElement(codeElement))
            return content;
        // Language lookup order: data-lang, data-language, a `language-*`
        // class on <code>, then data-language on the <pre> itself
        const language = codeElement.getAttribute('data-lang')
            || codeElement.getAttribute('data-language')
            || codeElement.getAttribute('class')?.match(/language-(\w+)/)?.[1]
            || node.getAttribute('data-language')
            || '';
        const code = (codeElement.textContent || '').trim();
        // Backslash escapes are literal inside a fenced block, so escaping
        // backticks would alter the code. Instead make the fence longer than
        // the longest backtick run found in the code (minimum three).
        const longestBacktickRun = (code.match(/`+/g) || [])
            .reduce((longest, run) => Math.max(longest, run.length), 0);
        const fence = '`'.repeat(Math.max(3, longestBacktickRun + 1));
        return `\n${fence}${language}\n${code}\n${fence}\n`;
    }
});
// <math> elements and MediaWiki math wrappers/fallback images become
// $inline$ or $$block$$ LaTeX
turndownService.addRule('math', {
    filter: (node) => {
        if (node.nodeName.toLowerCase() === 'math') {
            return true;
        }
        return (isGenericElement(node) &&
            (node.classList?.contains('mwe-math-element') ||
                node.classList?.contains('mwe-math-fallback-image-inline') ||
                node.classList?.contains('mwe-math-fallback-image-display')));
    },
    replacement: (content, node) => {
        if (!isGenericElement(node))
            return content;
        const latex = extractLatex(node).trim();
        // Math inside a table is always rendered inline
        const isInTable = typeof node.closest === 'function' ? node.closest('table') !== null : false;
        // A math wrapper whose previous sibling is a <p> element counts as block math
        const wrapper = node.parentNode;
        const wrapperFollowsParagraph = wrapper && isGenericElement(wrapper) &&
            wrapper.classList?.contains('mwe-math-element') &&
            wrapper.previousSibling && isGenericElement(wrapper.previousSibling) &&
            wrapper.previousSibling.nodeName.toLowerCase() === 'p';
        const isBlock = !isInTable && (node.getAttribute('display') === 'block' ||
            node.classList?.contains('mwe-math-fallback-image-display') ||
            wrapperFollowsParagraph);
        if (isBlock) {
            return `\n$$\n${latex}\n$$\n`;
        }
        // For inline math, ensure there's a space before and after only if needed
        const prevNode = node.previousSibling;
        const nextNode = node.nextSibling;
        const prevChar = prevNode && isGenericElement(prevNode) ? prevNode.textContent?.slice(-1) || '' : '';
        const nextChar = nextNode && isGenericElement(nextNode) ? nextNode.textContent?.[0] || '' : '';
        const isBlankText = (sibling) => (0, utils_1.isTextNode)(sibling) && sibling.textContent?.trim() === '';
        const isStartOfLine = !prevNode || isBlankText(prevNode);
        const isEndOfLine = !nextNode || isBlankText(nextNode);
        const needsLeftSpace = !isStartOfLine && prevChar && !/[\s$]/.test(prevChar);
        const needsRightSpace = !isEndOfLine && nextChar && !/[\s$]/.test(nextChar);
        const leftSpace = needsLeftSpace ? ' ' : '';
        const rightSpace = needsRightSpace ? ' ' : '';
        return `${leftSpace}$${latex}$${rightSpace}`;
    }
});
// Elements with the `math` or `katex` class become $inline$ or $$block$$ LaTeX
turndownService.addRule('katex', {
    filter: (node) => {
        return isGenericElement(node) &&
            (node.classList?.contains('math') || node.classList?.contains('katex'));
    },
    replacement: (content, node) => {
        if (!isGenericElement(node))
            return content;
        // Text of the TeX annotation nested under .katex-mathml, or '' if absent
        const readAnnotation = () => {
            const annotation = node.querySelector('.katex-mathml annotation[encoding="application/x-tex"]');
            return annotation && isGenericElement(annotation) ? annotation.textContent || '' : '';
        };
        // LaTeX source, first non-empty of: data-latex attribute, the TeX
        // annotation, the element's own trimmed text
        const latex = node.getAttribute('data-latex')
            || readAnnotation()
            || node.textContent?.trim()
            || '';
        // Inline when flagged by class, or when the nested <math> exists and
        // is not display="block"
        const mathElement = node.querySelector('.katex-mathml math');
        const isInline = node.classList?.contains('math-inline') ||
            (mathElement && isGenericElement(mathElement) && mathElement.getAttribute('display') !== 'block');
        return isInline ? `$${latex}$` : `\n$$\n${latex}\n$$\n`;
    }
});
turndownService.addRule('callout', {
    filter: (node) => {
        return (node.nodeName.toLowerCase() === 'div' &&
            isGenericElement(node) &&
            node.classList?.contains('markdown-alert'));
    },
    /**
     * Render a `div.markdown-alert` as an Obsidian-style callout.
     *
     * @param {string} content - Markdown Turndown produced for the children.
     * @param {Node} node - The alert container.
     * @returns {string} Blockquote beginning with `[!TYPE]`.
     */
    replacement: (content, node) => {
        if (!isGenericElement(node))
            return content;
        // Get alert type from the class (e.g., markdown-alert-note -> NOTE).
        // Read the class names from the attribute: enumerating a classList
        // with Object.keys yields index keys, not names, so the type could
        // never be detected that way.
        const alertClasses = (node.getAttribute('class') || '').split(/\s+/).filter(Boolean);
        const typeClass = alertClasses.find(c => c.startsWith('markdown-alert-'));
        const type = typeClass ? typeClass.replace('markdown-alert-', '').toUpperCase() : 'NOTE';
        // Find the title element and content
        const titleElement = node.querySelector('.markdown-alert-title');
        const contentElement = node.querySelector('p:not(.markdown-alert-title)');
        // Extract content, removing the title from it if present
        let alertContent = content;
        if (titleElement && isGenericElement(titleElement) && titleElement.textContent) {
            alertContent = contentElement && isGenericElement(contentElement) ? contentElement.textContent || '' : content.replace(titleElement.textContent, '');
        }
        // Format as Obsidian callout, quoting every line of the body
        return `\n> [!${type}]\n> ${alertContent.trim().replace(/\n/g, '\n> ')}\n`;
    }
});
// Callout asides: blockquotes carrying a non-empty data-callout attribute
turndownService.addRule('calloutAside', {
    filter: (node) => {
        if (node.nodeName !== 'BLOCKQUOTE' || !isGenericElement(node)) {
            return false;
        }
        return !!node.getAttribute('data-callout');
    },
    replacement: (content, node) => {
        if (!isGenericElement(node))
            return content;
        const type = node.getAttribute('data-callout') || 'note';
        // Title is the callout type with its first character upper-cased
        const title = type.slice(0, 1).toUpperCase() + type.slice(1);
        // Prefix every line of the trimmed body with the blockquote marker
        const quotedLines = [];
        for (const line of content.trim().split('\n')) {
            quotedLines.push(`> ${line}`);
        }
        return `\n> [!${type}] ${title}\n${quotedLines.join('\n')}\n`;
    }
});
/**
 * Turn every `math[alttext]` descendant of `element` into LaTeX markup,
 * using the `alttext` attribute as the source.
 *
 * Equations inside an `.ltx_eqn_inline` ancestor are wrapped in `$...$`;
 * all others are emitted as a `$$` block. Results are joined by blank lines.
 *
 * @param {Element} element - Container to search.
 * @returns {string} Joined equations, or '' when none are found.
 */
function handleNestedEquations(element) {
    const equations = [];
    for (const math of Array.from(element.querySelectorAll('math[alttext]'))) {
        const source = math.getAttribute('alttext');
        if (!source) {
            // An empty alttext still occupies a slot in the joined output
            equations.push('');
            continue;
        }
        const isInline = math.closest('.ltx_eqn_inline') !== null;
        if (isInline) {
            equations.push(`$${source.trim()}$`);
        }
        else {
            equations.push(`\n$$\n${source.trim()}\n$$`);
        }
    }
    return equations.join('\n\n');
}
/**
 * Serialize a table as HTML with every attribute outside a small allow-list
 * removed. Works on a deep clone, so the element passed in is not modified.
 *
 * @param {Element} element - The table element to serialize.
 * @returns {string} Cleaned HTML with `&lt;`, `&gt;` and `&amp;` decoded.
 */
function cleanupTableHTML(element) {
    const allowedAttributes = new Set(['src', 'href', 'style', 'align', 'width', 'height', 'rowspan', 'colspan', 'bgcolor', 'scope', 'valign', 'headers']);
    // Strip disallowed attributes from `target` and, recursively, from every
    // element child.
    const cleanElement = (target) => {
        // Snapshot the attribute list first: removing while iterating a live
        // collection would skip entries.
        Array.from(target.attributes).forEach(attr => {
            if (!allowedAttributes.has(attr.name)) {
                target.removeAttribute(attr.name);
            }
        });
        target.childNodes.forEach(child => {
            if ((0, utils_1.isElement)(child)) {
                cleanElement(child);
            }
        });
    };
    // Create a clone of the table to avoid modifying the original DOM
    const tableClone = element.cloneNode(true);
    cleanElement(tableClone);
    // outerHTML encodes & as &amp;, which breaks LaTeX alignment
    // characters inside math delimiters. Decode common entities since
    // the output goes into markdown, not back through an HTML parser.
    // &amp; is decoded last so that "&amp;lt;" becomes "&lt;" and is not
    // decoded a second time into "<".
    return tableClone.outerHTML
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&amp;/g, '&');
}
/**
 * Read the LaTeX source stored on an element's attributes.
 *
 * `data-latex` takes precedence over `alttext`; whichever is used is trimmed.
 *
 * @param {Element} element - Element exposing `getAttribute`.
 * @returns {string} The trimmed LaTeX, or '' when neither attribute has a value.
 */
function extractLatex(element) {
    const dataLatex = element.getAttribute('data-latex');
    const altText = element.getAttribute('alttext');
    const source = dataLatex || altText;
    return source ? source.trim() : '';
}
try {
let markdown = turndownService.turndown(content);
// Remove the title from the beginning of the content if it exists
const titleMatch = markdown.match(/^# .+\n+/);
if (titleMatch) {
markdown = markdown.slice(titleMatch[0].length);
}
// Remove any empty links e.g. [](example.com) that remain, along with surrounding newlines
// But don't affect image links like 
markdown = markdown.replace(/\n*(?<!!)\[]\([^)]+\)\n*/g, '');
// Remove any consecutive newlines more than two
markdown = markdown.replace(/\n{3,}/g, '\n\n');
// Append footnotes at the end of the document
if (Object.keys(footnotes).length > 0) {
markdown += '\n\n---\n\n';
for (const [id, content] of Object.entries(footnotes)) {
markdown += `[^${id}]: ${content}\n\n`;
}
}
return markdown.trim();
}
catch (error) {
console.error('Error converting HTML to Markdown:', error);
console.log('Problematic content:', content.substring(0, 1000) + '...');
return `Partial conversion completed with errors. Original HTML:\n\n${content}`;
}
}
/**
 * Attach a Markdown rendering of `result.content` to `result`, in place.
 *
 * With `options.markdown` the Markdown replaces `result.content`; otherwise,
 * with `options.separateMarkdown`, it is stored on `result.contentMarkdown`.
 * When neither option is set, `result` is left untouched.
 *
 * @param {object} result - Object whose `content` is converted.
 * @param {object} options - Flags `markdown` and `separateMarkdown`.
 * @param {string} [url] - Forwarded to `createMarkdownContent`.
 * @returns {void}
 */
function toMarkdown(result, options, url) {
    let target = null;
    if (options.markdown) {
        target = 'content';
    }
    else if (options.separateMarkdown) {
        target = 'contentMarkdown';
    }
    if (target === null) {
        return;
    }
    result[target] = createMarkdownContent(result.content, url);
}
//# sourceMappingURL=markdown.js.map