defuddle
Version:
Extract article content and metadata from web pages.
831 lines • 36.4 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.standardizeContent = standardizeContent;
const constants_1 = require("./constants");
const math_1 = require("./elements/math");
const code_1 = require("./elements/code");
const footnotes_1 = require("./elements/footnotes");
const headings_1 = require("./elements/headings");
const images_1 = require("./elements/images");
const utils_1 = require("./utils");
const ELEMENT_STANDARDIZATION_RULES = [
...math_1.mathRules,
...code_1.codeBlockRules,
...headings_1.headingRules,
...images_1.imageRules,
// Convert divs with paragraph role to actual paragraphs
{
selector: 'div[data-testid^="paragraph"], div[role="paragraph"]',
element: 'p',
transform: (el, doc) => {
const p = doc.createElement('p');
// Copy innerHTML
p.innerHTML = el.innerHTML;
// Copy allowed attributes
Array.from(el.attributes).forEach(attr => {
if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
p.setAttribute(attr.name, attr.value);
}
});
return p;
}
},
// Convert divs with list roles to actual lists
{
selector: 'div[role="list"]',
element: 'ul',
// Custom handler for list type detection and transformation
transform: (el, doc) => {
// First determine if this is an ordered list
const firstItem = el.querySelector('div[role="listitem"] .label');
const label = firstItem?.textContent?.trim() || '';
const isOrdered = label.match(/^\d+\)/);
// Create the appropriate list type
const list = doc.createElement(isOrdered ? 'ol' : 'ul');
// Process each list item
const items = el.querySelectorAll('div[role="listitem"]');
items.forEach(item => {
const li = doc.createElement('li');
const content = item.querySelector('.content');
if (content) {
// Convert any paragraph divs inside content
const paragraphDivs = content.querySelectorAll('div[role="paragraph"]');
paragraphDivs.forEach(div => {
const p = doc.createElement('p');
p.innerHTML = div.innerHTML;
div.replaceWith(p);
});
// Convert any nested lists recursively
const nestedLists = content.querySelectorAll('div[role="list"]');
nestedLists.forEach(nestedList => {
const firstNestedItem = nestedList.querySelector('div[role="listitem"] .label');
const nestedLabel = firstNestedItem?.textContent?.trim() || '';
const isNestedOrdered = nestedLabel.match(/^\d+\)/);
const newNestedList = doc.createElement(isNestedOrdered ? 'ol' : 'ul');
// Process nested items
const nestedItems = nestedList.querySelectorAll('div[role="listitem"]');
nestedItems.forEach(nestedItem => {
const nestedLi = doc.createElement('li');
const nestedContent = nestedItem.querySelector('.content');
if (nestedContent) {
// Convert paragraph divs in nested items
const nestedParagraphs = nestedContent.querySelectorAll('div[role="paragraph"]');
nestedParagraphs.forEach(div => {
const p = doc.createElement('p');
p.innerHTML = div.innerHTML;
div.replaceWith(p);
});
nestedLi.innerHTML = nestedContent.innerHTML;
}
newNestedList.appendChild(nestedLi);
});
nestedList.replaceWith(newNestedList);
});
li.innerHTML = content.innerHTML;
}
list.appendChild(li);
});
return list;
}
},
{
selector: 'div[role="listitem"]',
element: 'li',
// Custom handler for list item content
transform: (el, doc) => {
const content = el.querySelector('.content');
if (!content)
return el;
// Convert any paragraph divs inside content
const paragraphDivs = content.querySelectorAll('div[role="paragraph"]');
paragraphDivs.forEach(div => {
const p = doc.createElement('p');
p.innerHTML = div.innerHTML;
div.replaceWith(p);
});
return content;
}
}
];
function standardizeContent(element, metadata, doc, debug = false) {
standardizeSpaces(element);
// Remove HTML comments
removeHtmlComments(element);
// Handle H1 elements - remove first one and convert others to H2
standardizeHeadings(element, metadata.title, doc);
// Standardize footnotes and citations
(0, footnotes_1.standardizeFootnotes)(element);
// Convert embedded content to standard formats
standardizeElements(element, doc);
// If not debug mode, do the full cleanup
if (!debug) {
// First pass of div flattening
flattenWrapperElements(element, doc);
// Strip unwanted attributes
stripUnwantedAttributes(element, debug);
// Remove empty elements
removeEmptyElements(element);
// Remove trailing headings
removeTrailingHeadings(element);
// Final pass of div flattening after cleanup operations
flattenWrapperElements(element, doc);
// Standardize consecutive br elements
stripExtraBrElements(element);
// Clean up empty lines
removeEmptyLines(element, doc);
}
else {
// In debug mode, still do basic cleanup but preserve structure
stripUnwantedAttributes(element, debug);
removeTrailingHeadings(element);
stripExtraBrElements(element);
(0, utils_1.logDebug)('Debug mode: Skipping div flattening to preserve structure');
}
}
function standardizeSpaces(element) {
const processNode = (node) => {
// Skip pre and code elements
if ((0, utils_1.isElement)(node)) {
const tag = node.tagName.toLowerCase();
if (tag === 'pre' || tag === 'code') {
return;
}
}
// Process text nodes
if ((0, utils_1.isTextNode)(node)) {
const text = node.textContent || '';
// Replace with regular spaces, except when it's a single between words
const newText = text.replace(/\xA0+/g, (match) => {
// If it's a single between word characters, preserve it
if (match.length === 1) {
const prev = node.previousSibling?.textContent?.slice(-1);
const next = node.nextSibling?.textContent?.charAt(0);
if (prev?.match(/\w/) && next?.match(/\w/)) {
return '\xA0';
}
}
return ' '.repeat(match.length);
});
if (newText !== text) {
node.textContent = newText;
}
}
// Process children recursively
if (node.hasChildNodes()) {
Array.from(node.childNodes).forEach(processNode);
}
};
processNode(element);
}
function removeTrailingHeadings(element) {
let removedCount = 0;
const hasContentAfter = (el) => {
// Check if there's any meaningful content after this element
let nextContent = '';
let sibling = el.nextSibling;
// First check direct siblings
while (sibling) {
if ((0, utils_1.isTextNode)(sibling)) { // TEXT_NODE
nextContent += sibling.textContent || '';
}
else if ((0, utils_1.isElement)(sibling)) { // ELEMENT_NODE
// If we find an element sibling, check its content
nextContent += sibling.textContent || '';
}
sibling = sibling.nextSibling;
}
// If we found meaningful content at this level, return true
if (nextContent.trim()) {
return true;
}
// If no content found at this level and we have a parent,
// check for content after the parent
const parent = el.parentElement;
if (parent && parent !== element) {
return hasContentAfter(parent);
}
return false;
};
// Process all headings from bottom to top
const headings = Array.from(element.querySelectorAll('h1, h2, h3, h4, h5, h6'))
.reverse();
headings.forEach(heading => {
if (!hasContentAfter(heading)) {
heading.remove();
removedCount++;
}
else {
// Stop processing once we find a heading with content after it
return;
}
});
if (removedCount > 0) {
(0, utils_1.logDebug)('Removed trailing headings:', removedCount);
}
}
function standardizeHeadings(element, title, doc) {
const normalizeText = (text) => {
return text
.replace(/\u00A0/g, ' ') // Convert non-breaking spaces to regular spaces
.replace(/\s+/g, ' ') // Normalize all whitespace to single spaces
.trim()
.toLowerCase();
};
const h1s = element.getElementsByTagName('h1');
Array.from(h1s).forEach(h1 => {
const h2 = doc.createElement('h2');
h2.innerHTML = h1.innerHTML;
// Copy allowed attributes
Array.from(h1.attributes).forEach(attr => {
if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
h2.setAttribute(attr.name, attr.value);
}
});
h1.parentNode?.replaceChild(h2, h1);
});
// Remove first H2 if it matches title
const h2s = element.getElementsByTagName('h2');
if (h2s.length > 0) {
const firstH2 = h2s[0];
const firstH2Text = normalizeText(firstH2.textContent || '');
const normalizedTitle = normalizeText(title);
if (normalizedTitle && normalizedTitle === firstH2Text) {
firstH2.remove();
}
}
}
function removeHtmlComments(element) {
let removedCount = 0;
// Get all elements and check their child nodes
const allElements = Array.from(element.getElementsByTagName('*'));
// Process each element's child nodes
allElements.forEach(el => {
const childNodes = Array.from(el.childNodes);
childNodes.forEach(node => {
if ((0, utils_1.isCommentNode)(node)) {
node.remove();
removedCount++;
}
});
});
(0, utils_1.logDebug)('Removed HTML comments:', removedCount);
}
function stripUnwantedAttributes(element, debug) {
let attributeCount = 0;
const processElement = (el) => {
// Skip SVG elements - preserve all their attributes
if (el.tagName.toLowerCase() === 'svg' || el.namespaceURI === 'http://www.w3.org/2000/svg') {
return;
}
const attributes = Array.from(el.attributes);
const tag = el.tagName.toLowerCase();
attributes.forEach(attr => {
const attrName = attr.name.toLowerCase();
const attrValue = attr.value;
// Special cases for preserving specific attributes
if (
// Preserve footnote IDs
(attrName === 'id' && (attrValue.startsWith('fnref:') || // Footnote reference
attrValue.startsWith('fn:') || // Footnote content
attrValue === 'footnotes' // Footnotes container
)) ||
// Preserve code block language classes and footnote backref class
(attrName === 'class' && ((tag === 'code' && attrValue.startsWith('language-')) ||
attrValue === 'footnote-backref'))) {
return;
}
// In debug mode, allow debug attributes and data- attributes
if (debug) {
if (!constants_1.ALLOWED_ATTRIBUTES.has(attrName) &&
!constants_1.ALLOWED_ATTRIBUTES_DEBUG.has(attrName) &&
!attrName.startsWith('data-')) {
el.removeAttribute(attr.name);
attributeCount++;
}
}
else {
// In normal mode, only allow standard attributes
if (!constants_1.ALLOWED_ATTRIBUTES.has(attrName)) {
el.removeAttribute(attr.name);
attributeCount++;
}
}
});
};
processElement(element);
element.querySelectorAll('*').forEach(processElement);
(0, utils_1.logDebug)('Stripped attributes:', attributeCount);
}
function removeEmptyElements(element) {
let removedCount = 0;
let iterations = 0;
let keepRemoving = true;
while (keepRemoving) {
iterations++;
keepRemoving = false;
// Get all elements without children, working from deepest first
const emptyElements = Array.from(element.getElementsByTagName('*')).filter(el => {
if (constants_1.ALLOWED_EMPTY_ELEMENTS.has(el.tagName.toLowerCase())) {
return false;
}
// Check if element has only whitespace or
const textContent = el.textContent || '';
const hasOnlyWhitespace = textContent.trim().length === 0;
const hasNbsp = textContent.includes('\u00A0'); // Unicode non-breaking space
// Check if element has no meaningful children
const hasNoChildren = !el.hasChildNodes() ||
(Array.from(el.childNodes).every(node => {
if ((0, utils_1.isTextNode)(node)) { // TEXT_NODE
const nodeText = node.textContent || '';
return nodeText.trim().length === 0 && !nodeText.includes('\u00A0');
}
return false;
}));
// Special case: Check for divs that only contain spans with commas
if (el.tagName.toLowerCase() === 'div') {
const children = Array.from(el.children);
const hasOnlyCommaSpans = children.length > 0 && children.every(child => {
if (child.tagName.toLowerCase() !== 'span')
return false;
const content = child.textContent?.trim() || '';
return content === ',' || content === '' || content === ' ';
});
if (hasOnlyCommaSpans)
return true;
}
return hasOnlyWhitespace && !hasNbsp && hasNoChildren;
});
if (emptyElements.length > 0) {
emptyElements.forEach(el => {
el.remove();
removedCount++;
});
keepRemoving = true;
}
}
(0, utils_1.logDebug)('Removed empty elements:', removedCount, 'iterations:', iterations);
}
function stripExtraBrElements(element) {
let processedCount = 0;
const startTime = Date.now();
// Get all br elements directly
const brElements = Array.from(element.getElementsByTagName('br'));
// Keep track of consecutive br elements
let consecutiveBrs = [];
// Helper to process collected br elements
const processBrs = () => {
if (consecutiveBrs.length > 2) {
// Keep only two br elements
for (let i = 2; i < consecutiveBrs.length; i++) {
consecutiveBrs[i].remove();
processedCount++;
}
}
consecutiveBrs = [];
};
// Process all br elements
brElements.forEach(currentNode => {
// Check if this br is consecutive with previous ones
let isConsecutive = false;
if (consecutiveBrs.length > 0) {
const lastBr = consecutiveBrs[consecutiveBrs.length - 1];
let node = currentNode.previousSibling;
// Skip whitespace text nodes
while (node && (0, utils_1.isTextNode)(node) && !node.textContent?.trim()) {
node = node.previousSibling;
}
if (node === lastBr) {
isConsecutive = true;
}
}
if (isConsecutive) {
consecutiveBrs.push(currentNode);
}
else {
// Process any previously collected brs before starting new group
processBrs();
consecutiveBrs = [currentNode];
}
});
// Process any remaining br elements
processBrs();
const endTime = Date.now();
(0, utils_1.logDebug)('Standardized br elements:', {
removed: processedCount,
processingTime: `${(endTime - startTime).toFixed(2)}ms`
});
}
function removeEmptyLines(element, doc) {
let removedCount = 0;
const startTime = Date.now();
// First pass: remove empty text nodes
const removeEmptyTextNodes = (node) => {
// Skip if inside pre or code
if ((0, utils_1.isElement)(node)) {
const tag = node.tagName.toLowerCase();
if (tag === 'pre' || tag === 'code') {
return;
}
}
// Process children first (depth-first)
const children = Array.from(node.childNodes);
children.forEach(removeEmptyTextNodes);
// Then handle this node
if ((0, utils_1.isTextNode)(node)) {
const text = node.textContent || '';
// If it's completely empty or just special characters/whitespace, remove it
if (!text || text.match(/^[\u200C\u200B\u200D\u200E\u200F\uFEFF\xA0\s]*$/)) {
node.parentNode?.removeChild(node);
removedCount++;
}
else {
// Clean up the text content while preserving important spaces
const newText = text
.replace(/\n{3,}/g, '\n\n') // More than 2 newlines -> 2 newlines
.replace(/^[\n\r\t]+/, '') // Remove leading newlines/tabs (preserve spaces)
.replace(/[\n\r\t]+$/, '') // Remove trailing newlines/tabs (preserve spaces)
.replace(/[ \t]*\n[ \t]*/g, '\n') // Remove spaces around newlines
.replace(/[ \t]{3,}/g, ' ') // 3+ spaces -> 1 space
.replace(/^[ ]+$/, ' ') // Multiple spaces between elements -> single space
.replace(/\s+([,.!?:;])/g, '$1') // Remove spaces before punctuation
// Clean up zero-width characters and multiple non-breaking spaces
.replace(/[\u200C\u200B\u200D\u200E\u200F\uFEFF]+/g, '')
.replace(/(?:\xA0){2,}/g, '\xA0'); // Multiple -> single
if (newText !== text) {
node.textContent = newText;
removedCount += text.length - newText.length;
}
}
}
};
// Second pass: clean up empty elements and normalize spacing
const cleanupEmptyElements = (node) => {
if (!(0, utils_1.isElement)(node))
return;
// Skip pre and code elements
const tag = node.tagName.toLowerCase();
if (tag === 'pre' || tag === 'code') {
return;
}
// Process children first (depth-first)
Array.from(node.childNodes)
.filter(utils_1.isElement)
.forEach(cleanupEmptyElements);
// Then normalize this element's whitespace
node.normalize(); // Combine adjacent text nodes
// Special handling for block elements
const isBlockElement = (0, utils_1.getComputedStyle)(node)?.display === 'block';
// Only remove empty text nodes at the start and end if they contain just newlines/tabs
// For block elements, also remove spaces
const startPattern = isBlockElement ? /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/ : /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
const endPattern = isBlockElement ? /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/ : /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
while (node.firstChild &&
(0, utils_1.isTextNode)(node.firstChild) &&
(node.firstChild.textContent || '').match(startPattern)) {
node.removeChild(node.firstChild);
removedCount++;
}
while (node.lastChild &&
(0, utils_1.isTextNode)(node.lastChild) &&
(node.lastChild.textContent || '').match(endPattern)) {
node.removeChild(node.lastChild);
removedCount++;
}
// Ensure there's a space between inline elements if needed
if (!isBlockElement) {
const children = Array.from(node.childNodes);
for (let i = 0; i < children.length - 1; i++) {
const current = children[i];
const next = children[i + 1];
// Only add space between elements or between element and text
if ((0, utils_1.isElement)(current) || (0, utils_1.isElement)(next)) {
// Get the text content
const nextContent = next.textContent || '';
const currentContent = current.textContent || '';
// Don't add space if:
// 1. Next content starts with punctuation or closing parenthesis
// 2. Current content ends with punctuation or opening parenthesis
// 3. There's already a space
const nextStartsWithPunctuation = nextContent.match(/^[,.!?:;)\]]/);
const currentEndsWithPunctuation = currentContent.match(/[,.!?:;(\[]\s*$/);
const hasSpace = ((0, utils_1.isTextNode)(current) &&
(current.textContent || '').endsWith(' ')) ||
((0, utils_1.isTextNode)(next) &&
(next.textContent || '').startsWith(' '));
// Only add space if none of the above conditions are true
if (!nextStartsWithPunctuation &&
!currentEndsWithPunctuation &&
!hasSpace) {
const space = doc.createTextNode(' ');
node.insertBefore(space, next);
}
}
}
}
};
// Run both passes
removeEmptyTextNodes(element);
cleanupEmptyElements(element);
const endTime = Date.now();
(0, utils_1.logDebug)('Removed empty lines:', {
charactersRemoved: removedCount,
processingTime: `${(endTime - startTime).toFixed(2)}ms`
});
}
function standardizeElements(element, doc) {
let processedCount = 0;
// Convert elements based on standardization rules
ELEMENT_STANDARDIZATION_RULES.forEach(rule => {
const elements = element.querySelectorAll(rule.selector);
elements.forEach(el => {
if (rule.transform) {
// If there's a transform function, use it to create the new element
const transformed = rule.transform(el, doc);
el.replaceWith(transformed);
processedCount++;
}
});
});
// Convert lite-youtube elements
const liteYoutubeElements = element.querySelectorAll('lite-youtube');
liteYoutubeElements.forEach(el => {
const videoId = el.getAttribute('videoid');
if (!videoId)
return;
const iframe = doc.createElement('iframe');
iframe.width = '560';
iframe.height = '315';
iframe.src = `https://www.youtube.com/embed/${videoId}`;
iframe.title = el.getAttribute('videotitle') || 'YouTube video player';
iframe.frameBorder = '0';
iframe.allow = 'accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share';
iframe.setAttribute('allowfullscreen', '');
el.replaceWith(iframe);
processedCount++;
});
(0, utils_1.logDebug)('Converted embedded elements:', processedCount);
}
function flattenWrapperElements(element, doc) {
let processedCount = 0;
const startTime = Date.now();
// Process in batches to maintain performance
let keepProcessing = true;
// Helper function to check if an element directly contains inline content
// This helps prevent unwrapping divs that visually act as paragraphs.
function hasDirectInlineContent(el) {
for (const child of el.childNodes) {
// Check for non-empty text nodes
if ((0, utils_1.isTextNode)(child) && child.textContent?.trim()) {
return true;
}
// Check for element nodes that are considered inline
if ((0, utils_1.isElement)(child) && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())) {
return true;
}
}
return false;
}
const shouldPreserveElement = (el) => {
const tagName = el.tagName.toLowerCase();
// Check if element should be preserved
if (constants_1.PRESERVE_ELEMENTS.has(tagName))
return true;
// Check for semantic roles
const role = el.getAttribute('role');
if (role && ['article', 'main', 'navigation', 'banner', 'contentinfo'].includes(role)) {
return true;
}
// Check for semantic classes
const className = el.className;
if (typeof className === 'string' && className.toLowerCase().match(/(?:article|main|content|footnote|reference|bibliography)/)) {
return true;
}
// Check if element contains mixed content types that should be preserved
const children = Array.from(el.children);
const hasPreservedElements = children.some(child => constants_1.PRESERVE_ELEMENTS.has(child.tagName.toLowerCase()) ||
child.getAttribute('role') === 'article' ||
(child.className && typeof child.className === 'string' &&
child.className.toLowerCase().match(/(?:article|main|content|footnote|reference|bibliography)/)));
if (hasPreservedElements)
return true;
return false;
};
const isWrapperElement = (el) => {
// If it directly contains inline content, it's NOT a wrapper
if (hasDirectInlineContent(el)) {
return false;
}
// Check if it's just empty space
if (!el.textContent?.trim())
return true;
// Check if it only contains other block elements
const children = Array.from(el.children);
if (children.length === 0)
return true;
// Check if all children are block elements
const allBlockElements = children.every(child => {
const tag = child.tagName.toLowerCase();
return constants_1.BLOCK_ELEMENTS.includes(tag) ||
tag === 'p' || tag === 'h1' || tag === 'h2' ||
tag === 'h3' || tag === 'h4' || tag === 'h5' || tag === 'h6' ||
tag === 'ul' || tag === 'ol' || tag === 'pre' || tag === 'blockquote' ||
tag === 'figure';
});
if (allBlockElements)
return true;
// Check for common wrapper patterns
const className = el.className.toLowerCase();
const isWrapper = /(?:wrapper|container|layout|row|col|grid|flex|outer|inner|content-area)/i.test(className);
if (isWrapper)
return true;
// Check if it has excessive whitespace or empty text nodes
const textNodes = Array.from(el.childNodes).filter(node => (0, utils_1.isTextNode)(node) && node.textContent?.trim());
if (textNodes.length === 0)
return true;
// Check if it only contains block elements
const hasOnlyBlockElements = children.length > 0 && !children.some(child => {
const tag = child.tagName.toLowerCase();
return constants_1.INLINE_ELEMENTS.has(tag);
});
if (hasOnlyBlockElements)
return true;
return false;
};
// Function to process a single element
const processElement = (el) => {
// Skip processing if element has been removed or should be preserved
if (!el.isConnected || shouldPreserveElement(el))
return false;
const tagName = el.tagName.toLowerCase();
// Case 1: Element is truly empty (no text content, no child elements) and not self-closing
if (!constants_1.ALLOWED_EMPTY_ELEMENTS.has(tagName) && !el.children.length && !el.textContent?.trim()) {
el.remove();
processedCount++;
return true;
}
// Case 2: Top-level element - be more aggressive
if (el.parentElement === element) {
const children = Array.from(el.children);
const hasOnlyBlockElements = children.length > 0 && !children.some(child => {
const tag = child.tagName.toLowerCase();
return constants_1.INLINE_ELEMENTS.has(tag);
});
if (hasOnlyBlockElements) {
const fragment = doc.createDocumentFragment();
while (el.firstChild) {
fragment.appendChild(el.firstChild);
}
el.replaceWith(fragment);
processedCount++;
return true;
}
}
// Case 3: Wrapper element - merge up aggressively
if (isWrapperElement(el)) {
// Special case: if element only contains block elements, merge them up
const children = Array.from(el.children);
const onlyBlockElements = !children.some(child => {
const tag = child.tagName.toLowerCase();
return constants_1.INLINE_ELEMENTS.has(tag);
});
if (onlyBlockElements) {
const fragment = doc.createDocumentFragment();
while (el.firstChild) {
fragment.appendChild(el.firstChild);
}
el.replaceWith(fragment);
processedCount++;
return true;
}
// Otherwise handle as normal wrapper
const fragment = doc.createDocumentFragment();
while (el.firstChild) {
fragment.appendChild(el.firstChild);
}
el.replaceWith(fragment);
processedCount++;
return true;
}
// Case 4: Element only contains text and/or inline elements - convert to paragraph
const childNodes = Array.from(el.childNodes);
const hasOnlyInlineOrText = childNodes.length > 0 && childNodes.every(child => ((0, utils_1.isTextNode)(child)) ||
((0, utils_1.isElement)(child) && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())));
if (hasOnlyInlineOrText && el.textContent?.trim()) { // Ensure there's actual content
const p = doc.createElement('p');
// Move all children (including inline tags like <font>) to the new <p>
while (el.firstChild) {
p.appendChild(el.firstChild);
}
el.replaceWith(p);
processedCount++;
return true;
}
// Case 5: Element has single child - unwrap only if child is block-level
if (el.children.length === 1) {
const child = el.firstElementChild;
const childTag = child.tagName.toLowerCase();
// Only unwrap if the single child is a block element and not preserved
if (constants_1.BLOCK_ELEMENTS.includes(childTag) && !shouldPreserveElement(child)) {
el.replaceWith(child);
processedCount++;
return true;
}
}
// Case 6: Deeply nested element - merge up
let nestingDepth = 0;
let parent = el.parentElement;
while (parent) {
const parentTag = parent.tagName.toLowerCase();
if (constants_1.BLOCK_ELEMENTS.includes(parentTag)) {
nestingDepth++;
}
parent = parent.parentElement;
}
// Only unwrap if nested AND does not contain direct inline content
if (nestingDepth > 0 && !hasDirectInlineContent(el)) {
const fragment = doc.createDocumentFragment();
while (el.firstChild) {
fragment.appendChild(el.firstChild);
}
el.replaceWith(fragment);
processedCount++;
return true;
}
return false;
};
// First pass: Process top-level wrapper elements
const processTopLevelElements = () => {
const topElements = Array.from(element.children).filter(el => constants_1.BLOCK_ELEMENTS.includes(el.tagName.toLowerCase()));
let modified = false;
topElements.forEach(el => {
if (processElement(el)) {
modified = true;
}
});
return modified;
};
// Second pass: Process remaining wrapper elements from deepest to shallowest
const processRemainingElements = () => {
// Get all wrapper elements
const allElements = Array.from(element.querySelectorAll(constants_1.BLOCK_ELEMENTS.join(',')))
.sort((a, b) => {
// Count nesting depth
const getDepth = (el) => {
let depth = 0;
let parent = el.parentElement;
while (parent) {
const parentTag = parent.tagName.toLowerCase();
if (constants_1.BLOCK_ELEMENTS.includes(parentTag))
depth++;
parent = parent.parentElement;
}
return depth;
};
return getDepth(b) - getDepth(a); // Process deepest first
});
let modified = false;
allElements.forEach(el => {
if (processElement(el)) {
modified = true;
}
});
return modified;
};
// Final cleanup pass - aggressively flatten remaining wrapper elements
const finalCleanup = () => {
const remainingElements = Array.from(element.querySelectorAll(constants_1.BLOCK_ELEMENTS.join(',')));
let modified = false;
remainingElements.forEach(el => {
// Check if element only contains paragraphs
const children = Array.from(el.children);
const onlyParagraphs = children.length > 0 && children.every(child => child.tagName.toLowerCase() === 'p');
// Unwrap if it only contains paragraphs OR is a non-preserved wrapper element
if (onlyParagraphs || (!shouldPreserveElement(el) && isWrapperElement(el))) {
const fragment = doc.createDocumentFragment();
while (el.firstChild) {
fragment.appendChild(el.firstChild);
}
el.replaceWith(fragment);
processedCount++;
modified = true;
}
});
return modified;
};
// Execute all passes until no more changes
do {
keepProcessing = false;
if (processTopLevelElements())
keepProcessing = true;
if (processRemainingElements())
keepProcessing = true;
if (finalCleanup())
keepProcessing = true;
} while (keepProcessing);
const endTime = Date.now();
(0, utils_1.logDebug)('Flattened wrapper elements:', {
count: processedCount,
processingTime: `${(endTime - startTime).toFixed(2)}ms`
});
}
//# sourceMappingURL=standardize.js.map