UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

192 lines 6.95 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.mathSelectors = exports.isBlockDisplay = exports.getBasicLatexFromElement = exports.getMathMLFromElement = void 0; const getMathMLFromElement = (el) => { // 1. Direct MathML content if (el.tagName.toLowerCase() === 'math') { const isBlock = el.getAttribute('display') === 'block'; return { mathml: el.outerHTML, latex: el.getAttribute('alttext') || null, isBlock }; } // 2. MathML in data-mathml attribute const mathmlStr = el.getAttribute('data-mathml'); if (mathmlStr) { const tempDiv = document.createElement('div'); tempDiv.innerHTML = mathmlStr; const mathElement = tempDiv.querySelector('math'); if (mathElement) { const isBlock = mathElement.getAttribute('display') === 'block'; return { mathml: mathElement.outerHTML, latex: mathElement.getAttribute('alttext') || null, isBlock }; } } // 3. MathJax assistive MathML const assistiveMmlContainer = el.querySelector('.MJX_Assistive_MathML, mjx-assistive-mml'); if (assistiveMmlContainer) { const mathElement = assistiveMmlContainer.querySelector('math'); if (mathElement) { // Check both the math element and container for display mode const mathDisplayAttr = mathElement.getAttribute('display'); const containerDisplayAttr = assistiveMmlContainer.getAttribute('display'); const isBlock = mathDisplayAttr === 'block' || containerDisplayAttr === 'block'; return { mathml: mathElement.outerHTML, latex: mathElement.getAttribute('alttext') || null, isBlock }; } } // 4. KaTeX MathML const katexMathml = el.querySelector('.katex-mathml math'); if (katexMathml) { return { mathml: katexMathml.outerHTML, latex: null, // We'll get LaTeX separately for KaTeX isBlock: false // We'll determine this from container }; } return null; }; exports.getMathMLFromElement = getMathMLFromElement; const getBasicLatexFromElement = (el) => { // Direct data-latex attribute const dataLatex = el.getAttribute('data-latex'); if (dataLatex) { return dataLatex; } // WordPress LaTeX images if (el.tagName.toLowerCase() === 'img' && el.classList.contains('latex')) { // Try alt text first as it's cleaner const altLatex = el.getAttribute('alt'); if (altLatex) { return altLatex; } // Fallback to extracting from URL const src = el.getAttribute('src'); if (src) { const match = src.match(/latex\.php\?latex=([^&]+)/); if (match) { return decodeURIComponent(match[1]) .replace(/\+/g, ' ') // Replace + with spaces .replace(/%5C/g, '\\'); // Fix escaped backslashes } } } // LaTeX in annotation const annotation = el.querySelector('annotation[encoding="application/x-tex"]'); if (annotation?.textContent) { return annotation.textContent.trim(); } // KaTeX formats if (el.matches('.katex')) { const katexAnnotation = el.querySelector('.katex-mathml annotation[encoding="application/x-tex"]'); if (katexAnnotation?.textContent) { return katexAnnotation.textContent.trim(); } } // MathJax scripts if (el.matches('script[type="math/tex"]') || el.matches('script[type="math/tex; mode=display"]')) { return el.textContent?.trim() || null; } // Check for sibling script element if (el.parentElement) { const siblingScript = el.parentElement.querySelector('script[type="math/tex"], script[type="math/tex; mode=display"]'); if (siblingScript) { return siblingScript.textContent?.trim() || null; } } // Fallback to alt text or text content return el.getAttribute('alt') || el.textContent?.trim() || null; }; exports.getBasicLatexFromElement = getBasicLatexFromElement; const isBlockDisplay = (el) => { // Check explicit display attribute const displayAttr = el.getAttribute('display'); if (displayAttr === 'block') { return true; } // Check common class names const classNames = el.className.toLowerCase(); if (classNames.includes('display') || classNames.includes('block')) { return true; } // Check container classes const container = el.closest('.katex-display, .MathJax_Display, [data-display="block"]'); if (container) { return true; } // Check if preceded by block element const prevElement = el.previousElementSibling; if (prevElement?.tagName.toLowerCase() === 'p') { return true; } // Check specific formats if (el.matches('.mwe-math-fallback-image-display')) { return true; } // Check KaTeX display mode if (el.matches('.katex')) { // KaTeX elements are inline by default // Only block if explicitly marked as display return el.closest('.katex-display') !== null; } // Check MathJax v3 display attribute if (el.hasAttribute('display')) { return el.getAttribute('display') === 'true'; } // Check MathJax script display attribute if (el.matches('script[type="math/tex; mode=display"]')) { return true; } if (el.hasAttribute('display')) { return el.getAttribute('display') === 'true'; } // Check parent container display attribute const parentContainer = el.closest('[display]'); if (parentContainer) { return parentContainer.getAttribute('display') === 'true'; } return false; }; exports.isBlockDisplay = isBlockDisplay; // Shared selector for math elements exports.mathSelectors = [ // WordPress LaTeX images 'img.latex[src*="latex.php"]', // MathJax elements (v2 and v3) 'span.MathJax', 'mjx-container', 'script[type="math/tex"]', 'script[type="math/tex; mode=display"]', '.MathJax_Preview + script[type="math/tex"]', '.MathJax_Display', '.MathJax_SVG', '.MathJax_MathML', // MediaWiki math elements '.mwe-math-element', '.mwe-math-fallback-image-inline', '.mwe-math-fallback-image-display', '.mwe-math-mathml-inline', '.mwe-math-mathml-display', // KaTeX elements '.katex', '.katex-display', '.katex-mathml', '.katex-html', '[data-katex]', 'script[type="math/katex"]', // Generic math elements and other formats 'math', '[data-math]', '[data-latex]', '[data-tex]', 'script[type^="math/"]', 'annotation[encoding="application/x-tex"]' ].join(','); //# sourceMappingURL=math.base.js.map