UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

619 lines 29.5 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.standardizeFootnotes = standardizeFootnotes;
const constants_1 = require("../constants");
const dom_1 = require("../utils/dom");
/**
 * Rewrites the footnotes found inside an element into one standard shape:
 * in-text references become `<sup id="fnref:N"><a href="#fn:N">N</a></sup>` and
 * all footnote bodies are gathered into `<div id="footnotes"><ol>…</ol></div>`
 * appended to the element.
 */
class FootnoteHandler {
    /**
     * @param doc Document used to create the new reference and list elements.
     */
    constructor(doc) {
        // Container found by the ID-based fallback in collectFootnotes; it holds
        // the original footnote definitions and is removed once they are rebuilt.
        this.genericContainer = null;
        this.doc = doc;
    }
    /**
     * Builds one `<li class="footnote" id="fn:N">` item.
     *
     * @param footnoteNumber Number used in the item's `fn:N` id.
     * @param content Either an HTML string or an element holding the footnote body.
     *                Element content is moved (not copied) into the new item.
     * @param refs Ids of the in-text references; one backlink is appended per id.
     * @returns The new list item.
     */
    createFootnoteItem(footnoteNumber, content, refs) {
        const doc = typeof content === 'string' ? this.doc : content.ownerDocument;
        const newItem = doc.createElement('li');
        newItem.className = 'footnote';
        newItem.id = `fn:${footnoteNumber}`;
        // Handle content
        if (typeof content === 'string') {
            const paragraph = doc.createElement('p');
            paragraph.appendChild((0, dom_1.parseHTML)(doc, content));
            newItem.appendChild(paragraph);
        }
        else {
            // Get all paragraphs from the content
            const paragraphs = Array.from(content.querySelectorAll('p'));
            if (paragraphs.length === 0) {
                // If no paragraphs, wrap content in a paragraph
                const paragraph = doc.createElement('p');
                (0, dom_1.transferContent)(content, paragraph);
                this.removeBackrefs(paragraph);
                newItem.appendChild(paragraph);
            }
            else {
                // Copy existing paragraphs; anything in `content` that is not
                // inside a <p> is not carried over.
                paragraphs.forEach((p) => {
                    const newP = doc.createElement('p');
                    (0, dom_1.transferContent)(p, newP);
                    this.removeBackrefs(newP);
                    newItem.appendChild(newP);
                });
            }
        }
        // Add backlink(s) to the last paragraph
        const lastParagraph = newItem.querySelector('p:last-of-type') || newItem;
        refs.forEach((refId, index) => {
            const backlink = doc.createElement('a');
            backlink.href = `#${refId}`;
            backlink.title = 'return to article';
            backlink.className = 'footnote-backref';
            backlink.textContent = '↩';
            // Separate consecutive backlinks with a space
            if (index < refs.length - 1) {
                backlink.textContent += ' ';
            }
            lastParagraph.appendChild(backlink);
        });
        return newItem;
    }
    /**
     * Collects footnote definitions from the element.
     *
     * First tries the containers matched by FOOTNOTE_LIST_SELECTORS; if that
     * yields nothing, falls back to detecting a container whose children carry
     * the ids targeted by short numeric fragment links.
     *
     * @param element Root element to search.
     * @returns Object keyed by footnote number (starting at 1) whose values are
     *          `{ content, originalId, refs }`; `refs` starts empty.
     */
    collectFootnotes(element) {
        const footnotes = {};
        let footnoteCount = 1;
        const processedIds = new Set(); // Track processed IDs
        // Collect all footnotes and their IDs from footnote lists
        const footnoteLists = element.querySelectorAll(constants_1.FOOTNOTE_LIST_SELECTORS);
        footnoteLists.forEach((list) => {
            // Wikidot uses div.footnotes-footer containing div.footnote-footer children
            if (list.matches('div.footnotes-footer')) {
                const footnoteDivs = list.querySelectorAll('div.footnote-footer');
                footnoteDivs.forEach((div) => {
                    const divId = div.id || '';
                    const match = divId.match(/^footnote-(\d+)$/);
                    if (match) {
                        const id = match[1];
                        if (!processedIds.has(id)) {
                            // Clone the div to avoid modifying the original DOM
                            const clone = div.cloneNode(true);
                            // Remove the back-link anchor
                            const backLink = clone.querySelector('a');
                            if (backLink)
                                backLink.remove();
                            // Get remaining text and strip leading ". "
                            let text = (0, dom_1.serializeHTML)(clone);
                            text = text.replace(/^\s*\.\s*/, '');
                            const contentDiv = element.ownerDocument.createElement('div');
                            contentDiv.appendChild((0, dom_1.parseHTML)(element.ownerDocument, text.trim()));
                            footnotes[footnoteCount] = {
                                content: contentDiv,
                                originalId: id,
                                refs: []
                            };
                            processedIds.add(id);
                            footnoteCount++;
                        }
                    }
                });
                return;
            }
            // Substack has individual footnote divs with no parent
            if (list.matches('div.footnote[data-component-name="FootnoteToDOM"]')) {
                const anchor = list.querySelector('a.footnote-number');
                const content = list.querySelector('.footnote-content');
                if (anchor && content) {
                    const id = anchor.id.replace('footnote-', '').toLowerCase();
                    if (id && !processedIds.has(id)) {
                        footnotes[footnoteCount] = {
                            content: content,
                            originalId: id,
                            refs: []
                        };
                        processedIds.add(id);
                        footnoteCount++;
                    }
                }
                return;
            }
            // Common format using OL/UL and LI elements
            const items = list.querySelectorAll('li, div[role="listitem"]');
            items.forEach((li) => {
                let id = '';
                let content = null;
                // Handle citations with .citations class
                const citationsDiv = li.querySelector('.citations');
                if (citationsDiv?.id?.toLowerCase().startsWith('r')) {
                    id = citationsDiv.id.toLowerCase();
                    // Look for citation content within the citations div
                    const citationContent = citationsDiv.querySelector('.citation-content');
                    if (citationContent) {
                        content = citationContent;
                    }
                }
                else {
                    // Extract ID from various formats. The prefix test is
                    // case-insensitive, so the prefix is cut from the lowercased
                    // id; a case-sensitive replace() would leave "Fn:1" intact.
                    const lowerId = li.id.toLowerCase();
                    if (lowerId.startsWith('bib.bib')) {
                        id = lowerId.slice('bib.bib'.length);
                    }
                    else if (lowerId.startsWith('fn:')) {
                        id = lowerId.slice('fn:'.length);
                    }
                    else if (lowerId.startsWith('fn')) {
                        id = lowerId.slice('fn'.length);
                    }
                    else if (li.hasAttribute('data-counter')) {
                        // Nature.com
                        id = li.getAttribute('data-counter')?.replace(/\.$/, '')?.toLowerCase() || '';
                    }
                    else {
                        const match = li.id.split('/').pop()?.match(/cite_note-(.+)/);
                        id = match ? match[1].toLowerCase() : li.id.toLowerCase();
                    }
                    content = li;
                }
                if (id && !processedIds.has(id)) {
                    footnotes[footnoteCount] = {
                        content: content || li,
                        originalId: id,
                        refs: []
                    };
                    processedIds.add(id);
                    footnoteCount++;
                }
            });
        });
        // Generic fallback: if no footnotes found via selectors, try ID-based detection
        if (footnoteCount === 1) {
            // Step 1: Find all in-text anchors linking to fragment IDs with short numeric text
            const candidateRefs = new Map(); // fragment -> [anchor elements]
            const allAnchors = element.querySelectorAll('a[href*="#"]');
            allAnchors.forEach((a) => {
                const href = a.getAttribute('href') || '';
                const fragment = href.split('#').pop()?.toLowerCase();
                if (!fragment)
                    return;
                const text = a.textContent?.trim() || '';
                if (!/^\[?\(?\d{1,4}\)?\]?$/.test(text))
                    return;
                const parent = a.parentElement;
                if (!parent)
                    return;
                // NOTE(review): this reads like a "must sit inside <sup>/<span>"
                // filter, but the last operand compares the anchor's own tag
                // with 'a', which holds for the elements selected above, so the
                // condition does not reject anything in practice. Left as is
                // because enforcing the parent check would drop plain
                // `<a href="#id">1</a>` references — confirm the intent.
                const parentTag = parent.tagName.toLowerCase();
                if (parentTag !== 'sup' && parentTag !== 'span' && a.tagName.toLowerCase() !== 'a')
                    return;
                if (!candidateRefs.has(fragment)) {
                    candidateRefs.set(fragment, []);
                }
                candidateRefs.get(fragment).push(a);
            });
            if (candidateRefs.size >= 2) {
                // Step 2: Find a container where multiple children have IDs matching our fragments
                const fragmentSet = new Set(candidateRefs.keys());
                const containers = element.querySelectorAll('div, section, aside, footer');
                let bestContainer = null;
                let bestMatchCount = 0;
                containers.forEach((container) => {
                    // Skip containers that are the main content element itself
                    if (container === element)
                        return;
                    const children = container.querySelectorAll('p[id], li[id], div[id]');
                    let matchCount = 0;
                    children.forEach((child) => {
                        if (fragmentSet.has(child.id.toLowerCase())) {
                            matchCount++;
                        }
                    });
                    // `>=` means that on a tie the container visited later wins
                    if (matchCount >= 2 && matchCount >= bestMatchCount) {
                        bestMatchCount = matchCount;
                        bestContainer = container;
                    }
                });
                if (bestContainer) {
                    // Step 3: Extract footnotes from the container
                    const idElements = bestContainer.querySelectorAll('p[id], li[id], div[id]');
                    const orderedElements = [];
                    idElements.forEach((el) => {
                        if (fragmentSet.has(el.id.toLowerCase())) {
                            orderedElements.push(el);
                        }
                    });
                    // Step 4: Handle multi-paragraph footnotes (group consecutive non-ID elements)
                    orderedElements.forEach((el) => {
                        const id = el.id.toLowerCase();
                        if (processedIds.has(id))
                            return;
                        const contentDiv = element.ownerDocument.createElement('div');
                        // Clone the element content
                        const clone = el.cloneNode(true);
                        // Strip a leading footnote number of the form "1. " when
                        // the clone starts with a text node (nodeType 3)
                        const firstText = clone.childNodes[0];
                        if (firstText && firstText.nodeType === 3) {
                            firstText.textContent = firstText.textContent.replace(/^\d+\.\s*/, '');
                        }
                        contentDiv.appendChild(clone);
                        // Check for consecutive siblings without IDs (multi-paragraph footnotes)
                        let sibling = el.nextElementSibling;
                        while (sibling && !sibling.id) {
                            const sibClone = sibling.cloneNode(true);
                            contentDiv.appendChild(sibClone);
                            sibling = sibling.nextElementSibling;
                        }
                        footnotes[footnoteCount] = {
                            content: contentDiv,
                            originalId: id,
                            refs: []
                        };
                        processedIds.add(id);
                        footnoteCount++;
                    });
                    // Step 5: Store container for later removal
                    this.genericContainer = bestContainer;
                }
            }
        }
        return footnotes;
    }
    /**
     * Removes back-reference links from footnote content, then trims trailing
     * text nodes left behind by the removal.
     *
     * A link counts as a backref when its text (variation selectors stripped)
     * consists only of return-arrow characters, or when it has the
     * `footnote-backref` class.
     *
     * @param el Element to clean in place.
     */
    removeBackrefs(el) {
        el.querySelectorAll('a').forEach((a) => {
            const text = a.textContent?.trim().replace(/\uFE0E|\uFE0F/g, '') || '';
            if (/^[\u21A9\u21A5\u2191\u21B5\u2934\u2935\u23CE]+$/.test(text) ||
                a.classList?.contains('footnote-backref')) {
                a.remove();
            }
        });
        // Clean up trailing text nodes that are only whitespace/punctuation
        // (remnants from around removed backref links, e.g. " ." or " , .")
        while (el.lastChild && el.lastChild.nodeType === 3) {
            const text = el.lastChild.textContent;
            if (/^[\s,.;]*$/.test(text)) {
                el.lastChild.remove();
            }
            else {
                break;
            }
        }
    }
    /**
     * Walks up from `el` while the parent is a `<span>` or `<sup>`.
     *
     * @param el Starting element.
     * @returns The outermost element of that span/sup chain, or `el` itself when
     *          its parent is neither.
     */
    findOuterFootnoteContainer(el) {
        let current = el;
        let parent = el.parentElement;
        // Keep going up until we find an element that's not a span or sup
        while (parent && (parent.tagName.toLowerCase() === 'span' ||
            parent.tagName.toLowerCase() === 'sup')) {
            current = parent;
            parent = parent.parentElement;
        }
        return current;
    }
    /**
     * Creates a standardized in-text reference.
     * Every footnote reference should be a sup element with an anchor inside,
     * e.g. <sup id="fnref:1"><a href="#fn:1">1</a></sup>
     *
     * @param footnoteNumber Number shown as link text and used in the `#fn:N` target.
     * @param refId Id assigned to the `<sup>` element.
     * @returns The new `<sup>` element.
     */
    createFootnoteReference(footnoteNumber, refId) {
        const sup = this.doc.createElement('sup');
        sup.id = refId;
        const link = this.doc.createElement('a');
        link.href = `#fn:${footnoteNumber}`;
        link.textContent = footnoteNumber;
        sup.appendChild(link);
        return sup;
    }
    /**
     * Handle CSS sidenote footnotes where content is embedded inline in the text.
     * Pattern: <span class="footnote-container">
     *            <label class="footnote-number"></label>
     *            <input class="margin-toggle">
     *            <span class="footnote">Content...</span>
     *          </span>
     *
     * Each matching container is replaced in the DOM by a standard reference.
     *
     * @param element Root element to search.
     * @returns Object keyed by footnote number (starting at 1) with the cloned
     *          sidenote content and its single reference id.
     */
    collectInlineSidenotes(element) {
        const footnotes = {};
        const containers = element.querySelectorAll('span.footnote-container, span.sidenote-container');
        if (containers.length === 0)
            return footnotes;
        let footnoteCount = 1;
        containers.forEach((container) => {
            const content = container.querySelector('span.footnote, span.sidenote');
            if (!content)
                return;
            // Clone content so we can manipulate it without affecting the DOM
            const contentClone = content.cloneNode(true);
            footnotes[footnoteCount] = {
                content: contentClone,
                originalId: String(footnoteCount),
                refs: [`fnref:${footnoteCount}`]
            };
            // Replace the container with a standard footnote reference
            const ref = this.createFootnoteReference(String(footnoteCount), `fnref:${footnoteCount}`);
            container.replaceWith(ref);
            footnoteCount++;
        });
        return footnotes;
    }
    /**
     * Standardizes all footnotes in `element`, in place: collects definitions,
     * rewrites the in-text references, removes the original footnote containers
     * and appends the rebuilt `<div id="footnotes">` list when it has any item.
     *
     * @param element Root element to process.
     */
    standardizeFootnotes(element) {
        // Handle CSS sidenote footnotes first
        const sidenotes = this.collectInlineSidenotes(element);
        const footnotes = this.collectFootnotes(element);
        // Index footnotes by original id once instead of scanning all entries
        // for every reference. The first entry wins, like Array.prototype.find.
        const footnotesByOriginalId = new Map();
        Object.entries(footnotes).forEach((entry) => {
            const originalId = entry[1].originalId;
            if (!footnotesByOriginalId.has(originalId)) {
                footnotesByOriginalId.set(originalId, entry);
            }
        });
        // Standardize inline footnotes using the collected IDs
        const footnoteInlineReferences = element.querySelectorAll(constants_1.FOOTNOTE_INLINE_REFERENCES);
        // Group references by their parent sup element
        const supGroups = new Map();
        footnoteInlineReferences.forEach((el) => {
            if (!el || !el.parentNode)
                return;
            let footnoteId = '';
            // Extract footnote ID based on element type
            // Wikidot: <sup class="footnoteref"><a id="footnoteref-N" href="javascript:;">N</a></sup>
            if (el.matches('sup.footnoteref')) {
                const link = el.querySelector('a[id^="footnoteref-"]');
                if (link) {
                    const linkId = link.id || '';
                    const match = linkId.match(/^footnoteref-(\d+)$/);
                    if (match) {
                        footnoteId = match[1];
                    }
                }
            }
            else if (el.matches('a[id^="ref-link"]')) {
                // Nature.com
                footnoteId = el.textContent?.trim() || '';
            }
            else if (el.matches('a[role="doc-biblioref"]')) {
                // Science.org
                const xmlRid = el.getAttribute('data-xml-rid');
                if (xmlRid) {
                    footnoteId = xmlRid;
                }
                else {
                    const href = el.getAttribute('href');
                    if (href?.startsWith('#core-R')) {
                        footnoteId = href.replace('#core-', '');
                    }
                }
            }
            else if (el.matches('a.footnote-anchor, span.footnote-hovercard-target a')) {
                // Substack
                const id = el.id?.replace('footnote-anchor-', '') || '';
                if (id) {
                    footnoteId = id.toLowerCase();
                }
            }
            else if (el.matches('cite.ltx_cite')) {
                // Arxiv — handle multi-citation groups (e.g. [35, 2, 5])
                const links = Array.from(el.querySelectorAll('a'));
                if (links.length > 0) {
                    // Process all links in the citation group
                    const refs = [];
                    links.forEach((link) => {
                        const href = link.getAttribute('href');
                        if (!href)
                            return;
                        const match = href.split('/').pop()?.match(/bib\.bib(\d+)/);
                        if (!match)
                            return;
                        const citationId = match[1].toLowerCase();
                        const entry = footnotesByOriginalId.get(citationId);
                        if (!entry)
                            return;
                        const [fnNum, fnData] = entry;
                        const refId = fnData.refs.length > 0
                            ? `fnref:${fnNum}-${fnData.refs.length + 1}`
                            : `fnref:${fnNum}`;
                        fnData.refs.push(refId);
                        refs.push(this.createFootnoteReference(fnNum, refId));
                    });
                    if (refs.length > 0) {
                        const container = this.findOuterFootnoteContainer(el);
                        const fragment = el.ownerDocument.createDocumentFragment();
                        refs.forEach((ref, i) => {
                            if (i > 0) {
                                fragment.appendChild(el.ownerDocument.createTextNode(' '));
                            }
                            fragment.appendChild(ref);
                        });
                        container.replaceWith(fragment);
                        // Skip the default single-footnote handling below
                        return;
                    }
                }
            }
            else if (el.matches('sup.reference')) {
                // When several links match, the last matching one decides the id
                const links = el.querySelectorAll('a');
                Array.from(links).forEach((link) => {
                    const href = link.getAttribute('href');
                    if (href) {
                        const match = href.split('/').pop()?.match(/(?:cite_note|cite_ref)-(.+)/);
                        if (match) {
                            footnoteId = match[1].toLowerCase();
                        }
                    }
                });
            }
            else if (el.matches('sup[id^="fnref:"]')) {
                footnoteId = el.id.replace('fnref:', '').toLowerCase();
            }
            else if (el.matches('sup[id^="fnr"]')) {
                footnoteId = el.id.replace('fnr', '').toLowerCase();
            }
            else if (el.matches('span.footnote-reference')) {
                footnoteId = el.getAttribute('data-footnote-id') || '';
                // LessWrong uses id="fnrefXXX" on the span
                if (!footnoteId && el.id?.startsWith('fnref')) {
                    footnoteId = el.id.replace('fnref', '').toLowerCase();
                }
            }
            else if (el.matches('span.footnote-link')) {
                footnoteId = el.getAttribute('data-footnote-id') || '';
            }
            else if (el.matches('a.citation')) {
                footnoteId = el.textContent?.trim() || '';
            }
            else if (el.matches('a[id^="fnref"]')) {
                footnoteId = el.id.replace('fnref', '').toLowerCase();
            }
            else {
                // Other citation types
                const href = el.getAttribute('href');
                if (href) {
                    const id = href.replace(/^[#]/, '');
                    footnoteId = id.toLowerCase();
                }
            }
            if (footnoteId) {
                // Find the footnote number by matching the original ID
                const footnoteEntry = footnotesByOriginalId.get(footnoteId.toLowerCase());
                if (footnoteEntry) {
                    const [footnoteNumber, footnoteData] = footnoteEntry;
                    // Create footnote reference ID
                    const refId = footnoteData.refs.length > 0
                        ? `fnref:${footnoteNumber}-${footnoteData.refs.length + 1}`
                        : `fnref:${footnoteNumber}`;
                    footnoteData.refs.push(refId);
                    // Find the outermost container (span or sup)
                    const container = this.findOuterFootnoteContainer(el);
                    // If container is a sup, group references
                    if (container.tagName.toLowerCase() === 'sup') {
                        if (!supGroups.has(container)) {
                            supGroups.set(container, []);
                        }
                        const group = supGroups.get(container);
                        group.push(this.createFootnoteReference(footnoteNumber, refId));
                    }
                    else {
                        // Replace the container directly
                        container.replaceWith(this.createFootnoteReference(footnoteNumber, refId));
                    }
                }
            }
        });
        // Fallback: match remaining unmatched footnotes
        const unmatchedFootnotes = Object.entries(footnotes).filter(([_, data]) => data.refs.length === 0);
        if (unmatchedFootnotes.length > 0) {
            // Build lookup maps
            const footnoteIdMap = new Map();
            const footnoteNumMap = new Map();
            unmatchedFootnotes.forEach(([num, data]) => {
                footnoteIdMap.set(data.originalId, [num, data]);
                footnoteNumMap.set(num, [num, data]);
            });
            // Pass 1: Match by fragment link (e.g. <a href="#mn37note01">1</a>)
            const allLinks = element.querySelectorAll('a[href*="#"]');
            allLinks.forEach((link) => {
                if (!link.parentNode)
                    return;
                // Skip if already inside a standardized footnote ref
                const closestFnref = link.closest('[id^="fnref:"]');
                if (closestFnref)
                    return;
                // Skip if inside the footnotes section itself
                const closestFootnotes = link.closest('#footnotes');
                if (closestFootnotes)
                    return;
                // Skip if inside the generic container (footnote definitions)
                if (this.genericContainer && this.genericContainer.contains(link))
                    return;
                const href = link.getAttribute('href') || '';
                const fragment = href.split('#').pop()?.toLowerCase();
                if (!fragment)
                    return;
                const entry = footnoteIdMap.get(fragment);
                if (!entry)
                    return;
                // Validate it looks like a footnote marker
                const text = link.textContent?.trim() || '';
                if (!/^[\[\(]?\d{1,4}[\]\)]?$/.test(text))
                    return;
                const [footnoteNumber, footnoteData] = entry;
                const refId = footnoteData.refs.length > 0
                    ? `fnref:${footnoteNumber}-${footnoteData.refs.length + 1}`
                    : `fnref:${footnoteNumber}`;
                footnoteData.refs.push(refId);
                const container = this.findOuterFootnoteContainer(link);
                container.replaceWith(this.createFootnoteReference(footnoteNumber, refId));
            });
            // Pass 2: Match sup/span elements with numeric text (e.g. <sup class="footnote-ref">1</sup>)
            const stillUnmatched = Object.entries(footnotes).filter(([_, data]) => data.refs.length === 0);
            if (stillUnmatched.length > 0) {
                const supElements = element.querySelectorAll('sup, span.footnote-ref');
                supElements.forEach((el) => {
                    if (!el.parentNode)
                        return;
                    // Skip if already standardized
                    if (el.id?.startsWith('fnref:'))
                        return;
                    // Skip if inside the footnotes section
                    if (el.closest('#footnotes'))
                        return;
                    // Skip if inside the generic container (footnote definitions),
                    // same as Pass 1: that container is removed below, so a marker
                    // inside it must not be recorded as the in-text reference.
                    if (this.genericContainer && this.genericContainer.contains(el))
                        return;
                    const text = el.textContent?.trim() || '';
                    const match = text.match(/^[\[\(]?(\d{1,4})[\]\)]?$/);
                    if (!match)
                        return;
                    const num = match[1];
                    // Match against footnote number or originalId
                    const entry = footnoteNumMap.get(num) || footnoteIdMap.get(num);
                    if (!entry)
                        return;
                    const [footnoteNumber, footnoteData] = entry;
                    if (footnoteData.refs.length > 0)
                        return; // Already matched
                    const refId = `fnref:${footnoteNumber}`;
                    footnoteData.refs.push(refId);
                    const container = this.findOuterFootnoteContainer(el);
                    container.replaceWith(this.createFootnoteReference(footnoteNumber, refId));
                });
            }
        }
        // Handle grouped references
        supGroups.forEach((references, container) => {
            if (references.length > 0) {
                // Create a document fragment to hold all the references
                const fragment = this.doc.createDocumentFragment();
                // Add each reference as its own sup element
                references.forEach((ref) => {
                    const link = ref.querySelector('a');
                    if (link) {
                        const sup = this.doc.createElement('sup');
                        sup.id = ref.id;
                        sup.appendChild(link.cloneNode(true));
                        fragment.appendChild(sup);
                    }
                });
                container.replaceWith(fragment);
            }
        });
        // Create the standardized footnote list
        const newList = this.doc.createElement('div');
        newList.id = 'footnotes';
        const orderedList = this.doc.createElement('ol');
        // Merge sidenotes and regular footnotes.
        // NOTE(review): both maps are numbered from 1, so when a page has both
        // kinds a regular footnote replaces the sidenote with the same number
        // here. Fixing that also needs the reference ids to be renumbered.
        const allFootnotes = { ...sidenotes, ...footnotes };
        // Create footnote items in order
        Object.entries(allFootnotes).forEach(([number, data]) => {
            const newItem = this.createFootnoteItem(Number.parseInt(number, 10), data.content, data.refs);
            orderedList.appendChild(newItem);
        });
        // Remove original footnote lists
        const footnoteLists = element.querySelectorAll(constants_1.FOOTNOTE_LIST_SELECTORS);
        footnoteLists.forEach((list) => list.remove());
        // Remove generically-detected footnote container
        if (this.genericContainer && this.genericContainer.parentNode) {
            this.genericContainer.remove();
        }
        // If we have any footnotes, add the new list to the document
        if (orderedList.children.length > 0) {
            newList.appendChild(orderedList);
            element.appendChild(newList);
        }
    }
}
/**
 * Standardizes footnotes in the given element
 * @param element The element to standardize footnotes in
 */
function standardizeFootnotes(element) {
    // Get the document from the element's ownerDocument
    const doc = element.ownerDocument;
    if (!doc) {
        console.warn('standardizeFootnotes: No document available');
        return;
    }
    const handler = new FootnoteHandler(doc);
    handler.standardizeFootnotes(element);
}
//# sourceMappingURL=footnotes.js.map