UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

316 lines 13.9 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.standardizeFootnotes = standardizeFootnotes; const constants_1 = require("../constants"); class FootnoteHandler { constructor(doc) { this.doc = doc; } createFootnoteItem(footnoteNumber, content, refs) { const doc = typeof content === 'string' ? this.doc : content.ownerDocument; const newItem = doc.createElement('li'); newItem.className = 'footnote'; newItem.id = `fn:${footnoteNumber}`; // Handle content if (typeof content === 'string') { const paragraph = doc.createElement('p'); paragraph.innerHTML = content; newItem.appendChild(paragraph); } else { // Get all paragraphs from the content const paragraphs = Array.from(content.querySelectorAll('p')); if (paragraphs.length === 0) { // If no paragraphs, wrap content in a paragraph const paragraph = doc.createElement('p'); paragraph.innerHTML = content.innerHTML; newItem.appendChild(paragraph); } else { // Copy existing paragraphs paragraphs.forEach((p) => { const newP = doc.createElement('p'); newP.innerHTML = p.innerHTML; newItem.appendChild(newP); }); } } // Add backlink(s) to the last paragraph const lastParagraph = newItem.querySelector('p:last-of-type') || newItem; refs.forEach((refId, index) => { const backlink = doc.createElement('a'); backlink.href = `#${refId}`; backlink.title = 'return to article'; backlink.className = 'footnote-backref'; backlink.innerHTML = '↩'; if (index < refs.length - 1) { backlink.innerHTML += ' '; } lastParagraph.appendChild(backlink); }); return newItem; } collectFootnotes(element) { const footnotes = {}; let footnoteCount = 1; const processedIds = new Set(); // Track processed IDs // Collect all footnotes and their IDs from footnote lists const footnoteLists = element.querySelectorAll(constants_1.FOOTNOTE_LIST_SELECTORS); footnoteLists.forEach((list) => { // Substack has individual footnote divs with no parent if (list.matches('div.footnote[data-component-name="FootnoteToDOM"]')) { const anchor = list.querySelector('a.footnote-number'); const content = list.querySelector('.footnote-content'); if (anchor && content) { const id = anchor.id.replace('footnote-', '').toLowerCase(); if (id && !processedIds.has(id)) { footnotes[footnoteCount] = { content: content, originalId: id, refs: [] }; processedIds.add(id); footnoteCount++; } } return; } // Common format using OL/UL and LI elements const items = list.querySelectorAll('li, div[role="listitem"]'); items.forEach((li) => { let id = ''; let content = null; // Handle citations with .citations class const citationsDiv = li.querySelector('.citations'); if (citationsDiv?.id?.toLowerCase().startsWith('r')) { id = citationsDiv.id.toLowerCase(); // Look for citation content within the citations div const citationContent = citationsDiv.querySelector('.citation-content'); if (citationContent) { content = citationContent; } } else { // Extract ID from various formats if (li.id.toLowerCase().startsWith('bib.bib')) { id = li.id.replace('bib.bib', '').toLowerCase(); } else if (li.id.toLowerCase().startsWith('fn:')) { id = li.id.replace('fn:', '').toLowerCase(); } else if (li.id.toLowerCase().startsWith('fn')) { id = li.id.replace('fn', '').toLowerCase(); // Nature.com } else if (li.hasAttribute('data-counter')) { id = li.getAttribute('data-counter')?.replace(/\.$/, '')?.toLowerCase() || ''; } else { const match = li.id.split('/').pop()?.match(/cite_note-(.+)/); id = match ? match[1].toLowerCase() : li.id.toLowerCase(); } content = li; } if (id && !processedIds.has(id)) { footnotes[footnoteCount] = { content: content || li, originalId: id, refs: [] }; processedIds.add(id); footnoteCount++; } }); }); return footnotes; } findOuterFootnoteContainer(el) { let current = el; let parent = el.parentElement; // Keep going up until we find an element that's not a span or sup while (parent && (parent.tagName.toLowerCase() === 'span' || parent.tagName.toLowerCase() === 'sup')) { current = parent; parent = parent.parentElement; } return current; } // Every footnote reference should be a sup element with an anchor inside // e.g. <sup id="fnref:1"><a href="#fn:1">1</a></sup> createFootnoteReference(footnoteNumber, refId) { const sup = this.doc.createElement('sup'); sup.id = refId; const link = this.doc.createElement('a'); link.href = `#fn:${footnoteNumber}`; link.textContent = footnoteNumber; sup.appendChild(link); return sup; } standardizeFootnotes(element) { const footnotes = this.collectFootnotes(element); // Standardize inline footnotes using the collected IDs const footnoteInlineReferences = element.querySelectorAll(constants_1.FOOTNOTE_INLINE_REFERENCES); // Group references by their parent sup element const supGroups = new Map(); footnoteInlineReferences.forEach((el) => { if (!el) return; let footnoteId = ''; let footnoteContent = ''; // Extract footnote ID based on element type // Nature.com if (el.matches('a[id^="ref-link"]')) { footnoteId = el.textContent?.trim() || ''; // Science.org } else if (el.matches('a[role="doc-biblioref"]')) { const xmlRid = el.getAttribute('data-xml-rid'); if (xmlRid) { footnoteId = xmlRid; } else { const href = el.getAttribute('href'); if (href?.startsWith('#core-R')) { footnoteId = href.replace('#core-', ''); } } // Substack } else if (el.matches('a.footnote-anchor, span.footnote-hovercard-target a')) { const id = el.id?.replace('footnote-anchor-', '') || ''; if (id) { footnoteId = id.toLowerCase(); } // Arxiv } else if (el.matches('cite.ltx_cite')) { const link = el.querySelector('a'); if (link) { const href = link.getAttribute('href'); if (href) { const match = href.split('/').pop()?.match(/bib\.bib(\d+)/); if (match) { footnoteId = match[1].toLowerCase(); } } } } else if (el.matches('sup.reference')) { const links = el.querySelectorAll('a'); Array.from(links).forEach((link) => { const href = link.getAttribute('href'); if (href) { const match = href.split('/').pop()?.match(/(?:cite_note|cite_ref)-(.+)/); if (match) { footnoteId = match[1].toLowerCase(); } } }); } else if (el.matches('sup[id^="fnref:"]')) { footnoteId = el.id.replace('fnref:', '').toLowerCase(); } else if (el.matches('sup[id^="fnr"]')) { footnoteId = el.id.replace('fnr', '').toLowerCase(); } else if (el.matches('span.footnote-reference')) { footnoteId = el.getAttribute('data-footnote-id') || ''; } else if (el.matches('span.footnote-link')) { footnoteId = el.getAttribute('data-footnote-id') || ''; footnoteContent = el.getAttribute('data-footnote-content') || ''; } else if (el.matches('a.citation')) { footnoteId = el.textContent?.trim() || ''; footnoteContent = el.getAttribute('href') || ''; } else if (el.matches('a[id^="fnref"]')) { footnoteId = el.id.replace('fnref', '').toLowerCase(); } else { // Other citation types const href = el.getAttribute('href'); if (href) { const id = href.replace(/^[#]/, ''); footnoteId = id.toLowerCase(); } } if (footnoteId) { // Find the footnote number by matching the original ID const footnoteEntry = Object.entries(footnotes).find(([_, data]) => data.originalId === footnoteId.toLowerCase()); if (footnoteEntry) { const [footnoteNumber, footnoteData] = footnoteEntry; // Create footnote reference ID const refId = footnoteData.refs.length > 0 ? `fnref:${footnoteNumber}-${footnoteData.refs.length + 1}` : `fnref:${footnoteNumber}`; footnoteData.refs.push(refId); // Find the outermost container (span or sup) const container = this.findOuterFootnoteContainer(el); // If container is a sup, group references if (container.tagName.toLowerCase() === 'sup') { if (!supGroups.has(container)) { supGroups.set(container, []); } const group = supGroups.get(container); group.push(this.createFootnoteReference(footnoteNumber, refId)); } else { // Replace the container directly container.replaceWith(this.createFootnoteReference(footnoteNumber, refId)); } } } }); // Handle grouped references supGroups.forEach((references, container) => { if (references.length > 0) { // Create a document fragment to hold all the references const fragment = this.doc.createDocumentFragment(); // Add each reference as its own sup element references.forEach((ref) => { const link = ref.querySelector('a'); if (link) { const sup = this.doc.createElement('sup'); sup.id = ref.id; sup.appendChild(link.cloneNode(true)); fragment.appendChild(sup); } }); container.replaceWith(fragment); } }); // Create the standardized footnote list const newList = this.doc.createElement('div'); newList.id = 'footnotes'; const orderedList = this.doc.createElement('ol'); // Create footnote items in order Object.entries(footnotes).forEach(([number, data]) => { const newItem = this.createFootnoteItem(parseInt(number), data.content, data.refs); orderedList.appendChild(newItem); }); // Remove original footnote lists const footnoteLists = element.querySelectorAll(constants_1.FOOTNOTE_LIST_SELECTORS); footnoteLists.forEach((list) => list.remove()); // If we have any footnotes, add the new list to the document if (orderedList.children.length > 0) { newList.appendChild(orderedList); element.appendChild(newList); } } } /** * Standardizes footnotes in the given element * @param element The element to standardize footnotes in */ function standardizeFootnotes(element) { // Get the document from the element's ownerDocument const doc = element.ownerDocument; if (!doc) { console.warn('standardizeFootnotes: No document available'); return; } const handler = new FootnoteHandler(doc); handler.standardizeFootnotes(element); } //# sourceMappingURL=footnotes.js.map