defuddle
Version:
Extract article content and metadata from web pages.
316 lines • 13.9 kB
JavaScript
Object.defineProperty(exports, "__esModule", { value: true });
exports.standardizeFootnotes = standardizeFootnotes;
const constants_1 = require("../constants");
/**
 * Rewrites the footnote markup found inside an element into one uniform shape:
 * inline references become `<sup id="fnref:N"><a href="#fn:N">N</a></sup>` and
 * all footnote bodies are gathered into a single
 * `<div id="footnotes"><ol><li class="footnote" id="fn:N">…</li></ol></div>`
 * appended to the element.
 */
class FootnoteHandler {
    /**
     * @param doc Document used to create the replacement elements.
     */
    constructor(doc) {
        this.doc = doc;
    }

    /**
     * Builds one `<li class="footnote" id="fn:N">` entry.
     *
     * @param footnoteNumber Number used in the item's `fn:` id.
     * @param content Either an HTML string or an element whose markup is copied.
     * @param refs Ids of the inline references; one backlink is emitted per id.
     * @returns The newly created list item (not yet attached anywhere).
     */
    createFootnoteItem(footnoteNumber, content, refs) {
        const doc = typeof content === 'string' ? this.doc : content.ownerDocument;
        const newItem = doc.createElement('li');
        newItem.className = 'footnote';
        newItem.id = `fn:${footnoteNumber}`;

        // Work out which HTML chunks become paragraphs of the new item.
        let htmlChunks;
        if (typeof content === 'string') {
            htmlChunks = [content];
        }
        else {
            const paragraphs = Array.from(content.querySelectorAll('p'));
            // Without any <p> the whole content is wrapped in a single paragraph;
            // otherwise each existing paragraph is copied individually.
            htmlChunks = paragraphs.length === 0
                ? [content.innerHTML]
                : paragraphs.map((p) => p.innerHTML);
        }
        for (const html of htmlChunks) {
            const paragraph = doc.createElement('p');
            paragraph.innerHTML = html;
            newItem.appendChild(paragraph);
        }

        // Backlinks go at the end of the last paragraph.
        const lastParagraph = newItem.querySelector('p:last-of-type') || newItem;
        refs.forEach((refId, index) => {
            const backlink = doc.createElement('a');
            backlink.href = `#${refId}`;
            backlink.title = 'return to article';
            backlink.className = 'footnote-backref';
            // A trailing space separates consecutive backlinks; the last one has none.
            backlink.textContent = index < refs.length - 1 ? '↩ ' : '↩';
            lastParagraph.appendChild(backlink);
        });
        return newItem;
    }

    /**
     * Derives the footnote id from a list item in a footnote list.
     *
     * Known prefixes (`bib.bib`, `fn:`, `fn`) are detected AND stripped on the
     * lower-cased id, so `FN:3` and `fn:3` both yield `3`.
     *
     * @param li List item (or `div[role="listitem"]`) from a footnote list.
     * @returns Lower-cased id, or an empty string when none can be derived.
     */
    extractListItemId(li) {
        const lowerId = li.id.toLowerCase();
        // Order matters: 'fn:' must be tried before the shorter 'fn'.
        for (const prefix of ['bib.bib', 'fn:', 'fn']) {
            if (lowerId.startsWith(prefix)) {
                return lowerId.slice(prefix.length);
            }
        }
        // Nature.com
        if (li.hasAttribute('data-counter')) {
            return li.getAttribute('data-counter')?.replace(/\.$/, '')?.toLowerCase() || '';
        }
        // `cite_note-<id>` in the last path segment of the id, else the raw id.
        const match = li.id.split('/').pop()?.match(/cite_note-(.+)/);
        return match ? match[1].toLowerCase() : lowerId;
    }

    /**
     * Scans the footnote lists inside `element` and numbers each distinct footnote.
     *
     * @param element Root element to search.
     * @returns Object keyed by sequential footnote number (starting at 1); each
     *   value is `{ content, originalId, refs }` with `refs` initially empty.
     */
    collectFootnotes(element) {
        const footnotes = {};
        let footnoteCount = 1;
        const processedIds = new Set(); // Ids already registered; later duplicates are skipped
        const register = (id, content) => {
            if (!id || processedIds.has(id)) {
                return;
            }
            footnotes[footnoteCount] = { content, originalId: id, refs: [] };
            processedIds.add(id);
            footnoteCount++;
        };

        const footnoteLists = element.querySelectorAll(constants_1.FOOTNOTE_LIST_SELECTORS);
        footnoteLists.forEach((list) => {
            // Substack has individual footnote divs with no parent
            if (list.matches('div.footnote[data-component-name="FootnoteToDOM"]')) {
                const anchor = list.querySelector('a.footnote-number');
                const content = list.querySelector('.footnote-content');
                if (anchor && content) {
                    register(anchor.id.replace('footnote-', '').toLowerCase(), content);
                }
                return;
            }
            // Common format using OL/UL and LI elements
            const items = list.querySelectorAll('li, div[role="listitem"]');
            items.forEach((li) => {
                // Citations carrying a `.citations` child whose id starts with "r"
                const citationsDiv = li.querySelector('.citations');
                if (citationsDiv?.id?.toLowerCase().startsWith('r')) {
                    // Prefer the dedicated content node; fall back to the whole item.
                    const citationContent = citationsDiv.querySelector('.citation-content');
                    register(citationsDiv.id.toLowerCase(), citationContent || li);
                }
                else {
                    register(this.extractListItemId(li), li);
                }
            });
        });
        return footnotes;
    }

    /**
     * Walks up from `el` through directly enclosing `<span>`/`<sup>` ancestors.
     *
     * @param el Inline reference element.
     * @returns The outermost span/sup ancestor in that unbroken chain, or `el`
     *   itself when its parent is neither.
     */
    findOuterFootnoteContainer(el) {
        let current = el;
        let parent = el.parentElement;
        while (parent) {
            const tag = parent.tagName.toLowerCase();
            if (tag !== 'span' && tag !== 'sup') {
                break;
            }
            current = parent;
            parent = parent.parentElement;
        }
        return current;
    }

    /**
     * Creates a standardized inline reference,
     * e.g. `<sup id="fnref:1"><a href="#fn:1">1</a></sup>`.
     *
     * @param footnoteNumber Number of the footnote being referenced.
     * @param refId Id assigned to the `<sup>`.
     * @returns The new `<sup>` element.
     */
    createFootnoteReference(footnoteNumber, refId) {
        const sup = this.doc.createElement('sup');
        sup.id = refId;
        const link = this.doc.createElement('a');
        link.href = `#fn:${footnoteNumber}`;
        link.textContent = String(footnoteNumber);
        sup.appendChild(link);
        return sup;
    }

    /**
     * Derives the referenced footnote id from an inline reference element.
     *
     * @param el Element matched by the inline-reference selector.
     * @returns The id (the caller lower-cases it before lookup), or an empty
     *   string when no id can be derived.
     */
    extractReferenceId(el) {
        // Nature.com
        if (el.matches('a[id^="ref-link"]')) {
            return el.textContent?.trim() || '';
        }
        // Science.org
        if (el.matches('a[role="doc-biblioref"]')) {
            const xmlRid = el.getAttribute('data-xml-rid');
            if (xmlRid) {
                return xmlRid;
            }
            const href = el.getAttribute('href');
            return href?.startsWith('#core-R') ? href.replace('#core-', '') : '';
        }
        // Substack
        if (el.matches('a.footnote-anchor, span.footnote-hovercard-target a')) {
            return (el.id?.replace('footnote-anchor-', '') || '').toLowerCase();
        }
        // Arxiv
        if (el.matches('cite.ltx_cite')) {
            const href = el.querySelector('a')?.getAttribute('href');
            const match = href ? href.split('/').pop()?.match(/bib\.bib(\d+)/) : null;
            return match ? match[1].toLowerCase() : '';
        }
        if (el.matches('sup.reference')) {
            // When several links match, the last one determines the id.
            let id = '';
            Array.from(el.querySelectorAll('a')).forEach((link) => {
                const href = link.getAttribute('href');
                const match = href ? href.split('/').pop()?.match(/(?:cite_note|cite_ref)-(.+)/) : null;
                if (match) {
                    id = match[1].toLowerCase();
                }
            });
            return id;
        }
        if (el.matches('sup[id^="fnref:"]')) {
            return el.id.replace('fnref:', '').toLowerCase();
        }
        if (el.matches('sup[id^="fnr"]')) {
            return el.id.replace('fnr', '').toLowerCase();
        }
        if (el.matches('span.footnote-reference, span.footnote-link')) {
            return el.getAttribute('data-footnote-id') || '';
        }
        if (el.matches('a.citation')) {
            return el.textContent?.trim() || '';
        }
        if (el.matches('a[id^="fnref"]')) {
            return el.id.replace('fnref', '').toLowerCase();
        }
        // Other citation types: use the href without its leading '#'.
        const href = el.getAttribute('href');
        return href ? href.replace(/^[#]/, '').toLowerCase() : '';
    }

    /**
     * Standardizes all footnotes inside `element` in place: replaces inline
     * references, removes the original footnote lists and, if any footnote was
     * collected, appends the rebuilt `<div id="footnotes">` list.
     *
     * @param element Root element to process.
     */
    standardizeFootnotes(element) {
        const footnotes = this.collectFootnotes(element);
        // Index [number, data] entries by original id once instead of scanning per reference.
        const entriesById = new Map(Object.entries(footnotes).map((entry) => [entry[1].originalId, entry]));

        const footnoteInlineReferences = element.querySelectorAll(constants_1.FOOTNOTE_INLINE_REFERENCES);
        // References whose outer container is a <sup> are collected per container
        // and swapped in together after the scan.
        const supGroups = new Map();
        footnoteInlineReferences.forEach((el) => {
            if (!el) {
                return;
            }
            const footnoteId = this.extractReferenceId(el);
            if (!footnoteId) {
                return;
            }
            const footnoteEntry = entriesById.get(footnoteId.toLowerCase());
            if (!footnoteEntry) {
                return;
            }
            const [footnoteNumber, footnoteData] = footnoteEntry;
            // First reference gets `fnref:N`; later ones get `fnref:N-2`, `fnref:N-3`, ...
            const refId = footnoteData.refs.length > 0
                ? `fnref:${footnoteNumber}-${footnoteData.refs.length + 1}`
                : `fnref:${footnoteNumber}`;
            footnoteData.refs.push(refId);
            const reference = this.createFootnoteReference(footnoteNumber, refId);
            // Find the outermost container (span or sup)
            const container = this.findOuterFootnoteContainer(el);
            if (container.tagName.toLowerCase() === 'sup') {
                const group = supGroups.get(container);
                if (group) {
                    group.push(reference);
                }
                else {
                    supGroups.set(container, [reference]);
                }
            }
            else {
                // Replace the container directly
                container.replaceWith(reference);
            }
        });

        // Replace each grouped <sup> with its references, each in its own <sup>.
        supGroups.forEach((references, container) => {
            const fragment = this.doc.createDocumentFragment();
            references.forEach((reference) => fragment.appendChild(reference));
            container.replaceWith(fragment);
        });

        // Build the standardized list before removing the originals, because the
        // items copy their markup from the original footnote nodes.
        const newList = this.doc.createElement('div');
        newList.id = 'footnotes';
        const orderedList = this.doc.createElement('ol');
        Object.entries(footnotes).forEach(([number, data]) => {
            const newItem = this.createFootnoteItem(Number.parseInt(number, 10), data.content, data.refs);
            orderedList.appendChild(newItem);
        });

        // Remove original footnote lists
        element
            .querySelectorAll(constants_1.FOOTNOTE_LIST_SELECTORS)
            .forEach((list) => list.remove());

        // If we have any footnotes, add the new list to the document
        if (orderedList.children.length > 0) {
            newList.appendChild(orderedList);
            element.appendChild(newList);
        }
    }
}
/**
 * Public entry point: normalizes every footnote found under `element`.
 *
 * The element's owning document is used to create the replacement nodes. When
 * the element has no owning document, a warning is logged and nothing is changed.
 *
 * @param element The element to standardize footnotes in
 */
function standardizeFootnotes(element) {
    const { ownerDocument } = element;
    if (ownerDocument) {
        // Delegate the actual rewriting to a handler bound to that document.
        new FootnoteHandler(ownerDocument).standardizeFootnotes(element);
        return;
    }
    console.warn('standardizeFootnotes: No document available');
}
//# sourceMappingURL=footnotes.js.map
;