// defuddle
// Version:
// Extract article content and metadata from web pages.
// 426 lines • 19.7 kB
// JavaScript
"use strict";
// Mark the exports object with the `__esModule` flag (module-interop marker
// emitted ahead of the actual exports).
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder for the named export; the class itself is assigned to
// `exports.MetadataExtractor` after its definition further down.
exports.MetadataExtractor = void 0;
/**
 * Collects page-level metadata (title, author, dates, site, language, ...)
 * from three inputs: a DOM document, parsed schema.org data and the list of
 * collected <meta> tags.
 *
 * Every method is static; the class is used purely as a namespace. Methods
 * call each other through `this`, so they must be invoked on the class.
 */
class MetadataExtractor {
    /**
     * Build the metadata record for a document.
     *
     * @param {object} doc - DOM document (or compatible object exposing
     *   `location`, `documentElement`, `querySelector`, `querySelectorAll`).
     * @param {*} schemaOrgData - Parsed schema.org data; may be null/undefined.
     * @param {Array<{name?: string, property?: string, content?: string}>} metaTags
     *   Collected meta tags.
     * @returns {object} Metadata record. `wordCount` and `parseTime` are
     *   always returned as 0 from here.
     */
    static extract(doc, schemaOrgData, metaTags) {
        const { url, domain } = this.resolveUrl(doc, schemaOrgData, metaTags);
        return {
            title: this.getTitle(doc, schemaOrgData, metaTags),
            description: this.getDescription(doc, schemaOrgData, metaTags),
            domain,
            favicon: this.getFavicon(doc, url, metaTags),
            image: this.getImage(doc, schemaOrgData, metaTags),
            language: this.getLanguage(doc, schemaOrgData, metaTags),
            published: this.getPublished(doc, schemaOrgData, metaTags),
            author: this.getAuthor(doc, schemaOrgData, metaTags),
            site: this.getSite(doc, schemaOrgData, metaTags),
            schemaOrgData,
            wordCount: 0,
            parseTime: 0
        };
    }
    /**
     * Determine the page URL and its domain (hostname without a leading "www.").
     *
     * Sources are tried in priority order: document location, og:url,
     * twitter:url, several schema.org paths, the canonical link and finally
     * the <base href>. The first source that parses to a URL with a non-empty
     * hostname wins. A source that is empty, throws, cannot be parsed, or has
     * no hostname (e.g. "about:blank") no longer blocks the later ones.
     *
     * @private
     * @returns {{url: string, domain: string}} When no source yields a
     *   hostname, `url` is the first non-empty candidate and `domain` is ''.
     */
    static resolveUrl(doc, schemaOrgData, metaTags) {
        // Thunks keep evaluation lazy so a match on the location skips the rest.
        const sources = [
            () => doc.location?.href,
            () => this.getMetaContent(metaTags, "property", "og:url"),
            () => this.getMetaContent(metaTags, "property", "twitter:url"),
            () => this.getSchemaProperty(schemaOrgData, 'url'),
            () => this.getSchemaProperty(schemaOrgData, 'mainEntityOfPage.url'),
            () => this.getSchemaProperty(schemaOrgData, 'mainEntity.url'),
            () => this.getSchemaProperty(schemaOrgData, 'WebSite.url'),
            () => doc.querySelector('link[rel="canonical"]')?.getAttribute('href'),
            () => doc.querySelector('base[href]')?.getAttribute('href'),
        ];
        let firstCandidate = '';
        for (const readSource of sources) {
            let candidate = '';
            try {
                candidate = readSource() || '';
            }
            catch (e) {
                console.warn('Failed to read URL source:', e);
                continue;
            }
            if (!candidate) {
                continue;
            }
            firstCandidate = firstCandidate || candidate;
            try {
                const { hostname } = new URL(candidate);
                if (hostname) {
                    return { url: candidate, domain: hostname.replace(/^www\./, '') };
                }
            }
            catch (e) {
                console.warn('Failed to parse URL:', e);
            }
        }
        return { url: firstCandidate, domain: '' };
    }
    /**
     * Find the author(s) of the page.
     *
     * Order of precedence: author meta tags, schema.org `author`, DOM author
     * elements, names/bylines next to the first <h1>, and finally the site name.
     *
     * @returns {string} Author name(s), comma-separated; '' when nothing is found.
     */
    static getAuthor(doc, schemaOrgData, metaTags) {
        return (this.getAuthorFromMeta(metaTags) ||
            this.getAuthorFromSchema(schemaOrgData) ||
            this.getAuthorFromDom(doc) ||
            this.getAuthorNearHeading(doc) ||
            this.getSiteName(schemaOrgData, metaTags));
    }
    /**
     * 1. Author from meta tags.
     *
     * Single-valued tags are returned verbatim. Research-paper style tags
     * (`citation_author`, falling back to `dc.creator`) may repeat; entries
     * written as "Last, First" are flipped to "First Last".
     *
     * @private
     * @returns {string} '' when no tag carries a non-empty value.
     */
    static getAuthorFromMeta(metaTags) {
        const single = this.getMetaContent(metaTags, "name", "sailthru.author") ||
            this.getMetaContent(metaTags, "property", "author") ||
            this.getMetaContent(metaTags, "name", "author") ||
            this.getMetaContent(metaTags, "name", "byl") ||
            this.getMetaContent(metaTags, "name", "authorList");
        if (single) {
            return single;
        }
        // Drop empty contents so an empty tag cannot short-circuit the
        // remaining author strategies with an empty result.
        let listed = this.getMetaContents(metaTags, "name", "citation_author").filter(Boolean);
        if (listed.length === 0) {
            listed = this.getMetaContents(metaTags, "property", "dc.creator").filter(Boolean);
        }
        return listed.map(entry => {
            const parts = /(.*),\s(.*)/.exec(entry);
            return parts ? `${parts[2]} ${parts[1]}` : entry.trim();
        }).join(', ');
    }
    /**
     * 2. Author from schema.org data, de-duplicated and capped at 10 names.
     *
     * @private
     * @returns {string} '' when the schema has no author name.
     */
    static getAuthorFromSchema(schemaOrgData) {
        const raw = this.getSchemaProperty(schemaOrgData, 'author.name') ||
            this.getSchemaProperty(schemaOrgData, 'author.[].name');
        if (!raw) {
            return '';
        }
        const names = raw.split(',').map(part => part.trim()).filter(Boolean);
        return this.uniqueNames(names);
    }
    /**
     * 3. Author from DOM elements, de-duplicated and capped at 10 names.
     *
     * Text equal to "author"/"authors" (any case) is treated as a label and
     * ignored.
     *
     * @private
     * @returns {string} '' when no element yields a name.
     */
    static getAuthorFromDom(doc) {
        // maxMatches: skip ambiguous selectors with too many matches
        // (e.g. testimonials, comments, contributor lists)
        const domAuthorSelectors = [
            { selector: '[itemprop="author"]' },
            { selector: '.author', maxMatches: 3 },
            { selector: '[href*="/author/"]', maxMatches: 3 },
            { selector: '.authors a', maxMatches: 3 },
        ];
        const collected = [];
        for (const { selector, maxMatches } of domAuthorSelectors) {
            const matches = doc.querySelectorAll(selector);
            if (maxMatches && matches.length > maxMatches) {
                continue;
            }
            matches.forEach(el => {
                const text = el.textContent;
                if (!text) {
                    return;
                }
                for (const part of text.split(',')) {
                    const name = part.trim();
                    const lowered = name.toLowerCase();
                    if (name && lowered !== 'author' && lowered !== 'authors') {
                        collected.push(name);
                    }
                }
            });
        }
        return this.uniqueNames(collected);
    }
    /**
     * 4. Author near the first <h1>.
     *
     * First looks at up to three following siblings of the heading: when one
     * contains a date, the first link in it whose text is not itself a date is
     * taken as the author. Otherwise searches for "By ..." bylines among up to
     * three siblings before and after the heading, then does the same for its
     * parent and grandparent.
     *
     * @private
     * @returns {string} '' when nothing matches.
     */
    static getAuthorNearHeading(doc) {
        const h1 = doc.querySelector('h1');
        if (!h1) {
            return '';
        }
        for (const sibling of this.collectSiblings(h1, 'nextElementSibling', 3)) {
            const siblingText = sibling.textContent?.trim() || '';
            if (!this.parseDateText(siblingText)) {
                continue;
            }
            for (const link of sibling.querySelectorAll('a')) {
                const linkText = (link.textContent?.trim() || '').replace(/\u00a0/g, ' ');
                if (linkText.length > 0 && linkText.length < 100 && !this.parseDateText(linkText)) {
                    return linkText;
                }
            }
        }
        let scope = h1;
        for (let depth = 0; depth < 3 && scope; depth++) {
            const neighbours = [
                ...this.collectSiblings(scope, 'previousElementSibling', 3),
                ...this.collectSiblings(scope, 'nextElementSibling', 3),
            ];
            for (const neighbour of neighbours) {
                const byline = this.extractByline(neighbour);
                if (byline) {
                    return byline;
                }
            }
            scope = scope.parentElement;
        }
        return '';
    }
    /**
     * De-duplicate names (keeping first occurrence order), keep at most
     * `limit` of them and join with ", ".
     *
     * @private
     */
    static uniqueNames(names, limit = 10) {
        return [...new Set(names)].slice(0, limit).join(', ');
    }
    /**
     * Collect up to `limit` consecutive siblings of `el`, nearest first.
     *
     * @private
     * @param {object} el - Start element (not included in the result).
     * @param {'previousElementSibling'|'nextElementSibling'} direction
     * @param {number} limit
     * @returns {Array<object>}
     */
    static collectSiblings(el, direction, limit) {
        const found = [];
        let node = el[direction];
        while (node && found.length < limit) {
            found.push(node);
            node = node[direction];
        }
        return found;
    }
    /**
     * Look for a short "By <Name>" text in an element or in any of its
     * <p>/<span>/<address> descendants.
     *
     * "by" is matched in any letter case, but the name must start with an
     * uppercase letter (any script) so that ordinary sentences such as
     * "by clicking here ..." are not mistaken for a byline.
     *
     * @param {object} el - Element to inspect.
     * @returns {string|null} The text after "By", or null when no byline is found.
     */
    static extractByline(el) {
        const candidates = [el, ...el.querySelectorAll('p, span, address')];
        for (const candidate of candidates) {
            // Non-breaking spaces are normalised so "By\u00a0Name" still matches.
            const text = (candidate.textContent?.trim() || '').replace(/\u00a0/g, ' ');
            if (text.length === 0 || text.length >= 50) {
                continue;
            }
            // No `i` flag: it would also make the uppercase class match lowercase.
            const bylineMatch = /^[Bb][Yy]\s+(\p{Lu}.+)$/u.exec(text);
            if (bylineMatch) {
                return bylineMatch[1].trim();
            }
        }
        return null;
    }
    /**
     * Site/publisher name from schema.org data and meta tags only.
     *
     * @returns {string} '' when none of the sources has a value.
     */
    static getSiteName(schemaOrgData, metaTags) {
        return (this.getSchemaProperty(schemaOrgData, 'publisher.name') ||
            this.getMetaContent(metaTags, "property", "og:site_name") ||
            this.getSchemaProperty(schemaOrgData, 'WebSite.name') ||
            this.getSchemaProperty(schemaOrgData, 'sourceOrganization.name') ||
            this.getMetaContent(metaTags, "name", "copyright") ||
            this.getSchemaProperty(schemaOrgData, 'copyrightHolder.name') ||
            this.getSchemaProperty(schemaOrgData, 'isPartOf.name') ||
            this.getMetaContent(metaTags, "name", "application-name") ||
            '');
    }
    /**
     * Site name, falling back to the author when no site name is available.
     *
     * @returns {string}
     */
    static getSite(doc, schemaOrgData, metaTags) {
        return (this.getSiteName(schemaOrgData, metaTags) ||
            this.getAuthor(doc, schemaOrgData, metaTags) ||
            '');
    }
    /**
     * Page title with a leading/trailing site name stripped (see cleanTitle).
     *
     * @returns {string}
     */
    static getTitle(doc, schemaOrgData, metaTags) {
        const rawTitle = (this.getMetaContent(metaTags, "property", "og:title") ||
            this.getMetaContent(metaTags, "name", "twitter:title") ||
            this.getSchemaProperty(schemaOrgData, 'headline') ||
            this.getMetaContent(metaTags, "name", "title") ||
            this.getMetaContent(metaTags, "name", "sailthru.title") ||
            doc.querySelector('title')?.textContent?.trim() ||
            '');
        return this.cleanTitle(rawTitle, this.getSite(doc, schemaOrgData, metaTags));
    }
    /**
     * Remove the site name from the end ("Title | Site") or the start
     * ("Site | Title") of a title. Separators recognised: | - – —.
     * Matching is case-insensitive and only the first matching form is removed.
     *
     * @param {string} title
     * @param {string} siteName
     * @returns {string} The cleaned, trimmed title; `title` unchanged when
     *   either argument is empty.
     */
    static cleanTitle(title, siteName) {
        if (!title || !siteName)
            return title;
        // Escape regex metacharacters so the site name is matched literally.
        const siteNameEscaped = siteName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const patterns = [
            `\\s*[\\|\\-–—]\\s*${siteNameEscaped}\\s*$`, // Title | Site Name
            `^\\s*${siteNameEscaped}\\s*[\\|\\-–—]\\s*`, // Site Name | Title
        ];
        for (const pattern of patterns) {
            const regex = new RegExp(pattern, 'i');
            if (regex.test(title)) {
                title = title.replace(regex, '');
                break;
            }
        }
        return title.trim();
    }
    /**
     * Page description from meta tags or schema.org data.
     *
     * @returns {string} '' when none is found.
     */
    static getDescription(doc, schemaOrgData, metaTags) {
        return (this.getMetaContent(metaTags, "name", "description") ||
            this.getMetaContent(metaTags, "property", "description") ||
            this.getMetaContent(metaTags, "property", "og:description") ||
            this.getSchemaProperty(schemaOrgData, 'description') ||
            this.getMetaContent(metaTags, "name", "twitter:description") ||
            this.getMetaContent(metaTags, "name", "sailthru.description") ||
            '');
    }
    /**
     * Representative image URL from meta tags or schema.org data.
     *
     * @returns {string} '' when none is found.
     */
    static getImage(doc, schemaOrgData, metaTags) {
        return (this.getMetaContent(metaTags, "property", "og:image") ||
            this.getMetaContent(metaTags, "name", "twitter:image") ||
            this.getSchemaProperty(schemaOrgData, 'image.url') ||
            this.getMetaContent(metaTags, "name", "sailthru.image.full") ||
            '');
    }
    /**
     * Document language code, with underscores normalised to hyphens.
     *
     * @returns {string} '' when no source declares a language.
     */
    static getLanguage(doc, schemaOrgData, metaTags) {
        // 1. <html lang="...">
        const htmlLang = doc.documentElement?.getAttribute('lang')?.trim();
        if (htmlLang)
            return this.normalizeLangCode(htmlLang);
        // 2. content-language / og:locale from the collected meta tags
        const contentLang = this.getMetaContent(metaTags, "name", "content-language") ||
            this.getMetaContent(metaTags, "property", "og:locale");
        if (contentLang)
            return this.normalizeLangCode(contentLang);
        // 3. <meta http-equiv="Content-Language">, read directly from the DOM
        const httpEquivLang = doc.querySelector('meta[http-equiv="Content-Language" i]')?.getAttribute('content')?.trim();
        if (httpEquivLang)
            return this.normalizeLangCode(httpEquivLang);
        // 4. Schema.org
        const schemaLang = this.getSchemaProperty(schemaOrgData, 'inLanguage');
        if (schemaLang)
            return this.normalizeLangCode(schemaLang);
        return '';
    }
    /**
     * Normalize language codes to BCP 47 format (e.g. en_US -> en-US)
     *
     * @param {string} code
     * @returns {string}
     */
    static normalizeLangCode(code) {
        // Replace underscores with hyphens (og:locale uses en_US)
        return code.replace(/_/g, '-');
    }
    /**
     * Favicon location: explicit meta tag, then <link rel="icon">, then
     * <link rel="shortcut icon">, then "/favicon.ico" resolved against
     * `baseUrl` when that is an http(s) URL.
     *
     * @param {object} doc
     * @param {string} baseUrl - Page URL used to build the default favicon URL.
     * @param {Array<object>} metaTags
     * @returns {string} href as written in the page (not resolved) or the
     *   constructed default; '' when nothing is available.
     */
    static getFavicon(doc, baseUrl, metaTags) {
        const iconFromMeta = this.getMetaContent(metaTags, "property", "og:image:favicon");
        if (iconFromMeta)
            return iconFromMeta;
        const iconLink = doc.querySelector("link[rel='icon']")?.getAttribute("href");
        if (iconLink)
            return iconLink;
        const shortcutLink = doc.querySelector("link[rel='shortcut icon']")?.getAttribute("href");
        if (shortcutLink)
            return shortcutLink;
        // Only try to construct favicon URL if we have a valid HTTP base URL
        if (baseUrl && /^https?:\/\//.test(baseUrl)) {
            try {
                return new URL("/favicon.ico", baseUrl).href;
            }
            catch (e) {
                // Deliberate best effort: an unparseable base just means no favicon.
            }
        }
        return '';
    }
    /**
     * Publication date from schema.org, meta tags, an <abbr> title, the first
     * <time> element, or date text in up to three siblings after the first <h1>.
     *
     * @returns {string} The raw value from the source, or the string produced
     *   by parseDateText for heading-adjacent dates; '' when nothing is found.
     */
    static getPublished(doc, schemaOrgData, metaTags) {
        const result = this.getSchemaProperty(schemaOrgData, 'datePublished') ||
            this.getMetaContent(metaTags, "name", "publishDate") ||
            this.getMetaContent(metaTags, "property", "article:published_time") ||
            doc.querySelector('abbr[itemprop="datePublished"]')?.title?.trim() ||
            this.getTimeElement(doc) ||
            this.getMetaContent(metaTags, "name", "sailthru.date");
        if (result)
            return result;
        // Look for date text near the article heading
        const h1 = doc.querySelector('h1');
        if (h1) {
            for (const sibling of this.collectSiblings(h1, 'nextElementSibling', 3)) {
                const parsed = this.parseDateText(sibling.textContent?.trim() || '');
                if (parsed)
                    return parsed;
            }
        }
        return '';
    }
    /**
     * First non-empty content among the meta tags whose `name`/`property`
     * equals `value` (case-insensitive).
     *
     * @param {Array<object>} metaTags
     * @param {string} attr - 'name' selects `tag.name`; anything else selects `tag.property`.
     * @param {string} value
     * @returns {string} '' when there is no match or all matches are empty.
     */
    static getMetaContent(metaTags, attr, value) {
        // Skip empty duplicates so a blank tag does not hide a later real value.
        return this.getMetaContents(metaTags, attr, value).find(Boolean) ?? "";
    }
    /**
     * Trimmed contents of every meta tag whose `name`/`property` equals
     * `value` (case-insensitive), in input order. Tags without content
     * contribute ''.
     *
     * @param {Array<object>} metaTags
     * @param {string} attr - 'name' selects `tag.name`; anything else selects `tag.property`.
     * @param {string} value
     * @returns {string[]}
     */
    static getMetaContents(metaTags, attr, value) {
        const wanted = value.toLowerCase();
        return metaTags
            .filter(tag => (attr === 'name' ? tag.name : tag.property)?.toLowerCase() === wanted)
            .map(tag => tag.content?.trim() ?? "");
    }
    /**
     * Value of the first <time> element: its `datetime` attribute, or its text
     * content when the attribute is missing or blank.
     *
     * @returns {string} '' when there is no <time> element or it is empty.
     */
    static getTimeElement(doc) {
        const element = Array.from(doc.querySelectorAll('time'))[0];
        if (!element) {
            return "";
        }
        // `||` (not `??`): an empty datetime="" must fall back to the text.
        return element.getAttribute("datetime")?.trim() || element.textContent?.trim() || "";
    }
    /**
     * Find an English long-form date in free text and convert it to
     * `YYYY-MM-DDT00:00:00+00:00`.
     *
     * Recognised forms: "26 February 2025" (optionally preceded by other text
     * such as a weekday) and "February 26, 2025" (comma optional). Month names
     * are matched case-insensitively. Impossible calendar dates are ignored.
     *
     * @param {string} text
     * @returns {string} '' when no valid date is found.
     */
    static parseDateText(text) {
        // "26 February 2025" or "Wednesday, 26 February 2025"
        const dayFirst = text.match(/\b(\d{1,2})\s+(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{4})\b/i);
        if (dayFirst) {
            const iso = this.toIsoDate(dayFirst[3], dayFirst[2], dayFirst[1]);
            if (iso)
                return iso;
        }
        // "February 26, 2025" or "June 5, 2023"
        const monthFirst = text.match(/\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),?\s+(\d{4})\b/i);
        if (monthFirst) {
            const iso = this.toIsoDate(monthFirst[3], monthFirst[1], monthFirst[2]);
            if (iso)
                return iso;
        }
        return '';
    }
    /**
     * Format matched date parts, rejecting days that do not exist in the
     * given month/year.
     *
     * @private
     * @param {string} year - Four-digit year as matched.
     * @param {string} monthName - English month name, any case; looked up in `this.MONTH_MAP`.
     * @param {string} day - One- or two-digit day as matched.
     * @returns {string} ISO-like timestamp, or '' for an invalid date.
     */
    static toIsoDate(year, monthName, day) {
        const month = this.MONTH_MAP[monthName.toLowerCase()];
        if (!month) {
            return '';
        }
        const monthIndex = Number.parseInt(month, 10) - 1;
        const dayNumber = Number.parseInt(day, 10);
        // Round-trip through Date: out-of-range days roll over into another
        // month, which exposes them as invalid.
        const probe = new Date(0);
        probe.setUTCFullYear(Number.parseInt(year, 10), monthIndex, dayNumber);
        if (probe.getUTCMonth() !== monthIndex || probe.getUTCDate() !== dayNumber) {
            return '';
        }
        return `${year}-${month}-${day.padStart(2, '0')}T00:00:00+00:00`;
    }
    /**
     * Look up a dotted property path (e.g. "author.name", "image.[0].url") in
     * schema.org data.
     *
     * The path is first matched starting at the root (arrays are traversed
     * transparently). If that finds nothing, the path is searched for inside
     * nested objects at any depth. Multiple results are joined with ", ".
     *
     * @param {*} schemaOrgData - Parsed schema.org data (object or array).
     * @param {string} property - Dotted path; `[n]` segments index into arrays.
     * @param {string} [defaultValue=''] - Returned when nothing non-empty is
     *   found or when the lookup throws.
     * @returns {string}
     */
    static getSchemaProperty(schemaOrgData, property, defaultValue = '') {
        if (!schemaOrgData)
            return defaultValue;
        try {
            const path = property.split('.');
            let results = this.searchSchema(schemaOrgData, path, true);
            if (results.length === 0) {
                results = this.searchSchema(schemaOrgData, path, false);
            }
            // Filter before deciding, so all-empty results yield the default.
            const values = results.filter(Boolean);
            return values.length > 0 ? values.join(', ') : defaultValue;
        }
        catch (error) {
            console.error(`Error in getSchemaProperty for ${property}:`, error);
            return defaultValue;
        }
    }
    /**
     * Recursive worker for getSchemaProperty.
     *
     * @private
     * @param {*} data - Current node of the schema data.
     * @param {string[]} props - Remaining path segments.
     * @param {boolean} isExactMatch - When false, a segment missing on the
     *   current object is searched for in its nested objects.
     * @returns {Array<string>} Values found (possibly empty strings).
     */
    static searchSchema(data, props, isExactMatch) {
        if (typeof data === 'string') {
            return props.length === 0 ? [data] : [];
        }
        if (!data || typeof data !== 'object') {
            return [];
        }
        if (Array.isArray(data)) {
            const [head] = props;
            if (/^\[\d+\]$/.test(head)) {
                const index = Number.parseInt(head.slice(1, -1), 10);
                return data[index]
                    ? this.searchSchema(data[index], props.slice(1), isExactMatch)
                    : [];
            }
            if (props.length === 0 && data.every(item => typeof item === 'string' || typeof item === 'number')) {
                return data.map(String);
            }
            return data.flatMap(item => this.searchSchema(item, props, isExactMatch));
        }
        const [currentProp, ...remainingProps] = props;
        if (!currentProp) {
            // Path ended on an object: represent it by its `name`, if any.
            const { name } = data;
            if (!name) {
                return [];
            }
            return typeof name === 'object'
                ? this.searchSchema(name, [], isExactMatch)
                : [String(name)];
        }
        // The data comes from the page, so it may define its own
        // `hasOwnProperty` key; always call the prototype method.
        if (Object.prototype.hasOwnProperty.call(data, currentProp)) {
            // Once a segment matches, the rest of the path is matched exactly.
            return this.searchSchema(data[currentProp], remainingProps, true);
        }
        if (!isExactMatch) {
            return Object.values(data)
                .filter(child => typeof child === 'object')
                .flatMap(child => this.searchSchema(child, props, false));
        }
        return [];
    }
}
// Publish the class as the module's named export.
exports.MetadataExtractor = MetadataExtractor;
// Static lookup table: lowercase English month name -> zero-padded two-digit
// month number. parseDateText reads it through `this.MONTH_MAP`.
MetadataExtractor.MONTH_MAP = {
'january': '01', 'february': '02', 'march': '03', 'april': '04',
'may': '05', 'june': '06', 'july': '07', 'august': '08',
'september': '09', 'october': '10', 'november': '11', 'december': '12'
};
//# sourceMappingURL=metadata.js.map