UNPKG

defuddle

Version:

Extract article content and metadata from web pages.

292 lines 13.3 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.MetadataExtractor = void 0;

/** Maximum number of distinct author names joined into a single author string. */
const MAX_AUTHORS = 10;

/** Matches a bracketed array-index path segment such as "[0]" or "[12]". */
const INDEX_SEGMENT = /^\[\d+\]$/;

/** Own-property check that still works when the object shadows or lacks `hasOwnProperty`. */
const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

/** Splits a comma-separated string into trimmed, non-empty parts. */
const splitNames = (value) => value.split(',').map((part) => part.trim()).filter(Boolean);

/** Removes duplicates (keeping first occurrences) and keeps at most MAX_AUTHORS entries. */
const uniqueCapped = (names) => [...new Set(names)].slice(0, MAX_AUTHORS);

/** Returns the hostname of `url` without a leading "www."; throws if `url` cannot be parsed. */
const hostnameOf = (url) => new URL(url).hostname.replace(/^www\./, '');

/**
 * Static helpers that derive page metadata (title, author, site, dates, images, ...)
 * from a document, a schema.org data structure and a list of meta tag records.
 *
 * Meta tag records are read through their `name`, `property` and `content` fields.
 */
class MetadataExtractor {
    /**
     * Builds the metadata record for a document.
     *
     * @param {Document} doc - Document to inspect (uses `location`, `querySelector`, `querySelectorAll`).
     * @param {*} schemaOrgData - schema.org data (object, array or falsy) searched by `getSchemaProperty`.
     * @param {Array<{name?: string, property?: string, content?: string}>} metaTags - Meta tag records.
     * @returns {object} Object with `title`, `description`, `domain`, `favicon`, `image`, `published`,
     *   `author`, `site`, the `schemaOrgData` passed in, and `wordCount` / `parseTime` both set to 0.
     */
    static extract(doc, schemaOrgData, metaTags) {
        let domain = '';
        let url = '';
        try {
            // Prefer the document's own location
            url = doc.location?.href || '';
            // Otherwise fall back to meta tags, schema.org data and the canonical link
            if (!url) {
                url = this.getMetaContent(metaTags, "property", "og:url") ||
                    this.getMetaContent(metaTags, "property", "twitter:url") ||
                    this.getSchemaProperty(schemaOrgData, 'url') ||
                    this.getSchemaProperty(schemaOrgData, 'mainEntityOfPage.url') ||
                    this.getSchemaProperty(schemaOrgData, 'mainEntity.url') ||
                    this.getSchemaProperty(schemaOrgData, 'WebSite.url') ||
                    doc.querySelector('link[rel="canonical"]')?.getAttribute('href') || '';
            }
            if (url) {
                try {
                    domain = hostnameOf(url);
                }
                catch (parseError) {
                    console.warn('Failed to parse URL:', parseError);
                }
            }
        }
        catch (lookupError) {
            // Reached only when one of the lookups above throws (URL parse errors are
            // handled by the inner try); fall back to the <base href> tag.
            const baseTag = doc.querySelector('base[href]');
            if (baseTag) {
                try {
                    url = baseTag.getAttribute('href') || '';
                    domain = hostnameOf(url);
                }
                catch (baseError) {
                    console.warn('Failed to parse base URL:', baseError);
                }
            }
        }
        return {
            title: this.getTitle(doc, schemaOrgData, metaTags),
            description: this.getDescription(doc, schemaOrgData, metaTags),
            domain,
            favicon: this.getFavicon(doc, url, metaTags),
            image: this.getImage(doc, schemaOrgData, metaTags),
            published: this.getPublished(doc, schemaOrgData, metaTags),
            author: this.getAuthor(doc, schemaOrgData, metaTags),
            site: this.getSite(doc, schemaOrgData, metaTags),
            schemaOrgData,
            wordCount: 0,
            parseTime: 0
        };
    }

    /**
     * Resolves the author string. Sources are tried in order and the first non-empty one wins:
     * author meta tags, schema.org `author`, author-like DOM elements, then publisher/site-level
     * fallbacks.
     *
     * @param {Document} doc - Document queried for author-like elements.
     * @param {*} schemaOrgData - schema.org data.
     * @param {Array<object>} metaTags - Meta tag records.
     * @returns {string} Author name(s), comma-separated when several; '' when nothing is found.
     */
    static getAuthor(doc, schemaOrgData, metaTags) {
        // 1. Meta tags - returned verbatim (a single string, possibly comma-separated)
        const metaAuthor = this.getMetaContent(metaTags, "name", "sailthru.author") ||
            this.getMetaContent(metaTags, "property", "author") ||
            this.getMetaContent(metaTags, "name", "author") ||
            this.getMetaContent(metaTags, "name", "byl") ||
            this.getMetaContent(metaTags, "name", "authorList");
        if (metaAuthor) {
            return metaAuthor;
        }

        // 2. Schema.org data - deduplicated and capped at MAX_AUTHORS
        const schemaAuthors = this.getSchemaProperty(schemaOrgData, 'author.name') ||
            this.getSchemaProperty(schemaOrgData, 'author.[].name');
        if (schemaAuthors) {
            const schemaNames = splitNames(schemaAuthors);
            if (schemaNames.length > 0) {
                return uniqueCapped(schemaNames).join(', ');
            }
        }

        // 3. DOM elements - skip bare "author" / "authors" labels
        const domAuthorSelectors = [
            '[itemprop="author"]',
            '.author',
            '[href*="author"]',
            '.authors a',
        ];
        const domNames = [];
        for (const selector of domAuthorSelectors) {
            doc.querySelectorAll(selector).forEach((el) => {
                const text = el.textContent;
                if (!text) {
                    return;
                }
                for (const name of splitNames(text)) {
                    const lowered = name.toLowerCase();
                    if (lowered !== 'author' && lowered !== 'authors') {
                        domNames.push(name);
                    }
                }
            });
        }
        if (domNames.length > 0) {
            return uniqueCapped(domNames).join(', ');
        }

        // 4. Fallback meta tags and schema properties (less direct for author names)
        return this.getMetaContent(metaTags, "name", "copyright") ||
            this.getSchemaProperty(schemaOrgData, 'copyrightHolder.name') ||
            this.getMetaContent(metaTags, "property", "og:site_name") ||
            this.getSchemaProperty(schemaOrgData, 'publisher.name') ||
            this.getSchemaProperty(schemaOrgData, 'sourceOrganization.name') ||
            this.getSchemaProperty(schemaOrgData, 'isPartOf.name') ||
            this.getMetaContent(metaTags, "name", "twitter:creator") ||
            this.getMetaContent(metaTags, "name", "application-name") ||
            '';
    }

    /**
     * Resolves the site / publisher name from schema.org data and meta tags, using the
     * author string as the last resort.
     *
     * @returns {string} Site name, or '' when nothing is found.
     */
    static getSite(doc, schemaOrgData, metaTags) {
        return (this.getSchemaProperty(schemaOrgData, 'publisher.name') ||
            this.getMetaContent(metaTags, "property", "og:site_name") ||
            this.getSchemaProperty(schemaOrgData, 'WebSite.name') ||
            this.getSchemaProperty(schemaOrgData, 'sourceOrganization.name') ||
            this.getMetaContent(metaTags, "name", "copyright") ||
            this.getSchemaProperty(schemaOrgData, 'copyrightHolder.name') ||
            this.getSchemaProperty(schemaOrgData, 'isPartOf.name') ||
            this.getMetaContent(metaTags, "name", "application-name") ||
            this.getAuthor(doc, schemaOrgData, metaTags) ||
            '');
    }

    /**
     * Resolves the page title from meta tags, the schema.org `headline` or the `<title>`
     * element, then strips the site name from it via `cleanTitle`.
     *
     * @returns {string} Cleaned title, or '' when nothing is found.
     */
    static getTitle(doc, schemaOrgData, metaTags) {
        const rawTitle = (this.getMetaContent(metaTags, "property", "og:title") ||
            this.getMetaContent(metaTags, "name", "twitter:title") ||
            this.getSchemaProperty(schemaOrgData, 'headline') ||
            this.getMetaContent(metaTags, "name", "title") ||
            this.getMetaContent(metaTags, "name", "sailthru.title") ||
            doc.querySelector('title')?.textContent?.trim() ||
            '');
        return this.cleanTitle(rawTitle, this.getSite(doc, schemaOrgData, metaTags));
    }

    /**
     * Removes a leading or trailing site name (separated by `|`, `-`, `–` or `—`) from a title.
     * Matching is case-insensitive and only the first matching pattern is applied.
     *
     * @param {string} title - Raw title.
     * @param {string} siteName - Site name to strip.
     * @returns {string} The trimmed title; `title` is returned unchanged when either argument is empty.
     */
    static cleanTitle(title, siteName) {
        if (!title || !siteName) {
            return title;
        }
        // Escape regex metacharacters so the site name is matched literally
        const siteNameEscaped = siteName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const patterns = [
            `\\s*[\\|\\-–—]\\s*${siteNameEscaped}\\s*$`, // Title | Site Name
            `^\\s*${siteNameEscaped}\\s*[\\|\\-–—]\\s*`, // Site Name | Title
        ];
        for (const pattern of patterns) {
            const regex = new RegExp(pattern, 'i');
            if (regex.test(title)) {
                title = title.replace(regex, '');
                break;
            }
        }
        return title.trim();
    }

    /**
     * Resolves the page description from meta tags or the schema.org `description`.
     *
     * @returns {string} Description, or '' when nothing is found.
     */
    static getDescription(doc, schemaOrgData, metaTags) {
        return (this.getMetaContent(metaTags, "name", "description") ||
            this.getMetaContent(metaTags, "property", "description") ||
            this.getMetaContent(metaTags, "property", "og:description") ||
            this.getSchemaProperty(schemaOrgData, 'description') ||
            this.getMetaContent(metaTags, "name", "twitter:description") ||
            this.getMetaContent(metaTags, "name", "sailthru.description") ||
            '');
    }

    /**
     * Resolves the lead image URL from meta tags or the schema.org `image.url`.
     *
     * @returns {string} Image URL, or '' when nothing is found.
     */
    static getImage(doc, schemaOrgData, metaTags) {
        return (this.getMetaContent(metaTags, "property", "og:image") ||
            this.getMetaContent(metaTags, "name", "twitter:image") ||
            this.getSchemaProperty(schemaOrgData, 'image.url') ||
            this.getMetaContent(metaTags, "name", "sailthru.image.full") ||
            '');
    }

    /**
     * Resolves the favicon: the `og:image:favicon` meta tag, then `<link rel="icon">`, then
     * `<link rel="shortcut icon">` (both hrefs returned as written), and finally
     * `/favicon.ico` resolved against `baseUrl`.
     *
     * @param {Document} doc - Document queried for icon links.
     * @param {string} baseUrl - Page URL used to build the default favicon location; may be ''.
     * @param {Array<object>} metaTags - Meta tag records.
     * @returns {string} Favicon URL, or '' when nothing is found.
     */
    static getFavicon(doc, baseUrl, metaTags) {
        const iconFromMeta = this.getMetaContent(metaTags, "property", "og:image:favicon");
        if (iconFromMeta) {
            return iconFromMeta;
        }
        const iconLink = doc.querySelector("link[rel='icon']")?.getAttribute("href");
        if (iconLink) {
            return iconLink;
        }
        const shortcutLink = doc.querySelector("link[rel='shortcut icon']")?.getAttribute("href");
        if (shortcutLink) {
            return shortcutLink;
        }
        // Only try to construct favicon URL if we have a valid base URL
        if (baseUrl) {
            try {
                return new URL("/favicon.ico", baseUrl).href;
            }
            catch (e) {
                console.warn('Failed to construct favicon URL:', e);
            }
        }
        return '';
    }

    /**
     * Resolves the publication date string from schema.org `datePublished`, meta tags,
     * an `abbr[itemprop="datePublished"]` title, or the first `<time>` element.
     *
     * @returns {string} The date as found in the page (not parsed), or '' when nothing is found.
     */
    static getPublished(doc, schemaOrgData, metaTags) {
        return (this.getSchemaProperty(schemaOrgData, 'datePublished') ||
            this.getMetaContent(metaTags, "name", "publishDate") ||
            this.getMetaContent(metaTags, "property", "article:published_time") ||
            doc.querySelector('abbr[itemprop="datePublished"]')?.title?.trim() ||
            this.getTimeElement(doc) ||
            this.getMetaContent(metaTags, "name", "sailthru.date") ||
            '');
    }

    /**
     * Returns the trimmed `content` of the first meta tag whose `name` (when `attr` is
     * 'name') or `property` (any other `attr`) equals `value`, compared case-insensitively.
     *
     * @param {Array<{name?: string, property?: string, content?: string}>} metaTags - Meta tag records.
     * @param {string} attr - 'name' to compare against `tag.name`; anything else compares `tag.property`.
     * @param {string} value - Attribute value to look for.
     * @returns {string} Trimmed content, or "" when no tag matches or the tag has no content.
     */
    static getMetaContent(metaTags, attr, value) {
        const foundTag = metaTags.find((tag) => {
            const attributeValue = attr === 'name' ? tag.name : tag.property;
            return attributeValue?.toLowerCase() === value.toLowerCase();
        });
        return foundTag ? foundTag.content?.trim() ?? "" : "";
    }

    /**
     * Reads the first `<time>` element: its trimmed `datetime` attribute when the attribute
     * is present, otherwise its trimmed text content.
     *
     * @param {Document} doc - Document to query.
     * @returns {string} The value found, or "" when there is no `<time>` element.
     */
    static getTimeElement(doc) {
        const element = Array.from(doc.querySelectorAll(`time`))[0];
        if (!element) {
            return "";
        }
        return element.getAttribute("datetime")?.trim() ?? element.textContent?.trim() ?? "";
    }

    /**
     * Looks up a dot-separated property path in schema.org data.
     *
     * Arrays are traversed transparently (every item is searched); a bracketed segment such
     * as `[1]` selects a single array item instead. An exact search from the root is tried
     * first; if it yields nothing, a second pass also descends into nested objects to find
     * where the path starts. All truthy matches are joined with ', '.
     *
     * @param {*} schemaOrgData - Object or array to search; falsy values yield `defaultValue`.
     * @param {string} property - Dot-separated path, e.g. 'author.name' or 'author.[0].name'.
     * @param {string} [defaultValue=''] - Returned when nothing matches or the search throws.
     * @returns {string} Joined matches, or `defaultValue`.
     */
    static getSchemaProperty(schemaOrgData, property, defaultValue = '') {
        if (!schemaOrgData) {
            return defaultValue;
        }
        const searchSchema = (data, props, isExactMatch = true) => {
            if (typeof data === 'string') {
                // A string only counts as a match once the whole path is consumed
                return props.length === 0 ? [data] : [];
            }
            if (!data || typeof data !== 'object') {
                return [];
            }
            if (Array.isArray(data)) {
                const segment = props[0];
                if (segment !== undefined && INDEX_SEGMENT.test(segment)) {
                    const index = Number.parseInt(segment.slice(1, -1), 10);
                    if (data[index]) {
                        return searchSchema(data[index], props.slice(1), isExactMatch);
                    }
                    return [];
                }
                if (props.length === 0 &&
                    data.every((item) => typeof item === 'string' || typeof item === 'number')) {
                    return data.map(String);
                }
                return data.flatMap((item) => searchSchema(item, props, isExactMatch));
            }
            const [currentProp, ...remainingProps] = props;
            if (!currentProp) {
                // Path ended on an object: represent it by its `name`, if any
                return data.name ? [data.name] : [];
            }
            if (hasOwn(data, currentProp)) {
                return searchSchema(data[currentProp], remainingProps, true);
            }
            if (!isExactMatch) {
                // Loose pass: look for the start of the path inside nested objects
                const nestedResults = [];
                for (const key in data) {
                    if (typeof data[key] === 'object') {
                        nestedResults.push(...searchSchema(data[key], props, false));
                    }
                }
                if (nestedResults.length > 0) {
                    return nestedResults;
                }
            }
            return [];
        };
        try {
            const path = property.split('.');
            let results = searchSchema(schemaOrgData, path, true);
            if (results.length === 0) {
                results = searchSchema(schemaOrgData, path, false);
            }
            return results.length > 0 ? results.filter(Boolean).join(', ') : defaultValue;
        }
        catch (error) {
            console.error(`Error in getSchemaProperty for ${property}:`, error);
            return defaultValue;
        }
    }
}
exports.MetadataExtractor = MetadataExtractor;
//# sourceMappingURL=metadata.js.map