UNPKG

@devmehq/open-graph-extractor

Version:

Fast, lightweight Open Graph, Twitter Card, and structured data extractor for Node.js with caching and validation

476 lines (475 loc) 18.4 kB
"use strict";
// ---------------------------------------------------------------------------
// TypeScript CommonJS interop helpers (emitted by the compiler).
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
var __exportStar = (this && this.__exportStar) || function(m, exports) {
    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractOpenGraph = extractOpenGraph;
exports.extractOpenGraphAsync = extractOpenGraphAsync;
const cheerio = __importStar(require("cheerio"));
const cache_1 = require("./cache");
const fallback_1 = require("./fallback");
const fields_1 = require("./fields");
const media_1 = require("./media");
const security_1 = require("./security");
const structured_data_1 = require("./structured-data");
const utils_1 = require("./utils");
const validation_1 = require("./validation");
__exportStar(require("./bulk"), exports);
__exportStar(require("./cache"), exports);
__exportStar(require("./media"), exports);
__exportStar(require("./security"), exports);
__exportStar(require("./structured-data"), exports);
// Re-export all types and utilities
__exportStar(require("./types"), exports);
__exportStar(require("./validation"), exports);
// Global cache instance. Created lazily by the first async extraction that
// enables caching; the options of that first call configure it for the
// lifetime of the module.
let globalCache = null;
// Primary language subtags that are written right-to-left.
const RTL_LANGUAGES = new Set(["ar", "he", "iw", "fa", "ur", "yi", "ps", "sd", "ug", "dv", "ckb"]);
/**
 * Copy the content of every `<meta>` tag that matches a known field into `target`.
 *
 * The tag key is taken from `property`, then `name`, then `itemprop`; a tag with
 * none of these is skipped. Matching against `item.property` is case-insensitive.
 * Single-valued fields are overwritten by later tags, multi-valued fields
 * (`item.multiple`) accumulate into an array.
 *
 * @param $ - loaded cheerio document
 * @param metaFields - field descriptors (`property`, `fieldName`, `multiple`)
 * @param target - object that receives the extracted values (mutated in place)
 * @returns number of `<meta>` tags that carried a usable key
 */
function collectMetaTags($, metaFields, target) {
    let tagCount = 0;
    $("meta").each((_index, meta) => {
        const attribs = meta.attribs;
        if (!attribs)
            return;
        // Resolve the key first and guard on the result, so that tags which only
        // carry `itemprop` are not rejected before the fallback is consulted.
        const property = attribs.property || attribs.name || attribs.itemprop || attribs.itemProp;
        if (!property)
            return;
        tagCount++;
        const content = attribs.content || attribs.value;
        const wanted = property.toLowerCase();
        // No early exit: more than one descriptor (e.g. a custom tag) may map
        // the same property to different field names.
        for (const item of metaFields) {
            if (wanted !== item.property.toLowerCase())
                continue;
            if (!item.multiple) {
                target[item.fieldName] = content;
            }
            else if (!target[item.fieldName]) {
                target[item.fieldName] = [content];
            }
            else if (Array.isArray(target[item.fieldName])) {
                target[item.fieldName].push(content);
            }
        }
    });
    return tagCount;
}
/**
 * Derive the text direction from a language tag.
 *
 * The comparison is case-insensitive and looks at the primary subtag only
 * (`"fa-IR"` -> `"fa"`). Any tag starting with `ar` or `he` is still treated as
 * right-to-left, so every tag that was reported as `"rtl"` before stays `"rtl"`.
 *
 * @param lang - value of the `lang` attribute / `content-language` meta tag
 * @returns `"rtl"` or `"ltr"`
 */
function detectTextDirection(lang) {
    const primary = lang.trim().toLowerCase().split(/[-_]/)[0];
    if (primary.startsWith("ar") || primary.startsWith("he")) {
        return "rtl";
    }
    return RTL_LANGUAGES.has(primary) ? "rtl" : "ltr";
}
/**
 * Extract Open Graph data from HTML with optional enhanced features
 * @param input - HTML string or Buffer
 * @param options - extraction options (`customMetaTags`, `onlyGetOpenGraphInfo`, ...)
 * @returns plain object with the extracted fields; undefined values are removed
 */
function extractOpenGraph(input, options) {
    // Legacy extraction for backward compatibility
    let ogObject = {};
    const $ = cheerio.load(input);
    const metaFields = fields_1.fields.concat(options?.customMetaTags ?? []);
    // find all the open graph info in the meta tags
    collectMetaTags($, metaFields, ogObject);
    // set ogImage to ogImageSecureURL/ogImageURL if there is no ogImage
    if (!ogObject.ogImage && ogObject.ogImageSecureURL) {
        ogObject.ogImage = ogObject.ogImageSecureURL;
    }
    else if (!ogObject.ogImage && ogObject.ogImageURL) {
        ogObject.ogImage = ogObject.ogImageURL;
    }
    // formats the multiple media values
    ogObject = (0, media_1.mediaSetup)(ogObject, options);
    // if onlyGetOpenGraphInfo isn't set, run the open graph fallbacks
    if (!options?.onlyGetOpenGraphInfo) {
        ogObject = (0, fallback_1.fallback)(ogObject, options, $);
    }
    // removes any undef
    ogObject = (0, utils_1.removeNestedUndefinedValues)(ogObject);
    return ogObject;
}
/**
 * Extract Open Graph data with all features (async version)
 * @param input - HTML string, Buffer, or URL (URL fetching not implemented)
 * @param options - extraction options
 * @returns result object with `data`, `structuredData`, `errors`, `warnings`,
 *          `confidence`, `confidenceLevel`, `fallbacksUsed` and `metrics`
 */
async function extractOpenGraphAsync(input, options) {
    return extractOpenGraphAsyncImpl(input, options);
}
/**
 * Async extraction implementation (internal)
 *
 * Never rejects for extraction failures: any error thrown while processing is
 * converted into a result with empty `data`, `confidence: 0` and a single
 * critical `EXTRACTION_ERROR` entry appended to `errors`.
 *
 * @param input - HTML string, Buffer, or an `http(s)://` URL string
 * @param options - extraction options
 */
async function extractOpenGraphAsyncImpl(input, options) {
    const startTime = Date.now();
    const metrics = {
        metaTagsFound: 0,
        structuredDataFound: 0,
        imagesFound: 0,
        videosFound: 0,
        fallbacksUsed: [],
    };
    const errors = [];
    const warnings = [];
    const fallbacksUsed = [];
    let structuredData = { jsonLD: [], schemaOrg: {}, microdata: {}, rdfa: {}, dublinCore: {} };
    try {
        // Initialize cache if needed
        if (options?.cache?.enabled && !globalCache) {
            globalCache = (0, cache_1.createCache)({
                enabled: options.cache.enabled,
                ttl: options.cache.ttl || 3600,
                storage: options.cache.storage || "memory",
                maxSize: options.cache.maxSize || 1000,
                keyGenerator: options.cache.keyGenerator,
            });
        }
        let html;
        let url;
        // Check if input is URL or HTML
        if (typeof input === "string" && (input.startsWith("http://") || input.startsWith("https://"))) {
            url = input;
            // Validate URL if security is enabled
            if (options?.security?.validateUrls && !(0, security_1.validateUrl)(url, options.security)) {
                throw new Error(`Invalid or blocked URL: ${url}`);
            }
            // Normalize URL
            if (options?.normalizeUrls) {
                url = (0, security_1.normalizeUrl)(url);
            }
            // Check cache
            if (globalCache && options?.cache?.enabled) {
                const cached = await globalCache.get(url);
                if (cached) {
                    return cached;
                }
            }
            // Fetch HTML - in production, you would use axios or another HTTP client
            // For this library, the user should provide HTML directly or use their own fetching
            throw new Error("URL fetching not implemented. Please fetch HTML and pass it directly.");
        }
        // NOTE: the URL branch above always returns or throws, so `url` is
        // undefined from here on; the `url`-dependent steps below (ogUrl
        // default, cache write) only become active once fetching exists.
        html = typeof input === "string" ? input : input.toString("utf8");
        metrics.htmlSize = Buffer.byteLength(html, "utf8");
        // Sanitize HTML if security is enabled
        if (options?.security?.sanitizeHtml) {
            html = (0, security_1.sanitizeHtml)(html, options.security);
        }
        const htmlParseStart = Date.now();
        const $ = cheerio.load(html);
        const htmlParseTime = Date.now() - htmlParseStart;
        // Extract basic Open Graph data
        const metaExtractionStart = Date.now();
        let ogObject = {};
        const metaFields = fields_1.fields.concat(options?.customMetaTags ?? []);
        // Extract meta tags
        metrics.metaTagsFound = collectMetaTags($, metaFields, ogObject);
        // Extract additional metadata
        ogObject.charset =
            $("meta[charset]").attr("charset") ||
                $('meta[http-equiv="Content-Type"]')
                    .attr("content")
                    ?.match(/charset=([^;]+)/)?.[1];
        ogObject.viewport = $('meta[name="viewport"]').attr("content");
        ogObject.robots = $('meta[name="robots"]').attr("content");
        ogObject.generator = $('meta[name="generator"]').attr("content");
        ogObject.themeColor = $('meta[name="theme-color"]').attr("content");
        ogObject.applicationName = $('meta[name="application-name"]').attr("content");
        ogObject.canonical = $('link[rel="canonical"]').attr("href");
        ogObject.ampUrl = $('link[rel="amphtml"]').attr("href");
        ogObject.manifest = $('link[rel="manifest"]').attr("href");
        ogObject.maskIcon = $('link[rel="mask-icon"]').attr("href");
        ogObject.appleTouchIcon = $('link[rel="apple-touch-icon"]').attr("href");
        const metaExtractionTime = Date.now() - metaExtractionStart;
        // Set URL if available
        if (url && !ogObject.ogUrl) {
            ogObject.ogUrl = url;
        }
        // Extract structured data if enabled
        const structuredDataStart = Date.now();
        if (options?.extractStructuredData) {
            structuredData = (0, structured_data_1.extractStructuredData)($);
            metrics.structuredDataFound = structuredData.jsonLD?.length || 0;
            ogObject = (0, structured_data_1.mergeStructuredDataWithOG)(ogObject, structuredData);
        }
        const structuredDataExtractionTime = Date.now() - structuredDataStart;
        // Extract enhanced media
        if (!options?.onlyGetOpenGraphInfo) {
            // Enhanced image extraction
            const images = (0, media_1.extractAllImages)($);
            metrics.imagesFound = images.length;
            if (images.length > 0 && !ogObject.ogImage) {
                const bestImage = (0, media_1.selectBestImage)(images);
                if (bestImage) {
                    ogObject.ogImage = bestImage;
                    fallbacksUsed.push("image-extraction");
                }
            }
            // Enhanced video extraction
            const video = (0, media_1.extractVideoMetadata)($, ogObject.ogVideo);
            if (video) {
                ogObject.ogVideo = video;
                metrics.videosFound = 1;
            }
            // Enhanced audio extraction
            const audio = (0, media_1.extractAudioMetadata)($);
            if (audio && !ogObject.ogAudio) {
                ogObject.ogAudio = audio.url;
                if (audio.secureUrl)
                    ogObject.ogAudioSecureURL = audio.secureUrl;
                if (audio.type)
                    ogObject.ogAudioType = audio.type;
            }
        }
        // Format media
        ogObject = (0, media_1.mediaSetup)(ogObject, options);
        // Run fallbacks if not disabled
        if (!options?.onlyGetOpenGraphInfo) {
            // Shallow snapshot, used only to see which keys the fallbacks filled in
            const beforeFallback = { ...ogObject };
            ogObject = (0, fallback_1.fallback)(ogObject, options, $);
            // Track which fallbacks were used
            for (const key of Object.keys(ogObject)) {
                if (!beforeFallback[key] && ogObject[key]) {
                    fallbacksUsed.push(`fallback-${key}`);
                }
            }
        }
        // Extract article content if enabled
        if (options?.extractArticleContent) {
            const articleContent = extractArticleContent($);
            if (articleContent) {
                ogObject.articleContent = articleContent.content;
                ogObject.readingTime = articleContent.readingTime;
                ogObject.wordCount = articleContent.wordCount;
            }
        }
        // Detect language if enabled
        if (options?.detectLanguage) {
            const lang = $("html").attr("lang") || $('meta[http-equiv="content-language"]').attr("content");
            if (lang) {
                ogObject.language = lang;
                ogObject.textDirection = detectTextDirection(lang);
            }
        }
        // Clean up undefined values
        ogObject = (0, utils_1.removeNestedUndefinedValues)(ogObject);
        // Sanitize data if security is enabled
        if (options?.security) {
            ogObject = (0, security_1.sanitizeExtractedData)(ogObject, options.security);
        }
        // Validate data if enabled
        const validationStart = Date.now();
        let validationResult = null;
        let socialScore = null;
        if (options?.validateData) {
            validationResult = (0, validation_1.validateOpenGraph)(ogObject);
            const twitterValidation = (0, validation_1.validateTwitterCard)(ogObject);
            // Add validation errors and warnings (Open Graph first, then Twitter)
            errors.push(...validationResult.errors);
            warnings.push(...validationResult.warnings);
            errors.push(...twitterValidation.errors);
            warnings.push(...twitterValidation.warnings);
        }
        if (options?.generateScore) {
            socialScore = (0, validation_1.generateSocialScore)(ogObject);
        }
        const validationTime = Date.now() - validationStart;
        // Calculate confidence score
        const confidence = calculateConfidence(ogObject, errors, warnings);
        const confidenceLevel = getConfidenceLevel(confidence);
        // Complete metrics
        metrics.fallbacksUsed = fallbacksUsed;
        metrics.performance = {
            htmlParseTime,
            metaExtractionTime,
            structuredDataExtractionTime,
            validationTime,
            totalTime: Date.now() - startTime,
        };
        const result = {
            data: ogObject,
            structuredData,
            errors,
            warnings,
            confidence,
            confidenceLevel,
            fallbacksUsed,
            metrics: metrics,
        };
        // Add validation and scoring results if generated
        if (validationResult) {
            result.validation = validationResult;
        }
        if (socialScore) {
            result.socialScore = socialScore;
        }
        // Cache the result if caching is enabled
        if (globalCache && options?.cache?.enabled && url) {
            await globalCache.set(url, result);
        }
        return result;
    }
    catch (error) {
        // Report the failure through the result object instead of rejecting
        const errorMessage = error instanceof Error ? error.message : String(error);
        errors.push({
            code: "EXTRACTION_ERROR",
            message: errorMessage,
            severity: "critical",
            timestamp: new Date(),
        });
        return {
            data: {},
            structuredData,
            errors,
            warnings,
            confidence: 0,
            confidenceLevel: "low",
            fallbacksUsed,
            metrics: {
                ...metrics,
                performance: {
                    htmlParseTime: 0,
                    metaExtractionTime: 0,
                    structuredDataExtractionTime: 0,
                    validationTime: 0,
                    totalTime: Date.now() - startTime,
                },
            },
        };
    }
}
/**
 * Calculate confidence score based on extracted data
 *
 * Starts at 100, subtracts for missing core fields, errors and warnings, adds
 * small bonuses for extra metadata, and clamps the result to 0..100.
 *
 * @param data - extracted Open Graph object
 * @param errors - error entries; `severity` "critical" costs 10, "error" costs 5
 * @param warnings - warning entries; each costs 2
 * @returns score between 0 and 100 (inclusive)
 */
function calculateConfidence(data, errors, warnings) {
    let score = 100;
    // Deduct for missing required fields
    if (!data.ogTitle) {
        score -= 20;
    }
    if (!data.ogType) {
        score -= 15;
    }
    if (!data.ogImage) {
        score -= 15;
    }
    if (!data.ogUrl) {
        score -= 10;
    }
    if (!data.ogDescription) {
        score -= 10;
    }
    // Deduct for errors
    score -= errors.filter((e) => e.severity === "critical").length * 10;
    score -= errors.filter((e) => e.severity === "error").length * 5;
    // Deduct for warnings
    score -= warnings.length * 2;
    // Bonus for additional metadata
    if (data.twitterCard) {
        score += 5;
    }
    if (data.articlePublishedTime) {
        score += 3;
    }
    if (data.canonical) {
        score += 3;
    }
    if (data.favicon) {
        score += 2;
    }
    return Math.max(0, Math.min(100, score));
}
/**
 * Get confidence level from score
 * @param score - confidence score
 * @returns "high" for 80 and above, "medium" for 50 and above, otherwise "low"
 */
function getConfidenceLevel(score) {
    if (score >= 80) {
        return "high";
    }
    if (score >= 50) {
        return "medium";
    }
    return "low";
}
/**
 * Extract article content from HTML
 *
 * Uses the text of the first element matched by a list of common content
 * selectors; if that yields no text, falls back to the longest `<p>`.
 *
 * @param $ - loaded cheerio document
 * @returns `{ content, readingTime, wordCount }` or `null` when no text is found.
 *          `content` is cut to 5000 characters; `wordCount` and `readingTime`
 *          are computed from the full, uncut text.
 */
function extractArticleContent($) {
    // Try to find main content area
    const selectors = [
        "article",
        '[role="main"]',
        "main",
        ".post-content",
        ".entry-content",
        ".article-content",
        ".content",
        "#content",
    ];
    let content = "";
    for (const selector of selectors) {
        const element = $(selector).first();
        if (element.length > 0) {
            content = element.text().trim();
            break;
        }
    }
    if (!content) {
        // Fallback to largest text block
        let maxLength = 0;
        $("p").each((_, element) => {
            const text = $(element).text().trim();
            if (text.length > maxLength) {
                maxLength = text.length;
                content = text;
            }
        });
    }
    if (!content) {
        return null;
    }
    // Calculate reading time (average 200 words per minute)
    const words = content.split(/\s+/).filter((word) => word.length > 0);
    const wordCount = words.length;
    const readingTime = Math.ceil(wordCount / 200);
    return {
        content: content.substring(0, 5000), // Limit content length
        readingTime,
        wordCount,
    };
}