UNPKG

@devmehq/open-graph-extractor

Version:

Fast, lightweight Open Graph, Twitter Card, and structured data extractor for Node.js with caching and validation

527 lines (526 loc) 18.4 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.mediaSetup = mediaSetup; exports.detectImageFormat = detectImageFormat; exports.parseSrcSet = parseSrcSet; exports.extractImageMetadata = extractImageMetadata; exports.extractAllImages = extractAllImages; exports.extractVideoMetadata = extractVideoMetadata; exports.selectBestImage = selectBestImage; exports.extractAudioMetadata = extractAudioMetadata; const fields_1 = require("./fields"); const utils_1 = require("./utils"); const mediaMapperTwitterImage = (item) => ({ url: item[0], width: item[1] || null, height: item[2] || null, alt: item[3] || null, }); const mediaMapperTwitterPlayer = (item) => ({ url: item[0], width: item[1] || null, height: item[2] || null, stream: item[3] || null, }); const mediaMapperMusicSong = (item) => ({ url: item[0] || "", track: item[1] || "", disc: item[2] || "", }); const mediaMapper = (item) => ({ url: item[0], width: item[1] || null, height: item[2] || null, type: item[3] || null, }); const mediaSorter = (a, b) => { if (!(a.url && b.url)) { return 0; } const aRes = a.url.match(/\.(\w{2,5})$/); const aExt = aRes?.[1].toLowerCase() || null; const bRes = b.url.match(/\.(\w{2,5})$/); const bExt = bRes?.[1].toLowerCase() || null; if (aExt === "gif" && bExt !== "gif") { return -1; } if (aExt !== "gif" && bExt === "gif") { return 1; } const aWidth = a.width ? Number.parseInt(a.width, 10) : 0; const aHeight = a.height ? Number.parseInt(a.height, 10) : 0; const bWidth = b.width ? Number.parseInt(b.width, 10) : 0; const bHeight = b.height ? Number.parseInt(b.height, 10) : 0; return Math.max(bWidth, bHeight) - Math.max(aWidth, aHeight); }; const mediaSorterMusicSong = (a, b) => { if (!a.track || !b.track) { return 0; } const aDisc = Number.parseInt(a.disc || "0", 10); const bDisc = Number.parseInt(b.disc || "0", 10); const aTrack = Number.parseInt(a.track, 10); const bTrack = Number.parseInt(b.track, 10); if (aDisc > bDisc) { return 1; } if (aDisc < bDisc) { return -1; } return aTrack - bTrack; }; // lodash zip replacement const zip = (array, ...args) => { if (array === undefined) { return []; } return array.map((value, idx) => [value, ...args.map((arr) => arr[idx])]); }; /* * media setup * @param string ogObject - return open graph info * @param string options - options the user has set * @param function callback */ function mediaSetup(ogObject, options) { // sets ogImage image/width/height/type to null if one This exists if (ogObject.ogImage || ogObject.ogImageWidth || ogObject.twitterImageHeight || ogObject.ogImageType) { ogObject.ogImage = ogObject.ogImage ? ogObject.ogImage : [null]; ogObject.ogImageWidth = ogObject.ogImageWidth ? ogObject.ogImageWidth : [null]; ogObject.ogImageHeight = ogObject.ogImageHeight ? ogObject.ogImageHeight : [null]; ogObject.ogImageType = ogObject.ogImageType ? ogObject.ogImageType : [null]; } // format images const ogImages = zip(ogObject.ogImage, ogObject.ogImageWidth, ogObject.ogImageHeight, ogObject.ogImageType) .map(mediaMapper) .sort(mediaSorter); // sets ogVideo video/width/height/type to null if one this exists if (ogObject.ogVideo || ogObject.ogVideoWidth || ogObject.ogVideoHeight || ogObject.ogVideoType) { ogObject.ogVideo = ogObject.ogVideo ? ogObject.ogVideo : [null]; ogObject.ogVideoWidth = ogObject.ogVideoWidth ? ogObject.ogVideoWidth : [null]; ogObject.ogVideoHeight = ogObject.ogVideoHeight ? ogObject.ogVideoHeight : [null]; ogObject.ogVideoType = ogObject.ogVideoType ? ogObject.ogVideoType : [null]; } // format videos const ogVideos = zip(ogObject.ogVideo, ogObject.ogVideoWidth, ogObject.ogVideoHeight, ogObject.ogVideoType) .map(mediaMapper) .sort(mediaSorter); // sets twitter image/width/height/type to null if one these exists if (ogObject.twitterImageSrc || ogObject.twitterImage || ogObject.twitterImageWidth || ogObject.twitterImageHeight || ogObject.twitterImageAlt) { ogObject.twitterImageSrc = ogObject.twitterImageSrc ? ogObject.twitterImageSrc : [null]; ogObject.twitterImage = ogObject.twitterImage ? ogObject.twitterImage : ogObject.twitterImageSrc; // deafult to twitterImageSrc ogObject.twitterImageWidth = ogObject.twitterImageWidth ? ogObject.twitterImageWidth : [null]; ogObject.twitterImageHeight = ogObject.twitterImageHeight ? ogObject.twitterImageHeight : [null]; ogObject.twitterImageAlt = ogObject.twitterImageAlt ? ogObject.twitterImageAlt : [null]; } // format twitter images const twitterImages = zip(ogObject.twitterImage, ogObject.twitterImageWidth, ogObject.twitterImageHeight, ogObject.twitterImageAlt) .map(mediaMapperTwitterImage) .sort(mediaSorter); // sets twitter player/width/height/stream to null if one these exists if (ogObject.twitterPlayer || ogObject.twitterPlayerWidth || ogObject.twitterPlayerHeight || ogObject.twitterPlayerStream) { ogObject.twitterPlayer = ogObject.twitterPlayer ? ogObject.twitterPlayer : [null]; ogObject.twitterPlayerWidth = ogObject.twitterPlayerWidth ? ogObject.twitterPlayerWidth : [null]; ogObject.twitterPlayerHeight = ogObject.twitterPlayerHeight ? ogObject.twitterPlayerHeight : [null]; ogObject.twitterPlayerStream = ogObject.twitterPlayerStream ? ogObject.twitterPlayerStream : [null]; } // format twitter player const twitterPlayers = zip(ogObject.twitterPlayer, ogObject.twitterPlayerWidth, ogObject.twitterPlayerHeight, ogObject.twitterPlayerStream) .map(mediaMapperTwitterPlayer) .sort(mediaSorter); // sets music song/songTrack/songDisc to null if one This exists if (ogObject.musicSong || ogObject.musicSongTrack || ogObject.musicSongDisc) { ogObject.musicSong = ogObject.musicSong ? ogObject.musicSong : [null]; ogObject.musicSongTrack = ogObject.musicSongTrack ? ogObject.musicSongTrack : [null]; ogObject.musicSongDisc = ogObject.musicSongDisc ? ogObject.musicSongDisc : [null]; } // format music songs const musicSongs = zip(ogObject.musicSong, ogObject.musicSongTrack, ogObject.musicSongDisc) .map(mediaMapperMusicSong) .sort(mediaSorterMusicSong); // remove old values since everything will live under the main property fields_1.fields .filter((item) => item.multiple && item.fieldName && item.fieldName.match("(ogImage|ogVideo|twitter|musicSong).*")) .forEach((item) => { delete ogObject[item.fieldName]; }); if (options?.allMedia) { if (ogImages.length) { ogObject.ogImage = ogImages; } if (ogVideos.length) { ogObject.ogVideo = ogVideos; } if (twitterImages.length) { ogObject.twitterImage = twitterImages; } if (twitterPlayers.length) { ogObject.twitterPlayer = twitterPlayers; } if (musicSongs.length) { ogObject.musicSong = musicSongs; } } else { if (ogImages.length) { [ogObject.ogImage] = ogImages; } if (ogVideos.length) { [ogObject.ogVideo] = ogVideos; } if (twitterImages.length) { [ogObject.twitterImage] = twitterImages; } if (twitterPlayers.length) { [ogObject.twitterPlayer] = twitterPlayers; } if (musicSongs.length) { [ogObject.musicSong] = musicSongs; } } return ogObject; } /** * Detect image format from URL or content type */ function detectImageFormat(url, contentType) { if (contentType) { const format = contentType.split("/")[1]?.toLowerCase(); if (isValidImageFormat(format)) { return format; } } // Try to detect from URL const urlLower = url.toLowerCase(); const extensions = ["jpeg", "jpg", "png", "gif", "webp", "avif", "svg", "bmp", "ico"]; for (const ext of extensions) { if (urlLower.includes(`.${ext}`)) { return ext; } } return undefined; } /** * Check if format is a valid image format */ function isValidImageFormat(format) { if (!format) { return false; } const validFormats = ["jpeg", "jpg", "png", "gif", "webp", "avif", "svg", "bmp", "ico"]; return validFormats.includes(format); } /** * Parse srcset attribute into structured data */ function parseSrcSet(srcset) { const images = []; if (!srcset) { return images; } const parts = srcset.split(",").map((s) => s.trim()); for (const part of parts) { const match = part.match(/^(.+?)\s+(\d+(?:\.\d+)?[wx])$/); if (match) { const [, url, descriptor] = match; const width = descriptor.endsWith("w") ? Number.parseInt(descriptor.slice(0, -1), 10) : Number.parseInt(descriptor.slice(0, -1), 10); images.push({ url: url.trim(), width, descriptor, }); } } return images.sort((a, b) => a.width - b.width); } /** * Extract enhanced image metadata */ function extractImageMetadata($, element) { const $img = $(element); const src = $img.attr("src") || ""; const srcset = $img.attr("srcset"); const _sizes = $img.attr("sizes"); const alt = $img.attr("alt"); const title = $img.attr("title"); const loading = $img.attr("loading"); const width = $img.attr("width"); const height = $img.attr("height"); const metadata = { url: src, type: detectImageFormat(src), alt, caption: title, isLazyLoaded: loading === "lazy", isResponsive: !!srcset, }; if (width) { metadata.width = width; } if (height) { metadata.height = height; } // Parse srcset if available if (srcset) { metadata.srcset = parseSrcSet(srcset); } // Calculate aspect ratio if dimensions are available if (width && height) { const w = Number.parseInt(String(width), 10); const h = Number.parseInt(String(height), 10); if (w > 0 && h > 0) { metadata.aspectRatio = w / h; } } // Check for WebP/AVIF support in picture element const $picture = $img.closest("picture"); if ($picture.length > 0) { const sources = $picture.find("source"); sources.each((_, source) => { const type = $(source).attr("type"); if (type?.includes("webp") || type?.includes("avif")) { metadata.type = type.includes("webp") ? "webp" : "avif"; const srcset = $(source).attr("srcset"); if (srcset) { const parsed = parseSrcSet(srcset); if (parsed.length > 0) { metadata.url = parsed[0].url; if (parsed.length > 1) { metadata.srcset = parsed; } } } } }); } return metadata; } /** * Extract all images with enhanced metadata */ function extractAllImages($) { const images = []; $("img").each((_, element) => { const metadata = extractImageMetadata($, element); if ((0, utils_1.isUrlValid)(metadata.url)) { images.push(metadata); } }); // Also extract images from meta tags $('meta[property="og:image"], meta[name="twitter:image"]').each((_, element) => { const content = $(element).attr("content"); if (content && (0, utils_1.isUrlValid)(content)) { const existing = images.find((img) => img.url === content); if (!existing) { images.push({ url: content, type: detectImageFormat(content), }); } } }); return images; } /** * Extract video metadata */ function extractVideoMetadata($, url) { const metadata = {}; // Try to get from og:video tags const ogVideo = $('meta[property="og:video"]').attr("content") || $('meta[property="og:video:url"]').attr("content"); if (ogVideo) { metadata.url = ogVideo; } else if (url) { metadata.url = url; } // Get secure URL const ogVideoSecure = $('meta[property="og:video:secure_url"]').attr("content"); if (ogVideoSecure) { metadata.secureUrl = ogVideoSecure; } // Get dimensions const width = $('meta[property="og:video:width"]').attr("content"); const height = $('meta[property="og:video:height"]').attr("content"); if (width) { metadata.width = width; } if (height) { metadata.height = height; } // Get type const type = $('meta[property="og:video:type"]').attr("content"); if (type) { metadata.type = type; } // Get duration (Twitter) const duration = $('meta[name="twitter:player:stream:content_type"]').attr("content"); if (duration) { const match = duration.match(/duration=(\d+)/); if (match) { metadata.duration = Number.parseInt(match[1], 10); } } // Extract thumbnails const thumbnails = []; $('meta[property="og:image"]').each((_, element) => { const url = $(element).attr("content"); if (url) { thumbnails.push({ url, format: detectImageFormat(url), }); } }); if (thumbnails.length > 0) { metadata.thumbnails = thumbnails; } // Get embed URL const embedUrl = $('meta[property="og:video:embed_url"]').attr("content") || $('meta[name="twitter:player"]').attr("content"); if (embedUrl) { metadata.embedUrl = embedUrl; } // Look for video elements in the page const $video = $("video").first(); if ($video.length > 0) { if (!metadata.url) { metadata.url = $video.attr("src") || $video.find("source").first().attr("src") || ""; } const poster = $video.attr("poster"); if (poster && (!metadata.thumbnails || metadata.thumbnails.length === 0)) { metadata.thumbnails = [ { url: poster, format: detectImageFormat(poster), }, ]; } // Try to get duration from video element const videoDuration = $video.attr("data-duration"); if (videoDuration && !metadata.duration) { metadata.duration = Number.parseFloat(videoDuration); } } // Look for captions/subtitles const captions = []; $("track").each((_, element) => { const $track = $(element); const src = $track.attr("src"); const srclang = $track.attr("srclang"); const kind = $track.attr("kind"); if (src && srclang) { captions.push({ language: srclang, url: src, kind: kind || "subtitles", }); } }); if (captions.length > 0) { metadata.captions = captions; } if (!metadata.url) { return null; } return metadata; } /** * Find the best image from available options */ function selectBestImage(images) { if (images.length === 0) { return null; } if (images.length === 1) { return images[0]; } // Prefer images with: // 1. Modern formats (WebP, AVIF) // 2. Proper dimensions (1200x630 for social media) // 3. Alt text // 4. Higher resolution const scored = images.map((img) => { let score = 0; // Modern format bonus if (img.type === "webp") { score += 10; } if (img.type === "avif") { score += 15; } // Dimension scoring if (img.width && img.height) { const width = Number.parseInt(String(img.width), 10); const height = Number.parseInt(String(img.height), 10); // Ideal social media dimensions if (width === 1200 && height === 630) { score += 20; } else if (width >= 1200 && height >= 630) { score += 15; } else if (width >= 600 && height >= 315) { score += 10; } // Aspect ratio close to 1.91:1 (Facebook's preferred) const aspectRatio = width / height; if (Math.abs(aspectRatio - 1.91) < 0.1) { score += 10; } } // Alt text bonus if (img.alt) { score += 5; } // Responsive image bonus if (img.isResponsive) { score += 5; } if (img.srcset && img.srcset.length > 0) { score += 5; } return { img, score }; }); scored.sort((a, b) => b.score - a.score); return scored[0].img; } /** * Extract audio metadata */ function extractAudioMetadata($) { const metadata = {}; // Check for og:audio tags const ogAudio = $('meta[property="og:audio"]').attr("content") || $('meta[property="og:audio:url"]').attr("content"); if (ogAudio) { metadata.url = ogAudio; } const ogAudioSecure = $('meta[property="og:audio:secure_url"]').attr("content"); if (ogAudioSecure) { metadata.secureUrl = ogAudioSecure; } const ogAudioType = $('meta[property="og:audio:type"]').attr("content"); if (ogAudioType) { metadata.type = ogAudioType; } // Check for audio elements const $audio = $("audio").first(); if ($audio.length > 0) { if (!metadata.url) { metadata.url = $audio.attr("src") || $audio.find("source").first().attr("src"); } const duration = $audio.attr("data-duration"); if (duration) { metadata.duration = Number.parseFloat(duration); } } return Object.keys(metadata).length > 0 ? metadata : null; }