// @devmehq/open-graph-extractor
// Version:
// Fast, lightweight Open Graph, Twitter Card, and structured data extractor for Node.js with caching and validation
// 527 lines (526 loc) • 18.4 kB
// JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.mediaSetup = mediaSetup;
exports.detectImageFormat = detectImageFormat;
exports.parseSrcSet = parseSrcSet;
exports.extractImageMetadata = extractImageMetadata;
exports.extractAllImages = extractAllImages;
exports.extractVideoMetadata = extractVideoMetadata;
exports.selectBestImage = selectBestImage;
exports.extractAudioMetadata = extractAudioMetadata;
const fields_1 = require("./fields");
const utils_1 = require("./utils");
/**
 * Turns one zipped row `[url, width, height, alt]` into a Twitter image entry.
 * The url is passed through untouched; any falsy width, height or alt is
 * reported as `null`.
 */
const mediaMapperTwitterImage = (item) => {
  const { 0: url, 1: width, 2: height, 3: alt } = item;
  return {
    url,
    width: width || null,
    height: height || null,
    alt: alt || null,
  };
};
/**
 * Turns one zipped row `[url, width, height, stream]` into a Twitter player
 * entry. The url is passed through untouched; any falsy width, height or
 * stream is reported as `null`.
 */
const mediaMapperTwitterPlayer = (item) => {
  const { 0: url, 1: width, 2: height, 3: stream } = item;
  return {
    url,
    width: width || null,
    height: height || null,
    stream: stream || null,
  };
};
/**
 * Turns one zipped row `[url, track, disc]` into a music song entry.
 * Unlike the other mappers, every missing (falsy) value becomes an empty
 * string rather than `null`.
 */
const mediaMapperMusicSong = (item) => {
  const { 0: url, 1: track, 2: disc } = item;
  return {
    url: url || "",
    track: track || "",
    disc: disc || "",
  };
};
/**
 * Turns one zipped row `[url, width, height, type]` into a generic media
 * entry (used for both og:image and og:video). The url is passed through
 * untouched; any falsy width, height or type is reported as `null`.
 */
const mediaMapper = (item) => {
  const { 0: url, 1: width, 2: height, 3: type } = item;
  return {
    url,
    width: width || null,
    height: height || null,
    type: type || null,
  };
};
/**
 * Sort comparator for media entries.
 *
 * - Entries are left in place (0) when either one has no url.
 * - An entry whose url ends in `.gif` sorts ahead of one that does not.
 * - Otherwise the entry with the larger longest side (max of width/height,
 *   missing dimensions counting as 0) sorts first.
 */
const mediaSorter = (a, b) => {
  if (!a.url || !b.url) {
    return 0;
  }
  // Lower-cased trailing extension (2-5 word characters), or null when absent.
  const extensionOf = (url) => {
    const found = url.match(/\.(\w{2,5})$/);
    return found?.[1].toLowerCase() || null;
  };
  const longestSide = ({ width, height }) => {
    const parsedWidth = width ? Number.parseInt(width, 10) : 0;
    const parsedHeight = height ? Number.parseInt(height, 10) : 0;
    return Math.max(parsedWidth, parsedHeight);
  };
  const aIsGif = extensionOf(a.url) === "gif";
  const bIsGif = extensionOf(b.url) === "gif";
  if (aIsGif !== bIsGif) {
    return aIsGif ? -1 : 1;
  }
  const aSide = longestSide(a);
  const bSide = longestSide(b);
  return bSide - aSide;
};
/**
 * Sort comparator for music song entries: ascending by disc number, then by
 * track number. Entries are left in place (0) when either has no track.
 * A missing disc counts as disc 0; an unparseable disc compares as neither
 * smaller nor larger, so ordering falls through to the track numbers.
 */
const mediaSorterMusicSong = (a, b) => {
  const bothHaveTracks = Boolean(a.track) && Boolean(b.track);
  if (!bothHaveTracks) {
    return 0;
  }
  const toInt = (text) => Number.parseInt(text, 10);
  const discA = toInt(a.disc || "0");
  const discB = toInt(b.disc || "0");
  if (discA < discB) {
    return -1;
  }
  if (discA > discB) {
    return 1;
  }
  return toInt(a.track) - toInt(b.track);
};
/**
 * Minimal stand-in for lodash `zip`.
 *
 * Produces one row per element of `array`; row `i` holds `array[i]` followed
 * by element `i` of every additional array. The result length always follows
 * the first array. An `undefined` first array yields an empty list.
 */
const zip = (array, ...args) =>
  array === undefined
    ? []
    : array.map((head, index) => {
        const row = [head];
        for (const other of args) {
          row.push(other[index]);
        }
        return row;
      });
/**
 * Collapses the flat, per-property media arrays on an Open Graph result into
 * structured media entries.
 *
 * For each media family (og:image, og:video, twitter:image, twitter:player,
 * music:song) the parallel arrays (url / width / height / ...) are zipped
 * into entry objects and sorted. Every field listed in `fields_1.fields` that
 * is flagged `multiple` and whose `fieldName` matches
 * `(ogImage|ogVideo|twitter|musicSong).*` is then deleted from `ogObject`,
 * and the structured entries are stored under the family's main property.
 *
 * @param {object} ogObject - Extracted Open Graph data; mutated in place.
 * @param {object} [options] - User options. When `options.allMedia` is truthy
 *   each main property receives the full sorted list; otherwise only the
 *   first (best-sorted) entry.
 * @returns {object} The same `ogObject` instance.
 */
function mediaSetup(ogObject, options) {
  const hasAny = (keys) => keys.some((key) => ogObject[key]);
  // When at least one property of a family is present, the missing ones are
  // padded with `[null]` so the zip below still yields a row.
  const fillMissing = (keys) => {
    for (const key of keys) {
      if (!ogObject[key]) {
        ogObject[key] = [null];
      }
    }
  };
  const valuesOf = (keys) => keys.map((key) => ogObject[key]);

  // og:image — the guard now checks ogImageHeight (it previously tested
  // twitterImageHeight by mistake, so a lone og:image:height was ignored and
  // a lone twitter:image:height fabricated an empty ogImage entry).
  const ogImageKeys = ["ogImage", "ogImageWidth", "ogImageHeight", "ogImageType"];
  if (hasAny(ogImageKeys)) {
    fillMissing(ogImageKeys);
  }
  const ogImages = zip(...valuesOf(ogImageKeys))
    .map(mediaMapper)
    .sort(mediaSorter);

  // og:video
  const ogVideoKeys = ["ogVideo", "ogVideoWidth", "ogVideoHeight", "ogVideoType"];
  if (hasAny(ogVideoKeys)) {
    fillMissing(ogVideoKeys);
  }
  const ogVideos = zip(...valuesOf(ogVideoKeys))
    .map(mediaMapper)
    .sort(mediaSorter);

  // twitter:image — twitterImage falls back to twitterImageSrc when absent.
  const twitterImageKeys = ["twitterImage", "twitterImageWidth", "twitterImageHeight", "twitterImageAlt"];
  if (hasAny(["twitterImageSrc", ...twitterImageKeys])) {
    fillMissing(["twitterImageSrc"]);
    if (!ogObject.twitterImage) {
      ogObject.twitterImage = ogObject.twitterImageSrc;
    }
    fillMissing(twitterImageKeys);
  }
  const twitterImages = zip(...valuesOf(twitterImageKeys))
    .map(mediaMapperTwitterImage)
    .sort(mediaSorter);

  // twitter:player
  const twitterPlayerKeys = ["twitterPlayer", "twitterPlayerWidth", "twitterPlayerHeight", "twitterPlayerStream"];
  if (hasAny(twitterPlayerKeys)) {
    fillMissing(twitterPlayerKeys);
  }
  const twitterPlayers = zip(...valuesOf(twitterPlayerKeys))
    .map(mediaMapperTwitterPlayer)
    .sort(mediaSorter);

  // music:song
  const musicSongKeys = ["musicSong", "musicSongTrack", "musicSongDisc"];
  if (hasAny(musicSongKeys)) {
    fillMissing(musicSongKeys);
  }
  const musicSongs = zip(...valuesOf(musicSongKeys))
    .map(mediaMapperMusicSong)
    .sort(mediaSorterMusicSong);

  // Remove the flat values: everything now lives under the main property.
  fields_1.fields
    .filter((item) => item.multiple && item.fieldName && item.fieldName.match("(ogImage|ogVideo|twitter|musicSong).*"))
    .forEach((item) => {
      delete ogObject[item.fieldName];
    });

  const formatted = [
    ["ogImage", ogImages],
    ["ogVideo", ogVideos],
    ["twitterImage", twitterImages],
    ["twitterPlayer", twitterPlayers],
    ["musicSong", musicSongs],
  ];
  for (const [key, entries] of formatted) {
    if (entries.length) {
      ogObject[key] = options?.allMedia ? entries : entries[0];
    }
  }
  return ogObject;
}
/** Image formats this module recognizes, in URL-detection priority order. */
const SUPPORTED_IMAGE_FORMATS = ["jpeg", "jpg", "png", "gif", "webp", "avif", "svg", "bmp", "ico"];
/** MIME subtypes that name a supported format differently. */
const IMAGE_SUBTYPE_ALIASES = new Map([
  ["x-icon", "ico"],
  ["vnd.microsoft.icon", "ico"],
]);
/**
 * One pattern per supported extension. The extension must not be followed by
 * another letter or digit, so ".pngtree" or ".icons" are not mistaken for
 * ".png" / ".ico", while ".jpg?w=100" and ".jpg#frag" still match.
 */
const IMAGE_EXTENSION_PATTERNS = SUPPORTED_IMAGE_FORMATS.map((ext) => [ext, new RegExp(`\\.${ext}(?![a-z0-9])`)]);
/**
 * Detect image format from a content type or, failing that, from the URL.
 *
 * The content type wins when its subtype is a supported format. Parameters
 * ("; charset=...") and structured-syntax suffixes ("+xml") are ignored, so
 * "image/svg+xml" resolves to "svg"; the ICO MIME types resolve to "ico".
 * Otherwise the lower-cased URL is searched for a supported extension, in
 * the priority order of the supported-format list.
 *
 * @param {string} url - Image URL; a non-string value yields `undefined`.
 * @param {string} [contentType] - Optional MIME type, e.g. "image/png".
 * @returns {string|undefined} The format name, or `undefined` if unknown.
 */
function detectImageFormat(url, contentType) {
  if (contentType) {
    const [mediaType] = String(contentType).split(";");
    const subtype = mediaType.split("/")[1]?.trim().toLowerCase();
    const baseSubtype = subtype?.split("+")[0];
    const format = IMAGE_SUBTYPE_ALIASES.get(baseSubtype) ?? baseSubtype;
    if (isValidImageFormat(format)) {
      return format;
    }
  }
  // Try to detect from URL
  if (typeof url !== "string") {
    return undefined;
  }
  const urlLower = url.toLowerCase();
  const hit = IMAGE_EXTENSION_PATTERNS.find(([, pattern]) => pattern.test(urlLower));
  return hit?.[0];
}
/**
 * Check if format is a valid image format
 *
 * @param {string} [format] - Candidate format name (lower case expected).
 * @returns {boolean} True when `format` is one of the supported formats.
 */
function isValidImageFormat(format) {
  if (!format) {
    return false;
  }
  return SUPPORTED_IMAGE_FORMATS.includes(format);
}
/**
 * Parse srcset attribute into structured data
 *
 * Each comma-separated candidate of the form "<url> <N>w" or "<url> <N>x"
 * becomes `{ url, width, descriptor }`. For width descriptors (`w`) `width`
 * is the integer pixel width; for density descriptors (`x`) it is the
 * density as a float, so "1.5x" yields 1.5 rather than being truncated to 1.
 * Candidates without a descriptor are skipped. The result is sorted by
 * `width` ascending.
 *
 * Limitation: candidates are split on every comma, so URLs that themselves
 * contain commas are not handled.
 *
 * @param {string} [srcset] - Raw srcset attribute value.
 * @returns {Array<{url: string, width: number, descriptor: string}>}
 */
function parseSrcSet(srcset) {
  const images = [];
  if (!srcset) {
    return images;
  }
  for (const candidate of srcset.split(",")) {
    const match = candidate.trim().match(/^(.+?)\s+(\d+(?:\.\d+)?[wx])$/);
    if (!match) {
      continue;
    }
    const [, url, descriptor] = match;
    const magnitude = descriptor.slice(0, -1);
    const width = descriptor.endsWith("w") ? Number.parseInt(magnitude, 10) : Number.parseFloat(magnitude);
    images.push({
      url: url.trim(),
      width,
      descriptor,
    });
  }
  return images.sort((a, b) => a.width - b.width);
}
/**
 * Extract enhanced image metadata
 *
 * Reads src, srcset, alt, title, loading, width and height from an `<img>`
 * element. When the image sits inside a `<picture>`, a WebP/AVIF `<source>`
 * overrides both `type` and `url` together; a source from which no URL can
 * be taken leaves the metadata untouched, so `type` always describes `url`.
 * When several such sources exist, the last one wins.
 *
 * @param {Function} $ - Cheerio-style selector function for the document.
 * @param {*} element - The `<img>` element to inspect.
 * @returns {object} Metadata with `url`, `type`, `alt`, `caption`,
 *   `isLazyLoaded`, `isResponsive` and, when available, `width`, `height`,
 *   `srcset` and `aspectRatio`.
 */
function extractImageMetadata($, element) {
  const $img = $(element);
  const src = $img.attr("src") || "";
  const srcset = $img.attr("srcset");
  const width = $img.attr("width");
  const height = $img.attr("height");
  const metadata = {
    url: src,
    type: detectImageFormat(src),
    alt: $img.attr("alt"),
    caption: $img.attr("title"),
    isLazyLoaded: $img.attr("loading") === "lazy",
    isResponsive: !!srcset,
  };
  if (width) {
    metadata.width = width;
  }
  if (height) {
    metadata.height = height;
  }
  // Parse srcset if available
  if (srcset) {
    metadata.srcset = parseSrcSet(srcset);
  }
  // Calculate aspect ratio if dimensions are available
  if (width && height) {
    const w = Number.parseInt(String(width), 10);
    const h = Number.parseInt(String(height), 10);
    if (w > 0 && h > 0) {
      metadata.aspectRatio = w / h;
    }
  }
  // Check for WebP/AVIF support in picture element
  const $picture = $img.closest("picture");
  if ($picture.length > 0) {
    $picture.find("source").each((_, source) => {
      const $source = $(source);
      const type = $source.attr("type");
      let modernFormat;
      if (type?.includes("webp")) {
        modernFormat = "webp";
      }
      else if (type?.includes("avif")) {
        modernFormat = "avif";
      }
      const sourceSrcset = $source.attr("srcset");
      if (!modernFormat || !sourceSrcset) {
        return;
      }
      const parsed = parseSrcSet(sourceSrcset);
      let sourceUrl;
      if (parsed.length > 0) {
        sourceUrl = parsed[0].url;
      }
      else {
        // parseSrcSet skips candidates without a descriptor (the common
        // `srcset="image.webp"` form); fall back to the first URL token,
        // minus any trailing candidate separator.
        const [firstToken = ""] = sourceSrcset.trim().split(/\s+/);
        sourceUrl = firstToken.replace(/,+$/, "");
      }
      if (!sourceUrl) {
        return;
      }
      metadata.type = modernFormat;
      metadata.url = sourceUrl;
      if (parsed.length > 1) {
        metadata.srcset = parsed;
      }
    });
  }
  return metadata;
}
/**
 * Extract all images with enhanced metadata
 *
 * Gathers every `<img>` whose resolved url passes `isUrlValid`, then appends
 * the `og:image` / `twitter:image` meta URLs that are valid and not already
 * present in the list (compared by exact url).
 *
 * @param {Function} $ - Cheerio-style selector function for the document.
 * @returns {Array<object>} Image metadata entries in discovery order.
 */
function extractAllImages($) {
  const collected = [];
  $("img").each((_, node) => {
    const info = extractImageMetadata($, node);
    if (!(0, utils_1.isUrlValid)(info.url)) {
      return;
    }
    collected.push(info);
  });
  // Also extract images from meta tags
  const metaSelector = 'meta[property="og:image"], meta[name="twitter:image"]';
  $(metaSelector).each((_, node) => {
    const href = $(node).attr("content");
    if (!href || !(0, utils_1.isUrlValid)(href)) {
      return;
    }
    const alreadyListed = collected.some((entry) => entry.url === href);
    if (alreadyListed) {
      return;
    }
    collected.push({
      url: href,
      type: detectImageFormat(href),
    });
  });
  return collected;
}
/**
 * Extract video metadata
 *
 * Combines `og:video*` meta tags, Twitter player tags, the first `<video>`
 * element and all `<track>` elements into one metadata object.
 *
 * @param {Function} $ - Cheerio-style selector function for the document.
 * @param {string} [url] - Fallback video URL used when no og:video tag exists.
 * @returns {object|null} The collected metadata, or `null` when no video URL
 *   could be determined from the tags, the fallback or a `<video>` element.
 */
function extractVideoMetadata($, url) {
  const metaContent = (selector) => $(selector).attr("content");
  const metadata = {};
  // Try to get from og:video tags
  const ogVideo = metaContent('meta[property="og:video"]') || metaContent('meta[property="og:video:url"]');
  if (ogVideo) {
    metadata.url = ogVideo;
  }
  else if (url) {
    metadata.url = url;
  }
  // Get secure URL
  const ogVideoSecure = metaContent('meta[property="og:video:secure_url"]');
  if (ogVideoSecure) {
    metadata.secureUrl = ogVideoSecure;
  }
  // Get dimensions
  const width = metaContent('meta[property="og:video:width"]');
  const height = metaContent('meta[property="og:video:height"]');
  if (width) {
    metadata.width = width;
  }
  if (height) {
    metadata.height = height;
  }
  // Get type
  const type = metaContent('meta[property="og:video:type"]');
  if (type) {
    metadata.type = type;
  }
  // Get duration (Twitter): looks for a "duration=<digits>" fragment inside
  // the stream content-type value.
  // NOTE(review): confirm that publishers actually encode duration there.
  const streamContentType = metaContent('meta[name="twitter:player:stream:content_type"]');
  const durationMatch = streamContentType ? streamContentType.match(/duration=(\d+)/) : null;
  if (durationMatch) {
    metadata.duration = Number.parseInt(durationMatch[1], 10);
  }
  // Extract thumbnails
  const thumbnails = [];
  $('meta[property="og:image"]').each((_, element) => {
    const imageUrl = $(element).attr("content");
    if (imageUrl) {
      thumbnails.push({
        url: imageUrl,
        format: detectImageFormat(imageUrl),
      });
    }
  });
  if (thumbnails.length > 0) {
    metadata.thumbnails = thumbnails;
  }
  // Get embed URL
  const embedUrl = metaContent('meta[property="og:video:embed_url"]') || metaContent('meta[name="twitter:player"]');
  if (embedUrl) {
    metadata.embedUrl = embedUrl;
  }
  // Look for video elements in the page
  const $video = $("video").first();
  if ($video.length > 0) {
    if (!metadata.url) {
      metadata.url = $video.attr("src") || $video.find("source").first().attr("src") || "";
    }
    const poster = $video.attr("poster");
    if (poster && (!metadata.thumbnails || metadata.thumbnails.length === 0)) {
      metadata.thumbnails = [
        {
          url: poster,
          format: detectImageFormat(poster),
        },
      ];
    }
    // Try to get duration from video element; a non-numeric data-duration is
    // ignored instead of being stored as NaN.
    if (!metadata.duration) {
      const elementDuration = Number.parseFloat($video.attr("data-duration"));
      if (!Number.isNaN(elementDuration)) {
        metadata.duration = elementDuration;
      }
    }
  }
  // Look for captions/subtitles
  const captions = [];
  $("track").each((_, element) => {
    const $track = $(element);
    const src = $track.attr("src");
    const srclang = $track.attr("srclang");
    if (src && srclang) {
      captions.push({
        language: srclang,
        url: src,
        kind: $track.attr("kind") || "subtitles",
      });
    }
  });
  if (captions.length > 0) {
    metadata.captions = captions;
  }
  if (!metadata.url) {
    return null;
  }
  return metadata;
}
/**
 * Find the best image from available options
 *
 * Every image gets a score and the highest score wins; on a tie the image
 * that appears first in the input is kept. Points are awarded for:
 *   - modern format: webp +10, avif +15
 *   - size: exactly 1200x630 +20, at least 1200x630 +15, at least 600x315 +10
 *   - aspect ratio within 0.1 of 1.91:1: +10
 *   - alt text +5, `isResponsive` +5, non-empty `srcset` +5
 *
 * @param {Array<object>} images - Candidate image metadata entries.
 * @returns {object|null} The winning entry, or `null` for an empty list.
 */
function selectBestImage(images) {
  if (images.length === 0) {
    return null;
  }
  if (images.length === 1) {
    return images[0];
  }
  const formatBonus = (type) => {
    switch (type) {
      case "webp":
        return 10;
      case "avif":
        return 15;
      default:
        return 0;
    }
  };
  // Size tier plus aspect-ratio bonus; only applies when both sides are set.
  const dimensionBonus = (img) => {
    if (!(img.width && img.height)) {
      return 0;
    }
    const w = Number.parseInt(String(img.width), 10);
    const h = Number.parseInt(String(img.height), 10);
    let bonus = 0;
    if (w === 1200 && h === 630) {
      bonus = 20;
    }
    else if (w >= 1200 && h >= 630) {
      bonus = 15;
    }
    else if (w >= 600 && h >= 315) {
      bonus = 10;
    }
    // Close to Facebook's preferred 1.91:1 ratio.
    if (Math.abs(w / h - 1.91) < 0.1) {
      bonus += 10;
    }
    return bonus;
  };
  const scoreOf = (img) => {
    let total = formatBonus(img.type) + dimensionBonus(img);
    total += img.alt ? 5 : 0;
    total += img.isResponsive ? 5 : 0;
    total += img.srcset && img.srcset.length > 0 ? 5 : 0;
    return total;
  };
  // Strict ">" keeps the earliest image among equal scores.
  const winner = images.reduce((leader, img) => {
    const score = scoreOf(img);
    return leader === null || score > leader.score ? { img, score } : leader;
  }, null);
  return winner.img;
}
/**
 * Extract audio metadata
 *
 * Reads `og:audio*` meta tags and the first `<audio>` element. Only values
 * that were actually found are stored: an `<audio>` element without any
 * source no longer yields `{ url: undefined }`, and a non-numeric
 * `data-duration` is ignored instead of being stored as NaN.
 *
 * @param {Function} $ - Cheerio-style selector function for the document.
 * @returns {object|null} Metadata with any of `url`, `secureUrl`, `type`,
 *   `duration`; `null` when nothing was found.
 */
function extractAudioMetadata($) {
  const metaContent = (selector) => $(selector).attr("content");
  const metadata = {};
  // Check for og:audio tags
  const ogAudio = metaContent('meta[property="og:audio"]') || metaContent('meta[property="og:audio:url"]');
  if (ogAudio) {
    metadata.url = ogAudio;
  }
  const ogAudioSecure = metaContent('meta[property="og:audio:secure_url"]');
  if (ogAudioSecure) {
    metadata.secureUrl = ogAudioSecure;
  }
  const ogAudioType = metaContent('meta[property="og:audio:type"]');
  if (ogAudioType) {
    metadata.type = ogAudioType;
  }
  // Check for audio elements
  const $audio = $("audio").first();
  if ($audio.length > 0) {
    if (!metadata.url) {
      const elementSrc = $audio.attr("src") || $audio.find("source").first().attr("src");
      if (elementSrc) {
        metadata.url = elementSrc;
      }
    }
    const duration = Number.parseFloat($audio.attr("data-duration"));
    if (!Number.isNaN(duration)) {
      metadata.duration = duration;
    }
  }
  return Object.keys(metadata).length > 0 ? metadata : null;
}