@devmehq/open-graph-extractor
Version:
Fast, lightweight Open Graph, Twitter Card, and structured data extractor for Node.js with caching and validation
586 lines (521 loc) • 17.7 kB
text/typescript
import type { CheerioAPI } from "cheerio";
import * as cheerio from "cheerio";
import { type CacheManager, createCache } from "./cache";
import { fallback, type IFallbackOgObject } from "./fallback";
import { fields } from "./fields";
import {
extractAllImages,
extractAudioMetadata,
extractVideoMetadata,
type IOgObjectMedia,
mediaSetup,
selectBestImage,
} from "./media";
import { normalizeUrl, sanitizeExtractedData, sanitizeHtml, validateUrl } from "./security";
import { extractStructuredData, mergeStructuredDataWithOG } from "./structured-data";
import type {
CacheStorage,
ConfidenceLevel,
IExtractOpenGraphOptions as IAsyncExtractOptions,
IOGResult as IAsyncOGResult,
IError,
IExtractionResult,
IMetrics,
IPerformanceMetrics,
ISocialScore,
IValidationResult,
IWarning,
} from "./types";
import { removeNestedUndefinedValues } from "./utils";
import { generateSocialScore, validateOpenGraph, validateTwitterCard } from "./validation";
export * from "./bulk";
export * from "./cache";
export * from "./media";
export * from "./security";
export * from "./structured-data";
// Re-export all types and utilities
export * from "./types";
export * from "./validation";
// Module-level cache instance shared by all async extractions in this module.
// It starts as null and is created lazily by extractOpenGraphAsyncImpl the first time a call
// passes `cache.enabled`; once created it is reused as-is, so the ttl/storage/maxSize/keyGenerator
// of later calls are not applied to it.
let globalCache: CacheManager | null = null;
/**
 * Result object returned by the synchronous `extractOpenGraph`.
 *
 * Extends `IOgObjectMedia` (declared in ./media) with the optional string fields below and an
 * open index signature. Every field is optional: a field is only present when the extraction
 * produced a value for it.
 */
export interface IOGResult extends IOgObjectMedia {
ogLocale?: string;
favicon?: string;
ogDate?: string;
// al* fields — presumably App Links values; the tag-to-field mapping lives in ./fields (not shown here).
alAndroidAppName?: string;
alAndroidPackage?: string;
alAndroidUrl?: string;
alIosAppName?: string;
alIosAppStoreId?: string;
alIosUrl?: string;
alWebShouldFallback?: string;
// twitter* fields — presumably Twitter Card values; mapping is defined in ./fields (not shown here).
twitterCard?: string;
twitterSite?: string;
twitterTitle?: string;
twitterDescription?: string;
twitterAppNameiPhone?: string;
twitterAppIdiPhone?: string;
twitterAppUrliPhone?: string;
twitterAppNameiPad?: string;
twitterAppIdiPad?: string;
twitterAppUrliPad?: string;
twitterAppNameGooglePlay?: string;
twitterAppIdGooglePlay?: string;
twitterAppUrlGooglePlay?: string;
// extractOpenGraph copies one of these two into ogImage when ogImage itself is missing
// (ogImageSecureURL is tried first, then ogImageURL).
ogImageSecureURL?: string;
ogImageURL?: string;
ogSiteName?: string;
charset?: string;
// NOTE(review): nothing in this file assigns error/errorDetails on the sync path — confirm who sets them.
error?: string;
errorDetails?: string;
// Open-ended on purpose: extractOpenGraph stores each matched meta value under the `fieldName`
// of its field definition, which includes caller-supplied customMetaTags.
[key: string]: unknown;
}
/**
 * Options accepted by the synchronous `extractOpenGraph`.
 *
 * In this file `extractOpenGraph` itself reads only `customMetaTags` and `onlyGetOpenGraphInfo`;
 * the whole options object is additionally forwarded to `mediaSetup` and `fallback`, which may
 * read other fields. The async entry point uses a separate options type imported from ./types.
 */
export interface IExtractOpenGraphOptions {
// Extra meta definitions appended to the built-in `fields` list. `property` is compared
// case-insensitively with the attribute that identifies each <meta> tag; the tag's content is
// stored under `fieldName` — as a single value, or collected into an array when `multiple` is true.
customMetaTags?: Array<{
multiple: boolean;
property: string;
fieldName: string;
}>;
// Not read directly in this file; forwarded to mediaSetup/fallback — see ./media and ./fallback.
allMedia?: boolean;
// When truthy, extractOpenGraph skips the fallback() pass and returns only what the meta tags provided.
onlyGetOpenGraphInfo?: boolean;
// Not read directly in this file; forwarded to mediaSetup/fallback — see ./media and ./fallback.
ogImageFallback?: boolean;
// Enhanced features
// NOTE(review): none of the fields from here down are read by extractOpenGraph in this file.
// They appear to mirror the async options type from ./types — confirm they are kept in sync.
cache?: {
enabled: boolean;
ttl?: number;
storage?: string;
maxSize?: number;
keyGenerator?: (url: string) => string;
};
security?: {
sanitizeHtml?: boolean;
detectPII?: boolean;
maskPII?: boolean;
validateUrls?: boolean;
maxRedirects?: number;
timeout?: number;
allowedDomains?: string[];
blockedDomains?: string[];
};
extractStructuredData?: boolean;
validateData?: boolean;
generateScore?: boolean;
extractArticleContent?: boolean;
detectLanguage?: boolean;
normalizeUrls?: boolean;
returnEnhanced?: boolean; // Flag to return enhanced result format (not consulted anywhere in this file)
}
/**
 * Extract Open Graph data from HTML (synchronous, backward-compatible entry point).
 *
 * Every `<meta>` tag is matched against the built-in `fields` list plus any
 * `options.customMetaTags`. The tag is identified by the first present attribute among
 * `property`, `name`, `itemprop` and `itemProp`; its value is taken from `content`, falling
 * back to `value`. Matching is case-insensitive.
 *
 * @param input - HTML string or Buffer
 * @param options - extraction options; `onlyGetOpenGraphInfo` disables the fallback pass
 * @returns the extracted fields, with undefined values stripped
 */
export function extractOpenGraph(input: string | Buffer, options?: IExtractOpenGraphOptions): IOGResult {
  let ogObject: IOGResult = {} as IOGResult;
  const $ = cheerio.load(input);
  const metaFields = fields.concat(options?.customMetaTags ?? []);

  // Collect every recognised meta tag into ogObject.
  $("meta").each((_index, meta) => {
    const attribs = meta.attribs;
    if (!attribs) return;

    // Resolve the identifying attribute BEFORE guarding, so tags that only carry
    // `itemprop` are considered too (the previous guard made that fallback unreachable).
    const property = attribs.property || attribs.name || attribs.itemprop || attribs.itemProp;
    if (!property) return;

    const content = attribs.content || attribs.value;
    const wanted = property.toLowerCase();

    metaFields.forEach((item) => {
      if (wanted !== item.property.toLowerCase()) return;

      if (!item.multiple) {
        // Single-valued field: the last matching tag wins.
        ogObject[item.fieldName] = content;
      } else if (!ogObject[item.fieldName]) {
        ogObject[item.fieldName] = [content];
      } else if (Array.isArray(ogObject[item.fieldName])) {
        (ogObject[item.fieldName] as unknown[]).push(content);
      }
    });
  });

  // Use ogImageSecureURL, then ogImageURL, as ogImage when no ogImage was found.
  if (!ogObject.ogImage && ogObject.ogImageSecureURL) {
    ogObject.ogImage = ogObject.ogImageSecureURL;
  } else if (!ogObject.ogImage && ogObject.ogImageURL) {
    ogObject.ogImage = ogObject.ogImageURL;
  }

  // Format the multi-valued media fields.
  ogObject = mediaSetup(ogObject, options);

  // Unless the caller asked for Open Graph data only, run the fallbacks.
  if (!options?.onlyGetOpenGraphInfo) {
    ogObject = fallback(ogObject, options, $) as IOGResult;
  }

  // Strip undefined values before returning.
  ogObject = removeNestedUndefinedValues(ogObject);
  return ogObject;
}
/**
 * Extract Open Graph data with all features (async version).
 *
 * Thin public wrapper: forwards both arguments unchanged to extractOpenGraphAsyncImpl and
 * returns its promise.
 *
 * @param input - HTML string or Buffer. A string starting with "http://" or "https://" is
 *   treated as a URL; URL fetching is not implemented, so unless a cached result is found the
 *   returned result has empty `data` and an EXTRACTION_ERROR entry in `errors`.
 * @param options - extraction options
 * @returns the extraction result; failures inside the implementation are reported through
 *   `errors` on the result rather than by rejecting
 */
export async function extractOpenGraphAsync(
input: string | Buffer,
options?: IAsyncExtractOptions,
): Promise<IExtractionResult> {
return extractOpenGraphAsyncImpl(input, options);
}
/**
 * Primary language subtags (besides the "ar…" / "he…" prefixes handled inline) whose default
 * script is written right-to-left. "iw" is the legacy code for Hebrew.
 */
const RTL_LANGUAGE_SUBTAGS: ReadonlySet<string> = new Set(["iw", "fa", "ur", "ps", "sd", "yi", "dv", "ug", "ckb"]);

/**
 * Async extraction implementation (internal).
 *
 * Pipeline: optional cache initialisation → input classification (URL vs HTML) → optional HTML
 * sanitisation → meta tag extraction → optional structured data → media extraction →
 * media formatting → fallbacks → optional article content / language detection → cleanup and
 * sanitisation → optional validation / scoring → confidence and metrics.
 *
 * Any exception thrown along the way is caught and reported as a single critical
 * EXTRACTION_ERROR in the returned result (empty `data`, confidence 0); the promise does not
 * reject for those failures.
 *
 * NOTE(review): the URL branch always ends in `return` (cache hit) or `throw`, so `url` is
 * never defined once HTML processing starts. The `ogUrl` default and the cache write further
 * down are therefore currently unreachable; they are kept for when URL fetching is implemented.
 *
 * @param input - HTML string or Buffer; an http(s) URL string is recognised but not fetched
 * @param options - extraction options
 * @returns the extraction result, including errors, warnings, confidence and metrics
 */
async function extractOpenGraphAsyncImpl(
  input: string | Buffer,
  options?: IAsyncExtractOptions,
): Promise<IExtractionResult> {
  const startTime = Date.now();
  const metrics: Partial<IMetrics> = {
    metaTagsFound: 0,
    structuredDataFound: 0,
    imagesFound: 0,
    videosFound: 0,
    fallbacksUsed: [],
  };
  const errors: IError[] = [];
  const warnings: IWarning[] = [];
  const fallbacksUsed: string[] = [];
  let structuredData = { jsonLD: [], schemaOrg: {}, microdata: {}, rdfa: {}, dublinCore: {} };

  try {
    // Create the shared cache on first use; later calls reuse the existing instance.
    if (options?.cache?.enabled && !globalCache) {
      globalCache = createCache({
        enabled: options.cache.enabled,
        ttl: options.cache.ttl || 3600,
        storage: (options.cache.storage as CacheStorage) || "memory",
        maxSize: options.cache.maxSize || 1000,
        keyGenerator: options.cache.keyGenerator,
      });
    }

    let html: string;
    let url: string | undefined;

    // A string that starts with an http(s) scheme is treated as a URL rather than HTML.
    if (typeof input === "string" && (input.startsWith("http://") || input.startsWith("https://"))) {
      url = input;

      // Validate URL if security is enabled
      if (options?.security?.validateUrls && !validateUrl(url, options.security)) {
        throw new Error(`Invalid or blocked URL: ${url}`);
      }

      // Normalize URL
      if (options?.normalizeUrls) {
        url = normalizeUrl(url);
      }

      // Check cache
      if (globalCache && options?.cache?.enabled) {
        const cached = await globalCache.get(url);
        if (cached) {
          return cached as IExtractionResult;
        }
      }

      // Fetching is intentionally left to the caller; this throw is turned into an
      // EXTRACTION_ERROR result by the catch block below.
      throw new Error("URL fetching not implemented. Please fetch HTML and pass it directly.");
    }

    html = typeof input === "string" ? input : input.toString("utf8");
    // Size is measured before sanitisation, i.e. on the HTML as supplied.
    metrics.htmlSize = Buffer.byteLength(html, "utf8");

    // Sanitize HTML if security is enabled
    if (options?.security?.sanitizeHtml) {
      html = sanitizeHtml(html, options.security);
    }

    const htmlParseStart = Date.now();
    const $ = cheerio.load(html);
    const htmlParseTime = Date.now() - htmlParseStart;

    // Extract basic Open Graph data
    const metaExtractionStart = Date.now();
    let ogObject: IAsyncOGResult = {} as IAsyncOGResult;
    const metaFields = fields.concat(options?.customMetaTags ?? []);

    // Extract meta tags
    $("meta").each((_index, meta) => {
      const attribs = meta.attribs;
      if (!attribs) return;

      // Resolve the identifying attribute BEFORE guarding, so tags that only carry
      // `itemprop` are considered too (the previous guard made that fallback unreachable).
      const property = attribs.property || attribs.name || attribs.itemprop || attribs.itemProp;
      if (!property) return;

      const content = attribs.content || attribs.value;
      metrics.metaTagsFound = (metrics.metaTagsFound ?? 0) + 1;

      const wanted = property.toLowerCase();
      metaFields.forEach((item) => {
        if (wanted !== item.property.toLowerCase()) return;

        if (!item.multiple) {
          // Single-valued field: the last matching tag wins.
          ogObject[item.fieldName] = content;
        } else if (!ogObject[item.fieldName]) {
          ogObject[item.fieldName] = [content];
        } else if (Array.isArray(ogObject[item.fieldName])) {
          (ogObject[item.fieldName] as unknown[]).push(content);
        }
      });
    });

    // Extract additional metadata
    ogObject.charset =
      $("meta[charset]").attr("charset") ||
      $('meta[http-equiv="Content-Type"]')
        .attr("content")
        ?.match(/charset=([^;]+)/)?.[1];
    ogObject.viewport = $('meta[name="viewport"]').attr("content");
    ogObject.robots = $('meta[name="robots"]').attr("content");
    ogObject.generator = $('meta[name="generator"]').attr("content");
    ogObject.themeColor = $('meta[name="theme-color"]').attr("content");
    ogObject.applicationName = $('meta[name="application-name"]').attr("content");
    ogObject.canonical = $('link[rel="canonical"]').attr("href");
    ogObject.ampUrl = $('link[rel="amphtml"]').attr("href");
    ogObject.manifest = $('link[rel="manifest"]').attr("href");
    ogObject.maskIcon = $('link[rel="mask-icon"]').attr("href");
    ogObject.appleTouchIcon = $('link[rel="apple-touch-icon"]').attr("href");
    const metaExtractionTime = Date.now() - metaExtractionStart;

    // Set URL if available (see the NOTE in the function header: currently never the case).
    if (url && !ogObject.ogUrl) {
      ogObject.ogUrl = url;
    }

    // Extract structured data if enabled
    const structuredDataStart = Date.now();
    if (options?.extractStructuredData) {
      structuredData = extractStructuredData($);
      metrics.structuredDataFound = structuredData.jsonLD?.length || 0;
      ogObject = mergeStructuredDataWithOG(ogObject, structuredData);
    }
    const structuredDataExtractionTime = Date.now() - structuredDataStart;

    // Extract enhanced media
    if (!options?.onlyGetOpenGraphInfo) {
      // Enhanced image extraction: only used to fill ogImage when the meta tags gave none.
      const images = extractAllImages($);
      metrics.imagesFound = images.length;
      if (images.length > 0 && !ogObject.ogImage) {
        const bestImage = selectBestImage(images);
        if (bestImage) {
          ogObject.ogImage = bestImage;
          fallbacksUsed.push("image-extraction");
        }
      }

      // Enhanced video extraction
      const video = extractVideoMetadata($, ogObject.ogVideo as string);
      if (video) {
        ogObject.ogVideo = video;
        metrics.videosFound = 1;
      }

      // Enhanced audio extraction: never overrides an ogAudio taken from the meta tags.
      const audio = extractAudioMetadata($);
      if (audio && !ogObject.ogAudio) {
        ogObject.ogAudio = audio.url;
        if (audio.secureUrl) ogObject.ogAudioSecureURL = audio.secureUrl;
        if (audio.type) ogObject.ogAudioType = audio.type;
      }
    }

    // Format media
    ogObject = mediaSetup(ogObject as IOgObjectMedia, options) as IAsyncOGResult;

    // Run fallbacks if not disabled
    if (!options?.onlyGetOpenGraphInfo) {
      const beforeFallback = { ...ogObject };
      ogObject = fallback(ogObject as IFallbackOgObject, options, $) as IAsyncOGResult;

      // A key counts as a fallback when it was falsy before the pass and truthy after it.
      for (const key of Object.keys(ogObject)) {
        if (!beforeFallback[key] && ogObject[key]) {
          fallbacksUsed.push(`fallback-${key}`);
        }
      }
    }

    // Extract article content if enabled
    if (options?.extractArticleContent) {
      const articleContent = extractArticleContent($);
      if (articleContent) {
        ogObject.articleContent = articleContent.content;
        ogObject.readingTime = articleContent.readingTime;
        ogObject.wordCount = articleContent.wordCount;
      }
    }

    // Detect language if enabled
    if (options?.detectLanguage) {
      const lang = $("html").attr("lang") || $('meta[http-equiv="content-language"]').attr("content");
      if (lang) {
        ogObject.language = lang;

        // Language tags are case-insensitive, so compare in lower case. The "ar"/"he"
        // prefix checks are kept as before; the subtag set adds further RTL languages.
        const normalizedLang = lang.trim().toLowerCase();
        const primarySubtag = normalizedLang.split(/[-_]/)[0] ?? "";
        const isRtl =
          normalizedLang.startsWith("ar") ||
          normalizedLang.startsWith("he") ||
          RTL_LANGUAGE_SUBTAGS.has(primarySubtag);
        ogObject.textDirection = isRtl ? "rtl" : "ltr";
      }
    }

    // Clean up undefined values
    ogObject = removeNestedUndefinedValues(ogObject);

    // Sanitize data if security is enabled
    if (options?.security) {
      ogObject = sanitizeExtractedData(ogObject, options.security);
    }

    // Validate data if enabled
    const validationStart = Date.now();
    let validationResult: unknown = null;
    let socialScore: unknown = null;
    if (options?.validateData) {
      validationResult = validateOpenGraph(ogObject);
      const twitterValidation = validateTwitterCard(ogObject);

      // Open Graph findings are reported first, then Twitter Card findings.
      const validation = validationResult as IValidationResult;
      const twitterVal = twitterValidation as IValidationResult;
      errors.push(...validation.errors);
      warnings.push(...validation.warnings);
      errors.push(...twitterVal.errors);
      warnings.push(...twitterVal.warnings);
    }
    if (options?.generateScore) {
      socialScore = generateSocialScore(ogObject);
    }
    const validationTime = Date.now() - validationStart;

    // Calculate confidence score
    const confidence = calculateConfidence(ogObject, errors, warnings);
    const confidenceLevel = getConfidenceLevel(confidence);

    // Complete metrics
    metrics.fallbacksUsed = fallbacksUsed;
    metrics.performance = {
      htmlParseTime,
      metaExtractionTime,
      structuredDataExtractionTime,
      validationTime,
      totalTime: Date.now() - startTime,
    } as IPerformanceMetrics;

    const result: IExtractionResult = {
      data: ogObject,
      structuredData,
      errors,
      warnings,
      confidence,
      confidenceLevel,
      fallbacksUsed,
      metrics: metrics as IMetrics,
    };

    // Add validation and scoring results if generated
    if (validationResult) {
      result.validation = validationResult as IValidationResult;
    }
    if (socialScore) {
      result.socialScore = socialScore as ISocialScore;
    }

    // Cache the result if caching is enabled (requires a URL key; see the header NOTE).
    if (globalCache && options?.cache?.enabled && url) {
      await globalCache.set(url, result);
    }

    return result;
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    errors.push({
      code: "EXTRACTION_ERROR",
      message: errorMessage,
      severity: "critical",
      timestamp: new Date(),
    });

    return {
      data: {} as IAsyncOGResult,
      structuredData,
      errors,
      warnings,
      confidence: 0,
      confidenceLevel: "low",
      fallbacksUsed,
      metrics: {
        ...metrics,
        // Keep metrics.fallbacksUsed in step with the top-level list on the error path too.
        fallbacksUsed,
        performance: {
          htmlParseTime: 0,
          metaExtractionTime: 0,
          structuredDataExtractionTime: 0,
          validationTime: 0,
          totalTime: Date.now() - startTime,
        },
      } as IMetrics,
    };
  }
}
/**
 * Score the extraction on a 0–100 scale.
 *
 * Starts from 100, subtracts a fixed penalty for each core field that is missing, 10 per
 * critical error, 5 per ordinary error and 2 per warning, then adds small bonuses for
 * supplementary metadata. The final value is clamped to the 0–100 range.
 *
 * @param data - extracted fields
 * @param errors - errors collected so far; only "critical" and "error" severities count
 * @param warnings - warnings collected so far
 * @returns the clamped confidence score
 */
function calculateConfidence(data: IAsyncOGResult, errors: IError[], warnings: IWarning[]): number {
  // Cost of each core field being absent (falsy).
  const missingFieldPenalties: ReadonlyArray<readonly [string, number]> = [
    ["ogTitle", 20],
    ["ogType", 15],
    ["ogImage", 15],
    ["ogUrl", 10],
    ["ogDescription", 10],
  ];
  // Reward for each supplementary field being present (truthy).
  const presentFieldBonuses: ReadonlyArray<readonly [string, number]> = [
    ["twitterCard", 5],
    ["articlePublishedTime", 3],
    ["canonical", 3],
    ["favicon", 2],
  ];

  let penalty = 0;
  for (const [field, cost] of missingFieldPenalties) {
    if (!data[field]) penalty += cost;
  }
  for (const issue of errors) {
    if (issue.severity === "critical") {
      penalty += 10;
    } else if (issue.severity === "error") {
      penalty += 5;
    }
  }
  penalty += warnings.length * 2;

  let bonus = 0;
  for (const [field, gain] of presentFieldBonuses) {
    if (data[field]) bonus += gain;
  }

  const unclamped = 100 - penalty + bonus;
  return Math.max(0, Math.min(100, unclamped));
}
/**
 * Map a numeric confidence score onto its level.
 *
 * @param score - confidence score
 * @returns "high" from 80 upwards, "medium" from 50 upwards, otherwise "low"
 */
function getConfidenceLevel(score: number): ConfidenceLevel {
  // Ordered from the highest threshold down; the first one the score reaches wins.
  const tiers: ReadonlyArray<readonly [number, ConfidenceLevel]> = [
    [80, "high"],
    [50, "medium"],
  ];
  for (const [minimum, level] of tiers) {
    if (score >= minimum) return level;
  }
  return "low";
}
/**
 * Extract the main article text from a parsed document.
 *
 * Candidate containers are tried in priority order and the first one that actually holds
 * text is used. A container that exists but is empty (or whitespace-only) is skipped, so
 * an empty `<article>` no longer hides a populated `<main>`. When no container yields text,
 * the longest `<p>` is used instead.
 *
 * @param $ - loaded Cheerio document
 * @returns the text (capped at 5000 characters), the word count of the full text and the
 *   reading time in minutes at 200 words per minute, or null when no text was found
 */
function extractArticleContent($: CheerioAPI): { content: string; readingTime: number; wordCount: number } | null {
  // Ordered from most to least specific.
  const selectors = [
    "article",
    '[role="main"]',
    "main",
    ".post-content",
    ".entry-content",
    ".article-content",
    ".content",
    "#content",
  ];

  let content = "";
  for (const selector of selectors) {
    const candidate = $(selector).first();
    if (candidate.length === 0) continue;

    const text = candidate.text().trim();
    if (text) {
      content = text;
      break;
    }
    // Matched but empty: keep looking at lower-priority selectors.
  }

  if (!content) {
    // Fallback to the largest text block among paragraphs.
    let maxLength = 0;
    $("p").each((_index, paragraph) => {
      const text = $(paragraph).text().trim();
      if (text.length > maxLength) {
        maxLength = text.length;
        content = text;
      }
    });
  }

  if (!content) {
    return null;
  }

  // Word count and reading time are computed on the full text, before truncation.
  const words = content.split(/\s+/).filter((word) => word.length > 0);
  const wordCount = words.length;
  const readingTime = Math.ceil(wordCount / 200);

  return {
    content: content.substring(0, 5000), // Limit content length
    readingTime,
    wordCount,
  };
}