@devmehq/open-graph-extractor
Version:
Fast, lightweight Open Graph, Twitter Card, and structured data extractor for Node.js with caching and validation
476 lines (475 loc) • 18.4 kB
JavaScript
;
/**
 * TypeScript interop helper: exposes property `k` of `m` on `o` under the
 * name `k2` (defaults to `k`).
 *
 * When `Object.create` exists the property is installed with
 * `Object.defineProperty`. The source descriptor is reused as-is only when it
 * is an accessor on an `__esModule` object, or a data property that is neither
 * writable nor configurable; in every other case (including a missing
 * property) an enumerable getter reading `m[k]` is installed so later changes
 * on `m` stay visible through `o`. Without `Object.create` the current value
 * is simply copied.
 *
 * A pre-existing `this.__createBinding` takes precedence over this definition.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        const exportedAs = alias === undefined ? key : alias;
        let descriptor = Object.getOwnPropertyDescriptor(source, key);
        let needsLiveGetter;
        if (!descriptor) {
            needsLiveGetter = true;
        }
        else if ("get" in descriptor) {
            needsLiveGetter = !source.__esModule;
        }
        else {
            needsLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsLiveGetter) {
            descriptor = {
                enumerable: true,
                get: function () {
                    return source[key];
                },
            };
        }
        Object.defineProperty(target, exportedAs, descriptor);
    }
    : function (target, source, key, alias) {
        const exportedAs = alias === undefined ? key : alias;
        target[exportedAs] = source[key];
    });
/**
 * TypeScript interop helper: attaches `v` to `o` as its `default` property.
 *
 * With `Object.create` available the property is defined as enumerable (and,
 * by `defineProperty` defaults, non-writable and non-configurable); otherwise
 * it is a plain assignment. A pre-existing `this.__setModuleDefault` takes
 * precedence over this definition.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (namespace, value) {
        const descriptor = { enumerable: true, value: value };
        Object.defineProperty(namespace, "default", descriptor);
    }
    : function (namespace, value) {
        namespace["default"] = value;
    });
/**
 * TypeScript interop helper behind `import * as x from "..."`.
 *
 * A module already flagged `__esModule` is returned untouched. Anything else
 * is wrapped in a fresh namespace object: every own property name except
 * "default" is bound through `__createBinding`, and the original value itself
 * is attached as `default` through `__setModuleDefault`.
 *
 * A pre-existing `this.__importStar` takes precedence over this definition.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Self-replacing function: the first call picks the key-listing strategy
    // (native getOwnPropertyNames, or a for-in + hasOwnProperty fallback) and
    // every later call goes straight to the chosen implementation.
    let listOwnKeys = function (subject) {
        listOwnKeys = Object.getOwnPropertyNames || function (obj) {
            const names = [];
            for (const name in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, name)) {
                    names[names.length] = name;
                }
            }
            return names;
        };
        return listOwnKeys(subject);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        const namespace = {};
        if (mod != null) {
            const keys = listOwnKeys(mod);
            for (let idx = 0; idx < keys.length; idx++) {
                const key = keys[idx];
                if (key !== "default") {
                    __createBinding(namespace, mod, key);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
/**
 * TypeScript interop helper behind `export * from "..."`.
 *
 * Walks the enumerable properties of `m` and binds each one onto `exports`
 * via `__createBinding`, skipping "default" and any name `exports` already
 * owns (so an existing export is never overwritten).
 *
 * A pre-existing `this.__exportStar` takes precedence over this definition.
 */
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    for (const name in m) {
        if (name === "default") {
            continue;
        }
        if (Object.prototype.hasOwnProperty.call(exports, name)) {
            continue;
        }
        __createBinding(exports, m, name);
    }
};
// Flag this CommonJS module as transpiled from an ES module; the interop
// helpers above check this flag on modules they receive.
Object.defineProperty(exports, "__esModule", { value: true });
// Public entry points. Both are function declarations further down the file,
// so they are hoisted and already defined at this point.
exports.extractOpenGraph = extractOpenGraph;
exports.extractOpenGraphAsync = extractOpenGraphAsync;
// Dependencies used by the extraction code in this file.
const cheerio = __importStar(require("cheerio"));
const cache_1 = require("./cache");
const fallback_1 = require("./fallback");
const fields_1 = require("./fields");
const media_1 = require("./media");
const security_1 = require("./security");
const structured_data_1 = require("./structured-data");
const utils_1 = require("./utils");
const validation_1 = require("./validation");
// Re-export the public surface of the sibling modules. With __exportStar as
// defined above, a name already present on `exports` is skipped, so on a name
// collision the earlier line in this list wins — keep the order stable.
__exportStar(require("./bulk"), exports);
__exportStar(require("./cache"), exports);
__exportStar(require("./media"), exports);
__exportStar(require("./security"), exports);
__exportStar(require("./structured-data"), exports);
// Re-export all types and utilities
__exportStar(require("./types"), exports);
__exportStar(require("./validation"), exports);
// Global cache instance: starts as null and is created on demand by
// extractOpenGraphAsyncImpl the first time it is called with
// options.cache.enabled set; after that the same instance is reused.
let globalCache = null;
/**
 * Extract Open Graph data from HTML (synchronous path, kept for backward
 * compatibility).
 *
 * Processing order:
 *   1. Parse the markup with cheerio.
 *   2. Match every `<meta>` tag against the known field list plus
 *      `options.customMetaTags`, comparing keys case-insensitively.
 *   3. Default `ogImage` from `ogImageSecureURL`, then `ogImageURL`.
 *   4. Normalise media values via `mediaSetup`.
 *   5. Unless `options.onlyGetOpenGraphInfo` is set, run `fallback`.
 *   6. Strip undefined values via `removeNestedUndefinedValues`.
 *
 * @param input - HTML string or Buffer; passed straight to `cheerio.load`
 * @param options - optional extraction options; this function reads
 *   `customMetaTags` and `onlyGetOpenGraphInfo` and forwards the whole object
 *   to `mediaSetup` and `fallback`
 * @returns the extracted field object
 */
function extractOpenGraph(input, options) {
    // Legacy extraction for backward compatibility
    let ogObject = {};
    const $ = cheerio.load(input);
    const metaFields = fields_1.fields.concat(options?.customMetaTags ?? []);
    // find all the open graph info in the meta tags
    $("meta").each((_index, meta) => {
        const attribs = meta.attribs;
        if (!attribs) {
            return;
        }
        // Resolve the key BEFORE deciding to skip the tag. The previous guard
        // returned unless `property` or `name` was set, which made the
        // `itemprop` alternatives unreachable and silently dropped tags such
        // as <meta itemprop="..." content="...">.
        const property = attribs.property || attribs.name || attribs.itemprop || attribs.itemProp;
        if (!property) {
            return;
        }
        // `value` is accepted as an alternative to `content`.
        const content = attribs.content || attribs.value;
        const wantedKey = property.toLowerCase();
        metaFields.forEach((item) => {
            if (wantedKey !== item.property.toLowerCase()) {
                return;
            }
            if (!item.multiple) {
                // Single-valued field: the last matching tag wins.
                ogObject[item.fieldName] = content;
            }
            else if (!ogObject[item.fieldName]) {
                ogObject[item.fieldName] = [content];
            }
            else if (Array.isArray(ogObject[item.fieldName])) {
                ogObject[item.fieldName].push(content);
            }
        });
    });
    // set ogImage to ogImageSecureURL/ogImageURL if there is no ogImage
    if (!ogObject.ogImage && ogObject.ogImageSecureURL) {
        ogObject.ogImage = ogObject.ogImageSecureURL;
    }
    else if (!ogObject.ogImage && ogObject.ogImageURL) {
        ogObject.ogImage = ogObject.ogImageURL;
    }
    // formats the multiple media values
    ogObject = (0, media_1.mediaSetup)(ogObject, options);
    // if onlyGetOpenGraphInfo isn't set, run the open graph fallbacks
    if (!options?.onlyGetOpenGraphInfo) {
        ogObject = (0, fallback_1.fallback)(ogObject, options, $);
    }
    // removes any undef
    ogObject = (0, utils_1.removeNestedUndefinedValues)(ogObject);
    return ogObject;
}
/**
 * Public async entry point: extracts Open Graph data with all optional
 * features by delegating to the internal implementation.
 *
 * @param input - HTML string, Buffer, or URL (URL fetching not implemented)
 * @param options - extraction options, forwarded unchanged
 * @returns whatever the internal implementation resolves to
 */
async function extractOpenGraphAsync(input, options) {
    const extraction = await extractOpenGraphAsyncImpl(input, options);
    return extraction;
}
/**
 * Async extraction implementation (internal).
 *
 * Runs the full pipeline over an HTML document: meta-tag matching, extra
 * document metadata, optional structured data, media extraction, fallbacks,
 * optional article content / language detection, sanitisation, optional
 * validation and scoring, and finally confidence and timing metrics.
 *
 * Anything thrown inside the pipeline is caught and reported in the returned
 * object instead of rejecting: the result then has `data: {}`,
 * `confidence: 0`, `confidenceLevel: "low"` and one `EXTRACTION_ERROR` entry
 * with severity "critical". This includes the two errors raised for URL
 * input (blocked URL, and "URL fetching not implemented").
 *
 * The cache is a module-level singleton: it is created on the first call that
 * has `options.cache.enabled` set, using that call's cache options, and later
 * calls reuse it regardless of their own cache options.
 *
 * @param input - HTML string or Buffer. A string starting with "http://" or
 *   "https://" is treated as a URL: it is validated / normalised / looked up
 *   in the cache according to `options`, and a cache miss then fails because
 *   fetching is not implemented.
 * @param options - optional extraction options
 * @returns an object with `data`, `structuredData`, `errors`, `warnings`,
 *   `confidence`, `confidenceLevel`, `fallbacksUsed`, `metrics`, plus
 *   `validation` and `socialScore` when those were generated; or a cached
 *   value as returned by the cache
 */
async function extractOpenGraphAsyncImpl(input, options) {
    const startTime = Date.now();
    const errors = [];
    const warnings = [];
    // One array shared by the result and by `metrics`, so that the error path
    // reports the same fallbacks in both places (previously `metrics` kept an
    // empty list there even when fallbacks had already been recorded).
    const fallbacksUsed = [];
    const metrics = {
        metaTagsFound: 0,
        structuredDataFound: 0,
        imagesFound: 0,
        videosFound: 0,
        fallbacksUsed,
    };
    let structuredData = { jsonLD: [], schemaOrg: {}, microdata: {}, rdfa: {}, dublinCore: {} };
    try {
        // Initialize cache if needed
        if (options?.cache?.enabled && !globalCache) {
            globalCache = (0, cache_1.createCache)({
                enabled: options.cache.enabled,
                ttl: options.cache.ttl || 3600,
                storage: options.cache.storage || "memory",
                maxSize: options.cache.maxSize || 1000,
                keyGenerator: options.cache.keyGenerator,
            });
        }
        let url;
        // Check if input is URL or HTML
        if (typeof input === "string" && (input.startsWith("http://") || input.startsWith("https://"))) {
            url = input;
            // Validate URL if security is enabled
            if (options?.security?.validateUrls && !(0, security_1.validateUrl)(url, options.security)) {
                throw new Error(`Invalid or blocked URL: ${url}`);
            }
            // Normalize URL
            if (options?.normalizeUrls) {
                url = (0, security_1.normalizeUrl)(url);
            }
            // Check cache
            if (globalCache && options?.cache?.enabled) {
                const cached = await globalCache.get(url);
                if (cached) {
                    return cached;
                }
            }
            // Fetch HTML - in production, you would use axios or another HTTP client
            // For this library, the user should provide HTML directly or use their own fetching
            throw new Error("URL fetching not implemented. Please fetch HTML and pass it directly.");
        }
        // From here on `url` is always undefined, because the URL branch above
        // either returns a cached value or throws.
        let html = typeof input === "string" ? input : input.toString("utf8");
        metrics.htmlSize = Buffer.byteLength(html, "utf8");
        // Sanitize HTML if security is enabled
        if (options?.security?.sanitizeHtml) {
            html = (0, security_1.sanitizeHtml)(html, options.security);
        }
        const htmlParseStart = Date.now();
        const $ = cheerio.load(html);
        const htmlParseTime = Date.now() - htmlParseStart;
        // Extract basic Open Graph data
        const metaExtractionStart = Date.now();
        let ogObject = {};
        const metaFields = fields_1.fields.concat(options?.customMetaTags ?? []);
        // Extract meta tags
        $("meta").each((_index, meta) => {
            const attribs = meta.attribs;
            if (!attribs) {
                return;
            }
            // Resolve the key BEFORE deciding to skip the tag. The previous
            // guard returned unless `property` or `name` was set, which made
            // the `itemprop` alternatives unreachable.
            const property = attribs.property || attribs.name || attribs.itemprop || attribs.itemProp;
            if (!property) {
                return;
            }
            // `value` is accepted as an alternative to `content`.
            const content = attribs.content || attribs.value;
            // Counts every keyed meta tag, whether or not it matches a field.
            metrics.metaTagsFound++;
            const wantedKey = property.toLowerCase();
            metaFields.forEach((item) => {
                if (wantedKey !== item.property.toLowerCase()) {
                    return;
                }
                if (!item.multiple) {
                    // Single-valued field: the last matching tag wins.
                    ogObject[item.fieldName] = content;
                }
                else if (!ogObject[item.fieldName]) {
                    ogObject[item.fieldName] = [content];
                }
                else if (Array.isArray(ogObject[item.fieldName])) {
                    ogObject[item.fieldName].push(content);
                }
            });
        });
        // Extract additional metadata
        ogObject.charset =
            $("meta[charset]").attr("charset") ||
                $('meta[http-equiv="Content-Type"]')
                    .attr("content")
                    ?.match(/charset=([^;]+)/)?.[1];
        ogObject.viewport = $('meta[name="viewport"]').attr("content");
        ogObject.robots = $('meta[name="robots"]').attr("content");
        ogObject.generator = $('meta[name="generator"]').attr("content");
        ogObject.themeColor = $('meta[name="theme-color"]').attr("content");
        ogObject.applicationName = $('meta[name="application-name"]').attr("content");
        ogObject.canonical = $('link[rel="canonical"]').attr("href");
        ogObject.ampUrl = $('link[rel="amphtml"]').attr("href");
        ogObject.manifest = $('link[rel="manifest"]').attr("href");
        ogObject.maskIcon = $('link[rel="mask-icon"]').attr("href");
        ogObject.appleTouchIcon = $('link[rel="apple-touch-icon"]').attr("href");
        const metaExtractionTime = Date.now() - metaExtractionStart;
        // Set URL if available
        // NOTE(review): cannot trigger today (see the note on `url` above);
        // kept for when URL fetching is implemented.
        if (url && !ogObject.ogUrl) {
            ogObject.ogUrl = url;
        }
        // Extract structured data if enabled
        const structuredDataStart = Date.now();
        if (options?.extractStructuredData) {
            structuredData = (0, structured_data_1.extractStructuredData)($);
            metrics.structuredDataFound = structuredData.jsonLD?.length || 0;
            ogObject = (0, structured_data_1.mergeStructuredDataWithOG)(ogObject, structuredData);
        }
        const structuredDataExtractionTime = Date.now() - structuredDataStart;
        // Extract enhanced media
        if (!options?.onlyGetOpenGraphInfo) {
            // Enhanced image extraction
            const images = (0, media_1.extractAllImages)($);
            metrics.imagesFound = images.length;
            if (images.length > 0 && !ogObject.ogImage) {
                const bestImage = (0, media_1.selectBestImage)(images);
                if (bestImage) {
                    ogObject.ogImage = bestImage;
                    fallbacksUsed.push("image-extraction");
                }
            }
            // Enhanced video extraction
            const video = (0, media_1.extractVideoMetadata)($, ogObject.ogVideo);
            if (video) {
                ogObject.ogVideo = video;
                metrics.videosFound = 1;
            }
            // Enhanced audio extraction
            const audio = (0, media_1.extractAudioMetadata)($);
            if (audio && !ogObject.ogAudio) {
                ogObject.ogAudio = audio.url;
                if (audio.secureUrl) {
                    ogObject.ogAudioSecureURL = audio.secureUrl;
                }
                if (audio.type) {
                    ogObject.ogAudioType = audio.type;
                }
            }
        }
        // Format media
        ogObject = (0, media_1.mediaSetup)(ogObject, options);
        // Run fallbacks if not disabled
        if (!options?.onlyGetOpenGraphInfo) {
            // Shallow snapshot, used only to see which keys became truthy.
            const beforeFallback = { ...ogObject };
            ogObject = (0, fallback_1.fallback)(ogObject, options, $);
            // Track which fallbacks were used
            for (const key of Object.keys(ogObject)) {
                if (!beforeFallback[key] && ogObject[key]) {
                    fallbacksUsed.push(`fallback-${key}`);
                }
            }
        }
        // Extract article content if enabled
        if (options?.extractArticleContent) {
            const articleContent = extractArticleContent($);
            if (articleContent) {
                ogObject.articleContent = articleContent.content;
                ogObject.readingTime = articleContent.readingTime;
                ogObject.wordCount = articleContent.wordCount;
            }
        }
        // Detect language if enabled
        if (options?.detectLanguage) {
            const lang = $("html").attr("lang") || $('meta[http-equiv="content-language"]').attr("content");
            if (lang) {
                ogObject.language = lang;
                // Only tags beginning with "ar" or "he" are reported as right-to-left.
                ogObject.textDirection = lang.startsWith("ar") || lang.startsWith("he") ? "rtl" : "ltr";
            }
        }
        // Clean up undefined values
        ogObject = (0, utils_1.removeNestedUndefinedValues)(ogObject);
        // Sanitize data if security is enabled
        if (options?.security) {
            ogObject = (0, security_1.sanitizeExtractedData)(ogObject, options.security);
        }
        // Validate data if enabled
        const validationStart = Date.now();
        let validationResult = null;
        let socialScore = null;
        if (options?.validateData) {
            validationResult = (0, validation_1.validateOpenGraph)(ogObject);
            const twitterValidation = (0, validation_1.validateTwitterCard)(ogObject);
            // Add validation errors and warnings (Open Graph first, then Twitter)
            errors.push(...validationResult.errors);
            warnings.push(...validationResult.warnings);
            errors.push(...twitterValidation.errors);
            warnings.push(...twitterValidation.warnings);
        }
        if (options?.generateScore) {
            socialScore = (0, validation_1.generateSocialScore)(ogObject);
        }
        const validationTime = Date.now() - validationStart;
        // Calculate confidence score
        const confidence = calculateConfidence(ogObject, errors, warnings);
        const confidenceLevel = getConfidenceLevel(confidence);
        // Complete metrics (fallbacksUsed is already wired in above)
        metrics.performance = {
            htmlParseTime,
            metaExtractionTime,
            structuredDataExtractionTime,
            validationTime,
            totalTime: Date.now() - startTime,
        };
        const result = {
            data: ogObject,
            structuredData,
            errors,
            warnings,
            confidence,
            confidenceLevel,
            fallbacksUsed,
            metrics: metrics,
        };
        // Add validation and scoring results if generated
        if (validationResult) {
            result.validation = validationResult;
        }
        if (socialScore) {
            result.socialScore = socialScore;
        }
        // Cache the result if caching is enabled
        // NOTE(review): requires `url`, which is never set on this path, so
        // results of HTML input are currently not written to the cache.
        if (globalCache && options?.cache?.enabled && url) {
            await globalCache.set(url, result);
        }
        return result;
    }
    catch (error) {
        const errorMessage = error instanceof Error ? error.message : String(error);
        errors.push({
            code: "EXTRACTION_ERROR",
            message: errorMessage,
            severity: "critical",
            timestamp: new Date(),
        });
        return {
            data: {},
            structuredData,
            errors,
            warnings,
            confidence: 0,
            confidenceLevel: "low",
            fallbacksUsed,
            metrics: {
                ...metrics,
                // Stage timings are not available on failure; only the total is real.
                performance: {
                    htmlParseTime: 0,
                    metaExtractionTime: 0,
                    structuredDataExtractionTime: 0,
                    validationTime: 0,
                    totalTime: Date.now() - startTime,
                },
            },
        };
    }
}
/**
 * Score, from 0 to 100, how complete and clean the extracted data is.
 *
 * Starts at 100, subtracts a fixed penalty for each missing core field,
 * 10 per "critical" error, 5 per "error"-severity error and 2 per warning,
 * then adds small bonuses for extra metadata. The total is clamped to
 * the range [0, 100].
 *
 * @param data - extracted field object
 * @param errors - array of error entries, each with a `severity`
 * @param warnings - array of warnings (only its length is used)
 * @returns the clamped score
 */
function calculateConfidence(data, errors, warnings) {
    // [field, points lost when the field is missing/falsy]
    const missingFieldPenalties = [
        ["ogTitle", 20],
        ["ogType", 15],
        ["ogImage", 15],
        ["ogUrl", 10],
        ["ogDescription", 10],
    ];
    // [field, points gained when the field is present/truthy]
    const presentFieldBonuses = [
        ["twitterCard", 5],
        ["articlePublishedTime", 3],
        ["canonical", 3],
        ["favicon", 2],
    ];
    let total = 100;
    for (const [field, penalty] of missingFieldPenalties) {
        if (!data[field]) {
            total -= penalty;
        }
    }
    const criticalCount = errors.filter((entry) => entry.severity === "critical").length;
    const errorCount = errors.filter((entry) => entry.severity === "error").length;
    total -= criticalCount * 10 + errorCount * 5;
    total -= warnings.length * 2;
    for (const [field, bonus] of presentFieldBonuses) {
        if (data[field]) {
            total += bonus;
        }
    }
    if (total > 100) {
        total = 100;
    }
    return Math.max(0, total);
}
/**
 * Map a numeric confidence score to its label.
 *
 * @param score - confidence score
 * @returns "high" for 80 and above, "medium" for 50 and above, otherwise "low"
 */
function getConfidenceLevel(score) {
    return score >= 80 ? "high" : score >= 50 ? "medium" : "low";
}
/**
 * Extract the main article text from a parsed document.
 *
 * Candidate containers are tried in priority order and the first one whose
 * trimmed text is non-empty is used. If none yields text, the longest `<p>`
 * is used instead.
 *
 * @param $ - cheerio-style query function for the parsed document
 * @returns `null` when no text is found; otherwise an object with
 *   `content` (at most 5000 UTF-16 code units), `wordCount` (whitespace
 *   separated words of the full, untruncated text) and `readingTime`
 *   (minutes at 200 words per minute, rounded up)
 */
function extractArticleContent($) {
    const MAX_CONTENT_LENGTH = 5000;
    const WORDS_PER_MINUTE = 200;
    // Try to find main content area
    const selectors = [
        "article",
        '[role="main"]',
        "main",
        ".post-content",
        ".entry-content",
        ".article-content",
        ".content",
        "#content",
    ];
    let content = "";
    for (const selector of selectors) {
        const element = $(selector).first();
        if (element.length === 0) {
            continue;
        }
        const text = element.text().trim();
        // Keep looking when the container exists but is empty; stopping here
        // would let an empty <article> hide a populated <main>.
        if (text) {
            content = text;
            break;
        }
    }
    if (!content) {
        // Fallback to largest text block
        let maxLength = 0;
        $("p").each((_, element) => {
            const text = $(element).text().trim();
            if (text.length > maxLength) {
                maxLength = text.length;
                content = text;
            }
        });
    }
    if (!content) {
        return null;
    }
    // Calculate reading time (average 200 words per minute)
    const wordCount = content.split(/\s+/).filter((word) => word.length > 0).length;
    const readingTime = Math.ceil(wordCount / WORDS_PER_MINUTE);
    // Limit content length without cutting a surrogate pair in half: if the
    // last kept unit is a high surrogate whose partner would be dropped,
    // drop it as well so the result never ends in a lone surrogate.
    let cutAt = Math.min(content.length, MAX_CONTENT_LENGTH);
    if (cutAt < content.length) {
        const lastUnit = content.charCodeAt(cutAt - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            cutAt -= 1;
        }
    }
    return {
        content: content.substring(0, cutAt),
        readingTime,
        wordCount,
    };
}