@devmehq/open-graph-extractor
Version:
Fast, lightweight Open Graph, Twitter Card, and structured data extractor for Node.js with caching and validation
321 lines (320 loc) • 9.53 kB
JavaScript
;
// TypeScript interop helper: re-exports property `key` of `source` on `target`
// (optionally under the name `alias`). Reuses a helper already present on `this`
// when there is one.
var __createBinding = (this && this.__createBinding) || (function () {
  if (!Object.create) {
    // Legacy engines without Object.create: plain value copy, no live binding.
    return function (target, source, key, alias) {
      target[alias === undefined ? key : alias] = source[key];
    };
  }
  return function (target, source, key, alias) {
    var exportName = alias === undefined ? key : alias;
    var descriptor = Object.getOwnPropertyDescriptor(source, key);
    var needsGetter;
    if (!descriptor) {
      needsGetter = true;
    }
    else if ("get" in descriptor) {
      // Accessor on the source: only reused as-is when the source is an ES module.
      needsGetter = !source.__esModule;
    }
    else {
      // Data property that can still change: forward reads through a getter.
      needsGetter = descriptor.writable || descriptor.configurable;
    }
    if (needsGetter) {
      descriptor = { enumerable: true, get: function () { return source[key]; } };
    }
    Object.defineProperty(target, exportName, descriptor);
  };
})();
// TypeScript interop helper: stores `value` as the `default` export of `namespace`.
// Reuses a helper already present on `this` when there is one.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
  var defineDefault = function (namespace, value) {
    Object.defineProperty(namespace, "default", { enumerable: true, value: value });
  };
  // Fallback for engines without Object.create: ordinary assignment.
  var assignDefault = function (namespace, value) {
    namespace["default"] = value;
  };
  return Object.create ? defineDefault : assignDefault;
})();
// TypeScript interop helper backing `import * as x from "..."`.
// Modules flagged `__esModule` are returned untouched; anything else is wrapped in a
// fresh namespace object that re-exports every own key except "default" and exposes
// the module itself as `default`.
var __importStar = (this && this.__importStar) || (function () {
  var listOwnKeys = function (target) {
    // On first use, pick the implementation once and rebind `listOwnKeys` to it.
    listOwnKeys = Object.getOwnPropertyNames || function (obj) {
      var names = [];
      for (var name in obj) {
        if (Object.prototype.hasOwnProperty.call(obj, name)) {
          names.push(name);
        }
      }
      return names;
    };
    return listOwnKeys(target);
  };
  return function (mod) {
    if (mod && mod.__esModule) {
      return mod;
    }
    var namespace = {};
    if (mod != null) {
      var names = listOwnKeys(mod);
      for (var idx = 0; idx < names.length; idx++) {
        if (names[idx] === "default") {
          continue;
        }
        __createBinding(namespace, mod, names[idx]);
      }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
  };
})();
// Flag this CommonJS module as compiled from ES module syntax; interop helpers such as
// __importStar check `__esModule` and return flagged modules unchanged.
Object.defineProperty(exports, "__esModule", { value: true });
// Public API. The functions are declared further down with `function` declarations,
// which are hoisted, so they can be assigned here before their definitions appear.
exports.sanitizeHtml = sanitizeHtml;
exports.validateUrl = validateUrl;
exports.detectPII = detectPII;
exports.maskPII = maskPII;
exports.sanitizeExtractedData = sanitizeExtractedData;
exports.normalizeUrl = normalizeUrl;
// Cheerio is loaded through __importStar so it is usable as a namespace object
// (`cheerio.load(...)`) whether or not the package is flagged as an ES module.
const cheerio = __importStar(require("cheerio"));
/**
 * Sanitize HTML content using Cheerio (Node.js only, no browser dependencies).
 *
 * When `options.sanitizeHtml` is falsy the input is returned untouched. Otherwise the
 * markup is parsed and the following are stripped before re-serialising:
 *  - script/style/embedding/form-control elements,
 *  - every attribute whose name starts with "on" (inline event handlers),
 *  - URL-bearing attributes that contain a `javascript:` or `vbscript:` scheme,
 *  - `style` attributes containing `javascript:`, `expression(` or `import(`.
 *
 * NOTE(review): `cheerio.load()` is called with its defaults, which per the Cheerio
 * docs parse in document mode, so the returned string is presumably a full document
 * even for fragment input — confirm that callers expect this.
 *
 * @param {string} html - Markup to sanitize.
 * @param {{ sanitizeHtml?: boolean }} [options] - Sanitization is skipped unless `sanitizeHtml` is truthy.
 * @returns {string} The sanitized markup, or `html` unchanged when sanitization is disabled.
 */
function sanitizeHtml(html, options) {
  if (!options?.sanitizeHtml) {
    return html;
  }
  // Load HTML with Cheerio
  const $ = cheerio.load(html);
  // Remove dangerous tags
  const dangerousTags = [
    "script",
    "style",
    "iframe",
    "object",
    "embed",
    "applet",
    "form",
    "input",
    "button",
    "textarea",
    "select",
  ];
  dangerousTags.forEach((tag) => {
    $(tag).remove();
  });
  // Attributes whose value is interpreted as a URL.
  const urlAttrs = ["href", "src", "xlink:href", "action", "formaction", "data", "poster"];
  // Browsers ignore tabs/newlines inside a URL and leading control characters, so
  // "java\tscript:" still executes. Strip them before looking for the scheme.
  const compact = (value) => value.replace(/[\u0000-\u0020]+/g, "").toLowerCase();
  const hasScriptScheme = (value) => {
    const normalized = compact(value);
    return normalized.includes("javascript:") || normalized.includes("vbscript:");
  };
  const hasDangerousCss = (value) => {
    const normalized = compact(value);
    return normalized.includes("javascript:") || normalized.includes("expression(") || normalized.includes("import(");
  };
  $("*").each((_, element) => {
    const $element = $(element);
    // Drop every inline event handler. Matching on the "on" prefix (instead of a fixed
    // list) also covers handlers such as oninput, onwheel, onpointerdown or ontoggle.
    for (const name of Object.keys(element.attribs ?? {})) {
      if (name.toLowerCase().startsWith("on")) {
        $element.removeAttr(name);
      }
    }
    // Remove script-scheme URLs from any URL-bearing attribute.
    for (const name of urlAttrs) {
      const value = $element.attr(name);
      if (typeof value === "string" && hasScriptScheme(value)) {
        $element.removeAttr(name);
      }
    }
    // Remove inline styles that might smuggle script (checked case-insensitively).
    const style = $element.attr("style");
    if (typeof style === "string" && hasDangerousCss(style)) {
      $element.removeAttr("style");
    }
  });
  return $.html();
}
/**
 * Validate URL for security.
 *
 * Returns `true` without inspecting the URL when `options.validateUrls` is falsy.
 * Otherwise the URL must:
 *  - parse and use the http: or https: scheme,
 *  - not match any entry of `options.blockedDomains`,
 *  - match at least one entry of `options.allowedDomains` when that list is non-empty,
 *  - not point at a local/private host (see `isPrivateIP`).
 *
 * A domain entry matches the host itself and its subdomains ("example.com" matches
 * "example.com" and "api.example.com"). Plain substring matching is deliberately not
 * used: it would let "example.com.evil.com" satisfy an allow-list entry "example.com"
 * and make a block-list entry "t.co" reject "microsoft.com".
 *
 * @param {string} url - URL to check.
 * @param {{ validateUrls?: boolean, blockedDomains?: string[], allowedDomains?: string[] }} [options]
 * @returns {boolean} `true` when the URL is acceptable; `false` otherwise, including on any parsing error.
 */
function validateUrl(url, options) {
  if (!options?.validateUrls) {
    return true;
  }
  try {
    const urlObj = new URL(url);
    // Check protocol
    if (urlObj.protocol !== "http:" && urlObj.protocol !== "https:") {
      return false;
    }
    // URL.hostname is already lower-cased; drop a trailing root dot ("example.com.").
    const hostname = urlObj.hostname.replace(/\.$/, "");
    const matchesHost = (entry) => {
      if (typeof entry !== "string") {
        return false;
      }
      // Accept "Example.COM", ".example.com" and "*.example.com" spellings.
      const domain = entry.trim().toLowerCase().replace(/^\*?\./, "").replace(/\.$/, "");
      return domain !== "" && (hostname === domain || hostname.endsWith(`.${domain}`));
    };
    // Check against blocked domains
    if (options.blockedDomains?.some(matchesHost)) {
      return false;
    }
    // Check against allowed domains
    if (options.allowedDomains?.length > 0 && !options.allowedDomains.some(matchesHost)) {
      return false;
    }
    // Check for local/private IPs. URL.hostname wraps IPv6 literals in brackets
    // ("[::1]"); strip them so the address itself is what gets inspected.
    if (isPrivateIP(hostname.replace(/^\[|\]$/g, ""))) {
      return false;
    }
    return true;
  }
  catch {
    // Unparseable URL or malformed options: fail closed.
    return false;
  }
}
/**
 * Check if a hostname refers to a local/private address.
 *
 * Accepts hostnames as produced by `URL.hostname`, including bracketed IPv6 literals
 * such as "[::1]". Recognised as private/local:
 *  - "localhost" and any "*.localhost" name,
 *  - IPv4 0.0.0.0/8, 10.0.0.0/8, 127.0.0.0/8, 169.254.0.0/16, 172.16.0.0/12, 192.168.0.0/16,
 *  - IPv6 "::", "::1", fc00::/7 (unique local) and fe80::/10 (link-local),
 *  - IPv4-mapped IPv6 addresses ("::ffff:a.b.c.d" / "::ffff:hhhh:hhhh") whose embedded
 *    IPv4 address is itself private.
 *
 * Only full dotted-quad literals are treated as IPv4, so a DNS name such as
 * "10.example.com" is not misclassified as private.
 *
 * @param {string} hostname - Host name or IP literal.
 * @returns {boolean} `true` for local/private hosts; `false` otherwise (including non-string input).
 */
function isPrivateIP(hostname) {
  if (typeof hostname !== "string" || hostname === "") {
    return false;
  }
  let host = hostname.toLowerCase();
  // URL.hostname serialises IPv6 literals inside brackets.
  if (host.startsWith("[") && host.endsWith("]")) {
    host = host.slice(1, -1);
  }
  // A trailing dot denotes the DNS root and does not change the target.
  if (host.endsWith(".")) {
    host = host.slice(0, -1);
  }
  // Check for localhost
  if (host === "localhost" || host.endsWith(".localhost")) {
    return true;
  }
  // IPv4 literal
  const v4 = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(host);
  if (v4) {
    const first = Number(v4[1]);
    const second = Number(v4[2]);
    return (first === 0 ||
      first === 10 ||
      first === 127 ||
      (first === 169 && second === 254) ||
      (first === 172 && second >= 16 && second <= 31) ||
      (first === 192 && second === 168));
  }
  // Anything else without a colon is an ordinary DNS name.
  if (!host.includes(":")) {
    return false;
  }
  // IPv6 unspecified / loopback
  if (host === "::" || host === "::1") {
    return true;
  }
  // IPv4-mapped IPv6, dotted form (::ffff:127.0.0.1)
  const mappedDotted = /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/.exec(host);
  if (mappedDotted) {
    return isPrivateIP(mappedDotted[1]);
  }
  // IPv4-mapped IPv6, hex form as serialised by the URL parser (::ffff:7f00:1)
  const mappedHex = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(host);
  if (mappedHex) {
    const high = Number.parseInt(mappedHex[1], 16);
    const low = Number.parseInt(mappedHex[2], 16);
    return isPrivateIP(`${high >> 8}.${high & 255}.${low >> 8}.${low & 255}`);
  }
  // fc00::/7 (fc.. and fd..) and fe80::/10 (fe80..febf)
  return /^f[cd][0-9a-f]{2}:/.test(host) || /^fe[89ab][0-9a-f]:/.test(host);
}
/**
 * Common PII patterns.
 *
 * Every pattern carries the global flag so that `String.prototype.match` returns all
 * occurrences and `String.prototype.replace` rewrites all of them.
 * These are basic heuristics — in production, use more sophisticated detection.
 */
const PII_PATTERNS = {
  // The TLD class is [A-Za-z]; a "|" inside a character class is a literal pipe,
  // not an alternation, and must not be accepted as part of an address.
  email: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g,
  // North-American style numbers with optional +1 prefix and -, . or space separators.
  phone: /\b(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b/g,
  // US Social Security number, NNN-NN-NNNN.
  ssn: /\b\d{3}-\d{2}-\d{4}\b/g,
  // 16 digits in four groups, optionally separated by "-" or whitespace.
  creditCard: /\b(?:\d{4}[-\s]?){3}\d{4}\b/g,
  // Dotted-quad shape; octet ranges are not validated.
  ipAddress: /\b(?:\d{1,3}\.){3}\d{1,3}\b/g,
};
/**
 * Detect PII in text.
 *
 * Runs every pattern in `PII_PATTERNS` against `text` and reports which kinds matched.
 * Non-string or empty input is reported as containing no PII rather than throwing.
 *
 * @param {string} text - Text to scan.
 * @returns {{ hasPII: boolean, types: string[], matches: Object<string, string[]> }}
 *   `types` lists the pattern names that matched; `matches` maps each of those names
 *   to the matched substrings.
 */
function detectPII(text) {
  const types = [];
  const matches = {};
  // Nothing to scan: avoid a TypeError from calling .match on null/undefined/non-strings.
  if (typeof text !== "string" || text === "") {
    return { hasPII: false, types, matches };
  }
  for (const [type, pattern] of Object.entries(PII_PATTERNS)) {
    const found = text.match(pattern);
    if (found && found.length > 0) {
      types.push(type);
      matches[type] = found;
    }
  }
  return {
    hasPII: types.length > 0,
    types,
    matches,
  };
}
/**
 * Mask PII in text.
 *
 * Returns `text` unchanged unless `options.maskPII` is truthy. Otherwise the
 * `PII_PATTERNS` replacements are applied in order: email, phone, SSN, credit card,
 * IP address.
 *
 * @param {string} text - Text to mask.
 * @param {{ maskPII?: boolean }} [options] - Masking is skipped unless `maskPII` is truthy.
 * @returns {string} The masked text.
 */
function maskPII(text, options) {
  if (!options?.maskPII) {
    return text;
  }
  // Keep the first character of the local part, star the rest, keep the domain.
  const maskEmail = (hit) => {
    const pieces = hit.split("@");
    if (pieces.length !== 2) {
      return hit;
    }
    const [local, host] = pieces;
    return `${local[0]}${"*".repeat(local.length - 1)}@${host}`;
  };
  // Star every digit; a run of ten stars is regrouped as 3-3-4.
  const maskPhone = (hit) => {
    const digitCount = hit.replace(/\D/g, "").length;
    if (digitCount < 10) {
      return hit;
    }
    const starred = hit.replace(/\d/g, "*");
    return starred.replace(/(\*{3})(\*{3})(\*{4})/, "$1-$2-$3");
  };
  // Keep only the last four digits.
  const maskCard = (hit) => {
    const onlyDigits = hit.replace(/\D/g, "");
    return onlyDigits.length === 16 ? `****-****-****-${onlyDigits.slice(-4)}` : hit;
  };
  // Keep the first octet.
  const maskIp = (hit) => {
    const octets = hit.split(".");
    return octets.length === 4 ? `${octets[0]}.*.*.*` : hit;
  };
  const steps = [
    [PII_PATTERNS.email, maskEmail],
    [PII_PATTERNS.phone, maskPhone],
    [PII_PATTERNS.ssn, () => "***-**-****"],
    [PII_PATTERNS.creditCard, maskCard],
    [PII_PATTERNS.ipAddress, maskIp],
  ];
  return steps.reduce((current, [pattern, replacer]) => current.replace(pattern, replacer), text);
}
/**
 * Sanitize extracted data.
 *
 * Walks `data` recursively and returns a sanitized copy: every string is passed
 * through `sanitizeHtml` and/or `maskPII` according to `options`; arrays and objects
 * are rebuilt; all other values are returned as they are. When neither
 * `options.sanitizeHtml` nor `options.maskPII` is truthy, `data` itself is returned.
 *
 * @param {*} data - Extracted value (object, array or primitive).
 * @param {{ sanitizeHtml?: boolean, maskPII?: boolean }} [options]
 * @returns {*} Sanitized copy with the same shape as `data`. Nullish `data` yields an
 *   empty object when sanitization is enabled.
 */
function sanitizeExtractedData(data, options) {
  if (!options?.sanitizeHtml && !options?.maskPII) {
    return data;
  }
  // Recursively sanitize string values
  const sanitizeValue = (value) => {
    if (typeof value === "string") {
      const cleaned = options.sanitizeHtml ? sanitizeHtml(value, options) : value;
      return options.maskPII ? maskPII(cleaned, options) : cleaned;
    }
    if (Array.isArray(value)) {
      return value.map((item) => sanitizeValue(item));
    }
    if (value && typeof value === "object") {
      // Object.fromEntries defines own properties, so an own "__proto__" key (possible
      // in JSON-parsed page data) stays plain data instead of replacing the prototype
      // of the result, which `obj[key] = ...` would do.
      return Object.fromEntries(Object.entries(value).map(([key, val]) => [key, sanitizeValue(val)]));
    }
    return value;
  };
  // Start from `data` itself: spreading it into `{}` first would turn a top-level
  // array or string into an index-keyed object.
  return sanitizeValue(data ?? {});
}
/**
 * Normalize URLs for consistency and security.
 *
 * Removes common tracking query parameters and the fragment, and upgrades http: to
 * https: for non-private hosts. The upgrade is skipped when the URL names an explicit
 * port: rewriting only the scheme would yield e.g. "https://host:8080", pointing TLS
 * at a port that was given for plain HTTP.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} The normalized URL, or `url` unchanged when it cannot be parsed.
 */
function normalizeUrl(url) {
  try {
    const urlObj = new URL(url);
    // Remove tracking parameters
    const trackingParams = [
      "utm_source",
      "utm_medium",
      "utm_campaign",
      "utm_term",
      "utm_content",
      "fbclid",
      "gclid",
      "ref",
      "source",
    ];
    for (const param of trackingParams) {
      urlObj.searchParams.delete(param);
    }
    // Remove fragment
    urlObj.hash = "";
    // Ensure HTTPS when possible: default port only, and never for local/private hosts.
    if (urlObj.protocol === "http:" && urlObj.port === "" && !isPrivateIP(urlObj.hostname)) {
      urlObj.protocol = "https:";
    }
    return urlObj.toString();
  }
  catch {
    // Best effort: input that cannot be normalized is handed back as-is.
    return url;
  }
}