/*
 * link-view
 * Version:
 * A Node.js package to generate link previews from URLs
 * 727 lines (647 loc) • 33.2 kB
 * JavaScript
 */
const axios = require('axios');
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const NodeCache = require('node-cache');
// Register the stealth plugin on the shared puppeteer-extra instance
// before any browser is launched.
const stealthPlugin = StealthPlugin();
puppeteer.use(stealthPlugin);

// Preview cache settings (node-cache takes both durations in seconds):
// entries expire after 24 hours, the expiry sweep runs hourly, and the
// cache holds at most 1000 keys.
const previewCacheOptions = {
  maxKeys: 1000,
  checkperiod: 60 * 60,
  stdTTL: 24 * 60 * 60,
};
const previewCache = new NodeCache(previewCacheOptions);

// Shared browser handle; null until getBrowser() performs the first launch.
let browserInstance = null;
// Launch settings handed to puppeteer.launch(). The Chromium flags are
// grouped by purpose and flattened into one list in their original order.
const launchOptions = {
  headless: true,
  ignoreHTTPSErrors: true,
  args: [
    // Sandbox / process model
    [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      '--single-process',
    ],
    // Relaxed web security
    [
      '--disable-web-security',
      '--disable-features=IsolateOrigins',
    ],
    // Features switched off
    [
      '--disable-extensions',
      '--disable-audio-output',
      '--disable-remote-fonts',
      '--disable-background-networking',
      '--disable-default-apps',
    ],
    // Window geometry
    [
      '--window-size=1920,1080',
      '--start-maximized',
    ],
    // Automation fingerprint and first-run behaviour
    [
      '--disable-blink-features=AutomationControlled',
      '--enable-features=NetworkService',
      '--no-first-run',
      '--no-service-autorun',
      '--password-store=basic',
    ],
  ].flat(),
};
// Launch currently in flight, shared by concurrent callers so that only one
// Chromium process is started while `browserInstance` is still null.
let browserLaunchPromise = null;

/**
 * Returns the shared browser, launching it on first use.
 *
 * Concurrent callers that arrive while a launch is in progress all receive
 * the same browser instead of each starting their own. When the browser
 * emits `disconnected` (crash or close), the cached handle is cleared so the
 * next call launches a fresh one instead of reusing a dead instance.
 *
 * @returns {Promise<object>} The browser returned by `puppeteer.launch(launchOptions)`.
 * @throws Rejects with the launch error; a later call retries the launch.
 */
async function getBrowser() {
  if (browserInstance) {
    return browserInstance;
  }
  if (!browserLaunchPromise) {
    browserLaunchPromise = Promise.resolve()
      .then(() => puppeteer.launch(launchOptions))
      .then((browser) => {
        browser.once('disconnected', () => {
          // Only clear the slot if it still points at this browser.
          if (browserInstance === browser) {
            browserInstance = null;
          }
        });
        browserInstance = browser;
        return browser;
      })
      .finally(() => {
        // Success or failure, the next caller must not wait on a stale promise.
        browserLaunchPromise = null;
      });
  }
  return browserLaunchPromise;
}
/**
 * Decides whether a browser console message is known noise.
 *
 * @param {string} text - Console message text.
 * @returns {boolean} True when `text` contains any of the noise markers,
 *   compared case-insensitively.
 */
const shouldIgnoreConsoleMessage = (text) => {
  const noiseMarkers = [
    'net::ERR_FAILED',
    'Failed to fetch',
    'preloaded using link preload',
    'Failed to load resource',
    'Track&Report',
    'WebGL',
    'Metric emission failed',
    'error on etracker',
    'TypeError: Failed to fetch',
    'Warning -- sushi response',
    'MSAVowelsJavascriptAssets',
  ];
  const haystack = text.toLowerCase();
  for (const marker of noiseMarkers) {
    if (haystack.includes(marker.toLowerCase())) {
      return true;
    }
  }
  return false;
};
// Desktop Chrome user agent sent on Myntra requests issued from Node.
const MYNTRA_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// Cookies pre-seeded on the page before the first Myntra navigation.
const MYNTRA_COOKIES = [
  { name: 'AKA_A2', value: '1', domain: '.myntra.com' },
  { name: 'at', value: 'true', domain: '.myntra.com' },
  { name: 'cart_count', value: '0', domain: '.myntra.com' },
  { name: 'mynt-loc-src', value: 'expiry', domain: '.myntra.com' },
];

// Resource types and URL substrings the generic request filter always aborts.
const GENERIC_BLOCKED_RESOURCE_TYPES = ['media', 'font', 'websocket', 'manifest', 'other'];
const GENERIC_BLOCKED_URL_MARKERS = [
  'analytics',
  'tracking',
  'metrics',
  'advertisement',
  'sponsored',
  'unagi',
  'sushi',
  'track',
  'report',
  'etracker',
];

/** Escapes a value for interpolation into HTML text or a double-quoted attribute. */
function escapeHtml(value) {
  return String(value ?? '')
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * Stores a preview in the cache without letting a cache failure (for example
 * the key limit being reached) turn a successful scrape into a failed attempt.
 */
function cachePreview(url, preview) {
  try {
    previewCache.set(url, preview);
  } catch (error) {
    console.warn(`Preview for ${url} was not cached:`, error.message);
  }
}

/** Closes a page, logging instead of throwing so cleanup never masks the real outcome. */
async function closePageQuietly(page) {
  if (!page) return;
  try {
    await page.close();
  } catch (error) {
    console.warn('Failed to close page:', error.message);
  }
}

/**
 * Resolves when any of `waits` settles or after `ms` milliseconds, whichever
 * comes first, and clears the timer so it does not outlive the race.
 */
async function raceWithTimeout(waits, ms) {
  let timer;
  const timeout = new Promise((resolve) => {
    timer = setTimeout(resolve, ms);
  });
  try {
    return await Promise.race([...waits, timeout]);
  } finally {
    clearTimeout(timer);
  }
}

/** Forwards browser console messages to the Node console, minus known noise. */
function attachConsoleLogging(page) {
  page.on('console', (msg) => {
    const text = msg.text();
    if (!shouldIgnoreConsoleMessage(text)) {
      console.log('Browser console:', text);
    }
  });
}

/** Myntra policy: allow documents, the product gateway API and JPEG/PNG product images only. */
function isMyntraEssentialRequest(resourceType, requestUrl) {
  if (resourceType === 'document' || requestUrl.includes('gateway/v2/product')) {
    return true;
  }
  const isPhoto = requestUrl.includes('.jpg') || requestUrl.includes('.jpeg') || requestUrl.includes('.png');
  return resourceType === 'image' &&
    isPhoto &&
    !requestUrl.includes('sprite') &&
    !requestUrl.includes('icon');
}

/** Generic policy: drop tracking/heavy resources, allow documents, content images and core scripts. */
function isGenericAllowedRequest(resourceType, requestUrl) {
  if (GENERIC_BLOCKED_RESOURCE_TYPES.includes(resourceType) ||
      GENERIC_BLOCKED_URL_MARKERS.some((marker) => requestUrl.includes(marker))) {
    return false;
  }
  return resourceType === 'document' ||
    (resourceType === 'image' && !requestUrl.includes('sprite') && !requestUrl.includes('icon')) ||
    (resourceType === 'script' && (requestUrl.includes('jquery') || requestUrl.includes('main')));
}

/**
 * Enables request interception and installs the single request listener for
 * this page. `shouldAllow(resourceType, lowercasedUrl)` decides continue vs abort.
 */
async function attachRequestFilter(page, shouldAllow) {
  await page.setRequestInterception(true);
  const report = (error) => {
    // A request resolved elsewhere is not an error worth logging.
    if (!String(error && error.message).includes('Request is already handled')) {
      console.error('Request interception error:', error);
    }
  };
  page.on('request', (request) => {
    // Optional call: older Puppeteer versions do not expose this method.
    if (request.isInterceptResolutionHandled?.()) return;
    try {
      const allowed = shouldAllow(request.resourceType(), request.url().toLowerCase());
      const pending = allowed ? request.continue() : request.abort();
      // continue()/abort() may reject asynchronously; never leave that unhandled.
      Promise.resolve(pending).catch(report);
    } catch (error) {
      report(error);
    }
  });
}

/** Browser-side init script: anti-detection patches applied on every new Myntra document. */
function installMyntraStealthPatches() {
  const originalQuery = window.navigator.permissions.query;
  window.navigator.permissions.query = (parameters) => (
    parameters.name === 'notifications' ?
      Promise.resolve({ state: Notification.permission }) :
      originalQuery(parameters)
  );
  // Overwrite navigator properties
  Object.defineProperties(navigator, {
    webdriver: { get: () => undefined },
    language: { get: () => 'en-US' },
    languages: { get: () => ['en-US', 'en'] },
    deviceMemory: { get: () => 8 },
    hardwareConcurrency: { get: () => 8 },
    userAgent: { get: () => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' }
  });
  // Add touch support
  const touchSupport = {
    maxTouchPoints: 5,
    ontouchstart: null,
    ontouchend: null,
    ontouchmove: null,
    ontouchcancel: null
  };
  Object.assign(window.navigator, touchSupport);
  // Mock scheduling APIs
  window.requestIdleCallback = window.requestIdleCallback || ((cb) => setTimeout(cb, 1));
  window.cancelIdleCallback = window.cancelIdleCallback || ((id) => clearTimeout(id));
}

/** Browser-side init script: stubs the Amazon tracking/error-reporting globals. */
function silenceAmazonTelemetry() {
  window.ue_csm = { ue: {} };
  window.ue = { log: () => {}, count: () => {}, tag: () => {} };
  window.uet = () => {};
  window.uex = () => {};
  window.ueLogError = () => {};
}

/**
 * Browser-side init script: extra "regular browser" shims for Myntra.
 * NOTE(review): installMyntraStealthPatches already defines navigator.webdriver
 * without `configurable`, so the first defineProperty below likely throws and
 * the remaining lines never run — confirm whether that is intended.
 */
function installMyntraBrowserShims() {
  Object.defineProperty(navigator, 'webdriver', { get: () => false });
  window.chrome = { runtime: {} };
  Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
}

/**
 * Browser-side: reads title, description and main image from Myntra product
 * selectors. Returns null when none of the three could be found.
 */
function extractMyntraProductInPage() {
  const getTitle = () => {
    const titleElement = document.querySelector('.pdp-title') ||
      document.querySelector('.pdp-name') ||
      document.querySelector('h1.title');
    return titleElement ? titleElement.textContent.trim() : null;
  };
  const getDescription = () => {
    const descElement = document.querySelector('.pdp-product-description') ||
      document.querySelector('.index-productDescriptors') ||
      document.querySelector('.pdp-product-description-content');
    return descElement ? descElement.textContent.trim() : null;
  };
  const getImage = () => {
    // Try multiple image selectors
    const imageElement = document.querySelector('.image-grid-imageContainer img') ||
      document.querySelector('.pdp-image img') ||
      document.querySelector('.img-responsive');
    if (imageElement) {
      return imageElement.src ||
        imageElement.getAttribute('data-src') ||
        imageElement.getAttribute('srcset')?.split(',')[0];
    }
    return null;
  };
  const title = getTitle();
  const description = getDescription();
  const image = getImage();
  if (!title && !description && !image) {
    return null;
  }
  return {
    title: title || 'Product Title Not Available',
    description: description || 'No description available',
    image,
    content: document.documentElement.innerHTML
  };
}

/**
 * Browser-side: extracts { content, title, image, description } from any
 * product-like page using JSON-LD, meta tags and common e-commerce selectors.
 */
function extractGenericPreviewInPage() {
  const getMetaContent = (selectors) => {
    for (const selector of selectors) {
      try {
        const element = document.querySelector(selector);
        if (element) {
          const content = element.getAttribute('content') ||
            element.getAttribute('value') ||
            element.textContent;
          if (content) return content.trim();
        }
      } catch (e) {
        console.error('Selector error:', e);
      }
    }
    return null;
  };
  const findProductTitle = () => {
    // Try structured data first (JSON-LD)
    const jsonLdScripts = document.querySelectorAll('script[type="application/ld+json"]');
    for (const script of jsonLdScripts) {
      try {
        const data = JSON.parse(script.textContent);
        // Handle different JSON-LD structures
        if (data['@type'] === 'Product' && data.name) return data.name;
        if (data['@graph']) {
          const product = data['@graph'].find(item =>
            item['@type'] === 'Product' ||
            item['@type'] === 'IndividualProduct'
          );
          if (product?.name) return product.name;
        }
      } catch (e) {
        // Malformed JSON-LD is skipped on purpose; later strategies still run.
      }
    }
    // Try meta tags
    const metaTitleSelectors = [
      'meta[property="og:title"]',
      'meta[name="twitter:title"]',
      'meta[property="product:title"]',
      'meta[name="title"]'
    ];
    for (const selector of metaTitleSelectors) {
      const meta = document.querySelector(selector);
      if (meta?.content) {
        const content = meta.content.trim();
        if (content) return content;
      }
    }
    // Try common product title elements
    const titleSelectors = [
      // Shopify specific
      '.product__title',
      '.product-single__title',
      '[data-product-title]',
      // Common e-commerce patterns
      '[class*="product"][class*="title"]',
      '[class*="product"][class*="name"]',
      '[id*="product"][id*="title"]',
      '[id*="product"][id*="name"]',
      '.product-title',
      '.product-name',
      '#product-title',
      '#product-name',
      // Generic but likely product titles
      'h1.title',
      'h1.name',
      'h1:first-of-type',
      // Breadcrumb last item
      '.breadcrumb li:last-child',
      '[class*="breadcrumb"] span:last-child'
    ];
    for (const selector of titleSelectors) {
      const element = document.querySelector(selector);
      if (element) {
        const text = element.textContent.trim();
        if (text && text.length > 3) {
          // Clean up the title
          return text
            .replace(/\s+/g, ' ') // Remove extra spaces
            .replace(/^\W+|\W+$/g, '') // Remove leading/trailing special chars
            .replace(/\| .*$/, '') // Remove everything after |
            .replace(/- .*$/, '') // Remove everything after -
            .trim();
        }
      }
    }
    // Last resort: try to find the most prominent text
    const h1s = Array.from(document.getElementsByTagName('h1'));
    for (const h1 of h1s) {
      if (h1.offsetHeight > 0 && h1.offsetWidth > 0) {
        const text = h1.textContent.trim();
        if (text && text.length > 3) return text;
      }
    }
    // If still no title, try the page title
    const pageTitle = document.title;
    if (pageTitle) {
      return pageTitle
        .split(/[|\-–—]/) // Split on common separators
        .map(part => part.trim())
        .filter(part => part.length > 3)
        .shift() || pageTitle;
    }
    return '';
  };
  const findBestImage = () => {
    const imageSelectors = [
      // Add Myntra-specific selectors
      '.image-grid-imageContainer img',
      '.pdp-image img',
      '.img-responsive',
      // High-res and zoom images
      'img[data-zoom-image]',
      'img[data-large-image]',
      'img[data-old-hires]',
      '[data-zoom-image]',
      // Meta images
      'meta[property="og:image"]',
      'meta[name="twitter:image"]',
      'meta[property="product:image"]',
      // Common product image patterns
      '.product__image img',
      '.product-single__image img',
      '.product-featured-img',
      '#ProductPhotoImg',
      '.product-image img',
      '#product-image img',
      '.gallery-image img',
      '[data-main-image]',
      '[id*="product"][id*="image"]',
      '[class*="product"][class*="image"]'
    ];
    for (const selector of imageSelectors) {
      const element = document.querySelector(selector);
      if (element) {
        const src = element.getAttribute('data-zoom-image') ||
          element.getAttribute('data-large-image') ||
          element.getAttribute('data-old-hires') ||
          element.getAttribute('content') ||
          element.src;
        if (src && !src.includes('logo') && !src.includes('icon')) {
          return src;
        }
      }
    }
    // Find largest image as fallback
    let bestImage = null;
    let maxArea = 0;
    document.querySelectorAll('img').forEach(img => {
      if (img.width > 200 && img.height > 200) {
        const area = img.width * img.height;
        if (area > maxArea && !img.src.includes('logo')) {
          maxArea = area;
          bestImage = img.src;
        }
      }
    });
    return bestImage;
  };
  // Generic selectors that work across most e-commerce sites
  const titleSelectors = [
    'meta[property="og:title"]',
    'meta[name="twitter:title"]',
    'h1',
    '[class*="product-title"]',
    '[class*="productTitle"]',
    '[class*="product-name"]',
    '[class*="productName"]',
    '.pdp_title', // Common in many e-commerce sites
    '#productTitle', // Amazon-style
    // Add Myntra-specific selectors
    '.pdp-name',
    '.pdp-title',
  ];
  const descriptionSelectors = [
    'meta[property="og:description"]',
    'meta[name="description"]',
    '[class*="description"]',
    '[class*="product-details"]',
    '[class*="productDetails"]',
    '#feature-bullets', // Amazon-style
    '.pdp_description', // Common in many e-commerce sites
    '[data-testid*="description"]',
    // Add Myntra-specific selectors
    '.pdp-product-description',
    '.index-productDescriptors',
  ];
  return {
    content: document.documentElement.innerHTML,
    title: findProductTitle() || getMetaContent(titleSelectors) || document.title,
    image: findBestImage(),
    description: getMetaContent(descriptionSelectors)
  };
}

/** Builds the Myntra gateway API URL from the second-to-last path segment of `url`. */
function buildMyntraApiUrl(url) {
  const productId = url.split('/').slice(-2)[0];
  return `https://www.myntra.com/gateway/v2/product/${productId}`;
}

/** Converts the `style` object of a Myntra gateway response into a preview object. */
function buildMyntraApiPreview(style) {
  // Guarded lookup: a response without media/photos yields no image instead of throwing.
  const image = style.media?.photos?.[0]?.secureSrc;
  const safeName = escapeHtml(style.name);
  const content = [
    '',
    '<html>',
    '<head>',
    `<title>${safeName}</title>`,
    `<meta name="description" content="${escapeHtml(style.description)}">`,
    '</head>',
    '<body>',
    `<h1>${safeName}</h1>`,
    ...(image ? [`<img src="${escapeHtml(image)}" alt="${safeName}">`] : []),
    // Left unescaped as before; presumably the API description may carry markup — confirm.
    `<p>${style.description}</p>`,
    '</body>',
    '</html>',
    '',
  ].join('\n');
  return {
    title: style.name || style.brand,
    description: style.description,
    image,
    content,
  };
}

/** Applies the Myntra viewport, stealth script, headers and cookies to a fresh page. */
async function prepareMyntraPage(page) {
  // Randomize viewport size slightly
  const width = 1920 + Math.floor(Math.random() * 100);
  const height = 1080 + Math.floor(Math.random() * 100);
  await page.setViewport({ width, height, deviceScaleFactor: 1 });
  await page.evaluateOnNewDocument(installMyntraStealthPatches);
  await page.setExtraHTTPHeaders({
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': MYNTRA_USER_AGENT
  });
  await page.setCookie(...MYNTRA_COOKIES);
}

/**
 * Navigates an already-configured page and extracts a preview with the
 * generic extractor. For Myntra it first tries the gateway API through the
 * page and returns a preview object if that works; otherwise it returns the
 * synthesized HTML string.
 */
async function loadGenericPreview(page, url, { isMyntra, navigationTimeout }) {
  const response = await page.goto(url, {
    waitUntil: ['domcontentloaded'],
    timeout: navigationTimeout
  });
  // Accept both OK and Not Modified responses
  if (!response || (!response.ok() && response.status() !== 304)) {
    throw new Error(`Invalid response: ${response?.status() || 'no response'}`);
  }
  if (isMyntra) {
    const apiResponse = await page.goto(buildMyntraApiUrl(url), {
      waitUntil: 'networkidle0',
      timeout: 30000
    });
    // goto() can resolve to null; treat that like a failed API call.
    if (apiResponse?.ok()) {
      const data = await apiResponse.json();
      if (data && data.style) {
        return buildMyntraApiPreview(data.style);
      }
    }
    // Fallback to regular page load if API fails
    await page.goto(url, {
      waitUntil: ['domcontentloaded', 'networkidle0'],
      timeout: 30000
    });
    await raceWithTimeout([
      page.waitForSelector('.pdp-name, .pdp-title', { timeout: 15000 }),
      page.waitForSelector('.image-grid-imageContainer, .pdp-image', { timeout: 15000 }),
    ], 10000);
  } else {
    await raceWithTimeout([page.waitForSelector('body', { timeout: 5000 })], 2000);
  }

  const { content, image, title, description } = await page.evaluate(extractGenericPreviewInPage);
  if (!title) {
    throw new Error('Failed to extract title');
  }
  // Scraped values are escaped so quotes or angle brackets cannot break the markup.
  const safeTitle = escapeHtml(title);
  return [
    '',
    '<html>',
    '<head>',
    `<title>${safeTitle}</title>`,
    image ? `<meta name="product-image" content="${escapeHtml(image)}">` : '',
    description ? `<meta name="description" content="${escapeHtml(description)}">` : '',
    `<meta name="extracted-title" content="${safeTitle}">`,
    '</head>',
    `<body>${content}</body>`,
    '</html>',
    '',
  ].join('\n');
}

/** Myntra flow: page scrape, then gateway API via fetch, then the generic loader. */
async function scrapeMyntraPreview(page, url, { isAmazon, navigationTimeout }) {
  await prepareMyntraPage(page);
  attachConsoleLogging(page);
  page.setDefaultNavigationTimeout(navigationTimeout);
  await attachRequestFilter(page, isMyntraEssentialRequest);
  if (isAmazon) {
    await page.evaluateOnNewDocument(silenceAmazonTelemetry);
  }
  await page.evaluateOnNewDocument(installMyntraBrowserShims);

  // Try regular page load first
  await page.goto(url, {
    waitUntil: ['domcontentloaded'],
    timeout: 30000
  });
  await raceWithTimeout([
    page.waitForSelector('.pdp-name'),
    page.waitForSelector('.pdp-title'),
    page.waitForSelector('.image-grid-imageContainer'),
    page.waitForSelector('.pdp-image'),
  ], 5000);
  const productInfo = await page.evaluate(extractMyntraProductInPage);
  if (productInfo) {
    return productInfo;
  }

  // If page scraping fails, try the API as fallback
  const response = await fetch(buildMyntraApiUrl(url), {
    headers: {
      'User-Agent': MYNTRA_USER_AGENT,
      'Accept': 'application/json',
      'Accept-Language': 'en-US,en;q=0.9',
      'Referer': url
    }
  });
  if (response.ok) {
    const data = await response.json();
    if (data && data.style) {
      return buildMyntraApiPreview(data.style);
    }
  }
  // Listeners are already attached above, so the generic loader reuses them.
  return loadGenericPreview(page, url, { isMyntra: true, navigationTimeout });
}

/** Non-Myntra flow: configure the page once, then run the generic loader. */
async function scrapeGenericPreview(page, url, { isAmazon, navigationTimeout }) {
  attachConsoleLogging(page);
  page.setDefaultNavigationTimeout(navigationTimeout);
  await attachRequestFilter(page, isGenericAllowedRequest);
  if (isAmazon) {
    await page.evaluateOnNewDocument(silenceAmazonTelemetry);
  }
  return loadGenericPreview(page, url, { isMyntra: false, navigationTimeout });
}

/**
 * Fetches preview data for `url`, using the cache first and a headless
 * browser otherwise, retrying up to three times with exponential backoff.
 *
 * @param {string} url - Page to preview. URLs containing "myntra" or "amazon"
 *   (case-insensitive) get site-specific handling.
 * @returns {Promise<string|object>} The cached value if present. Otherwise a
 *   synthesized HTML string, or for Myntra a
 *   `{ title, description, image, content }` object. After three failed
 *   Myntra attempts a placeholder object is returned.
 * @throws {Error} For non-Myntra URLs when every attempt fails; the last
 *   underlying error is attached as `cause`.
 */
const fetchWithFallback = async (url) => {
  const cachedData = previewCache.get(url);
  if (cachedData) return cachedData;

  const maxRetries = 3;
  const lowerUrl = url.toLowerCase();
  const isAmazon = lowerUrl.includes('amazon');
  const isMyntra = lowerUrl.includes('myntra');
  let lastError = null;

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    let page = null;
    try {
      const browser = await getBrowser();
      page = await browser.newPage();
      // Adjust timeouts based on site and attempt
      const navigationTimeout = isAmazon || isMyntra ? 90000 : (attempt === 1 ? 30000 : 45000);
      const scrape = isMyntra ? scrapeMyntraPreview : scrapeGenericPreview;
      const preview = await scrape(page, url, { isAmazon, navigationTimeout });
      cachePreview(url, preview);
      return preview;
    } catch (error) {
      console.error(`Attempt ${attempt} failed for ${url}:`, error.message);
      lastError = error;
    } finally {
      // Runs on success and failure alike, so the page is never leaked.
      await closePageQuietly(page);
    }
    if (attempt < maxRetries) {
      const backoffTime = (2 ** attempt) * (isMyntra ? 3000 : 1000);
      await new Promise((resolve) => setTimeout(resolve, backoffTime));
    }
  }

  // Return a fallback object for Myntra if all attempts fail
  if (isMyntra) {
    return {
      title: "Myntra Product",
      description: "Product information temporarily unavailable",
      image: null,
      content: "<html><body><p>Content temporarily unavailable</p></body></html>"
    };
  }
  throw new Error(`Failed after ${maxRetries} attempts: ${lastError.message}`, { cause: lastError });
};
// Graceful shutdown on Ctrl+C: close the shared browser if one was launched,
// then exit. The exit sits in `finally` so that a failing close() can never
// leave the process running with the default SIGINT behaviour overridden.
process.on('SIGINT', async () => {
  try {
    if (browserInstance) await browserInstance.close();
  } catch (error) {
    console.error('Failed to close browser during shutdown:', error && error.message);
  } finally {
    process.exit();
  }
});
module.exports = { fetchWithFallback };