@trap_stevo/linkscope
Version:
Unleash legendary link intelligence—instantly extract Open Graph, Twitter Card, and metadata from any URL using adaptive scraping logic and real browser simulation. Feed raw text, batch URLs, or scoped domains—parse, adapt, and reveal the hidden web behin
207 lines (206 loc) • 7.28 kB
JavaScript
;
// Module-level alias for the LinkScope class. It is assigned right after the
// class body and is used as the brand token by the _extractUrls helper.
var _LinkScope;
/**
 * Brand check used for transpiled private members.
 *
 * @param {Function|{has: Function}} e - The owning class (compared by
 *   identity) or a collection exposing `has` (membership test).
 * @param {*} t - The receiver whose brand is being verified.
 * @param {*} [n] - The private member to hand back once the check passes.
 * @returns {*} `n` when three arguments were supplied, otherwise `t`.
 * @throws {TypeError} When `t` does not carry the brand.
 */
function _assertClassBrand(e, t, n) {
  const carriesBrand = typeof e === "function" ? e === t : e.has(t);
  if (!carriesBrand) {
    throw new TypeError("Private element is not present on this object");
  }
  // The argument count (not the value of `n`) decides what is returned, so an
  // explicitly passed `undefined` member is still returned as-is.
  if (arguments.length < 3) {
    return t;
  }
  return n;
}
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
const puppeteer = require("puppeteer-extra");
const ogs = require("open-graph-scraper");
const cheerio = require("cheerio");
// Register the stealth plugin on the puppeteer-extra instance at module load,
// so every browser launched through `puppeteer` below has it applied.
puppeteer.use(StealthPlugin());
/**
 * Static facade for extracting Open Graph / Twitter Card metadata from URLs.
 *
 * All methods are static and rely on the module-level helpers
 * (`_isValidUrl`, `_extractUrls`, `_scopeLinkFromBrowser`), which are invoked
 * through `_assertClassBrand`; the methods must therefore be called with the
 * class itself as `this` (e.g. `LinkScope.scopeLink(...)`).
 */
class LinkScope {
  /**
   * Fetch metadata for a single URL.
   *
   * The URL is first handed to open-graph-scraper. If that attempt rejects or
   * does not report success, the headless-browser fallback is used instead.
   *
   * @param {string} url - The URL to inspect.
   * @param {object} [options]
   * @param {string[]} [options.allowedProtocols] - Extra protocols accepted
   *   in addition to http/https.
   * @param {string} [options.proxy] - Proxy URL used for both strategies.
   * @returns {Promise<object>} Normalised metadata record.
   * @throws {Error} "Invalid URL" when validation fails, or an Error carrying
   *   the fallback failure (as `cause`) when both strategies fail.
   * @throws {TypeError} When `options.proxy` is not a parseable URL.
   */
  static async scopeLink(url, options = {}) {
    const allowedProtocols = options.allowedProtocols;
    const proxy = options.proxy;
    if (!_assertClassBrand(LinkScope, this, _isValidUrl).call(this, url, allowedProtocols)) {
      throw new Error("Invalid URL");
    }
    const ogsOptions = {
      url
    };
    if (proxy) {
      // Parsed only for validation: a malformed proxy string throws here,
      // before any network activity starts.
      new URL(proxy);
      // Older releases of these packages export the agent class as the module
      // itself, newer ones as a named export; accept either layout.
      const httpsAgentModule = require("https-proxy-agent");
      const HttpsProxyAgent = httpsAgentModule.HttpsProxyAgent || httpsAgentModule;
      const httpAgentModule = require("http-proxy-agent");
      const HttpProxyAgent = httpAgentModule.HttpProxyAgent || httpAgentModule;
      ogsOptions.fetchOptions = {
        agent: {
          https: new HttpsProxyAgent(proxy),
          http: new HttpProxyAgent(proxy)
        }
      };
    }
    let response = null;
    try {
      response = await ogs(ogsOptions);
    } catch {
      // open-graph-scraper signals failure by rejecting (with a plain object,
      // not an Error). Treat that as "no result" so the browser fallback below
      // still gets its chance instead of being skipped.
      response = null;
    }
    if (!response?.result?.success) {
      try {
        return await _assertClassBrand(LinkScope, this, _scopeLinkFromBrowser).call(this, url, proxy);
      } catch (fallbackError) {
        throw new Error(fallbackError?.message || "Did not get link metadata from deep scope.", {
          cause: fallbackError
        });
      }
    }
    const result = response.result;
    const images = Array.isArray(result.ogImage) ? result.ogImage.map(img => img.url) : result.ogImage?.url ? [result.ogImage.url] : [];
    // twitterImage may be a single object or a list of objects depending on
    // the scraper version; expose the first URL either way.
    const twitterImage = Array.isArray(result.twitterImage) ? result.twitterImage[0]?.url : result.twitterImage?.url;
    return {
      scopeTarget: result.requestUrl || url,
      ogTitle: result.ogTitle || null,
      ogDescription: result.ogDescription || null,
      ogType: result.ogType || null,
      ogUrl: result.ogUrl || null,
      ogImage: images,
      ogVideo: result.ogVideo || null,
      ogAudio: result.ogAudio || null,
      twitterCard: result.twitterCard || null,
      twitterTitle: result.twitterTitle || null,
      twitterDescription: result.twitterDescription || null,
      twitterImage: twitterImage || null,
      twitterPlayer: result.twitterPlayer || null,
      favicon: result.favicon || null,
      charset: result.charset || null,
      requestHeaders: result.requestHeaders || {},
      success: true,
      error: null,
      allRawMetaTags: result.allMeta || {}
    };
  }

  /**
   * Fetch metadata for several URLs with bounded concurrency.
   *
   * Never rejects because of an individual URL: each entry of the returned
   * array is `{ url, status: "fulfilled", data }` or
   * `{ url, status: "rejected", error }`, in the same order as the input.
   *
   * @param {Iterable<string>} urls - URLs to inspect.
   * @param {object} [options]
   * @param {string[]} [options.allowedProtocols]
   * @param {string} [options.proxy]
   * @param {number} [options.concurrency=5] - Maximum number of URLs processed
   *   at once; values below 1 or non-numeric values fall back to 5.
   * @returns {Promise<object[]>}
   */
  static async scopeMany(urls, options = {}) {
    const allowedProtocols = options.allowedProtocols;
    const proxy = options.proxy;
    // A worker count below 1 would start no workers and silently return an
    // empty result, so anything that is not a usable count uses the default.
    const requestedConcurrency = Number(options.concurrency);
    const concurrency = requestedConcurrency >= 1 ? Math.floor(requestedConcurrency) : 5;
    // Snapshot the input so later mutation by the caller cannot shift indices.
    const queue = Array.from(urls ?? []);
    const scopedResults = [];
    let cursor = 0;
    const worker = async () => {
      // Each worker keeps pulling the next unclaimed index until none remain.
      while (cursor < queue.length) {
        const index = cursor++;
        const url = queue[index];
        try {
          const data = await LinkScope.scopeLink(url, {
            allowedProtocols,
            proxy
          });
          scopedResults[index] = {
            url,
            status: "fulfilled",
            data
          };
        } catch (error) {
          const message = error instanceof Error ? error.message : typeof error === "string" ? error : JSON.stringify(error);
          scopedResults[index] = {
            url,
            status: "rejected",
            error: message
          };
        }
      }
    };
    const workers = Array.from({
      length: Math.min(concurrency, queue.length)
    }, () => worker());
    await Promise.all(workers);
    return scopedResults;
  }

  /**
   * Extract every URL from free text, de-duplicate them, and scope each one.
   *
   * @param {string} rawText - Text to scan; null/undefined are treated as "".
   * @param {object} [options] - Same options as `scopeMany`.
   * @returns {Promise<object[]>} Result entries as produced by `scopeMany`.
   */
  static async scopeText(rawText, options = {}) {
    const allowedProtocols = options.allowedProtocols;
    const proxy = options.proxy;
    const text = typeof rawText === "string" ? rawText : String(rawText ?? "");
    const urls = _assertClassBrand(LinkScope, this, _extractUrls).call(this, text, allowedProtocols);
    const uniqueUrls = [...new Set(urls)];
    return await this.scopeMany(uniqueUrls, {
      allowedProtocols,
      proxy,
      concurrency: options.concurrency
    });
  }
}
// Publish the class under its module-level alias so helper functions defined
// after the class can reference it for their brand checks.
_LinkScope = LinkScope;
/**
 * Fallback scraper: load the page in a headless browser and read its meta
 * tags from the rendered HTML.
 *
 * @param {string} url - Page to load.
 * @param {string} [proxy] - Passed to the browser as `--proxy-server`.
 * @returns {Promise<object>} Metadata record with the same keys as the
 *   primary scraper's result; `requestHeaders` and `allRawMetaTags` are
 *   always empty objects here.
 * @throws Propagates any launch, navigation, or parsing failure. The browser
 *   is closed in every case.
 */
async function _scopeLinkFromBrowser(url, proxy) {
  const launchOptions = {
    headless: true,
    args: ["--no-sandbox", "--disable-setuid-sandbox"]
  };
  if (proxy) {
    launchOptions.args.push(`--proxy-server=${proxy}`);
  }
  const browser = await puppeteer.launch(launchOptions);
  try {
    const page = await browser.newPage();
    await page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36");
    await page.setExtraHTTPHeaders({
      "accept-language": "en-US,en;q=0.9",
      "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
      "sec-fetch-site": "none",
      "sec-fetch-mode": "navigate",
      "sec-fetch-user": "?1",
      "sec-fetch-dest": "document",
      "upgrade-insecure-requests": "1"
    });
    await page.setViewport({
      height: 800,
      width: 1280
    });
    // Runs inside the page before any of its scripts: overrides a handful of
    // navigator properties with fixed values.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, "webdriver", {
        get: () => false
      });
      window.navigator.chrome = {
        runtime: {}
      };
      Object.defineProperty(navigator, "languages", {
        get: () => ["en-US", "en"]
      });
      Object.defineProperty(navigator, "plugins", {
        get: () => [1, 2, 3, 4, 5]
      });
      const permissions = window.navigator.permissions;
      if (permissions) {
        // Keep the native method bound to its owner; calling it detached
        // raises "Illegal invocation" for every non-notification query.
        const originalQuery = permissions.query.bind(permissions);
        permissions.query = parameters => parameters.name === "notifications" ? Promise.resolve({
          state: Notification.permission
        }) : originalQuery(parameters);
      }
    });
    await page.goto(url, {
      waitUntil: "networkidle2",
      timeout: 30000
    });
    // Short settle delay after navigation. A plain timer is used because
    // page.waitForTimeout is not available in newer Puppeteer releases.
    await new Promise(resolve => setTimeout(resolve, 27));
    const html = await page.content();
    const $ = cheerio.load(html);
    // Look a tag up by `property` first, then by `name`.
    const getMeta = name => $(`meta[property='${name}']`).attr("content") || $(`meta[name='${name}']`).attr("content") || null;
    const images = [];
    $("meta[property='og:image'], meta[name='og:image']").each((_, el) => {
      const img = $(el).attr("content");
      if (img) {
        images.push(img);
      }
    });
    return {
      scopeTarget: url,
      ogTitle: getMeta("og:title") || $("title").text() || null,
      ogDescription: getMeta("og:description") || getMeta("description") || null,
      ogType: getMeta("og:type"),
      ogUrl: getMeta("og:url"),
      ogImage: images,
      ogVideo: getMeta("og:video"),
      ogAudio: getMeta("og:audio"),
      twitterCard: getMeta("twitter:card"),
      twitterTitle: getMeta("twitter:title"),
      twitterDescription: getMeta("twitter:description"),
      twitterImage: getMeta("twitter:image"),
      twitterPlayer: getMeta("twitter:player"),
      favicon: $("link[rel~='icon']").attr("href") || null,
      charset: $("meta[charset]").attr("charset") || null,
      requestHeaders: {},
      success: true,
      error: null,
      allRawMetaTags: {}
    };
  } finally {
    // Always release the browser process, including when a step above throws.
    await browser.close();
  }
}
/**
 * Collect the http(s) URLs that occur in a piece of text.
 *
 * Must be invoked with the LinkScope class as `this`; the brand check is
 * performed for every candidate before it is validated.
 *
 * @param {string} text - Text to scan.
 * @param {string[]} [allowedProtocols] - Forwarded to the URL validator.
 * @returns {string[]} Matching URLs in order of appearance (duplicates kept).
 */
function _extractUrls(text, allowedProtocols) {
  const candidates = text.match(/https?:\/\/[^\s<>"']+/gi);
  const accepted = [];
  if (candidates === null) {
    return accepted;
  }
  for (const candidate of candidates) {
    const validate = _assertClassBrand(_LinkScope, this, _isValidUrl);
    if (validate.call(this, candidate, allowedProtocols)) {
      accepted.push(candidate);
    }
  }
  return accepted;
}
/**
 * Decide whether a string is a parseable URL with an accepted protocol.
 *
 * `http:` and `https:` are always accepted. Additional protocols may be
 * supplied with or without the trailing colon and in any letter case
 * (`"ftp"`, `"ftp:"`, `"FTP:"` are equivalent); non-string entries are
 * ignored.
 *
 * @param {string} url - Candidate URL.
 * @param {string[]} [allowedProtocols] - Extra protocols to accept; anything
 *   that is not an array is ignored.
 * @returns {boolean} `true` when `url` parses and its protocol is accepted.
 */
function _isValidUrl(url, allowedProtocols) {
  let protocol;
  try {
    protocol = new URL(url).protocol;
  } catch {
    return false;
  }
  const accepted = new Set(["http:", "https:"]);
  if (Array.isArray(allowedProtocols)) {
    for (const entry of allowedProtocols) {
      if (typeof entry !== "string") {
        continue;
      }
      // URL.protocol is always lower-case and colon-terminated, so bring the
      // caller's spelling into that form before comparing.
      const scheme = entry.trim().toLowerCase();
      if (scheme !== "") {
        accepted.add(scheme.endsWith(":") ? scheme : `${scheme}:`);
      }
    }
  }
  return accepted.has(protocol);
}
;
// CommonJS export: the LinkScope class itself is the module's export.
module.exports = LinkScope;