UNPKG

relatt-scraper

Version:

Metascarper

274 lines (235 loc) • 7.05 kB

JavaScript

const metascraper = require("metascraper")([
  require("metascraper-image")(),
  require("metascraper-logo-favicon")(),
  require("metascraper-description")(),
  require("metascraper-title")(),
  require("metascraper-url")(),
]);
const he = require("he");
const { fetchLinkThroughRedirects } = require("../utils/redirect");
const { forceDecoding } = require("../utils/encode");
const { analyzeSubdomain } = require("./analyzeSubdomain");
const { processLinkImage } = require("./processLinkImage");
const { partitionLink } = require("./partitionLink");
const { getUrlHeaderData } = require("../utils/getUrlHeaderData");
const { isUrlGivenRssFeed } = require("../modules/checkFeedURL");
const { getInstagramPreview } = require("../utils/getInstagramPreview");
const { getTwitterPreview } = require("../utils/getTwitterPreview");
const got = require("got");

const INSTAGRAM_HOSTNAMES = ["www.instagram.com", "instagram.com"];
const TWITTER_HOSTNAMES = ["www.twitter.com", "twitter.com"];

// Unicode replacement character: its presence in decoded text means the bytes
// were not valid for the encoding used to decode them.
const REPLACEMENT_CHAR = "\uFFFD";

// Content types that verifyHeaders lets through to the scraping pipeline in
// addition to anything containing "text/html". The `text/xml` prefix also
// covers every `text/xml; charset=...` variant.
const NON_HTML_ACCEPTED_CONTENT_TYPE =
  /^(?:application\/(?:rss\+xml|xml|octet-stream)|text\/xml)/;

/**
 * Scrape preview metadata (logo, image, description, title) for a link.
 *
 * Instagram and Twitter hostnames are delegated to their dedicated preview
 * helpers. Every other link goes through a header check, an optional
 * subdomain/protocol analysis, a body fetch that follows redirects, and
 * finally metascraper.
 *
 * @param {Object} params
 * @param {string} params.link - URL to scrape.
 * @param {boolean} [params.shouldCheckSubdomainAndProtocol=true] - When true,
 *   the link is passed through analyzeSubdomain and the analyzed URL is the
 *   one that gets fetched.
 * @param {boolean} [params.analyzeImage=true] - When true and an image was
 *   found, the image is passed through processLinkImage.
 * @param {boolean} [params.checkIfRssUrl=false] - When true, the link is also
 *   checked with isUrlGivenRssFeed and an RSS result short-circuits scraping.
 * @param {*} params.operationType - Forwarded untouched to the helpers.
 * @returns {Promise<Object>} The scraped metadata, or the short-circuit result
 *   produced by the header check / social-network preview helpers.
 */
const metascrape = async ({
  link,
  shouldCheckSubdomainAndProtocol = true,
  analyzeImage = true,
  checkIfRssUrl = false,
  operationType,
}) => {
  let linkToProcess = link;
  const linkParts = await partitionLink({
    link: link,
    shouldCheckSubdomainAndProtocol: false,
    operationType,
  });

  if (INSTAGRAM_HOSTNAMES.includes(linkParts.hostname)) {
    const instagramFetchResult = await getInstagramPreview(link);
    return {
      logo: instagramFetchResult.logo,
      image: instagramFetchResult.image,
      description: instagramFetchResult.description,
      title: instagramFetchResult.title,
      processedUrl: link,
      availableImgs: [],
      logs: [],
      isFile: false,
    };
  }

  if (TWITTER_HOSTNAMES.includes(linkParts.hostname)) {
    return await getTwitterPreview(link);
  }

  const headersCheck = await verifyHeaders(link, checkIfRssUrl, operationType);
  let metaScrapeLogs = [...headersCheck.result.logs];
  if (!headersCheck.isHtmlContentType) {
    return headersCheck.result;
  }

  if (shouldCheckSubdomainAndProtocol) {
    const { analyzedUrl, logs } = await analyzeSubdomain(link, operationType);
    metaScrapeLogs = [...metaScrapeLogs, ...logs];
    linkToProcess = analyzedUrl;
  }

  /**
   * Fetch url content
   */
  const { body, url, contentType, ...getUrlResult } = await getUrlBody(
    linkToProcess,
    linkParts,
    operationType
  );
  metaScrapeLogs = [...metaScrapeLogs, ...getUrlResult.redirectLogs];

  /**
   * From the html get Open Graph data
   */
  let availableImgs = [];
  let html = body.toString("utf8");
  let { logo, image, description, title } = await metascraper({ html, url });

  /**
   * If the fetched content is not text/html, discard the scraped values
   * (the logo is intentionally left as scraped)
   */
  if (!contentType || !contentType.includes("text/html")) {
    image = null;
    description = "";
    title = "";
  }

  /**
   * A replacement character in the title or description means the body was
   * not valid UTF-8: decode it as latin1 and scrape the text fields again
   */
  if (hasReplacementChar(description) || hasReplacementChar(title)) {
    html = body.toString("latin1");
    const metadata = await metascraper({ html, url });
    description = metadata.description;
    title = metadata.title;
  }

  title = cleanScrapedText(title);
  description = cleanScrapedText(description);

  /**
   * If an image was found, check that it is actually retrievable
   */
  if (image && analyzeImage) {
    const processImageResult = await processLinkImage(
      image,
      body,
      url,
      operationType
    );
    image = processImageResult.image;
    metaScrapeLogs = [...metaScrapeLogs, ...processImageResult.logs];
    availableImgs = processImageResult.availableImgs;
  }

  const isVideo = await isVideoUrl(url);

  return {
    logo,
    image,
    description,
    title,
    processedUrl: url,
    availableImgs,
    logs: metaScrapeLogs,
    contentType: headersCheck.headers.contentType,
    contentLength: headersCheck.headers.contentLength,
    isFile: false,
    isVideo,
  };
};

/**
 * Tell whether a scraped text contains the Unicode replacement character.
 * @param {string|null|undefined} text
 * @returns {boolean}
 */
function hasReplacementChar(text) {
  return Boolean(text) && text.includes(REPLACEMENT_CHAR);
}

/**
 * Replace HTML tags with spaces, decode HTML entities and force the text
 * through forceDecoding. Empty or missing text yields "".
 * @param {string|null|undefined} text
 * @returns {string}
 */
function cleanScrapedText(text) {
  if (!text) {
    return "";
  }
  return forceDecoding(he.decode(text.replace(/<(?:.|\n)*?>/gm, " ")));
}

/**
 * Fetch the body of a link, following redirects. YouTube video links are
 * first rewritten to their short form by processUrlOfVideoPlatform.
 * @param {string} link
 * @param {*} linkParts - Result of partitionLink, forwarded to the fetcher.
 * @param {*} operationType - Forwarded to the fetcher.
 * @returns {Promise<Object>} Whatever fetchLinkThroughRedirects resolves to.
 */
async function getUrlBody(link, linkParts, operationType) {
  const processedUrl = processUrlOfVideoPlatform(link);
  return fetchLinkThroughRedirects(
    `${processedUrl}`,
    [],
    operationType,
    linkParts
  );
}

/**
 * Tell whether a Content-Type header value should go through scraping.
 * @param {string|undefined} contentType
 * @returns {boolean}
 */
function isAcceptedContentType(contentType) {
  if (!contentType) {
    return false;
  }
  return (
    NON_HTML_ACCEPTED_CONTENT_TYPE.test(contentType) ||
    contentType.includes("text/html")
  );
}

/**
 * Verify the content headers of the link.
 *
 * @param {string} link
 * @param {boolean} checkIfRssUrl - When true, also check whether the link
 *   exposes an RSS feed.
 * @param {*} operationType - Forwarded to isUrlGivenRssFeed.
 * @returns {Promise<{isHtmlContentType: boolean, headers: Object, result: Object}>}
 *   `isHtmlContentType` is false when the caller must return `result` as-is
 *   (the link is a file, or an RSS feed was detected).
 */
async function verifyHeaders(link, checkIfRssUrl, operationType) {
  const headers = await getUrlHeaderData({
    link,
    // NOTE(review): TLS certificate validation is disabled for this request.
    // Presumably deliberate, to tolerate misconfigured sites; confirm.
    fetchOptions: { https: { rejectUnauthorized: false } },
  });
  const logs = [...headers.logs];

  if (!isAcceptedContentType(headers.contentType)) {
    return {
      isHtmlContentType: false,
      headers,
      result: {
        processedUrl: link,
        logs,
        contentType: headers.contentType,
        contentLength: headers.contentLength,
        isFile: true,
      },
    };
  }

  if (checkIfRssUrl) {
    const checkRssResult = await isUrlGivenRssFeed(link, operationType);
    if (checkRssResult && checkRssResult.rssUrl) {
      return {
        isHtmlContentType: false,
        headers,
        result: {
          isRssUrl: true,
          processedUrl: checkRssResult.rssUrl,
          logs,
        },
      };
    }
  }

  return {
    isHtmlContentType: true,
    headers,
    result: {
      logs,
    },
  };
}

/**
 * Rewrite a YouTube video URL to its `https://youtu.be/<id>` short form.
 * Channel URLs, `youtube.com/@handle` URLs and every non-YouTube URL are
 * returned unchanged.
 *
 * The captures are read from the match result rather than from the legacy
 * `RegExp.$n` statics: those keep the values of the last *successful* match
 * of any regex, so a non-matching URL could be rewritten using stale data.
 *
 * @param {string} url
 * @returns {string}
 */
function processUrlOfVideoPlatform(url) {
  const regEx =
    /(http:|https:|)\/\/(player.|www.)?(vimeo\.com|youtu(be\.com|\.be|be\.googleapis\.com)|dailymotion.com)\/(video\/|embed\/|shorts\/|watch\?v=|v\/)?([A-Za-z0-9._%-?]*)(\&\S+)?/;
  const match = url.match(regEx);
  if (!match) {
    return url;
  }

  const platform = match[3];
  const videoId = match[6];
  if (!platform.includes("youtu")) {
    return url;
  }
  if (url.includes("channel") || url.includes("youtube.com/@")) {
    return url;
  }
  return `https://youtu.be/${videoId}`;
}

/**
 * Tell whether a URL points to playable media on a known platform
 * (SoundCloud, Vimeo, YouTube, Dailymotion, Spotify).
 *
 * Channel pages and `youtube.com/@handle` pages are not considered media,
 * matching the exclusions applied by processUrlOfVideoPlatform.
 *
 * @param {string|null|undefined} url
 * @returns {Promise<boolean>}
 */
async function isVideoUrl(url) {
  // Guard first: the SoundCloud check below calls url.match().
  if (!url) {
    return false;
  }
  if (await isSoundCloudVideo(url)) {
    return true;
  }

  const regEx =
    /(http:|https:|)\/\/(player.|www.|open.)?(vimeo\.com|youtu(be\.com|\.be|be\.googleapis\.com)|dailymotion.com|spotify.com)\/(video\/|shorts\/|embed\/|episode\/|artist\/|playlist\/|track\/|watch\?v=|v\/)?([A-Za-z0-9._%-]*)(\&\S+)?/;
  if (!regEx.test(url)) {
    return false;
  }
  return !url.includes("channel") && !url.includes("youtube.com/@");
}

/**
 * Tell whether a SoundCloud URL can be loaded by the SoundCloud player
 * widget. Non-SoundCloud URLs yield false without any network request.
 *
 * @param {string} url
 * @returns {Promise<boolean>}
 */
async function isSoundCloudVideo(url) {
  const regEx = /^https?:\/\/(www.)?(soundcloud\.com|snd\.sc)\/(.*)$/;
  if (!url.match(regEx)) {
    return false;
  }
  try {
    // Encode the target so its own query string / fragment cannot leak into
    // the widget request's parameters.
    await got(`https://w.soundcloud.com/player/?url=${encodeURIComponent(url)}`);
    return true;
  } catch (e) {
    // Best-effort probe: any request failure means "not playable".
    return false;
  }
}

module.exports = { metascrape };