UNPKG

relatt-scraper

Version:

Metascarper

159 lines (139 loc) 3.84 kB
const got = require("got");
const { generateLinkRequestLogsLine } = require("./generateLinkRequestLogLine");
const { getUrlHeaderData } = require("./getUrlHeaderData");

/**
 * Maximum number of `location` hops followed before giving up, so that a
 * cyclic redirect chain cannot keep the fetch loop running forever.
 */
const MAX_LOCATION_HOPS = 20;

/** Matches strings that begin with a URL scheme such as `https:`. */
const URL_SCHEME_PREFIX = /^[a-z][a-z\d+.-]*:/i;

/**
 * Resolve `target` against `base` using the WHATWG URL rules.
 *
 * @param {string} target - Absolute or relative URL reference.
 * @param {string} base - URL the reference is relative to.
 * @returns {string|null} The resolved absolute URL, or `null` when the pair
 *   cannot be parsed (for example when `base` has no scheme).
 */
const resolveUrlReference = (target, base) => {
  try {
    return new URL(target, base).href;
  } catch (err) {
    // Deliberate best effort: callers fall back to their own handling.
    return null;
  }
};

/**
 * Retrieve the target of an HTML meta redirect, if the page declares one.
 *
 * The first matched `name`/`content`/`scheme` attribute containing the
 * literal text `URL=` (case-sensitive, not at position 0) provides the
 * target; everything after `URL=` up to the closing quote is used.
 *
 * @param {string} html - Markup of the fetched page.
 * @param {string} url - URL the markup was fetched from; used as the base
 *   when the redirect target is relative.
 * @returns {string} The redirect URL, or `""` when no redirect was found.
 */
const redirect_link = (html, url) => {
  // Matches individual name= / content= / scheme= attributes. The value
  // character class also excludes `(`, `)` and `|`, so attribute values
  // containing those characters are not matched.
  const metaAttributePattern = /(<meta\s+)*((name\s*=\s*("|')(?<name>[^'("|')]*)("|')){1}|content\s*=\s*("|')(?<content>[^'("|')]*)("|')|scheme\s*=\s*("|')(?<scheme>[^'("|')]*)("|'))/gi;
  const attributes = html.match(metaAttributePattern) || [];

  let urlRedirect = "";
  for (const attribute of attributes) {
    const index = attribute.indexOf("URL=");
    if (index > 0) {
      // Skip the 4 characters of "URL=" and drop the closing quote that
      // always terminates a match.
      urlRedirect = attribute.slice(index + 4, -1);
      break;
    }
  }

  // Nothing found, or the target already looks absolute: return it untouched.
  if (
    urlRedirect === "" ||
    urlRedirect.includes("http") ||
    urlRedirect.includes("www.")
  ) {
    return urlRedirect;
  }

  // Relative target: resolve it against the page URL instead of appending it
  // to the full page URL, which produced wrong results such as
  // "http://a.com/login" + "/home" -> "http://a.com/login/home".
  const resolved = resolveUrlReference(urlRedirect, url);
  if (resolved !== null) {
    return resolved;
  }

  // The base could not be parsed as a URL: keep the previous behaviour of
  // concatenating and collapsing one duplicated slash at the junction.
  const joined = url.concat(urlRedirect);
  const lastDoubleSlash = joined.lastIndexOf("//");
  if ((joined.match(/\/\//g) || []).length > 1) {
    return joined.slice(0, lastDoubleSlash) + joined.slice(lastDoubleSlash + 1);
  }
  return joined;
};

/**
 * Fetch a link, following `location` values reported by `getUrlHeaderData`
 * and then at most one HTML meta redirect found in the final page.
 *
 * @param {string} link - URL to start from.
 * @param {Array} logs - Existing request-log lines. A non-empty array is
 *   appended to in place and returned; anything else starts a fresh array.
 * @param {*} operationType - Passed through unchanged to `getUrlHeaderData`
 *   and `generateLinkRequestLogsLine`.
 * @param {{hostname: string}} linkParts - Parsed parts of `link`; only
 *   `hostname` is read, to select request headers for twitter.com.
 * @returns {Promise<{url: string, body: Buffer, contentType: string, redirectLogs: Array}>}
 *   The final URL, the body of the last page fetched, the content type
 *   reported for the last header request, and the accumulated log lines.
 * @throws {Error} When more than `MAX_LOCATION_HOPS` redirects are chained.
 *   Errors raised by `got` or `getUrlHeaderData` propagate unchanged.
 */
const fetchLinkThroughRedirects = async (
  link,
  logs,
  operationType,
  linkParts
) => {
  // Guard against a missing `logs` / `linkParts` instead of throwing a
  // TypeError on property access.
  const linkRequestLogs = Array.isArray(logs) && logs.length ? logs : [];
  const hostname = linkParts ? linkParts.hostname : undefined;

  const fetchHeaders = ["www.twitter.com", "twitter.com"].includes(hostname)
    ? {
        "user-agent":
          "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
      }
    : {};

  // A fresh options object per request, as in the original inline literals.
  const buildRequestOptions = () => ({
    followRedirect: true,
    responseType: "buffer",
    https: {
      rejectUnauthorized: false,
    },
    headers: fetchHeaders,
  });

  let urlToFetch = link;
  let contentType = "";
  let hops = 0;

  for (;;) {
    const headers = await getUrlHeaderData({
      link: urlToFetch,
      operationType,
      fetchOptions: buildRequestOptions(),
    });

    // NOTE(review): the same line is logged twice per header request and the
    // status is always the literal 200. Kept as-is because it changes the
    // returned `redirectLogs`; confirm whether the duplicate is intended.
    linkRequestLogs.push(
      generateLinkRequestLogsLine(`${urlToFetch}`, operationType, 200)
    );
    contentType = headers.contentType ? headers.contentType : "";
    linkRequestLogs.push(
      generateLinkRequestLogsLine(`${urlToFetch}`, operationType, 200)
    );

    if (!headers.location) {
      break;
    }

    hops += 1;
    if (hops > MAX_LOCATION_HOPS) {
      throw new Error(
        `Too many redirects (more than ${MAX_LOCATION_HOPS}) while fetching ${link}`
      );
    }

    // Absolute locations are used verbatim; relative ones are resolved
    // against the URL that produced them.
    const nextLocation = headers.location;
    urlToFetch = URL_SCHEME_PREFIX.test(nextLocation)
      ? nextLocation
      : resolveUrlReference(nextLocation, urlToFetch) || nextLocation;
  }

  const result = await got(`${urlToFetch}`, buildRequestOptions());
  let finalBody = result.body;

  // Check whether the page declares a meta redirect and follow it once.
  const redirectUrl = redirect_link(result.body.toString("utf8"), urlToFetch);
  if (redirectUrl) {
    const redirected = await got(`${redirectUrl}`, buildRequestOptions());
    linkRequestLogs.push(
      generateLinkRequestLogsLine(`${redirectUrl}`, operationType, 200)
    );
    finalBody = redirected.body;
  }

  return {
    url: redirectUrl ? redirectUrl : urlToFetch,
    body: finalBody,
    contentType,
    redirectLogs: linkRequestLogs,
  };
};

module.exports = { fetchLinkThroughRedirects };