relatt-scraper
Version:
Metascarper
159 lines (139 loc) • 3.84 kB
JavaScript
const got = require("got");
const { generateLinkRequestLogsLine } = require("./generateLinkRequestLogLine");
const { getUrlHeaderData } = require("./getUrlHeaderData");
/**
 * Extract the target of a meta-refresh style redirect from a page's markup.
 *
 * The markup is scanned for `name=`, `content=` and `scheme=` attributes; the
 * first attribute whose value carries a `url=` directive (any letter case,
 * positioned at the start of the value or after whitespace, `;` or `,`)
 * supplies the redirect target. A `url=` that is merely a query parameter of
 * some other address (for example `...?url=foo`) is not treated as a redirect.
 *
 * @param {string} html - Markup of the page that was fetched.
 * @param {string} url - Address the markup came from; base for relative targets.
 * @returns {string} The redirect target, or "" when no redirect is declared.
 *   Targets starting with `http://`, `https://` or `www.` are returned as
 *   written; anything else is resolved against `url`.
 */
const redirect_link = (html, url) => {
  if (typeof html !== "string") {
    return "";
  }

  const attributePattern =
    /(<meta\s+)*((name\s*=\s*("|')(?<name>[^'("|')]*)("|')){1}|content\s*=\s*("|')(?<content>[^'("|')]*)("|')|scheme\s*=\s*("|')(?<scheme>[^'("|')]*)("|'))/gi;
  // Not global on purpose: exec() stays stateless across attributes.
  const refreshMarker = /(?:^|[\s;,"'])url\s*=\s*/i;

  let target = "";
  for (const attribute of html.match(attributePattern) || []) {
    const marker = refreshMarker.exec(attribute);
    if (marker) {
      // Every matched attribute ends with its closing quote; drop it.
      target = attribute
        .slice(marker.index + marker[0].length, -1)
        .trim();
      break;
    }
  }

  // Nothing found, or the target already names a host: hand it back untouched.
  if (target === "" || /^(?:https?:\/\/|www\.)/i.test(target)) {
    return target;
  }

  try {
    // Standards-based resolution handles "/path", "file.html", "../x", "//host/x".
    return new URL(target, url).href;
  } catch (error) {
    // `url` is not a parseable absolute URL: fall through to plain
    // concatenation so such callers still receive a best-effort string.
  }

  let joined = url.concat(target);
  const lastDoubleSlash = joined.lastIndexOf("//");
  if ((joined.match(/\/\//g) || []).length > 1) {
    // Collapse the "//" created where the base and the path meet.
    joined =
      joined.slice(0, lastDoubleSlash) + joined.slice(lastDoubleSlash + 1);
  }
  return joined;
};
/**
 * Upper bound on `location` hops followed by fetchLinkThroughRedirects, so a
 * redirect cycle ends in an error instead of looping forever.
 */
const MAX_LOCATION_HOPS = 20;

/**
 * Fetch a link, following `location` redirects reported by getUrlHeaderData
 * and, on the final page, at most one meta redirect found by redirect_link.
 *
 * @param {string} link - Address to start from.
 * @param {Array} logs - Existing request-log lines. A non-empty array is
 *   appended to in place; otherwise (empty or missing) a fresh array is used.
 * @param {*} operationType - Passed through to getUrlHeaderData and to
 *   generateLinkRequestLogsLine.
 * @param {{hostname: string}} linkParts - Parsed parts of `link`; only
 *   `hostname` is read, to pick request headers for twitter.com.
 * @returns {Promise<{url: string, body: Buffer, contentType: string, redirectLogs: Array}>}
 *   The last address fetched, its body, the content type reported by the last
 *   header lookup, and the accumulated log lines.
 * @throws {Error} When more than MAX_LOCATION_HOPS `location` redirects are
 *   chained. Errors raised by `got` or getUrlHeaderData propagate unchanged.
 */
const fetchLinkThroughRedirects = async (
  link,
  logs,
  operationType,
  linkParts
) => {
  const linkRequestLogs = logs && logs.length ? logs : [];
  const hostname = linkParts ? linkParts.hostname : undefined;
  const fetchHeaders = ["www.twitter.com", "twitter.com"].includes(hostname)
    ? {
        "user-agent":
          "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
      }
    : {};

  // A fresh options object per request, so no callee can leak changes into
  // the next call; the headers object is shared, as before.
  const buildRequestOptions = () => ({
    followRedirect: true,
    responseType: "buffer",
    https: {
      rejectUnauthorized: false,
    },
    headers: fetchHeaders,
  });

  let urlToFetch = link;
  let contentType = "";
  let hops = 0;

  // Follow `location` values until the header lookup stops reporting one.
  for (;;) {
    const headers = await getUrlHeaderData({
      link: urlToFetch,
      operationType,
      fetchOptions: buildRequestOptions(),
    });
    linkRequestLogs.push(
      generateLinkRequestLogsLine(`${urlToFetch}`, operationType, 200)
    );
    contentType = headers.contentType ? headers.contentType : "";
    // NOTE(review): this second, identical log line is kept because callers
    // may depend on the log shape — confirm whether it is intentional.
    linkRequestLogs.push(
      generateLinkRequestLogsLine(`${urlToFetch}`, operationType, 200)
    );

    if (!headers.location) {
      break;
    }

    hops += 1;
    if (hops > MAX_LOCATION_HOPS) {
      throw new Error(
        `Too many redirects (more than ${MAX_LOCATION_HOPS}) while fetching ${link}`
      );
    }
    urlToFetch = headers.location;
  }

  const result = await got(`${urlToFetch}`, buildRequestOptions());
  let finalBody = result.body;

  // Check whether the final page declares a meta redirection.
  const redirectUrl = redirect_link(result.body.toString("utf8"), urlToFetch);
  if (redirectUrl) {
    const redirected = await got(`${redirectUrl}`, buildRequestOptions());
    linkRequestLogs.push(
      generateLinkRequestLogsLine(`${redirectUrl}`, operationType, 200)
    );
    finalBody = redirected.body;
  }

  return {
    url: redirectUrl ? redirectUrl : urlToFetch,
    body: finalBody,
    contentType,
    redirectLogs: linkRequestLogs,
  };
};
// Public surface of this module: only the redirect-following fetcher is exported.
module.exports = { fetchLinkThroughRedirects };