UNPKG

relatt-scraper

Version:

Metascarper

299 lines (265 loc) 7.84 kB
const url = require("url"); const got = require("got"); const { analyzeSubdomain } = require("./analyzeSubdomain"); const { generateLinkRequestLogsLine, } = require("../utils/generateLinkRequestLogLine"); const jsdom = require("jsdom"); const { JSDOM } = jsdom; /** * check if URL is valid * */ let urlValidationSteps = [ "TRY_RAW_URL", "TRY_HOST_WITH_FEED_PATH", "TRY_HOST_WITH_FEED_PATH_WITH_SLASH", "TRY_SCRAPPING", ]; /** * this method check if the Json API is available * @param {String} url2check */ const checkWpJsonAPI = async (_url, operationType) => { let logs = []; try { const response = await got(`${_url}/wp-json/wp/v2/posts`, { responseType: "json", https: { rejectUnauthorized: false, }, }); logs.push(generateLinkRequestLogsLine(_url, operationType, 200)); if ( Number(response.headers["x-wp-total"]) > 0 && response.headers["content-type"].includes("application/json") ) return { isWpJsonAPI: true, logs }; } catch (e) { logs.push(generateLinkRequestLogsLine(_url, operationType, 404)); if (e.name == "HTTPError") return { isWpJsonAPI: false, logs }; } logs.push(generateLinkRequestLogsLine(_url, operationType, 404)); return { isWpJsonAPI: false, logs }; }; const checkRssUrl = async (url2Check, operationType) => { /** * Parse url in order to get different component of it. * */ let parsedUrl = url.parse(url2Check); let rssUrl = ""; let logs = []; /** * Default value will be tried first, before adjustments * */ let host = parsedUrl.hostname; let path = parsedUrl.pathname; let testedPath = []; for (let step of urlValidationSteps) { let alternativeUrl = ""; switch (step) { case "TRY_RAW_URL": path = `${path}${parsedUrl.query ? "?" + parsedUrl.query : ""}`; break; case "TRY_HOST_WITH_FEED_PATH": if (!/feed/.test(parsedUrl.pathname)) { path = parsedUrl.pathname + "/feed"; } break; case "TRY_SCRAPPING": let { body } = await got(`${url2Check}`, { followRedirect: true, https: { rejectUnauthorized: false, }, }); const dom = new JSDOM(body); dom.window.document.querySelectorAll("link").forEach((link) => { if (link.type == "application/rss+xml") alternativeUrl = link.href; }); } path = path.replace("//", "/"); if (testedPath.includes(path) && !alternativeUrl) { continue; } else if (alternativeUrl) { testedPath.push(path); } try { let { headers, body } = await got( alternativeUrl ? alternativeUrl : `${parsedUrl.protocol}//${host}${path}`, { followRedirect: true, https: { rejectUnauthorized: false, }, } ); let contentType = headers["content-type"] ? headers["content-type"] : ""; logs.push( generateLinkRequestLogsLine( `${parsedUrl.protocol}//${host}${path}`, operationType, 200 ) ); if ( /^application\/rss\+xml/.test(contentType) || /^application\/xml/.test(contentType) || /^application\/octet-stream/.test(contentType) || /^text\/xml; charset=UTF-8/.test(contentType) || /^text\/xml/.test(contentType) || /^text\/xml; charset=utf-8/.test(contentType) || /^text\/xml;charset=utf-8/.test(contentType) ) { //test if the xml contains rss node if (body.includes("</rss>") && body.includes("<rss")) { rssUrl = alternativeUrl ? alternativeUrl : `${parsedUrl.protocol}//${host}${path}`; break; } } } catch (error) { logs.push( generateLinkRequestLogsLine( `${parsedUrl.protocol}//${host}${path}`, operationType, error.message ) ); console.log( `CONFIGURATION_BOT_RSS [RSS CHECK]: Impossible d'atteindre l'url ${parsedUrl.protocol}//${host}${path}` ); } } return { rssUrl: rssUrl && rssUrl[rssUrl.length - 1] === "/" ? rssUrl.slice(0, -1) : rssUrl, logs, }; }; const isUrlGivenRssFeed = async (url2Check, operationType) => { let { analyzedUrl, logs } = await analyzeSubdomain(url2Check, operationType); /** * Parse url in order to get different component of it. * */ let parsedUrl = url.parse(analyzedUrl); let rssUrl = ""; /** * Default value will be tried first, before adjustments * */ let host = parsedUrl.hostname; let path = parsedUrl.pathname; try { let { headers, body } = await got(`${parsedUrl.protocol}//${host}${path}`, { followRedirect: true, https: { rejectUnauthorized: false, }, }); let contentType = headers["content-type"] ? headers["content-type"] : ""; logs.push( generateLinkRequestLogsLine( `${parsedUrl.protocol}//${host}${path}`, operationType, 404 ) ); if ( /^application\/rss\+xml/.test(contentType) || /^application\/xml/.test(contentType) || /^application\/octet-stream/.test(contentType) || /^text\/xml; charset=UTF-8/.test(contentType) || /^text\/xml/.test(contentType) || /^text\/xml; charset=utf-8/.test(contentType) || /^text\/xml;charset=utf-8/.test(contentType) ) { //test if the xml contains rss node if (body.includes("</rss>") && body.includes("<rss")) { rssUrl = `${parsedUrl.protocol}//${host}${path}`; } } } catch (error) { logs.push( generateLinkRequestLogsLine( `${parsedUrl.protocol}//${host}${path}`, operationType, error.message ) ); console.log( `CREATION_DE_LIEN [RSS CHECK]: Impossible d'atteindre l'url ${parsedUrl.protocol}//${host}${path}` ); } return { rssUrl: rssUrl && rssUrl[rssUrl.length - 1] === "/" ? rssUrl.slice(0, -1) : rssUrl, logs, }; }; const crawlPage = async ({ link, operationType = "" }) => { let { analyzedUrl, logs } = await analyzeSubdomain(link, operationType); return await getLinksOnPage(analyzedUrl) }; const getLinksOnPage = async (page) => { try { let links = []; let parsedUrl = url.parse(page); let host = parsedUrl.hostname; let result = await got(`${page}`, { followRedirect: true, responseType: "buffer", https: { rejectUnauthorized: false, }, }); var patt = /<a[^>]*href=["']([^"']*)["']/g; while ((match = patt.exec(result.body.toString("utf8")))) { if (match[1] && !match[1].startsWith("#")) { let link = match[1].startsWith("/") ? `${parsedUrl.protocol ? parsedUrl.protocol + "//" : ""}${host}${ match[1] }` : match[1]; if (link && host == url.parse(link).host && link !== page) links.push(link.split("#")[0]); } } links = [...new Set(links)]; let categorizedLinks = {}; let relevantLinks = {}; links.forEach((l) => { url .parse(l) .path.split("/") .forEach((p) => { if (p) { categorizedLinks[p] = categorizedLinks[p] && categorizedLinks[p].length ? [...categorizedLinks[p], l] : [l]; } }); }); Object.keys(categorizedLinks).forEach((x) => { if (categorizedLinks[x] && categorizedLinks[x].length >= 2) { relevantLinks[x] = categorizedLinks[x]; } }); return relevantLinks; } catch (error) { console.log(error); return {}; } }; module.exports = { checkRssUrl, checkWpJsonAPI, isUrlGivenRssFeed, crawlPage, };