relatt-scraper
Version:
Metascarper
299 lines (265 loc) • 7.84 kB
JavaScript
const url = require("url");
const got = require("got");
const { analyzeSubdomain } = require("./analyzeSubdomain");
const {
generateLinkRequestLogsLine,
} = require("../utils/generateLinkRequestLogLine");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
/**
 * Ordered list of strategy names that checkRssUrl walks through, one loop
 * iteration per entry, while looking for an RSS feed.
 *
 * NOTE(review): checkRssUrl's switch has cases for TRY_RAW_URL,
 * TRY_HOST_WITH_FEED_PATH and TRY_SCRAPPING only; there is no case for
 * TRY_HOST_WITH_FEED_PATH_WITH_SLASH, so that iteration reuses the path left
 * by the previous step.
 */
let urlValidationSteps = [
"TRY_RAW_URL",
"TRY_HOST_WITH_FEED_PATH",
"TRY_HOST_WITH_FEED_PATH_WITH_SLASH",
"TRY_SCRAPPING",
];
/**
 * Checks whether the WordPress REST API (`/wp-json/wp/v2/posts`) is available
 * under the given base URL and exposes at least one post.
 *
 * @param {string} _url - Base URL of the site; the REST path is appended as-is.
 * @param {*} operationType - Forwarded untouched to generateLinkRequestLogsLine.
 * @returns {Promise<{isWpJsonAPI: boolean, logs: Array}>} Whether the API was
 *   detected, plus the log lines produced while checking. Never rejects on a
 *   request failure.
 */
const checkWpJsonAPI = async (_url, operationType) => {
  const logs = [];
  try {
    const response = await got(`${_url}/wp-json/wp/v2/posts`, {
      responseType: "json",
      https: {
        rejectUnauthorized: false,
      },
    });
    logs.push(generateLinkRequestLogsLine(_url, operationType, 200));

    // The header may be absent; default to "" so the check cannot throw.
    const contentType = response.headers["content-type"] || "";
    const totalPosts = Number(response.headers["x-wp-total"]);
    if (totalPosts > 0 && contentType.includes("application/json")) {
      return { isWpJsonAPI: true, logs };
    }
  } catch (e) {
    // Any failure (HTTP error, network error, invalid JSON...) means the API
    // is not usable. Log it exactly once and stop here.
    logs.push(generateLinkRequestLogsLine(_url, operationType, 404));
    return { isWpJsonAPI: false, logs };
  }

  // The request succeeded but the response is not a populated WP posts list:
  // a 404 line is appended after the 200 one to record "API not found".
  logs.push(generateLinkRequestLogsLine(_url, operationType, 404));
  return { isWpJsonAPI: false, logs };
};
/**
 * Tries to locate an RSS feed for the given URL.
 *
 * Walks `urlValidationSteps` in order. Each step selects a candidate URL
 * (the raw URL, the URL with `/feed` appended, or a feed advertised by a
 * `<link type="application/rss+xml">` tag of the page); the first candidate
 * answering with an XML-like content type and an `<rss>` document wins.
 * A candidate that was already requested is never requested again.
 *
 * @param {string} url2Check - Absolute URL of the site or feed to probe.
 * @param {*} operationType - Forwarded untouched to generateLinkRequestLogsLine.
 * @returns {Promise<{rssUrl: string, logs: Array}>} The feed URL without a
 *   trailing slash ("" when none was found) and one log line per request
 *   outcome. Request failures are logged, not thrown.
 */
const checkRssUrl = async (url2Check, operationType) => {
  const parsedUrl = url.parse(url2Check);
  // `host` (unlike `hostname`) keeps an explicit port such as ":8080".
  const origin = `${parsedUrl.protocol}//${parsedUrl.host}`;
  // Fresh options object per request so nothing is shared between calls.
  const requestOptions = () => ({
    followRedirect: true,
    https: {
      rejectUnauthorized: false,
    },
  });
  // Content types accepted for a feed; charset suffixes are covered by the
  // prefix match.
  const feedContentType =
    /^(?:application\/(?:rss\+xml|xml|octet-stream)|text\/xml)/;

  const logs = [];
  const testedUrls = new Set();
  let rssUrl = "";
  let path = parsedUrl.pathname;

  for (const step of urlValidationSteps) {
    let alternativeUrl = "";
    switch (step) {
      case "TRY_RAW_URL":
        path = `${path}${parsedUrl.query ? "?" + parsedUrl.query : ""}`;
        break;
      case "TRY_HOST_WITH_FEED_PATH":
        if (!/feed/.test(parsedUrl.pathname)) {
          path = parsedUrl.pathname + "/feed";
        }
        break;
      case "TRY_SCRAPPING": {
        try {
          const { body: pageHtml } = await got(`${url2Check}`, requestOptions());
          const dom = new JSDOM(pageHtml);
          // When several feeds are advertised, the last one is kept.
          dom.window.document.querySelectorAll("link").forEach((link) => {
            if (link.type === "application/rss+xml") alternativeUrl = link.href;
          });
        } catch (error) {
          // An unreachable or unparsable page must not abort the whole check
          // and lose the logs collected so far.
          logs.push(
            generateLinkRequestLogsLine(url2Check, operationType, error.message)
          );
        }
        break;
      }
      default:
        // Steps without dedicated handling keep the previous path and are
        // skipped by the duplicate check below.
        break;
    }

    path = path.replace("//", "/");
    const targetUrl = alternativeUrl || `${origin}${path}`;
    if (testedUrls.has(targetUrl)) {
      continue;
    }
    testedUrls.add(targetUrl);

    try {
      const { headers, body } = await got(targetUrl, requestOptions());
      const contentType = headers["content-type"] || "";
      logs.push(generateLinkRequestLogsLine(targetUrl, operationType, 200));
      // The document itself must contain an rss node, not just an XML type.
      if (
        feedContentType.test(contentType) &&
        body.includes("</rss>") &&
        body.includes("<rss")
      ) {
        rssUrl = targetUrl;
        break;
      }
    } catch (error) {
      logs.push(
        generateLinkRequestLogsLine(targetUrl, operationType, error.message)
      );
      console.log(
        `CONFIGURATION_BOT_RSS [RSS CHECK]: Impossible d'atteindre l'url ${targetUrl}`
      );
    }
  }

  return {
    rssUrl: rssUrl.endsWith("/") ? rssUrl.slice(0, -1) : rssUrl,
    logs,
  };
};
/**
 * Tells whether the given URL itself points to an RSS feed.
 *
 * The URL is first passed through analyzeSubdomain; the URL it returns is
 * requested once and accepted when the response has an XML-like content type
 * and the body contains an `<rss>` document.
 *
 * @param {string} url2Check - URL supplied by the caller.
 * @param {*} operationType - Forwarded untouched to analyzeSubdomain and
 *   generateLinkRequestLogsLine.
 * @returns {Promise<{rssUrl: string, logs: Array}>} The feed URL without a
 *   trailing slash ("" when the URL is not a feed) and the log lines from
 *   analyzeSubdomain followed by the one produced here. Request failures are
 *   logged, not thrown.
 */
const isUrlGivenRssFeed = async (url2Check, operationType) => {
  const { analyzedUrl, logs } = await analyzeSubdomain(url2Check, operationType);
  const parsedUrl = url.parse(analyzedUrl);
  // `host` keeps an explicit port and `path` keeps the query string, so feeds
  // such as "http://host:8080/?feed=rss2" are requested as given.
  const targetUrl = `${parsedUrl.protocol}//${parsedUrl.host}${parsedUrl.path}`;
  // Content types accepted for a feed; charset suffixes are covered by the
  // prefix match.
  const feedContentType =
    /^(?:application\/(?:rss\+xml|xml|octet-stream)|text\/xml)/;
  let rssUrl = "";

  try {
    const { headers, body } = await got(targetUrl, {
      followRedirect: true,
      https: {
        rejectUnauthorized: false,
      },
    });
    const contentType = headers["content-type"] || "";
    // Reaching this point means the request succeeded.
    logs.push(generateLinkRequestLogsLine(targetUrl, operationType, 200));
    // The document itself must contain an rss node, not just an XML type.
    if (
      feedContentType.test(contentType) &&
      body.includes("</rss>") &&
      body.includes("<rss")
    ) {
      rssUrl = targetUrl;
    }
  } catch (error) {
    logs.push(
      generateLinkRequestLogsLine(targetUrl, operationType, error.message)
    );
    console.log(
      `CREATION_DE_LIEN [RSS CHECK]: Impossible d'atteindre l'url ${targetUrl}`
    );
  }

  return {
    rssUrl: rssUrl.endsWith("/") ? rssUrl.slice(0, -1) : rssUrl,
    logs,
  };
};
/**
 * Resolves `link` through analyzeSubdomain, then collects the links found on
 * the resulting page via getLinksOnPage.
 *
 * @param {{link: string, operationType?: *}} options - `operationType`
 *   defaults to "" and is forwarded to analyzeSubdomain.
 * @returns {Promise<Object>} Whatever getLinksOnPage resolves with.
 */
const crawlPage = async ({ link, operationType = "" }) => {
  const analysis = await analyzeSubdomain(link, operationType);
  const pageLinks = await getLinksOnPage(analysis.analyzedUrl);
  return pageLinks;
};
/**
 * Downloads a page and groups its same-host links by URL path segment.
 *
 * Every `<a href>` of the page is collected (fragment-only links, links to
 * other hosts and links back to the page itself are ignored, fragments are
 * stripped, duplicates removed). Each remaining link is filed under every
 * distinct non-empty segment of its path; only segments shared by at least
 * two links are returned.
 *
 * @param {string} page - Absolute URL of the page to crawl.
 * @returns {Promise<Object<string, string[]>>} Map of path segment to the
 *   links containing it; `{}` when the page cannot be fetched or parsed
 *   (the error is logged to the console).
 */
const getLinksOnPage = async (page) => {
  try {
    const parsedUrl = url.parse(page);
    // `host` keeps an explicit port, matching what url.parse(link).host
    // yields for the links compared against it below.
    const host = parsedUrl.host;
    const scheme = parsedUrl.protocol ? parsedUrl.protocol : "";
    const origin = `${scheme ? scheme + "//" : ""}${host}`;

    const result = await got(`${page}`, {
      followRedirect: true,
      responseType: "buffer",
      https: {
        rejectUnauthorized: false,
      },
    });
    // Decode once instead of on every regex iteration.
    const html = result.body.toString("utf8");

    const anchorPattern = /<a[^>]*href=["']([^"']*)["']/g;
    const links = new Set();
    let match;
    while ((match = anchorPattern.exec(html)) !== null) {
      const href = match[1];
      if (!href || href.startsWith("#")) {
        continue;
      }
      let absolute = href;
      if (href.startsWith("//")) {
        // Protocol-relative link: it names its own host.
        absolute = `${scheme}${href}`;
      } else if (href.startsWith("/")) {
        absolute = `${origin}${href}`;
      }
      // Strip the fragment before comparing so "page#section" counts as the
      // page itself.
      const link = absolute.split("#")[0];
      if (link && host === url.parse(link).host && link !== page) {
        links.add(link);
      }
    }

    // A Map is used for grouping because segments come from untrusted URLs
    // and may collide with Object.prototype members ("constructor", ...).
    const linksBySegment = new Map();
    for (const link of links) {
      const segments = new Set(
        (url.parse(link).path || "").split("/").filter(Boolean)
      );
      for (const segment of segments) {
        const group = linksBySegment.get(segment);
        if (group) {
          group.push(link);
        } else {
          linksBySegment.set(segment, [link]);
        }
      }
    }

    const relevantLinks = {};
    for (const [segment, group] of linksBySegment) {
      if (group.length >= 2) {
        // defineProperty creates an own property even for "__proto__".
        Object.defineProperty(relevantLinks, segment, {
          value: group,
          enumerable: true,
          writable: true,
          configurable: true,
        });
      }
    }
    return relevantLinks;
  } catch (error) {
    console.log(error);
    return {};
  }
};
// Public API of this module. getLinksOnPage is not exported; it is only
// reached through crawlPage.
module.exports = {
checkRssUrl,
checkWpJsonAPI,
isUrlGivenRssFeed,
crawlPage,
};