relatt-scraper
Version:
Metascarper
274 lines (235 loc) • 7.05 kB
JavaScript
const metascraper = require("metascraper")([
require("metascraper-image")(),
require("metascraper-logo-favicon")(),
require("metascraper-description")(),
require("metascraper-title")(),
require("metascraper-url")(),
]);
const he = require("he");
const { fetchLinkThroughRedirects } = require("../utils/redirect");
const { forceDecoding } = require("../utils/encode");
const { analyzeSubdomain } = require("./analyzeSubdomain");
const { processLinkImage } = require("./processLinkImage");
const { partitionLink } = require("./partitionLink");
const { getUrlHeaderData } = require("../utils/getUrlHeaderData");
const { isUrlGivenRssFeed } = require("../modules/checkFeedURL");
const { getInstagramPreview } = require("../utils/getInstagramPreview");
const { getTwitterPreview } = require("../utils/getTwitterPreview");
const got = require("got");
/**
 * Build preview metadata (logo, image, description, title) for a link.
 *
 * Flow, in order:
 *  1. Instagram and Twitter hostnames are delegated to their dedicated
 *     preview helpers and returned early.
 *  2. verifyHeaders classifies the link; when it reports a non-HTML result
 *     (file or RSS feed) that result object is returned as-is.
 *  3. Optionally the link is passed through analyzeSubdomain, then fetched
 *     with getUrlBody and parsed with metascraper.
 *  4. Title/description are stripped of tags and entity-decoded; the image is
 *     optionally validated through processLinkImage.
 *
 * @param {Object} options
 * @param {string} options.link - URL to scrape.
 * @param {boolean} [options.shouldCheckSubdomainAndProtocol=true] - When true,
 *   the URL returned by analyzeSubdomain is fetched instead of `link`.
 * @param {boolean} [options.analyzeImage=true] - When true and an image was
 *   found, it is handed to processLinkImage.
 * @param {boolean} [options.checkIfRssUrl=false] - Forwarded to verifyHeaders.
 * @param {*} options.operationType - Forwarded unchanged to the helpers.
 * @returns {Promise<Object>} On the main path: logo, image, description,
 *   title, processedUrl, availableImgs, logs, contentType, contentLength,
 *   isFile and isVideo. Early returns carry whatever the delegated helper or
 *   verifyHeaders produced.
 */
const metascrape = async ({
  link,
  shouldCheckSubdomainAndProtocol = true,
  analyzeImage = true,
  checkIfRssUrl = false,
  operationType,
}) => {
  // Replace anything shaped like an HTML tag with a space, then decode entities.
  const toPlainText = (text) =>
    text
      ? forceDecoding(he.decode(text.replace(/<(?:.|\n)*?>/gm, " ")))
      : "";
  // U+FFFD in the parsed text means the body was not valid UTF-8.
  const hasReplacementChar = (text) => Boolean(text && text.match(/\uFFFD/g));

  const linkParts = await partitionLink({
    link,
    shouldCheckSubdomainAndProtocol: false,
    operationType,
  });
  const { hostname } = linkParts;

  if (hostname === "www.instagram.com" || hostname === "instagram.com") {
    const { logo, image, description, title } = await getInstagramPreview(link);
    return {
      logo,
      image,
      description,
      title,
      processedUrl: link,
      availableImgs: [],
      logs: [],
      isFile: false,
    };
  }

  if (hostname === "www.twitter.com" || hostname === "twitter.com") {
    const twitterPreview = await getTwitterPreview(link);
    return twitterPreview;
  }

  const headersCheck = await verifyHeaders(link, checkIfRssUrl, operationType);
  const collectedLogs = [...headersCheck.result.logs];
  if (!headersCheck.isHtmlContentType) {
    return headersCheck.result;
  }

  let linkToProcess = link;
  if (shouldCheckSubdomainAndProtocol) {
    const { analyzedUrl, logs: subdomainLogs } = await analyzeSubdomain(
      link,
      operationType
    );
    collectedLogs.push(...subdomainLogs);
    linkToProcess = analyzedUrl;
  }

  // Fetch the page content.
  const { body, url, contentType, ...fetchDetails } = await getUrlBody(
    linkToProcess,
    linkParts,
    operationType
  );
  collectedLogs.push(...fetchDetails.redirectLogs);

  // Extract the metadata from the body decoded as UTF-8.
  const metadata = await metascraper({ html: body.toString("utf8"), url });
  const { logo } = metadata;
  let { image, description, title } = metadata;

  // Anything that is not served as text/html keeps only its logo.
  const isHtmlResponse = contentType && contentType.includes("text/html");
  if (!isHtmlResponse) {
    image = null;
    description = "";
    title = "";
  }

  // Retry the text fields with a latin1 decoding when UTF-8 produced U+FFFD.
  if (hasReplacementChar(description) || hasReplacementChar(title)) {
    const latin1Metadata = await metascraper({
      html: body.toString("latin1"),
      url,
    });
    description = latin1Metadata.description;
    title = latin1Metadata.title;
  }

  title = toPlainText(title);
  description = toPlainText(description);

  // When an image was found, let processLinkImage validate/replace it.
  let availableImgs = [];
  if (image && analyzeImage) {
    const imageResult = await processLinkImage(image, body, url, operationType);
    image = imageResult.image;
    collectedLogs.push(...imageResult.logs);
    availableImgs = imageResult.availableImgs;
  }

  const isVideo = await isVideoUrl(url);

  return {
    logo,
    image,
    description,
    title,
    processedUrl: url,
    availableImgs,
    logs: collectedLogs,
    contentType: headersCheck.headers.contentType,
    contentLength: headersCheck.headers.contentLength,
    isFile: false,
    isVideo,
  };
};
/**
 * Fetch a link's content through fetchLinkThroughRedirects.
 *
 * The link is first passed through processUrlOfVideoPlatform, and the
 * string form of what it returns is the URL that gets fetched.
 *
 * @param {string} link - URL to fetch.
 * @param {Object} linkParts - Forwarded unchanged as the fourth argument.
 * @param {*} operationType - Forwarded unchanged as the third argument.
 * @returns {Promise<*>} Whatever fetchLinkThroughRedirects resolves to.
 */
async function getUrlBody(link, linkParts, operationType) {
  const targetUrl = processUrlOfVideoPlatform(link);
  // The second argument always starts out as a fresh empty array.
  const fetchArguments = [`${targetUrl}`, [], operationType, linkParts];
  return fetchLinkThroughRedirects(...fetchArguments);
}
/**
 * Inspect the link's response headers and decide whether it should be
 * scraped as a page.
 *
 * Three outcomes are possible:
 *  - the content type is neither one of the accepted XML/octet-stream
 *    prefixes nor contains "text/html": the link is reported as a file;
 *  - `checkIfRssUrl` is set and isUrlGivenRssFeed yields an `rssUrl`: the
 *    link is reported as an RSS feed;
 *  - otherwise the link is reported as scrapable.
 *
 * @param {string} link - URL whose headers are fetched.
 * @param {boolean} checkIfRssUrl - Whether to run the RSS feed check.
 * @param {*} operationType - Forwarded unchanged to isUrlGivenRssFeed.
 * @returns {Promise<{isHtmlContentType: boolean, headers: Object, result: Object}>}
 *   `result.logs` is always a copy of the logs returned with the headers.
 */
async function verifyHeaders(link, checkIfRssUrl, operationType) {
  const headers = await getUrlHeaderData({
    link,
    fetchOptions: { https: { rejectUnauthorized: false } },
  });
  const logs = [...headers.logs];
  const { contentType } = headers;

  // Content-type prefixes that are still sent down the scraping path.
  const acceptedPrefixes = [
    /^application\/rss\+xml/,
    /^application\/xml/,
    /^application\/octet-stream/,
    /^text\/xml; charset=UTF-8/,
    /^text\/xml/,
    /^text\/xml; charset=utf-8/,
    /^text\/xml;charset=utf-8/,
  ];
  const isScrapable =
    acceptedPrefixes.some((pattern) => pattern.test(contentType)) ||
    (contentType && contentType.includes("text/html"));

  if (!isScrapable) {
    return {
      isHtmlContentType: false,
      headers,
      result: {
        processedUrl: link,
        logs,
        contentType,
        contentLength: headers.contentLength,
        isFile: true,
      },
    };
  }

  if (checkIfRssUrl) {
    const feedCheck = await isUrlGivenRssFeed(link, operationType);
    const rssUrl = feedCheck && feedCheck.rssUrl;
    if (rssUrl) {
      return {
        isHtmlContentType: false,
        headers,
        result: {
          isRssUrl: true,
          processedUrl: rssUrl,
          logs,
        },
      };
    }
  }

  return {
    isHtmlContentType: true,
    headers,
    result: { logs },
  };
}
/**
 * Rewrite YouTube video URLs to their short `https://youtu.be/<id>` form.
 *
 * Any URL that does not match the video-platform pattern, is not a YouTube
 * host, or contains "channel" or "youtube.com/@" is returned unchanged.
 *
 * The capture groups are read from the match result rather than from the
 * legacy `RegExp.$n` statics: those statics keep the values of the last
 * successful match, so a non-matching URL processed after a YouTube URL used
 * to be rewritten to the previous video's short link.
 *
 * @param {string} url - URL to normalise.
 * @returns {string} The short YouTube URL, or `url` itself.
 */
function processUrlOfVideoPlatform(url) {
  // NOTE(review): `%-?` inside the id character class is a range (% through
  // ?), so the captured id may contain characters such as `&`, `/`, `=` and
  // `?`. Left untouched so existing rewrites stay the same.
  const regEx =
    /(http:|https:|)\/\/(player.|www.)?(vimeo\.com|youtu(be\.com|\.be|be\.googleapis\.com)|dailymotion.com)\/(video\/|embed\/|shorts\/|watch\?v=|v\/)?([A-Za-z0-9._%-?]*)(\&\S+)?/;
  const match = url.match(regEx);
  if (!match) {
    return url;
  }

  // Group 3 is the platform host, group 6 the id that follows the path prefix.
  const platformHost = match[3];
  const videoId = match[6];
  if (!platformHost.includes("youtu")) {
    return url;
  }

  // Channel and handle pages have no video id to shorten.
  if (url.includes("channel") || url.includes("youtube.com/@")) {
    return url;
  }

  return `https://youtu.be/${videoId}`;
}
/**
 * Tell whether a URL points at playable media on a known platform.
 *
 * SoundCloud links are probed through isSoundCloudVideo. Other links count
 * as video when they match the Vimeo / YouTube / Dailymotion / Spotify
 * pattern and are not a channel or YouTube handle page.
 *
 * The previous condition contained
 * `url.includes("channel") || !url.includes("channel")`, which is always
 * true, so every non-SoundCloud URL was reported as "not a video".
 *
 * @param {string} url - URL to classify.
 * @returns {Promise<boolean>} true when the URL is considered a video.
 */
async function isVideoUrl(url) {
  // Guard first: the checks below call string methods on `url`.
  if (typeof url !== "string" || url === "") {
    return false;
  }
  if (await isSoundCloudVideo(url)) {
    return true;
  }

  const regEx =
    /(http:|https:|)\/\/(player.|www.|open.)?(vimeo\.com|youtu(be\.com|\.be|be\.googleapis\.com)|dailymotion.com|spotify.com)\/(video\/|shorts\/|embed\/|episode\/|artist\/|playlist\/|track\/|watch\?v=|v\/)?([A-Za-z0-9._%-]*)(\&\S+)?/;
  if (!regEx.test(url)) {
    return false;
  }

  // Channel and handle pages match the pattern but are not a single video;
  // the same exclusions are applied by processUrlOfVideoPlatform.
  return !(url.includes("channel") || url.includes("youtube.com/@"));
}

/**
 * Tell whether a SoundCloud URL can be loaded by the SoundCloud player.
 *
 * URLs that are not on soundcloud.com / snd.sc return false without any
 * network call. For the others, the player endpoint is requested and any
 * failure of that request is treated as "not playable".
 *
 * @param {string} url - URL to probe.
 * @returns {Promise<boolean>} true when the player request succeeded.
 */
async function isSoundCloudVideo(url) {
  const regEx = /^https?:\/\/(www.)?(soundcloud\.com|snd\.sc)\/(.*)$/;
  if (typeof url !== "string" || !regEx.test(url)) {
    return false;
  }
  try {
    // Encode the target so its own query string (`?si=...&utm_...`) stays
    // inside the `url` parameter instead of leaking into the player URL.
    await got(`https://w.soundcloud.com/player/?url=${encodeURIComponent(url)}`);
    return true;
  } catch (e) {
    // Deliberate best-effort probe: a failed request only means "no player".
    return false;
  }
}
module.exports = { metascrape };