relatt-scraper
Version:
Metascarper
200 lines (175 loc) • 5.02 kB
JavaScript
const got = require("got");
const urlParse = require("url");
const requestImageSize = require("request-image-size"); // For external img
const sizeOf = require("image-size");
const {
generateLinkRequestLogsLine,
} = require("../utils/generateLinkRequestLogLine");
/**
 * Request `url` and return the content type its response declares.
 *
 * Exactly one line is appended to `logs` per call: status 200 on success, or
 * the error message when the request fails.
 *
 * @param {string} url - Address to request.
 * @param {string} operationType - Prefix of the `<operationType>_IMAGE` log tag.
 * @param {Array} logs - Log collection, mutated in place.
 * @returns {Promise<string|null>} The `content-type` header ("" when the
 *   response has none), or null when the request failed.
 */
const fetchImageContentType = async (url, operationType, logs) => {
  const logTag = `${operationType}_IMAGE`;
  try {
    // NOTE(review): rejectUnauthorized:false turns off TLS certificate
    // verification. Kept because the original code does it — confirm that
    // this is an accepted trade-off for the scraper.
    const { headers } = await got(url, {
      https: {
        rejectUnauthorized: false,
      },
    });
    logs.push(generateLinkRequestLogsLine(url, logTag, 200));
    // A response without the header must read as "not an image", not throw.
    return String(headers["content-type"] || "");
  } catch (error) {
    logs.push(generateLinkRequestLogsLine(url, logTag, error.message));
    return null;
  }
};

/**
 * Build the "www." variant of a link, keeping its scheme in front.
 *
 * @param {string} url - Link whose request failed.
 * @returns {string} The link with "www." inserted after the scheme.
 */
const addWwwPrefix = (url) => {
  if (url.startsWith("https://")) {
    return url.replace(/^https:\/\//, "https://www.");
  }
  if (url.startsWith("http://")) {
    return url.replace(/^http:\/\//, "http://www.");
  }
  // NOTE(review): a link without a scheme yields a scheme-less retry URL,
  // which the HTTP client presumably rejects — confirm whether a default
  // scheme should be added here.
  return `www.${url}`;
};

/**
 * Validate the image attached to a link and fall back to an image found in
 * the page body when it is unusable.
 *
 * The image is considered unusable, and ogImageAlternative(body, postUrl) is
 * consulted, when:
 *  - no image link was supplied;
 *  - it is a base64 data URI that cannot be decoded/measured or is smaller
 *    than 100px in either dimension;
 *  - it matches isImageFromExcludedHostname;
 *  - the request succeeds but the declared content type is not an image;
 *  - the request fails and neither the link nor its "www." variant serves an
 *    image.
 *
 * @param {string} imageLink - Image URL or `data:image` URI to validate.
 * @param {*} body - Page body handed to ogImageAlternative.
 * @param {string} postUrl - Page URL handed to ogImageAlternative.
 * @param {string} operationType - Prefix of the `<operationType>_IMAGE` log tag.
 * @returns {Promise<{image: string, logs: Array, availableImgs: Array}>}
 *   `image` is the retained image ("" when the fallback found nothing),
 *   `logs` holds one entry per HTTP request made here, and `availableImgs`
 *   is the fallback list (empty when no fallback search ran).
 */
const processLinkImage = async (imageLink, body, postUrl, operationType) => {
  const MIN_DIMENSION = 100;
  // Matches when a "." is followed, before the next "/", by another "." and
  // 2-5 more characters (e.g. a host that already has a sub-domain). Such
  // links are not retried with a "www." prefix.
  const checkProtocol = /(?:http[s]*\:\/\/)*(.*?)\.(?=[^\/]*\..{2,5})/i;

  const logs = [];
  let image = imageLink;
  let searchAvailableImg = false;

  if (typeof image !== "string" || image === "") {
    // Nothing to validate: go straight to the images found in the page.
    searchAvailableImg = true;
  } else if (image.startsWith("data:image")) {
    try {
      const decoded = Buffer.from(image.split(",")[1], "base64");
      const dimensions = sizeOf(decoded);
      if (
        dimensions.width < MIN_DIMENSION ||
        dimensions.height < MIN_DIMENSION
      ) {
        searchAvailableImg = true;
      }
    } catch (error) {
      // Undecodable or unmeasurable payload: treat it as unusable.
      searchAvailableImg = true;
    }
  } else if (isImageFromExcludedHostname(image)) {
    searchAvailableImg = true;
  } else {
    const contentType = await fetchImageContentType(
      image,
      operationType,
      logs
    );
    if (contentType !== null) {
      searchAvailableImg = !contentType.includes("image");
    } else if (checkProtocol.test(image)) {
      searchAvailableImg = true;
    } else {
      const redirectImage = addWwwPrefix(image);
      const retryContentType = await fetchImageContentType(
        redirectImage,
        operationType,
        logs
      );
      if (retryContentType !== null && retryContentType.includes("image/")) {
        image = redirectImage;
      } else {
        // Both attempts failed: do not hand back a link known to be broken.
        searchAvailableImg = true;
      }
    }
  }

  let availableImgs = [];
  if (searchAvailableImg) {
    availableImgs = await ogImageAlternative(body, postUrl);
    image = availableImgs.length ? availableImgs[0] : "";
  }

  return {
    image,
    logs,
    availableImgs,
  };
};
/**
 * Collect the image locations referenced by the <img> tags of an HTML page.
 *
 * Each tag is inspected on its own, so an attribute of a neighbouring tag is
 * never picked up. Values may be double-quoted, single-quoted or unquoted and
 * are returned without their quotes. Empty values are skipped.
 *
 * @param {string} htmlBody - HTML markup to scan (coerced to a string;
 *   null/undefined are treated as empty).
 * @returns {string[]} Every `src` value in document order, followed by every
 *   `data-src` value in document order.
 */
function getImages(htmlBody) {
  const html = htmlBody == null ? "" : String(htmlBody);
  const imgTags = html.match(/<img\b[^>]*>/gi) || [];

  // The leading \s keeps `src` from matching the tail of `data-src`.
  const srcRex = /\ssrc\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i;
  const dataSrcRex = /\sdata-src\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i;

  const readAttribute = (tag, attributeRex) => {
    const match = attributeRex.exec(tag);
    if (!match) {
      return "";
    }
    // Exactly one of the three alternatives (", ', unquoted) captured a value.
    const value = [match[1], match[2], match[3]].find(
      (candidate) => candidate !== undefined
    );
    return value.trim();
  };

  const collect = (attributeRex) =>
    imgTags
      .map((tag) => readAttribute(tag, attributeRex))
      .filter((value) => value !== "");

  return [...collect(srcRex), ...collect(dataSrcRex)];
}
/**
 * Search the page body for an alternative image.
 *
 * Every <img> source returned by getImages whose text contains a
 * jpg/jpeg/gif/png extension (any letter case) is resolved against the page
 * URL, measured with requestImageSize, and kept when both dimensions are at
 * least 100px — the same threshold processLinkImage applies to base64 images.
 *
 * Sources are resolved with the WHATWG URL parser, so relative and
 * protocol-relative paths work and the port and query string are preserved.
 * Each distinct URL is measured once. Measuring is best-effort: a source that
 * cannot be resolved or fetched is skipped.
 *
 * @param {*} body - Page body; its toString() output is scanned.
 * @param {string} processedUrl - URL of the page, used as the base for
 *   resolving image sources.
 * @returns {Promise<string[]>} Absolute image URLs ordered from largest to
 *   smallest pixel area; [] when nothing qualifies or the body is unusable.
 */
const ogImageAlternative = async (body, processedUrl) => {
  const MIN_DIMENSION = 100;
  const imageExtension = /\.(?:jpg|gif|png|jpeg)/i;

  try {
    // A Set removes duplicates (e.g. the same file listed in src and
    // data-src) while keeping first-seen order.
    const candidates = new Set();
    for (const source of getImages(body.toString())) {
      if (!imageExtension.test(source)) {
        continue;
      }
      try {
        const resolved = new URL(source, processedUrl);
        if (resolved.protocol !== "http:" && resolved.protocol !== "https:") {
          continue;
        }
        // The fragment plays no part in fetching the file.
        resolved.hash = "";
        candidates.add(resolved.href);
      } catch (err) {
        // Source cannot be turned into an absolute URL: skip it.
      }
    }

    const availableImgs = [];
    // Measured one at a time, as before, to keep the load on the remote host
    // unchanged.
    for (const pic of candidates) {
      try {
        const size = await requestImageSize(pic);
        if (size.width >= MIN_DIMENSION && size.height >= MIN_DIMENSION) {
          availableImgs.push({ path: pic, size: size.width * size.height });
        }
      } catch (err) {
        // Deliberately best-effort: an unreachable or unreadable image must
        // not abort the search for the remaining candidates.
      }
    }

    // Largest area first.
    availableImgs.sort((a, b) => b.size - a.size);
    return availableImgs.map(({ path }) => path);
  } catch (error) {
    return [];
  }
};
/**
 * Tell whether an image URL refers to an excluded host.
 *
 * The check is a plain text search for "secure.gravatar.com" anywhere in the
 * given value, not a comparison against the parsed hostname.
 *
 * @param {string} imageUrl - Image URL to inspect.
 * @returns {boolean} true when the excluded host name occurs in the URL.
 */
const isImageFromExcludedHostname = (imageUrl) => {
  const excludedHost = /secure\.gravatar\.com/;
  const found = excludedHost.exec(imageUrl);
  return found !== null;
};
// Public API: processLinkImage is the only function exported here.
module.exports = { processLinkImage };