UNPKG

relatt-scraper

Version:

Metascarper

200 lines (175 loc) 5.02 kB
const got = require("got");
const urlParse = require("url");
const requestImageSize = require("request-image-size"); // For external img
const sizeOf = require("image-size");
const {
  generateLinkRequestLogsLine,
} = require("../utils/generateLinkRequestLogLine");

/**
 * Smallest width and height (in pixels) an image must have to be kept.
 */
const MIN_IMAGE_DIMENSION = 100;

/**
 * Heuristic used after a failed request: matches when the link already has a
 * dot-separated prefix before its last host label, in which case no "www."
 * retry is attempted.
 */
const HAS_HOST_PREFIX_REGEX = /(?:http[s]*\:\/\/)*(.*?)\.(?=[^\/]*\..{2,5})/i;

/** Matches a whole <img ...> tag, including tags that span several lines. */
const IMG_TAG_REGEX = /<img\s[^>]*>/gi;

/**
 * Attribute matchers. The leading whitespace keeps `src` from matching the
 * tail of `data-src`. Groups: 1 = double-quoted, 2 = single-quoted,
 * 3 = unquoted value.
 */
const SRC_ATTR_REGEX = /\ssrc\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i;
const DATA_SRC_ATTR_REGEX =
  /\sdata-src\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i;

/** File extensions accepted as candidate images in the page body. */
const IMAGE_EXTENSION_REGEX = /\.(?:jpg|gif|png|jpeg)/i;

/**
 * Build the options passed to every `got` request.
 *
 * NOTE(review): `rejectUnauthorized: false` turns off TLS certificate
 * verification. It is kept because the original code relied on it; confirm
 * this is an accepted trade-off before reusing these options elsewhere.
 *
 * @returns {object} a fresh options object for each request
 */
const buildRequestOptions = () => ({
  https: {
    rejectUnauthorized: false,
  },
});

/**
 * Read the `content-type` response header, defaulting to an empty string so
 * that a missing header is treated as "not an image" instead of throwing.
 *
 * @param {object} headers response headers returned by `got`
 * @returns {string}
 */
const contentTypeOf = (headers) => headers["content-type"] || "";

/**
 * Tell whether a base64 `data:image` URI decodes to an image that is not
 * smaller than MIN_IMAGE_DIMENSION in either direction.
 *
 * @param {string} dataUri
 * @returns {boolean} false when the payload is too small or cannot be decoded
 */
const isBase64ImageLargeEnough = (dataUri) => {
  try {
    const buffer = Buffer.from(dataUri.split(",")[1], "base64");
    const { width, height } = sizeOf(buffer);
    return !(width < MIN_IMAGE_DIMENSION || height < MIN_IMAGE_DIMENSION);
  } catch (error) {
    return false;
  }
};

/**
 * Insert "www." right after the scheme, or at the very start when the link
 * has no http(s) scheme.
 *
 * @param {string} link
 * @returns {string}
 */
const addWwwPrefix = (link) => {
  if (link.startsWith("https://")) {
    return link.replace(/^https:\/\//, "https://www.");
  }
  if (link.startsWith("http://")) {
    return link.replace(/^http:\/\//, "http://www.");
  }
  return `www.${link}`;
};

/**
 * Validate the image attached to a link and, when it is unusable, look for a
 * replacement among the images found in the page body.
 *
 * The page body is searched when:
 *  - no usable (non-empty string) image link is given;
 *  - a base64 image is too small or cannot be decoded;
 *  - the image host is excluded (see isImageFromExcludedHostname);
 *  - the response is not an image (including a missing content-type header);
 *  - the request fails and the "www." retry is skipped, fails, or does not
 *    return an image.
 *
 * @param {string} imageLink image URL or base64 data URI to validate
 * @param {string|Buffer} body page body scanned for replacement images
 * @param {string} postUrl page URL used to resolve relative image paths
 * @param {string} operationType prefix of the `<operationType>_IMAGE` log tag
 * @returns {Promise<{image: string, logs: Array, availableImgs: string[]}>}
 *   `image` is the retained image ("" when none was found), `logs` holds one
 *   entry per HTTP request attempted here, `availableImgs` lists the
 *   replacement candidates (largest first) when a search was performed.
 */
const processLinkImage = async (imageLink, body, postUrl, operationType) => {
  const logs = [];
  const logType = `${operationType}_IMAGE`;
  let image = imageLink;
  let searchAvailableImg = false;
  let availableImgs = [];

  if (typeof image !== "string" || image === "") {
    // Nothing to validate: go straight to the page body.
    searchAvailableImg = true;
  } else if (image.startsWith("data:image")) {
    searchAvailableImg = !isBase64ImageLargeEnough(image);
  } else if (isImageFromExcludedHostname(image)) {
    searchAvailableImg = true;
  } else {
    try {
      const { headers } = await got(image, buildRequestOptions());
      logs.push(generateLinkRequestLogsLine(image, logType, 200));
      if (!contentTypeOf(headers).includes("image")) {
        searchAvailableImg = true;
      }
    } catch (error) {
      logs.push(generateLinkRequestLogsLine(image, logType, error.message));

      if (HAS_HOST_PREFIX_REGEX.test(image)) {
        searchAvailableImg = true;
      } else {
        // Bare host: retry once with a "www." prefix before giving up.
        const redirectImage = addWwwPrefix(image);
        try {
          const { headers } = await got(redirectImage, buildRequestOptions());
          logs.push(generateLinkRequestLogsLine(redirectImage, logType, 200));
          if (contentTypeOf(headers).includes("image/")) {
            image = redirectImage;
          } else {
            searchAvailableImg = true;
          }
        } catch (retryError) {
          logs.push(
            generateLinkRequestLogsLine(
              redirectImage,
              logType,
              retryError.message
            )
          );
          // Neither URL works: do not hand back a broken link.
          searchAvailableImg = true;
        }
      }
    }
  }

  if (searchAvailableImg) {
    availableImgs = await ogImageAlternative(body, postUrl);
    image = availableImgs.length ? availableImgs[0] : "";
  }

  return {
    image,
    logs,
    availableImgs,
  };
};

/**
 * Return the value of an attribute matched by one of the *_ATTR_REGEX
 * patterns inside a single tag.
 *
 * @param {string} tag raw `<img ...>` tag
 * @param {RegExp} attributeRegex SRC_ATTR_REGEX or DATA_SRC_ATTR_REGEX
 * @returns {string} the attribute value without quotes, "" when absent/empty
 */
function readAttribute(tag, attributeRegex) {
  const match = attributeRegex.exec(tag);
  if (!match) {
    return "";
  }
  return match[1] || match[2] || match[3] || "";
}

/**
 * Get image paths from the page body.
 *
 * Collects the `src` and `data-src` values of every `<img>` tag. All `src`
 * values come first, followed by all `data-src` values, each group in
 * document order.
 *
 * @param {string} htmlBody page markup
 * @returns {string[]} raw attribute values (may be relative paths)
 */
function getImages(htmlBody) {
  const tags = String(htmlBody).match(IMG_TAG_REGEX) || [];
  const sources = [];
  const lazySources = [];

  for (const tag of tags) {
    const src = readAttribute(tag, SRC_ATTR_REGEX);
    if (src) {
      sources.push(src);
    }
    const dataSrc = readAttribute(tag, DATA_SRC_ATTR_REGEX);
    if (dataSrc) {
      lazySources.push(dataSrc);
    }
  }

  return sources.concat(lazySources);
}

/**
 * Resolve an image path found in the page against the page URL.
 *
 * Handles absolute, protocol-relative, root-relative and relative paths.
 * The query string and fragment are dropped, as the original implementation
 * did; the port is preserved.
 *
 * @param {string} src raw `src` / `data-src` value
 * @param {string} pageUrl URL of the page the value was found in
 * @returns {string|null} absolute http(s) URL, or null when it cannot be built
 */
function resolveImageUrl(src, pageUrl) {
  let resolved;
  try {
    resolved = new urlParse.URL(src, pageUrl);
  } catch (error) {
    return null;
  }
  if (resolved.protocol !== "http:" && resolved.protocol !== "https:") {
    return null;
  }
  return `${resolved.protocol}//${resolved.host}${resolved.pathname}`;
}

/**
 * Search alternative images on the page.
 *
 * Keeps the jpg/gif/png/jpeg images of the body whose width and height are
 * both at least MIN_IMAGE_DIMENSION, probing each distinct URL once and one
 * at a time.
 *
 * @param {string|Buffer} body page body
 * @param {string} processedUrl page URL used to resolve relative paths
 * @returns {Promise<string[]>} image URLs sorted by pixel area, largest
 *   first; an empty array when the body cannot be processed
 */
const ogImageAlternative = async (body, processedUrl) => {
  const availableImgs = [];
  try {
    const imgs = getImages(body.toString());
    const probedUrls = new Set();

    for (const img of imgs) {
      if (!IMAGE_EXTENSION_REGEX.test(img)) {
        continue;
      }

      const pic = resolveImageUrl(img, processedUrl);
      if (!pic || probedUrls.has(pic)) {
        continue;
      }
      probedUrls.add(pic);

      try {
        /**
         * fetch image size from image url
         */
        const size = await requestImageSize(pic);
        if (
          size.width >= MIN_IMAGE_DIMENSION &&
          size.height >= MIN_IMAGE_DIMENSION
        ) {
          availableImgs.push({ path: pic, size: size.width * size.height });
        }
      } catch (err) {
        // Best effort: an unreachable or unreadable image is simply not a
        // candidate; keep probing the remaining ones.
      }
    }

    /**
     * Sort images by size, largest first
     */
    return availableImgs
      .sort((a, b) => b.size - a.size)
      .map((candidate) => candidate.path);
  } catch (error) {
    return [];
  }
};

/**
 * Tell whether the image is served from a host whose images are never kept
 * (currently secure.gravatar.com).
 *
 * @param {string} imageUrl
 * @returns {boolean}
 */
const isImageFromExcludedHostname = (imageUrl) => {
  return /secure\.gravatar\.com/.test(imageUrl);
};

module.exports = { processLinkImage };