UNPKG

open-graph-scraper

Version:

Node.js scraper module for Open Graph and Twitter Card info

github.com/jshemas/openGraphScraper

jshemas/openGraphScraper

187 lines (186 loc) • 6.02 kB

JavaScript

"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.defaultUrlValidatorSettings = void 0; exports.isUrlValid = isUrlValid; exports.validateAndFormatURL = validateAndFormatURL; exports.findImageTypeFromUrl = findImageTypeFromUrl; exports.isImageTypeValid = isImageTypeValid; exports.isThisANonHTMLUrl = isThisANonHTMLUrl; exports.removeNestedUndefinedValues = removeNestedUndefinedValues; exports.optionSetup = optionSetup; exports.isCustomMetaTagsValid = isCustomMetaTagsValid; exports.unescapeScriptText = unescapeScriptText; const isUrl_1 = __importDefault(require("./isUrl")); exports.defaultUrlValidatorSettings = { allow_fragments: true, allow_protocol_relative_urls: false, allow_query_components: true, allow_trailing_dot: false, allow_underscores: false, protocols: ['http', 'https'], require_host: true, require_port: false, require_protocol: false, require_tld: true, require_valid_protocol: true, validate_length: true, }; /** * Checks if URL is valid * * @param {string} url - url to be checked * @param {string} urlValidatorSettings - settings used by validator * @return {boolean} boolean value if the url is valid * */ function isUrlValid(url, urlValidatorSettings) { return typeof url === 'string' && url.length > 0 && (0, isUrl_1.default)(url, urlValidatorSettings); } /** * Forces url to start with http:// if it doesn't * * @param {string} url - url to be updated * @return {string} url that starts with http * */ const coerceUrl = (url) => (/^(f|ht)tps?:\/\//i.test(url) ? url : `http://${url}`); /** * Validates and formats url * * @param {string} url - url to be checked and formatted * @param {string} urlValidatorSettings - settings used by validator * @return {string} proper url or null * */ function validateAndFormatURL(url, urlValidatorSettings) { return { url: isUrlValid(url, urlValidatorSettings) ? coerceUrl(url) : null }; } /** * Finds the image type from a given url * * @param {string} url - url to be checked * @return {string} image type from url * */ function findImageTypeFromUrl(url) { let type = url.split('.').pop() ?? ''; [type] = type.split('?'); return type; } /** * Checks if image type is valid * * @param {string} type - type to be checked * @return {boolean} boolean value if type is value * */ function isImageTypeValid(type) { const validImageTypes = ['apng', 'bmp', 'gif', 'ico', 'cur', 'jpg', 'jpeg', 'jfif', 'pjpeg', 'pjp', 'png', 'svg', 'tif', 'tiff', 'webp']; return validImageTypes.includes(type); } /** * Checks if URL is a non html page * * @param {string} url - url to be checked * @return {boolean} boolean value if url is non html * */ function isThisANonHTMLUrl(url) { const invalidImageTypes = ['.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.3gp', '.avi', '.mov', '.mp4', '.m4v', '.m4a', '.mp3', '.mkv', '.ogv', '.ogm', '.ogg', '.oga', '.webm', '.wav', '.bmp', '.gif', '.jpg', '.jpeg', '.png', '.webp', '.zip', '.rar', '.tar', '.tar.gz', '.tgz', '.tar.bz2', '.tbz2', '.txt', '.pdf']; const extension = findImageTypeFromUrl(url); return invalidImageTypes.some((type) => `.${extension}`.includes(type)); } /** * Find and delete nested undefineds * * @param {object} object - object to be cleaned * @return {object} object without nested undefineds * */ function removeNestedUndefinedValues(object) { Object.entries(object).forEach(([key, value]) => { if (value && typeof value === 'object') removeNestedUndefinedValues(value); else if (value === undefined) delete object[key]; }); return object; } /** * Split the options object into ogs and got option objects * * @param {object} options - options that need to be split * @return {object} object with nested options for ogs and got * */ function optionSetup(ogsOptions) { const options = { onlyGetOpenGraphInfo: false, ...ogsOptions, }; return { options }; } /** * Checks if image type is valid * * @param {string} type - type to be checked * @return {boolean} boolean value if type is value * */ function isCustomMetaTagsValid(customMetaTags) { if (!Array.isArray(customMetaTags)) return false; let result = true; customMetaTags.forEach((customMetaTag) => { if (typeof customMetaTag === 'object') { if (!('fieldName' in customMetaTag) || typeof customMetaTag.fieldName !== 'string') result = false; if (!('multiple' in customMetaTag) || typeof customMetaTag.multiple !== 'boolean') result = false; if (!('property' in customMetaTag) || typeof customMetaTag.property !== 'string') result = false; } else { result = false; } }); return result; } /** * Unescape script text. * * Certain websites escape script text within script tags, which can * interfere with `JSON.parse()`. Therefore, we need to unescape it. * * Known good escape sequences: * * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Character_escape#uhhhh * * ```js * JSON.parse('"\\u2611"'); // '☑' * ``` * * Known bad escape sequences: * * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Character_escape#xhh * * ```js * JSON.parse('"\\x26"'); // '&' * ``` * * @param {string} scriptText - the text of the script tag * @returns {string} unescaped script text */ function unescapeScriptText(scriptText) { // https://stackoverflow.com/a/34056693 return scriptText.replace(/\\x([0-9a-f]{2})/ig, (_, pair) => { const charCode = parseInt(pair, 16); if (charCode === 34) { return '\\"'; } return String.fromCharCode(charCode); }); }