open-graph-scraper
Version:
Node.js scraper module for Open Graph and Twitter Card info
187 lines (186 loc) • 6.02 kB
JavaScript
// Module-interop helper (this has the shape of the helper the TypeScript
// compiler emits for default imports - NOTE(review): confirm against the build).
// Reuses `this.__importDefault` when the surrounding `this` already provides
// one; otherwise falls back to the local function below.
// The function returns `mod` untouched when it is flagged `__esModule`, and
// wraps anything else as `{ default: mod }` so callers can always read `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.defaultUrlValidatorSettings = void 0;
exports.isUrlValid = isUrlValid;
exports.validateAndFormatURL = validateAndFormatURL;
exports.findImageTypeFromUrl = findImageTypeFromUrl;
exports.isImageTypeValid = isImageTypeValid;
exports.isThisANonHTMLUrl = isThisANonHTMLUrl;
exports.removeNestedUndefinedValues = removeNestedUndefinedValues;
exports.optionSetup = optionSetup;
exports.isCustomMetaTagsValid = isCustomMetaTagsValid;
exports.unescapeScriptText = unescapeScriptText;
const isUrl_1 = __importDefault(require("./isUrl"));
// Exported settings object for the URL validator.
// NOTE(review): the key names match the option names of validator.js `isURL`;
// presumably `./isUrl` interprets them the same way - confirm there.
// NOTE(review): presumably used as the fallback when a caller supplies no
// validator settings - confirm at the call sites (not visible in this file).
exports.defaultUrlValidatorSettings = {
allow_fragments: true,
allow_protocol_relative_urls: false,
allow_query_components: true,
allow_trailing_dot: false,
allow_underscores: false,
// Only these two schemes are listed as acceptable.
protocols: ['http', 'https'],
require_host: true,
require_port: false,
// A scheme is not required here; see `coerceUrl`, which prefixes `http://`
// when a URL has no http(s)/ftp(s) scheme.
require_protocol: false,
require_tld: true,
require_valid_protocol: true,
validate_length: true,
};
/**
 * Tells whether a value is a usable URL.
 *
 * Anything that is not a string, and the empty string, is rejected up front
 * without consulting the validator. Every other value is handed to the
 * default export of `./isUrl` and that call's result is returned as-is.
 *
 * @param {string} url - candidate URL
 * @param {object} urlValidatorSettings - settings forwarded to the validator
 * @return {boolean} `false` for non-string/empty input, otherwise the validator's verdict
 *
 */
function isUrlValid(url, urlValidatorSettings) {
  if (typeof url !== 'string' || url.length === 0) {
    return false;
  }
  return (0, isUrl_1.default)(url, urlValidatorSettings);
}
/**
 * Ensures a URL carries a scheme.
 *
 * URLs that already begin with `http://`, `https://`, `ftp://` or `ftps://`
 * (matched case-insensitively) are returned unchanged; everything else gets
 * `http://` prepended.
 *
 * @param {string} url - url to be updated
 * @return {string} the original url, or the url prefixed with `http://`
 *
 */
const coerceUrl = (url) => {
  const alreadyHasScheme = /^(f|ht)tps?:\/\//i.test(url);
  if (alreadyHasScheme) {
    return url;
  }
  return `http://${url}`;
};
/**
 * Validates a URL and, when valid, normalises it so it carries a scheme.
 *
 * @param {string} url - url to be checked and formatted
 * @param {object} urlValidatorSettings - settings forwarded to `isUrlValid`
 * @return {{url: (string|null)}} wrapper object whose `url` is the coerced
 *   url when validation passes, or `null` when it does not
 *
 */
function validateAndFormatURL(url, urlValidatorSettings) {
  if (!isUrlValid(url, urlValidatorSettings)) {
    return { url: null };
  }
  return { url: coerceUrl(url) };
}
/**
 * Extracts the file extension (text after the last `.`) from a url.
 *
 * The fragment (`#...`) and query string (`?...`) are removed BEFORE looking
 * for the last dot, so dots inside them (e.g. `image.png?v=1.2`) cannot be
 * mistaken for the extension separator.
 *
 * When the remaining text contains no dot at all, that whole text is
 * returned, so callers must still validate the result.
 *
 * @param {string} url - url to be checked
 * @return {string} the extension without its leading dot; never undefined
 *
 */
function findImageTypeFromUrl(url) {
  const [withoutFragment] = url.split('#');
  const [withoutQuery] = withoutFragment.split('?');
  // `split` always yields at least one element, so `pop()` returns a string.
  return withoutQuery.split('.').pop();
}
/**
 * Tells whether a type string is one of the accepted image types.
 *
 * The comparison is exact and case-sensitive, and the type is expected
 * without a leading dot (e.g. `png`, not `.png` or `PNG`).
 *
 * @param {string} type - type to be checked
 * @return {boolean} `true` when the type is in the accepted list
 *
 */
function isImageTypeValid(type) {
  const acceptedTypes = new Set(['apng', 'bmp', 'gif', 'ico', 'cur', 'jpg', 'jpeg', 'jfif', 'pjpeg', 'pjp', 'png', 'svg', 'tif', 'tiff', 'webp']);
  return acceptedTypes.has(type);
}
/**
 * Guesses from the url's extension whether it points at something other
 * than an HTML page (office documents, audio/video, images, archives,
 * text, pdf).
 *
 * The extension comes from `findImageTypeFromUrl` and is prefixed with a
 * dot. A url is reported as non-HTML when that dotted extension CONTAINS
 * any entry of the list as a substring - it is not an exact comparison.
 * Because the extracted extension never contains a dot itself, the two-dot
 * entries (`.tar.gz`, `.tar.bz2`) cannot match.
 *
 * @param {string} url - url to be checked
 * @return {boolean} `true` when the url looks like a non-HTML resource
 *
 */
function isThisANonHTMLUrl(url) {
  const nonHtmlExtensions = ['.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.3gp', '.avi', '.mov', '.mp4', '.m4v', '.m4a', '.mp3', '.mkv', '.ogv', '.ogm', '.ogg', '.oga', '.webm', '.wav', '.bmp', '.gif', '.jpg', '.jpeg', '.png', '.webp', '.zip', '.rar', '.tar', '.tar.gz', '.tgz', '.tar.bz2', '.tbz2', '.txt', '.pdf'];
  const dottedExtension = `.${findImageTypeFromUrl(url)}`;
  for (const candidate of nonHtmlExtensions) {
    if (dottedExtension.includes(candidate)) {
      return true;
    }
  }
  return false;
}
/**
 * Strips every property whose value is `undefined`, at any depth.
 *
 * The object is modified IN PLACE (nested objects included) and the same
 * reference is returned for convenience. `null` values are kept.
 *
 * @param {object} object - object to be cleaned
 * @return {object} the same object, now without `undefined` properties
 *
 */
function removeNestedUndefinedValues(object) {
  for (const [key, value] of Object.entries(object)) {
    if (value === undefined) {
      delete object[key];
    } else if (value && typeof value === 'object') {
      // Descend into nested objects; `value &&` keeps `null` out.
      removeNestedUndefinedValues(value);
    }
  }
  return object;
}
/**
 * Builds the effective options object by layering the caller's options over
 * the defaults.
 *
 * The only default applied here is `onlyGetOpenGraphInfo: false`; any
 * property supplied by the caller wins over it.
 *
 * @param {object} ogsOptions - options supplied by the caller
 * @return {{options: object}} wrapper holding the merged options
 *
 */
function optionSetup(ogsOptions) {
  const defaults = { onlyGetOpenGraphInfo: false };
  return { options: { ...defaults, ...ogsOptions } };
}
/**
 * Checks that a custom meta tag list is well formed.
 *
 * The list is valid when it is an array and every entry is a non-null
 * object with:
 *   - `fieldName`: string
 *   - `multiple`:  boolean
 *   - `property`:  string
 *
 * An empty array is valid. Malformed entries (including `null`, which
 * `typeof` reports as 'object') make the result `false` rather than
 * throwing.
 *
 * @param {Array<object>} customMetaTags - custom meta tag definitions to check
 * @return {boolean} `true` when every entry has the required shape
 *
 */
function isCustomMetaTagsValid(customMetaTags) {
  if (!Array.isArray(customMetaTags)) {
    return false;
  }
  return customMetaTags.every((customMetaTag) => (
    // Explicit null check: `typeof null === 'object'`, and reading a
    // property of null would throw.
    customMetaTag !== null
    && typeof customMetaTag === 'object'
    && typeof customMetaTag.fieldName === 'string'
    && typeof customMetaTag.multiple === 'boolean'
    && typeof customMetaTag.property === 'string'
  ));
}
/**
 * Unescape script text.
 *
 * Certain websites write `\xHH` hex escapes inside script tags. Those are
 * valid in JavaScript but not in JSON, so `JSON.parse()` rejects them;
 * `\uHHHH` escapes, by contrast, are accepted:
 *
 * ```js
 * JSON.parse('"\\u2611"'); // '☑'
 * JSON.parse('"\\x26"');   // throws SyntaxError
 * ```
 *
 * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Character_escape#uhhhh
 * https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Character_escape#xhh
 *
 * Every `\xHH` sequence (case-insensitive) is replaced by the character it
 * denotes, except where the raw character would itself break or alter a
 * JSON string, in which case a JSON-safe escape is emitted instead:
 *   - `"`  (0x22)      -> `\"`
 *   - `\`  (0x5c)      -> `\\`
 *   - 0x00 - 0x1f      -> `\u00HH` (raw control characters are illegal in JSON strings)
 *
 * @param {string} scriptText - the text of the script tag
 * @returns {string} unescaped script text
 */
function unescapeScriptText(scriptText) {
  // https://stackoverflow.com/a/34056693
  return scriptText.replace(/\\x([0-9a-f]{2})/ig, (_, pair) => {
    const charCode = Number.parseInt(pair, 16);
    if (charCode === 0x22) {
      return '\\"';
    }
    if (charCode === 0x5c) {
      // A bare backslash would start a new (different or invalid) escape.
      return '\\\\';
    }
    if (charCode < 0x20) {
      return `\\u${pair.padStart(4, '0')}`;
    }
    return String.fromCharCode(charCode);
  });
}
;