UNPKG

ts-open-graph-scraper

Version:

Node.js scraper module for Open Graph and Twitter Card info, based on https://github.com/jshemas/openGraphScraper

272 lines (271 loc) 14 kB
"use strict"; var __assign = (this && this.__assign) || function () { __assign = Object.assign || function(t) { for (var s, i = 1, n = arguments.length; i < n; i++) { s = arguments[i]; for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p]; } return t; }; return __assign.apply(this, arguments); }; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; var __generator = (this && this.__generator) || function (thisArg, body) { var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g; return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; function verb(n) { return function (v) { return step([n, v]); }; } function step(op) { if (f) throw new TypeError("Generator is already executing."); while (_) try { if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t; if (y = 0, t) op = [op[0] & 2, t.value]; switch (op[0]) { case 0: case 1: t = op; break; case 4: _.label++; return { value: op[1], done: false }; case 5: _.label++; y = op[1]; op = [0]; continue; case 7: op = _.ops.pop(); _.trys.pop(); continue; default: if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } if (t[2]) _.ops.pop(); _.trys.pop(); continue; } op = body.call(thisArg, _); } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true }; } }; var __spreadArrays = (this && this.__spreadArrays) || function () { for (var s = 0, i = 0, il = arguments.length; i < il; i++) s += arguments[i].length; for (var r = Array(s), k = 0, i = 0; i < il; i++) for (var a = arguments[i], j = 0, jl = a.length; j < jl; j++, k++) r[k] = a[j]; return r; }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; var __importStar = (this && this.__importStar) || function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k in mod) if (Object.hasOwnProperty.call(mod, k)) result[k] = mod[k]; result["default"] = mod; return result; }; Object.defineProperty(exports, "__esModule", { value: true }); var charset_1 = __importDefault(require("./charset")); var cheerio_1 = __importDefault(require("cheerio")); var fields_1 = __importDefault(require("./fields")); var iconv_lite_1 = __importDefault(require("iconv-lite")); var chardet_1 = __importDefault(require("chardet")); var media = __importStar(require("./media")); var request_promise_native_1 = __importDefault(require("request-promise-native")); var url_1 = __importDefault(require("url")); var utils = __importStar(require("./utils")); var error_1 = __importDefault(require("./error")); exports.default = setOptionsAndReturnOpenGraphResults; function setOptionsAndReturnOpenGraphResults(_optionsOrUrl) { return __awaiter(this, void 0, void 0, function () { var options, validate, results, error_2; return __generator(this, function (_a) { switch (_a.label) { case 0: if (typeof _optionsOrUrl === 'string') { options = { url: _optionsOrUrl, resolveWithFullResponse: true, }; } else { options = __assign(__assign({}, _optionsOrUrl), { resolveWithFullResponse: true }); } if (options.html) { if (options.url) { throw new error_1.default('Must specify either `url` or `html`, not both'); } return [2, extractMetaTags(options.html, options)]; } validate = utils.validate(options.url, options.timeout); if (!validate.returnInputUrl) return [3, 4]; options.url = validate.returnInputUrl; options.timeout = validate.returnInputTimeout; options.headers = __assign({}, options.headers); options.gzip = true; options.encoding = options.encoding || undefined; options.jar = options.jar || false; options.followAllRedirects = options.followAllRedirects || true; options.maxRedirects = options.maxRedirects || 20; if ('window' in global && typeof global['window'] !== 'undefined') { options.gzip = false; options.protocol = url_1.default.parse(options.url).protocol; } if (options.blacklist && options.blacklist.length > 0) { options.blacklist.forEach(function (site) { var _a; var siteUrl = url_1.default.parse(site); var requestUrl = url_1.default.parse(options.url); if (site === options.url || (siteUrl.protocol === requestUrl.protocol && siteUrl.hostname === requestUrl.hostname && siteUrl.port === requestUrl.port && (!requestUrl.pathname || ((_a = requestUrl.pathname) === null || _a === void 0 ? void 0 : _a.startsWith(siteUrl.pathname || ''))))) { throw new error_1.default('Host Name Has Been Black Listed'); } }); } _a.label = 1; case 1: _a.trys.push([1, 3, , 4]); return [4, requestAndResultsFormatter(options)]; case 2: results = _a.sent(); if (results) { return [2, results]; } return [3, 4]; case 3: error_2 = _a.sent(); if (error_2 && (error_2.code === 'ENOTFOUND' || error_2.code === 'EHOSTUNREACH')) { throw new error_1.default('Page Not Found', error_2); } else if (error_2 && error_2.code === 'ETIMEDOUT') { throw new error_1.default('Time Out', error_2); } else { throw new error_1.default('Time Out', error_2); } return [3, 4]; case 4: throw new error_1.default('Invalid URL'); } }); }); } function requestAndResultsFormatter(_options) { return __awaiter(this, void 0, void 0, function () { var options, peekSize, response, body, formatBody, char, ogObject, e_1; return __generator(this, function (_a) { switch (_a.label) { case 0: options = __assign({ url: '' }, _options); peekSize = options.peekSize || 1024; if (!options.url) { throw new error_1.default('You must provide either options.url or options.html'); } _a.label = 1; case 1: _a.trys.push([1, 3, , 4]); return [4, request_promise_native_1.default(options)]; case 2: response = _a.sent(); body = response.body; formatBody = body; if (response && response.statusCode && (response.statusCode >= 400 && response.statusCode < 600)) { throw new error_1.default('Server Has Ran Into A Error', null, response); } if (!options.encoding) { char = charset_1.default(response.headers, formatBody, peekSize) || chardet_1.default.detect(formatBody); if (char) { try { formatBody = iconv_lite_1.default.decode(Buffer.from(formatBody), char); } catch (ex) { throw new error_1.default(undefined, ex, response); } } else { formatBody = formatBody.toString(); } } ogObject = __assign(__assign({}, extractMetaTags(formatBody, options)), { response: response }); if (options.withCharset) { ogObject.charset = charset_1.default(response.headers, formatBody, peekSize); } return [2, ogObject]; case 3: e_1 = _a.sent(); throw new error_1.default(undefined, e_1); case 4: return [2]; } }); }); } function extractMetaTags(body, options) { var _a, _b; var $ = cheerio_1.default.load(body); var validFieldNames = fields_1.default.map(function (x) { return x.property; }); var meta = $('meta').toArray().filter(function (x) { return validFieldNames.some(function (y) { return y === x.attribs.property; }); }); var ogObjectRaw = meta.reduce(function (acc, element) { var _a; var _b, _c, _d, _e; var property = (_d = (_c = (_b = element.attribs.property, (_b !== null && _b !== void 0 ? _b : element.attribs.name)), (_c !== null && _c !== void 0 ? _c : element.attribs['http-equiv'])), (_d !== null && _d !== void 0 ? _d : element.attribs.httpEquiv)); var item = property && fields_1.default.find(function (x) { return x.property === property; }); if (!item) { return acc; } var content = (_e = element.attribs.content, (_e !== null && _e !== void 0 ? _e : element.attribs.value)); var currentValue = acc[item.fieldName]; if (item.multiple) { if (!currentValue) currentValue = [content]; else currentValue = __spreadArrays(currentValue, [content]); } else { currentValue = (currentValue !== null && currentValue !== void 0 ? currentValue : content); } var out = __assign(__assign({}, acc), (_a = {}, _a[item.fieldName] = currentValue, _a)); return out; }, {}); if (!ogObjectRaw.ogImage) { ogObjectRaw.ogImage = ogObjectRaw.ogImageURL ? ogObjectRaw.ogImageURL : ogObjectRaw.ogImageSecureURL ? ogObjectRaw.ogImageSecureURL : []; } if (!ogObjectRaw.ogImage || !ogObjectRaw.ogImage.length) { delete ogObjectRaw.ogImage; } var ogObject = media.mediaSetup(ogObjectRaw, options); if (!options.onlyGetOpenGraphInfo) { if (!ogObject.ogTitle && $('head > title').text() && $('head > title').text().length > 0) { ogObject.ogTitle = $('head > title').text(); } if (!ogObject.ogDescription && $('head > meta[name="description"]').attr('content') && ((_b = (_a = $('head > meta[name="description"]')) === null || _a === void 0 ? void 0 : _a.attr('content')) === null || _b === void 0 ? void 0 : _b.length) > 0) { ogObject.ogDescription = $('head > meta[name="description"]').attr('content'); } var ogImageFallback = options.ogImageFallback === undefined ? true : options.ogImageFallback; if (!ogObject.ogImage && ogImageFallback) { ogObject.ogImage = []; var supportedImageExts_1 = ['jpg', 'jpeg', 'png']; $('img').toArray().forEach(function (elem) { var src = elem.attribs.src || ''; var type = supportedImageExts_1.find(function (x) { return x === src.split('.').pop(); }); if (type) { ogObject.ogImage.push({ url: src, width: parseInt(elem.attribs.width, 10), height: parseInt(elem.attribs.height, 10), type: "image/" + type, }); } }); } } if (Array.isArray(ogObject.ogImage) && !ogObject.ogImage.length) { delete ogObject.ogImage; } return ogObject; }