ts-open-graph-scraper
Version:
Node.js scraper module for Open Graph and Twitter Card info, based on https://github.com/jshemas/openGraphScraper
272 lines (271 loc) • 14 kB
JavaScript
;
/**
 * Shallow-merge helper (TypeScript downlevel emit for object spread).
 * Copies the own enumerable properties of every source argument onto the
 * first argument and returns that first argument.
 */
var __assign = (this && this.__assign) || function () {
    // On first use, replace this wrapper with the native Object.assign when
    // it exists, otherwise with an equivalent hand-written copy loop.
    __assign = Object.assign || function (target) {
        var argCount = arguments.length;
        var index = 1;
        while (index < argCount) {
            var source = arguments[index];
            for (var key in source) {
                if (Object.prototype.hasOwnProperty.call(source, key)) {
                    target[key] = source[key];
                }
            }
            index += 1;
        }
        return target;
    };
    return __assign.apply(this, arguments);
};
/**
 * Compiler-emitted helper that drives a generator function to completion and
 * exposes the outcome as a promise.
 *
 * @param thisArg     `this` value the generator function is applied with.
 * @param _arguments  Arguments array for the generator function (defaults to []).
 * @param P           Promise constructor to use; falls back to the global Promise.
 * @param generator   Generator function; replaced below by the iterator it returns.
 * @returns A P instance resolved with the generator's final value, or rejected
 *          with any error thrown while stepping it.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in a P promise unless it already is an instance of P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the settled value; a synchronous throw rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Re-raise a rejection inside the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finished: resolve with the final value. Otherwise wait on the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Start the generator and process its first result.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
/**
 * Compiler-emitted helper that emulates a generator on top of a plain
 * state-machine function.
 *
 * `body` is called with the state object `_` and returns an instruction array
 * `[opcode, operand]`, which `step` interprets. The returned object exposes
 * `next`, `throw` and `return` (and `Symbol.iterator` when Symbol is available).
 *
 * State object `_`:
 *   - label: position read by `body` to decide where to resume.
 *   - sent(): returns the value of the current op, or throws it when the
 *     op's opcode has bit 1 set.
 *   - trys: stack of try-region descriptors pushed by `body`.
 *   - ops: stack of ops deferred while another label runs first.
 *
 * NOTE(review): a try-region descriptor appears to be
 * [start, catchLabel, finallyLabel, end], judging by how t[0]..t[3] are used
 * below; confirm against the TypeScript/tslib documentation.
 */
var __generator = (this && this.__generator) || function (thisArg, body) {
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
// verb(n) builds the public method that feeds opcode n (0 next, 1 throw, 2 return) into step.
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
// f is set while stepping; calling back into the generator during a step is an error.
if (f) throw new TypeError("Generator is already executing.");
// `_` is set to 0 below once the generator has finished, which ends this loop.
while (_) try {
// When an inner iterator y is active, forward the op to its next/throw/return first;
// if the inner iterator is not done, hand its result straight back to the caller.
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
// 0 / 1: keep the op in t so that _.sent() can return or throw its value inside body.
case 0: case 1: t = op; break;
// 4: advance the label and hand op[1] to the caller as a not-done result.
case 4: _.label++; return { value: op[1], done: false };
// 5: advance the label and make op[1] the active inner iterator.
case 5: _.label++; y = op[1]; op = [0]; continue;
// 7: resume the most recently deferred op and drop the current try region.
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// No enclosing try region and the op is 6 or 2: the generator is finished.
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
// 3: jump to label op[1] when there is no try region or the target lies inside it.
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
// 6 raised before label t[1]: continue at t[1], keeping the op so _.sent() yields the error.
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
// Otherwise run label t[2] first when it is still ahead, deferring the current op.
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
op = body.call(thisArg, _);
// An exception thrown while stepping becomes op 6 and is processed on the next loop pass.
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// Loop ended: rethrow when the final op has bit 1 or bit 4 set, otherwise report completion.
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
/**
 * Array-spread helper (TypeScript downlevel emit for `[...a, ...b]`).
 * Concatenates every array-like argument, by index, into one new array.
 */
var __spreadArrays = (this && this.__spreadArrays) || function () {
    var partCount = arguments.length;
    var total = 0;
    var partIndex;
    // First pass: size the result up front.
    for (partIndex = 0; partIndex < partCount; partIndex++) {
        total += arguments[partIndex].length;
    }
    var merged = Array(total);
    var writeAt = 0;
    // Second pass: copy element by element so array-likes work too.
    for (partIndex = 0; partIndex < partCount; partIndex++) {
        var part = arguments[partIndex];
        var partLength = part.length;
        for (var readAt = 0; readAt < partLength; readAt++) {
            merged[writeAt] = part[readAt];
            writeAt++;
        }
    }
    return merged;
};
/**
 * Interop helper for default imports: modules flagged `__esModule` are
 * returned unchanged, anything else is wrapped as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
/**
 * Interop helper for namespace imports (`import * as x`).
 * Modules flagged `__esModule` are returned unchanged. Otherwise a new object
 * is built holding a copy of the module's own enumerable properties, with the
 * module itself stored under `default`.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        for (var key in mod) {
            if (Object.hasOwnProperty.call(mod, key)) {
                namespace[key] = mod[key];
            }
        }
    }
    namespace["default"] = mod;
    return namespace;
};
Object.defineProperty(exports, "__esModule", { value: true });
var charset_1 = __importDefault(require("./charset"));
var cheerio_1 = __importDefault(require("cheerio"));
var fields_1 = __importDefault(require("./fields"));
var iconv_lite_1 = __importDefault(require("iconv-lite"));
var chardet_1 = __importDefault(require("chardet"));
var media = __importStar(require("./media"));
var request_promise_native_1 = __importDefault(require("request-promise-native"));
var url_1 = __importDefault(require("url"));
var utils = __importStar(require("./utils"));
var error_1 = __importDefault(require("./error"));
exports.default = setOptionsAndReturnOpenGraphResults;
/**
 * Public entry point: scrapes Open Graph data either from a URL or from an
 * HTML string supplied by the caller.
 *
 * @param {string|Object} _optionsOrUrl  A URL string, or an options object.
 *   Options read here: `url`, `html`, `timeout`, `headers`, `encoding`, `jar`,
 *   `followAllRedirects` (default true), `maxRedirects` (default 20) and
 *   `blacklist` (array of URLs). The object is copied, never mutated, and
 *   `resolveWithFullResponse` is always forced to true on the copy.
 * @returns {Promise<Object>} The result of `extractMetaTags` (when `html` is
 *   given) or of `requestAndResultsFormatter` (when `url` is given).
 * @throws Rejects with the project error type when:
 *   - both `url` and `html` are given;
 *   - `utils.validate` returns no `returnInputUrl` ('Invalid URL');
 *   - the URL matches a blacklist entry ('Host Name Has Been Black Listed');
 *   - the request fails: 'Page Not Found' for error codes ENOTFOUND and
 *     EHOSTUNREACH, 'Time Out' for every other failure.
 */
function setOptionsAndReturnOpenGraphResults(_optionsOrUrl) {
    return __awaiter(this, void 0, void 0, function () {
        var options, validate, results, error_2;
        return __generator(this, function (_a) {
            switch (_a.label) {
                case 0:
                    if (typeof _optionsOrUrl === 'string') {
                        options = {
                            url: _optionsOrUrl,
                            resolveWithFullResponse: true,
                        };
                    }
                    else {
                        options = __assign(__assign({}, _optionsOrUrl), { resolveWithFullResponse: true });
                    }
                    // Raw HTML needs no request: parse it directly.
                    if (options.html) {
                        if (options.url) {
                            throw new error_1.default('Must specify either `url` or `html`, not both');
                        }
                        return [2, extractMetaTags(options.html, options)];
                    }
                    validate = utils.validate(options.url, options.timeout);
                    // No usable URL: jump to label 4, which throws 'Invalid URL'.
                    if (!validate.returnInputUrl) return [3, 4];
                    options.url = validate.returnInputUrl;
                    options.timeout = validate.returnInputTimeout;
                    options.headers = __assign({}, options.headers);
                    options.gzip = true;
                    options.encoding = options.encoding || undefined;
                    options.jar = options.jar || false;
                    // Apply defaults only when the caller left the option unset. The previous
                    // `|| true` / `|| 20` forms overrode an explicit `false` or `0`.
                    options.followAllRedirects = (options.followAllRedirects !== null && options.followAllRedirects !== void 0)
                        ? options.followAllRedirects
                        : true;
                    options.maxRedirects = (options.maxRedirects !== null && options.maxRedirects !== void 0)
                        ? options.maxRedirects
                        : 20;
                    // When a global `window` exists, disable gzip and pass the URL's protocol along.
                    if ('window' in global && typeof global['window'] !== 'undefined') {
                        options.gzip = false;
                        options.protocol = url_1.default.parse(options.url).protocol;
                    }
                    if (options.blacklist && options.blacklist.length > 0) {
                        // The request URL does not change per entry, so parse it once.
                        var requestUrl_1 = url_1.default.parse(options.url);
                        options.blacklist.forEach(function (site) {
                            var siteUrl = url_1.default.parse(site);
                            var sameOrigin = siteUrl.protocol === requestUrl_1.protocol &&
                                siteUrl.hostname === requestUrl_1.hostname &&
                                siteUrl.port === requestUrl_1.port;
                            // A blacklist entry also blocks every path underneath it.
                            var pathCovered = !requestUrl_1.pathname ||
                                requestUrl_1.pathname.startsWith(siteUrl.pathname || '');
                            if (site === options.url || (sameOrigin && pathCovered)) {
                                throw new error_1.default('Host Name Has Been Black Listed');
                            }
                        });
                    }
                    _a.label = 1;
                case 1:
                    // Try region: body at label 1, catch at label 3, end at label 4.
                    _a.trys.push([1, 3, , 4]);
                    return [4, requestAndResultsFormatter(options)];
                case 2:
                    results = _a.sent();
                    if (results) {
                        return [2, results];
                    }
                    return [3, 4];
                case 3:
                    error_2 = _a.sent();
                    if (error_2 && (error_2.code === 'ENOTFOUND' || error_2.code === 'EHOSTUNREACH')) {
                        throw new error_1.default('Page Not Found', error_2);
                    }
                    // ETIMEDOUT and every other failure are reported identically.
                    throw new error_1.default('Time Out', error_2);
                case 4: throw new error_1.default('Invalid URL');
            }
        });
    });
}
/**
 * Performs the HTTP request described by `_options`, decodes the response
 * body and returns the extracted meta-tag object with the raw response
 * attached under `response`.
 *
 * @param {Object} _options  Request options; copied before use. Read here:
 *   `url` (required), `peekSize` (default 1024), `encoding`, `withCharset`.
 * @returns {Promise<Object>} Result of `extractMetaTags` plus `response`, and
 *   `charset` when `withCharset` is truthy.
 * @throws Rejects with the project error type when `url` is missing. Any
 *   error raised from the request onwards (including the 4xx/5xx error
 *   below) is caught by the try region and re-thrown wrapped in the project
 *   error type.
 */
function requestAndResultsFormatter(_options) {
    return __awaiter(this, void 0, void 0, function () {
        var requestOptions, sniffLength, res, statusCode, html, detectedCharset, metaTags, result, requestError;
        return __generator(this, function (state) {
            switch (state.label) {
                case 0:
                    requestOptions = __assign({ url: '' }, _options);
                    sniffLength = requestOptions.peekSize || 1024;
                    if (!requestOptions.url) {
                        throw new error_1.default('You must provide either options.url or options.html');
                    }
                    state.label = 1;
                case 1:
                    // Try region: body at label 1, catch at label 3, end at label 4.
                    state.trys.push([1, 3, , 4]);
                    return [4, request_promise_native_1.default(requestOptions)];
                case 2:
                    res = state.sent();
                    html = res.body;
                    statusCode = res && res.statusCode;
                    if (statusCode && statusCode >= 400 && statusCode < 600) {
                        throw new error_1.default('Server Has Ran Into A Error', null, res);
                    }
                    // Only sniff and convert the charset when the caller did not set an encoding.
                    if (!requestOptions.encoding) {
                        detectedCharset = charset_1.default(res.headers, html, sniffLength) || chardet_1.default.detect(html);
                        if (!detectedCharset) {
                            html = html.toString();
                        }
                        else {
                            try {
                                html = iconv_lite_1.default.decode(Buffer.from(html), detectedCharset);
                            }
                            catch (decodeError) {
                                throw new error_1.default(undefined, decodeError, res);
                            }
                        }
                    }
                    metaTags = extractMetaTags(html, requestOptions);
                    result = __assign(__assign({}, metaTags), { response: res });
                    if (requestOptions.withCharset) {
                        result.charset = charset_1.default(res.headers, html, sniffLength);
                    }
                    return [2, result];
                case 3:
                    requestError = state.sent();
                    throw new error_1.default(undefined, requestError);
                case 4: return [2];
            }
        });
    });
}
/**
 * Parses an HTML document and collects the known meta fields from it.
 *
 * Each `<meta>` element is identified by the first attribute that is present
 * among `property`, `name`, `http-equiv` and `httpEquiv`, and is matched
 * against the entries of the `fields` module (objects with `property`,
 * `fieldName` and `multiple`). The value comes from `content`, falling back
 * to `value`.
 *
 * Fix over the previous version: elements are no longer pre-filtered on the
 * `property` attribute alone. That filter dropped every tag identified only
 * by `name` or `http-equiv` (for example `<meta name="twitter:card">`) before
 * the attribute fallback could run.
 *
 * @param {string|Buffer} body  HTML handed to cheerio.
 * @param {Object} options  Read here: `onlyGetOpenGraphInfo`, `ogImageFallback`
 *   (default true). Also forwarded to `media.mediaSetup`.
 * @returns {Object} The object returned by `media.mediaSetup`, completed with
 *   title / description / image fallbacks unless `onlyGetOpenGraphInfo` is set.
 */
function extractMetaTags(body, options) {
    var $ = cheerio_1.default.load(body);
    var ogObjectRaw = $('meta').toArray().reduce(function (acc, element) {
        var attribs = element.attribs;
        // First non-nullish identifying attribute wins.
        var property = attribs.property;
        if (property === null || property === undefined) {
            property = attribs.name;
        }
        if (property === null || property === undefined) {
            property = attribs['http-equiv'];
        }
        if (property === null || property === undefined) {
            property = attribs.httpEquiv;
        }
        var item = property && fields_1.default.find(function (x) { return x.property === property; });
        if (!item) {
            return acc;
        }
        var content = attribs.content;
        if (content === null || content === undefined) {
            content = attribs.value;
        }
        var currentValue = acc[item.fieldName];
        if (item.multiple) {
            // Multi-valued fields accumulate every occurrence in document order.
            acc[item.fieldName] = currentValue ? __spreadArrays(currentValue, [content]) : [content];
        }
        else {
            // Single-valued fields keep the first non-nullish value seen.
            acc[item.fieldName] = (currentValue !== null && currentValue !== undefined) ? currentValue : content;
        }
        return acc;
    }, {});
    // Without an explicit image field, fall back to the URL variants.
    if (!ogObjectRaw.ogImage) {
        ogObjectRaw.ogImage = ogObjectRaw.ogImageURL || ogObjectRaw.ogImageSecureURL || [];
    }
    if (!ogObjectRaw.ogImage || !ogObjectRaw.ogImage.length) {
        delete ogObjectRaw.ogImage;
    }
    var ogObject = media.mediaSetup(ogObjectRaw, options);
    if (!options.onlyGetOpenGraphInfo) {
        var pageTitle = $('head > title').text();
        if (!ogObject.ogTitle && pageTitle) {
            ogObject.ogTitle = pageTitle;
        }
        var pageDescription = $('head > meta[name="description"]').attr('content');
        if (!ogObject.ogDescription && pageDescription) {
            ogObject.ogDescription = pageDescription;
        }
        var ogImageFallback = options.ogImageFallback === undefined ? true : options.ogImageFallback;
        if (!ogObject.ogImage && ogImageFallback) {
            // Last resort: collect <img> elements whose src ends in a supported extension.
            ogObject.ogImage = [];
            var supportedImageExts_1 = ['jpg', 'jpeg', 'png'];
            $('img').toArray().forEach(function (elem) {
                var src = elem.attribs.src || '';
                var extension = src.split('.').pop();
                var type = supportedImageExts_1.find(function (x) { return x === extension; });
                if (type) {
                    ogObject.ogImage.push({
                        url: src,
                        width: parseInt(elem.attribs.width, 10),
                        height: parseInt(elem.attribs.height, 10),
                        type: "image/" + type,
                    });
                }
            });
        }
    }
    // Never expose an empty image list.
    if (Array.isArray(ogObject.ogImage) && !ogObject.ogImage.length) {
        delete ogObject.ogImage;
    }
    return ogObject;
}