UNPKG

scrape-meta

Version:

A library to easily scrape metadata from an article on the web using Open Graph metadata, regular HTML metadata, and a series of fallbacks.

128 lines (108 loc) 3.06 kB
'use strict'; var _typeof = typeof Symbol === "function" && typeof Symbol.iterator === "symbol" ? function (obj) { return typeof obj; } : function (obj) { return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj; }; var RULES = require('./rules'); var cheerio = require('cheerio'); var popsicle = require('popsicle'); var utils = require('./utils' /** * Scrape metadata from `html`. * * @param {String} html * @param {Object} rules (optional) * @return {Promise} metadata */ );function scrapeHtml(html, rules, url, headers) { var sourceUrl = ''; if (url) { sourceUrl = url; } return scrapeMetadata(html, sourceUrl, rules, headers); } /** * Scrape metadata from `url`. * * @param {String} url * @param {Object} rules (optional) * @return {Promise} metadata */ function scrapeUrl(url, rules) { var request = popsicle.request({ url: url, headers: { 'User-Agent': 'ScrapeMeta' }, options: { jar: process.browser ? null : popsicle.jar() } }); return request.then(function (res) { return scrapeMetadata(res.body, url, rules, res.headers); }); } /** * Scrape metadata from `window`. * * @param {Window} window * @param {Object} rules (optional) * @return {Promise} metadata */ function scrapeWindow(window, rules) { var html = window.document.documentElement.outerHTML; var url = window.location.href; return scrapeMetadata(html, url, rules); } /** * Scrape each entry in the metadata result dictionary in parallel. * * @param {String} html * @param {String} url * @param {Object} rules (optional) * @return {Promise} metadata */ function scrapeMetadata(html, url, rules, headers) { rules = rules || RULES; var keys = Object.keys(rules); var $ = cheerio.load(html); var metadata = {}; var promises = keys.map(function (key) { return scrapeMetadatum($, url, rules[key]); }); if (headers && (typeof headers === 'undefined' ? 
'undefined' : _typeof(headers)) === 'object') { metadata.contentType = utils.getContentType(headers); } return Promise.all(promises).then(function (values) { return keys.reduce(function (memo, key, i) { memo[key] = values[i]; return memo; }, metadata); }); } /** * Scrape the first non-null value returned by an array of `rules` functions for * a single property in the metadata result dictionary. * * @param {Cheerio} $ * @param {String} url * @param {Array or Function} rules * @return {Promise} value */ function scrapeMetadatum($, url, rules) { if (!Array.isArray(rules)) rules = [rules]; return rules.reduce(function (promise, rule) { return promise.then(function (value) { if (value != null && value !== '') return value; var next = rule($, url); if (next != null && next !== '') return next; return null; }); }, Promise.resolve()); } /** * Export. */ module.exports = { RULES: RULES, scrapeHtml: scrapeHtml, scrapeUrl: scrapeUrl, scrapeWindow: scrapeWindow };