scrape-meta
Version:
A library to easily scrape metadata from an article on the web using Open Graph metadata, regular HTML metadata, and a series of fallbacks.
128 lines (108 loc) • 3.06 kB
JavaScript
;
/**
 * `typeof` replacement that also reports "symbol" for Symbol values when the
 * runtime's `Symbol` is not a native one.
 *
 * When `Symbol` is a function and `Symbol.iterator` is a real symbol, the
 * plain `typeof` operator is used. Otherwise any truthy value whose
 * constructor is `Symbol` (excluding `Symbol.prototype` itself) is reported
 * as "symbol", and everything else falls back to `typeof`.
 */
var _typeof;
if (typeof Symbol === "function" && typeof Symbol.iterator === "symbol") {
  _typeof = function (value) {
    return typeof value;
  };
} else {
  _typeof = function (value) {
    if (value && typeof Symbol === "function" && value.constructor === Symbol && value !== Symbol.prototype) {
      return "symbol";
    }
    return typeof value;
  };
}
var RULES = require('./rules');
var cheerio = require('cheerio');
var popsicle = require('popsicle');
var utils = require('./utils');

/**
 * Scrape metadata from an `html` string.
 *
 * @param {String} html
 * @param {Object} rules (optional)
 * @param {String} url (optional) passed on as the source URL; an empty
 *   string is used instead when the value is falsy
 * @param {Object} headers (optional) forwarded unchanged to `scrapeMetadata`
 * @return {Promise} metadata
 */
function scrapeHtml(html, rules, url, headers) {
  return scrapeMetadata(html, url || '', rules, headers);
}
/**
 * Request `url` with popsicle and scrape metadata from the response.
 *
 * The response body, the requested `url`, `rules` and the response headers
 * are handed to `scrapeMetadata`.
 *
 * @param {String} url
 * @param {Object} rules (optional)
 * @return {Promise} metadata
 */
function scrapeUrl(url, rules) {
  var requestOptions = {
    url: url,
    headers: {
      'User-Agent': 'ScrapeMeta'
    },
    options: {
      // A cookie jar is only created when `process.browser` is falsy.
      jar: process.browser ? null : popsicle.jar()
    }
  };

  function onResponse(res) {
    return scrapeMetadata(res.body, url, rules, res.headers);
  }

  return popsicle.request(requestOptions).then(onResponse);
}
/**
 * Scrape metadata from a `window` object.
 *
 * Uses the outer HTML of the document element as the markup and
 * `window.location.href` as the source URL.
 *
 * @param {Window} window
 * @param {Object} rules (optional)
 * @return {Promise} metadata
 */
function scrapeWindow(window, rules) {
  return scrapeMetadata(
    window.document.documentElement.outerHTML,
    window.location.href,
    rules
  );
}
/**
 * Scrape every entry of the metadata result dictionary in parallel.
 *
 * Each key of `rules` (or of the default `RULES` when `rules` is falsy) is
 * resolved through `scrapeMetadatum`, and the results are stored under the
 * same key. When `headers` is a truthy object, a `contentType` entry taken
 * from `utils.getContentType(headers)` is added before the rule results.
 *
 * @param {String} html
 * @param {String} url
 * @param {Object} rules (optional)
 * @param {Object} headers (optional)
 * @return {Promise} metadata
 */
function scrapeMetadata(html, url, rules, headers) {
  var activeRules = rules || RULES;
  var names = Object.keys(activeRules);
  var $ = cheerio.load(html);
  var result = {};

  // Kick off every lookup before waiting on any of them.
  var pending = [];
  for (var i = 0; i < names.length; i++) {
    pending.push(scrapeMetadatum($, url, activeRules[names[i]]));
  }

  if (headers && _typeof(headers) === 'object') {
    result.contentType = utils.getContentType(headers);
  }

  return Promise.all(pending).then(function (values) {
    names.forEach(function (name, index) {
      result[name] = values[index];
    });
    return result;
  });
}
/**
 * Scrape the first usable value returned by an array of `rules` functions for
 * a single property in the metadata result dictionary.
 *
 * Rules are tried one after another, each called as `rule($, url)`. A rule may
 * return its value directly or return a promise for it. A value is "usable"
 * when it is neither `null`/`undefined` nor the empty string; once one is
 * found, the remaining rules are not called. If no rule yields a usable value
 * the promise resolves to `null`.
 *
 * @param {Cheerio} $
 * @param {String} url
 * @param {Array or Function} rules
 * @return {Promise} value
 */
function scrapeMetadatum($, url, rules) {
  var ruleList = Array.isArray(rules) ? rules : [rules];

  // Collapse "nothing found" results (null, undefined, '') into null.
  function normalize(value) {
    return value != null && value !== '' ? value : null;
  }

  return ruleList.reduce(function (chain, rule) {
    return chain.then(function (found) {
      if (found != null) return found;
      // Normalize the *resolved* value so a rule that returns a promise is
      // held to the same "usable" test as one that returns synchronously.
      return Promise.resolve(rule($, url)).then(normalize);
    });
  }, Promise.resolve());
}
/**
 * Public API: the default rule set plus the three scraping entry points.
 */
var api = {};
api.RULES = RULES;
api.scrapeHtml = scrapeHtml;
api.scrapeUrl = scrapeUrl;
api.scrapeWindow = scrapeWindow;
module.exports = api;