scrape-meta
Version:
A library to easily scrape metadata from an article on the web using Open Graph metadata, regular HTML metadata, and a series of fallbacks.
79 lines (68 loc) • 2.04 kB
JavaScript
;
var isUrl = require('is-url');
var toTitle = require('to-title-case');

/**
 * Wrap a rule with validation and formatting logic.
 *
 * The wrapped rule returns `undefined` when the raw value is not a string,
 * is a URL according to `isUrl`, starts with "www.", or contains a "|".
 * Otherwise runs of whitespace are collapsed to a single space, the ends are
 * trimmed, a leading "by" word is removed, and the result is passed through
 * `toTitle`.
 *
 * @param {Function} rule - called with `$`; its return value is the raw candidate
 * @return {Function} wrapped - takes `$`, returns the formatted string or `undefined`
 */
function wrap(rule) {
  return function ($) {
    var value = rule($);

    if (typeof value !== 'string') return;
    if (isUrl(value)) return;
    if (value.indexOf('www.') === 0) return;
    if (value.includes('|')) return;

    // collapse extra whitespace (newlines included) and trim both ends
    value = value.replace(/\s+/g, ' ').trim();

    // remove a leading "by", but only when it is a whole word: without the
    // word boundary, names that merely begin with those letters ("Byron")
    // would lose their first two characters
    value = value.replace(/^by\b\s*/i, '');

    // make it title case, since some sites have it in weird casing
    return toTitle(value);
  };
}
/**
 * Enforce stricter matching for a `rule`.
 *
 * The returned function runs `rule` and passes its value through only when
 * the value begins with two whitespace-separated tokens; otherwise it yields
 * `undefined`.
 *
 * @param {Function} rule
 * @return {Function} stricter
 */
function strict(rule) {
  // non-whitespace, whitespace, non-whitespace — anchored at the very start
  var twoTokens = /^\S+\s+\S+/;

  return function ($) {
    var candidate = rule($);
    return twoTokens.test(candidate) ? candidate : undefined;
  };
}
/**
* Rules.
*/
module.exports = [wrap(function ($) {
return $('meta[property="article:author"]').attr('content');
}), wrap(function ($) {
return $('meta[name="author"]').attr('content');
}), wrap(function ($) {
return $('meta[name="sailthru.author"]').attr('content');
}), wrap(function ($) {
return $('[rel="author"]').first().text();
}), wrap(function ($) {
return $('[itemprop*="author"] [itemprop="name"]').first().text();
}), wrap(function ($) {
return $('[itemprop*="author"]').first().text();
}), wrap(function ($) {
return $('meta[property="book:author"]').attr('content');
}), strict(wrap(function ($) {
return $('a[class*="author"]').first().text();
})), strict(wrap(function ($) {
return $('[class*="author"] a').first().text();
})), strict(wrap(function ($) {
return $('[class*="author"]').first().text();
})), strict(wrap(function ($) {
return $('[class*="byline"]').text();
})), strict(wrap(function ($) {
return $('a[href*="/author/"]').text();
}))];