UNPKG

scrape-meta

Version:

A library to easily scrape metadata from an article on the web using Open Graph metadata, regular HTML metadata, and a series of fallbacks.

79 lines (68 loc) 2.04 kB
'use strict'; var isUrl = require('is-url'); var toTitle = require('to-title-case' /** * Wrap a rule with validation and formatting logic. * * @param {Function} rule * @return {Function} wrapped */ );function wrap(rule) { return function ($) { var value = rule($); if (typeof value != 'string') return; if (isUrl(value)) return; if (value.indexOf('www.') === 0) return; if (value.includes('|')) return; // trim extra whitespace value = value.replace(/\s+/g, ' '); value = value.trim // remove any extra "by" in the start of the string ();value = value.replace(/^[\s\n]*by[\s\n]*/im, '' // make it title case, since some sites have it in weird casing );value = toTitle(value); return value; }; } /** * Enforce stricter matching for a `rule`. * * @param {Function} rule * @return {Function} stricter */ function strict(rule) { return function ($) { var value = rule($); var regexp = /^\S+\s+\S+/; if (!regexp.test(value)) return; return value; }; } /** * Rules. */ module.exports = [wrap(function ($) { return $('meta[property="article:author"]').attr('content'); }), wrap(function ($) { return $('meta[name="author"]').attr('content'); }), wrap(function ($) { return $('meta[name="sailthru.author"]').attr('content'); }), wrap(function ($) { return $('[rel="author"]').first().text(); }), wrap(function ($) { return $('[itemprop*="author"] [itemprop="name"]').first().text(); }), wrap(function ($) { return $('[itemprop*="author"]').first().text(); }), wrap(function ($) { return $('meta[property="book:author"]').attr('content'); }), strict(wrap(function ($) { return $('a[class*="author"]').first().text(); })), strict(wrap(function ($) { return $('[class*="author"] a').first().text(); })), strict(wrap(function ($) { return $('[class*="author"]').first().text(); })), strict(wrap(function ($) { return $('[class*="byline"]').text(); })), strict(wrap(function ($) { return $('a[href*="/author/"]').text(); }))];