scrape-meta
Version:
A library to easily scrape metadata from an article on the web using Open Graph metadata, regular HTML metadata, and a series of fallbacks.
109 lines (99 loc) • 3.36 kB
JavaScript
;
var isIso = require('is-isodate');
var chrono = require('chrono-node');

/**
 * Wrap a rule with validation and formatting logic.
 *
 * The returned function calls `rule` with the same `($, url)` arguments it
 * receives and normalizes whatever the rule produced into an ISO 8601 string.
 * It returns `undefined` when the rule yields nothing, or when none of the
 * parsing strategies below produces a valid date.
 *
 * @param {Function} rule called as `rule($, url)`; expected to return a
 *   string or a falsy value
 * @return {Function} wrapped
 */
function wrap(rule) {
  return function ($, url) {
    // forward `url` as well: at least one rule reads the date from the URL
    var value = rule($, url);
    if (!value) return;

    // remove whitespace for easier parsing
    value = value.trim();

    // restringify isodates, because sometimes they are truncated.
    // The Date constructor may still reject a string that `isIso` accepted,
    // and toISOString() throws a RangeError on an Invalid Date, so only
    // return here when the date is valid; otherwise keep trying below.
    if (isIso(value)) {
      var iso = new Date(value);
      if (!isNaN(iso.getTime())) return iso.toISOString();
    }

    // parse number strings as milliseconds
    if (/^[0-9]+$/.test(value)) {
      var ms = parseInt(value, 10);
      var date = new Date(ms);
      // a digit string beyond the representable time range is not a date
      if (isNaN(date.getTime())) return;
      return date.toISOString();
    }

    // try to parse with the built-in date parser
    var native = new Date(value);
    if (!isNaN(native.getTime())) return native.toISOString();

    // try to parse a complex date string
    var parsed = chrono.parseDate(value);
    if (parsed) return parsed.toISOString();
  };
}
/**
* Rules.
*/
module.exports = [wrap(function ($) {
return $('meta[property="article:published_time"]').attr('content');
}), wrap(function ($) {
return $('meta[name="dc.date"]').attr('content');
}), wrap(function ($) {
return $('meta[name="DC.date"]').attr('content');
}), wrap(function ($) {
return $('meta[name="dc.date.issued"]').attr('content');
}), wrap(function ($) {
return $('meta[name="DC.date.issued"]').attr('content');
}), wrap(function ($) {
return $('meta[name="dc.date.created"]').attr('content');
}), wrap(function ($) {
return $('meta[name="DC.date.created"]').attr('content');
}), wrap(function ($) {
return $('meta[name="DC.Date"]').attr('content');
}), wrap(function ($) {
return $('meta[name="date"]').attr('content');
}), wrap(function ($) {
return $('meta[name="dcterms.date"]').attr('content');
}), wrap(function ($) {
return $('[itemprop="datePublished"]').attr('content');
}), wrap(function ($) {
return $('time[itemprop*="pubDate"]').attr('datetime');
}), wrap(function ($) {
return $('time[itemprop*="pubdate"]').attr('datetime');
}), wrap(function ($) {
return $('[property*="dc:date"]').attr('content');
}), wrap(function ($) {
return $('[property*="dc:created"]').attr('content');
}), wrap(function ($) {
return $('time[datetime][pubdate]').attr('datetime');
}), wrap(function ($) {
return $('meta[name="sailthru.date"]').attr('content');
}), wrap(function ($) {
return $('meta[property="book:release_date"]').attr('content');
}), wrap(function ($) {
return $('time[datetime]').attr('datetime');
}), wrap(function ($) {
return $('[class*="byline"]').text();
}), wrap(function ($) {
return $('[class*="dateline"]').text();
}), wrap(function ($) {
return $('[class*="date"]').text();
}), wrap(function ($) {
return $('[id*="date"]').text();
}), wrap(function ($) {
return $('[class*="post-meta"]').text();
}), wrap(function ($, url) {
var regexp = /(\d{4}[\-\/]\d{2}[\-\/]\d{2})/;
var match = regexp.exec(url);
if (!match) return;
var string = match[1];
var date = new Date(string);
return date.toISOString();
}), wrap(function ($) {
var text = $('[class*="byline"]').text();
if (!text) return;
var regexp = /(\w+ \d{2},? \d{4})/;
var match = regexp.exec(text);
if (!match) return;
var string = match[1];
var date = new Date(string);
return date.toISOString();
})];