htmlmetaparser
Version:
A `htmlparser2` handler for parsing rich metadata from HTML. Includes HTML metadata, JSON-LD, RDFa, microdata, OEmbed, Twitter cards and AppLinks.
561 lines • 23.7 kB
JavaScript
"use strict";
var setvalue_1 = require('setvalue');
var url_1 = require('url');
var oembed_1 = require('./oembed');
var providers = new oembed_1.OEmbedProviders(require('../vendor/providers.json'));
// Basic Multilingual Plane ranges allowed as the first character of a prefix
// name. NOTE(review): the ranges look like the XML `NameStartChar` production
// minus ':' — confirm against the spec before extending.
var RDF_VALID_NAME_START_CHAR_RANGE = 'A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6' +
    '\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' +
    '\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD';
// U+10000..U+EFFFF written as a UTF-16 surrogate pair. A `\uXXXX` escape only
// consumes four hex digits, so the former '\u10000-\uEFFFF' was read as
// `\u1000`, the range `0-\uEFFF` and a literal `F`, which let digits, ':' and
// most other characters through. Without the `u` flag the regexp engine works
// on code units, hence the explicit high/low surrogate classes.
var RDF_VALID_NAME_ASTRAL_CHAR = '[\uD800-\uDB7F][\uDC00-\uDFFF]';
// Matches exactly one valid name-start character.
var RDF_NAME_START_CHAR_REGEXP = new RegExp('^(?:[' + RDF_VALID_NAME_START_CHAR_RANGE + ']|' +
    RDF_VALID_NAME_ASTRAL_CHAR + ')$');
// Matches zero or more valid name characters (start characters plus '-', '.',
// digits, U+00B7, U+0300-U+036F and U+203F-U+2040).
var RDF_NAME_CHAR_REGEXP = new RegExp('^(?:[' + RDF_VALID_NAME_START_CHAR_RANGE +
    '\\-\\.0-9\u00B7\u0300-\u036F\u203F-\u2040]|' + RDF_VALID_NAME_ASTRAL_CHAR + ')*$');
/**
 * Built-in prefix -> vocabulary IRI table.
 *
 * `Handler#_normalizeRdfaProperty` consults this table when a prefixed RDFa
 * term uses a prefix that is not yet present in the collected RDFa
 * `@context`; a matching entry is then copied into that context.
 *
 * NOTE(review): the entries look like the RDFa Core initial context plus the
 * Open Graph namespaces — confirm before adding or removing prefixes.
 */
exports.KNOWN_VOCABULARIES = {
csvw: 'http://www.w3.org/ns/csvw#',
dcat: 'http://www.w3.org/ns/dcat#',
qb: 'http://purl.org/linked-data/cube#',
grddl: 'http://www.w3.org/2003/g/data-view#',
ma: 'http://www.w3.org/ns/ma-ont#',
org: 'http://www.w3.org/ns/org#',
owl: 'http://www.w3.org/2002/07/owl#',
prov: 'http://www.w3.org/ns/prov#',
rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
rdfa: 'http://www.w3.org/ns/rdfa#',
rdfs: 'http://www.w3.org/2000/01/rdf-schema#',
rif: 'http://www.w3.org/2007/rif#',
rr: 'http://www.w3.org/ns/r2rml#',
sd: 'http://www.w3.org/ns/sparql-service-description#',
skos: 'http://www.w3.org/2004/02/skos/core#',
skosxl: 'http://www.w3.org/2008/05/skos-xl#',
wdr: 'http://www.w3.org/2007/05/powder#',
void: 'http://rdfs.org/ns/void#',
wdrs: 'http://www.w3.org/2007/05/powder-s#',
xhv: 'http://www.w3.org/1999/xhtml/vocab#',
xml: 'http://www.w3.org/XML/1998/namespace',
xsd: 'http://www.w3.org/2001/XMLSchema#',
cc: 'https://creativecommons.org/ns#',
ctag: 'http://commontag.org/ns#',
// `dc` and `dcterms` intentionally resolve to the same IRI.
dc: 'http://purl.org/dc/terms/',
dcterms: 'http://purl.org/dc/terms/',
dc11: 'http://purl.org/dc/elements/1.1/',
foaf: 'http://xmlns.com/foaf/0.1/',
gr: 'http://purl.org/goodrelations/v1#',
ical: 'http://www.w3.org/2002/12/cal/icaltzd#',
og: 'http://ogp.me/ns#',
rev: 'http://purl.org/stuff/rev#',
sioc: 'http://rdfs.org/sioc/ns#',
v: 'http://rdf.data-vocabulary.org/#',
vcard: 'http://www.w3.org/2006/vcard/ns#',
schema: 'http://schema.org/',
music: 'http://ogp.me/ns/music#',
video: 'http://ogp.me/ns/video#',
article: 'http://ogp.me/ns/article#',
book: 'http://ogp.me/ns/book#',
profile: 'http://ogp.me/ns/profile#',
website: 'http://ogp.me/ns/website#',
fb: 'http://ogp.me/ns/fb#'
};
exports.HTML_VALUE_MAP = {
meta: function (baseUrl, attrs) {
return attrs.content;
},
audio: function (baseUrl, attrs) {
return attrs.src ? url_1.resolve(baseUrl, attrs.src) : undefined;
},
a: function (baseUrl, attrs) {
return attrs.href ? url_1.resolve(baseUrl, attrs.href) : undefined;
},
object: function (baseUrl, attrs) {
return attrs.data ? url_1.resolve(baseUrl, attrs.data) : undefined;
},
time: function (baseUrl, attrs) {
return attrs.datetime;
},
data: function (baseUrl, attrs) {
return attrs.value;
}
};
exports.HTML_VALUE_MAP['embed'] = exports.HTML_VALUE_MAP['audio'];
exports.HTML_VALUE_MAP['iframe'] = exports.HTML_VALUE_MAP['audio'];
exports.HTML_VALUE_MAP['img'] = exports.HTML_VALUE_MAP['audio'];
exports.HTML_VALUE_MAP['source'] = exports.HTML_VALUE_MAP['audio'];
exports.HTML_VALUE_MAP['track'] = exports.HTML_VALUE_MAP['audio'];
exports.HTML_VALUE_MAP['video'] = exports.HTML_VALUE_MAP['audio'];
exports.HTML_VALUE_MAP['area'] = exports.HTML_VALUE_MAP['a'];
exports.HTML_VALUE_MAP['link'] = exports.HTML_VALUE_MAP['a'];
exports.HTML_VALUE_MAP['meter'] = exports.HTML_VALUE_MAP['data'];
exports.HandlerFlags = {
hasLang: (1 << 0),
rdfaLink: (1 << 1),
rdfaNode: (1 << 2),
rdfaVocab: (1 << 3),
microdataNode: (1 << 4),
microdataVocab: (1 << 5),
microdataScope: (1 << 6)
};
/**
 * Parser event handler that collects page metadata into `this.result`.
 *
 * The handler exposes the callback methods (`onopentagname`, `onopentag`,
 * `ontext`, `onclosetag`, `onend`, `onerror`) and fills `result` with:
 * `alternate` (always an array), and — only when found — `html`, `twitter`,
 * `applinks`, `dublincore`, `sailthru`, `jsonld`, `rdfa` and `microdata`.
 *
 * @param {function(Error|null, Object)} callback Invoked once from `onend`
 *   (error `null`) or from `onerror` (with the error); the second argument is
 *   the result collected so far.
 * @param {{url: string}} options `url` is used to resolve relative links and
 *   to look up an oEmbed provider.
 */
var Handler = (function () {
    var objectHasOwnProperty = Object.prototype.hasOwnProperty;
    // Own-property check that does not depend on `obj.hasOwnProperty`. Several
    // maps below are keyed by page-controlled strings (element ids, RDFa
    // prefixes), so a key named "hasOwnProperty" would otherwise shadow the
    // method and make the next lookup throw.
    function hasOwn(obj, key) {
        return objectHasOwnProperty.call(obj, key);
    }
    function Handler(callback, options) {
        this.callback = callback;
        this.options = options;
        this.result = { alternate: [] };
        // One entry per open element; index 0 is a synthetic root context.
        this.contexts = [{ tagName: '', text: '', flags: 0 }];
        this.langs = [];
        this._rdfa = {};
        this._rdfaNodes = [{}];
        this._rdfaVocabs = [];
        this._rdfaRels = [];
        this._microdata = {};
        // Element id -> properties seen under that id (or the item that
        // claimed the id through `itemref`).
        this._microdataRefs = {};
        this._microdataScopes = [[]];
        this._microdataNodes = [{}];
    }
    /**
     * End of document: appends the oEmbed provider matching `options.url`
     * (unless an alternate of the same type already exists) and reports the
     * result through the callback.
     */
    Handler.prototype.onend = function () {
        var oembedProvider = providers.match(this.options.url);
        if (oembedProvider && !this.result.alternate.some(function (x) { return x.type === oembedProvider.type; })) {
            this.result.alternate.push(oembedProvider);
        }
        this.callback(null, this.result);
    };
    /** Forwards a parser error together with the partial result. */
    Handler.prototype.onerror = function (error) {
        this.callback(error, this.result);
    };
    /** Opens a fresh context for the element whose tag name was just read. */
    Handler.prototype.onopentagname = function (tagName) {
        this.contexts.push({ tagName: tagName, text: '', flags: 0 });
    };
    /**
     * Processes the attributes of the element opened by `onopentagname`.
     *
     * Handles, in this order: `lang`, `id`, microdata (`itemscope`,
     * `itemprop`, `itemid`, `itemtype`), RDFa (`vocab`, `prefix`, `rel`,
     * `about`, `property`, `resource`, `typeof`) and finally the plain HTML
     * `<meta>`, `<link>` and `<script>` elements. The order matters because
     * later steps read the stacks and flags updated by earlier ones.
     *
     * @param {string} tagName Tag name of the element.
     * @param {Object<string, string>} attributes Attribute name -> value.
     */
    Handler.prototype.onopentag = function (tagName, attributes) {
        var context = last(this.contexts);
        var relAttr = normalize(attributes['rel']);
        var srcAttr = normalize(attributes['src']);
        var hrefAttr = normalize(attributes['href']);
        var langAttr = normalize(attributes['lang']);
        var propertyAttr = normalize(attributes['property']);
        var vocabAttr = normalize(attributes['vocab']);
        var prefixAttr = normalize(attributes['prefix']);
        var resourceAttr = normalize(attributes['resource']);
        var typeOfAttr = normalize(attributes['typeof']);
        var aboutAttr = normalize(attributes['about']);
        var idAttr = normalize(attributes['id']);
        var itempropAttr = normalize(attributes['itemprop']);
        var itemidAttr = normalize(attributes['itemid']);
        var itemtypeAttr = normalize(attributes['itemtype']);
        var itemrefAttr = normalize(attributes['itemref']);
        // Language scope.
        if (langAttr) {
            this.langs.push(langAttr);
            context.flags = context.flags | exports.HandlerFlags.hasLang;
        }
        // Every id gets a reference bucket so a later `itemref` can pick up
        // the properties declared under it.
        if (idAttr) {
            context.id = idAttr;
            if (!hasOwn(this._microdataRefs, idAttr)) {
                this._microdataRefs[idAttr] = {};
            }
        }
        // Microdata item.
        if (hasOwn(attributes, 'itemscope')) {
            var newNode = {};
            if (itemrefAttr) {
                var refs = split(itemrefAttr);
                for (var r = 0; r < refs.length; r++) {
                    var ref = refs[r];
                    // Copy what was already collected under the id, then
                    // redirect the id to this item for later properties.
                    if (hasOwn(this._microdataRefs, ref) && this._microdataRefs[ref] != null) {
                        assignJsonldProperties(newNode, this._microdataRefs[ref]);
                    }
                    this._microdataRefs[ref] = newNode;
                }
            }
            if (itempropAttr) {
                // Nested item: it is the value of a property of the parent.
                this._addMicrodataProperty(last(this._microdataNodes), context.id, split(itempropAttr), newNode);
            }
            else {
                // Top-level item: goes straight into the graph and starts a
                // new vocabulary scope.
                this.result.microdata = this._microdata;
                pushToGraph(this._microdata, newNode);
                this._microdataScopes.push([]);
                context.flags = context.flags | exports.HandlerFlags.microdataScope;
            }
            this._microdataNodes.push(newNode);
            context.flags = context.flags | exports.HandlerFlags.microdataNode;
        }
        // Microdata property on a non-item element.
        if (itempropAttr && !(context.flags & exports.HandlerFlags.microdataNode)) {
            var value = getValueMap(this.options.url, tagName, attributes);
            var props = split(itempropAttr);
            if (value != null) {
                this._addMicrodataProperty(last(this._microdataNodes), context.id, props, simplifyJsonLdValue({
                    '@value': value,
                    '@language': last(this.langs)
                }));
            }
            else {
                // No attribute value: the text content is used on close.
                context.microdataTextProperty = props;
            }
        }
        if (itemidAttr) {
            this._setMicrodataProperty(last(this._microdataNodes), context.id, '@id', itemidAttr);
        }
        if (itemtypeAttr) {
            var _a = splitItemtype(itemtypeAttr), vocab = _a[0], type = _a[1];
            var vocabs = last(this._microdataScopes);
            if (type && vocab !== last(vocabs)) {
                setContext(last(this._microdataNodes), '@vocab', vocab);
                vocabs.push(vocab);
                context.flags = context.flags | exports.HandlerFlags.microdataVocab;
            }
            this._addMicrodataProperty(last(this._microdataNodes), context.id, '@type', type || itemtypeAttr);
        }
        // RDFa vocabulary.
        if (vocabAttr) {
            setContext(last(this._rdfaNodes), '@vocab', vocabAttr);
            this._rdfaVocabs.push(vocabAttr);
            context.flags = context.flags | exports.HandlerFlags.rdfaVocab;
        }
        // RDFa prefix declarations: whitespace separated `name: IRI` pairs.
        if (prefixAttr) {
            var parts = split(prefixAttr);
            // `i + 1 < length` skips a trailing name that has no IRI, which
            // previously registered the prefix with an `undefined` value.
            for (var i = 0; i + 1 < parts.length; i += 2) {
                var prefixToken = parts[i];
                var prefixIri = parts[i + 1];
                var prefix = prefixToken.slice(0, -1);
                if (prefixToken.charAt(prefixToken.length - 1) !== ':' || !isValidName(prefix)) {
                    continue;
                }
                setContext(this._rdfa, prefix, prefixIri);
            }
        }
        // RDFa `rel`: remembered until an element supplies its object.
        if (relAttr) {
            var links = this._normalizeRdfaProperty(relAttr);
            if (links.length) {
                this._rdfaRels.push({ links: links, used: false });
                context.flags = context.flags | exports.HandlerFlags.rdfaLink;
            }
        }
        if (this._rdfaRels.length) {
            var rel = last(this._rdfaRels);
            if (!rel.used) {
                var validRelId = resourceAttr || hrefAttr || srcAttr;
                if (validRelId) {
                    var newNode = { '@id': validRelId };
                    rel.used = true;
                    this._addRdfaProperty(last(this._rdfaNodes), rel.links, newNode);
                    if (resourceAttr && !(context.flags & exports.HandlerFlags.rdfaNode)) {
                        this._rdfaNodes.push(newNode);
                        context.flags = context.flags | exports.HandlerFlags.rdfaNode;
                    }
                }
                // Chaining: a descendant without its own `rel` but with
                // `property`/`typeof` becomes the object of the pending rel.
                if (!(context.flags & exports.HandlerFlags.rdfaLink) && (propertyAttr || typeOfAttr)) {
                    rel.used = true;
                    if (!(context.flags & exports.HandlerFlags.rdfaNode)) {
                        var newNode = {};
                        // Link from the enclosing node BEFORE pushing the new
                        // one; linking after the push attached the node to
                        // itself and produced a circular structure.
                        this._addRdfaProperty(last(this._rdfaNodes), rel.links, newNode);
                        this._rdfaNodes.push(newNode);
                        context.flags = context.flags | exports.HandlerFlags.rdfaNode;
                    }
                }
            }
        }
        if (aboutAttr) {
            this._rdfaNodes.push(this._createRdfaResource(aboutAttr));
            context.flags = context.flags | exports.HandlerFlags.rdfaNode;
        }
        if (propertyAttr) {
            var value = getValueMap(this.options.url, tagName, attributes);
            var properties = this._normalizeRdfaProperty(propertyAttr);
            if (value != null) {
                this._addRdfaProperty(last(this._rdfaNodes), properties, simplifyJsonLdValue({
                    '@value': value,
                    '@language': last(this.langs),
                    '@type': normalize(attributes['datatype'])
                }));
            }
            else {
                if ((typeOfAttr || resourceAttr) && !(context.flags & exports.HandlerFlags.rdfaLink)) {
                    var newNode = { '@id': resourceAttr };
                    this._addRdfaProperty(last(this._rdfaNodes), properties, newNode);
                    if (typeOfAttr && !(context.flags & exports.HandlerFlags.rdfaNode)) {
                        this._rdfaNodes.push(newNode);
                        context.flags = context.flags | exports.HandlerFlags.rdfaNode;
                    }
                }
                else {
                    // No attribute value: the text content is used on close.
                    context.rdfaTextProperty = properties;
                }
            }
        }
        if (resourceAttr && !propertyAttr && !relAttr && !aboutAttr) {
            this._rdfaNodes.push(this._createRdfaResource(resourceAttr));
            context.flags = context.flags | exports.HandlerFlags.rdfaNode;
        }
        if (typeOfAttr) {
            if (!this._rdfaRels.length && !propertyAttr && !relAttr && !resourceAttr && !aboutAttr) {
                this._rdfaNodes.push(this._createRdfaResource());
                context.flags = context.flags | exports.HandlerFlags.rdfaNode;
            }
            this._addRdfaProperty(last(this._rdfaNodes), '@type', split(typeOfAttr));
        }
        // Plain HTML metadata.
        if (tagName === 'meta') {
            var nameAttr = normalize(attributes['name']);
            var contentAttr = normalize(attributes['content']);
            if (propertyAttr && contentAttr) {
                if (/^twitter:/.test(propertyAttr)) {
                    setvalue_1.set(this.result, ['twitter', propertyAttr.substr(8)], contentAttr);
                }
                else if (/^al:/.test(propertyAttr)) {
                    setvalue_1.set(this.result, ['applinks', propertyAttr.substr(3)], contentAttr);
                }
            }
            if (nameAttr && contentAttr) {
                var metaName = nameAttr.toLowerCase();
                if (/^twitter:/.test(metaName)) {
                    setvalue_1.set(this.result, ['twitter', metaName.substr(8)], contentAttr);
                }
                else if (/^dc\./.test(metaName)) {
                    setvalue_1.set(this.result, ['dublincore', metaName.substr(3)], contentAttr);
                }
                else if (/^dcterms\./.test(metaName)) {
                    setvalue_1.set(this.result, ['dublincore', metaName.substr(8)], contentAttr);
                }
                else if (/^sailthru\./.test(metaName)) {
                    setvalue_1.set(this.result, ['sailthru', metaName.substr(9)], contentAttr);
                }
                else if (metaName === 'date' ||
                    metaName === 'keywords' ||
                    metaName === 'author' ||
                    metaName === 'description' ||
                    metaName === 'language' ||
                    metaName === 'generator' ||
                    metaName === 'creator' ||
                    metaName === 'publisher' ||
                    metaName === 'robots' ||
                    metaName === 'viewport' ||
                    metaName === 'application-name' ||
                    metaName === 'apple-mobile-web-app-title') {
                    setvalue_1.set(this.result, ['html', metaName], contentAttr);
                }
            }
        }
        if (tagName === 'link') {
            if (relAttr && hrefAttr) {
                var rels = split(relAttr);
                for (var l = 0; l < rels.length; l++) {
                    var linkRel = rels[l];
                    if (linkRel === 'canonical' || linkRel === 'amphtml') {
                        setvalue_1.set(this.result, ['html', linkRel], url_1.resolve(this.options.url, hrefAttr));
                    }
                    else if (linkRel === 'alternate') {
                        var typeAttr = normalize(attributes['type']);
                        var mediaAttr = normalize(attributes['media']);
                        var hreflangAttr = normalize(attributes['hreflang']);
                        // A bare `rel="alternate"` without any qualifier is
                        // not recorded.
                        if (typeAttr || mediaAttr || hreflangAttr) {
                            this.result.alternate.push({
                                type: typeAttr || 'text/html',
                                media: mediaAttr,
                                hreflang: hreflangAttr,
                                title: normalize(attributes['title']),
                                href: url_1.resolve(this.options.url, hrefAttr)
                            });
                        }
                    }
                    else if (linkRel === 'meta') {
                        this.result.alternate.push({
                            type: normalize(attributes['type']) || 'application/rdf+xml',
                            href: url_1.resolve(this.options.url, hrefAttr)
                        });
                    }
                }
            }
        }
        if (tagName === 'script') {
            context.scriptType = normalize(attributes['type']);
        }
    };
    /** Appends character data to the innermost open element's text buffer. */
    Handler.prototype.ontext = function (value) {
        last(this.contexts).text += value;
    };
    /**
     * Closes the innermost element: unwinds every stack it pushed onto,
     * parses JSON-LD scripts, bubbles its text up to the parent and emits any
     * text-valued RDFa/microdata property as well as the document title.
     */
    Handler.prototype.onclosetag = function () {
        var prevContext = this.contexts.pop();
        var currentContext = last(this.contexts);
        var text = normalize(prevContext.text);
        if (prevContext.flags) {
            if (prevContext.flags & exports.HandlerFlags.microdataNode) {
                this._microdataNodes.pop();
            }
            if (prevContext.flags & exports.HandlerFlags.microdataVocab) {
                last(this._microdataScopes).pop();
            }
            if (prevContext.flags & exports.HandlerFlags.microdataScope) {
                this._microdataScopes.pop();
            }
            if (prevContext.flags & exports.HandlerFlags.rdfaNode) {
                this._rdfaNodes.pop();
            }
            if (prevContext.flags & exports.HandlerFlags.rdfaVocab) {
                this._rdfaVocabs.pop();
            }
            if (prevContext.flags & exports.HandlerFlags.rdfaLink) {
                this._rdfaRels.pop();
            }
            if (prevContext.flags & exports.HandlerFlags.hasLang) {
                this.langs.pop();
            }
        }
        // Typed scripts never contribute text to their parent.
        if (prevContext.scriptType) {
            if (prevContext.scriptType === 'application/ld+json') {
                try {
                    var jsonld = JSON.parse(prevContext.text);
                    // The first block is stored as-is; further blocks are
                    // appended instead of overwriting the earlier ones.
                    this.result.jsonld = hasOwn(this.result, 'jsonld')
                        ? [].concat(this.result.jsonld, jsonld)
                        : jsonld;
                }
                catch (e) {
                    // Best effort: a malformed JSON-LD block is skipped so the
                    // rest of the page can still be processed.
                }
            }
            return;
        }
        currentContext.text += prevContext.text;
        if (text) {
            var schemaValue = simplifyJsonLdValue({
                '@value': text,
                '@language': last(this.langs)
            });
            if (prevContext.rdfaTextProperty) {
                this._addRdfaProperty(last(this._rdfaNodes), prevContext.rdfaTextProperty, schemaValue);
            }
            if (prevContext.microdataTextProperty) {
                this._addMicrodataProperty(last(this._microdataNodes), prevContext.id, prevContext.microdataTextProperty, schemaValue);
            }
            if (prevContext.tagName === 'title') {
                setvalue_1.set(this.result, ['html', 'title'], text);
            }
        }
    };
    /**
     * Adds `value` under `itemprop` on `node`, mirrors it into the reference
     * bucket of `id` (if any) and, on first use, publishes the microdata
     * graph on the result with `node` as its first entry.
     */
    Handler.prototype._addMicrodataProperty = function (node, id, itemprop, value) {
        addJsonldProperty(node, itemprop, value);
        if (id && hasOwn(this._microdataRefs, id)) {
            addJsonldProperty(this._microdataRefs[id], itemprop, value);
        }
        if (!hasOwn(this._microdata, '@graph')) {
            this.result.microdata = this._microdata;
            pushToGraph(this._microdata, node);
        }
    };
    /** Overwrites `key` on `node` and on the reference bucket of `id`. */
    Handler.prototype._setMicrodataProperty = function (node, id, key, value) {
        node[key] = value;
        if (id && hasOwn(this._microdataRefs, id)) {
            this._microdataRefs[id][key] = value;
        }
    };
    /**
     * Adds `value` under `property` on `node` and, on first use, publishes
     * the RDFa graph on the result with `node` as its first entry.
     */
    Handler.prototype._addRdfaProperty = function (node, property, value) {
        addJsonldProperty(node, property, value);
        if (!hasOwn(this._rdfa, '@graph')) {
            this.result.rdfa = this._rdfa;
            pushToGraph(this._rdfa, node);
        }
    };
    /**
     * Splits a `rel`/`property` value into usable terms. Prefixed terms are
     * kept, registering a well-known prefix in the RDFa context when the page
     * has not declared it; unprefixed terms are kept only while a `vocab` is
     * in scope.
     *
     * @param {string} propertyList Whitespace separated terms.
     * @returns {string[]} The accepted terms, in document order.
     */
    Handler.prototype._normalizeRdfaProperty = function (propertyList) {
        var properties = [];
        var candidates = split(propertyList);
        for (var p = 0; p < candidates.length; p++) {
            var property = candidates[p];
            var prefix = getPrefix(property);
            if (prefix) {
                if (!hasOwn(this._rdfa, '@context') || !hasOwn(this._rdfa['@context'], prefix)) {
                    if (hasOwn(exports.KNOWN_VOCABULARIES, prefix)) {
                        setContext(this._rdfa, prefix, exports.KNOWN_VOCABULARIES[prefix]);
                    }
                }
            }
            else {
                if (this._rdfaVocabs.length === 0) {
                    continue;
                }
            }
            properties.push(property);
        }
        return properties;
    };
    /**
     * Returns the graph node whose `@id` equals `id`, creating and
     * registering a new node when none exists (always, when `id` is omitted).
     */
    Handler.prototype._createRdfaResource = function (id) {
        if (id && hasOwn(this._rdfa, '@graph')) {
            var graph = this._rdfa['@graph'];
            for (var g = 0; g < graph.length; g++) {
                var item = graph[g];
                if (item['@id'] === id) {
                    return item;
                }
            }
        }
        var node = { '@id': id };
        this.result.rdfa = this._rdfa;
        pushToGraph(this._rdfa, node);
        return node;
    };
    return Handler;
}());
exports.Handler = Handler;
/**
 * Appends `value` to `node['@graph']`, creating the array on first use.
 */
function pushToGraph(node, value) {
    if (!node['@graph']) {
        node['@graph'] = [];
    }
    node['@graph'].push(value);
}
/**
 * Sets `key` to `value` inside `node['@context']`, creating the context
 * object on first use.
 */
function setContext(node, key, value) {
    if (!node['@context']) {
        node['@context'] = {};
    }
    var context = node['@context'];
    context[key] = value;
}
/**
 * Trims a string and collapses every whitespace run into a single space.
 * `null` and `undefined` both come back as `undefined`.
 */
function normalize(value) {
    if (value == null) {
        return undefined;
    }
    var trimmed = value.trim();
    return trimmed.replace(/\s+/g, ' ');
}
/**
 * Adds `value` to `obj` under `key`, merging with a value already stored
 * there (see `merge`). `key` may be an array, in which case the value is
 * added under every entry. Falsy keys are ignored.
 *
 * Keys originate from page markup (`itemprop`, `property`, `rel`), so only
 * OWN properties count as "already stored": reading `obj[key]` directly
 * picked up inherited members such as `constructor` or `toString` and merged
 * them into the output. The result is written with `defineProperty` so that
 * a key named `__proto__` becomes ordinary data instead of invoking the
 * prototype setter.
 *
 * @param {Object} obj Target node.
 * @param {string|string[]} key Property name(s).
 * @param {*} value Value to add.
 */
function addJsonldProperty(obj, key, value) {
    if (!key) {
        return;
    }
    if (Array.isArray(key)) {
        for (var i = 0; i < key.length; i++) {
            addJsonldProperty(obj, key[i], value);
        }
        return;
    }
    var existing = Object.prototype.hasOwnProperty.call(obj, key) ? obj[key] : undefined;
    Object.defineProperty(obj, key, {
        value: merge(existing, value),
        writable: true,
        enumerable: true,
        configurable: true
    });
}
/**
 * Adds every own enumerable property of `values` to `obj` through
 * `addJsonldProperty`, so existing values are merged rather than replaced.
 */
function assignJsonldProperties(obj, values) {
    Object.keys(values).forEach(function (name) {
        addJsonldProperty(obj, name, values[name]);
    });
}
/**
 * Returns the final element of `arr`, or `undefined` when it is empty.
 */
function last(arr) {
    var tailIndex = arr.length - 1;
    return arr[tailIndex];
}
/**
 * Picks the attribute-supplied value of an element.
 *
 * A non-empty (normalized) `content` attribute always wins. Otherwise the
 * tag-specific extractor from `HTML_VALUE_MAP` is used when one exists, and
 * its result is normalized. With neither, the normalized `content` (an empty
 * string or `undefined`) is returned.
 */
function getValueMap(url, tagName, attributes) {
    var content = normalize(attributes.content);
    if (content) {
        return content;
    }
    var extractors = exports.HTML_VALUE_MAP;
    if (!extractors.hasOwnProperty(tagName)) {
        return content;
    }
    return normalize(extractors[tagName](url, attributes));
}
/**
 * Combines an existing property value with a new one.
 *
 * `left` is treated as a list (`null`/`undefined` -> empty, scalar -> single
 * entry) and `right` is concatenated onto it. A result with more than one
 * entry is returned as an array; otherwise its first entry is returned.
 */
function merge(left, right) {
    var base;
    if (Array.isArray(left)) {
        base = left;
    }
    else if (left == null) {
        base = [];
    }
    else {
        base = [left];
    }
    var combined = base.concat(right);
    if (combined.length > 1) {
        return combined;
    }
    return combined[0];
}
/**
 * Checks whether `value` is usable as an RDFa prefix name: one valid start
 * character followed by zero or more valid name characters.
 *
 * The previous `length > 1` guard rejected one-character names, so a page
 * declaring e.g. `prefix="v: http://..."` had its mapping ignored even though
 * `v` is itself one of the well-known prefixes. Only the empty string is
 * rejected on length now.
 *
 * @param {string} value Candidate name, without the trailing colon.
 * @returns {boolean} `true` when the name is valid.
 */
function isValidName(value) {
    if (value.length === 0) {
        return false;
    }
    return RDF_NAME_START_CHAR_REGEXP.test(value.charAt(0)) &&
        RDF_NAME_CHAR_REGEXP.test(value.slice(1));
}
/**
 * Extracts the prefix of a `prefix:term` value: everything before the first
 * colon. Returns `undefined` when there is no colon or when the colon is
 * immediately followed by `//` (an absolute IRI rather than a prefixed term).
 */
function getPrefix(value) {
    var colonAt = value.indexOf(':');
    if (colonAt < 0) {
        return undefined;
    }
    var followedBySlashes = value.slice(colonAt + 1, colonAt + 3) === '//';
    return followedBySlashes ? undefined : value.slice(0, colonAt);
}
/**
 * Breaks a string into tokens at every run of whitespace. Leading or
 * trailing whitespace produces empty tokens, so callers pass normalized
 * values.
 */
function split(value) {
    var whitespaceRun = /\s+/;
    var tokens = value.split(whitespaceRun);
    return tokens;
}
/**
 * Splits an `itemtype` value into `[vocabulary, type]` at the last `#`, or at
 * the last `/` when there is no `#`. The separator stays with the vocabulary.
 * Without either separator the whole value is the vocabulary and the type is
 * the empty string.
 */
function splitItemtype(value) {
    var hashAt = value.lastIndexOf('#');
    var cutAt = hashAt > -1 ? hashAt : value.lastIndexOf('/');
    if (cutAt === -1) {
        return [value, ''];
    }
    var vocabulary = value.slice(0, cutAt + 1);
    var typeName = value.slice(cutAt + 1);
    return [vocabulary, typeName];
}
/**
 * Collapses a JSON-LD value object to its bare `@value` when it carries
 * neither a `@type` nor a `@language`; otherwise returns the object itself.
 */
function simplifyJsonLdValue(value) {
    var hasType = value['@type'] != null;
    var hasLanguage = value['@language'] != null;
    if (hasType || hasLanguage) {
        return value;
    }
    return value['@value'];
}
//# sourceMappingURL=index.js.map