UNPKG

htmlmetaparser

Version:

A `htmlparser2` handler for parsing rich metadata from HTML. Includes HTML metadata, JSON-LD, RDFa, microdata, OEmbed, Twitter cards and AppLinks.

561 lines 23.7 kB
"use strict"; var setvalue_1 = require('setvalue'); var url_1 = require('url'); var oembed_1 = require('./oembed'); var providers = new oembed_1.OEmbedProviders(require('../vendor/providers.json')); var RDF_VALID_NAME_START_CHAR_RANGE = 'A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6' + '\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + '\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u10000-\uEFFFF'; var RDF_NAME_START_CHAR_REGEXP = new RegExp("^[" + RDF_VALID_NAME_START_CHAR_RANGE + "]$"); var RDF_NAME_CHAR_REGEXP = new RegExp("^[" + RDF_VALID_NAME_START_CHAR_RANGE + "\\-\\.0-9\u00B7\u0300-\u036F\u203F-\u2040]*$"); exports.KNOWN_VOCABULARIES = { csvw: 'http://www.w3.org/ns/csvw#', dcat: 'http://www.w3.org/ns/dcat#', qb: 'http://purl.org/linked-data/cube#', grddl: 'http://www.w3.org/2003/g/data-view#', ma: 'http://www.w3.org/ns/ma-ont#', org: 'http://www.w3.org/ns/org#', owl: 'http://www.w3.org/2002/07/owl#', prov: 'http://www.w3.org/ns/prov#', rdf: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', rdfa: 'http://www.w3.org/ns/rdfa#', rdfs: 'http://www.w3.org/2000/01/rdf-schema#', rif: 'http://www.w3.org/2007/rif#', rr: 'http://www.w3.org/ns/r2rml#', sd: 'http://www.w3.org/ns/sparql-service-description#', skos: 'http://www.w3.org/2004/02/skos/core#', skosxl: 'http://www.w3.org/2008/05/skos-xl#', wdr: 'http://www.w3.org/2007/05/powder#', void: 'http://rdfs.org/ns/void#', wdrs: 'http://www.w3.org/2007/05/powder-s#', xhv: 'http://www.w3.org/1999/xhtml/vocab#', xml: 'http://www.w3.org/XML/1998/namespace', xsd: 'http://www.w3.org/2001/XMLSchema#', cc: 'https://creativecommons.org/ns#', ctag: 'http://commontag.org/ns#', dc: 'http://purl.org/dc/terms/', dcterms: 'http://purl.org/dc/terms/', dc11: 'http://purl.org/dc/elements/1.1/', foaf: 'http://xmlns.com/foaf/0.1/', gr: 'http://purl.org/goodrelations/v1#', ical: 'http://www.w3.org/2002/12/cal/icaltzd#', og: 'http://ogp.me/ns#', rev: 'http://purl.org/stuff/rev#', sioc: 'http://rdfs.org/sioc/ns#', v: 'http://rdf.data-vocabulary.org/#', vcard: 'http://www.w3.org/2006/vcard/ns#', schema: 'http://schema.org/', music: 'http://ogp.me/ns/music#', video: 'http://ogp.me/ns/video#', article: 'http://ogp.me/ns/article#', book: 'http://ogp.me/ns/book#', profile: 'http://ogp.me/ns/profile#', website: 'http://ogp.me/ns/website#', fb: 'http://ogp.me/ns/fb#' }; exports.HTML_VALUE_MAP = { meta: function (baseUrl, attrs) { return attrs.content; }, audio: function (baseUrl, attrs) { return attrs.src ? url_1.resolve(baseUrl, attrs.src) : undefined; }, a: function (baseUrl, attrs) { return attrs.href ? url_1.resolve(baseUrl, attrs.href) : undefined; }, object: function (baseUrl, attrs) { return attrs.data ? url_1.resolve(baseUrl, attrs.data) : undefined; }, time: function (baseUrl, attrs) { return attrs.datetime; }, data: function (baseUrl, attrs) { return attrs.value; } }; exports.HTML_VALUE_MAP['embed'] = exports.HTML_VALUE_MAP['audio']; exports.HTML_VALUE_MAP['iframe'] = exports.HTML_VALUE_MAP['audio']; exports.HTML_VALUE_MAP['img'] = exports.HTML_VALUE_MAP['audio']; exports.HTML_VALUE_MAP['source'] = exports.HTML_VALUE_MAP['audio']; exports.HTML_VALUE_MAP['track'] = exports.HTML_VALUE_MAP['audio']; exports.HTML_VALUE_MAP['video'] = exports.HTML_VALUE_MAP['audio']; exports.HTML_VALUE_MAP['area'] = exports.HTML_VALUE_MAP['a']; exports.HTML_VALUE_MAP['link'] = exports.HTML_VALUE_MAP['a']; exports.HTML_VALUE_MAP['meter'] = exports.HTML_VALUE_MAP['data']; exports.HandlerFlags = { hasLang: (1 << 0), rdfaLink: (1 << 1), rdfaNode: (1 << 2), rdfaVocab: (1 << 3), microdataNode: (1 << 4), microdataVocab: (1 << 5), microdataScope: (1 << 6) }; var Handler = (function () { function Handler(callback, options) { this.callback = callback; this.options = options; this.result = { alternate: [] }; this.contexts = [{ tagName: '', text: '', flags: 0 }]; this.langs = []; this._rdfa = {}; this._rdfaNodes = [{}]; this._rdfaVocabs = []; this._rdfaRels = []; this._microdata = {}; this._microdataRefs = {}; this._microdataScopes = [[]]; this._microdataNodes = [{}]; } Handler.prototype.onend = function () { var oembedProvider = providers.match(this.options.url); if (oembedProvider && !this.result.alternate.some(function (x) { return x.type === oembedProvider.type; })) { this.result.alternate.push(oembedProvider); } this.callback(null, this.result); }; Handler.prototype.onerror = function (error) { this.callback(error, this.result); }; Handler.prototype.onopentagname = function (tagName) { this.contexts.push({ tagName: tagName, text: '', flags: 0 }); }; Handler.prototype.onopentag = function (tagName, attributes) { var context = last(this.contexts); var relAttr = normalize(attributes['rel']); var srcAttr = normalize(attributes['src']); var hrefAttr = normalize(attributes['href']); var langAttr = normalize(attributes['lang']); var propertyAttr = normalize(attributes['property']); var vocabAttr = normalize(attributes['vocab']); var prefixAttr = normalize(attributes['prefix']); var resourceAttr = normalize(attributes['resource']); var typeOfAttr = normalize(attributes['typeof']); var aboutAttr = normalize(attributes['about']); var idAttr = normalize(attributes['id']); var itempropAttr = normalize(attributes['itemprop']); var itemidAttr = normalize(attributes['itemid']); var itemtypeAttr = normalize(attributes['itemtype']); var itemrefAttr = normalize(attributes['itemref']); if (langAttr) { this.langs.push(langAttr); context.flags = context.flags | exports.HandlerFlags.hasLang; } if (idAttr) { context.id = idAttr; if (!this._microdataRefs.hasOwnProperty(idAttr)) { this._microdataRefs[idAttr] = {}; } } if (attributes.hasOwnProperty('itemscope')) { var newNode = {}; if (itemrefAttr) { var refs = split(itemrefAttr); for (var _i = 0, refs_1 = refs; _i < refs_1.length; _i++) { var ref = refs_1[_i]; if (this._microdataRefs[ref] != null) { assignJsonldProperties(newNode, this._microdataRefs[ref]); } this._microdataRefs[ref] = newNode; } } if (itempropAttr) { this._addMicrodataProperty(last(this._microdataNodes), context.id, split(itempropAttr), newNode); } else { this.result.microdata = this._microdata; pushToGraph(this._microdata, newNode); this._microdataScopes.push([]); context.flags = context.flags | exports.HandlerFlags.microdataScope; } this._microdataNodes.push(newNode); context.flags = context.flags | exports.HandlerFlags.microdataNode; } if (itempropAttr && !(context.flags & exports.HandlerFlags.microdataNode)) { var value = getValueMap(this.options.url, tagName, attributes); var props = split(itempropAttr); if (value != null) { this._addMicrodataProperty(last(this._microdataNodes), context.id, props, simplifyJsonLdValue({ '@value': value, '@language': last(this.langs) })); } else { context.microdataTextProperty = props; } } if (itemidAttr) { this._setMicrodataProperty(last(this._microdataNodes), context.id, '@id', itemidAttr); } if (itemtypeAttr) { var _a = splitItemtype(itemtypeAttr), vocab = _a[0], type = _a[1]; var vocabs = last(this._microdataScopes); if (type && vocab !== last(vocabs)) { setContext(last(this._microdataNodes), '@vocab', vocab); vocabs.push(vocab); context.flags = context.flags | exports.HandlerFlags.microdataVocab; } this._addMicrodataProperty(last(this._microdataNodes), context.id, '@type', type || itemtypeAttr); } if (vocabAttr) { setContext(last(this._rdfaNodes), '@vocab', vocabAttr); this._rdfaVocabs.push(vocabAttr); context.flags = context.flags | exports.HandlerFlags.rdfaVocab; } if (prefixAttr) { var parts = split(prefixAttr); for (var i = 0; i < parts.length; i += 2) { var name_1 = parts[i]; var value = parts[i + 1]; var prefix = name_1.slice(0, -1); if (name_1.charAt(name_1.length - 1) !== ':' || !isValidName(prefix)) { continue; } setContext(this._rdfa, prefix, value); } } if (relAttr) { var links = this._normalizeRdfaProperty(relAttr); if (links.length) { this._rdfaRels.push({ links: links, used: false }); context.flags = context.flags | exports.HandlerFlags.rdfaLink; } } if (this._rdfaRels.length) { var rel = last(this._rdfaRels); if (!rel.used) { var validRelId = resourceAttr || hrefAttr || srcAttr; if (validRelId) { var newNode = { '@id': validRelId }; rel.used = true; this._addRdfaProperty(last(this._rdfaNodes), rel.links, newNode); if (resourceAttr && !(context.flags & exports.HandlerFlags.rdfaNode)) { this._rdfaNodes.push(newNode); context.flags = context.flags | exports.HandlerFlags.rdfaNode; } } if (!(context.flags & exports.HandlerFlags.rdfaLink) && (propertyAttr || typeOfAttr)) { rel.used = true; if (!(context.flags & exports.HandlerFlags.rdfaNode)) { var newNode = {}; this._rdfaNodes.push(newNode); this._addRdfaProperty(last(this._rdfaNodes), rel.links, newNode); context.flags = context.flags | exports.HandlerFlags.rdfaNode; } } } } if (aboutAttr) { this._rdfaNodes.push(this._createRdfaResource(aboutAttr)); context.flags = context.flags | exports.HandlerFlags.rdfaNode; } if (propertyAttr) { var value = getValueMap(this.options.url, tagName, attributes); var properties = this._normalizeRdfaProperty(propertyAttr); if (value != null) { this._addRdfaProperty(last(this._rdfaNodes), properties, simplifyJsonLdValue({ '@value': value, '@language': last(this.langs), '@type': normalize(attributes['datatype']) })); } else { if ((typeOfAttr || resourceAttr) && !(context.flags & exports.HandlerFlags.rdfaLink)) { var newNode = { '@id': resourceAttr }; this._addRdfaProperty(last(this._rdfaNodes), properties, newNode); if (typeOfAttr && !(context.flags & exports.HandlerFlags.rdfaNode)) { this._rdfaNodes.push(newNode); context.flags = context.flags | exports.HandlerFlags.rdfaNode; } } else { context.rdfaTextProperty = properties; } } } if (resourceAttr && !propertyAttr && !relAttr && !aboutAttr) { this._rdfaNodes.push(this._createRdfaResource(resourceAttr)); context.flags = context.flags | exports.HandlerFlags.rdfaNode; } if (typeOfAttr) { if (!this._rdfaRels.length && !propertyAttr && !relAttr && !resourceAttr && !aboutAttr) { this._rdfaNodes.push(this._createRdfaResource()); context.flags = context.flags | exports.HandlerFlags.rdfaNode; } this._addRdfaProperty(last(this._rdfaNodes), '@type', split(typeOfAttr)); } if (tagName === 'meta') { var nameAttr = normalize(attributes['name']); var contentAttr = normalize(attributes['content']); if (propertyAttr && contentAttr) { if (/^twitter:/.test(propertyAttr)) { setvalue_1.set(this.result, ['twitter', propertyAttr.substr(8)], contentAttr); } else if (/^al:/.test(propertyAttr)) { setvalue_1.set(this.result, ['applinks', propertyAttr.substr(3)], contentAttr); } } if (nameAttr && contentAttr) { var name_2 = nameAttr.toLowerCase(); if (/^twitter:/.test(name_2)) { setvalue_1.set(this.result, ['twitter', name_2.substr(8)], contentAttr); } else if (/^dc\./.test(name_2)) { setvalue_1.set(this.result, ['dublincore', name_2.substr(3)], contentAttr); } else if (/^dcterms\./.test(name_2)) { setvalue_1.set(this.result, ['dublincore', name_2.substr(8)], contentAttr); } else if (/^sailthru\./.test(name_2)) { setvalue_1.set(this.result, ['sailthru', name_2.substr(9)], contentAttr); } else if (name_2 === 'date' || name_2 === 'keywords' || name_2 === 'author' || name_2 === 'description' || name_2 === 'language' || name_2 === 'generator' || name_2 === 'creator' || name_2 === 'publisher' || name_2 === 'robots' || name_2 === 'viewport' || name_2 === 'application-name' || name_2 === 'apple-mobile-web-app-title') { setvalue_1.set(this.result, ['html', name_2], contentAttr); } } } if (tagName === 'link') { if (relAttr && hrefAttr) { var rels = split(relAttr); for (var _b = 0, rels_1 = rels; _b < rels_1.length; _b++) { var rel = rels_1[_b]; if (rel === 'canonical' || rel === 'amphtml') { setvalue_1.set(this.result, ['html', rel], url_1.resolve(this.options.url, hrefAttr)); } else if (rel === 'alternate') { var typeAttr = normalize(attributes['type']); var mediaAttr = normalize(attributes['media']); var hreflangAttr = normalize(attributes['hreflang']); if (typeAttr || mediaAttr || hreflangAttr) { this.result.alternate.push({ type: typeAttr || 'text/html', media: mediaAttr, hreflang: hreflangAttr, title: normalize(attributes['title']), href: url_1.resolve(this.options.url, hrefAttr) }); } } else if (rel === 'meta') { this.result.alternate.push({ type: normalize(attributes['type']) || 'application/rdf+xml', href: url_1.resolve(this.options.url, hrefAttr) }); } } } } if (tagName === 'script') { context.scriptType = normalize(attributes['type']); } }; Handler.prototype.ontext = function (value) { last(this.contexts).text += value; }; Handler.prototype.onclosetag = function () { var prevContext = this.contexts.pop(); var currentContext = last(this.contexts); var text = normalize(prevContext.text); if (prevContext.flags) { if (prevContext.flags & exports.HandlerFlags.microdataNode) { this._microdataNodes.pop(); } if (prevContext.flags & exports.HandlerFlags.microdataVocab) { last(this._microdataScopes).pop(); } if (prevContext.flags & exports.HandlerFlags.microdataScope) { this._microdataScopes.pop(); } if (prevContext.flags & exports.HandlerFlags.rdfaNode) { this._rdfaNodes.pop(); } if (prevContext.flags & exports.HandlerFlags.rdfaVocab) { this._rdfaVocabs.pop(); } if (prevContext.flags & exports.HandlerFlags.rdfaLink) { this._rdfaRels.pop(); } if (prevContext.flags & exports.HandlerFlags.hasLang) { this.langs.pop(); } } if (prevContext.scriptType) { if (prevContext.scriptType === 'application/ld+json') { try { var jsonld = JSON.parse(prevContext.text); this.result.jsonld = jsonld; } catch (e) { } } return; } currentContext.text += prevContext.text; if (text) { var schemaValue = simplifyJsonLdValue({ '@value': text, '@language': last(this.langs) }); if (prevContext.rdfaTextProperty) { this._addRdfaProperty(last(this._rdfaNodes), prevContext.rdfaTextProperty, schemaValue); } if (prevContext.microdataTextProperty) { this._addMicrodataProperty(last(this._microdataNodes), prevContext.id, prevContext.microdataTextProperty, schemaValue); } if (prevContext.tagName === 'title') { setvalue_1.set(this.result, ['html', 'title'], text); } } }; Handler.prototype._addMicrodataProperty = function (node, id, itemprop, value) { addJsonldProperty(node, itemprop, value); if (id && this._microdataRefs.hasOwnProperty(id)) { addJsonldProperty(this._microdataRefs[id], itemprop, value); } if (!this._microdata.hasOwnProperty('@graph')) { this.result.microdata = this._microdata; pushToGraph(this._microdata, node); } }; Handler.prototype._setMicrodataProperty = function (node, id, key, value) { node[key] = value; if (id && this._microdataRefs.hasOwnProperty(id)) { this._microdataRefs[id][key] = value; } }; Handler.prototype._addRdfaProperty = function (node, property, value) { addJsonldProperty(node, property, value); if (!this._rdfa.hasOwnProperty('@graph')) { this.result.rdfa = this._rdfa; pushToGraph(this._rdfa, node); } }; Handler.prototype._normalizeRdfaProperty = function (propertyList) { var properties = []; for (var _i = 0, _a = split(propertyList); _i < _a.length; _i++) { var property = _a[_i]; var prefix = getPrefix(property); if (prefix) { if (!this._rdfa.hasOwnProperty('@context') || !this._rdfa['@context'].hasOwnProperty(prefix)) { if (exports.KNOWN_VOCABULARIES.hasOwnProperty(prefix)) { setContext(this._rdfa, prefix, exports.KNOWN_VOCABULARIES[prefix]); } } } else { if (this._rdfaVocabs.length === 0) { continue; } } properties.push(property); } return properties; }; Handler.prototype._createRdfaResource = function (id) { if (id && this._rdfa.hasOwnProperty('@graph')) { for (var _i = 0, _a = this._rdfa['@graph']; _i < _a.length; _i++) { var item = _a[_i]; if (item['@id'] === id) { return item; } } } var node = { '@id': id }; this.result.rdfa = this._rdfa; pushToGraph(this._rdfa, node); return node; }; return Handler; }()); exports.Handler = Handler; function pushToGraph(node, value) { node['@graph'] = node['@graph'] || []; node['@graph'].push(value); } function setContext(node, key, value) { node['@context'] = node['@context'] || {}; node['@context'][key] = value; } function normalize(value) { return value == null ? undefined : value.trim().replace(/\s+/g, ' '); } function addJsonldProperty(obj, key, value) { if (!key) { return; } if (Array.isArray(key)) { for (var _i = 0, key_1 = key; _i < key_1.length; _i++) { var k = key_1[_i]; addJsonldProperty(obj, k, value); } } else { obj[key] = merge(obj[key], value); } } function assignJsonldProperties(obj, values) { for (var _i = 0, _a = Object.keys(values); _i < _a.length; _i++) { var key = _a[_i]; addJsonldProperty(obj, key, values[key]); } } function last(arr) { return arr[arr.length - 1]; } function getValueMap(url, tagName, attributes) { var value = normalize(attributes.content); if (!value && exports.HTML_VALUE_MAP.hasOwnProperty(tagName)) { return normalize(exports.HTML_VALUE_MAP[tagName](url, attributes)); } return value; } function merge(left, right) { var result = (Array.isArray(left) ? left : (left == null ? [] : [left])).concat(right); return result.length > 1 ? result : result[0]; } function isValidName(value) { return value.length > 1 && RDF_NAME_START_CHAR_REGEXP.test(value.charAt(0)) && RDF_NAME_CHAR_REGEXP.test(value.substr(1)); } function getPrefix(value) { var indexOf = value.indexOf(':'); if (indexOf === -1) { return; } if (value.charAt(indexOf + 1) === '/' && value.charAt(indexOf + 2) === '/') { return; } return value.substr(0, indexOf); } function split(value) { return value.split(/\s+/g); } function splitItemtype(value) { var hashIndexOf = value.lastIndexOf('#'); var slashIndexOf = value.lastIndexOf('/'); if (hashIndexOf > -1) { return [value.substr(0, hashIndexOf + 1), value.substr(hashIndexOf + 1)]; } if (slashIndexOf > -1) { return [value.substr(0, slashIndexOf + 1), value.substr(slashIndexOf + 1)]; } return [value, '']; } function simplifyJsonLdValue(value) { if (value['@type'] != null || value['@language'] != null) { return value; } return value['@value']; } //# sourceMappingURL=index.js.map