UNPKG

web-auto-extractor

Version:

Automatically extracts structured information from webpages

156 lines (138 loc) 4.51 kB
'use strict'; Object.defineProperty(exports, "__esModule", { value: true }); var _htmlparser = require('htmlparser2'); var _htmlparser2 = _interopRequireDefault(_htmlparser); function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; } function getPropValue(tagName, attribs, TYPE, PROP) { if (attribs[TYPE]) { return null; } else if ((tagName === 'a' || tagName === 'link') && attribs.href) { return attribs.href.trim(); } else if (attribs.content) { return attribs.content.trim(); } else if (attribs[PROP] === 'image' && attribs.src) { return attribs.src.trim(); } else { return null; } } var getAttrNames = function getAttrNames(specName) { var TYPE = void 0, PROP = void 0; if (specName.toLowerCase().startsWith('micro')) { TYPE = 'itemtype'; PROP = 'itemprop'; } else if (specName.toLowerCase().startsWith('rdfa')) { TYPE = 'typeof'; PROP = 'property'; } else { throw new Error('Unsupported spec: use either micro or rdfa'); } return { TYPE: TYPE, PROP: PROP }; }; var getType = function getType(typeString) { var match = /(.*\/)(\w+)/g.exec(typeString); return { context: match && match[1] ? match[1] : undefined, type: match && match[2] ? match[2] : typeString }; }; var createHandler = function createHandler(specName) { var scopes = []; var tags = []; var topLevelScope = {}; var textForProp = null; var _getAttrNames = getAttrNames(specName), TYPE = _getAttrNames.TYPE, PROP = _getAttrNames.PROP; var onopentag = function onopentag(tagName, attribs) { var currentScope = scopes[scopes.length - 1]; var tag = false; if (attribs[TYPE]) { if (attribs[PROP] && currentScope) { var newScope = {}; currentScope[attribs[PROP]] = currentScope[attribs[PROP]] || []; currentScope[attribs[PROP]].push(newScope); currentScope = newScope; } else { currentScope = {}; var _getType = getType(attribs[TYPE]), type = _getType.type; topLevelScope[type] = topLevelScope[type] || []; topLevelScope[type].push(currentScope); } } if (currentScope) { if (attribs[TYPE]) { var _getType2 = getType(attribs[TYPE]), context = _getType2.context, _type = _getType2.type; var vocab = attribs.vocab; currentScope['@context'] = context || vocab; currentScope['@type'] = _type; tag = TYPE; scopes.push(currentScope); } else if (attribs[PROP]) { if (currentScope[attribs[PROP]] && !Array.isArray(currentScope[attribs[PROP]])) { // PROP occurs for the second time, storing it as an array currentScope[attribs[PROP]] = [currentScope[attribs[PROP]]]; } var value = getPropValue(tagName, attribs, TYPE, PROP); if (!value) { tag = PROP; if (Array.isArray(currentScope[attribs[PROP]])) { currentScope[attribs[PROP]].push(''); } else { currentScope[attribs[PROP]] = ''; } textForProp = attribs[PROP]; } else { if (Array.isArray(currentScope[attribs[PROP]])) { currentScope[attribs[PROP]].push(value); } else { currentScope[attribs[PROP]] = value; } } } } tags.push(tag); }; var ontext = function ontext(text) { if (textForProp) { if (Array.isArray(scopes[scopes.length - 1][textForProp])) { scopes[scopes.length - 1][textForProp][scopes[scopes.length - 1][textForProp].length - 1] += text.trim(); } else { scopes[scopes.length - 1][textForProp] += text.trim(); } } }; var onclosetag = function onclosetag(tagname) { var tag = tags.pop(); if (tag === TYPE) { var scope = scopes.pop(); if (!scope['@context']) { delete scope['@context']; } Object.keys(scope).forEach(function (key) { if (Array.isArray(scope[key]) && scope[key].length === 1) { scope[key] = scope[key][0]; } }); } else if (tag === PROP) { textForProp = false; } }; return { onopentag: onopentag, ontext: ontext, onclosetag: onclosetag, topLevelScope: topLevelScope }; }; exports.default = function (html, specName) { var handler = createHandler(specName); new _htmlparser2.default.Parser(handler).end(html); return handler.topLevelScope; };