microdata-rdf-streaming-parser
Version:
A fast and lightweight streaming Microdata to RDF parser
435 lines • 20 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.MicrodataRdfParser = void 0;
const htmlparser2_1 = require("htmlparser2");
const readable_stream_1 = require("readable-stream");
const ItemPropertyHandlerContent_1 = require("./propertyhandler/ItemPropertyHandlerContent");
const ItemPropertyHandlerNumber_1 = require("./propertyhandler/ItemPropertyHandlerNumber");
const ItemPropertyHandlerTime_1 = require("./propertyhandler/ItemPropertyHandlerTime");
const ItemPropertyHandlerUrl_1 = require("./propertyhandler/ItemPropertyHandlerUrl");
const Util_1 = require("./Util");
const VOCAB_REGISTRY_DEFAULT = require("./vocab-registry-default.json");
/**
* A stream transformer that parses Microdata (text) streams to an {@link RDF.Stream}.
*/
class MicrodataRdfParser extends readable_stream_1.Transform {
constructor(options) {
super({ readableObjectMode: true });
// Stacks, where the key is the current depth.
this.itemScopeStack = [];
this.textBufferStack = [];
// Variables for managing itemrefs.
this.isEmittingReferences = false;
this.pendingItemRefsDomain = {};
this.pendingItemRefsRangeFinalized = {};
// eslint-disable-next-line lines-between-class-members
this.pendingItemRefsRangeCollecting = {};
options = options || {};
this.options = options;
this.util = new Util_1.Util(options.dataFactory, options.baseIRI);
this.defaultGraph = options.defaultGraph || this.util.dataFactory.defaultGraph();
this.htmlParseListener = options.htmlParseListener;
this.vocabRegistry = options.vocabRegistry || VOCAB_REGISTRY_DEFAULT;
this.parser = this.initializeParser(!!options.xmlMode);
}
/**
* Parses the given text stream into a quad stream.
* @param {NodeJS.EventEmitter} stream A text stream.
* @return {RDF.Stream} A quad stream.
*/
import(stream) {
const output = new readable_stream_1.PassThrough({ readableObjectMode: true });
stream.on('error', (error) => parsed.emit('error', error));
stream.on('data', (data) => output.push(data));
stream.on('end', () => output.push(null));
const parsed = output.pipe(new MicrodataRdfParser(this.options));
return parsed;
}
_transform(chunk, encoding, callback) {
this.parser.write(chunk.toString());
callback();
}
_flush(callback) {
this.parser.end();
callback();
}
/**
* Get the current item scope for the current depth.
* This will skip all undefined item scopes.
* @param parent If we should start looking one level higher in the stack.
*/
getItemScope(parent) {
let parentTagI = this.itemScopeStack.length - (parent ? 2 : 1);
while (parentTagI > 0 && !this.itemScopeStack[parentTagI]) {
parentTagI--;
}
return this.itemScopeStack[parentTagI];
}
/**
* Get the current stack depth.
*/
getDepth() {
return this.itemScopeStack.length;
}
/**
* Handle an opening tag.
*
* Outside of reference replay, the event is recorded in every item reference buffer that is
* currently collecting. Then one entry is pushed on both the text buffer stack and the
* item scope stack, item types / language / itemrefs are processed for the applicable
* item scope, and finally `itemprop` and `itemprop-reverse` are handed to
* {@link handleItemProperties}.
* @param name The tag name.
* @param attributes The attributes of the tag, keyed by attribute name.
*/
onTagOpen(name, attributes) {
if (!this.isEmittingReferences) {
// If the tag has an 'id', start collecting the whole stack in the item reference buffer
if ('id' in attributes) {
const id = attributes.id;
// `counter` tracks the open/close nesting of the buffered tags,
// `ids` remembers the subjects generated for nested item scopes.
this.pendingItemRefsRangeCollecting[id] = {
events: [],
counter: 0,
ids: [],
};
}
// Store this event in all collecting item reference buffers
for (const buffer of Object.values(this.pendingItemRefsRangeCollecting)) {
buffer.counter++;
buffer.events.push({ type: 'open', name, attributes });
}
}
// Ensure the text buffer stack is in line with the stack depth
this.textBufferStack.push(undefined);
// Processing steps based on https://w3c.github.io/microdata-rdf/#rdf-conversion-algorithm
// 1. Determine the current item scope
let itemScope;
if ('itemscope' in attributes) {
// Create a new item scope
let subject;
if (this.emittingReferencesItemScopeIdGenerator) {
// While replaying a buffered reference, reuse the subjects generated during the first pass.
subject = this.emittingReferencesItemScopeIdGenerator();
}
else {
// Use the itemid as subject when createSubject yields one, otherwise a fresh blank node.
subject = 'itemid' in attributes && this.util.createSubject(attributes.itemid) ||
this.util.dataFactory.blankNode();
// Store the generated id in all collecting item reference buffers
for (const buffer of Object.values(this.pendingItemRefsRangeCollecting)) {
buffer.ids.push(subject);
}
}
itemScope = { subject };
// If the id was reused from a reference, block any new triples to be generated from it
if (this.isEmittingReferences) {
itemScope.blockEmission = true;
}
// Inherit vocab from parent item scope
const parentItemScope = this.getItemScope();
if (parentItemScope && parentItemScope.vocab) {
itemScope.vocab = parentItemScope.vocab;
}
// 2. Push any changes to the item scope to the stack
this.itemScopeStack.push(itemScope);
}
else {
// Determine the parent item scope
itemScope = this.getItemScope();
// 2. Push any changes to the item scope to the stack
this.itemScopeStack.push(undefined);
}
// If we have a valid item scope, process the current node
if (itemScope) {
// 3. Handle item types
if ('itemtype' in attributes) {
for (const type of this.util.createVocabIris(attributes.itemtype, itemScope, false)) {
// 4. Vocab identifier is the first valid item
if (!itemScope.vocab) {
// 5. Modify vocab based on registry
itemScope.vocab = this.util.deriveVocab(type.value, this.vocabRegistry);
}
// Emit item type
if (!itemScope.blockEmission) {
this.emitTriple(itemScope.subject, this.util.dataFactory.namedNode(`${Util_1.Util.RDF}type`), type);
}
}
}
// Save language in item scope
// (when both attributes are present, 'xml:lang' overrides 'lang' because it is assigned last)
if ('lang' in attributes) {
itemScope.language = attributes.lang;
}
if ('xml:lang' in attributes) {
itemScope.language = attributes['xml:lang'];
}
// Handle itemrefs (only if we also had an itemscope)
// If we have an itemref, store it in our domain buffer.
if ('itemscope' in attributes &&
!this.isEmittingReferences && 'itemref' in attributes) {
for (const reference of attributes.itemref.split(/\s+/u)) {
// NOTE(review): `in` also consults the prototype chain, so this check is only reliable
// when pendingItemRefsDomain has no inherited key equal to the reference token.
if (!(reference in this.pendingItemRefsDomain)) {
this.pendingItemRefsDomain[reference] = [];
}
this.pendingItemRefsDomain[reference].push(itemScope);
// Emits right away if the referenced range was already fully buffered.
this.tryToEmitReferences(reference, itemScope);
}
}
}
// 6. Handle item properties
if ('itemprop' in attributes) {
this.handleItemProperties(attributes.itemprop, false, itemScope, name, attributes);
}
// Handle reverse item properties
// https://w3c.github.io/microdata-rdf/#reverse-itemprop
if ('itemprop-reverse' in attributes) {
this.handleItemProperties(attributes['itemprop-reverse'], true, itemScope, name, attributes);
}
}
onText(data) {
// Store this event in all collecting item reference buffers
if (!this.isEmittingReferences) {
for (const buffer of Object.values(this.pendingItemRefsRangeCollecting)) {
buffer.events.push({ type: 'text', data });
}
}
// Save the text inside all item scopes that need to collect text
for (const textBuffer of this.textBufferStack) {
if (textBuffer) {
textBuffer.push(data);
}
}
}
onTagClose() {
// Store this event in all collecting item reference buffers
if (!this.isEmittingReferences) {
for (const [reference, buffer] of Object.entries(this.pendingItemRefsRangeCollecting)) {
buffer.counter--;
buffer.events.push({ type: 'close' });
// Once the counter becomes zero, the tag is fully buffered, so we finalize it.
if (buffer.counter === 0) {
this.pendingItemRefsRangeFinalized[reference] = buffer;
delete this.pendingItemRefsRangeCollecting[reference];
// Try to emit this reference with buffered domain items
this.tryToEmitReferences(reference);
}
}
}
// Emit all triples that were determined in the active tag
const itemScope = this.getItemScope(true);
if (itemScope) {
const depth = this.getDepth();
if (itemScope.predicates && depth in itemScope.predicates) {
for (const [predicateKey, predicates] of Object.entries(itemScope.predicates[depth])) {
// First check if we have a child item scope, otherwise get the text content
// Safely cast textBufferStack, as it is always defined when itemScope.predicates is defined.
const object = this.util.createLiteral(this.textBufferStack[depth].join(''), itemScope);
this.emitPredicateTriples(itemScope, predicates, object, predicateKey === 'reverse');
delete itemScope.predicates[depth][predicateKey];
}
}
}
// Remove the active tag from the stack
this.itemScopeStack.pop();
this.textBufferStack.pop();
}
onEnd() {
// Nothing important should happen here.
}
/**
* Initialize a new HtmlParser.
* @param xmlMode If the parser should be setup in strict mode.
*/
initializeParser(xmlMode) {
return new htmlparser2_1.Parser({
onclosetag: () => {
try {
this.onTagClose();
if (this.htmlParseListener) {
this.htmlParseListener.onTagClose();
}
}
catch (error) {
this.emit('error', error);
}
},
onend: () => {
try {
this.onEnd();
if (this.htmlParseListener) {
this.htmlParseListener.onEnd();
}
}
catch (error) {
this.emit('error', error);
}
},
onopentag: (name, attributes) => {
try {
this.onTagOpen(name, attributes);
if (this.htmlParseListener) {
this.htmlParseListener.onTagOpen(name, attributes);
}
}
catch (error) {
this.emit('error', error);
}
},
ontext: (data) => {
try {
this.onText(data);
if (this.htmlParseListener) {
this.htmlParseListener.onText(data);
}
}
catch (error) {
this.emit('error', error);
}
},
}, {
decodeEntities: true,
recognizeSelfClosing: true,
xmlMode,
});
}
/**
* Handle the given item properties.
* @param itempropValue The value of itemprop or itemprop-reverse.
* @param reverse If the item properties are reversed (itemprop-reverse).
* @param itemScope The current item scope.
* @param tagName The current tag name.
* @param tagAttributes The current tag attributes.
*/
handleItemProperties(itempropValue, reverse, itemScope, tagName, tagAttributes) {
const parentItemScope = this.getItemScope(true);
if (parentItemScope) {
// Set predicates in the scope, and handle them on tag close.
const depth = this.getDepth();
const predicates = this.util.createVocabIris(itempropValue, parentItemScope, true);
if (!parentItemScope.predicates) {
parentItemScope.predicates = {};
}
if (!parentItemScope.predicates[depth]) {
parentItemScope.predicates[depth] = {};
}
const predicatesKey = reverse ? 'reverse' : 'forward';
parentItemScope.predicates[depth][predicatesKey] = predicates;
// Append rdf:type predicate if vocabulary expansion applies
for (const vocabularyExpansionType of this.util.getVocabularyExpansionType(itempropValue, parentItemScope, this.vocabRegistry)) {
predicates.push(vocabularyExpansionType);
}
// Check if a property handler that applies, forcefully use that as predicate value.
// But DON'T call handlers in this prop is a direct (nested) itemscope.
if (itemScope && 'itemscope' in tagAttributes) {
this.emitPredicateTriples(parentItemScope, predicates, itemScope.subject, reverse);
// Finalize the predicates, so text values do not apply to them.
delete parentItemScope.predicates[depth][predicatesKey];
}
else {
for (const handler of MicrodataRdfParser.ITEM_PROPERTY_HANDLERS) {
if (handler.canHandle(tagName, tagAttributes)) {
const object = handler.getObject(tagAttributes, this.util, parentItemScope);
this.emitPredicateTriples(parentItemScope, predicates, object, reverse);
// Finalize the predicates, so text values do not apply to them.
delete parentItemScope.predicates[depth][predicatesKey];
}
}
}
// If no valid handler was found, indicate that we should collect text at this depth.
if (parentItemScope.predicates[depth][predicatesKey]) {
this.textBufferStack[depth] = [];
}
}
}
/**
* Emit the given object for the given predicates.
* @param itemScope The current item scope.
* @param predicates An array of predicates.
* @param object An object.
* @param reverse If the triples should be reversed.
*/
emitPredicateTriples(itemScope, predicates, object, reverse) {
if (!itemScope.blockEmission) {
for (const predicate of predicates) {
if (reverse) {
// Literals can not exist in subject position, so they must be ignored.
if (object.termType !== 'Literal') {
this.emitTriple(object, predicate, itemScope.subject);
}
}
else {
this.emitTriple(itemScope.subject, predicate, object);
}
}
}
}
/**
* Emit the given triple to the stream.
* @param {Quad_Subject} subject A subject term.
* @param {Quad_Predicate} predicate A predicate term.
* @param {Quad_Object} object An object term.
*/
emitTriple(subject, predicate, object) {
this.push(this.util.dataFactory.quad(subject, predicate, object, this.defaultGraph));
}
/**
* Attempt to emit all pending itemrefs for the given reference.
* @param reference An item reference id.
* @param itemScopeDomain An optional item scope. If defined, only refs from this scope will be emitted.
*/
tryToEmitReferences(reference, itemScopeDomain) {
const range = this.pendingItemRefsRangeFinalized[reference];
if (range) {
// Determine the item scope domains to emit
let applicableItemScopes;
if (itemScopeDomain) {
applicableItemScopes = [itemScopeDomain];
// Remove the item from the pending array
// Element is guaranteed to exist in buffer
const itemScopeDomainIndex = this.pendingItemRefsDomain[reference].indexOf(itemScopeDomain);
this.pendingItemRefsDomain[reference].splice(itemScopeDomainIndex, 1);
}
else {
applicableItemScopes = this.pendingItemRefsDomain[reference];
// Remove all items from the pending array
delete this.pendingItemRefsDomain[reference];
}
if (applicableItemScopes) {
// Save the stack state
const itemScopeStackOld = this.itemScopeStack;
const textBufferStackOld = this.textBufferStack;
this.isEmittingReferences = true;
// For all applicable item scopes, emit the buffered events.
for (const itemScope of applicableItemScopes) {
this.itemScopeStack = [itemScope];
this.textBufferStack = [undefined];
const pendingIds = [...range.ids];
this.emittingReferencesItemScopeIdGenerator = () => pendingIds.shift();
for (const event of range.events) {
switch (event.type) {
case 'open':
this.onTagOpen(event.name, event.attributes);
break;
case 'text':
this.onText(event.data);
break;
case 'close':
this.onTagClose();
break;
}
}
}
// Restore the stack state
this.emittingReferencesItemScopeIdGenerator = undefined;
this.itemScopeStack = itemScopeStackOld;
this.textBufferStack = textBufferStackOld;
this.isEmittingReferences = false;
}
}
}
}
exports.MicrodataRdfParser = MicrodataRdfParser;
// Property handlers, listed in the order in which handleItemProperties consults them.
// Each pair below is passed unchanged as the two constructor arguments
// (presumably tag name and attribute name — confirm against the handler classes).
MicrodataRdfParser.ITEM_PROPERTY_HANDLERS = [
    new ItemPropertyHandlerContent_1.ItemPropertyHandlerContent(),
    ...[
        ['a', 'href'],
        ['area', 'href'],
        ['audio', 'src'],
        ['embed', 'src'],
        ['iframe', 'src'],
        ['img', 'src'],
        ['link', 'href'],
        ['object', 'data'],
        ['source', 'src'],
        ['track', 'src'],
        ['video', 'src'],
    ].map(([first, second]) => new ItemPropertyHandlerUrl_1.ItemPropertyHandlerUrl(first, second)),
    ...[
        ['data', 'value'],
        ['meter', 'value'],
    ].map(([first, second]) => new ItemPropertyHandlerNumber_1.ItemPropertyHandlerNumber(first, second)),
    new ItemPropertyHandlerTime_1.ItemPropertyHandlerTime(),
];
//# sourceMappingURL=MicrodataRdfParser.js.map