UNPKG

extract-cbd-shape

Version:

Extract an entity based on CBD and a SHACL shape

458 lines (400 loc) 13.4 kB
import {rdfDereferencer, RdfDereferencer} from "rdf-dereference"; import {NodeLink, RDFMap, ShapeTemplate} from "./Shape"; import {Path} from "./Path"; import {DataFactory} from "rdf-data-factory"; import {RdfStore} from "rdf-stores"; import {Quad, Term} from "@rdfjs/types"; import debug from "debug"; import {ShapesGraph} from "./ShapesGraph"; const log = debug("extract-cbd-shape"); const df = new DataFactory(); class DereferenceNeeded { target: string; msg?: string; constructor(target: string, msg?: string) { this.target = target; this.msg = msg; } } type CBDShapeExtractorOptions = { cbdDefaultGraph: boolean; fetch?: typeof fetch; }; /** * Usage: * import {ShapeExtractor} from "extract-cbd-shape"; * ... * let shapeExtractor = new ShapeExtractor(shape, dereferencer); * let entityquads = await shapeExtractor.extract(store, entity); */ export class CBDShapeExtractor { dereferencer: RdfDereferencer; shapesGraph?: ShapesGraph; options: CBDShapeExtractorOptions; constructor( shapesGraphStore?: RdfStore, dereferencer?: RdfDereferencer<Quad>, options: Partial<CBDShapeExtractorOptions> = {}, ) { // Assign with default options this.options = Object.assign({cbdDefaultGraph: false}, options); if (!dereferencer) { this.dereferencer = rdfDereferencer; } else { this.dereferencer = dereferencer; } //Pre-process shape if (shapesGraphStore) { this.shapesGraph = new ShapesGraph(shapesGraphStore); } } public async bulkExtract( store: RdfStore, ids: Array<Term>, shapeId?: Term, graphsToIgnore?: Array<Term>, itemExtracted?: (member: { subject: Term; quads: Quad[] }) => void, ): Promise<Array<{ subject: Term; quads: Quad[] }>> { const out: Array<{ subject: Term; quads: Quad[] }> = []; const idSet = new Set(ids.map((x) => x.value)); const memberSpecificQuads: { [id: string]: Array<Quad> } = {}; for (let id of ids) { memberSpecificQuads[id.value] = []; } const newStore = RdfStore.createDefault(); for (let quad of store.readQuads(null, null, null, null)) { if (quad.graph.termType == "NamedNode" && idSet.has(quad.graph.value)) { memberSpecificQuads[quad.graph.value].push(quad); } else { newStore.addQuad(quad); } } const promises = []; for (let id of ids) { const promise = this.extract( newStore, id, shapeId, (graphsToIgnore || []).slice(), ).then((quads) => { quads.push(...memberSpecificQuads[id.value]); if (itemExtracted) { itemExtracted({subject: id, quads}); } out.push({subject: id, quads}); }); promises.push(promise); } await Promise.all(promises); return out; } /** * Extracts: * * first level quads, * * their blank nodes with their quads (recursively), * * all quads in the namedgraph of this entity, * * all quads of required paths found in the shape * * the same algorithm on top of all found node links * @param store The RdfStore loaded with a set of initial quads * @param id The entity to be described/extracted * @param shapeId The optional SHACL NodeShape identifier * @param graphsToIgnore The optional parameter of graph to ignore when other entities are mentioned in the current context * @returns Promise of a quad array of the described entity */ public async extract( store: RdfStore, id: Term, shapeId?: Term, graphsToIgnore?: Array<Term>, ): Promise<Array<Quad>> { // First extract everything except for something within the graphs to ignore, or within the graph of the current entity, as that’s going to be added anyway later on let dontExtractFromGraph: Array<string> = ( graphsToIgnore ? graphsToIgnore : [] ).map((item) => { return item.value; }); const extractInstance = new ExtractInstance( store, this.dereferencer, dontExtractFromGraph, this.options, this.shapesGraph, ); return await extractInstance.extract(id, false, shapeId); } } export type Extracted = { forwards: { [node: string]: Extracted; }; backwards: { [node: string]: Extracted; }; }; export type ExtractReasons = { cbd: boolean; shape: boolean; }; export class CbdExtracted { topology: Extracted; cbdExtractedMap: RDFMap<ExtractReasons>; constructor( topology?: Extracted, cbdExtracted: RDFMap<ExtractReasons> = new RDFMap(), ) { if (topology) { this.topology = topology; } else { this.topology = {forwards: {}, backwards: {}}; } this.cbdExtractedMap = cbdExtracted; } addCBDTerm(term: Term) { const t = this.cbdExtractedMap.get(term); if (t) { t.cbd = true; } else { this.cbdExtractedMap.set(term, {cbd: true, shape: false}); } } addShapeTerm(term: Term) { const t = this.cbdExtractedMap.get(term); if (t) { t.shape = true; } else { this.cbdExtractedMap.set(term, {cbd: true, shape: false}); } } cbdExtracted(term: Term): boolean { return !!this.cbdExtractedMap.get(term)?.shape; } push(term: Term, inverse: boolean): CbdExtracted { if (inverse) { if (!this.topology.backwards[term.value]) { const ne: Extracted = { forwards: {}, backwards: {}, }; ne.forwards[term.value] = this.topology; this.topology.backwards[term.value] = ne; } return new CbdExtracted( this.topology.backwards[term.value], this.cbdExtractedMap, ); } else { if (!this.topology.forwards[term.value]) { const ne: Extracted = { forwards: {}, backwards: {}, }; ne.backwards[term.value] = this.topology; this.topology.forwards[term.value] = ne; } return new CbdExtracted( this.topology.forwards[term.value], this.cbdExtractedMap, ); } } enter(term: Term, inverse: boolean): CbdExtracted | undefined { const out = inverse ? this.topology.backwards[term.value] : this.topology.forwards[term.value]; if (out) { return new CbdExtracted(out, this.cbdExtractedMap); } } } class ExtractInstance { dereferenced: Set<string> = new Set(); store: RdfStore; dereferencer: RdfDereferencer; options: CBDShapeExtractorOptions; graphsToIgnore: string[]; shapesGraph?: ShapesGraph; constructor( store: RdfStore, dereferencer: RdfDereferencer, graphsToIgnore: string[], options: CBDShapeExtractorOptions, shapesGraph?: ShapesGraph, ) { this.store = store; this.dereferencer = dereferencer; this.shapesGraph = shapesGraph; this.graphsToIgnore = graphsToIgnore; this.options = options; } public async extract( id: Term, offline: boolean, shapeId?: Term | ShapeTemplate, ) { const result = await this.maybeExtractRecursively( id, new CbdExtracted(), offline, shapeId, ); result.push(...this.store.getQuads(null, null, null, id)); if (result.length === 0) { if (await this.dereference(id.value)) { // retry const result = await this.maybeExtractRecursively( id, new CbdExtracted(), offline, shapeId, ); return result.filter((value: Quad, index: number, array: Quad[]) => { return index === array.findIndex((x) => x.equals(value)); }); } } return result.filter((value: Quad, index: number, array: Quad[]) => { return index === array.findIndex((x) => x.equals(value)); }); } private async dereference(url: string): Promise<boolean> { if (this.dereferenced.has(url)) { log("Will not dereference " + url + " again"); return false; } this.dereferenced.add(url); await this.loadQuadStreamInStore( ( await this.dereferencer.dereference(url, { fetch: this.options.fetch, }) ).data, ); return true; } private async maybeExtractRecursively( id: Term, extracted: CbdExtracted, offline: boolean, shapeId?: Term | ShapeTemplate, ): Promise<Array<Quad>> { if (extracted.cbdExtracted(id)) { return []; } extracted.addShapeTerm(id); return this.extractRecursively(id, extracted, offline, shapeId); } private async extractRecursively( id: Term, extracted: CbdExtracted, offline: boolean, shapeId?: Term | ShapeTemplate, ): Promise<Array<Quad>> { const result: Quad[] = []; let shape: ShapeTemplate | undefined; if (shapeId instanceof ShapeTemplate) { shape = shapeId; } else if (shapeId && this.shapesGraph) { shape = this.shapesGraph.shapes.get(shapeId); } if (!shape?.closed) { this.CBD(id, result, extracted, this.graphsToIgnore); } // Next, on our newly fetched data, // we’ll need to process all paths of the shape. If the shape is open, we’re going to do CBD afterwards, so let’s omit paths with only a PredicatePath when the shape is open if (!!shape) { //For all valid items in the atLeastOneLists, process the required path, optional paths and nodelinks. Do the same for the atLeastOneLists inside these options. let extraPaths: Path[] = []; let extraNodeLinks: NodeLink[] = []; // Process atLeastOneLists in extraPaths and extra NodeLinks shape.fillPathsAndLinks(extraPaths, extraNodeLinks); for (let path of shape.requiredPaths.concat( shape.optionalPaths, extraPaths, )) { if (!path.found(extracted) || shape.closed) { let pathQuads = path .match(this.store, extracted, id, this.graphsToIgnore) .flatMap((pathResult) => { return pathResult.path; }); result.push(...pathQuads); } } for (let nodeLink of shape.nodeLinks.concat(extraNodeLinks)) { let matches = nodeLink.pathPattern.match( this.store, extracted, id, this.graphsToIgnore, ); // I don't know how to do this correctly, but this is not the way for (let match of matches) { result.push( ...(await this.maybeExtractRecursively( match.target, match.cbdExtracted, offline, nodeLink.link, )), ); } } } if (!offline && id.termType === "NamedNode") { if (shape) { const problems = shape.requiredAreNotPresent(extracted); if (problems) { if (await this.dereference(id.value)) { // retry return this.extractRecursively(id, extracted, offline, shapeId); } else { log( `${ id.value } does not adhere to the shape (${problems.toString()})`, ); } } } } return result; } /** * Performs Concise Bounded Description: extract star-shape and recurses over the blank nodes * @param result list of quads * @param extractedStar topology object to keep track of already found properties * @param store store to use for cbd * @param id starting subject * @param graphsToIgnore */ private CBD( id: Term, result: Quad[], extractedStar: CbdExtracted, graphsToIgnore: Array<string>, ) { extractedStar.addCBDTerm(id); const graph = this.options.cbdDefaultGraph ? df.defaultGraph() : null; const quads = this.store.getQuads(id, null, null, graph); for (const q of quads) { // Ignore quads in the graphs to ignore if (graphsToIgnore?.includes(q.graph.value)) { continue; } result.push(q); const next = extractedStar.push(q.predicate, false); // Conditionally get more quads: if it’s a not yet extracted blank node if ( q.object.termType === "BlankNode" && !extractedStar.cbdExtracted(q.object) ) { this.CBD(q.object, result, next, graphsToIgnore); } } } private loadQuadStreamInStore(quadStream: any) { return new Promise((resolve, reject) => { this.store.import(quadStream).on("end", resolve).on("error", reject); }); } }