UNPKG

@rdfc/sds-storage-writer-ts

Version:

An RDF-Connect processor to write SDS streams into a given storage system

164 lines (163 loc) 6.46 kB
import { PROV, RDF as RDFT, SDS } from "@treecg/types";
import { DataFactory, Parser, Writer } from "n3";
import { getLoggerFor } from "./utils/logUtil.js";
import { Extractor } from "./extractor.js";
import canonize from "rdf-canonize";
import { Lock } from "async-await-mutex-lock";
import { getRepository, } from "./repositories/Repository.js";

const logger = getLoggerFor("ingest");

/**
 * Parse the input with an N3 `Parser` when it is a string; otherwise hand it
 * back untouched.
 *
 * @param {*} data - Either serialized RDF (primitive or boxed string) or an
 *   already-parsed value (the callers below treat it as an array of quads).
 * @returns {*} The parser output for string input, `data` itself otherwise.
 */
function maybe_parse(data) {
    if (typeof data === "string" || data instanceof String) {
        const parse = new Parser();
        return parse.parse(data);
    }
    else {
        return data;
    }
}

/**
 * Collect the quads reachable from `id`, following links in both directions:
 * quads whose subject is `id`, then (recursively) everything reachable from
 * the objects of those quads, then (recursively) everything reachable from the
 * subjects of quads that have `id` as object.
 *
 * Only BlankNode and NamedNode terms are followed. A quad for which any
 * `blacklist` predicate returns truthy is neither emitted nor followed.
 *
 * @param {Array} quads - All quads to search in.
 * @param {*} id - Term to start from; must expose `value` and be usable with
 *   the quads' `equals` methods.
 * @param {Array<Function>} [blacklist=[]] - Predicates called as `(quad, id)`
 *   with the id currently being expanded.
 * @param {Set<string>} [done] - Values of the ids that were already expanded;
 *   shared across the recursion so every id is expanded at most once. This set
 *   is mutated.
 * @returns {Array} The collected quads (a fresh array).
 */
function filterMember(quads, id, blacklist = [], done) {
    const visited = done === undefined ? new Set() : done;
    const quadIsBlacklisted = (q) => blacklist.some((b) => b(q, id));
    const isFollowable = (term) => term.termType === "BlankNode" || term.termType === "NamedNode";
    visited.add(id.value);

    // Quads that describe `id` directly.
    const out = quads.filter((q) => q.subject.equals(id) && !quadIsBlacklisted(q));

    // Derive the outgoing neighbours from the quads selected above instead of
    // filtering the full quad list a second time. This snapshot is taken
    // before anything is appended to `out`.
    const newObjects = out.map((q) => q.object).filter(isFollowable);
    for (const objectId of newObjects) {
        if (visited.has(objectId.value))
            continue;
        out.push(...filterMember(quads, objectId, blacklist, visited));
    }

    // Incoming neighbours: subjects of quads that point at `id`.
    const newSubjects = quads
        .filter((q) => q.object.equals(id) && !quadIsBlacklisted(q))
        .map((q) => q.subject)
        .filter(isFollowable);
    for (const subjectId of newSubjects) {
        if (visited.has(subjectId.value))
            continue;
        out.push(...filterMember(quads, subjectId, blacklist, visited));
    }
    return out;
}

// NOTE(review): this lock is not referenced by any function in this file.
const lock = new Lock();

/**
 * Store the extracted records and register their bucket memberships.
 *
 * The data quads of the extract are serialized once; that same string is
 * passed to `repository.handleRecord` for every record that is not flagged
 * `dataless`. The data bulk is ingested before memberships are handled.
 *
 * @param {*} extract - Extraction result exposing `getData()` and `getRecords()`.
 * @param {*} repository - Repository implementation to write to.
 * @param {*} operations - Index bulk that `repository.handleMember` receives.
 * @returns {Promise<void>}
 */
async function handleRecords(extract, repository, operations) {
    const dataSer = new Writer().quadsToString(extract.getData());
    const records = extract.getRecords();
    const dataOperations = repository.prepareDataBulk();
    for (const rec of records) {
        if (!rec.dataless) {
            await repository.handleRecord(rec, dataSer, dataOperations);
        }
    }
    await repository.ingestDataBulk(dataOperations);
    for (const rec of records) {
        for (const bucket of rec.buckets) {
            await repository.handleMember(rec, bucket, operations);
        }
    }
}

/**
 * Hand every extracted bucket to `repository.handleBucket`.
 *
 * Falsy `root` and `immutable` properties are deleted from the bucket object
 * (in place) before it is passed on, so the repository only sees these keys
 * when they are truthy.
 *
 * @param {*} extract - Extraction result exposing `getBuckets()`.
 * @param {*} repository - Repository implementation to write to.
 * @param {*} operations - Index bulk that `repository.handleBucket` receives.
 * @returns {Promise<void>}
 */
async function handleBuckets(extract, repository, operations) {
    const buckets = extract.getBuckets();
    for (const bucket of buckets) {
        if (!bucket.root) {
            delete bucket.root;
        }
        if (!bucket.immutable) {
            delete bucket.immutable;
        }
        await repository.handleBucket(bucket, operations);
    }
}

const df = DataFactory;

/**
 * Canonicalize an RDF "thing" (`{ id, quads }`) with the RDFC-1.0 algorithm.
 *
 * The input to the canonicalization is one anchor quad
 * `<> <http://purl.org/dc/terms/subject> thing.id` followed by the thing's
 * quads, each rebuilt from subject, predicate and object only (the original
 * graph term is not carried over).
 *
 * @param {*} thing - Object with `id` and `quads`, or a falsy value.
 * @returns {Promise<*>} Whatever `canonize.canonize` resolves to (presumably
 *   the canonical serialization; confirm against rdf-canonize), or `undefined`
 *   when `thing` is falsy.
 */
async function pathString(thing) {
    if (!thing) {
        return;
    }
    const quads = [
        df.quad(df.namedNode(""), df.namedNode("http://purl.org/dc/terms/subject"), thing.id),
        ...thing.quads.map((x) => df.quad(x.subject, x.predicate, x.object)),
    ];
    const canonical = await canonize.canonize(quads, { algorithm: "RDFC-1.0" });
    return canonical;
}

/**
 * Apply relation removals first, then relation additions/updates.
 *
 * For every relation the canonical form of its `path` and `value` is computed
 * with `pathString` and passed along to the repository.
 *
 * @param {*} extract - Extraction result exposing `getRemoveRelations()` and
 *   `getRelations()`.
 * @param {*} repository - Repository implementation to write to.
 * @param {*} operations - Index bulk that the repository calls receive.
 * @returns {Promise<void>}
 */
async function handleRelations(extract, repository, operations) {
    const removeRelations = extract.getRemoveRelations();
    for (const rel of removeRelations) {
        const pathValue = await pathString(rel.path);
        const valueValue = await pathString(rel.value);
        await repository.removeRelation(rel, pathValue, valueValue, operations);
    }
    const relations = extract.getRelations();
    for (const rel of relations) {
        const pathValue = await pathString(rel.path);
        const valueValue = await pathString(rel.value);
        await repository.handleRelation(rel, pathValue, valueValue, operations);
    }
}

/**
 * Ingest the description of every `sds:Stream` found in the metadata.
 *
 * For each stream, `filterMember` gathers the connected quads, skipping
 * `prov:used` and `sds:dataset` quads that point at the id being expanded.
 * The result is serialized and stored via `repository.ingestMetadata`.
 *
 * @param {*} metadata - Serialized RDF or an array of quads.
 * @param {*} repository - Repository implementation to write to.
 * @param {boolean} ingestMetadata - When falsy, an error is logged and nothing
 *   is ingested.
 * @returns {Promise<void>}
 */
async function handleMetadata(metadata, repository, ingestMetadata) {
    // Check the flag before parsing so no work is spent on input that is
    // going to be dropped anyway.
    if (!ingestMetadata) {
        logger.error("[setup_metadata] Cannot handle metadata, repository is closed");
        return;
    }
    const meta = maybe_parse(metadata);
    const streams = meta
        .filter((q) => q.predicate.equals(RDFT.terms.type) && q.object.equals(SDS.terms.Stream))
        .map((q) => q.subject);
    for (const streamId of streams) {
        const streamMember = filterMember(meta, streamId, [
            (q, id) => q.predicate.equals(PROV.terms.used) && q.object.equals(id),
            (q, id) => q.predicate.equals(SDS.terms.dataset) && q.object.equals(id),
        ]);
        const ser = new Writer().quadsToString(streamMember);
        await repository.ingestMetadata(SDS.Stream, streamId.value, ser);
    }
}

/**
 * Wire a data stream and a metadata stream to a repository.
 *
 * Opens the repository, attaches the metadata handler (and immediately
 * processes `metadata.lastElement` when present), creates the indices and
 * attaches the data handler. The repository is closed once both streams have
 * emitted "end".
 *
 * @param {*} data - Reader exposing `on("end", cb)` and `data(handler)`;
 *   every element is serialized RDF or an array of quads.
 * @param {*} metadata - Reader exposing `on("end", cb)`, `data(handler)` and
 *   optionally `lastElement`.
 * @param {*} database - Passed to `getRepository` to select the repository.
 * @returns {Promise<void>} Resolves once all handlers are attached.
 */
export async function ingest(data, metadata, database) {
    const repository = getRepository(database);
    await repository.open();
    const dbFragmentations = await repository.findMetadataFragmentations();
    logger.debug(`Found ${dbFragmentations.length} fragmentations (${dbFragmentations.map((x) => x.id.value)})`);

    // Both flags flip to false when the corresponding stream ends; `closed`
    // makes sure the repository is closed only once.
    let ingestMetadata = true;
    let ingestData = true;
    let closed = false;
    const closeRepository = () => {
        if (!ingestMetadata && !ingestData && !closed) {
            logger.info("Closing repository client connection");
            closed = true;
            return repository.close();
        }
    };
    data.on("end", async () => {
        ingestData = false;
        return await closeRepository();
    });
    metadata.on("end", async () => {
        ingestMetadata = false;
        return await closeRepository();
    });

    metadata.data(async (meta) => await handleMetadata(meta, repository, ingestMetadata));
    if (metadata.lastElement) {
        await handleMetadata(metadata.lastElement, repository, ingestMetadata);
    }
    logger.debug("Attached metadata handler");

    await repository.createIndices();
    const extractor = new Extractor();
    data.data(async (input) => {
        // Check the flag before parsing so no work is spent on input that is
        // going to be dropped anyway.
        if (!ingestData) {
            logger.error("Cannot handle data, repository is closed");
            return;
        }
        const quads = maybe_parse(input);
        logger.debug(`Handling ingest for '${quads.find((q) => q.predicate.equals(SDS.terms.payload))?.object?.value}'`);
        const extract = extractor.extract_quads(quads);
        // Records, relations and buckets of one input share a single index
        // bulk that is flushed at the end.
        const indexOperations = repository.prepareIndexBulk();
        await handleRecords(extract, repository, indexOperations);
        await handleRelations(extract, repository, indexOperations);
        await handleBuckets(extract, repository, indexOperations);
        await repository.ingestIndexBulk(indexOperations);
    });
    logger.debug("Attached data handler");
}