UNPKG

geneea-nlp-client

Version:

The TypeScript Client for Geneea Interpretor G3 API.

362 lines (361 loc) 16.6 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.readFromJson = readFromJson;
const offset_converter_1 = require("../../common/offset-converter");
const sem_ver_1 = require("../../common/sem-ver");
const char_span_1 = require("../../common/char-span");
const __1 = require("../");
const model_1 = require("../model");
const gkb_propert_1 = require("../model/gkb-propert");

/**
 * Reads the G3 object from a JSON object as returned from Geneea G3 API.
 * @param raw Raw JSON object corresponding to the G3 API.
 * @returns G3 object encapsulating the analysis.
 *
 * Note: depending on the requested set of analyses and language support many of the keys can be missing.
 */
// eslint-disable-next-line @typescript-eslint/explicit-module-boundary-types
function readFromJson(raw) {
    // A fresh Reader per call: the reader keeps per-document state (registry, id maps, version).
    return new Reader().fromJson(raw);
}

/** Standard keys used in G3 analysis JSON. */
const STD_KEYS = new Set([
    "id",
    "language",
    "paragraphs",
    "entities",
    "tags",
    "relations",
    "docSentiment",
    "itemSentiments",
    "docVectors",
    "itemVectors",
    "usedChars",
    "metadata",
    "debugInfo",
    "version",
]);

/** Matches a dependency function of the form `<function>:<sub-function>`. */
const RE_FULL_DEP_FNC = /^([a-z]+):([a-z]+)$/i;

/**
 * Returns `value` unless it is `null` or `undefined`, in which case `fallback` is returned.
 * Unlike a truthiness test, values such as `0`, `""` and `false` are kept.
 * @param value Value to test.
 * @param fallback Value used when `value` is `null` or `undefined`.
 * @returns `value` or `fallback`.
 */
function withDefault(value, fallback) {
    return value !== null && value !== undefined ? value : fallback;
}

/**
 * Stateful reader converting one raw G3 analysis JSON into model objects.
 * A single instance accumulates state for a single document; `readFromJson` creates a new one per call.
 */
class Reader {
    constructor() {
        // Note: this implementation is not thread-safe.
        /** Registry to store read objects by their ids; used to resolve references. */
        this.registry = new Map();
        /** Sentiments of individual items keyed by the item id; filled from `itemSentiments`. */
        this.id2sentiment = new Map();
        /** Vectors of individual items keyed by the item id; filled from `itemVectors`. */
        this.id2vectors = new Map();
        /** Tecto tokens waiting for their entity mention, keyed by the mention id. */
        this.mentionId2tectoToken = new Map();
        /** Mentions waiting for the entity they are derived from, keyed by the entity id. */
        this.entityId2derivedMentions = new Map();
        /** Version of the JSON being read. */
        this.version = new sem_ver_1.SemVer(3, 3, 0);
    }

    /**
     * Stores an object in the registry under its `id`.
     * @param obj Object with an `id` property.
     */
    register(obj) {
        this.registry.set(obj.id, obj);
    }

    /**
     * Resolves a string id into the associated object.
     * @param id String id to resolve.
     * @returns The object associated with the id.
     * @throws Throws an error when the id does not map to any object.
     */
    resolveId(id) {
        const obj = this.registry.get(id);
        if (obj === undefined) {
            throw new Error(`Unknown object ID used as a reference ${id}.`);
        }
        return obj;
    }

    /**
     * Resolves a list of ids into the associated objects.
     * @param ids Ids to resolve.
     * @returns Resolved objects, in the order of `ids`.
     * @throws Throws an error when any id does not map to an object.
     */
    resolveIds(ids) {
        return ids
            .map((id) => this.resolveId(id))
            .filter((o) => o !== null);
    }

    /**
     * Resolves the id stored under `raw[key]`.
     * @param raw Raw JSON object.
     * @param key Name of the property holding the id.
     * @returns The referenced object, or `null` when `raw[key]` is falsy.
     */
    readAndResolveId(raw, key) {
        return raw[key] ? this.resolveId(raw[key]) : null;
    }

    /**
     * Resolves the list of ids stored under `raw[key]`.
     * @param raw Raw JSON object.
     * @param key Name of the property holding the array of ids.
     * @returns The referenced objects.
     */
    readAndResolveIds(raw, key) {
        return this.resolveIds(raw[key]);
    }

    /**
     * Parses the version of the analysis and stores it in `this.version`.
     * @param version Version string from the analysis; a missing version is treated as "3.0.0".
     * @throws Throws an error when the major version is higher than 3.
     */
    checkVersion(version) {
        // 3.0.0 was not specified
        const semVer = sem_ver_1.SemVer.valueOf(withDefault(version, "3.0.0"));
        if (semVer.major > 3) {
            throw new Error(`Unsupported API version ${version}, major ver.num > 3.`);
        }
        else if (semVer.isSameOrNewerThan(new sem_ver_1.SemVer(3, 4, 0))) {
            console.log(`Reading analysis with version ${version} higher than ${this.version} is only partially supported.`);
        }
        this.version = semVer;
    }

    /** Builds a Sentiment from its raw JSON form. */
    readSentiment(raw) {
        return new model_1.Sentiment(raw.mean, raw.label, raw.positive, raw.negative);
    }

    /** Builds a Vector from its raw JSON form. */
    readVector(raw) {
        return new model_1.Vector(raw.name, raw.version, raw.values);
    }

    /** Builds a list of Vectors from a raw JSON array. */
    readVectors(raw) {
        return raw.map((v) => this.readVector(v));
    }

    /** Converts a raw features object into a Map of its own enumerable entries. */
    readFeats(raw) {
        return new Map(Object.entries(raw));
    }

    /** Builds a GkbProperty from its raw JSON form. */
    readGkbProperty(raw) {
        return new gkb_propert_1.GkbProperty(raw.name, raw.label, raw.valueGkbId, raw.boolValue, raw.floatValue, raw.intValue, raw.strValue);
    }

    /**
     * Reads a single token and registers it.
     * @param raw Raw JSON token.
     * @param tokenIdx Index of the token within its sentence.
     * @param useOrigTextField When true, `text`/`off` hold the (corrected) text and
     *   `origText`/`origOff` the original one; otherwise `corrText`/`corrOff` hold the
     *   corrected text and `text`/`off` the original one.
     * @returns The token.
     */
    readToken(raw, tokenIdx, useOrigTextField) {
        let text;
        let origText;
        let off;
        let origOff;
        if (useOrigTextField) {
            text = raw.text;
            off = this.offMap.get(raw.off);
            origText = withDefault(raw.origText, text);
            // Explicit null/undefined check: an original offset of 0 is a valid value
            // and must not fall back to the offset within the corrected text.
            origOff = raw.origOff !== null && raw.origOff !== undefined
                ? this.offMap.getOrig(raw.origOff)
                : off;
        }
        else {
            text = raw.corrText;
            off = this.offMap.get(raw.corrOff);
            origText = raw.text;
            origOff = this.offMap.getOrig(raw.off);
        }
        const pos = raw.pos ? (0, __1.UPosFromStr)(raw.pos) : null;
        const fncStr = raw.fnc;
        let fnc = null;
        let subFnc = null;
        if (fncStr) {
            if (fncStr.toUpperCase() === "CLAUSE") {
                // "clause" is read as the root dependency
                fnc = __1.UDep.ROOT;
            }
            else {
                const matches = fncStr.match(RE_FULL_DEP_FNC);
                if (matches !== null) {
                    // "<function>:<sub-function>"
                    fnc = (0, __1.UDepFromStr)(matches[1]);
                    subFnc = matches[2];
                }
                else {
                    fnc = (0, __1.UDepFromStr)(fncStr);
                }
            }
        }
        const tok = model_1.Token.of(raw.id, tokenIdx, // sentence based index
        text, char_span_1.CharSpan.withLen(off, text.length), origText, char_span_1.CharSpan.withLen(origOff, origText.length), raw.dLemma, raw.lemma, pos, raw.mTag, raw.feats ? this.readFeats(raw.feats) : null, fnc, subFnc);
        this.register(tok);
        return tok;
    }

    /**
     * Reads a single tecto token and registers it.
     * The entity mention of the token is filled in later by `fromJson`.
     * @param raw Raw JSON tecto token.
     * @param tokenIdx Index of the tecto token within its sentence.
     * @returns The tecto token.
     */
    readTectoToken(raw, tokenIdx) {
        const fnc = raw.fnc !== null && raw.fnc !== undefined
            ? raw.fnc
            : (0, __1.UDepToStr)(__1.UDep.DEP);
        const tt = model_1.TectoToken.of(raw.id, tokenIdx, fnc, raw.lemma, raw.feats ? this.readFeats(raw.feats) : null, this.readAndResolveIds(raw, "tokenIds"), null, // EntityMention will be filled later
        null);
        if (raw.entityMentionId) {
            const waiting = this.mentionId2tectoToken.get(raw.entityMentionId);
            if (waiting) {
                waiting.push(tt);
            }
            else {
                this.mentionId2tectoToken.set(raw.entityMentionId, [tt]);
            }
        }
        this.register(tt);
        return tt;
    }

    /**
     * Builds a tree over already registered tokens using the `parId` references of the raw tokens.
     * @param rawTokens Raw JSON tokens (or tecto tokens).
     * @param tokens The corresponding model tokens.
     * @returns The result of `TreeBuilder.build()`.
     */
    createTree(rawTokens, tokens) {
        const tb = new model_1.TreeBuilder().addNodes(tokens);
        for (const rt of rawTokens) {
            if (rt.parId) {
                const parent = this.resolveId(rt.parId);
                const child = this.resolveId(rt.id);
                tb.addDependency(child.idx, parent.idx);
            }
        }
        return tb.build();
    }

    /**
     * Reads a sentence with its tokens and tecto tokens and registers it.
     * Trees are built only when the first token carries a dependency function;
     * a sentence without tokens gets no trees.
     * @param raw Raw JSON sentence.
     * @param useOrigTextField Passed through to `readToken`.
     * @returns The sentence.
     */
    readSentence(raw, useOrigTextField) {
        const rawTokens = raw.tokens;
        const tokens = rawTokens.map((rawTok, idx) => this.readToken(rawTok, idx, useOrigTextField));
        const rawTectoTokens = withDefault(raw.tecto, []);
        const tectoTokens = rawTectoTokens.map((rawTecto, idx) => this.readTectoToken(rawTecto, idx));
        let tree = null;
        let tectoTree = null;
        // Guard against an empty sentence before looking at the first token.
        if (tokens.length > 0 && tokens[0].fnc != null) {
            tree = this.createTree(rawTokens, tokens);
            tectoTree = this.createTree(rawTectoTokens, tectoTokens);
        }
        const root = tree != null ? withDefault(tree.root, null) : null;
        const tectoRoot = tectoTree != null ? tectoTree.root : undefined;
        const tectoNodes = tectoTree != null ? tectoTree.tokens : undefined;
        const sent = model_1.Sentence.of(raw.id, root, tokens, tectoRoot, tectoNodes, withDefault(this.id2sentiment.get(raw.id), null), withDefault(this.id2vectors.get(raw.id), null));
        sent.tokens.forEach((t) => (t.sentence = sent));
        const sentTectoTokens = sent.tectoTokens;
        if (sentTectoTokens !== null && sentTectoTokens !== undefined) {
            sentTectoTokens.forEach((tt) => (tt.sentence = sent));
        }
        this.register(sent);
        return sent;
    }

    /**
     * Reads a paragraph with its sentences and registers it.
     * Also sets `this.offMap`, the offset mapping used while reading the paragraph's tokens.
     * @param raw Raw JSON paragraph.
     * @returns The paragraph.
     */
    readParagraph(raw) {
        const useOrigTextField = this.version.isSameOrNewerThan(new sem_ver_1.SemVer(3, 2, 0));
        const hasCodepointOffs = this.version.isSameOrNewerThan(new sem_ver_1.SemVer(3, 2, 1));
        const text = useOrigTextField ? raw.text : raw.corrText;
        const origText = useOrigTextField ? raw.origText : raw.text;
        this.offMap = hasCodepointOffs
            ? new Cp2JSOffsetMapping(text, withDefault(origText, null))
            : new IdentityOffsetMapping();
        const sentences = raw.sentences.map((s) => this.readSentence(s, useOrigTextField));
        const para = model_1.Paragraph.of(raw.id, raw.type, text, origText, sentences, withDefault(this.id2sentiment.get(raw.id), null), withDefault(this.id2vectors.get(raw.id), null));
        para.sentences.forEach((s) => (s.paragraph = para));
        this.register(para);
        return para;
    }

    /**
     * Reads an entity mention and registers it.
     * The entity the mention is derived from is filled in later by `fromJson`.
     * @param raw Raw JSON entity mention.
     * @returns The entity mention.
     */
    readEntityMention(raw) {
        const men = model_1.EntityMention.of(raw.id, raw.text, raw.mwl, this.readAndResolveIds(raw, "tokenIds"), raw.feats ? this.readFeats(raw.feats) : null, null, // derivedFrom; will be filled later
        withDefault(this.id2sentiment.get(raw.id), null), withDefault(this.id2vectors.get(raw.id), null));
        const entityId = raw.derivedFromEntityId;
        if (entityId) {
            const derivedMentions = this.entityId2derivedMentions.get(entityId);
            if (derivedMentions) {
                derivedMentions.push(men);
            }
            else {
                this.entityId2derivedMentions.set(entityId, [men]);
            }
        }
        this.register(men);
        return men;
    }

    /**
     * Reads an entity with its mentions and registers it.
     * GKB properties are read only for versions 3.3.0+ and only when `gkbProperties` is an array.
     * @param raw Raw JSON entity.
     * @returns The entity.
     */
    readEntity(raw) {
        const hasGkbProps = this.version.isSameOrNewerThan(new sem_ver_1.SemVer(3, 3, 0)) &&
            Array.isArray(raw.gkbProperties);
        const ent = model_1.Entity.of(raw.id, raw.gkbId, raw.stdForm, raw.type, raw.mentions ? raw.mentions.map((m) => this.readEntityMention(m)) : null, raw.feats ? this.readFeats(raw.feats) : null, withDefault(this.id2sentiment.get(raw.id), null), withDefault(this.id2vectors.get(raw.id), null), hasGkbProps ? raw.gkbProperties.map((p) => this.readGkbProperty(p)) : []);
        ent.mentions.forEach((m) => (m.mentionOf = ent));
        this.register(ent);
        return ent;
    }

    /**
     * Reads a tag mention and registers it.
     * @param raw Raw JSON tag mention.
     * @returns The tag mention.
     */
    readTagMention(raw) {
        const men = model_1.TagMention.of(raw.id, this.readAndResolveIds(raw, "tokenIds"), raw.feats ? this.readFeats(raw.feats) : null, withDefault(this.id2sentiment.get(raw.id), null), withDefault(this.id2vectors.get(raw.id), null));
        this.register(men);
        return men;
    }

    /**
     * Reads a tag with its mentions and registers it.
     * GKB properties are read only for versions 3.3.0+ and only when `gkbProperties` is an array.
     * @param raw Raw JSON tag.
     * @returns The tag.
     */
    readTag(raw) {
        const hasGkbProps = this.version.isSameOrNewerThan(new sem_ver_1.SemVer(3, 3, 0)) &&
            Array.isArray(raw.gkbProperties);
        const tag = model_1.Tag.of(raw.id, raw.gkbId, raw.stdForm, raw.type, raw.relevance, raw.mentions ? raw.mentions.map((m) => this.readTagMention(m)) : null, raw.feats ? this.readFeats(raw.feats) : null, withDefault(this.id2sentiment.get(raw.id), null), withDefault(this.id2vectors.get(raw.id), null), hasGkbProps ? raw.gkbProperties.map((p) => this.readGkbProperty(p)) : []);
        tag.mentions.forEach((m) => (m.mentionOf = tag));
        this.register(tag);
        return tag;
    }

    /** Builds a RelationArgument; its entity is resolved from `entityId` (null when absent). */
    readRelationArgument(raw) {
        return new model_1.RelationArgument(raw.name, raw.type, this.readAndResolveId(raw, "entityId"));
    }

    /** Builds a RelationSupport from `tokenIds` and the optional `tectoId`. */
    readRelationSupport(raw) {
        return model_1.RelationSupport.of(this.readAndResolveIds(raw, "tokenIds"), this.readAndResolveId(raw, "tectoId"));
    }

    /**
     * Reads a relation and registers it.
     * @param raw Raw JSON relation.
     * @returns The relation.
     */
    readRelation(raw) {
        const args = withDefault(raw.args, []).map((obj) => this.readRelationArgument(obj));
        const support = raw.support
            ? raw.support.map((obj) => this.readRelationSupport(obj))
            : null;
        const rel = model_1.Relation.of(raw.id, raw.textRepr, raw.name, raw.type, args, support, raw.feats ? this.readFeats(raw.feats) : null, withDefault(this.id2sentiment.get(raw.id), null), withDefault(this.id2vectors.get(raw.id), null));
        this.register(rel);
        return rel;
    }

    /**
     * Reads the metadata of the analysis.
     * For versions older than 3.1.0 without a `metadata` key, all non-standard top-level keys
     * are collected as metadata; otherwise non-standard keys only produce a warning.
     * @param raw Raw JSON analysis.
     * @returns Map with the metadata or `null`.
     */
    readMetadata(raw) {
        const useTopLevelMetadata = this.version.isOlderThan(new sem_ver_1.SemVer(3, 1, 0));
        let metadata = raw.metadata ? new Map(Object.entries(raw.metadata)) : null;
        const unknownKeys = Object.keys(raw).filter((k) => !STD_KEYS.has(k));
        if (unknownKeys.length !== 0) {
            if (useTopLevelMetadata && metadata === null) {
                const collected = new Map();
                unknownKeys.forEach((k) => collected.set(k, raw[k]));
                metadata = collected;
            }
            else {
                console.warn(`[Warning] unrecognized fields in the analysis: ${unknownKeys}`);
            }
        }
        return metadata;
    }

    /**
     * Reads the Analysis object from a JSON object as returned from Geneea G3 API.
     * @param rawAnalysis Object corresponding to a G3 API JSON.
     * @returns An Analysis object encapsulating the NLP analysis.
     */
    fromJson(rawAnalysis) {
        const ra = rawAnalysis;
        this.checkVersion(ra.version);
        // store item sentiment to fill it in when the relevant items are constructed
        if (ra.itemSentiments) {
            Object.entries(ra.itemSentiments).forEach(([id, obj]) => {
                this.id2sentiment.set(id, this.readSentiment(obj));
            });
        }
        // store item vectors to fill them in when the relevant items are constructed
        if (ra.itemVectors) {
            Object.entries(ra.itemVectors).forEach(([id, vectors]) => {
                this.id2vectors.set(id, vectors.map((v) => this.readVector(v)));
            });
        }
        const docId = withDefault(ra.id, null);
        // ISO 639-2 for Undetermined language.
        const detected = ra.language !== null && ra.language !== undefined
            ? ra.language.detected
            : undefined;
        const language = new model_1.Language(withDefault(detected, "und"));
        // Order matters: later items reference earlier ones through the registry.
        const paragraphs = (ra.paragraphs || []).map((raw) => this.readParagraph(raw));
        const entities = (ra.entities || []).map((raw) => this.readEntity(raw));
        const tags = (ra.tags || []).map((raw) => this.readTag(raw));
        const relations = (ra.relations || []).map((raw) => this.readRelation(raw));
        const docSentiment = ra.docSentiment ? this.readSentiment(ra.docSentiment) : null;
        const docVectors = ra.docVectors ? this.readVectors(ra.docVectors) : null;
        const usedChars = withDefault(ra.usedChars, null);
        const metadata = this.readMetadata(ra);
        const debugInfo = withDefault(ra.debugInfo, null);
        const analysis = new model_1.Analysis(docId, language, paragraphs, docSentiment, entities, tags, relations, docVectors, usedChars, metadata, debugInfo);
        analysis.paragraphs.forEach((x) => (x.container = analysis));
        // fill derived-from entities for mentions
        this.entityId2derivedMentions.forEach((mentions, id) => {
            const entity = this.resolveId(id);
            mentions.forEach((m) => (m.derivedFrom = entity));
        });
        // fill tecto-token entity mention
        this.mentionId2tectoToken.forEach((tokens, id) => {
            const mention = this.resolveId(id);
            tokens.forEach((tt) => {
                tt.entityMention = mention;
                tt.entity = mention.mentionOf;
            });
        });
        return analysis;
    }
}

/** Encapsulates mapping of offsets from code-points to JavaScript string indices. */
class Cp2JSOffsetMapping {
    /**
     * @param text Text the `get` offsets refer to.
     * @param origText Text the `getOrig` offsets refer to; `null` means the same text as `text`.
     */
    constructor(text, origText) {
        const convForText = new offset_converter_1.Cp2JSOffsetConverter(text);
        const convForOrigText = origText === null
            ? convForText
            : new offset_converter_1.Cp2JSOffsetConverter(origText);
        this.get = (cpOff) => convForText.convert(cpOff);
        this.getOrig = (cpOff) => convForOrigText.convert(cpOff);
    }
}

/** Offset mapping returning every offset unchanged. */
class IdentityOffsetMapping {
    get(cpOff) {
        return cpOff;
    }
    getOrig(cpOff) {
        return cpOff;
    }
}