geneea-nlp-client
Version:
The TypeScript Client for Geneea Interpretor G3 API.
362 lines (361 loc) • 16.6 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.readFromJson = readFromJson;
const offset_converter_1 = require("../../common/offset-converter");
const sem_ver_1 = require("../../common/sem-ver");
const char_span_1 = require("../../common/char-span");
const __1 = require("../");
const model_1 = require("../model");
const gkb_propert_1 = require("../model/gkb-propert");
/**
 * Builds the analysis model from a raw JSON object returned by the Geneea G3 API.
 *
 * A new `Reader` is created for every call, so no state is shared between documents.
 *
 * @param raw Raw JSON object corresponding to the G3 API.
 * @returns G3 object encapsulating the analysis.
 *
 * Note: depending on the requested set of analyses and language support many of the keys can be missing.
 */
// eslint-disable-next-line @typescript-eslint/explicit-module-boundary-types
function readFromJson(raw) {
    const reader = new Reader();
    return reader.fromJson(raw);
}
/**
 * Standard keys used in G3 analysis JSON.
 * Top-level keys outside this set are reported as unknown by `Reader.readMetadata`.
 */
const STD_KEYS = new Set([
    // document identity
    "id", "language",
    // linguistic content
    "paragraphs", "entities", "tags", "relations",
    // sentiment and vectors (document-level and per item)
    "docSentiment", "itemSentiments", "docVectors", "itemVectors",
    // bookkeeping
    "usedChars", "metadata", "debugInfo", "version",
]);
/**
 * Matches a dependency function written as `<function>:<sub-function>` (letters only,
 * case-insensitive); group 1 is the function, group 2 the sub-function.
 */
const RE_FULL_DEP_FNC = /^([a-z]+):([a-z]+)$/i;
/**
 * Stateful reader that converts one raw G3 analysis JSON into model objects.
 *
 * The instance accumulates per-document state (id registry, item sentiments and vectors,
 * pending cross-references), which is why `readFromJson` creates a new reader per document.
 */
class Reader {
    constructor() {
        // Note: this implementation is not thread-safe.
        /** Registry to store read objects by their ids; used to resolve references. */
        this.registry = new Map();
        /** Item id -> sentiment, filled from `itemSentiments` before the items are read. */
        this.id2sentiment = new Map();
        /** Item id -> vectors, filled from `itemVectors` before the items are read. */
        this.id2vectors = new Map();
        /** Entity-mention id -> tecto tokens referring to it; linked at the end of `fromJson`. */
        this.mentionId2tectoToken = new Map();
        /** Entity id -> mentions derived from that entity; linked at the end of `fromJson`. */
        this.entityId2derivedMentions = new Map();
        /** Version of the JSON being read. */
        this.version = new sem_ver_1.SemVer(3, 3, 0);
    }

    /** Stores an object under its `id` so that later references can be resolved. */
    register(obj) {
        this.registry.set(obj.id, obj);
    }

    /**
     * Resolves a string id into the associated object.
     * @param id String id to resolve.
     * @returns The object associated with the id.
     * @throws Throws an error when the id does not map to any object.
     */
    resolveId(id) {
        const obj = this.registry.get(id);
        if (obj === undefined) {
            throw new Error(`Unknown object ID used as a reference ${id}.`);
        }
        return obj;
    }

    /**
     * Resolves every id of the array, preserving order.
     * @throws Throws an error when any of the ids is unknown.
     */
    resolveIds(ids) {
        // resolveId() either returns a registered object or throws, so no filtering is needed.
        return ids.map((id) => this.resolveId(id));
    }

    /** Resolves the id stored under `raw[key]`; returns null when the value is missing or falsy. */
    readAndResolveId(raw, key) {
        return raw[key] ? this.resolveId(raw[key]) : null;
    }

    /**
     * Resolves the array of ids stored under `raw[key]`.
     * A missing (null/undefined) value is treated as an empty list instead of failing on `.map`.
     */
    readAndResolveIds(raw, key) {
        const ids = raw[key];
        return ids == null ? [] : this.resolveIds(ids);
    }

    /** Returns the sentiment stored for the item id, or null when there is none. */
    sentimentOf(id) {
        const sentiment = this.id2sentiment.get(id);
        return sentiment != null ? sentiment : null;
    }

    /** Returns the vectors stored for the item id, or null when there are none. */
    vectorsOf(id) {
        const vectors = this.id2vectors.get(id);
        return vectors != null ? vectors : null;
    }

    /** Reads `raw.feats` into a Map; returns null when the item has no (truthy) `feats`. */
    readOptionalFeats(raw) {
        return raw.feats ? this.readFeats(raw.feats) : null;
    }

    /**
     * Reads `raw.gkbProperties`; returns an empty array when the JSON version is older
     * than 3.3.0 or the value is not an array.
     */
    readGkbProperties(raw) {
        const supported = this.version.isSameOrNewerThan(new sem_ver_1.SemVer(3, 3, 0));
        if (!supported || !Array.isArray(raw.gkbProperties)) {
            return [];
        }
        return raw.gkbProperties.map((p) => this.readGkbProperty(p));
    }

    /**
     * Parses the version string and remembers it in `this.version`.
     * @param version Version string from the JSON; a missing value is read as "3.0.0".
     * @throws Throws an error when the major version is greater than 3.
     */
    checkVersion(version) {
        const semVer = sem_ver_1.SemVer.valueOf(version != null ? version : "3.0.0"); // 3.0.0 was not specified
        if (semVer.major > 3) {
            throw new Error(`Unsupported API version ${version}, major ver.num > 3.`);
        }
        else if (semVer.isSameOrNewerThan(new sem_ver_1.SemVer(3, 4, 0))) {
            console.log(`Reading analysis with version ${version} higher than ${this.version} is only partially supported.`);
        }
        this.version = semVer;
    }

    /** Creates a Sentiment from its raw `mean`, `label`, `positive` and `negative` fields. */
    readSentiment(raw) {
        return new model_1.Sentiment(raw.mean, raw.label, raw.positive, raw.negative);
    }

    /** Creates a Vector from its raw `name`, `version` and `values` fields. */
    readVector(raw) {
        return new model_1.Vector(raw.name, raw.version, raw.values);
    }

    /** Reads an array of raw vectors. */
    readVectors(raw) {
        return raw.map((v) => this.readVector(v));
    }

    /** Converts a plain feature object into a Map of its own enumerable entries. */
    readFeats(raw) {
        return new Map(Object.entries(raw));
    }

    /** Creates a GkbProperty from its raw fields. */
    readGkbProperty(raw) {
        return new gkb_propert_1.GkbProperty(raw.name, raw.label, raw.valueGkbId, raw.boolValue, raw.floatValue, raw.intValue, raw.strValue);
    }

    /**
     * Reads and registers a single token.
     * @param raw Raw token object.
     * @param tokenIdx Index of the token within its sentence.
     * @param useOrigTextField When true, `text`/`off` hold the token text and offset and the
     *   optional `origText`/`origOff` hold the original ones (defaulting to the former);
     *   when false, `corrText`/`corrOff` hold the token text and offset and `text`/`off`
     *   the original ones.
     * @returns The created token.
     */
    readToken(raw, tokenIdx, useOrigTextField) {
        let text;
        let origText;
        let off;
        let origOff;
        if (useOrigTextField) {
            text = raw.text;
            off = this.offMap.get(raw.off);
            origText = raw.origText != null ? raw.origText : text;
            // Offset 0 is a valid value, so test for absence explicitly rather than by truthiness.
            origOff = raw.origOff != null ? this.offMap.getOrig(raw.origOff) : off;
        }
        else {
            text = raw.corrText;
            off = this.offMap.get(raw.corrOff);
            origText = raw.text;
            origOff = this.offMap.getOrig(raw.off);
        }
        const pos = raw.pos ? (0, __1.UPosFromStr)(raw.pos) : null;
        const fncStr = raw.fnc;
        let fnc = null;
        let subFnc = null;
        if (fncStr) {
            if (fncStr.toUpperCase() === "CLAUSE") {
                // "clause" (in any letter case) is read as the root dependency
                fnc = __1.UDep.ROOT;
            }
            else {
                const matches = fncStr.match(RE_FULL_DEP_FNC);
                if (matches !== null) {
                    // "<function>:<sub-function>"
                    fnc = (0, __1.UDepFromStr)(matches[1]);
                    subFnc = matches[2];
                }
                else {
                    fnc = (0, __1.UDepFromStr)(fncStr);
                }
            }
        }
        const tok = model_1.Token.of(raw.id, tokenIdx, // sentence based index
        text, char_span_1.CharSpan.withLen(off, text.length), origText, char_span_1.CharSpan.withLen(origOff, origText.length), raw.dLemma, raw.lemma, pos, raw.mTag, this.readOptionalFeats(raw), fnc, subFnc);
        this.register(tok);
        return tok;
    }

    /**
     * Reads and registers a single tecto token. Its entity mention is not known yet, so the
     * token is remembered under `raw.entityMentionId` and linked at the end of `fromJson`.
     * @param raw Raw tecto token object.
     * @param tokenIdx Index of the tecto token within its sentence.
     */
    readTectoToken(raw, tokenIdx) {
        const fnc = raw.fnc != null ? raw.fnc : (0, __1.UDepToStr)(__1.UDep.DEP);
        const tt = model_1.TectoToken.of(raw.id, tokenIdx, fnc, raw.lemma, this.readOptionalFeats(raw), this.readAndResolveIds(raw, "tokenIds"), null, // EntityMention will be filled later
        null);
        if (raw.entityMentionId) {
            const pending = this.mentionId2tectoToken.get(raw.entityMentionId);
            if (pending) {
                pending.push(tt);
            }
            else {
                this.mentionId2tectoToken.set(raw.entityMentionId, [tt]);
            }
        }
        this.register(tt);
        return tt;
    }

    /**
     * Builds a tree over already registered tokens, adding one dependency for every raw
     * token that has a (truthy) `parId`.
     * @param rawTokens Raw token objects carrying `id` and optionally `parId`.
     * @param tokens The corresponding model tokens.
     */
    createTree(rawTokens, tokens) {
        const tb = new model_1.TreeBuilder().addNodes(tokens);
        for (const rt of rawTokens) {
            if (rt.parId) {
                const parent = this.resolveId(rt.parId);
                const child = this.resolveId(rt.id);
                tb.addDependency(child.idx, parent.idx);
            }
        }
        return tb.build();
    }

    /**
     * Reads and registers a sentence together with its tokens and tecto tokens.
     * Trees are built only when the first token carries a dependency function.
     * @param raw Raw sentence object.
     * @param useOrigTextField Passed through to `readToken`.
     */
    readSentence(raw, useOrigTextField) {
        const rawTokens = raw.tokens;
        const tokens = rawTokens.map((rawToken, idx) => this.readToken(rawToken, idx, useOrigTextField));
        const rawTectoTokens = raw.tecto != null ? raw.tecto : [];
        const tectoTokens = rawTectoTokens.map((rawTecto, idx) => this.readTectoToken(rawTecto, idx));
        let tree = null;
        let tectoTree = null;
        // An empty sentence has no first token to inspect; treat it as having no syntax.
        if (tokens.length > 0 && tokens[0].fnc != null) {
            tree = this.createTree(rawTokens, tokens);
            tectoTree = this.createTree(rawTectoTokens, tectoTokens);
        }
        const root = tree != null && tree.root != null ? tree.root : null;
        // Without a tecto tree the factory receives undefined (not null) for both tecto arguments.
        const tectoRoot = tectoTree != null ? tectoTree.root : undefined;
        const tectoTreeTokens = tectoTree != null ? tectoTree.tokens : undefined;
        const sent = model_1.Sentence.of(raw.id, root, tokens, tectoRoot, tectoTreeTokens, this.sentimentOf(raw.id), this.vectorsOf(raw.id));
        sent.tokens.forEach((t) => (t.sentence = sent));
        if (sent.tectoTokens != null) {
            sent.tectoTokens.forEach((tt) => (tt.sentence = sent));
        }
        this.register(sent);
        return sent;
    }

    /**
     * Reads and registers a paragraph and its sentences.
     * The field layout and the offset mapping depend on the JSON version: from 3.2.0 the
     * `text`/`origText` fields are used (otherwise `corrText`/`text`), and from 3.2.1 offsets
     * go through `Cp2JSOffsetMapping` (otherwise they are used as they are).
     */
    readParagraph(raw) {
        const useOrigTextField = this.version.isSameOrNewerThan(new sem_ver_1.SemVer(3, 2, 0));
        const hasCodepointOffs = this.version.isSameOrNewerThan(new sem_ver_1.SemVer(3, 2, 1));
        const text = useOrigTextField ? raw.text : raw.corrText;
        const origText = useOrigTextField ? raw.origText : raw.text;
        // The mapping must be in place before the sentences (and thus tokens) are read.
        this.offMap = hasCodepointOffs
            ? new Cp2JSOffsetMapping(text, origText != null ? origText : null)
            : new IdentityOffsetMapping();
        const sentences = raw.sentences.map((s) => this.readSentence(s, useOrigTextField));
        const para = model_1.Paragraph.of(raw.id, raw.type, text, origText, sentences, this.sentimentOf(raw.id), this.vectorsOf(raw.id));
        para.sentences.forEach((s) => (s.paragraph = para));
        this.register(para);
        return para;
    }

    /**
     * Reads and registers an entity mention. When the mention is derived from an entity
     * (`derivedFromEntityId`), it is remembered and linked at the end of `fromJson`.
     */
    readEntityMention(raw) {
        const men = model_1.EntityMention.of(raw.id, raw.text, raw.mwl, this.readAndResolveIds(raw, "tokenIds"), this.readOptionalFeats(raw), null, // derivedFrom; will be filled later
        this.sentimentOf(raw.id), this.vectorsOf(raw.id));
        const entityId = raw.derivedFromEntityId;
        if (entityId) {
            const derivedMentions = this.entityId2derivedMentions.get(entityId);
            if (derivedMentions) {
                derivedMentions.push(men);
            }
            else {
                this.entityId2derivedMentions.set(entityId, [men]);
            }
        }
        this.register(men);
        return men;
    }

    /** Reads and registers an entity together with its mentions. */
    readEntity(raw) {
        const mentions = raw.mentions
            ? raw.mentions.map((m) => this.readEntityMention(m))
            : null;
        const ent = model_1.Entity.of(raw.id, raw.gkbId, raw.stdForm, raw.type, mentions, this.readOptionalFeats(raw), this.sentimentOf(raw.id), this.vectorsOf(raw.id), this.readGkbProperties(raw));
        // NOTE(review): presumably Entity.of turns null mentions into an array - confirm in the model.
        ent.mentions.forEach((m) => (m.mentionOf = ent));
        this.register(ent);
        return ent;
    }

    /** Reads and registers a tag mention. */
    readTagMention(raw) {
        const men = model_1.TagMention.of(raw.id, this.readAndResolveIds(raw, "tokenIds"), this.readOptionalFeats(raw), this.sentimentOf(raw.id), this.vectorsOf(raw.id));
        this.register(men);
        return men;
    }

    /** Reads and registers a tag together with its mentions. */
    readTag(raw) {
        const mentions = raw.mentions
            ? raw.mentions.map((m) => this.readTagMention(m))
            : null;
        const tag = model_1.Tag.of(raw.id, raw.gkbId, raw.stdForm, raw.type, raw.relevance, mentions, this.readOptionalFeats(raw), this.sentimentOf(raw.id), this.vectorsOf(raw.id), this.readGkbProperties(raw));
        // NOTE(review): presumably Tag.of turns null mentions into an array - confirm in the model.
        tag.mentions.forEach((m) => (m.mentionOf = tag));
        this.register(tag);
        return tag;
    }

    /** Reads a relation argument; its entity is null when `entityId` is missing. */
    readRelationArgument(raw) {
        return new model_1.RelationArgument(raw.name, raw.type, this.readAndResolveId(raw, "entityId"));
    }

    /** Reads a relation support: the supporting tokens and the optional tecto token. */
    readRelationSupport(raw) {
        return model_1.RelationSupport.of(this.readAndResolveIds(raw, "tokenIds"), this.readAndResolveId(raw, "tectoId"));
    }

    /** Reads and registers a relation with its arguments and (optional) support. */
    readRelation(raw) {
        const rawArgs = raw.args != null ? raw.args : [];
        const args = rawArgs.map((obj) => this.readRelationArgument(obj));
        const support = raw.support
            ? raw.support.map((obj) => this.readRelationSupport(obj))
            : null;
        const rel = model_1.Relation.of(raw.id, raw.textRepr, raw.name, raw.type, args, support, this.readOptionalFeats(raw), this.sentimentOf(raw.id), this.vectorsOf(raw.id));
        this.register(rel);
        return rel;
    }

    /**
     * Reads document metadata.
     * For versions older than 3.1.0 without a `metadata` object, the non-standard top-level
     * keys are collected as metadata; in every other case non-standard keys only produce a warning.
     * @param raw The whole raw analysis object.
     * @returns Map of metadata, or null when there are none.
     */
    readMetadata(raw) {
        const useTopLevelMetadata = this.version.isOlderThan(new sem_ver_1.SemVer(3, 1, 0));
        let metadata = raw.metadata
            ? new Map(Object.entries(raw.metadata))
            : null;
        const unknownKeys = Object.keys(raw).filter((k) => !STD_KEYS.has(k));
        if (unknownKeys.length === 0) {
            return metadata;
        }
        if (useTopLevelMetadata && metadata === null) {
            metadata = new Map();
            for (const k of unknownKeys) {
                metadata.set(k, raw[k]);
            }
        }
        else {
            console.warn(`[Warning] unrecognized fields in the analysis: ${unknownKeys}`);
        }
        return metadata;
    }

    /**
     * Reads the Analysis object from a JSON object as returned from Geneea G3 API.
     * @param rawAnalysis Object corresponding to a G3 API JSON.
     * @returns An Analysis object encapsulating the NLP analysis.
     * @throws Throws an error for an unsupported version or an unresolvable id reference.
     */
    fromJson(rawAnalysis) {
        const ra = rawAnalysis;
        this.checkVersion(ra.version);
        // store item sentiment to fill it in when the relevant items are constructed
        if (ra.itemSentiments) {
            for (const [id, obj] of Object.entries(ra.itemSentiments)) {
                this.id2sentiment.set(id, this.readSentiment(obj));
            }
        }
        // store item vectors to fill them in when the relevant items are constructed
        if (ra.itemVectors) {
            for (const [id, vectors] of Object.entries(ra.itemVectors)) {
                this.id2vectors.set(id, vectors.map((v) => this.readVector(v)));
            }
        }
        const docId = ra.id != null ? ra.id : null;
        const detected = ra.language != null ? ra.language.detected : undefined;
        // ISO 639-2 for Undetermined language.
        const language = new model_1.Language(detected != null ? detected : "und");
        // Paragraphs go first: entities, tags and relations refer to tokens by id.
        const paragraphs = (ra.paragraphs || []).map((raw) => this.readParagraph(raw));
        const entities = (ra.entities || []).map((raw) => this.readEntity(raw));
        const tags = (ra.tags || []).map((raw) => this.readTag(raw));
        const relations = (ra.relations || []).map((raw) => this.readRelation(raw));
        const docSentiment = ra.docSentiment
            ? this.readSentiment(ra.docSentiment)
            : null;
        const docVectors = ra.docVectors
            ? this.readVectors(ra.docVectors)
            : null;
        const usedChars = ra.usedChars != null ? ra.usedChars : null;
        const metadata = this.readMetadata(ra);
        const debugInfo = ra.debugInfo != null ? ra.debugInfo : null;
        const analysis = new model_1.Analysis(docId, language, paragraphs, docSentiment, entities, tags, relations, docVectors, usedChars, metadata, debugInfo);
        analysis.paragraphs.forEach((x) => (x.container = analysis));
        // fill derived-from entities for mentions
        this.entityId2derivedMentions.forEach((mentions, id) => {
            const entity = this.resolveId(id);
            mentions.forEach((m) => (m.derivedFrom = entity));
        });
        // fill tecto-token entity mention
        this.mentionId2tectoToken.forEach((tokens, id) => {
            const mention = this.resolveId(id);
            tokens.forEach((tt) => {
                tt.entityMention = mention;
                tt.entity = mention.mentionOf;
            });
        });
        return analysis;
    }
}
/** Encapsulates mapping of offsets from code-points to JavaScript string indices. */
class Cp2JSOffsetMapping {
    /**
     * @param text Text used to convert offsets passed to `get`.
     * @param origText Text used to convert offsets passed to `getOrig`; when null,
     *   the converter built for `text` serves both accessors.
     */
    constructor(text, origText) {
        const textConverter = new offset_converter_1.Cp2JSOffsetConverter(text);
        let origTextConverter = textConverter;
        if (origText !== null) {
            origTextConverter = new offset_converter_1.Cp2JSOffsetConverter(origText);
        }
        // Both accessors are own properties closing over the converters, so they work unbound too.
        this.get = (cpOff) => textConverter.convert(cpOff);
        this.getOrig = (cpOff) => origTextConverter.convert(cpOff);
    }
}
/**
 * Offset mapping that performs no conversion: every offset is returned unchanged.
 * `Reader.readParagraph` selects it for JSON versions older than 3.2.1.
 */
class IdentityOffsetMapping {
    /** Returns the given offset as it is. */
    get(cpOff) {
        return cpOff;
    }

    /** Returns the given offset as it is. */
    getOrig(cpOff) {
        return cpOff;
    }
}