UNPKG

geneea-nlp-client

Version:

The TypeScript Client for Geneea Interpretor G3 API.

bitbucket.org/geneea/ts-nlp-client

231 lines (230 loc) • 8.58 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.TokenSupport = exports.Token = void 0; const char_span_1 = require("../../common/char-span"); const common_1 = require("../../common/common"); const udep_1 = require("../../common/udep"); const node_1 = require("./node"); /** * A token including basic morphological and syntactic information. * A token is similar to a word, but includes punctuation. * Tokens have an zero-based index reflecting their position within their sentence. * The morphological and syntactical features might be null (deepLemma, lemma, morphTag, pos, fnc, parent), * or empty (children) if not requested or supported. */ class Token extends node_1.Node { constructor(id, idx, /** Text of this token, possibly after correction. */ text, /** Cahracter span within the paragraph. */ charSpan, /** Text of this token in the original paragraph. */ origText, /** Character span within the original paragraph. */ origCharSpan, /** Lemma of the token e.g. bezpecny. null if not requested/supported. */ deepLemma, /** Simple lemma of the token, e.g. nejnebezpecnejsi (in Cz, includes negation and grade). Null if not requested/supported. */ lemma, /** Google universal tag. Null if not requested/supported. */ pos, /** Morphological tag, e.g. AAMS1-...., VBD, ... Null if not requested/supported. */ morphTag, /** Universal and custom features */ feats, /** Label of the dependency edge. Null if not requested/supported. */ fnc, /** Dependency sub-function. None if not requested/supported. */ subFnc) { super(); this.id = id; this.idx = idx; this.text = text; this.charSpan = charSpan; this.origText = origText; this.origCharSpan = origCharSpan; this.deepLemma = deepLemma; this.lemma = lemma; this.pos = pos; this.morphTag = morphTag; this.feats = feats; this.fnc = fnc; this.subFnc = subFnc; } /** Token factory method, public constructor. */ static of(id, idx, text, charSpan, origText = null, origCharSpan = null, deepLemma = null, lemma = null, pos = null, morphTag = null, feats = null, fnc = null, subFnc = null) { return new Token(id, idx, text, charSpan, origText === null || text === origText ? text : origText, origCharSpan === null || charSpan === origCharSpan ? charSpan : origCharSpan, deepLemma, lemma, pos, morphTag, feats !== null && feats !== void 0 ? feats : new Map(), fnc, subFnc); } /** True iff the token form contains a negation prefix. */ isNegated() { return this.feats.has(Token.FEAT_NEGATED); } /** True iff the token is unknown to the lemmatizer. The lemma provided is the same as the token itself. */ isUnknown() { return this.feats.has(Token.FEAT_UNKNOWN); } /** * Token following of preceding this token within the sentence. * @param offset Relative offset. The following tokens have a positive offse, * preceding a negative one. The ext token has offset = 1. * @returns The token at the relative offset or null if the offset is invalid. */ offsetToken(offset) { const i = this.idx + offset; const tokens = this.sentence.tokens; return 0 <= i && i < tokens.length ? tokens[i] : null; } /** The previous token or null if this token is sentence initial. */ previous() { return this.offsetToken(-1); } /** The next token or null if this token is sentence final. */ next() { return this.offsetToken(1); } /** * Full dependency function in the format `{fnc}:{subFnc}` if the sub-function is present. * Otherwise it's the same as `fnc`. */ get fullFnc() { if (this.fnc !== null) { const fnc = (0, udep_1.UDepToStr)(this.fnc); return this.subFnc !== null ? `${fnc}:${this.subFnc}` : fnc; } else return null; } /** Converts the token to a default non-recursive string: index + text. */ toSimpleString() { return this.toStringWith(true, false, false); } /** Converts the token to a non-recursive string: index + [text] + [pos] + [fnc]. */ toStringWith(text, pos, fnc) { var _a, _b; const t = text ? `:${this.text}` : ""; const p = pos ? `:${(_a = this.pos) !== null && _a !== void 0 ? _a : "_"}` : ""; const f = fnc ? `:${(_b = this.fnc) !== null && _b !== void 0 ? _b : "_"}` : ""; return `${this.idx}${t}${p}${f}`; } toString() { return (0, common_1.objToStr)(this, [ "id", "idx", "text", "charSpan", "origText", "origCharSpan", "deepLemma", "lemma", "pos", "feats", "morphTag", "fnc", "subFnc", ]); } } exports.Token = Token; /** Lemma info features, a list of strings. */ Token.FEAT_LEMMA_INFO = "lemmaInfo"; /** Key presence signifies it is a negated word, value = true. */ Token.FEAT_NEGATED = "negated"; /** Key presence signifies it is an unknown word, value = true. */ Token.FEAT_UNKNOWN = "unknown"; /** * Tokens within a single sentence; ordered by word-order; non-empty, continuous or discontinuous. */ class TokenSupport { /** * * @param tokens The tokens of this support. * @param isContinuous Is this support a continuous sequence of tokens, i.e. a token span? */ constructor(tokens, isContinuous) { this.tokens = tokens; this.isContinuous = isContinuous; } /** * Creates a TokenSupport object from a list of tokens. * @param tokens Non-empty list of tokens (no need for them to be sorted). */ static of(tokens) { if (tokens.length === 0) { throw new Error("TokenSupport cannot be empty."); } if (!node_1.NodeUtils.isFromSameSentence(tokens)) { throw new Error("Tokens are not from the same sentence."); } return new TokenSupport(node_1.NodeUtils.sorted(tokens), node_1.NodeUtils.isContinuous(tokens)); } get sentence() { return this.tokens[0].sentence; } get paragraph() { return this.tokens[0].sentence.paragraph; } get first() { return this.tokens[0]; } get last() { return this.tokens[this.tokens.length - 1]; } get size() { return this.tokens.length; } get ids() { return this.tokens.map((t) => t.id); } /** * The character span between the first and last token relative to the enclosing paragraph; * for discontinuous support this includes intervening gaps. */ get charSpan() { return char_span_1.CharSpan.of(this.firstCharParaOffset, this.lastCharParaOffset); } /** Offset of the first character of these tokens within the enclosing paragraph. */ get firstCharParaOffset() { return this.first.charSpan.start; } /** Offset of the last character of these tokens within the enclosing paragraph. */ get lastCharParaOffset() { return this.last.charSpan.end; } /** * Substring of a full text as denoted by this support (before correction). * For discontinuous supports, the result includes the intervening gaps. */ get text() { return this.charSpan.extractText(this.sentence.paragraph.text); } /** Breaks this token support into continuous sub-sequences of tokens. */ spans() { if (this.isContinuous) { return [this]; } else { const spans = []; let start = 0; let prev = this.tokens[0]; for (let i = 1; i < this.tokens.length; i++) { if (prev.idx + 1 !== this.tokens[i].idx) { spans.push(new TokenSupport(this.tokens.slice(start, i), true)); start = i; } prev = this.tokens[i]; } spans.push(new TokenSupport(this.tokens.slice(start), true)); return spans; } } /** The coverage texts of each of the continuous spans, ordered by word-order. */ textSpans() { return this.spans().map((s) => s.text); } toString() { return "[" + this.tokens.map((t) => t.toString()).join(", ") + "]"; } } exports.TokenSupport = TokenSupport;