geneea-nlp-client
Version:
The TypeScript Client for Geneea Interpretor G3 API.
231 lines (230 loc) • 8.58 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.TokenSupport = exports.Token = void 0;
const char_span_1 = require("../../common/char-span");
const common_1 = require("../../common/common");
const udep_1 = require("../../common/udep");
const node_1 = require("./node");
/**
* A token including basic morphological and syntactic information.
* A token is similar to a word, but includes punctuation.
* Tokens have a zero-based index reflecting their position within their sentence.
* The morphological and syntactical features might be null (deepLemma, lemma, morphTag, pos, fnc, parent),
* or empty (children) if not requested or supported.
*/
class Token extends node_1.Node {
    /**
     * Stores the supplied values on the instance exactly as given.
     * No defaulting happens here; `Token.of` is the factory that fills in
     * the optional values.
     */
    constructor(id, idx, 
    /** Text of this token, possibly after correction. */
    text, 
    /** Character span within the paragraph. */
    charSpan, 
    /** Text of this token in the original paragraph. */
    origText, 
    /** Character span within the original paragraph. */
    origCharSpan, 
    /** Lemma of the token, e.g. bezpecny. Null if not requested/supported. */
    deepLemma, 
    /** Simple lemma of the token, e.g. nejnebezpecnejsi (in Cz, includes negation and grade). Null if not requested/supported. */
    lemma, 
    /** Google universal tag. Null if not requested/supported. */
    pos, 
    /** Morphological tag, e.g. AAMS1-...., VBD, ... Null if not requested/supported. */
    morphTag, 
    /** Universal and custom features. */
    feats, 
    /** Label of the dependency edge. Null if not requested/supported. */
    fnc, 
    /** Dependency sub-function. Null if not requested/supported. */
    subFnc) {
        super();
        this.id = id;
        this.idx = idx;
        this.text = text;
        this.charSpan = charSpan;
        this.origText = origText;
        this.origCharSpan = origCharSpan;
        this.deepLemma = deepLemma;
        this.lemma = lemma;
        this.pos = pos;
        this.morphTag = morphTag;
        this.feats = feats;
        this.fnc = fnc;
        this.subFnc = subFnc;
    }
    /**
     * Token factory method, the public way to construct a token.
     *
     * Defaults applied before delegating to the constructor:
     * - `origText` falls back to `text` when it is null,
     * - `origCharSpan` falls back to `charSpan` when it is null,
     * - `feats` becomes a fresh empty `Map` when it is null or undefined.
     * All other optional values default to null.
     */
    static of(id, idx, text, charSpan, origText = null, origCharSpan = null, deepLemma = null, lemma = null, pos = null, morphTag = null, feats = null, fnc = null, subFnc = null) {
        // Only keep a separate "original" value when one was supplied and it differs.
        const effectiveOrigText = origText !== null && text !== origText ? origText : text;
        const effectiveOrigSpan = origCharSpan !== null && charSpan !== origCharSpan ? origCharSpan : charSpan;
        const effectiveFeats = feats === null || feats === undefined ? new Map() : feats;
        return new Token(id, idx, text, charSpan, effectiveOrigText, effectiveOrigSpan, deepLemma, lemma, pos, morphTag, effectiveFeats, fnc, subFnc);
    }
    /** True iff the token form contains a negation prefix (the `FEAT_NEGATED` key is present in `feats`). */
    isNegated() {
        return this.feats.has(Token.FEAT_NEGATED);
    }
    /**
     * True iff the token is unknown to the lemmatizer (the `FEAT_UNKNOWN` key is present in `feats`).
     * The lemma provided is the same as the token itself.
     */
    isUnknown() {
        return this.feats.has(Token.FEAT_UNKNOWN);
    }
    /**
     * Token following or preceding this token within the sentence.
     * Reads `this.sentence.tokens`; `sentence` is not assigned in this class
     * (presumably it is provided by the `Node` base class).
     * @param offset Relative offset. The following tokens have a positive offset,
     *      preceding ones a negative one. The next token has offset = 1.
     * @returns The token at the relative offset or null if the offset is invalid.
     */
    offsetToken(offset) {
        const target = this.idx + offset;
        const sentenceTokens = this.sentence.tokens;
        if (target >= 0 && target < sentenceTokens.length) {
            return sentenceTokens[target];
        }
        return null;
    }
    /** The previous token or null if this token is sentence initial. */
    previous() {
        return this.offsetToken(-1);
    }
    /** The next token or null if this token is sentence final. */
    next() {
        return this.offsetToken(1);
    }
    /**
     * Full dependency function in the format `{fnc}:{subFnc}` if the sub-function is present.
     * Otherwise it's the same as `fnc` (converted by `UDepToStr`).
     * Null when `fnc` itself is null.
     */
    get fullFnc() {
        if (this.fnc === null) {
            return null;
        }
        const toStr = udep_1.UDepToStr;
        const label = toStr(this.fnc);
        if (this.subFnc === null) {
            return label;
        }
        return `${label}:${this.subFnc}`;
    }
    /** Converts the token to a default non-recursive string: index + text. */
    toSimpleString() {
        return this.toStringWith(true, false, false);
    }
    /**
     * Converts the token to a non-recursive string: index + [text] + [pos] + [fnc].
     * Each requested part is appended as `:value`; a null or undefined
     * `pos`/`fnc` is rendered as `_`.
     * @param text Include the token text?
     * @param pos Include the part-of-speech tag?
     * @param fnc Include the dependency function?
     */
    toStringWith(text, pos, fnc) {
        const orPlaceholder = (value) => (value === null || value === undefined ? "_" : value);
        let suffix = "";
        if (text) {
            suffix += `:${this.text}`;
        }
        if (pos) {
            suffix += `:${orPlaceholder(this.pos)}`;
        }
        if (fnc) {
            suffix += `:${orPlaceholder(this.fnc)}`;
        }
        return `${this.idx}${suffix}`;
    }
    /** String form of the token, built by `objToStr` from the listed fields. */
    toString() {
        const shownFields = [
            "id",
            "idx",
            "text",
            "charSpan",
            "origText",
            "origCharSpan",
            "deepLemma",
            "lemma",
            "pos",
            "feats",
            "morphTag",
            "fnc",
            "subFnc",
        ];
        return (0, common_1.objToStr)(this, shownFields);
    }
}
exports.Token = Token;
/** Lemma info features, a list of strings. */
Token.FEAT_LEMMA_INFO = "lemmaInfo";
/** Key presence signifies it is a negated word, value = true. */
Token.FEAT_NEGATED = "negated";
/** Key presence signifies it is an unknown word, value = true. */
Token.FEAT_UNKNOWN = "unknown";
/**
* Tokens within a single sentence; ordered by word-order; non-empty, continuous or discontinuous.
*/
class TokenSupport {
    /**
     * Stores the arguments as given, without validation or sorting.
     * Use `TokenSupport.of` to build a validated, sorted support.
     * @param tokens The tokens of this support.
     * @param isContinuous Is this support a continuous sequence of tokens, i.e. a token span?
     */
    constructor(tokens, isContinuous) {
        this.tokens = tokens;
        this.isContinuous = isContinuous;
    }
    /**
     * Creates a TokenSupport object from a list of tokens.
     * @param tokens Non-empty list of tokens (no need for them to be sorted).
     * @returns A support holding the tokens as ordered by `NodeUtils.sorted`.
     * @throws Error if the list is empty or `NodeUtils.isFromSameSentence` rejects it.
     */
    static of(tokens) {
        if (tokens.length === 0) {
            throw new Error("TokenSupport cannot be empty.");
        }
        if (!node_1.NodeUtils.isFromSameSentence(tokens)) {
            throw new Error("Tokens are not from the same sentence.");
        }
        // Continuity is decided on the sorted list, i.e. on exactly the tokens
        // that get stored, so the flag cannot depend on the caller's ordering.
        const ordered = node_1.NodeUtils.sorted(tokens);
        return new TokenSupport(ordered, node_1.NodeUtils.isContinuous(ordered));
    }
    /** Sentence of the first token. */
    get sentence() {
        return this.tokens[0].sentence;
    }
    /** Paragraph of the sentence of the first token. */
    get paragraph() {
        return this.sentence.paragraph;
    }
    /** The first token of this support. */
    get first() {
        return this.tokens[0];
    }
    /** The last token of this support. */
    get last() {
        return this.tokens[this.tokens.length - 1];
    }
    /** Number of tokens in this support. */
    get size() {
        return this.tokens.length;
    }
    /** Ids of the tokens, in the order the tokens are stored. */
    get ids() {
        return this.tokens.map((t) => t.id);
    }
    /**
     * The character span between the first and last token relative to the enclosing paragraph;
     * for discontinuous support this includes intervening gaps.
     */
    get charSpan() {
        return char_span_1.CharSpan.of(this.firstCharParaOffset, this.lastCharParaOffset);
    }
    /** Offset of the first character of these tokens within the enclosing paragraph. */
    get firstCharParaOffset() {
        return this.first.charSpan.start;
    }
    /** Offset of the last character of these tokens within the enclosing paragraph (`end` of the last token's span). */
    get lastCharParaOffset() {
        return this.last.charSpan.end;
    }
    /**
     * Substring of a full text as denoted by this support (before correction).
     * Extracted from the paragraph text using `charSpan`.
     * For discontinuous supports, the result includes the intervening gaps.
     */
    get text() {
        return this.charSpan.extractText(this.paragraph.text);
    }
    /**
     * Breaks this token support into continuous sub-sequences of tokens.
     * @returns `[this]` when the support is flagged continuous; otherwise new
     *      supports (each flagged continuous), split wherever two neighbouring
     *      tokens do not have consecutive `idx` values.
     */
    spans() {
        if (this.isContinuous) {
            return [this];
        }
        const pieces = [];
        let pieceStart = 0;
        for (let i = 1; i < this.tokens.length; i++) {
            // A jump in idx between neighbours closes the current piece.
            if (this.tokens[i - 1].idx + 1 !== this.tokens[i].idx) {
                pieces.push(new TokenSupport(this.tokens.slice(pieceStart, i), true));
                pieceStart = i;
            }
        }
        pieces.push(new TokenSupport(this.tokens.slice(pieceStart), true));
        return pieces;
    }
    /** The coverage texts of each of the continuous spans, ordered by word-order. */
    textSpans() {
        return this.spans().map((s) => s.text);
    }
    /** Comma-separated string forms of the tokens, wrapped in square brackets. */
    toString() {
        return `[${this.tokens.map((t) => t.toString()).join(", ")}]`;
    }
}
exports.TokenSupport = TokenSupport;