UNPKG

conllu-core

Version:

A core type to handle CoNLL-U format

github.com/NattapongSiri/conllu_core

NattapongSiri/conllu_core

742 lines • 34.4 kB

JavaScript

import { assert } from "console";
import { Comment, CompoundToken, EmptyToken, Meta, NominalToken, Relation, Sentence, TokenIdMap, TokenType } from ".";

/**
 * The policy to adjust a `head` field that point to a token being merge.
 * Every other tokens will have their head value adjust accordingly but the `head` that point
 * to token being merge may need different treatment depending on user requirement.
 */
export var HeadPolicy;
(function (HeadPolicy) {
    /** Update all the `head` pointed to the token within merge range to a merged `ID` */
    HeadPolicy[HeadPolicy["Adjust"] = 0] = "Adjust";
    /** Remove any `head` linked to the token within merge range */
    HeadPolicy[HeadPolicy["Remove"] = 1] = "Remove";
})(HeadPolicy || (HeadPolicy = {}));

/**
 * Compare two dependency head IDs numerically.
 *
 * Throughout this module a dependency head ID is `[nominal]` for a nominal token or
 * `[nominal, empty]` for an empty token. A bare number is tolerated and treated as `[number]`.
 * A missing empty part counts as 0, so a nominal ID sorts before the empty IDs that follow it.
 *
 * @returns negative, zero or positive, like an `Array.prototype.sort` comparator
 */
function compare_dep_ids(a, b) {
    const left = Array.isArray(a) ? a : [a];
    const right = Array.isArray(b) ? b : [b];
    if (left[0] !== right[0]) {
        return left[0] - right[0];
    }
    const left_empty = left.length > 1 ? left[1] : 0;
    const right_empty = right.length > 1 ? right[1] : 0;
    return left_empty - right_empty;
}

/**
 * Walk `deps` in place, calling `keep` with the head ID of each entry.
 * The callback may mutate the ID; returning a falsy value removes the entry.
 * Centralising the cursor handling avoids skipping or re-visiting entries after a removal.
 */
function retain_deps(deps, keep) {
    let cursor = 0;
    while (cursor < deps.length) {
        if (keep(deps[cursor][0])) {
            cursor++;
        } else {
            deps.splice(cursor, 1);
        }
    }
}

/**
 * Count IDs consumed by the tokens in front of `index`, using the same numbering rule as
 * `SentenceBuilder.from`: every nominal token takes the next nominal ID and resets the
 * empty counter, every empty token takes the next empty ID, compound tokens take none.
 *
 * @returns `[nominal_count, empty_count_since_last_nominal]`
 */
function count_ids_before(tokens, index) {
    let nominal = 0;
    let empty = 0;
    for (let i = 0; i < index && i < tokens.length; i++) {
        if (tokens[i] instanceof NominalToken) {
            nominal++;
            empty = 0;
        } else if (tokens[i] instanceof EmptyToken) {
            empty++;
        }
    }
    return [nominal, empty];
}

/** True when the token type can carry a `deps` field and actually has one. */
function has_deps(token) {
    return (token instanceof NominalToken || token instanceof EmptyToken) && Boolean(token.deps);
}

/**
 * A mutable builder for a CoNLL-U sentence.
 *
 * It keeps three parallel structures: `meta`, `tokens`, and `id_map` where `id_map[i]`
 * is the ID of `tokens[i]` (a number for nominal token, `[number, number]` for empty token
 * and `undefined` for compound token).
 *
 * Errors are thrown as plain strings and argument checks use `console.assert`, which only
 * logs; both are kept as-is because callers may depend on them.
 */
function SentenceBuilder() {
    // Start empty so a builder created without `from` is usable right away.
    this.meta = [];
    this.tokens = [];
    this.id_map = new TokenIdMap();
}

/**
 * Make this builder out of existing Sentence.
 * It will make a shallow copy of existing sentence so any modification
 * with this builder will also immediately reflect on original sentence.
 */
SentenceBuilder.from = function (sentence) {
    const builder = new SentenceBuilder();
    builder.meta = sentence.meta;
    builder.tokens = sentence.tokens;
    builder.id_map = new TokenIdMap();
    let id = 0;
    let empty_id = 0;
    for (const t of sentence.tokens) {
        if (t instanceof NominalToken) {
            builder.id_map.push(++id);
            empty_id = 0;
        } else if (t instanceof EmptyToken) {
            builder.id_map.push([id, ++empty_id]);
        } else if (t instanceof CompoundToken) {
            builder.id_map.push(undefined);
        } else {
            throw "Unsupported token type. Found " + t.constructor.name;
        }
    }
    return builder;
};

/** Append given meta into this sentence */
SentenceBuilder.prototype.push_meta = function (key, value) {
    this.meta.push(new Meta({ key: key, value: value }));
};

/** Append comment into this sentence */
SentenceBuilder.prototype.push_comment = function (text) {
    this.meta.push(new Comment(text));
};

/** Find meta by given `key`. Return `undefined` if there is no such meta. */
SentenceBuilder.prototype.find_meta = function (key) {
    // The predicate must return its result, otherwise `find` never matches.
    return this.meta.find((m) => m instanceof Meta && m.key == key);
};

/** Find index of meta from given `key`. Return -1 if there is no such meta. */
SentenceBuilder.prototype.find_meta_index = function (key) {
    return this.meta.findIndex((m) => m instanceof Meta && m.key == key);
};

/** Append given token into this sentence */
SentenceBuilder.prototype.push_token = function (t) {
    // Resolve the type first so an unsupported token is rejected before anything is mutated.
    let type;
    if (t instanceof NominalToken) {
        type = TokenType.Nominal;
    } else if (t instanceof CompoundToken) {
        type = TokenType.Compound;
    } else if (t instanceof EmptyToken) {
        type = TokenType.Empty;
    } else {
        throw "Unsupported type of Token. Found " + t.constructor.name;
    }
    this.tokens.push(t);
    this.id_map.insert(this.id_map.length, type);
};

/**
 * Find a token by an `id`.
 * - `TokenType.Nominal` expects a number.
 * - `TokenType.Empty` expects `[nominal_id, empty_id]`.
 * - `TokenType.Compound` expects `[start_id, end_id]`.
 *
 * Return `undefined` when no token has the given ID.
 */
SentenceBuilder.prototype.find_token_by_id = function (id, type) {
    switch (type) {
        case TokenType.Nominal: {
            assert(typeof id == "number", "ID of Nominal token must be number");
            const index_nom = this.id_map.findIndex((_id) => _id === id);
            return index_nom === -1 ? undefined : this.tokens[index_nom];
        }
        case TokenType.Empty: {
            assert(id instanceof Array, "ID of Empty token must be [number, number]");
            if (!Array.isArray(id)) {
                return undefined;
            }
            // IDs are arrays, so they have to be compared element by element, not by reference.
            const index_empty = this.id_map.findIndex((_id) => Array.isArray(_id) && _id[0] === id[0] && _id[1] === id[1]);
            return index_empty === -1 ? undefined : this.tokens[index_empty];
        }
        case TokenType.Compound: {
            assert(id instanceof Array && id.length == 2, "Retrieving Comound token need [number, number] ID. Found", id);
            const id_nom = id[0];
            // A compound token sits right in front of the first nominal token it covers.
            const index_neighbor = this.id_map.findIndex((_id) => _id === id_nom);
            if (index_neighbor > 0 && this.tokens[index_neighbor - 1] instanceof CompoundToken) {
                const compound = this.tokens[index_neighbor - 1];
                return compound.id[1] === id[1] ? compound : undefined;
            }
            return undefined;
        }
        default:
            throw "Unsupported TokenType";
    }
};

/** Get id of token at given index or undefined if token at given index is Compound token */
SentenceBuilder.prototype.get_id_by_index = function (index) {
    assert(index < this.id_map.length, "Index out of bound. Length is " + this.id_map.length + " but index is " + index);
    return this.id_map[index];
};

/** Set head and deprel field of token at given `token_index` argument to ID of token at `head_index`. */
SentenceBuilder.prototype.upsert_head_by_index = function (token_index, head_index, relation) {
    const token = this.tokens[token_index];
    const head_tok = this.tokens[head_index];
    if (token instanceof NominalToken && head_tok instanceof NominalToken) {
        token.head = this.id_map[head_index];
        token.deprel = relation;
    } else {
        throw "Both tokens must be NominalToken but at " + token_index + " found " + token.constructor.name + " and " + head_index + " found " + head_tok.constructor.name;
    }
};

/**
 * Add or replace a dep in deps field of given token index.
 * The dep to be add/replace use head index instead of ID.
 * The `deps` list is kept ordered by head ID.
 */
SentenceBuilder.prototype.upsert_dep_by_index = function (token_index, head_index, relation) {
    const token = this.tokens[token_index];
    const head_tok = this.tokens[head_index];
    if (!(token instanceof NominalToken || token instanceof EmptyToken)) {
        throw "Token at " + token_index + " must either be NominalToken or EmptyToken but found " + token.constructor.name;
    }
    let id;
    if (head_tok instanceof NominalToken) {
        id = [this.id_map[head_index]];
    } else if (head_tok instanceof EmptyToken) {
        // Copy so later in-place ID shifts on this dep cannot leak into `id_map`.
        id = this.id_map[head_index].slice();
    } else {
        throw "Token at " + head_index + " must either be NominalToken or EmptyToken but found " + head_tok.constructor.name;
    }
    if (!token.deps) {
        token.deps = [[id, relation]];
        return;
    }
    const exist_index = token.deps.findIndex((dep) => compare_dep_ids(dep[0], id) === 0);
    if (exist_index !== -1) {
        token.deps[exist_index] = [id, relation];
        return;
    }
    // Insert in front of the first dep with a larger head ID, or append when there is none.
    const larger_index = token.deps.findIndex((dep) => compare_dep_ids(dep[0], id) > 0);
    const insert_at = larger_index === -1 ? token.deps.length : larger_index;
    token.deps.splice(insert_at, 0, [id, relation]);
};

/**
 * Insert a `token` at given `index`. The index must be <= number of existing tokens.
 *
 * Inserting a nominal token shifts every `head`/`deps` nominal ID at or after the new ID by one.
 * Inserting an empty token shifts only the empty part of `deps` that point into the same
 * empty-token group at or after the new position.
 * Inserting a compound token changes no ID.
 */
SentenceBuilder.prototype.insert_token = function (token, index) {
    assert(index <= this.tokens.length, "Index out of bound. Total number of tokens is " + this.tokens.length + " but index is " + index);
    // Derive the ID the new token will take from what precedes the slot; the current
    // occupant of the slot may be of any type, or absent when appending.
    const [nominal_before, empty_before] = count_ids_before(this.tokens, index);
    if (token instanceof NominalToken) {
        const new_id = nominal_before + 1;
        for (const t of this.tokens) {
            if (t instanceof NominalToken && t.head && t.head >= new_id) {
                t.head++;
            }
            if (has_deps(t)) {
                for (const dep of t.deps) {
                    if (dep[0][0] >= new_id) {
                        dep[0][0]++;
                    }
                }
            }
        }
        this.id_map.insert(index, TokenType.Nominal);
    } else if (token instanceof EmptyToken) {
        const new_empty_id = empty_before + 1;
        for (const t of this.tokens) {
            if (!has_deps(t)) {
                continue;
            }
            for (const dep of t.deps) {
                const dep_id = dep[0];
                if (dep_id.length === 2 && dep_id[0] === nominal_before && dep_id[1] >= new_empty_id) {
                    dep_id[1]++;
                }
            }
        }
        this.id_map.insert(index, TokenType.Empty);
    } else if (token instanceof CompoundToken) {
        // Compound token doesn't impact head, deps, or ID of other token
        this.id_map.insert(index, TokenType.Compound);
    } else {
        throw "Unsupported type of token. Found " + token.constructor.name;
    }
    this.tokens.splice(index, 0, token);
};

/**
 * Remove a token at given index and update all dependencies to it based on given policy.
 *
 * - `HeadPolicy.Adjust` decrements every reference at or after the removed ID, so a reference
 *   to the removed token ends up on the preceding ID.
 * - `HeadPolicy.Remove` drops references to the removed token (`head`/`deprel` become
 *   `undefined`, matching `deps` are deleted) and decrements the references after it.
 *
 * It return an array holding the removed token (the result of `Array.prototype.splice`)
 * without update any field value of it.
 */
SentenceBuilder.prototype.remove_token = function (index, policy = HeadPolicy.Adjust) {
    assert(index < this.tokens.length, "Index out of bound. Index is %d but length is %d", index, this.tokens.length);
    const target = this.tokens[index];
    const removing_empty = target instanceof EmptyToken;
    const removing_nominal = target instanceof NominalToken;
    if (removing_empty || removing_nominal) {
        if (policy !== HeadPolicy.Adjust && policy !== HeadPolicy.Remove) {
            throw "Unsupported update policy. Found " + HeadPolicy[policy];
        }
        const drop = policy === HeadPolicy.Remove;
        const id = this.id_map[index];
        for (let i = 0; i < this.tokens.length; i++) {
            if (i === index) {
                continue;
            }
            const t = this.tokens[i];
            if (removing_nominal && t instanceof NominalToken && t.head) {
                if (drop && t.head === id) {
                    t.head = undefined;
                    t.deprel = undefined;
                } else if (t.head >= id) {
                    t.head--;
                }
            }
            if (!has_deps(t)) {
                continue;
            }
            if (removing_nominal) {
                retain_deps(t.deps, (dep_id) => {
                    if (drop && dep_id[0] === id) {
                        return false;
                    }
                    if (dep_id[0] >= id) {
                        dep_id[0]--;
                    }
                    return true;
                });
            } else {
                // Only deps pointing into the same empty-token group are affected.
                retain_deps(t.deps, (dep_id) => {
                    if (dep_id.length !== 2 || dep_id[0] !== id[0]) {
                        return true;
                    }
                    if (drop && dep_id[1] === id[1]) {
                        return false;
                    }
                    if (dep_id[1] >= id[1]) {
                        dep_id[1]--;
                    }
                    return true;
                });
            }
        }
    }
    this.id_map.remove_chunk(index, 1);
    return this.tokens.splice(index, 1);
};

/**
 * Merge tokens using index, not ID.
 * Both `from` and `to` are index of token.
 * It's inclusive at both end so if `from = 1`, and `to = 2`, it will merge
 * token at index 1 and 2 into 1 token.
 * The field value of merged token will depends on `policy`.
 *
 * Tokens at `from` and `to` must have the same type, either NominalToken or EmptyToken.
 * The merged result is stored in the token at `from`; every other token of the range is
 * removed from the sentence.
 */
SentenceBuilder.prototype.merge = function (from, to, policy = new MergePolicy()) {
    assert(from < this.tokens.length, "Argument from is %d while there is %d tokens", from, this.tokens.length);
    assert(from < to, "Argument from is larger or equals to argument to");
    assert(this.tokens[from].constructor === this.tokens[to].constructor, "Token at %d is %s and %d is %s but its should have the same type", from, this.tokens[from].constructor.name, to, this.tokens[to].constructor.name);
    assert(this.tokens[from] instanceof NominalToken || this.tokens[from] instanceof EmptyToken, "First merginng token must either be `NominalToken` or `EmptyToken`");
    const policy_known = policy.headPol === HeadPolicy.Adjust || policy.headPol === HeadPolicy.Remove;
    const drop = policy.headPol === HeadPolicy.Remove;
    let merging;
    if (this.tokens[from] instanceof EmptyToken) {
        if (!policy_known) {
            throw "Not yet implemented HeadPolicy type ";
        }
        const [nominal_id, empty_id] = this.id_map[from];
        const offset = to - from;
        const to_id = empty_id + offset;
        // update deps field
        for (let i = 0; i < this.tokens.length; i++) {
            if (i >= from && i <= to) {
                continue;
            }
            const token = this.tokens[i];
            if (!has_deps(token)) {
                continue;
            }
            retain_deps(token.deps, (dep_id) => {
                if (dep_id.length !== 2 || dep_id[0] !== nominal_id) {
                    return true; // points outside this empty-token group
                }
                if (dep_id[1] > to_id) {
                    dep_id[1] -= offset; // empty token behind the merged range
                } else if (dep_id[1] > empty_id) {
                    if (drop) {
                        return false;
                    }
                    dep_id[1] = empty_id; // the merged token keeps the ID of `from`
                }
                return true;
            });
        }
        // The token between from_idx to to_idx will all be EmptyToken
        // We also don't have to worry about CompoundToken
        merging = this.tokens.slice(from, to + 1);
    } else {
        // Due to assertion above, it's nominal token
        if (!policy_known) {
            throw "Not yet implemented HeadPolicy type";
        }
        const id = this.id_map[from];
        const to_id = this.id_map[to];
        const offset = to_id - id;
        // Scan for compound token that will become invalid
        // Compound must be in front of nominal token according to CoNLL-U spec.
        for (let i = from - 1; i >= 0; i--) {
            const token = this.tokens[i];
            if (!(token instanceof CompoundToken)) {
                continue;
            }
            const end = token.id[1];
            if (end >= id) {
                // The compound reaches into the merge range.
                // If it covers the whole range it shrinks by the number of removed tokens,
                // otherwise it now ends on the merged token.
                token.id[1] = to_id <= end ? end - offset : id;
            }
            // compound token cannot be overlap so the first nearest compound found is the only possible one to become invalid
            break;
        }
        // Update head/deprel
        const fix_deps = (deps) => {
            if (!deps) {
                return;
            }
            retain_deps(deps, (dep_id) => {
                // An empty token following the last merged nominal token survives the merge
                // and only changes its nominal part.
                const behind_range = dep_id[0] > to_id || (dep_id[0] === to_id && dep_id.length === 2);
                if (behind_range) {
                    dep_id[0] -= offset; // Adjust head id of all subsequence tokens
                    return true;
                }
                if (dep_id[0] >= id) {
                    if (drop) {
                        return false;
                    }
                    dep_id[0] = id;
                    if (dep_id.length === 2) {
                        // every empty token inside merging range will be remove
                        // so it should only pointed to the merged id
                        dep_id.pop();
                    }
                }
                return true;
            });
        };
        for (let i = 0; i < this.tokens.length; i++) {
            if (i > from && i <= to) {
                continue; // these index will be removed anyway
            }
            const token = this.tokens[i];
            if (token instanceof NominalToken) {
                if (token.head > to_id) {
                    token.head -= offset; // adjust all subsequence tokens
                } else if (token.head >= id && token.head <= to_id) {
                    // head pointed in a merge range
                    if (drop) {
                        token.head = undefined;
                        token.deprel = undefined;
                    } else {
                        token.head = id;
                    }
                }
                fix_deps(token.deps);
            } else if (token instanceof EmptyToken) {
                fix_deps(token.deps);
            }
        }
        // Only nominal tokens contribute to the merged token.
        merging = this.tokens.slice(from, to + 1).filter((t) => t instanceof NominalToken);
    }
    // Now merge tokens into head token.
    _merge(merging, policy);
    // Remove all merged tokens
    this.tokens.splice(from + 1, to - from);
    this.id_map.remove_chunk(from + 1, to - from);
};

/**
 * Split a token at given `index`. It take `at` argument which is a list of index
 * of location of `form` field of a token to be splitted. All other fields depends
 * on `policy` argument.
 * It update all other dependencies by shifting all the ID accordingly.
 *
 * `at = [2]` on form `"hello"` yields `"he"` and `"llo"`; `n` cut points yield `n` new tokens
 * that are inserted right behind the token at `index`. The `at` argument is not mutated.
 *
 * The token at given `index` must either be NominalToken or EmptyToken.
 */
SentenceBuilder.prototype.split = function (index, at, policy = new SplitPolicy()) {
    assert(index < this.tokens.length, "Index out of bound. Sentence has " + this.tokens.length + " tokens but index is " + index);
    assert(this.tokens[index] instanceof NominalToken || this.tokens[index] instanceof EmptyToken, "Sentence at given index must either be NominalToken or EmptyToken but found " + this.tokens[index].constructor.name);
    assert(at.length > 0, "Argument at must have one or more element");
    if (at.length === 0) {
        return; // nothing to split
    }
    // sort at first so the slice will always valid; numeric comparator because the
    // default sort is lexicographic ([10, 2] would stay as is)
    const cuts = [...at].sort((a, b) => a - b);
    assert(cuts[cuts.length - 1] < this.tokens[index].form.length, "Split point is beyond surface form text length");
    const tokens = this.tokens;
    const target = tokens[index];
    const offset = cuts.length;
    const splitting_nominal = target instanceof NominalToken;
    let id = 0;
    // Update `deps` field of a token. Defined based on current type of token at given index.
    let update_deps;
    // NOTE(review): assumes the third argument of TokenIdMap.insert is the number of IDs
    // to insert - confirm against TokenIdMap. One ID is needed per new token.
    if (target instanceof EmptyToken) {
        const empty_id = this.id_map[index][1];
        id = this.id_map[index][0];
        this.id_map.insert(index + 1, TokenType.Empty, offset);
        update_deps = (deps) => {
            for (const dep of deps) {
                if (dep[0].length === 2 && dep[0][0] === id && dep[0][1] > empty_id) {
                    dep[0][1] += offset;
                }
            }
        };
    } else if (splitting_nominal) {
        id = this.id_map[index];
        this.id_map.insert(index + 1, TokenType.Nominal, offset);
        update_deps = (deps) => {
            for (const dep of deps) {
                // Empty tokens that followed the split token now follow the last new token.
                if (dep[0][0] > id || (dep[0][0] === id && dep[0].length === 2)) {
                    dep[0][0] += offset;
                }
            }
        };
    } else {
        throw "Not yet implemented for type " + target.constructor.name;
    }
    if (splitting_nominal) {
        // Compound cannot be overlap so there will be exactly one compound that contain the token
        for (let i = index - 1; i >= 0; i--) {
            const token = tokens[i];
            if (token instanceof CompoundToken && token.id[0] <= id && token.id[1] >= id) {
                // Expand compound token that had token at given index as part of compound
                token.id[1] += offset;
                break;
            }
        }
    }
    // Perform update dependencies on each token
    for (const token of tokens) {
        // Nominal IDs only move when a nominal token is split.
        if (splitting_nominal && token instanceof NominalToken && token.head > id) {
            token.head += offset;
        }
        if (has_deps(token)) {
            update_deps(token.deps);
        }
    }
    const form = target.form;
    const make_piece = (new_form) => {
        const fields = {
            form: new_form,
            lemma: policy.lemma(target),
            upos: policy.upos(target),
            xpos: policy.xpos ? policy.xpos(target) : undefined,
            feats: policy.feats(target),
            deps: policy.deps(target),
            misc: policy.misc(target),
        };
        if (splitting_nominal) {
            return new NominalToken({ ...fields, headRel: policy.headRels(target) });
        }
        return new EmptyToken(fields);
    };
    let insert_point = index + 1;
    cuts.forEach((start, k) => {
        // The last piece runs to the end of the surface form.
        const end = k + 1 < cuts.length ? cuts[k + 1] : form.length;
        tokens.splice(insert_point++, 0, make_piece(form.slice(start, end)));
    });
    // Reuse existing token at given `index`
    target.form = form.slice(0, cuts[0]);
};

// TODO Create CompoundToken based on given index

/** Build Sentence object out of this builder */
SentenceBuilder.prototype.build = function () {
    return new Sentence({ meta: this.meta, tokens: this.tokens });
};

export { SentenceBuilder };

/** True when the first token of `arr` is a NominalToken. */
function nominal_guard(arr) {
    return arr[0] instanceof NominalToken;
}

/**
 * Merge every token of `tokens` into `tokens[0]` in place.
 * The surface form is the forms joined by a space unless a token carries `SpaceAfter=No`
 * in `misc`; a trailing space is trimmed. All other fields come from `policy`.
 */
function _merge(tokens, policy) {
    const token = tokens[0];
    let trim_last = true;
    let form = tokens.map((t) => {
        if (t.misc && t.misc.includes("SpaceAfter=No")) {
            trim_last = false;
            return t.form;
        }
        trim_last = true;
        return t.form + " ";
    }).join("");
    if (trim_last) {
        form = form.slice(0, -1);
    }
    const lemma = policy.lemma(tokens);
    const upos = policy.upos(tokens);
    const xpos = policy.xpos ? policy.xpos(tokens) : undefined;
    const feats = policy.feats(tokens);
    const deps = policy.deps(tokens);
    const misc = policy.misc(tokens);
    if (nominal_guard(tokens)) {
        const [head, deprel] = policy.headRels(tokens);
        token.head = head;
        token.deprel = deprel;
    }
    token.form = form;
    token.lemma = lemma;
    token.upos = upos;
    token.xpos = xpos;
    token.feats = feats;
    token.deps = deps;
    token.misc = misc;
}

/**
 * A merging policy.
 *
 * It has following attributes:
 * - `headPol` field will determined how all dependants shall be handle.
 * The default value is `HeadPolicy.Adjust`
 * - `lemma` field is a callback that takes all tokens being merged as argument and return merged lemma.
 * The default value is every lemma concatenated together or undefined.
 * - `upos` field is a callback that takes all tokens being merged as argument and return merged part-of-speech.
 * The default value is a part-of-speech of first token being merge.
 * - `xpos` an optional field which is a callback that takes all tokens being merged as argument and return merged language specific part-of-speech.
 * - `feats` field is a callback that takes all tokens being merged as argument and return merged feature(s).
 * The default value is a flatten merge of all unique features from every tokens being merged.
 * - `headRels` field is a callback that takes all tokens being merged as argument and return merged `head` and `deprel` fields.
 * The default is first token being merged head/deprel field if there's no root `Relation` in merging, otherwise, it
 * will become new root.
 * - `deps` field is a callback that takes all tokens being merged as argument and return merged `deps` fields.
 * The default is merged of every unique deps field of tokens being merged.
 * - `misc` field is a callback that takes all tokens being merged as argument and return merged `misc` fields.
 * The default value is all unique of flatten map misc fields of every tokens.
 */
function MergePolicy() {
    this.headPol = HeadPolicy.Adjust;
    this.lemma = function (tokens) {
        let trim_last = true;
        const lemmas = tokens.filter((t) => t.lemma != null).map((t) => {
            if (t.misc && t.misc.includes("SpaceAfter=No")) {
                trim_last = false;
                return t.lemma;
            }
            trim_last = true;
            return t.lemma + ' ';
        });
        if (lemmas.length > 0) {
            const lemma = lemmas.join('');
            return trim_last ? lemma.slice(0, -1) : lemma;
        }
    };
    this.upos = function (tokens) {
        return tokens[0].upos;
    };
    this.feats = function (tokens) {
        const feats = tokens.filter((t) => t.feats != null)
            .flatMap((t) => t.feats)
            .filter((v, i, a) => a.indexOf(v) === i);
        if (feats.length > 0) {
            return feats;
        }
    };
    this.headRels = function (tokens) {
        if (tokens.some((t) => t.head === 0)) {
            return [0, new Relation("root")];
        }
        return [tokens[0].head, tokens[0].deprel];
    };
    this.deps = function (tokens) {
        const deps = tokens.filter((t) => t.deps != null)
            .flatMap((t) => t.deps)
            .filter((v, i, a) => a.indexOf(v) === i)
            .sort((a, b) => {
                // Order by head ID numerically; comparing the ID arrays with `<` would
                // coerce them to strings.
                const by_id = compare_dep_ids(a[0], b[0]);
                if (by_id !== 0) {
                    return by_id;
                }
                if (a[1] < b[1]) {
                    return -1;
                }
                if (a[1] > b[1]) {
                    return 1;
                }
                return 0;
            });
        if (deps.length > 0) {
            return deps;
        }
    };
    this.misc = function (tokens) {
        const misc = Array.from(new Set(tokens.filter((t) => t.misc != undefined).flatMap((t) => t.misc)))
            .filter((m) => !m.startsWith("SpaceAfter="));
        // SpaceAfter need special treatment as it is predefined by CoNLL-U as per
        // https://universaldependencies.org/format.html#untokenized-text
        const last_token = tokens[tokens.length - 1];
        const i = last_token.misc ? last_token.misc.findIndex((m) => m.startsWith("SpaceAfter=")) : -1;
        if (i >= 0) {
            misc.push(last_token.misc[i]); // Copy SpaceAfter from last mergining token
        }
        if (misc.length > 0) {
            return misc;
        }
    };
}

export { MergePolicy };

/**
 * Define how each property of splitted will be derived.
 * By default, all properties are copy from original token.
 * List valued properties are copied rather than shared, so that a later ID shift on one
 * token is not applied to the same array once per split piece.
 */
function SplitPolicy() {
    const copy_list = (value) => (Array.isArray(value) ? value.slice() : value);
    this.lemma = function (token) {
        return token.lemma;
    };
    this.upos = function (token) {
        return token.upos;
    };
    this.xpos = function (token) {
        return token.xpos;
    };
    this.feats = function (token) {
        return copy_list(token.feats);
    };
    this.headRels = function (token) {
        return [token.head, token.deprel];
    };
    this.deps = function (token) {
        if (!Array.isArray(token.deps)) {
            return token.deps;
        }
        return token.deps.map((dep) => [copy_list(dep[0]), dep[1]]);
    };
    this.misc = function (token) {
        return copy_list(token.misc);
    };
}

export { SplitPolicy };
//# sourceMappingURL=builder.js.map