UNPKG

conllu-core

Version:

A core type to handle CoNLL-U format

github.com/NattapongSiri/conllu_core

NattapongSiri/conllu_core

406 lines • 17.3 kB

JavaScript

import { CompoundToken, EmptyToken, NominalToken, Relation } from ".";

/** MISC entry stating that a token is not followed by whitespace. */
const NO_SPACE_AFTER = "SpaceAfter=No";
/** Prefix shared by every `SpaceAfter=...` MISC entry. */
const SPACE_AFTER_PREFIX = "SpaceAfter=";
/** Value thrown when the requested id range cannot be located in the sentence. */
const INVALID_RANGE_MESSAGE = "Given from/to `id` is invalid. Either one or both of it is not exist in given sentence.";

/**
 * The policy to adjust a `head` field that points to a token being merged.
 * Every other token has its head value adjusted accordingly, but a `head` that points
 * to a token being merged may need different treatment depending on user requirement.
 *
 * The object keeps the two-way (name <-> number) mapping of a compiled TypeScript enum.
 */
export const HeadPolicy = {
    /** Update every `head` pointing to a token within the merge range to the merged `ID`. */
    Adjust: 0,
    /** Remove every `head` linked to a token within the merge range. */
    Remove: 1,
    0: "Adjust",
    1: "Remove",
};

/**
 * Tell whether a group of tokens being merged is a group of `NominalToken`.
 * Only the first token is inspected.
 */
function nominal_guard(arr) {
    return arr[0] instanceof NominalToken;
}

/** Tell whether the token carries the `SpaceAfter=No` entry in its `misc` field. */
function hasNoSpaceAfter(token) {
    return Boolean(token.misc && token.misc.some((m) => m === NO_SPACE_AFTER));
}

/**
 * Concatenate one text field of every token.
 * A single space is put after each token unless that token has `SpaceAfter=No`.
 * The trailing space produced by the last token, if any, is dropped.
 *
 * @param tokens Non-empty list of tokens.
 * @param pick Callback returning the text of a token.
 */
function joinRespectingSpaceAfter(tokens, pick) {
    let text = "";
    let endsWithSpace = false;
    for (const token of tokens) {
        endsWithSpace = !hasNoSpaceAfter(token);
        text += endsWithSpace ? pick(token) + " " : pick(token);
    }
    return endsWithSpace ? text.slice(0, -1) : text;
}

/** Keep the first occurrence of each value (compared by identity, like `indexOf`). */
function uniqueByIdentity(values) {
    return values.filter((value, index, all) => all.indexOf(value) === index);
}

/**
 * Numerically compare two token ids used as `deps` heads (`[n]` or `[n, m]`).
 * Relational operators on arrays compare their string form, which would put 10 before 2.
 * A missing minor part is treated as 0 so a nominal id sorts before its empty tokens.
 */
function compareIds(a, b) {
    const [aMajor, aMinor = 0] = [].concat(a);
    const [bMajor, bMinor = 0] = [].concat(b);
    return (aMajor - bMajor) || (aMinor - bMinor);
}

/**
 * A merging policy.
 *
 * It has the following attributes:
 * - `headPol` determines how all dependants shall be handled.
 *   The default value is `HeadPolicy.Adjust`.
 * - `lemma` is a callback that takes all tokens being merged and returns the merged lemma.
 *   The default is every defined lemma concatenated together (honouring `SpaceAfter=No`), or undefined.
 * - `upos` is a callback that takes all tokens being merged and returns the merged part-of-speech.
 *   The default is the part-of-speech of the first token being merged.
 * - `xpos` is an optional callback that takes all tokens being merged and returns the merged
 *   language specific part-of-speech.
 * - `feats` is a callback that takes all tokens being merged and returns the merged feature(s).
 *   The default is a flattened list of the features of every token, de-duplicated by identity, or undefined.
 * - `headRels` is a callback that takes all tokens being merged and returns the merged `head` and `deprel`.
 *   The default is the head/deprel of the first token being merged if no merged token has head 0,
 *   otherwise the merged token becomes the new root.
 * - `deps` is a callback that takes all tokens being merged and returns the merged `deps` field.
 *   The default is a flattened list of every token's deps, de-duplicated by identity and sorted
 *   by head id (numerically) then by relation, or undefined.
 * - `misc` is a callback that takes all tokens being merged and returns the merged `misc` field.
 *   The default is every unique misc entry of every token; `SpaceAfter=` is taken from the last token only.
 *
 * It is kept as a plain constructor function so that it can still be extended by ES5 style subclasses.
 */
export function MergePolicy() {
    this.headPol = HeadPolicy.Adjust;

    this.lemma = (tokens) => {
        const withLemma = tokens.filter((t) => t.lemma != null);
        if (withLemma.length > 0) {
            return joinRespectingSpaceAfter(withLemma, (t) => t.lemma);
        }
        return undefined;
    };

    this.upos = (tokens) => tokens[0].upos;

    this.feats = (tokens) => {
        const feats = uniqueByIdentity(tokens.filter((t) => t.feats != null).flatMap((t) => t.feats));
        return feats.length > 0 ? feats : undefined;
    };

    this.headRels = (tokens) => {
        if (tokens.some((t) => t.head === 0)) {
            return [0, new Relation("root")];
        }
        return [tokens[0].head, tokens[0].deprel];
    };

    this.deps = (tokens) => {
        const deps = uniqueByIdentity(tokens.filter((t) => t.deps != null).flatMap((t) => t.deps));
        deps.sort((a, b) => {
            const byHead = compareIds(a[0], b[0]);
            if (byHead !== 0) {
                return byHead;
            }
            if (a[1] < b[1]) {
                return -1;
            }
            return a[1] > b[1] ? 1 : 0;
        });
        return deps.length > 0 ? deps : undefined;
    };

    this.misc = (tokens) => {
        const misc = Array.from(new Set(tokens.filter((t) => t.misc != undefined).flatMap((t) => t.misc)))
            .filter((m) => !m.startsWith(SPACE_AFTER_PREFIX));
        // SpaceAfter needs special treatment as it is predefined by CoNLL-U as per
        // https://universaldependencies.org/format.html#untokenized-text
        // Only the value of the last merged token describes the merged token.
        const lastToken = tokens[tokens.length - 1];
        const spaceAfter = lastToken.misc
            ? lastToken.misc.find((m) => m.startsWith(SPACE_AFTER_PREFIX))
            : undefined;
        if (spaceAfter !== undefined) {
            misc.push(spaceAfter);
        }
        return misc.length > 0 ? misc : undefined;
    };
}

/**
 * Perform tokens merging.
 * It is an error to merge different types of tokens.
 * For example, you can **not** merge `from` = 0.1 and `to` = 2.
 * If merging tokens causes a `CompoundToken` that refers to them to become invalid,
 * that `CompoundToken` is removed or its range is fixed automatically.
 *
 * It will result in undefined behavior if the `Sentence` being merged is invalid.
 * Caller should ensure that the `Sentence` is valid. Method `validate` of `Sentence`
 * can be used for such validation.
 *
 * You can merge multiple `EmptyToken`.
 * You can merge multiple `NominalToken`.
 * If there's an `EmptyToken` or a `CompoundToken` between `NominalToken` being merged,
 * it will automatically be removed.
 * You can never merge `CompoundToken`.
 *
 * It may end up with an invalid `EmptyToken` if any EmptyToken has a single dependency that depends on
 * a token being merged. Such invalid `EmptyToken` will be retained. It is the user's responsibility
 * to fix the dependency of every `EmptyToken` that becomes invalid.
 *
 * @param sentence An object of Sentence to merge tokens
 * @param from The first token to be merged. It is an `id`, not an `index` of token.
 * @param to An inclusive id of the last token to be merged. It is an `id`, not an `index` of token.
 * @param policy Specify how to build the merged token and how to treat a head that points to any
 * token being merged. The default is a fresh `MergePolicy`, whose head policy is `Adjust`.
 * @throws {string} When the ids are neither two numbers nor two `[number, number]` pairs,
 * or when the id range cannot be found in the sentence.
 */
export function mergeTokens(sentence, from, to, policy = new MergePolicy()) {
    if (typeof from === "number" && typeof to === "number") {
        mergeNominal(sentence, from, to, policy);
    } else if (Array.isArray(from) && Array.isArray(to) && from.length === 2 && to.length === 2) {
        mergeEmpty(sentence, from, to, policy);
    } else {
        // Mixed or malformed ids are documented as an error, so they must not be ignored silently.
        throw "`id` must either be number for `NominalToken` or [number, number] for `EmptyToken`";
    }
}

/** Perform merging of empty tokens. See mergeTokens function doc for more info */
function mergeEmpty(sentence, from, to, policy) {
    // Both ids must hang on the same nominal token and `from` must precede `to`.
    // The parts are compared one by one; comparing the arrays directly would compare their string form.
    if (from[0] !== to[0] || from[1] >= to[1]) {
        return;
    }
    const tokens = sentence.tokens;

    // Find the index right after the nominal token hosting the empty tokens.
    // Empty tokens `0.x` have no host: they sit at the very beginning of the sentence.
    let scanStart = from[0] === 0 ? 0 : -1;
    if (scanStart < 0) {
        let nominalId = 0;
        for (let i = 0; i < tokens.length; i++) {
            if (tokens[i] instanceof NominalToken) {
                nominalId++;
                if (nominalId === from[0]) {
                    scanStart = i + 1;
                    break;
                }
            }
        }
    }
    if (scanStart < 0) {
        throw INVALID_RANGE_MESSAGE;
    }

    // Scan for empty token indices of given `from` and `to` ID.
    let fromIdx = -1;
    let toIdx = -1;
    let emptyId = 1;
    for (let i = scanStart; i < tokens.length && toIdx < 0; i++) {
        if (!(tokens[i] instanceof EmptyToken)) {
            throw "Given ID range contains one or more non-EmptyToken.";
        }
        if (emptyId === from[1]) {
            fromIdx = i;
        } else if (emptyId === to[1]) {
            toIdx = i;
        }
        emptyId++;
    }
    if (fromIdx < 0 || toIdx < 0 || fromIdx >= toIdx) {
        throw INVALID_RANGE_MESSAGE;
    }

    // Number of empty tokens that disappear from the sentence.
    const offset = to[1] - from[1];

    /**
     * A core algorithm to update the deps field of every token outside the merged range.
     * @param onMergedHead A callback responsible for updating the dep at the given index, which
     * points to one of the tokens merged away. It must return the next index to evaluate.
     */
    const rewriteDeps = (onMergedHead) => {
        tokens.forEach((token, i) => {
            if (i >= fromIdx && i <= toIdx) {
                return;
            }
            if (!(token instanceof NominalToken || token instanceof EmptyToken) || !token.deps) {
                return;
            }
            let j = 0;
            while (j < token.deps.length) {
                const headId = token.deps[j][0];
                if (headId[0] === from[0]) {
                    if (headId[1] > to[1]) {
                        // Points to an empty token after the merged range: shift it down.
                        headId[1] -= offset;
                    } else if (headId[1] > from[1] && headId[1] <= to[1]) {
                        j = onMergedHead(token.deps, j);
                        continue;
                    }
                }
                j++;
            }
        });
    };

    switch (policy.headPol) {
        case HeadPolicy.Adjust:
            rewriteDeps((deps, i) => {
                // The merged token keeps the id of the first merged token.
                deps[i][0][1] = from[1];
                return i + 1;
            });
            break;
        case HeadPolicy.Remove:
            rewriteDeps((deps, i) => {
                deps.splice(i, 1);
                // The next dep slid into slot `i`, so the cursor must stay in place.
                return i;
            });
            break;
        default:
            throw "Not yet implemented HeadPolicy type ";
    }

    // The tokens between fromIdx and toIdx are all EmptyToken,
    // so there is no CompoundToken to worry about.
    _merge(tokens.slice(fromIdx, toIdx + 1), policy);
    // Remove all merged tokens but the first one, which now holds the merged content.
    tokens.splice(fromIdx + 1, toIdx - fromIdx);
}

/** Perform merging of nominal tokens. See mergeTokens function doc for more info */
function mergeNominal(sentence, from, to, policy) {
    if (from >= to) {
        return;
    }
    const tokens = sentence.tokens;

    // Scan tokens and compute the indices of the `from` and `to` ids.
    let fromIdx = -1;
    let toIdx = -1;
    let nominalId = 1;
    for (let i = 0; i < tokens.length && toIdx < 0; i++) {
        if (!(tokens[i] instanceof NominalToken)) {
            continue;
        }
        if (nominalId === from) {
            fromIdx = i;
        } else if (nominalId === to) {
            toIdx = i;
        }
        nominalId++;
    }
    if (fromIdx < 0 || toIdx < 0 || fromIdx >= toIdx) {
        throw INVALID_RANGE_MESSAGE;
    }

    // Number of nominal tokens that disappear from the sentence.
    const offset = to - from;

    // Scan for a compound token that will become invalid.
    // A compound must be in front of its nominal tokens according to the CoNLL-U spec.
    for (let i = fromIdx - 1; i >= 0; i--) {
        const token = tokens[i];
        if (!(token instanceof CompoundToken)) {
            continue;
        }
        const [begin, end] = token.id;
        if (from <= begin && to >= end) {
            // The compound would span no more than the merged token: drop it.
            tokens.splice(i, 1);
            fromIdx--;
            toIdx--;
        } else if (to <= end) {
            // The whole merged range is inside the compound: shrink it by the number of tokens removed.
            token.id[1] = end - offset;
        } else if (from <= end) {
            // The merged range runs past the end of the compound: the compound now ends on the merged token.
            token.id[1] = from;
        }
        // Otherwise the compound ends before the merged range and must stay untouched.
        // Compound tokens cannot overlap, so the nearest one is the only candidate.
        break;
    }

    // Compound tokens located after the merged range refer to tokens whose ids all shift down.
    for (let i = toIdx + 1; i < tokens.length; i++) {
        const token = tokens[i];
        if (token instanceof CompoundToken) {
            token.id[0] -= offset;
            token.id[1] -= offset;
        }
    }

    // Only nominal tokens take part in the merge; anything else in between is simply removed.
    const merging = tokens.slice(fromIdx, toIdx + 1).filter((t) => t instanceof NominalToken);

    /**
     * Common update head, deprel, and deps algorithm.
     * It takes two callbacks.
     * - `onMergedHead` is called when the given token has its head field pointing to a merging token.
     * - `onMergedDep` is called when the dep at the given index points to a merging token.
     *   It needs to return the next index of deps to be checked.
     */
    const rewriteReferences = (onMergedHead, onMergedDep) => {
        /** Common deps field update algorithm for both Nominal and Empty token */
        const rewriteDeps = (deps) => {
            if (!deps) {
                return;
            }
            let j = 0;
            while (j < deps.length) {
                const headId = deps[j][0];
                // `[to, m]` is an empty token placed after the last merged token, so it survives the merge.
                const survives = headId[0] > to || (headId[0] === to && headId.length === 2);
                if (survives) {
                    headId[0] -= offset; // Adjust head id of all subsequent tokens
                    j++;
                } else if (headId[0] >= from && headId[0] <= to) {
                    j = onMergedDep(deps, j); // The dep at index j points to a merging token
                } else {
                    j++;
                }
            }
        };
        tokens.forEach((token, i) => {
            if (i > fromIdx && i <= toIdx) {
                return; // these indices will be removed anyway
            }
            if (token instanceof NominalToken) {
                if (token.head > to) {
                    token.head -= offset; // adjust all subsequent tokens
                } else if (token.head >= from && token.head <= to) {
                    onMergedHead(token); // head points into the merge range
                }
                rewriteDeps(token.deps);
            } else if (token instanceof EmptyToken) {
                rewriteDeps(token.deps);
            }
        });
    };

    switch (policy.headPol) {
        case HeadPolicy.Remove:
            rewriteReferences(
                (token) => {
                    token.head = undefined;
                    token.deprel = undefined;
                },
                (deps, i) => {
                    deps.splice(i, 1);
                    // No increment: the next dep slid into slot `i`.
                    return i;
                },
            );
            break;
        case HeadPolicy.Adjust:
            rewriteReferences(
                (token) => {
                    token.head = from;
                },
                (deps, i) => {
                    deps[i][0][0] = from;
                    if (deps[i][0].length === 2) {
                        // Every empty token inside the merging range will be removed,
                        // so the dep can only point to the merged id.
                        deps[i][0].pop();
                    }
                    return i + 1; // move the cursor to the next dep
                },
            );
            break;
        default:
            throw "Not yet implemented HeadPolicy type";
    }

    // Now merge tokens into the first token.
    _merge(merging, policy);
    // Remove all merged tokens but the first one, which now holds the merged content.
    tokens.splice(fromIdx + 1, toIdx - fromIdx);
}

/**
 * Merge the content of every given token into the first one, in place.
 * The form is the concatenation of every form honouring `SpaceAfter=No`; every other field
 * is computed by the matching callback of `policy`. `head`/`deprel` are only set when the
 * first token is a `NominalToken`.
 *
 * @param tokens Non-empty list of tokens to merge. The first one receives the result.
 * @param policy The `MergePolicy` providing the field callbacks.
 */
function _merge(tokens, policy) {
    const merged = tokens[0];
    const form = joinRespectingSpaceAfter(tokens, (t) => t.form);
    const lemma = policy.lemma(tokens);
    const upos = policy.upos(tokens);
    const xpos = policy.xpos ? policy.xpos(tokens) : undefined;
    const feats = policy.feats(tokens);
    const deps = policy.deps(tokens);
    const misc = policy.misc(tokens);
    if (nominal_guard(tokens)) {
        const [head, deprel] = policy.headRels(tokens);
        merged.head = head;
        merged.deprel = deprel;
    }
    merged.form = form;
    merged.lemma = lemma;
    merged.upos = upos;
    merged.xpos = xpos;
    merged.feats = feats;
    merged.deps = deps;
    merged.misc = misc;
}
//# sourceMappingURL=sentence.js.map