// conllu-core
// Version:
// A core type to handle CoNLL-U format
// 406 lines • 17.3 kB
// JavaScript
import { CompoundToken, EmptyToken, NominalToken, Relation } from ".";
/**
 * Policy for a `head` field that points at a token being merged.
 * Heads that point at any other token are renumbered by the merge itself; heads that
 * point into the merge range may need different treatment depending on the caller's
 * requirement, which is what this enum selects.
 */
export var HeadPolicy = {
    /** Update every `head` that points to a token within the merge range to the merged `ID` */
    Adjust: 0,
    /** Remove any `head` linked to a token within the merge range */
    Remove: 1,
    // Reverse (value -> name) lookup entries, as a numeric enum provides.
    0: "Adjust",
    1: "Remove"
};
/**
 * Type guard for a token list: reports whether the list holds nominal tokens,
 * judged by its first element only.
 */
function nominal_guard(arr) {
    const first = arr[0];
    const isNominal = first instanceof NominalToken;
    return isNominal;
}
/**
 * A merging policy: the callbacks that decide each field of the merged token.
 *
 * It has the following attributes:
 * - `headPol` determines how dependants whose head lies inside the merge range are handled.
 *   The default value is `HeadPolicy.Adjust`.
 * - `lemma` is a callback that takes all tokens being merged and returns the merged lemma.
 *   The default joins every lemma with a single space, omitting the space after a token whose
 *   `misc` contains `SpaceAfter=No`; it returns `undefined` when no token has a lemma.
 * - `upos` is a callback that takes all tokens being merged and returns the merged part-of-speech.
 *   The default is the part-of-speech of the first token being merged.
 * - `xpos` is an optional callback that takes all tokens being merged and returns the merged
 *   language specific part-of-speech. No default is installed.
 * - `feats` is a callback that takes all tokens being merged and returns the merged feature(s).
 *   The default is a flattened list of the features of every token, de-duplicated by identity,
 *   or `undefined` when that list is empty.
 * - `headRels` is a callback that takes all tokens being merged and returns the merged
 *   `[head, deprel]` pair. The default is `[0, new Relation("root")]` when any merged token has
 *   head `0`; otherwise it is the head/deprel of the first token being merged.
 * - `deps` is a callback that takes all tokens being merged and returns the merged `deps` field.
 *   The default flattens the deps of every token, de-duplicates them by identity and sorts them
 *   by head id (compared numerically, component by component) and then by relation;
 *   it returns `undefined` when the list is empty.
 * - `misc` is a callback that takes all tokens being merged and returns the merged `misc` field.
 *   The default is every unique misc entry except `SpaceAfter=...`, followed by the
 *   `SpaceAfter=...` entry of the last token if it has one; `undefined` when the list is empty.
 */
var MergePolicy = /** @class */ (function () {
    /** Whether the token's `misc` contains the exact entry `SpaceAfter=No`. */
    const hasNoSpaceAfter = (token) => Boolean(token.misc && token.misc.find((m) => m == "SpaceAfter=No"));
    /** `filter` predicate keeping only the first occurrence of each value (identity comparison). */
    const firstOccurrence = (value, index, all) => all.indexOf(value) === index;
    /** Three-way comparison using the language's relational operators. */
    const threeWay = (a, b) => {
        if (a < b)
            return -1;
        if (a > b)
            return 1;
        return 0;
    };
    /**
     * Compare two head ids numerically.
     * Ids are arrays such as `[major]` or `[major, minor]`. Comparing the arrays directly with
     * `<` would coerce them to strings and order `[10]` before `[2]`, so each component is
     * compared as a number instead. A missing component counts as 0, which keeps `[n]` in
     * front of `[n, 1]`.
     */
    const compareHeadIds = (a, b) => {
        if (!Array.isArray(a) || !Array.isArray(b))
            return threeWay(a, b);
        const width = Math.max(a.length, b.length);
        for (let k = 0; k < width; k++) {
            const left = a[k] === undefined ? 0 : a[k];
            const right = b[k] === undefined ? 0 : b[k];
            const order = threeWay(left, right);
            if (order !== 0)
                return order;
        }
        return 0;
    };
    function MergePolicy() {
        this.headPol = HeadPolicy.Adjust;
        this.lemma = (tokens) => {
            // Tracks whether the last emitted lemma carries a trailing separator to strip.
            let trimLast = true;
            const pieces = tokens.filter((t) => t.lemma != null).map((t) => {
                trimLast = !hasNoSpaceAfter(t);
                return trimLast ? t.lemma + ' ' : t.lemma;
            });
            if (pieces.length === 0)
                return undefined;
            const joined = pieces.join('');
            return trimLast ? joined.slice(0, -1) : joined;
        };
        this.upos = (tokens) => tokens[0].upos;
        this.feats = (tokens) => {
            const feats = tokens
                .filter((t) => t.feats != null)
                .flatMap((t) => t.feats)
                .filter(firstOccurrence);
            return feats.length > 0 ? feats : undefined;
        };
        this.headRels = (tokens) => {
            // A root inside the merge range makes the merged token the root.
            if (tokens.find((t) => t.head == 0))
                return [0, new Relation("root")];
            return [tokens[0].head, tokens[0].deprel];
        };
        this.deps = (tokens) => {
            const deps = tokens
                .filter((t) => t.deps != null)
                .flatMap((t) => t.deps)
                .filter(firstOccurrence)
                // Each dep is `[headId, relation]`: order by head id first, then by relation.
                .sort((a, b) => compareHeadIds(a[0], b[0]) || threeWay(a[1], b[1]));
            return deps.length > 0 ? deps : undefined;
        };
        this.misc = (tokens) => {
            const misc = Array.from(new Set(tokens.filter((t) => t.misc != undefined).flatMap((t) => t.misc)))
                .filter((m) => !m.startsWith("SpaceAfter="));
            // SpaceAfter needs special treatment as it is predefined by CoNLL-U as per
            // https://universaldependencies.org/format.html#untokenized-text
            const last = tokens[tokens.length - 1];
            const at = last.misc ? last.misc.findIndex((m) => m.startsWith("SpaceAfter=")) : -1;
            if (at >= 0)
                misc.push(last.misc[at]); // Copy SpaceAfter from the last merged token
            return misc.length > 0 ? misc : undefined;
        };
    }
    return MergePolicy;
}());
export { MergePolicy };
/**
 * Perform token merging.
 * It is an error to merge different types of tokens.
 * For example, you can **not** merge `from` = 0.1 and `to` = 2.
 * If merging causes a `CompoundToken` that refers to the merged tokens to become invalid,
 * that `CompoundToken` is removed automatically.
 *
 * The behavior is undefined if the `Sentence` being merged is invalid.
 * The caller should ensure that the `Sentence` is valid; the `validate` method of `Sentence`
 * can be used for such validation.
 *
 * You can merge multiple `EmptyToken`.
 * You can merge multiple `NominalToken`.
 * If there is an `EmptyToken` or a `CompoundToken` between the `NominalToken`s being merged,
 * it is removed automatically.
 * You can never merge a `CompoundToken`.
 *
 * The merge may leave an invalid `EmptyToken` behind if that token's only dependency points at
 * a token being merged. Such an invalid `EmptyToken` is retained. It is the user's
 * responsibility to fix the dependency of every `EmptyToken` that becomes invalid.
 *
 * @param sentence An object of Sentence to merge tokens
 * @param from The first token to be merged. It is an `id`, not an `index` of token:
 * a number for a `NominalToken`, or `[number, number]` for an `EmptyToken`.
 * @param to An inclusive id of the last token to be merged. It is an `id`, not an `index` of token,
 * and must be of the same kind as `from`.
 * @param policy Specifies how the merged fields are built and how a head that points to any
 * token being merged is treated. When omitted, a new `MergePolicy` is used, whose head policy is `Adjust`.
 * @throws A string message when `from`/`to` are not both numbers or both `[number, number]` pairs.
 */
export function mergeTokens(sentence, from, to, policy) {
    // The documented default was never applied: without it a missing policy crashes on `policy.headPol`.
    const effectivePolicy = policy == null ? new MergePolicy() : policy;
    if (typeof from == "number" && typeof to == "number") {
        mergeNominal(sentence, from, to, effectivePolicy);
        return;
    }
    const bothPairs = Array.isArray(from) && Array.isArray(to) && from.length == 2 && to.length == 2;
    if (bothPairs) {
        mergeEmpty(sentence, from, to, effectivePolicy);
        return;
    }
    // Mixed or malformed ids are an error rather than a silent no-op.
    throw "`id` must either be number for `NominalToken` or [number, number] for `EmptyToken`";
}
/**
 * Merge a run of empty tokens that follow the same nominal token.
 * See the `mergeTokens` function doc for the overall contract.
 *
 * The tokens `from`..`to` (inclusive) are merged into the token at `from`; the others are
 * removed from `sentence.tokens`. `deps` entries of every token outside the merge range are
 * renumbered: entries pointing past the range are shifted down, and entries pointing into the
 * range are adjusted to the merged id or removed, according to `policy.headPol`.
 *
 * NOTE(review): `deps` carried by the merged tokens themselves are not renumbered here; they
 * are handed to `policy.deps` unchanged.
 *
 * @param sentence Object whose `tokens` array is modified in place
 * @param from `[major, minor]` id of the first empty token to merge
 * @param to `[major, minor]` id of the last empty token to merge; must share `major` with `from`
 * @param policy Merge policy; `headPol` selects how deps into the merge range are treated
 * @throws A string message when the ids cannot be located, when a non-empty token lies in the
 * range, or when `policy.headPol` is not a supported `HeadPolicy`.
 */
function mergeEmpty(sentence, from, to, policy) {
    // Compare the minor parts as numbers: `from >= to` on the arrays themselves compares their
    // string forms, which wrongly rejects ranges such as [1, 2]..[1, 10].
    if (from[0] != to[0] || from[1] >= to[1])
        return;
    const INVALID_ID = "Given from/to `id` is invalid. Either one or both of it is not exist in given sentence.";
    if (from[0] < 0 || from[1] < 1) {
        throw INVALID_ID;
    }
    // Empty tokens `n.x` follow nominal token `n`; `0.x` sit at the very start of the sentence.
    let scan_start = 0;
    if (from[0] > 0) {
        let nominal_id = 0;
        let anchor = -1;
        for (let i = 0; i < sentence.tokens.length; i++) {
            if (sentence.tokens[i] instanceof NominalToken) {
                nominal_id++;
                if (nominal_id == from[0]) {
                    anchor = i;
                    break;
                }
            }
        }
        if (anchor < 0) {
            throw INVALID_ID;
        }
        scan_start = anchor + 1;
    }
    // Walk the run of empty tokens to translate the minor ids into token indices.
    let from_idx = -1;
    let to_idx = -1;
    let minor = 1;
    for (let i = scan_start; i < sentence.tokens.length; i++) {
        if (!(sentence.tokens[i] instanceof EmptyToken)) {
            throw "Given ID range contains one or more non-EmptyToken.";
        }
        if (minor == from[1]) {
            from_idx = i;
        }
        else if (minor == to[1]) {
            to_idx = i;
            break;
        }
        minor++;
    }
    if (from_idx < 0 || to_idx < 0) {
        throw INVALID_ID;
    }
    // Choose how a dep pointing into the merge range is resolved. Each handler returns the
    // index of the next dep to examine.
    let resolve;
    switch (policy.headPol) {
        case HeadPolicy.Adjust:
            resolve = (deps, j) => {
                // The merged token keeps the id of the first token in the range.
                deps[j][0][1] = from[1];
                return j + 1;
            };
            break;
        case HeadPolicy.Remove:
            resolve = (deps, j) => {
                deps.splice(j, 1);
                return j; // the following dep has moved into slot j
            };
            break;
        default:
            throw "Not yet implemented HeadPolicy type ";
    }
    // Number of ids that disappear; deps pointing past the range shift down by this much.
    const removed = to[1] - from[1];
    for (let i = 0; i < sentence.tokens.length; i++) {
        if (i >= from_idx && i <= to_idx)
            continue;
        const token = sentence.tokens[i];
        if (!(token instanceof NominalToken || token instanceof EmptyToken) || !token.deps)
            continue;
        let j = 0;
        while (j < token.deps.length) {
            const head = token.deps[j][0];
            if (head[0] == from[0] && head[1] > to[1]) {
                head[1] -= removed;
                j++;
            }
            else if (head[0] == from[0] && head[1] >= from[1] && head[1] <= to[1]) {
                j = resolve(token.deps, j);
            }
            else {
                j++;
            }
        }
    }
    // Every token between from_idx and to_idx is an EmptyToken (checked by the scan above),
    // so there is no CompoundToken to worry about.
    const tokens = sentence.tokens.slice(from_idx, to_idx + 1);
    // Merge all empty tokens in the range into the first one
    _merge(tokens, policy);
    // Remove the tokens that were merged away
    sentence.tokens.splice(from_idx + 1, to_idx - from_idx);
}
/**
 * Merge the nominal tokens with ids `from`..`to` (inclusive) into the token at `from`.
 * See the `mergeTokens` function doc for the overall contract.
 *
 * Besides merging, this renumbers everything that refers to token ids:
 * - `head` and `deps` of tokens outside the merge range are shifted when they point past the
 *   range and are adjusted or removed (per `policy.headPol`) when they point into it;
 * - the nearest `CompoundToken` in front of the range is removed or shrunk if it overlaps the range;
 * - `CompoundToken`s located after the range have both ends of their id shifted.
 * Every token positioned between the first and last merged token, including any `EmptyToken`
 * or `CompoundToken`, is removed from `sentence.tokens`.
 *
 * @param sentence Object whose `tokens` array is modified in place
 * @param from Id (not index) of the first nominal token to merge
 * @param to Id (not index) of the last nominal token to merge
 * @param policy Merge policy; `headPol` selects how heads into the merge range are treated
 * @throws A string message when the ids cannot be located or `policy.headPol` is not a
 * supported `HeadPolicy`.
 */
function mergeNominal(sentence, from, to, policy) {
    if (from >= to)
        return;
    // Translate the 1-based nominal ids into indices of `sentence.tokens`.
    let id = 1;
    let from_idx = 0;
    let to_idx = 0;
    for (let i = 0; i < sentence.tokens.length; i++) {
        if (!(sentence.tokens[i] instanceof NominalToken))
            continue;
        if (id == from) {
            from_idx = i;
        }
        else if (id == to) {
            to_idx = i;
            break;
        }
        id++;
    }
    if (from_idx >= to_idx) {
        throw "Given from/to `id` is invalid. Either one or both of it is not exist in given sentence.";
    }
    // Pick the handlers before touching the sentence so that an unsupported policy does not
    // leave it half-modified.
    // - `update_head` is called for a token whose `head` points into the merge range.
    // - `update_deps` is called for a dep that points into the merge range and returns the
    //   index of the next dep to check.
    let update_head;
    let update_deps;
    switch (policy.headPol) {
        case HeadPolicy.Remove:
            update_head = (token) => {
                token.head = undefined;
                token.deprel = undefined;
            };
            update_deps = (deps, j) => {
                deps.splice(j, 1);
                return j; // the following dep has moved into slot j
            };
            break;
        case HeadPolicy.Adjust:
            update_head = (token) => {
                token.head = from;
            };
            update_deps = (deps, j) => {
                deps[j][0][0] = from;
                if (deps[j][0].length == 2) {
                    // Every empty token inside the merge range is removed,
                    // so the dep can only point to the merged id itself.
                    deps[j][0].pop();
                }
                return j + 1;
            };
            break;
        default:
            throw "Not yet implemented HeadPolicy type";
    }
    // Number of ids that disappear; ids past the range shift down by this much.
    const offset = to - from;
    // A compound token precedes its first nominal token, so one that overlaps the start of the
    // range can only sit in front of from_idx. Compound tokens cannot overlap each other,
    // hence only the nearest one has to be examined.
    for (let i = from_idx - 1; i >= 0; i--) {
        const token = sentence.tokens[i];
        if (!(token instanceof CompoundToken))
            continue;
        const begin = token.id[0];
        const end = token.id[1];
        if (from <= begin && to >= end) {
            // The compound would span no more than the merged token: drop it.
            sentence.tokens.splice(i, 1);
            from_idx--;
            to_idx--;
        }
        else if (to <= end) {
            // The compound encloses the whole range: it shrinks by the number of removed ids.
            token.id[1] = end - offset;
        }
        else if (from <= end) {
            // The compound covers only the head of the range: it now ends at the merged token.
            token.id[1] = from;
        }
        // Otherwise the compound ends before the range and must stay untouched.
        break;
    }
    const tokens = sentence.tokens.slice(from_idx, to_idx + 1).filter((t) => t instanceof NominalToken);
    /** Renumber one `deps` list; shared by nominal and empty tokens. */
    const core_deps = (deps) => {
        if (!deps)
            return;
        let j = 0;
        while (j < deps.length) {
            const head = deps[j][0];
            // Empty tokens `to.x` survive the merge and become `from.x`, so they shift like
            // everything after the range; the nominal `to` itself belongs to the range.
            const pastRange = head[0] > to || (head[0] == to && head.length == 2);
            if (pastRange) {
                head[0] -= offset;
                j++;
            }
            else if (head[0] >= from && head[0] <= to) {
                j = update_deps(deps, j);
            }
            else {
                j++;
            }
        }
    };
    for (let i = 0; i < sentence.tokens.length; i++) {
        if (i > from_idx && i <= to_idx)
            continue; // these tokens are removed below
        const token = sentence.tokens[i];
        if (token instanceof NominalToken) {
            if (token.head > to) {
                token.head -= offset;
            }
            else if (token.head >= from && token.head <= to) {
                update_head(token);
            }
            core_deps(token.deps);
        }
        else if (token instanceof EmptyToken) {
            core_deps(token.deps);
        }
        else if (token instanceof CompoundToken && i > to_idx) {
            // A compound located after the range covers tokens whose ids all shift down.
            token.id[0] -= offset;
            token.id[1] -= offset;
        }
    }
    // Now merge the tokens into the first one.
    _merge(tokens, policy);
    // Remove the tokens that were merged away
    sentence.tokens.splice(from_idx + 1, to_idx - from_idx);
}
/**
 * Fold `tokens` into their first element.
 * The merged `form` is the concatenation of every form, separated by a space unless the
 * token's `misc` contains `SpaceAfter=No`; all remaining fields come from the `policy`
 * callbacks. `head`/`deprel` are only written when the list holds nominal tokens.
 *
 * @param tokens Tokens being merged; the first one receives the merged values
 * @param policy Callbacks producing the merged fields (`xpos` is optional)
 */
function _merge(tokens, policy) {
    const target = tokens[0];
    // Reflects the last token processed: true when its piece ends with a separator space.
    let endsWithSpace = true;
    const pieces = tokens.map((t) => {
        const glued = t.misc && t.misc.find((m) => m == "SpaceAfter=No");
        endsWithSpace = !glued;
        return glued ? t.form : t.form + " ";
    });
    let form = pieces.reduce((acc, piece) => acc + piece);
    if (endsWithSpace) {
        form = form.slice(0, -1);
    }
    // Evaluate every policy callback before mutating the target, since the target is
    // itself part of `tokens`.
    const lemma = policy.lemma(tokens);
    const upos = policy.upos(tokens);
    const xpos = policy.xpos ? policy.xpos(tokens) : undefined;
    const feats = policy.feats(tokens);
    const deps = policy.deps(tokens);
    const misc = policy.misc(tokens);
    if (nominal_guard(tokens)) {
        const rel = policy.headRels(tokens);
        const head = rel[0];
        const deprel = rel[1];
        target.head = head;
        target.deprel = deprel;
    }
    Object.assign(target, { form, lemma, upos, xpos, feats, deps, misc });
}
//# sourceMappingURL=sentence.js.map