/*
 * conllu-core
 * Version:
 * A core type to handle CoNLL-U format
 * 1,163 lines • 56.5 kB
 * JavaScript
 */
/**
 * Shallow-merge helper (TypeScript `__assign`).
 * Copies the own enumerable properties of every argument after the first onto the
 * first argument and returns that first argument.
 * On its first call it rebinds itself to `Object.assign` when that exists, otherwise
 * to the manual fallback below.
 */
var __assign = (this && this.__assign) || function () {
    __assign = Object.assign || function (target) {
        for (var idx = 1; idx < arguments.length; idx++) {
            var source = arguments[idx];
            for (var key in source) {
                if (Object.prototype.hasOwnProperty.call(source, key)) {
                    target[key] = source[key];
                }
            }
        }
        return target;
    };
    return __assign.apply(this, arguments);
};
/**
 * Async-function driver (TypeScript `__awaiter`).
 * Runs `generator` with `thisArg`/`_arguments` and returns a promise (of type `P`,
 * defaulting to `Promise`) that settles with the generator's return value or with
 * the first error that escapes it. Every yielded value is awaited before the
 * generator is resumed with its result.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap plain values in a `P` so that `.then` is always available.
    function adopt(value) {
        if (value instanceof P) {
            return value;
        }
        return new P(function (resolve) { resolve(value); });
    }
    return new (P || (P = Promise))(function (resolve, reject) {
        function advance(result) {
            if (result.done) {
                resolve(result.value);
            }
            else {
                adopt(result.value).then(onFulfilled, onRejected);
            }
        }
        function onFulfilled(value) {
            try {
                advance(generator.next(value));
            }
            catch (err) {
                reject(err);
            }
        }
        function onRejected(reason) {
            try {
                advance(generator["throw"](reason));
            }
            catch (err) {
                reject(err);
            }
        }
        advance((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
/**
 * Generator emulation helper (TypeScript `__generator`).
 * Wraps `body`, a function containing a `switch (_.label)` state machine, in an object
 * exposing `next`, `throw` and `return` (and `Symbol.iterator` when available).
 *
 * `body` communicates through instruction tuples `[opcode, value]`. As used in this file:
 * 0 = next, 1 = throw, 2 = return, 3 = break (jump to label), 4 = yield,
 * 5 = delegate to an inner iterator, 6 = caught error, 7 = endfinally.
 * `_.trys` holds the active protected regions as label tuples that are read below as
 * `[t[0], t[1], t[2], t[3]]` (presumably try start, catch, finally, end — confirm against tslib).
 */
var __generator = (this && this.__generator) || function (thisArg, body) {
// `_` is the state object handed to `body`; `f` is the re-entrancy flag, `y` the delegated iterator, `t` a scratch slot.
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
if (f) throw new TypeError("Generator is already executing.");
// `_` is set to 0 once the generator has finished, which ends this loop.
while (_) try {
// While delegating to `y`, forward the operation to it and return its result until it is done.
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
case 0: case 1: t = op; break;
case 4: _.label++; return { value: op[1], done: false };
case 5: _.label++; y = op[1]; op = [0]; continue;
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// Opcodes 2, 3 and 6: route through the innermost protected region, if any.
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
op = body.call(thisArg, _);
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// An odd or 4-flagged opcode left over here means an uncaught error: rethrow it; otherwise report completion.
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
/**
 * `for await` helper (TypeScript `__asyncValues`).
 * Returns `o[Symbol.asyncIterator]()` when `o` has that method. Otherwise it takes a
 * synchronous iterator from `o` (through `__values` if such a function is defined, else
 * `o[Symbol.iterator]()`) and wraps it in an object whose `next`/`throw`/`return`
 * return promises of `{ value, done }`, where `value` has been awaited.
 * @throws TypeError when `Symbol.asyncIterator` is not defined
 */
var __asyncValues = (this && this.__asyncValues) || function (o) {
if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
var m = o[Symbol.asyncIterator], i;
return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
// Only methods that exist on the underlying iterator are exposed on the wrapper.
function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
};
/**
 * Marker wrapper (TypeScript `__await`): stores the awaited operand in `.v`.
 * Works with or without `new`; `__asyncGenerator` tests yielded values with
 * `instanceof __await` to tell an `await` apart from a `yield`.
 */
var __await = (this && this.__await) || function (v) {
    if (this instanceof __await) {
        this.v = v;
        return this;
    }
    return new __await(v);
};
/**
 * Async-generator driver (TypeScript `__asyncGenerator`).
 * Starts `generator` with `thisArg`/`_arguments` and returns an async-iterable object
 * whose `next`/`throw`/`return` each return a promise. Requests are queued in `q` as
 * `[method, value, resolve, reject]` and served one at a time.
 * @throws TypeError when `Symbol.asyncIterator` is not defined
 */
var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) {
if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
var g = generator.apply(thisArg, _arguments || []), i, q = [];
return i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i;
// Enqueue the request; only start the generator if the queue was empty before the push.
function verb(n) { if (g[n]) i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; }
function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }
// A yielded `__await` instance is an `await`: wait for it and feed the result back in.
// Anything else is a real `yield`/completion and settles the request at the head of the queue.
function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }
function fulfill(value) { resume("next", value); }
function reject(value) { resume("throw", value); }
// Settle the head request, drop it, then start the next queued request if there is one.
function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }
};
import XRegExp from 'xregexp';
import { createReadStream, createWriteStream } from 'fs';
import { createInterface } from 'readline';
/**
 * Convert the given token fields into a tab separated CoNLL-U string.
 *
 * - If `id` has exactly two elements the token is treated as a compound token. The result
 *   is a complete line starting with `id[0]-id[1]`, followed by `form`, seven `_` columns
 *   and `misc`.
 * - Otherwise the ID column is excluded from the result: the string starts with `form` and
 *   contains the nine remaining columns, so the caller must prepend the ID itself.
 *
 * `form`, `lemma` and `deprel` default to `_` when undefined; every other missing field is
 * printed as `_`. A `head` of `0` (the root in CoNLL-U) is printed as `0`, not `_`.
 *
 * @param _a Dictionary with optional `id`, `form`, `lemma`, `upos`, `xpos`, `feats`, `head`,
 * `deprel`, `deps` (array of `[headTuple, relation]`) and `misc` (array of strings)
 */
function tokenToString(_a) {
    var _b = _a.id, id = _b === void 0 ? [] : _b;
    var _c = _a.form, form = _c === void 0 ? "_" : _c;
    var _d = _a.lemma, lemma = _d === void 0 ? "_" : _d;
    var _e = _a.deprel, deprel = _e === void 0 ? "_" : _e;
    var upos = _a.upos, xpos = _a.xpos, feats = _a.feats, head = _a.head, deps = _a.deps, misc = _a.misc;
    var miscCol = misc ? misc.join("|") : "_";
    if (id.length == 2) {
        return id[0] + "-" + id[1] + "\t" + form + "\t_\t_\t_\t_\t_\t_\t_\t" + miscCol;
    }
    var uposCol = upos ? upos.toString() : "_";
    var xposCol = xpos ? xpos.toString() : "_";
    var featsCol = feats ? feats.map(function (f) { return f.toString(); }).join("|") : "_";
    // 0 is falsy but is a valid HEAD value (root), so it must not collapse to "_".
    var headCol = head === 0 || head ? head : "_";
    var depsCol = deps ? deps.map(function (dep) {
        var depHead = dep[0], rel = dep[1];
        // One element: regular head `n`; two elements: empty-node head `n.m`.
        return (depHead.length == 1 ? depHead[0] : depHead[0] + "." + depHead[1]) + ":" + rel;
    }).join("|") : "_";
    return form + "\t" + lemma + "\t" + uposCol + "\t" + xposCol + "\t" + featsCol + "\t" + headCol + "\t" + deprel + "\t" + depsCol + "\t" + miscCol;
}
/**
 * Sort the given deps in place, as required by the `conllu` specification, and return the
 * same array.
 *
 * Ordering: by head id first; for equal head ids a plain head (`[n]`) comes before any
 * empty-node head (`[n, m]`), and empty-node heads are ordered by `m`; if the heads are
 * identical, entries are ordered by relation name (its string form) in ascending order.
 *
 * @param deps An array of tuples `[Head, DepsRelation]` where `Head` is a 1 integer element
 * tuple (`[1]`) for a `NominalToken` head or a 2 integer elements tuple (`[1, 1]`) for an
 * `EmptyToken` head.
 */
function sortDeps(deps) {
    return deps.sort(function (d1, d2) {
        var headCmp = d1[0][0] - d2[0][0];
        if (headCmp != 0) {
            return headCmp; // HeadIds differ
        }
        // A missing empty part counts as 0 so that `n` sorts before `n.1`.
        var emptyCmp = (d1[0][1] || 0) - (d2[0][1] || 0);
        if (emptyCmp != 0) {
            return emptyCmp;
        }
        var rel1 = String(d1[1]);
        var rel2 = String(d2[1]);
        if (rel1 < rel2) {
            return -1;
        }
        return rel1 > rel2 ? 1 : 0;
    });
}
/**
 * Parse a string that contains one or more deps.
 * Deps are separated by `|`. Each dep is `Head`:`DepsRelation`, split at the first `:`.
 * `Head` is either an integer (`1`) or `integer.integer` (`1.1`) for an empty node.
 * See [enhanced dependencies](https://universaldependencies.org/u/overview/enhanced-syntax.html)
 * for valid relation name patterns.
 *
 * @param str A string to be parsed into `[Head, DepsRelation]` tuples, where `Head` is
 * `[int]` or `[int, int]`.
 * @returns An array with one tuple per dep, in input order.
 * @throws A string message when a dep has no `:`, when the head has more than two parts,
 * or when a head part is not a decimal integer. The `DepsRelation` constructor is called
 * with the text after the `:`; whether it validates the name is not visible in this block.
 */
function parseDeps(str) {
    // Decimal digits only (surrounding whitespace tolerated, as `parseInt` would).
    var decimal = /^\s*\d+\s*$/;
    return str.split("|").map(function (dep) {
        var splitPoint = dep.indexOf(":");
        if (splitPoint == -1) {
            // Without this guard `slice(0, -1)` would silently drop the last character.
            throw "Deps must be in format `head`:`relation`";
        }
        var ids = dep.slice(0, splitPoint).split(".");
        if (ids.length > 2) {
            throw "ID of Deps must be either `int` or `int`.`int`";
        }
        var head = ids[0], empty = ids[1];
        if (!decimal.test(head)) {
            throw "Deps contain non-numeric head position";
        }
        var headRef = [parseInt(head, 10)];
        if (empty != undefined) {
            if (!decimal.test(empty)) {
                throw "Deps contain non-numeric null node position";
            }
            headRef.push(parseInt(empty, 10));
        }
        return [headRef, new DepsRelation(dep.slice(splitPoint + 1))];
    });
}
/**
 * Parse `str` as a base-10 integer head index.
 * @param str Content of the HEAD column
 * @returns The integer, or `undefined` when `str` is not a decimal integer (for example `_`).
 * This function does not throw.
 */
function parseHead(str) {
    // Radix 10 so that hexadecimal text such as "0x10" is rejected instead of read as 16.
    var id = parseInt(str, 10);
    if (isNaN(id) || id != +str) {
        return undefined;
    }
    return id;
}
/**
 * Parse a line whose leading '#' sign has already been stripped.
 * This helper exists for performance: it decides in a single pass whether the line is a
 * `Meta` or a `Comment`, instead of trying one parser and then the other.
 * A line containing `=` becomes a `Meta` (trimmed text before the first `=` is the key,
 * trimmed text after it is the value); any other line becomes a `Comment`.
 * @param str a line of string with the hash already removed
 */
function parseHashLine(str) {
    var eqPos = str.indexOf("=");
    if (eqPos == -1) {
        return new Comment(str);
    }
    var key = str.slice(0, eqPos).trim();
    var value = str.slice(eqPos + 1).trim();
    return new Meta({ key: key, value: value });
}
/**
 * Attempt to parse one token line.
 *
 * The ID column decides the kind of token and the shape of the result:
 * - `int-int`: compound token; returns the `CompoundToken` itself
 * - `int.int`: empty token; returns `[head, id, EmptyToken]`
 * - `int`: nominal token; returns `[id, NominalToken]`
 *
 * Every numeric part of the ID must be written in decimal digits.
 *
 * @param str A string that contains one token line without the new line character
 * @param Parser An XPOSParser used to parse the `xpos` column
 * @throws A string message when the line doesn't have 10 tab separated columns or the ID is malformed
 */
function parseToken(str, Parser) {
    var tokens = str.split("\t");
    if (tokens.length != 10) {
        throw "Token line must contains 10 columns separate by tap";
    }
    var rawId = tokens[0];
    // Unary `+` would turn "" into 0 and accept hex/exponent text, so check digits explicitly.
    var isDecimal = function (text) { return /^\s*\d+\s*$/.test(text); };
    var idx = rawId.indexOf("-");
    if (idx != -1) {
        // possibly compound token
        var startText = rawId.slice(0, idx);
        var endText = rawId.slice(idx + 1);
        if (!isDecimal(startText) || !isDecimal(endText)) {
            throw "Compound id must both be integer number";
        }
        return parseCompoundUncheck(parseInt(startText, 10), parseInt(endText, 10), tokens.slice(1));
    }
    idx = rawId.indexOf(".");
    if (idx != -1) {
        // possibly empty token
        var headText = rawId.slice(0, idx);
        var emptyText = rawId.slice(idx + 1);
        if (!isDecimal(headText) || !isDecimal(emptyText)) {
            throw "Empty id format must be <int>.<int> such as 1.1, 1.2, .. , 1.10";
        }
        return [parseInt(headText, 10), parseInt(emptyText, 10), parseEmptyUncheck(tokens.slice(1), Parser)];
    }
    if (!isDecimal(rawId)) {
        throw "ID must either be <int>.<int> or <int>-<int> or <int> such as 1, 1.2, or 1-2";
    }
    return [parseInt(rawId, 10), parseNominalUncheck(tokens.slice(1), Parser)];
}
/**
 * Parse a string and return the integer value of a nominal token ID, or throw.
 * @param str a string that may contain a decimal integer value
 * @returns the ID as a number
 * @throws the string "The give string is not valid Nominal token" when `str` is not a
 * decimal integer
 */
function parseNominalId(str) {
    // Radix 10 so that hexadecimal text such as "0x10" is rejected instead of read as 16.
    var id = parseInt(str, 10);
    if (isNaN(id) || id != +str) {
        throw "The give string is not valid Nominal token";
    }
    return id;
}
/**
 * Parse the 10 columns of a nominal token line.
 * @param tokens The columns of one token line, ID column included
 * @param Parser An XPOSParser passed on to `parseNominalUncheck`
 * @returns `[id, NominalToken]`
 * @throws A string message when `tokens` doesn't have exactly 10 elements; errors from
 * `parseNominalId` propagate unchanged
 */
function parseNominal(tokens, Parser) {
    if (tokens.length != 10) {
        throw "Require tab separate string with 10 columns";
    }
    var nominalId = parseNominalId(tokens[0]);
    var token = parseNominalUncheck(tokens.slice(1), Parser);
    return [nominalId, token];
}
/**
 * Core `NominalToken` construction shared by the nominal parsers. Unlike `parseNominal`
 * it doesn't check the number of elements in `tokens`.
 * `tokens` must be the token line with the ID column stripped, i.e. 9 elements:
 * form, lemma, upos, xpos, feats, head, deprel, deps, misc.
 * Columns holding `_` become `undefined`, except `form`, which is always taken as is.
 * `xpos` is only parsed when a `Parser` is given.
 */
function parseNominalUncheck(tokens, Parser) {
    var head = parseHead(tokens[5]);
    var lemma = tokens[1] == "_" ? undefined : tokens[1];
    var upos = UPOS[tokens[2]];
    var xpos = undefined;
    if (Parser && tokens[3] != "_") {
        xpos = Parser.parse(tokens[3]);
    }
    var feats = undefined;
    if (tokens[4] != "_") {
        // Each feature is `key=value1,value2,...`
        feats = tokens[4].split("|").map(function (pair) {
            var parts = pair.split("=");
            return new Feature(parts[0], parts[1].split(","));
        });
    }
    // `parseHead` gives `undefined` for a non-integer head, for which `isNaN` is true.
    var headRel = isNaN(head) ? undefined : [head, new Relation(tokens[6])];
    var deps = tokens[7] != "_" ? parseDeps(tokens[7]) : undefined;
    var misc = tokens[8] != "_" ? tokens[8].split("|") : undefined;
    return new NominalToken({
        form: tokens[0],
        lemma: lemma,
        upos: upos,
        xpos: xpos,
        feats: feats,
        headRel: headRel,
        deps: deps,
        misc: misc
    });
}
/**
 * Parse a string that should contain a compound ID and return `[start, end]`.
 * Note that only the format is checked here; `end` > `start` is NOT verified.
 * @param ids A string in format of "int-int" such as "1-2", both parts in decimal
 * @throws A string message when there aren't exactly two `-` separated parts or when a
 * part is not a decimal integer
 */
function parseCompoundId(ids) {
    var id = ids.split("-");
    if (id.length != 2) {
        throw "CompoundToken need id to be in format `[start, end]` where `end` > `start`";
    }
    // Radix 10 so that hexadecimal parts such as "0x1" fail the equality check below.
    var start = parseInt(id[0], 10);
    var end = parseInt(id[1], 10);
    if (isNaN(start) || isNaN(end) || start != +id[0] || end != +id[1]) {
        // +id[0] and +id[1] turn the strings into numbers which might be int or float;
        // if they don't equal the `parseInt()` result then the part is not an integer
        throw "CompoundToken require both start and end to be integer number";
    }
    return [start, end];
}
/**
 * Parse a compound token from the 10 columns of a token line.
 * @param tokens The columns of one token line, ID column included
 * @returns The `CompoundToken` built by `parseCompoundUncheck`
 * @throws A string message when `tokens` doesn't have exactly 10 elements; errors from
 * `parseCompoundId` propagate unchanged
 */
function parseCompound(tokens) {
    if (tokens.length != 10) {
        throw "CompountToken requires tab separate string with 10 columns";
    }
    var range = parseCompoundId(tokens[0]);
    return parseCompoundUncheck(range[0], range[1], tokens.slice(1));
}
/**
 * Core `CompoundToken` construction. Unlike `parseCompound` it doesn't check the number
 * of elements in `tokens`, and the caller supplies the already parsed ids.
 * @param start id of the first token covered by the compound (inclusive)
 * @param end id of the last token covered by the compound (inclusive)
 * @param tokens the token line with the ID column stripped; only `tokens[0]` (form) and
 * `tokens[8]` (misc, `_` meaning none) are read
 */
function parseCompoundUncheck(start, end, tokens) {
    var rawMisc = tokens[8];
    var misc = rawMisc == "_" ? undefined : rawMisc.split("|");
    return new CompoundToken({
        id: [start, end],
        form: tokens[0],
        misc: misc
    });
}
/**
 * Parse a string and return the pair of ids `[headId, id]` of an empty token, or throw.
 * @param str A string that contains an empty id in format of "int.int", for example "1.1";
 * both parts must be decimal integers
 * @throws The string "ID of empty token must be in format `int.int`" when there aren't
 * exactly two `.` separated parts or when a part is not a decimal integer
 */
function parseEmtpyId(str) {
    var emptyId = str.split(".");
    if (emptyId.length != 2) {
        throw "ID of empty token must be in format `int.int`";
    }
    var headId = parseInt(emptyId[0], 10);
    var id = parseInt(emptyId[1], 10);
    // `parseInt` alone accepts trailing garbage ("8x" -> 8); comparing with the unary `+`
    // conversion rejects it, the same way `parseCompoundId` validates its parts.
    if (isNaN(headId) || isNaN(id) || headId != +emptyId[0] || id != +emptyId[1]) {
        throw "ID of empty token must be in format `int.int`";
    }
    return [headId, id];
}
/**
 * Parse the 10 columns of an empty token line.
 * @param tokens The columns of one token line, ID column included
 * @param Parser An XPOSParser passed on to `parseEmptyUncheck`
 * @returns `[headId, id, EmptyToken]`
 * @throws A string message when `tokens` doesn't have exactly 10 elements, when the ID is
 * not in `int.int` format, or when the `deps` column (`tokens[8]`) is `_`
 */
function parseEmpty(tokens, Parser) {
    if (tokens.length != 10) {
        throw "EmptyToken requires tab separate string with 10 columns";
    }
    var rawId = tokens[0];
    if (rawId.split(".").length != 2) {
        throw "ID of empty token must be in format `int.int`";
    }
    var idPair = parseEmtpyId(rawId);
    if (tokens[8] == "_") {
        throw "EmptyToken requires non empty `deps` column";
    }
    var token = parseEmptyUncheck(tokens.slice(1), Parser);
    return [idPair[0], idPair[1], token];
}
/**
 * Core `EmptyToken` construction. Unlike `parseEmpty` it checks neither the number of
 * elements in `tokens` nor the ID.
 * `tokens` must be the token line with the ID column stripped:
 * form, lemma, upos, xpos, feats, head, deprel, deps, misc.
 * Columns holding `_` become `undefined`, except `deps`, which is always handed to
 * `parseDeps`. `xpos` is only parsed when a `Parser` is given.
 */
function parseEmptyUncheck(tokens, Parser) {
    var form = tokens[0] != "_" ? tokens[0] : undefined;
    var lemma = tokens[1] != "_" ? tokens[1] : undefined;
    var upos = tokens[2] != "_" ? UPOS[tokens[2]] : undefined;
    var xpos = undefined;
    if (Parser && tokens[3] != "_") {
        xpos = Parser.parse(tokens[3]);
    }
    var feats = undefined;
    if (tokens[4] != "_") {
        // Each feature is `key=value1,value2,...`
        feats = tokens[4].split("|").map(function (pair) {
            var parts = pair.split("=");
            return new Feature(parts[0], parts[1].split(","));
        });
    }
    var deps = parseDeps(tokens[7]);
    var misc = tokens[8] != "_" ? tokens[8].split("|") : undefined;
    return new EmptyToken({
        form: form,
        lemma: lemma,
        upos: upos,
        xpos: xpos,
        feats: feats,
        deps: deps,
        misc: misc
    });
}
/**
 * An async generator function that yields one `Sentence` object per iteration step.
 * Use this generator if the whole document cannot fit into memory.
 *
 * This is down-levelled output: the body is a `__generator` state machine. It wraps
 * `stream` in an inner async generator `read_line` that yields the lines produced by a
 * `readline` interface, feeds that to `_sentences`, and re-yields every sentence.
 *
 * @param stream A `Readable` stream that contains CoNLL-U format text.
 * @param Parser A derivative of `XPOSParser` object for parsing `xpos` field
 */
export function sentences(stream, Parser) {
return __asyncGenerator(this, arguments, function sentences_1() {
// Inner async generator: yields each line read from `readable`, then closes the interface.
function read_line(readable) {
return __asyncGenerator(this, arguments, function read_line_1() {
var lines, lines_1, lines_1_1, line, e_2_1;
var e_2, _a;
return __generator(this, function (_b) {
switch (_b.label) {
case 0:
lines = createInterface({ input: readable });
_b.label = 1;
case 1:
// Protected region around the `for await` loop over `lines`.
_b.trys.push([1, 8, 9, 14]);
lines_1 = __asyncValues(lines);
_b.label = 2;
case 2: return [4 /*yield*/, __await(lines_1.next())];
case 3:
// Leave the loop once the line iterator reports `done`.
if (!(lines_1_1 = _b.sent(), !lines_1_1.done)) return [3 /*break*/, 7];
line = lines_1_1.value;
return [4 /*yield*/, __await(line)];
case 4: return [4 /*yield*/, _b.sent()];
case 5:
_b.sent();
_b.label = 6;
case 6: return [3 /*break*/, 2];
case 7: return [3 /*break*/, 14];
case 8:
// Remember the error so that it can be rethrown after the iterator clean-up below.
e_2_1 = _b.sent();
e_2 = { error: e_2_1 };
return [3 /*break*/, 14];
case 9:
// Clean-up: call the iterator's `return` if the loop was left before it finished.
_b.trys.push([9, , 12, 13]);
if (!(lines_1_1 && !lines_1_1.done && (_a = lines_1["return"]))) return [3 /*break*/, 11];
return [4 /*yield*/, __await(_a.call(lines_1))];
case 10:
_b.sent();
_b.label = 11;
case 11: return [3 /*break*/, 13];
case 12:
if (e_2) throw e_2.error;
return [7 /*endfinally*/];
case 13: return [7 /*endfinally*/];
case 14:
lines.close();
return [2 /*return*/];
}
});
});
}
var _a, _b, sentence, e_1_1;
var e_1, _c;
return __generator(this, function (_d) {
switch (_d.label) {
case 0:
// Protected region around the `for await` loop over the parsed sentences.
_d.trys.push([0, 7, 8, 13]);
_a = __asyncValues(_sentences(read_line(stream), Parser));
_d.label = 1;
case 1: return [4 /*yield*/, __await(_a.next())];
case 2:
if (!(_b = _d.sent(), !_b.done)) return [3 /*break*/, 6];
sentence = _b.value;
return [4 /*yield*/, __await(sentence)];
case 3: return [4 /*yield*/, _d.sent()];
case 4:
_d.sent();
_d.label = 5;
case 5: return [3 /*break*/, 1];
case 6: return [3 /*break*/, 13];
case 7:
// Remember the error so that it can be rethrown after the iterator clean-up below.
e_1_1 = _d.sent();
e_1 = { error: e_1_1 };
return [3 /*break*/, 13];
case 8:
// Clean-up: call the iterator's `return` if the loop was left before it finished.
_d.trys.push([8, , 11, 12]);
if (!(_b && !_b.done && (_c = _a["return"]))) return [3 /*break*/, 10];
return [4 /*yield*/, __await(_c.call(_a))];
case 9:
_d.sent();
_d.label = 10;
case 10: return [3 /*break*/, 12];
case 11:
if (e_1) throw e_1.error;
return [7 /*endfinally*/];
case 12: return [7 /*endfinally*/];
case 13: return [2 /*return*/];
}
});
});
}
/**
 * Core `Sentence` async generator. It takes an async iterable that yields one line per
 * step and a Parser, and yields a `Sentence` for each group of token lines.
 *
 * Per line (after trimming):
 * - a non-empty line starting with `#` is parsed by `parseHashLine` and added to `meta`
 * - any other non-empty line is parsed by `parseToken` and the token is added to `tokens`
 * - an empty line ends the current sentence, but only if at least one token was collected;
 *   `meta` and `tokens` are reset only in that case
 * After the last line, remaining tokens are yielded as a final sentence.
 *
 * @param line_iter An async iterable of lines of string
 * @param Parser An XPOSParser passed on to `parseToken`
 */
function _sentences(line_iter, Parser) {
return __asyncGenerator(this, arguments, function _sentences_1() {
var meta, tokens, line_iter_1, line_iter_1_1, line, t, e_3_1;
var e_3, _a;
return __generator(this, function (_b) {
switch (_b.label) {
case 0:
meta = [];
tokens = [];
_b.label = 1;
case 1:
// Protected region around the `for await` loop over the lines.
_b.trys.push([1, 9, 10, 15]);
line_iter_1 = __asyncValues(line_iter);
_b.label = 2;
case 2: return [4 /*yield*/, __await(line_iter_1.next())];
case 3:
if (!(line_iter_1_1 = _b.sent(), !line_iter_1_1.done)) return [3 /*break*/, 8];
line = line_iter_1_1.value;
line = line.trim();
// Empty line: go to the sentence boundary handling at label 4.
if (!(line.length > 0)) return [3 /*break*/, 4];
if (line[0] == '#') {
meta.push(parseHashLine(line.slice(1)));
}
else {
t = parseToken(line, Parser);
// `parseToken` returns the token itself for a compound, or a tuple ending with the token.
if (t instanceof CompoundToken) {
tokens.push(t);
}
else if (t.length == 2) {
// nominal token
tokens.push(t[1]);
}
else if (t.length == 3) {
// empty token
tokens.push(t[2]);
}
else {
throw "Invalid state while parsing token line";
}
}
return [3 /*break*/, 7];
case 4:
// Sentence boundary: nothing to yield when no token has been collected yet.
if (!(tokens.length > 0)) return [3 /*break*/, 7];
return [4 /*yield*/, __await(new Sentence({ meta: meta, tokens: tokens }))];
case 5: return [4 /*yield*/, _b.sent()];
case 6:
_b.sent();
meta = [];
tokens = [];
_b.label = 7;
case 7: return [3 /*break*/, 2];
case 8: return [3 /*break*/, 15];
case 9:
// Remember the error so that it can be rethrown after the iterator clean-up below.
e_3_1 = _b.sent();
e_3 = { error: e_3_1 };
return [3 /*break*/, 15];
case 10:
// Clean-up: call the iterator's `return` if the loop was left before it finished.
_b.trys.push([10, , 13, 14]);
if (!(line_iter_1_1 && !line_iter_1_1.done && (_a = line_iter_1["return"]))) return [3 /*break*/, 12];
return [4 /*yield*/, __await(_a.call(line_iter_1))];
case 11:
_b.sent();
_b.label = 12;
case 12: return [3 /*break*/, 14];
case 13:
if (e_3) throw e_3.error;
return [7 /*endfinally*/];
case 14: return [7 /*endfinally*/];
case 15:
// End of input: flush a last sentence that was not terminated by an empty line.
if (!(tokens.length > 0)) return [3 /*break*/, 18];
return [4 /*yield*/, __await(new Sentence({ meta: meta, tokens: tokens }))];
case 16: return [4 /*yield*/, _b.sent()];
case 17:
_b.sent();
meta = [];
tokens = [];
_b.label = 18;
case 18: return [2 /*return*/];
}
});
});
}
/**
 * `Document` is an entry point to `conllu`. It contains zero or more sentences.
 *
 * To programmatically construct a `Document` use its constructor.
 * To construct a `Document` from CoNLL-U format text, use either
 * `parse`, `load`, or `read` method depending on the source of the text.
 *
 * If a `Document` cannot fit into memory, use the `sentences` generator function.
 */
var Document = /** @class */ (function () {
    /** @param sentences An array of `Sentence` objects held by this document */
    function Document(sentences) {
        this.sentences = sentences;
    }
    /**
     * Private helper shared by `load`, `read` and `parse_core`: drains an async iterable
     * of sentences into an array and resolves to a new `Document` holding them.
     * The iterable is created by calling `make_iter` inside the promise, so an error
     * thrown while creating it rejects the returned promise instead of throwing.
     *
     * @param make_iter A function returning the async iterable to be drained
     */
    function collectDocument(make_iter) {
        var e_1, _a;
        return __awaiter(void 0, void 0, void 0, function () {
            var collected, _b, _c, e_1_1;
            return __generator(this, function (_d) {
                switch (_d.label) {
                    case 0:
                        collected = [];
                        _d.label = 1;
                    case 1:
                        // Protected region around the `for await` loop.
                        _d.trys.push([1, 6, 7, 12]);
                        _b = __asyncValues(make_iter());
                        _d.label = 2;
                    case 2: return [4 /*yield*/, _b.next()];
                    case 3:
                        if (!(_c = _d.sent(), !_c.done)) return [3 /*break*/, 5];
                        collected.push(_c.value);
                        _d.label = 4;
                    case 4: return [3 /*break*/, 2];
                    case 5: return [3 /*break*/, 12];
                    case 6:
                        // Remember the error; it is rethrown after the iterator clean-up.
                        e_1_1 = _d.sent();
                        e_1 = { error: e_1_1 };
                        return [3 /*break*/, 12];
                    case 7:
                        // Clean-up: call the iterator's `return` if the loop ended early.
                        _d.trys.push([7, , 10, 11]);
                        if (!(_c && !_c.done && (_a = _b["return"]))) return [3 /*break*/, 9];
                        return [4 /*yield*/, _a.call(_b)];
                    case 8:
                        _d.sent();
                        _d.label = 9;
                    case 9: return [3 /*break*/, 11];
                    case 10:
                        if (e_1) throw e_1.error;
                        return [7 /*endfinally*/];
                    case 11: return [7 /*endfinally*/];
                    case 12: return [2 /*return*/, new Document(collected)];
                }
            });
        });
    }
    /**
     * Load conllu file as Document. This method is async.
     *
     * @param file_path Path to conllu file
     * @param Parser An optional Parser that is derivative of type XPOSParser for mapping XPOS to UPOS
     */
    Document.load = function (file_path, Parser) {
        return collectDocument(function () {
            return sentences(createReadStream(file_path), Parser);
        });
    };
    /**
     * Parse given stream line by line to construct an object of Document.
     *
     * @param stream A stream source of text to be parsed
     * @param Parser An optional Parser that is derivative of type XPOSParser for mapping XPOS to UPOS
     */
    Document.read = function (stream, Parser) {
        return collectDocument(function () {
            return sentences(stream, Parser);
        });
    };
    /**
     * An async utility function that cumulatively parses each line of string then returns a document.
     *
     * @param line_iter An async generator object where each step returns a line of string
     * @param Parser a Parser derivative from XPOSParser
     */
    Document.parse_core = function (line_iter, Parser) {
        return collectDocument(function () {
            // `Parser` has to be forwarded, otherwise `xpos` is never parsed for `parse()`.
            return _sentences(line_iter, Parser);
        });
    };
    /**
     * Attempt to parse string as a document. This method is async.
     *
     * @param str An entire document in string where each line is terminated by '\u000a'
     * @param Parser An optional XPOSParser instance
     */
    Document.parse = function (str, Parser) {
        return __awaiter(this, void 0, void 0, function () {
            // Async generator yielding the lines of `str` one by one.
            function lines_iter(str) {
                return __asyncGenerator(this, arguments, function lines_iter_1() {
                    var _i, _a, line;
                    return __generator(this, function (_b) {
                        switch (_b.label) {
                            case 0:
                                _i = 0, _a = str.split("\u000a");
                                _b.label = 1;
                            case 1:
                                if (!(_i < _a.length)) return [3 /*break*/, 5];
                                line = _a[_i];
                                return [4 /*yield*/, __await(line)];
                            case 2: return [4 /*yield*/, _b.sent()];
                            case 3:
                                _b.sent();
                                _b.label = 4;
                            case 4:
                                _i++;
                                return [3 /*break*/, 1];
                            case 5: return [2 /*return*/];
                        }
                    });
                });
            }
            return __generator(this, function (_a) {
                return [2 /*return*/, this.parse_core(lines_iter(str), Parser)];
            });
        });
    };
    /**
     * Save this document to a file in given path.
     * The returned promise resolves once everything has been written and the stream has
     * finished; it rejects when writing fails or the stream reports an error.
     *
     * @param path Path of the file to be written
     */
    Document.prototype.save = function (path) {
        var doc = this;
        return new Promise(function (resolve, reject) {
            var stream = createWriteStream(path);
            stream.once('error', reject);
            stream.once('finish', function () { resolve(); });
            // The stream must not be ended before `write` is done, or data would be lost.
            doc.write(stream).then(function () {
                stream.end();
            }, function (err) {
                stream.destroy();
                reject(err);
            });
        });
    };
    /** Return CoNLL-U string representation of the doc; sentences are separated by an empty line */
    Document.prototype.toString = function () {
        return this.sentences.map(function (sentence) { return sentence.toString(); }).join("\u000a\u000a");
    };
    /**
     * Validate every sentence. It returns the first result that isn't
     * `SentenceValidationResult.Ok` immediately.
     * Otherwise, it returns `SentenceValidationResult.Ok`.
     */
    Document.prototype.validate = function () {
        for (var i = 0; i < this.sentences.length; i++) {
            var validated = this.sentences[i].validate();
            if (validated != SentenceValidationResult.Ok) {
                return validated;
            }
        }
        return SentenceValidationResult.Ok;
    };
    /**
     * Serialize this document as CoNLL-U text into given stream. This method is async.
     * Each sentence is written followed by an empty line. When `stream.write` returns a
     * falsy value, it waits for the `drain` event before writing the next sentence.
     *
     * @param stream A writable stream; it is neither ended nor closed by this method
     */
    Document.prototype.write = function (stream) {
        return __awaiter(this, void 0, void 0, function () {
            var _i, _a, sentence;
            return __generator(this, function (_b) {
                switch (_b.label) {
                    case 0:
                        _i = 0, _a = this.sentences;
                        _b.label = 1;
                    case 1:
                        if (!(_i < _a.length)) return [3 /*break*/, 4];
                        sentence = _a[_i];
                        if (!!stream.write(sentence.toString() + "\u000a\u000a")) return [3 /*break*/, 3];
                        return [4 /*yield*/, new Promise(function (resolve) { return stream.once('drain', function () {
                                resolve();
                            }); })];
                    case 2:
                        _b.sent();
                        _b.label = 3;
                    case 3:
                        _i++;
                        return [3 /*break*/, 1];
                    case 4: return [2 /*return*/];
                }
            });
        });
    };
    return Document;
}());
export { Document };
/**
 * Meta data of a sentence.
 *
 * It is a key/value pair, written in front of the sentence in
 * `# key = value` format.
 */
var Meta = /** @class */ (function () {
    /**
     * Construct `Meta` from the given dictionary.
     * @param _a A dictionary of `key` and `value` where `value` is optional.
     * If `value` is omitted, `toString` returns a `Comment` format string
     * (`# key`) rather than a `key` with an empty value
     */
    function Meta(_a) {
        var metaKey = _a.key, metaValue = _a.value;
        this.key = metaKey;
        this.value = metaValue;
    }
    /**
     * Instantiate the object from a `conllu` string.
     * The key is the trimmed text between `#` and the first `=`; the value is the
     * trimmed text after that `=`.
     * @param str A string to be parsed into `Meta`
     * @throws A string message when `str` doesn't start with `#` or has no `=`
     */
    Meta.parse = function (str) {
        if (str[0] != '#') {
            throw "Meta entry must start with `#`";
        }
        var body = str.slice(1).trim();
        var eqPos = body.indexOf("=");
        if (eqPos == -1) {
            throw "Meta entry must have `=` symbol";
        }
        return new Meta({
            key: body.slice(0, eqPos).trim(),
            value: body.slice(eqPos + 1).trim()
        });
    };
    /**
     * Convert this object into `conllu` string.
     * @throws A string message when `key` is missing
     */
    Meta.prototype.toString = function () {
        if (!this.key) {
            throw "Missing key from meta";
        }
        return this.value ? "# " + this.key + " = " + this.value : "# " + this.key;
    };
    return Meta;
}());
export { Meta };
/**
 * A comment of a sentence. It is similar to `Meta` but doesn't have the `=` symbol.
 * Like `Meta`, it is written in front of the sentence.
 */
var Comment = /** @class */ (function () {
    /**
     * @param text Comment to be added. It is trimmed; the `text` field is only set
     * when something is left after trimming.
     */
    function Comment(text) {
        var cleaned = text ? text.trim() : "";
        if (cleaned.length > 0) {
            this.text = cleaned;
        }
    }
    /**
     * Construct a comment object from the given string.
     * @param str A string to be parsed as `Comment`; it must begin with `#`
     * @throws A string message when `str` doesn't begin with `#`
     */
    Comment.parse = function (str) {
        if (str[0] != "#") {
            throw "Comment line must begin with `#`";
        }
        // The constructor trims the text and leaves `text` unset when nothing remains.
        return new Comment(str.slice(1));
    };
    /** Get `conllu` string from this comment; a comment without text is a bare `#` */
    Comment.prototype.toString = function () {
        var hasText = this.text && this.text.length > 0;
        return hasText ? "# " + this.text : "#";
    };
    return Comment;
}());
export { Comment };
/**
 * A validation result returned by calling `validate` on a `Sentence`.
 * `validate` may also throw some exceptions such as "Head of deps that reference to hidden/empty token must be in [integer, integer] format".
 *
 * The object maps each name to its numeric code and each code back to its name.
 */
export var SentenceValidationResult;
(function (SentenceValidationResult) {
    // The position of a name in this list is its numeric code.
    var names = [
        /** No error */
        "Ok",
        /** Compound token end range is beyond index of last token error */
        "CompoundEndBeyondLastTokenError",
        /** Some of compound token is overlap to other compound token error */
        "CompoundOverlapError",
        /** Head index is larger than number of tokens or less than 1 error (presumably for heads in `deps`) */
        "DepHeadOutOfBoundError",
        /** Compound token start index point to token prior to itself error */
        "CompoundStartAfterTokenError",
        /** Empty token after compound token error */
        "EmptyAfterCompoundError",
        /** Head index is larger than number of tokens or less than 1 error */
        "HeadOutOfBoundError",
        /** NominalToken with head with missing deprel error */
        "HeadWithoutDeprelError",
        /** NominalToken with non-integer value in head error */
        "NonIntegerHeadError"
    ];
    names.forEach(function (name, code) {
        SentenceValidationResult[SentenceValidationResult[name] = code] = name;
    });
})(SentenceValidationResult || (SentenceValidationResult = {}));
/**
* `Sentence` consists of:
* 1. `meta` which is array. The object inside array can either be `Meta` object or `Comment` object.
* 1. `tokens` which is array of derivative of `Token` class.
*
* To parse sentence text:
* 1. You can either construct a `Document` from text by using `parse`, `load`, `read` method and access
* `Sentence` via `sentences` field of `Document` object.
* 2. You can also use generator function `sentences` to parse each text chunk incrementally.
*/
var Sentence = /** @class */ (function () {
/**
* Construct a new sentence from given dictionary
* @param param0 A dictionary object contain optional `meta` array of either
* `Meta` or `Comment` and tokens field which is array of `Token` derivative.
*/
function Sentence(_a) {
var meta = _a.meta, tokens = _a.tokens;
this.meta = meta;
this.tokens = tokens;
}
/** get `conllu` formatted string of current sentence */
Sentence.prototype.toString = function () {
var metaStr = "" + this.meta.map(function (m) { return m.toString(); }).join("\u000a");
var id = 1;
var hiddenId = 1;
var tokensStr = "" + this.tokens.map(function (token) {
if (token instanceof CompoundToken) {
return token.toString();
}
else if (token instanceof EmptyToken) {
return id - 1 + "." + hiddenId++ + "\t" + token.toString();
}
else if (token instanceof NominalToken) {
hiddenId = 1;
return id++ + "\t" + token.toString();
}
else {
throw "Unsupport type of token";
}
}).join("\u000a");
return metaStr + "\n" + tokensStr;
};
/**
* Parse given string as `Sentence` object
* @param str A string to be used to instantiate `Sentence`.
* @param Parser An `XPOSParser` derivative object
*/
Sentence.parse = function (str, Parser) {
var meta = [];
var tokens = [];
for (var _i = 0, _a = str.split('\u000a'); _i < _a.length; _i++) {
var line = _a[_i];
var l = line.trim();
if (l.startsWith("#")) {
var eqIdx = l.indexOf("=");
if (eqIdx == -1) {
meta.push(new Comment(l.slice(1)));
}
else {
var key = l.slice(1, eqIdx).trim();
var value = l.slice(eqIdx + 1).trim();
meta.push(new Meta({ key: key, value: value }));
}
}
else if (l.length > 0) {
var ts = line.split("\t");
if (ts.length != 10) {
throw "All token must have 10 columns";
}
try {
var _b = parseNominal(ts, Parser), _id = _b[0], tok = _b[1];
tokens.push(tok);
}
catch (e) {
if (e === "The give string is not valid Nominal token") {
try {
var tok = parseCompound(ts);
tokens.push(tok);
}
catch (e) {
if (e === "CompoundToken need id to be in format `[start, end]` where `end` > `start`") {
var _c = parseEmpty(ts, Parser), _headId = _c[0], _emptyId = _c[1], tok = _c[2];
tokens.push(tok);
}
}
}
}
}
else {
break;
}
}
return new Sentence({ meta: meta, tokens: tokens });
};
/**
 * Validate current sentence whether the token structure is valid and all
 * `head`, `relation`, and `deps` are valid.
 *
 * Tokens are walked in order while counting nominal tokens. The checks are:
 * - a nominal token's `head`, when set, is an integer within bounds and has a `deprel`;
 * - compound ranges do not overlap, are placed before the first word they cover,
 *   and end on an existing nominal token;
 * - no empty token directly follows a compound token;
 * - every `deps` head of the form `[word, n]` points to an empty token that exists.
 *
 * @returns A `SentenceValidationResult` member; `Ok` when no check fails.
 * @throws A string when a `deps` head is neither `[number]` nor `[number, number]`,
 * or when a single-element head is not an integer.
 */
Sentence.prototype.validate = function () {
    // `id[1]` of the compound range that is currently open, or null when none is open.
    var end = null;
    var edges = []; // an index based that is true when head value of a node is equals to the index
    // hiddenCount[i] = number of empty tokens seen after nominal token i.
    // Index 0 counts empty tokens placed before the first nominal token.
    var hiddenCount = [0];
    // hiddenEdges[i] = largest empty-token sub-index referenced by any `deps` head `[i, n]`.
    var hiddenEdges = [];
    var tokenCount = 0;
    // True from a compound token until the next nominal token.
    var compound = false;
    var simplifyDeps = function (deps) {
        for (var _i = 0, deps_1 = deps; _i < deps_1.length; _i++) {
            var dep = deps_1[_i];
            switch (dep[0].length) {
                case 1:
                    if (!Number.isInteger(dep[0][0]))
                        throw "Head of deps that reference to hidden/empty token must be in [integer, integer] format";
                    edges[dep[0][0]] = true;
                    break;
                case 2:
                    // Head is empty token
                    if (hiddenEdges[dep[0][0]] < dep[0][1] || hiddenEdges[dep[0][0]] == undefined) {
                        hiddenEdges[dep[0][0]] = dep[0][1];
                    }
                    break;
                default:
                    throw "Invalid deps object. Head of dep must either be [number] or [number, number] ";
            }
        }
    };
    /** Return false if all deps are valid, otherwise return true */
    var validateDeps = function () {
        for (var i = 0; i < hiddenEdges.length; i++) {
            if (hiddenEdges[i] == undefined) {
                continue;
            }
            // A word index that never occurred has no empty tokens at all; without the
            // default the comparison against `undefined` would always be false.
            var available = hiddenCount[i] || 0;
            if (hiddenEdges[i] > available || hiddenEdges[i] < 1) {
                return true;
            }
        }
        return false;
    };
    for (var _i = 0, _a = this.tokens; _i < _a.length; _i++) {
        var token = _a[_i];
        if (token instanceof NominalToken) {
            tokenCount++;
            hiddenCount[tokenCount] = 0;
            compound = false;
            if (token.head != undefined) {
                // NOTE(review): a head of 0 is reported as out of bound here; confirm how
                // the parser represents the root's HEAD column.
                if (!Number.isInteger(token.head)) {
                    return SentenceValidationResult.NonIntegerHeadError;
                }
                else if (token.head > this.tokens.length || token.head < 1) {
                    return SentenceValidationResult.HeadOutOfBoundError;
                }
                else if (!token.deprel) {
                    return SentenceValidationResult.HeadWithoutDeprelError;
                }
                edges[token.head] = true;
            }
            if (token.deps) {
                simplifyDeps(token.deps);
            }
            // The open compound range is closed by the nominal token it ends on.
            if (end != null && end == tokenCount) {
                end = null;
            }
        }
        else if (token instanceof CompoundToken) {
            if (end != null) {
                return SentenceValidationResult.CompoundOverlapError;
            }
            end = token.id[1];
            compound = true;
            // A range must be placed before the first word it covers, so its start has to
            // be greater than the number of nominal tokens already seen.
            if (token.id[0] <= tokenCount) {
                return SentenceValidationResult.CompoundStartAfterTokenError;
            }
        }
        else if (token instanceof EmptyToken) {
            hiddenCount[tokenCount]++;
            simplifyDeps(token.deps);
            if (compound)
                return SentenceValidationResult.EmptyAfterCompoundError;
        }
    }
    if (edges.length > tokenCount + 1) { // need to + 1 because edges is zero based
        return SentenceValidationResult.HeadOutOfBoundError;
    }
    else if (validateDeps()) {
        return SentenceValidationResult.DepHeadOutOfBoundError;
    }
    else if (end == null) {
        return SentenceValidationResult.Ok;
    }
    else {
        // A compound range was opened but the nominal token it ends on never appeared.
        return SentenceValidationResult.CompoundEndBeyondLastTokenError;
    }
};
return Sentence;
}());
export { Sentence };
/**
 * Common base type for every kind of token.
 *
 * It carries no fields and no behaviour of its own. It is kept as a plain
 * constructor function so that it can be invoked with or without `new`.
 */
var Token = /** @class */ (function () {
    var TokenConstructor = function Token() { };
    return TokenConstructor;
})();
export { Token };
/**
 * A token whose `id` is a range `[start, end]`, inclusive at both ends.
 *
 * Only `id` and `form` are required; `misc` is optional.
 *
 * When converted to string every other column is `_` and the ID is written
 * as `start`-`end`, e.g. `1-2`.
 * `end` must be strictly greater than `start`, so an ID such as `[1, 1]`
 * is rejected by the constructor.
 */
var CompoundToken = /** @class */ (function () {
    /**
     * @param props Object with `id` (`[start, end]`), `form` and optional `misc`.
     * @throws A string when `end` is not greater than `start`.
     */
    function CompoundToken(props) {
        var range = props.id;
        var start = range[0];
        var end = range[1];
        var form = props.form;
        var misc = props.misc;
        if (start >= end) {
            throw "CompountToken id range must be in `[start, end]` where `end` > `start`";
        }
        // Store a fresh two-element array instead of the caller's array.
        this.id = [start, end];
        this.form = form;
        this.misc = misc;
    }
    /**
     * Build a `CompoundToken` from one tab separated CoNLL-U line.
     *
     * The line is split on tab characters and the columns are handed to
     * `parseCompound`. See https://universaldependencies.org/format.html
     * for the file format.
     *
     * Only the `id`, `form` and `misc` columns are used; per
     * https://universaldependencies.org/format.html#words-tokens-and-empty-nodes
     * every other column of such a line must be empty.
     */
    CompoundToken.parse = function (str) {
        return parseCompound(str.split("\t"));
    };
    /** Return the CoNLL-U line for this token, as produced by `tokenToString` */
    CompoundToken.prototype.toString = function () {
        return tokenToString(this);
    };
    return CompoundToken;
}());
export { CompoundToken };
/**
* A nominal token is the basic type of token; a `Sentence` must contain nominal tokens
* in order to use the other types of token.
*
* The mandatory fields are `form` and `upos`. All other fields are optional.
* Every optional field that is not set becomes "_" when converted to string.
*
* If the `deps` field is supplied at construction, it is automatically sorted to comply with
* https://universaldependencies.org/format.html#syntactic-annotation
*/
var NominalToken = /** @class */ (function () {
function NominalToken(_a) {
var form = _a.form, lemma = _a.lemma, upos = _a.upos, xpos = _a.xpos, feats = _a.feats, headRel = _a.headRel, deps = _a.deps, misc = _a.misc;
if (deps && !deps.every(function (dep) { return dep[0].length == 1 || dep[0].length == 2; })) {
throw "NominalToken `deps` id must be array with either 1 or 2 number";
}
this.form = form;
this.lemma = lemma;
this.upos = upos;
this.xpos = xpos;
this.feats = feats ? feats.sort(function (f1, f2) { return f1.name.localeCompare(f2.name); }) : undefined;
this.head = headRel ? headRel[0] : undefined;
this.deprel = headRel ? headRel[1] : undefined;
this.deps = deps ? sortDeps(deps) : undefined;
this.misc = misc;
}
/**
* Parse