UNPKG

conllu-core

Version:

A core type to handle CoNLL-U format

github.com/NattapongSiri/conllu_core

NattapongSiri/conllu_core

1,163 lines • 56.5 kB

JavaScript

var __assign = (this && this.__assign) || function () { __assign = Object.assign || function(t) { for (var s, i = 1, n = arguments.length; i < n; i++) { s = arguments[i]; for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p]; } return t; }; return __assign.apply(this, arguments); }; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; var __generator = (this && this.__generator) || function (thisArg, body) { var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g; return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; function verb(n) { return function (v) { return step([n, v]); }; } function step(op) { if (f) throw new TypeError("Generator is already executing."); while (_) try { if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t; if (y = 0, t) op = [op[0] & 2, t.value]; switch (op[0]) { case 0: case 1: t = op; break; case 4: _.label++; return { value: op[1], done: false }; case 5: _.label++; y = op[1]; op = [0]; continue; case 7: op = _.ops.pop(); _.trys.pop(); continue; default: if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } if (t[2]) _.ops.pop(); _.trys.pop(); continue; } op = body.call(thisArg, _); } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true }; } }; var __asyncValues = (this && this.__asyncValues) || function (o) { if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined."); var m = o[Symbol.asyncIterator], i; return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i); function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; } function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); } }; var __await = (this && this.__await) || function (v) { return this instanceof __await ? (this.v = v, this) : new __await(v); } var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) { if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined."); var g = generator.apply(thisArg, _arguments || []), i, q = []; return i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i; function verb(n) { if (g[n]) i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; } function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } } function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); } function fulfill(value) { resume("next", value); } function reject(value) { resume("throw", value); } function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); } }; import XRegExp from 'xregexp'; import { createReadStream, createWriteStream } from 'fs'; import { createInterface } from 'readline'; /** * Attempt to convert given token to string. * If id array is empty or zero length then it will * exclue id from string. * If id array has exactly two element, it'll consider * the token to be compound token. It will print id in format * `id[0]-id[1]` */ function tokenToString(_a) { var _b = _a.id, id = _b === void 0 ? [] : _b, _c = _a.form, form = _c === void 0 ? "_" : _c, _d = _a.lemma, lemma = _d === void 0 ? "_" : _d, upos = _a.upos, xpos = _a.xpos, feats = _a.feats, head = _a.head, _e = _a.deprel, deprel = _e === void 0 ? "_" : _e, deps = _a.deps, misc = _a.misc; if (id.length == 2) { return id[0] + "-" + id[1] + "\t" + form + "\t_\t_\t_\t_\t_\t_\t_\t" + (misc ? misc.join("|") : "_"); } else { return form + "\t" + lemma + "\t" + (upos ? upos.toString() : "_") + "\t" + (xpos ? xpos.toString() : "_") + "\t" + (feats ? feats.map(function (f) { return f.toString(); }).join("|") : "_") + "\t" + (head ? head : "_") + "\t" + deprel + "\t" + (deps ? deps.map(function (_a) { var id = _a[0], rel = _a[1]; return (id.length == 1 ? id[0] : id[0] + "." + id[1]) + ":" + rel; }).join("|") : "_") + "\t" + (misc ? misc.join("|") : "_"); } } /** * Sort given deps as required by `conllu` specification that deps must be sorted * by head. If head is equals, it must be sorted by relation name in alphabetic ascending order. * @param deps An array of tuple with `Head` and `DepsRelation` where `Head` can either be 1 integer element tuple * or 2 integer elements tuple if `Head` is `EmptyToken`. For example `[1]` if head is `NominalToken` and * `[1, 1]` if head is `EmptyToken`. */ function sortDeps(deps) { return deps.sort(function (d1, d2) { var headCmp = d1[0][0] - d2[0][0]; if (headCmp != 0) { return headCmp; // HeadIds have diff } else { if (d1[0][1] && d2[0][1]) { return d1[0][1] - d2[0][1]; // compare Empty part } } }); } /** * Parse a string contains one or more deps. * Each dep must be split by `|`. `Head` reference can either be `integer` or * `decimal`. For example, `1` and `1.1`. * `DepsRelation` associated with `Head` by using `:` to delimit the part. * See [enhanced dependencies](https://universaldependencies.org/u/overview/enhanced-syntax.html) * for valid name pattern. * @param str A string to be parsed into proper `Head`:`DepsRelation` object. * If `DepsRelation` name is invalid, it will throw an exception. */ function parseDeps(str) { return str.split("|").map(function (dep) { var splitPoint = dep.indexOf(":"); var ids = dep.slice(0, splitPoint).split("."); if (ids.length > 2 || ids.length == 0) { throw "ID of Deps must be either `int` or `int`.`int`"; } var head = ids[0], empty = ids[1]; var headId = parseInt(head); if (isNaN(headId)) { throw "Deps contain non-numeric head position"; } if (empty != undefined) { var emptyId = parseInt(empty); if (isNaN(emptyId)) { throw "Deps contain non-numeric null node position"; } return [[headId, emptyId], new DepsRelation(dep.slice(splitPoint + 1))]; } return [[headId], new DepsRelation(dep.slice(splitPoint + 1))]; }); } /** Parse and return head as integer number or throw exception if it is not an integer number */ function parseHead(str) { var id = parseInt(str); if (isNaN(id) || id != +str) { return undefined; } return id; } /** * Parse a string that strip '#' sign in front of the string out. * This is assistant function for perfomance reason to parse string and * return either a Comment or Meta instead of trying to parse it as Comment * first then try to parse it again as Meta. * @param str a hash stripped out line of string */ function parseHashLine(str) { var idx = str.indexOf("="); if (idx != -1) { return new Meta({ key: str.slice(0, idx).trim(), value: str.slice(idx + 1).trim() }); } else { return new Comment(str); } } /** * It attempt to parse a line of tokens. * @param str A string that contains a line of tokens without new line character * @param Parser An XPOSParser that map xpos to upos */ function parseToken(str, Parser) { var tokens = str.split("\t"); if (tokens.length != 10) { throw "Token line must contains 10 columns separate by tap"; } var idx = tokens[0].indexOf("-"); if (idx != -1) { // possibly compound token var start = +tokens[0].slice(0, idx); var end = +tokens[0].slice(idx + 1); if (Math.floor(start) != start || Math.floor(end) != end) { throw "Compound id must both be integer number"; } return parseCompoundUncheck(start, end, tokens.slice(1)); } idx = tokens[0].indexOf("."); if (idx != -1) { // possibly empty token var head = +tokens[0].slice(0, idx); var id_1 = +tokens[0].slice(idx + 1); if (Math.floor(head) != head || Math.floor(id_1) != id_1) { throw "Empty id format must be <int>.<int> such as 1.1, 1.2, .. , 1.10"; } return [head, id_1, parseEmptyUncheck(tokens.slice(1), Parser)]; } var id = parseInt(tokens[0]); if (isNaN(id) || id != +tokens[0]) { throw "ID must either be <int>.<int> or <int>-<int> or <int> such as 1, 1.2, or 1-2"; } return [id, parseNominalUncheck(tokens.slice(1), Parser)]; } /** * Parse a string and return an integer value of an ID or throw an exception. * @param str a string that may contain an integer value */ function parseNominalId(str) { var id = parseInt(str); if (isNaN(id) || id != +str) { throw "The give string is not valid Nominal token"; } return id; } function parseNominal(tokens, Parser) { if (tokens.length != 10) { throw "Require tab separate string with 10 columns"; } var id = parseNominalId(tokens[0]); return [ id, parseNominalUncheck(tokens.slice(1), Parser) ]; } /** * Similar to parseNominal function except that it doesn't check number of element * in given tokens. This function will be used as a core NominalToken object construction * to reduce code duplicate. The tokens must be a slice stripping out the first token so * it will have only 9 elements. */ function parseNominalUncheck(tokens, Parser) { var head = parseHead(tokens[5]); return new NominalToken({ form: tokens[0], lemma: tokens[1] == "_" ? undefined : tokens[1], upos: UPOS[tokens[2]], xpos: Parser && tokens[3] != "_" ? Parser.parse(tokens[3]) : undefined, feats: tokens[4] != "_" ? tokens[4].split("|").map(function (f) { var _a = f.split("="), key = _a[0], values = _a[1]; return new Feature(key, values.split(",")); }) : undefined, headRel: isNaN(head) ? undefined : [head, new Relation(tokens[6])], deps: tokens[7] != "_" ? parseDeps(tokens[7]) : undefined, misc: tokens[8] != "_" ? tokens[8].split("|") : undefined }); } /** * Parse a string that may contain a compound ID. It throw an exception if it isn't. * @param ids A string in format of "int-int" such as "1-2" */ function parseCompoundId(ids) { var id = ids.split("-"); if (id.length != 2) { throw "CompoundToken need id to be in format `[start, end]` where `end` > `start`"; } var start = parseInt(id[0]); var end = parseInt(id[1]); if (isNaN(start) || isNaN(end) || start != +id[0] || end != +id[1]) { // +id[0] and +id[1] will turn the string into number which might be int or float // if it doesn't equals to `parseInt()` result then it's not integer throw "CompoundToken require both start and end to be integer number"; } return [start, end]; } /** Parse a compound token from given tokens strings */ function parseCompound(tokens) { if (tokens.length != 10) { throw "CompountToken requires tab separate string with 10 columns"; } var _a = parseCompoundId(tokens[0]), start = _a[0], end = _a[1]; return parseCompoundUncheck(start, end, tokens.slice(1)); } /** * Similar to parseCompound function but doesn't check number of element in tokens. * It require caller to pass in id of start and id of end token. * Both id are inclusive. This function will be used as a core Compound object construction. */ function parseCompoundUncheck(start, end, tokens) { return new CompoundToken({ id: [start, end], form: tokens[0], misc: tokens[8] == "_" ? undefined : tokens[8].split("|") }); } /** * Parse a string and return pair of ids or throw an exception. * @param str A string contains empty id in format of "int.int" for example, "1.1" */ function parseEmtpyId(str) { var emptyId = str.split("."); if (emptyId.length != 2) { throw "ID of empty token must be in format `int.int`"; } var headId = parseInt(emptyId[0]); var id = parseInt(emptyId[1]); if (isNaN(headId) || isNaN(id)) { throw "ID of empty token must be in format `int.int`"; } return [headId, id]; } /** Parse given string as EmptyToken along with its' ID tuple */ function parseEmpty(tokens, Parser) { if (tokens.length != 10) { throw "EmptyToken requires tab separate string with 10 columns"; } var emptyId = tokens[0].split("."); if (emptyId.length != 2) { throw "ID of empty token must be in format `int.int`"; } var _a = parseEmtpyId(tokens[0]), headId = _a[0], id = _a[1]; if (tokens[8] == "_") { throw "EmptyToken requires non empty `deps` column"; } return [ headId, id, parseEmptyUncheck(tokens.slice(1), Parser) ]; } /** Similar to parseEmpty but doesn't check token length nor ID. */ function parseEmptyUncheck(tokens, Parser) { return new EmptyToken({ form: tokens[0] != "_" ? tokens[0] : undefined, lemma: tokens[1] != "_" ? tokens[1] : undefined, upos: tokens[2] != "_" ? UPOS[tokens[2]] : undefined, xpos: Parser && tokens[3] != "_" ? Parser.parse(tokens[3]) : undefined, feats: tokens[4] != "_" ? tokens[4].split("|").map(function (f) { var _a = f.split("="), key = _a[0], values = _a[1]; return new Feature(key, values.split(",")); }) : undefined, deps: parseDeps(tokens[7]), misc: tokens[8] != "_" ? tokens[8].split("|") : undefined }); } /** * A generator function that keep return a `Sentence` object on each call. * Use this generator if whole document cannot be fit into memory. * * @param stream A `Readable` stream that contains CoNLL-U format text. * @param Parser A derivative of `XPOSParser` object for parsing `xpos` field */ export function sentences(stream, Parser) { return __asyncGenerator(this, arguments, function sentences_1() { function read_line(readable) { return __asyncGenerator(this, arguments, function read_line_1() { var lines, lines_1, lines_1_1, line, e_2_1; var e_2, _a; return __generator(this, function (_b) { switch (_b.label) { case 0: lines = createInterface({ input: readable }); _b.label = 1; case 1: _b.trys.push([1, 8, 9, 14]); lines_1 = __asyncValues(lines); _b.label = 2; case 2: return [4 /*yield*/, __await(lines_1.next())]; case 3: if (!(lines_1_1 = _b.sent(), !lines_1_1.done)) return [3 /*break*/, 7]; line = lines_1_1.value; return [4 /*yield*/, __await(line)]; case 4: return [4 /*yield*/, _b.sent()]; case 5: _b.sent(); _b.label = 6; case 6: return [3 /*break*/, 2]; case 7: return [3 /*break*/, 14]; case 8: e_2_1 = _b.sent(); e_2 = { error: e_2_1 }; return [3 /*break*/, 14]; case 9: _b.trys.push([9, , 12, 13]); if (!(lines_1_1 && !lines_1_1.done && (_a = lines_1["return"]))) return [3 /*break*/, 11]; return [4 /*yield*/, __await(_a.call(lines_1))]; case 10: _b.sent(); _b.label = 11; case 11: return [3 /*break*/, 13]; case 12: if (e_2) throw e_2.error; return [7 /*endfinally*/]; case 13: return [7 /*endfinally*/]; case 14: lines.close(); return [2 /*return*/]; } }); }); } var _a, _b, sentence, e_1_1; var e_1, _c; return __generator(this, function (_d) { switch (_d.label) { case 0: _d.trys.push([0, 7, 8, 13]); _a = __asyncValues(_sentences(read_line(stream), Parser)); _d.label = 1; case 1: return [4 /*yield*/, __await(_a.next())]; case 2: if (!(_b = _d.sent(), !_b.done)) return [3 /*break*/, 6]; sentence = _b.value; return [4 /*yield*/, __await(sentence)]; case 3: return [4 /*yield*/, _d.sent()]; case 4: _d.sent(); _d.label = 5; case 5: return [3 /*break*/, 1]; case 6: return [3 /*break*/, 13]; case 7: e_1_1 = _d.sent(); e_1 = { error: e_1_1 }; return [3 /*break*/, 13]; case 8: _d.trys.push([8, , 11, 12]); if (!(_b && !_b.done && (_c = _a["return"]))) return [3 /*break*/, 10]; return [4 /*yield*/, __await(_c.call(_a))]; case 9: _d.sent(); _d.label = 10; case 10: return [3 /*break*/, 12]; case 11: if (e_1) throw e_1.error; return [7 /*endfinally*/]; case 12: return [7 /*endfinally*/]; case 13: return [2 /*return*/]; } }); }); } /** * Core `Sentence` generator function. It take a generator that yield a line on each call and a Parser as argument. * Each call to this function yield a `Sentence`. */ function _sentences(line_iter, Parser) { return __asyncGenerator(this, arguments, function _sentences_1() { var meta, tokens, line_iter_1, line_iter_1_1, line, t, e_3_1; var e_3, _a; return __generator(this, function (_b) { switch (_b.label) { case 0: meta = []; tokens = []; _b.label = 1; case 1: _b.trys.push([1, 9, 10, 15]); line_iter_1 = __asyncValues(line_iter); _b.label = 2; case 2: return [4 /*yield*/, __await(line_iter_1.next())]; case 3: if (!(line_iter_1_1 = _b.sent(), !line_iter_1_1.done)) return [3 /*break*/, 8]; line = line_iter_1_1.value; line = line.trim(); if (!(line.length > 0)) return [3 /*break*/, 4]; if (line[0] == '#') { meta.push(parseHashLine(line.slice(1))); } else { t = parseToken(line, Parser); if (t instanceof CompoundToken) { tokens.push(t); } else if (t.length == 2) { // nominal token tokens.push(t[1]); } else if (t.length == 3) { // empty token tokens.push(t[2]); } else { throw "Invalid state while parsing token line"; } } return [3 /*break*/, 7]; case 4: if (!(tokens.length > 0)) return [3 /*break*/, 7]; return [4 /*yield*/, __await(new Sentence({ meta: meta, tokens: tokens }))]; case 5: return [4 /*yield*/, _b.sent()]; case 6: _b.sent(); meta = []; tokens = []; _b.label = 7; case 7: return [3 /*break*/, 2]; case 8: return [3 /*break*/, 15]; case 9: e_3_1 = _b.sent(); e_3 = { error: e_3_1 }; return [3 /*break*/, 15]; case 10: _b.trys.push([10, , 13, 14]); if (!(line_iter_1_1 && !line_iter_1_1.done && (_a = line_iter_1["return"]))) return [3 /*break*/, 12]; return [4 /*yield*/, __await(_a.call(line_iter_1))]; case 11: _b.sent(); _b.label = 12; case 12: return [3 /*break*/, 14]; case 13: if (e_3) throw e_3.error; return [7 /*endfinally*/]; case 14: return [7 /*endfinally*/]; case 15: if (!(tokens.length > 0)) return [3 /*break*/, 18]; return [4 /*yield*/, __await(new Sentence({ meta: meta, tokens: tokens }))]; case 16: return [4 /*yield*/, _b.sent()]; case 17: _b.sent(); meta = []; tokens = []; _b.label = 18; case 18: return [2 /*return*/]; } }); }); } /** * `Document` is an entry point to `conllu`. It contains zero or more sentences. * * To programmatically construct a `Document` use it constructor. * To construct a `Document` using CoNLL-U format text, use either * `parse`, `load`, or `read` method depending on source of text. * * If `Document` cannot be fit into memory, use `sentences` generator function. */ var Document = /** @class */ (function () { function Document(sentences) { this.sentences = sentences; } /** * Load conllu file as Document. This method is async. * * @param file_path Path to conllu file * @param Parser An optional Parser that is derivative of type XPOSParser for mapping XPOS to UPOS */ Document.load = function (file_path, Parser) { var e_4, _a; return __awaiter(this, void 0, void 0, function () { var stream, loaded_sentences, _b, _c, line, e_4_1; return __generator(this, function (_d) { switch (_d.label) { case 0: stream = createReadStream(file_path); loaded_sentences = []; _d.label = 1; case 1: _d.trys.push([1, 6, 7, 12]); _b = __asyncValues(sentences(stream, Parser)); _d.label = 2; case 2: return [4 /*yield*/, _b.next()]; case 3: if (!(_c = _d.sent(), !_c.done)) return [3 /*break*/, 5]; line = _c.value; loaded_sentences.push(line); _d.label = 4; case 4: return [3 /*break*/, 2]; case 5: return [3 /*break*/, 12]; case 6: e_4_1 = _d.sent(); e_4 = { error: e_4_1 }; return [3 /*break*/, 12]; case 7: _d.trys.push([7, , 10, 11]); if (!(_c && !_c.done && (_a = _b["return"]))) return [3 /*break*/, 9]; return [4 /*yield*/, _a.call(_b)]; case 8: _d.sent(); _d.label = 9; case 9: return [3 /*break*/, 11]; case 10: if (e_4) throw e_4.error; return [7 /*endfinally*/]; case 11: return [7 /*endfinally*/]; case 12: return [2 /*return*/, new Document(loaded_sentences)]; } }); }); }; /** * Parse given stream line by line to construct an object of Document. * * @param stream A stream source of text to be parse * @param Parser An optional Parser that is derivative of type XPOSParser for mapping XPOS to UPOS */ Document.read = function (stream, Parser) { var e_5, _a; return __awaiter(this, void 0, void 0, function () { var loaded_sentences, _b, _c, sentence, e_5_1; return __generator(this, function (_d) { switch (_d.label) { case 0: loaded_sentences = []; _d.label = 1; case 1: _d.trys.push([1, 6, 7, 12]); _b = __asyncValues(sentences(stream, Parser)); _d.label = 2; case 2: return [4 /*yield*/, _b.next()]; case 3: if (!(_c = _d.sent(), !_c.done)) return [3 /*break*/, 5]; sentence = _c.value; loaded_sentences.push(sentence); _d.label = 4; case 4: return [3 /*break*/, 2]; case 5: return [3 /*break*/, 12]; case 6: e_5_1 = _d.sent(); e_5 = { error: e_5_1 }; return [3 /*break*/, 12]; case 7: _d.trys.push([7, , 10, 11]); if (!(_c && !_c.done && (_a = _b["return"]))) return [3 /*break*/, 9]; return [4 /*yield*/, _a.call(_b)]; case 8: _d.sent(); _d.label = 9; case 9: return [3 /*break*/, 11]; case 10: if (e_5) throw e_5.error; return [7 /*endfinally*/]; case 11: return [7 /*endfinally*/]; case 12: return [2 /*return*/, new Document(loaded_sentences)]; } }); }); }; /** * An async utitility function that cumulatively parse each line of string then return a document. * * @param line_iter An async generator object where each call return a line of string * @param Parser a Parser derivative from XPOSParser */ Document.parse_core = function (line_iter, Parser) { var e_6, _a; return __awaiter(this, void 0, void 0, function () { var sentences, _b, _c, sentence, e_6_1; return __generator(this, function (_d) { switch (_d.label) { case 0: sentences = []; _d.label = 1; case 1: _d.trys.push([1, 6, 7, 12]); _b = __asyncValues(_sentences(line_iter)); _d.label = 2; case 2: return [4 /*yield*/, _b.next()]; case 3: if (!(_c = _d.sent(), !_c.done)) return [3 /*break*/, 5]; sentence = _c.value; sentences.push(sentence); _d.label = 4; case 4: return [3 /*break*/, 2]; case 5: return [3 /*break*/, 12]; case 6: e_6_1 = _d.sent(); e_6 = { error: e_6_1 }; return [3 /*break*/, 12]; case 7: _d.trys.push([7, , 10, 11]); if (!(_c && !_c.done && (_a = _b["return"]))) return [3 /*break*/, 9]; return [4 /*yield*/, _a.call(_b)]; case 8: _d.sent(); _d.label = 9; case 9: return [3 /*break*/, 11]; case 10: if (e_6) throw e_6.error; return [7 /*endfinally*/]; case 11: return [7 /*endfinally*/]; case 12: return [2 /*return*/, new Document(sentences)]; } }); }); }; /** * Attempt to parse string as a document. This method is async. * * @param str An entire document in string where each line is terminate by '\u000a' * @param Parser An optional XPOSParser instance */ Document.parse = function (str, Parser) { return __awaiter(this, void 0, void 0, function () { function lines_iter(str) { return __asyncGenerator(this, arguments, function lines_iter_1() { var _i, _a, line; return __generator(this, function (_b) { switch (_b.label) { case 0: _i = 0, _a = str.split("\u000a"); _b.label = 1; case 1: if (!(_i < _a.length)) return [3 /*break*/, 5]; line = _a[_i]; return [4 /*yield*/, __await(line)]; case 2: return [4 /*yield*/, _b.sent()]; case 3: _b.sent(); _b.label = 4; case 4: _i++; return [3 /*break*/, 1]; case 5: return [2 /*return*/]; } }); }); } return __generator(this, function (_a) { return [2 /*return*/, this.parse_core(lines_iter(str), Parser)]; }); }); }; /** Save this document to a file in given path. The content encoding is UTF-8 */ Document.prototype.save = function (path) { return __awaiter(this, void 0, void 0, function () { var stream; return __generator(this, function (_a) { stream = createWriteStream(path); this.write(stream); stream.close(); return [2 /*return*/]; }); }); }; /** Return CoNLL-U string representation of the doc */ Document.prototype.toString = function () { return this.sentences.map(function (sentence) { return sentence.toString(); }).join("\u000a\u000a"); }; /** * Validate every sentence dependencies. It immediately return when there's an error. * Otherwise, it return SentenceValidationResult.Ok */ Document.prototype.validate = function () { for (var i in this.sentences) { var validated = this.sentences[i].validate(); if (validated != SentenceValidationResult.Ok) { return validated; } } return SentenceValidationResult.Ok; }; /** Serialize this document as CoNLL-U text into given stream */ Document.prototype.write = function (stream) { return __awaiter(this, void 0, void 0, function () { var _i, _a, sentence; return __generator(this, function (_b) { switch (_b.label) { case 0: _i = 0, _a = this.sentences; _b.label = 1; case 1: if (!(_i < _a.length)) return [3 /*break*/, 4]; sentence = _a[_i]; if (!!stream.write(sentence.toString() + "\u000a\u000a")) return [3 /*break*/, 3]; return [4 /*yield*/, new Promise(function (resolve) { return stream.once('drain', function () { resolve(); }); })]; case 2: _b.sent(); _b.label = 3; case 3: _i++; return [3 /*break*/, 1]; case 4: return [2 /*return*/]; } }); }); }; return Document; }()); export { Document }; /** * Sentence meta data. * * It's a key/value pair. It's defined by prefixing the sentence with * `# key = value` format. */ var Meta = /** @class */ (function () { /** * Construct `Meta` by given dictionary. * @param param0 A dic of `key` and `value` where `value` is optional. * If `value` is omitted, `toString` method will return `Comment` format * string rather than empty value `key` */ function Meta(_a) { var key = _a.key, value = _a.value; this.key = key; this.value = value; } /** * Instantiate the object by providing a `conllu` string. * @param str A string to be parsed into `Meta` */ Meta.parse = function (str) { if (str[0] != '#') { throw "Meta entry must start with `#`"; } str = str.slice(1).trim(); var eqId = str.indexOf("="); if (eqId == -1) { throw "Meta entry must have `=` symbol"; } var key = str.slice(0, eqId).trim(); var value = str.slice(eqId + 1).trim(); var meta = new Meta({ key: key, value: value }); return meta; }; /** Convert this object into `conllu` string */ Meta.prototype.toString = function () { if (this.key && this.value) { return "# " + this.key + " = " + this.value; } else if (this.key) { return "# " + this.key; } else { throw "Missing key from meta"; } }; return Meta; }()); export { Meta }; /** * A comment of sentence. It's similar to `Meta` but doesn't have `=` symbol. * Similar to `Meta`, it must be prefix of sentence. */ var Comment = /** @class */ (function () { /** * @param text Comment to be added */ function Comment(text) { if (text) { text = text.trim(); if (text.length > 0) { this.text = text; } } } /** * Construct a comment object from given string. * @param str A string to be parse as `Comment` */ Comment.parse = function (str) { if (str[0] != "#") { throw "Comment line must begin with `#`"; } var c = new Comment(); str = str.slice(1).trim(); if (str.length > 0) { c.text = str; } return c; }; /** Get `conllu` string from this comment */ Comment.prototype.toString = function () { if (this.text && this.text.length > 0) { return "# " + this.text; } else { return "#"; } }; return Comment; }()); export { Comment }; /** * A validation result for calling validate on each `Sentence`. * It may also throw some exceptions such as "Head of deps that reference to hidden/empty token must be in [integer, integer] format". */ export var SentenceValidationResult; (function (SentenceValidationResult) { SentenceValidationResult[SentenceValidationResult["Ok"] = 0] = "Ok"; /** Compound token end range is beyond index of last token error */ SentenceValidationResult[SentenceValidationResult["CompoundEndBeyondLastTokenError"] = 1] = "CompoundEndBeyondLastTokenError"; /** Some of compound token is overlap to other compound token error */ SentenceValidationResult[SentenceValidationResult["CompoundOverlapError"] = 2] = "CompoundOverlapError"; /** Compound token start index point to token prior to itself error */ /** Head index is larger than number of tokens or less than 1 error */ SentenceValidationResult[SentenceValidationResult["DepHeadOutOfBoundError"] = 3] = "DepHeadOutOfBoundError"; SentenceValidationResult[SentenceValidationResult["CompoundStartAfterTokenError"] = 4] = "CompoundStartAfterTokenError"; /** Empty token after compound token error */ SentenceValidationResult[SentenceValidationResult["EmptyAfterCompoundError"] = 5] = "EmptyAfterCompoundError"; /** Head index is larger than number of tokens or less than 1 error */ SentenceValidationResult[SentenceValidationResult["HeadOutOfBoundError"] = 6] = "HeadOutOfBoundError"; /** NominalToken with head with missing deprel error */ SentenceValidationResult[SentenceValidationResult["HeadWithoutDeprelError"] = 7] = "HeadWithoutDeprelError"; /** NominalToken with non-intenger value in head error */ SentenceValidationResult[SentenceValidationResult["NonIntegerHeadError"] = 8] = "NonIntegerHeadError"; })(SentenceValidationResult || (SentenceValidationResult = {})); /** * `Sentence` consists of: * 1. `meta` which is array. The object inside array can either be `Meta` object or `Comment` object. * 1. `tokens` which is array of derivative of `Token` class. * * To parse sentence text: * 1. You can either construct a `Document` from text by using `parse`, `load`, `read` method and access * `Sentence` via `sentences` field of `Document` object. * 2. You can also use generator function `sentences` to parse each text chunk incrementally. */ var Sentence = /** @class */ (function () { /** * Construct a new sentence from given dictionary * @param param0 A dictionary object contain optional `meta` array of either * `Meta` or `Comment` and tokens field which is array of `Token` derivative. */ function Sentence(_a) { var meta = _a.meta, tokens = _a.tokens; this.meta = meta; this.tokens = tokens; } /** get `conllu` formatted string of current sentence */ Sentence.prototype.toString = function () { var metaStr = "" + this.meta.map(function (m) { return m.toString(); }).join("\u000a"); var id = 1; var hiddenId = 1; var tokensStr = "" + this.tokens.map(function (token) { if (token instanceof CompoundToken) { return token.toString(); } else if (token instanceof EmptyToken) { return id - 1 + "." + hiddenId++ + "\t" + token.toString(); } else if (token instanceof NominalToken) { hiddenId = 1; return id++ + "\t" + token.toString(); } else { throw "Unsupport type of token"; } }).join("\u000a"); return metaStr + "\n" + tokensStr; }; /** * Parse given string as `Sentence` object * @param str A string to be used to instantiate `Sentence`. * @param Parser An `XPOSParser` derivative object */ Sentence.parse = function (str, Parser) { var meta = []; var tokens = []; for (var _i = 0, _a = str.split('\u000a'); _i < _a.length; _i++) { var line = _a[_i]; var l = line.trim(); if (l.startsWith("#")) { var eqIdx = l.indexOf("="); if (eqIdx == -1) { meta.push(new Comment(l.slice(1))); } else { var key = l.slice(1, eqIdx).trim(); var value = l.slice(eqIdx + 1).trim(); meta.push(new Meta({ key: key, value: value })); } } else if (l.length > 0) { var ts = line.split("\t"); if (ts.length != 10) { throw "All token must have 10 columns"; } try { var _b = parseNominal(ts, Parser), _id = _b[0], tok = _b[1]; tokens.push(tok); } catch (e) { if (e === "The give string is not valid Nominal token") { try { var tok = parseCompound(ts); tokens.push(tok); } catch (e) { if (e === "CompoundToken need id to be in format `[start, end]` where `end` > `start`") { var _c = parseEmpty(ts, Parser), _headId = _c[0], _emptyId = _c[1], tok = _c[2]; tokens.push(tok); } } } } } else { break; } } return new Sentence({ meta: meta, tokens: tokens }); }; /** * Validate current sentence whether the token structure is valid and all * `head`, `relation`, and `deps` are valid. */ Sentence.prototype.validate = function () { var end = null; var edges = []; // an index based that is true when head value of a node is equals to the index var hiddenCount = []; var hiddenEdges = []; var tokenCount = 0; var compound = false; var simplifyDeps = function (deps) { for (var _i = 0, deps_1 = deps; _i < deps_1.length; _i++) { var dep = deps_1[_i]; switch (dep[0].length) { case 1: if (!Number.isInteger(dep[0][0])) throw "Head of deps that reference to hidden/empty token must be in [integer, integer] format"; edges[dep[0][0]] = true; break; case 2: // Head is empty token if (hiddenEdges[dep[0][0]] < dep[0][1] || hiddenEdges[dep[0][0]] == undefined) { hiddenEdges[dep[0][0]] = dep[0][1]; } break; default: throw "Invalid deps object. Head of dep must either be [number] or [number, number] "; } } }; /** Return false if all deps are valid, otherwise return true */ var validateDeps = function () { for (var i = 1; i < hiddenEdges.length; i++) { if (hiddenEdges[i] == undefined) { continue; } if (hiddenEdges[i] > hiddenCount[i] || hiddenEdges[i] < 1) { return true; } } return false; }; for (var _i = 0, _a = this.tokens; _i < _a.length; _i++) { var token = _a[_i]; if (token instanceof NominalToken) { tokenCount++; hiddenCount[tokenCount] = 0; compound = false; if (token.head != undefined) { if (!Number.isInteger(token.head)) { return SentenceValidationResult.NonIntegerHeadError; } else if (token.head > this.tokens.length || token.head < 1) { return SentenceValidationResult.HeadOutOfBoundError; } else if (!token.deprel) { return SentenceValidationResult.HeadWithoutDeprelError; } edges[token.head] = true; } if (token.deps) { simplifyDeps(token.deps); } if (end != null && end == tokenCount) { end = null; } } else if (token instanceof CompoundToken) { if (end != null) { return SentenceValidationResult.CompoundOverlapError; } end = token.id[1]; compound = true; if (token.id[0] >= tokenCount && tokenCount != 0) { return SentenceValidationResult.CompoundStartAfterTokenError; } } else if (token instanceof EmptyToken) { hiddenCount[tokenCount]++; simplifyDeps(token.deps); if (compound) return SentenceValidationResult.EmptyAfterCompoundError; } } if (edges.length > tokenCount + 1) { // need to + 1 because edges is zero based return SentenceValidationResult.HeadOutOfBoundError; } else if (validateDeps()) { return SentenceValidationResult.DepHeadOutOfBoundError; } else if (end == null) { return SentenceValidationResult.Ok; } else { return SentenceValidationResult.CompoundEndBeyondLastTokenError; } }; return Sentence; }()); export { Sentence }; /** Root ancestor that all type of Token should inherit from */ var Token = /** @class */ (function () { function Token() { } return Token; }()); export { Token }; /** * A CompoundToken is a token which `id` is a range between [start, end] inclusively * at both start and end index. * * The token requires `id` and `form` with optionally `misc` column. * * All other fields, when convert to string, has `_` values. * ID in string format will be `start`-`end`, e.g. `1-2`. * The `end` index must be greater than start. It is an error to have ID with * `[1, 1]` */ var CompoundToken = /** @class */ (function () { function CompoundToken(_a) { var _b = _a.id, start = _b[0], end = _b[1], form = _a.form, misc = _a.misc; if (end <= start) { throw "CompountToken id range must be in `[start, end]` where `end` > `start`"; } this.id = [start, end]; this.form = form; this.misc = misc; } /** * Parse given string and return a `CompoundToken` * * The string must be tab separate with 10 columns. * See https://universaldependencies.org/format.html for file format. * * Only `id`, `form`, and `misc` columns are use. * All other columns are ignored as * https://universaldependencies.org/format.html#words-tokens-and-empty-nodes * state that all other columns beside these three must be empty. */ CompoundToken.parse = function (str) { var cols = str.split("\t"); return parseCompound(cols); }; /** Retrieve a CoNLL-U format string representation of this token */ CompoundToken.prototype.toString = function () { return tokenToString(this); }; return CompoundToken; }()); export { CompoundToken }; /** * Nominal token is a basic type of token which must exist in `Sentence` in order to * use other type of token. * * The mandatory field is `form` and `upos`. All other fields are optional. * All optional field, when converted to string, will become "_". * * If `deps` field is supplied when construct, it will automatically sort it to comply with * https://universaldependencies.org/format.html#syntactic-annotation */ var NominalToken = /** @class */ (function () { function NominalToken(_a) { var form = _a.form, lemma = _a.lemma, upos = _a.upos, xpos = _a.xpos, feats = _a.feats, headRel = _a.headRel, deps = _a.deps, misc = _a.misc; if (deps && !deps.every(function (dep) { return dep[0].length == 1 || dep[0].length == 2; })) { throw "NominalToken `deps` id must be array with either 1 or 2 number"; } this.form = form; this.lemma = lemma; this.upos = upos; this.xpos = xpos; this.feats = feats ? feats.sort(function (f1, f2) { return f1.name.localeCompare(f2.name); }) : undefined; this.head = headRel ? headRel[0] : undefined; this.deprel = headRel ? headRel[1] : undefined; this.deps = deps ? sortDeps(deps) : undefined; this.misc = misc; } /** * Parse