UNPKG

conllu-core

Version:
377 lines (376 loc) 13.7 kB
/// <reference types="node" />
import { Readable, Writable } from 'stream';

/**
 * Tuple returned by `EmptyToken.parse`: a `HeadId`, an `EmptyId`, and the parsed token.
 * NOTE(review): the two numbers are presumably the two halves of the empty node's
 * `head.index` id — confirm against the implementation.
 */
declare type EmptyTokenParseResult = [HeadId, EmptyId, EmptyToken];

/**
 * Tuple returned by `NominalToken.parse`: a number and the parsed token.
 * NOTE(review): the number is presumably the token id read from the line, since
 * `NominalToken` itself stores no id — confirm against the implementation.
 */
declare type NominalTokenParseResult = [number, NominalToken];

/**
 * An async generator that yields one `Sentence` object at a time.
 * Use this generator if the whole document cannot fit into memory.
 *
 * @param stream A `Readable` stream that contains CoNLL-U format text.
 * @param Parser An optional `XPOSParser` implementation used for parsing the `xpos` field.
 */
export declare function sentences(stream: Readable, Parser?: XPOSParser): AsyncGenerator<Sentence>;

/**
 * `Document` is the entry point to `conllu`. It contains zero or more sentences.
 *
 * To construct a `Document` programmatically, use its constructor.
 * To construct a `Document` from CoNLL-U format text, use the
 * `parse`, `load`, or `read` method depending on the source of the text.
 *
 * If the `Document` cannot fit into memory, use the `sentences` generator function.
 */
export declare class Document {
    sentences: Sentence[];

    constructor(sentences: Sentence[]);

    /**
     * Load a CoNLL-U file as a `Document`. This method is async.
     *
     * @param file_path Path to the CoNLL-U file.
     * @param Parser An optional `XPOSParser` implementation for mapping XPOS to UPOS.
     */
    static load(file_path: string, Parser?: XPOSParser): Promise<Document>;

    /**
     * Parse the given stream line by line to construct a `Document`.
     *
     * @param stream A stream source of the text to be parsed.
     * @param Parser An optional `XPOSParser` implementation for mapping XPOS to UPOS.
     */
    static read(stream: Readable, Parser?: XPOSParser): Promise<Document>;

    /**
     * An async utility function that cumulatively parses each line of text and
     * then returns a document.
     *
     * @param line_iter An async generator where each call returns one line of text.
     * @param Parser An optional `XPOSParser` implementation.
     */
    protected static parse_core(line_iter: AsyncGenerator<string>, Parser?: XPOSParser): Promise<Document>;

    /**
     * Attempt to parse a string as a document. This method is async.
     *
     * @param str An entire document as a string where each line is terminated by '\u000a'.
     * @param Parser An optional `XPOSParser` instance.
     */
    static parse(str: string, Parser?: XPOSParser): Promise<Document>;

    /** Save this document to a file at the given path. The content encoding is UTF-8. */
    save(path: string): Promise<void>;

    /** Return the CoNLL-U string representation of the document. */
    toString(): string;

    /**
     * Validate the dependencies of every sentence. It returns immediately when
     * there is an error. Otherwise, it returns `SentenceValidationResult.Ok`.
     */
    validate(): SentenceValidationResult;

    /** Serialize this document as CoNLL-U text into the given stream. */
    write(stream: Writable): Promise<void>;
}

/**
 * Sentence metadata.
 *
 * It is a key/value pair, defined by prefixing the sentence with a line in
 * `# key = value` format.
 */
export declare class Meta {
    key: string;
    value: string;

    /**
     * Construct a `Meta` from the given dictionary.
     *
     * @param param0 A dictionary of `key` and `value` where `value` is optional.
     * If `value` is omitted, the `toString` method returns a `Comment`-format
     * string rather than a `key` with an empty value.
     */
    constructor({ key, value }: {
        key: string;
        value?: string;
    });

    /**
     * Instantiate the object from a `conllu` string.
     *
     * @param str A string to be parsed into a `Meta`.
     */
    static parse(str: string): Meta;

    /** Convert this object into a `conllu` string. */
    toString(): string;
}

/**
 * A comment on a sentence. It is similar to `Meta` but does not have the `=` symbol.
 * Like `Meta`, it must prefix the sentence.
 */
export declare class Comment {
    text: string;

    /**
     * @param text Comment text to be added.
     */
    constructor(text?: string);

    /**
     * Construct a comment object from the given string.
     *
     * @param str A string to be parsed as a `Comment`.
     */
    static parse(str: string): Comment;

    /** Get the `conllu` string for this comment. */
    toString(): string;
}

/**
 * The result of calling `validate` on a `Sentence`.
 * Validation may also throw exceptions such as "Head of deps that reference to
 * hidden/empty token must be in [integer, integer] format".
 */
export declare enum SentenceValidationResult {
    /** No validation error was found. */
    Ok = 0,
    /** The end of a compound token's range is beyond the index of the last token. */
    CompoundEndBeyondLastTokenError = 1,
    /** A compound token overlaps another compound token. */
    CompoundOverlapError = 2,
    /**
     * A head index is larger than the number of tokens or less than 1.
     * NOTE(review): judging by the name, this presumably refers to a head listed
     * in the `deps` field, as opposed to `HeadOutOfBoundError` — confirm.
     */
    DepHeadOutOfBoundError = 3,
    /** A compound token's start index points to a token prior to itself. */
    CompoundStartAfterTokenError = 4,
    /** An empty token appears after a compound token. */
    EmptyAfterCompoundError = 5,
    /** The head index is larger than the number of tokens or less than 1. */
    HeadOutOfBoundError = 6,
    /** A `NominalToken` has a head but is missing `deprel`. */
    HeadWithoutDeprelError = 7,
    /** A `NominalToken` has a non-integer value in `head`. */
    NonIntegerHeadError = 8
}

/**
 * A `Sentence` consists of:
 * 1. `meta`, an array whose elements are either `Meta` or `Comment` objects.
 * 2. `tokens`, an array of `Token` derivatives.
 *
 * To parse sentence text:
 * 1. You can construct a `Document` from text by using its `parse`, `load`, or `read`
 *    method and access each `Sentence` via the `sentences` field of the `Document`.
 * 2. You can also use the generator function `sentences` to parse the text incrementally.
 */
export declare class Sentence {
    meta: (Meta | Comment)[];
    tokens: Token[];

    /**
     * Construct a new sentence from the given dictionary.
     *
     * @param param0 A dictionary containing an optional `meta` array of `Meta` or
     * `Comment` objects and a `tokens` array of `Token` derivatives.
     */
    constructor({ meta, tokens }: {
        meta?: (Meta | Comment)[];
        tokens: Token[];
    });

    /** Get the `conllu` formatted string of this sentence. */
    toString(): string;

    /**
     * Parse the given string as a `Sentence` object.
     *
     * @param str A string used to instantiate the `Sentence`.
     * @param Parser An optional `XPOSParser` implementation.
     */
    static parse(str: string, Parser?: XPOSParser): Sentence;

    /**
     * Validate whether the token structure of this sentence is valid and whether
     * all `head`, `relation`, and `deps` values are valid.
     */
    validate(): SentenceValidationResult;
}

/** Root ancestor that every type of token should inherit from. */
export declare abstract class Token {
    /** Format the token as a `conllu` string. */
    abstract toString(): string;
}

/** The `[first, last]` id range covered by a `CompoundToken`. */
export declare type IdRange = [FirstId, LastId];
/** First id of an `IdRange`. */
export declare type FirstId = number;
/** Last id of an `IdRange`. */
export declare type LastId = number;

/**
 * A `CompoundToken` is a token whose `id` is a range `[start, end]`, inclusive
 * at both the start and end index.
 *
 * The token requires `id` and `form`, with an optional `misc` column.
 *
 * All other fields have `_` values when converted to a string.
 * The id in string format is `start`-`end`, e.g. `1-2`.
 * The `end` index must be greater than `start`. It is an error to have an id of
 * `[1, 1]`.
 */
export declare class CompoundToken implements Token {
    id: [number, number];
    form: string;
    misc?: string[];

    constructor({ id: [start, end], form, misc }: {
        id: IdRange;
        form: string;
        misc?: string[];
    });

    /**
     * Parse the given string and return a `CompoundToken`.
     *
     * The string must be tab separated with 10 columns.
     * See https://universaldependencies.org/format.html for the file format.
     *
     * Only the `id`, `form`, and `misc` columns are used.
     * All other columns are ignored, as
     * https://universaldependencies.org/format.html#words-tokens-and-empty-nodes
     * states that all columns besides these three must be empty.
     */
    static parse(str: string): CompoundToken;

    /** Retrieve the CoNLL-U format string representation of this token. */
    toString(): string;
}

/** Id of the token that acts as the head of a relation. */
export declare type HeadId = number;

/**
 * One entry of a `deps` field: a head reference paired with its relation.
 * The head reference is either `[HeadId]` or `[HeadId, EmptyId]`.
 * NOTE(review): the two-element form presumably references an empty token — confirm.
 */
export declare type AdvanceDep = [[HeadId] | [HeadId, EmptyId], DepsRelation];

/**
 * A nominal token is the basic type of token, which must exist in a `Sentence`
 * in order to use the other types of token.
 *
 * The mandatory fields are `form` and `upos`. All other fields are optional.
 * Every optional field becomes "_" when converted to a string.
 *
 * If the `deps` field is supplied at construction, it is automatically sorted to comply with
 * https://universaldependencies.org/format.html#syntactic-annotation
 */
export declare class NominalToken implements Token {
    form: string;
    lemma?: string;
    upos: UPOS;
    xpos?: XPOS;
    feats?: Feature[];
    head?: HeadId;
    deprel?: Relation;
    deps?: AdvanceDep[];
    misc?: string[];

    /**
     * Construct a `NominalToken` from the given dictionary.
     *
     * @param param0 A dictionary of token fields. Only `form` and `upos` are
     * mandatory; `headRel` supplies the `head` and `deprel` fields together.
     */
    constructor({ form, lemma, upos, xpos, feats, headRel, deps, misc }: {
        form: string;
        // Optional, matching the `lemma?` field and the documented contract that
        // only `form` and `upos` are mandatory.
        lemma?: string;
        upos: UPOS;
        xpos?: XPOS;
        feats?: Feature[];
        headRel?: [HeadId, Relation];
        deps?: [[HeadId] | [HeadId, EmptyId], DepsRelation][];
        misc?: string[];
    });

    /**
     * Parse the given string and construct a `NominalToken` out of it.
     * If the text contains an XPOS column and you need to keep the XPOS field,
     * you need to supply an implementation of `XPOSParser`.
     *
     * @param str A string to be parsed.
     * @param Parser XPOS parser used to convert the XPOS column into an `XPOS` object.
     */
    static parse(str: string, Parser?: XPOSParser): NominalTokenParseResult;

    /**
     * Retrieve the CoNLL-U string representation of this token. The string has
     * no `id`, as its id relies on its position in the sentence.
     */
    toString(): string;
}

/** Id component used together with a `HeadId` in `[HeadId, EmptyId]` references. */
export declare type EmptyId = number;

/**
 * `EmptyToken` is a null token type. Everything except `deps` is optional.
 * It automatically sorts the `deps` field according to
 * https://universaldependencies.org/format.html#syntactic-annotation
 */
export declare class EmptyToken implements Token {
    form?: string;
    lemma?: string;
    upos?: UPOS;
    xpos?: XPOS;
    feats?: Feature[];
    deps: AdvanceDep[];
    misc?: string[];

    constructor({ form, lemma, upos, xpos, feats, deps, misc }: {
        form?: string;
        lemma?: string;
        upos?: UPOS;
        xpos?: XPOS;
        feats?: Feature[];
        deps: [[HeadId] | [HeadId, EmptyId], DepsRelation][];
        misc?: string[];
    });

    /**
     * Parse the given string and return an `EmptyToken`.
     *
     * @param str A string to be parsed.
     * @param Parser An optional `XPOSParser` implementation.
     */
    static parse(str: string, Parser?: XPOSParser): EmptyTokenParseResult;

    /**
     * Retrieve the CoNLL-U string representation of this token. The string has
     * no `id`, as its id relies on its position in the sentence.
     */
    toString(): string;
}

/**
 * All possible parts of speech defined in CoNLL-U.
 * A complete list of POS tags can be found here:
 * https://universaldependencies.org/u/pos/index.html
 */
export declare enum UPOS {
    ADJ = "ADJ",
    ADP = "ADP",
    ADV = "ADV",
    AUX = "AUX",
    CCONJ = "CCONJ",
    DET = "DET",
    INTJ = "INTJ",
    NOUN = "NOUN",
    NUM = "NUM",
    PART = "PART",
    PRON = "PRON",
    PROPN = "PROPN",
    PUNCT = "PUNCT",
    SCONJ = "SCONJ",
    SYM = "SYM",
    VERB = "VERB",
    Other = "X"
}

/** Utility function to parse a string as a `UPOS` value. */
export declare function toUPOS(str: string): UPOS;

/**
 * An abstract class `XPOS` which every language that uses the
 * `xpos` field needs to implement.
 *
 * Implementing this class is mandatory to preserve the `xpos` field when you
 * want to use it.
 */
export declare abstract class XPOS {
    abstract toUPOS(): UPOS;
    abstract toString(): string;
}

/**
 * An abstract class `XPOSParser` which any language that uses the `xpos` field
 * and needs to deserialize it must implement.
 *
 * Implementations must provide `parse`. As declared here it is an instance
 * method, and the parsing functions in this module accept an `XPOSParser` instance.
 * If you do not pass an implementation of this class when deserializing a
 * `Document`, `Sentence`, `NominalToken`, or `EmptyToken`, the deserialized
 * object will have no `xpos` field.
 */
export declare abstract class XPOSParser {
    abstract parse(str: string): XPOS;
}

/**
 * A feature as described in https://universaldependencies.org/format.html#morphological-annotation
 * It is a kind of key/values pair where the key is the name of the feature type
 * and the values are a list of feature names.
 *
 * If you construct this feature via its constructor, it validates the name and
 * sorts the values for you.
 */
export declare class Feature {
    name: string;
    value: string[];

    constructor(name: string, value: string[]);

    toString(): string;
}

/**
 * A `DepsRelation` is a relation used in the `deps` field of `NominalToken` and `EmptyToken`.
 * The constructor validates the relation name according to
 * https://universaldependencies.org/u/overview/enhanced-syntax.html
 */
export declare class DepsRelation {
    rel: string;

    constructor(rel: string);

    toString(): string;
}

/**
 * A `Relation` is the name of the relation that describes a token's relation
 * to its `head`. The field that uses this class is `deprel`.
 * `deprel` is mandatory if `head` is not empty.
 * See https://universaldependencies.org/format.html#syntactic-annotation
 */
export declare class Relation {
    rel: string;

    constructor(rel: string);

    toString(): string;
}

export {};