/*
* conllu-core
* A core type to handle CoNLL-U format.
*/
/// <reference types="node" />
import { Readable, Writable } from 'stream';
/**
* Tuple type returned by `EmptyToken.parse`: a `HeadId`, an `EmptyId`, and the parsed `EmptyToken`.
* NOTE(review): the two ids presumably together identify the empty token's position — confirm against the implementation.
*/
declare type EmptyTokenParseResult = [HeadId, EmptyId, EmptyToken];
/**
* Tuple type returned by `NominalToken.parse`: a number followed by the parsed `NominalToken`.
* NOTE(review): the number is presumably the token id read from the ID column — confirm against the implementation.
*/
declare type NominalTokenParseResult = [number, NominalToken];
/**
* An async generator function that yields a `Sentence` object on each iteration.
* Use this generator if the whole document cannot fit into memory.
*
* @param stream A `Readable` stream that contains CoNLL-U format text.
* @param Parser An optional derivative of `XPOSParser` used for parsing the `xpos` field
* @returns An async generator of `Sentence` objects
*/
export declare function sentences(stream: Readable, Parser?: XPOSParser): AsyncGenerator<Sentence>;
/**
* `Document` is an entry point to `conllu`. It contains zero or more sentences.
*
* To programmatically construct a `Document`, use its constructor.
* To construct a `Document` from CoNLL-U format text, use either the
* `parse`, `load`, or `read` method depending on the source of the text.
*
* If a `Document` cannot fit into memory, use the `sentences` generator function.
*/
export declare class Document {
/** The sentences contained in this document */
sentences: Sentence[];
/**
* @param sentences The sentences that make up the document
*/
constructor(sentences: Sentence[]);
/**
* Load a conllu file as a `Document`. This method is async.
*
* @param file_path Path to the conllu file
* @param Parser An optional parser, a derivative of `XPOSParser`, for mapping XPOS to UPOS
* @returns A promise of the loaded `Document`
*/
static load(file_path: string, Parser?: XPOSParser): Promise<Document>;
/**
* Parse the given stream line by line to construct a `Document`.
*
* @param stream A stream source of the text to be parsed
* @param Parser An optional parser, a derivative of `XPOSParser`, for mapping XPOS to UPOS
* @returns A promise of the parsed `Document`
*/
static read(stream: Readable, Parser?: XPOSParser): Promise<Document>;
/**
* An async utility function that cumulatively parses each line of text and then returns a document.
*
* @param line_iter An async generator object where each iteration yields one line of text
* @param Parser An optional parser derived from `XPOSParser`
* @returns A promise of the parsed `Document`
*/
protected static parse_core(line_iter: AsyncGenerator<string>, Parser?: XPOSParser): Promise<Document>;
/**
* Attempt to parse a string as a document. This method is async.
*
* @param str An entire document as a string where each line is terminated by '\u000a'
* @param Parser An optional `XPOSParser` instance
* @returns A promise of the parsed `Document`
*/
static parse(str: string, Parser?: XPOSParser): Promise<Document>;
/**
* Save this document to a file at the given path. The content encoding is UTF-8.
*
* @param path Path of the file to write
*/
save(path: string): Promise<void>;
/** Return the CoNLL-U string representation of the document */
toString(): string;
/**
* Validate the dependencies of every sentence. It returns immediately when there is an error.
* Otherwise, it returns `SentenceValidationResult.Ok`.
*/
validate(): SentenceValidationResult;
/**
* Serialize this document as CoNLL-U text into the given stream.
*
* @param stream The `Writable` stream that receives the text
*/
write(stream: Writable): Promise<void>;
}
/**
* Sentence metadata.
*
* It is a key/value pair, defined by prefixing the sentence with a line in
* `# key = value` format.
*/
export declare class Meta {
/** The metadata key */
key: string;
/**
* The metadata value.
* NOTE(review): declared as non-optional although the constructor accepts an omitted `value` — confirm what is stored in that case.
*/
value: string;
/**
* Construct a `Meta` from the given dictionary.
* @param param0 A dictionary of `key` and `value` where `value` is optional.
* If `value` is omitted, the `toString` method will return a `Comment` format
* string rather than a `key` with an empty value.
*/
constructor({ key, value }: {
key: string;
value?: string;
});
/**
* Instantiate the object from a `conllu` string.
* @param str A string to be parsed into a `Meta`
* @returns The parsed `Meta`
*/
static parse(str: string): Meta;
/** Convert this object into a `conllu` string */
toString(): string;
}
/**
* A comment on a sentence. It is similar to `Meta` but does not have the `=` symbol.
* Like `Meta`, it must be placed before the sentence it belongs to.
*/
export declare class Comment {
/** The comment text */
text: string;
/**
* @param text Comment to be added (optional)
*/
constructor(text?: string);
/**
* Construct a comment object from the given string.
* @param str A string to be parsed as a `Comment`
* @returns The parsed `Comment`
*/
static parse(str: string): Comment;
/** Get the `conllu` string for this comment */
toString(): string;
}
/**
* A validation result returned by calling `validate` on a `Sentence`.
* Validation may also throw exceptions such as "Head of deps that reference to hidden/empty token must be in [integer, integer] format".
*/
export declare enum SentenceValidationResult {
/** No validation error was found */
Ok = 0,
/** Compound token end range is beyond the index of the last token */
CompoundEndBeyondLastTokenError = 1,
/** A compound token overlaps another compound token */
CompoundOverlapError = 2,
/**
* Head index is larger than the number of tokens or less than 1.
* NOTE(review): judging by the member name, this presumably concerns a head referenced in `deps` — confirm against the implementation.
*/
DepHeadOutOfBoundError = 3,
/** Compound token start index points to a token prior to itself */
CompoundStartAfterTokenError = 4,
/** An empty token appears after a compound token */
EmptyAfterCompoundError = 5,
/** Head index is larger than the number of tokens or less than 1 */
HeadOutOfBoundError = 6,
/** A `NominalToken` has a head but is missing its deprel */
HeadWithoutDeprelError = 7,
/** A `NominalToken` has a non-integer value in head */
NonIntegerHeadError = 8
}
/**
* A `Sentence` consists of:
* 1. `meta`, an array whose elements are either `Meta` or `Comment` objects.
* 2. `tokens`, an array of derivatives of the `Token` class.
*
* To parse sentence text:
* 1. You can construct a `Document` from text by using the `parse`, `load`, or `read` method and access
* each `Sentence` via the `sentences` field of the `Document` object.
* 2. You can also use the generator function `sentences` to parse each text chunk incrementally.
*/
export declare class Sentence {
/** Metadata and comment entries attached to the sentence */
meta: (Meta | Comment)[];
/** The tokens of the sentence */
tokens: Token[];
/**
* Construct a new sentence from the given dictionary.
* @param param0 A dictionary object containing an optional `meta` array of either
* `Meta` or `Comment` objects and a `tokens` field, which is an array of `Token` derivatives.
*/
constructor({ meta, tokens }: {
meta?: (Meta | Comment)[];
tokens: Token[];
});
/** Get the `conllu` formatted string of the current sentence */
toString(): string;
/**
* Parse the given string as a `Sentence` object.
* @param str A string to be used to instantiate the `Sentence`.
* @param Parser An optional `XPOSParser` derivative object
* @returns The parsed `Sentence`
*/
static parse(str: string, Parser?: XPOSParser): Sentence;
/**
* Validate whether the token structure of the current sentence is valid and all
* `head`, relation (`deprel`), and `deps` values are valid.
* @returns A `SentenceValidationResult` describing the outcome
*/
validate(): SentenceValidationResult;
}
/** Root ancestor that every type of token should inherit from */
export declare abstract class Token {
/** Format the token into a `conllu` string */
abstract toString(): string;
}
/** An id range as a `[first, last]` pair; used as the `id` of a `CompoundToken` */
export declare type IdRange = [FirstId, LastId];
/** The first id of an `IdRange` */
export declare type FirstId = number;
/** The last id of an `IdRange` */
export declare type LastId = number;
/**
* A `CompoundToken` is a token whose `id` is a range [start, end], inclusive
* of both the start and end index.
*
* The token requires `id` and `form`, with an optional `misc` column.
*
* All other fields, when converted to a string, have `_` values.
* The ID in string format will be `start`-`end`, e.g. `1-2`.
* The `end` index must be greater than `start`. It is an error to have an ID of
* `[1, 1]`
*/
export declare class CompoundToken implements Token {
/** The `[start, end]` id range covered by this token */
id: [number, number];
/** The surface form of the compound token */
form: string;
/** Optional values of the `misc` column */
misc?: string[];
/**
* @param param0 A dictionary with the `id` range, the `form`, and an optional `misc` array
*/
constructor({ id: [start, end], form, misc }: {
id: IdRange;
form: string;
misc?: string[];
});
/**
* Parse the given string and return a `CompoundToken`.
*
* The string must be tab separated with 10 columns.
* See https://universaldependencies.org/format.html for the file format.
*
* Only the `id`, `form`, and `misc` columns are used.
* All other columns are ignored, as
* https://universaldependencies.org/format.html#words-tokens-and-empty-nodes
* states that all columns besides these three must be empty.
*
* @param str A string to be parsed
* @returns The parsed `CompoundToken`
*/
static parse(str: string): CompoundToken;
/** Retrieve a CoNLL-U format string representation of this token */
toString(): string;
}
/** Numeric id of a head token */
export declare type HeadId = number;
/**
* One entry of a token's `deps` field: a head reference paired with a `DepsRelation`.
* The head reference is either `[HeadId]` or `[HeadId, EmptyId]`.
* NOTE(review): the two-element form presumably references an empty token — confirm against the implementation.
*/
export declare type AdvanceDep = [[HeadId] | [HeadId, EmptyId], DepsRelation];
/**
* A nominal token is the basic type of token, which must exist in a `Sentence` in order to
* use the other types of token.
*
* The mandatory fields are `form` and `upos`. All other fields are optional.
* Every optional field, when converted to a string, becomes "_".
*
* If the `deps` field is supplied on construction, it is automatically sorted to comply with
* https://universaldependencies.org/format.html#syntactic-annotation
*/
export declare class NominalToken implements Token {
/** The word form */
form: string;
/** Optional lemma of the word form */
lemma?: string;
/** Universal part-of-speech tag */
upos: UPOS;
/** Optional language-specific part-of-speech tag */
xpos?: XPOS;
/** Optional list of morphological features */
feats?: Feature[];
/** Optional id of the head of this token */
head?: HeadId;
/** Optional relation of this token to its `head` */
deprel?: Relation;
/** Optional list of `deps` entries */
deps?: AdvanceDep[];
/** Optional values of the `misc` column */
misc?: string[];
/**
* Construct a `NominalToken` from the given dictionary.
*
* Only `form` and `upos` are required, matching the class contract above.
* `lemma` is declared optional here so the constructor agrees with the
* optional `lemma` field and with the documented contract.
*
* @param param0 A dictionary of token fields.
* NOTE(review): `headRel` presumably supplies both `head` and `deprel` — confirm against the implementation.
*/
constructor({ form, lemma, upos, xpos, feats, headRel, deps, misc }: {
form: string;
lemma?: string;
upos: UPOS;
xpos?: XPOS;
feats?: Feature[];
headRel?: [HeadId, Relation];
deps?: AdvanceDep[];
misc?: string[];
});
/**
* Parse the given string and construct a `NominalToken` out of it.
* If the text contains an XPOS column and you need to keep the XPOS field, you
* need to supply an implementation of `XPOSParser`.
* @param str A string to be parsed
* @param Parser An optional XPOS parser to convert the XPOS column into an `XPOS` object
* @returns A `NominalTokenParseResult` tuple containing the parsed token
*/
static parse(str: string, Parser?: XPOSParser): NominalTokenParseResult;
/**
* Retrieve a CoNLL-U representation string of this token. The string will have
* no `id`, as its ID relies on its position in the sentence.
*/
toString(): string;
}
/** Numeric id used as the second element of a `[HeadId, EmptyId]` reference */
export declare type EmptyId = number;
/**
* `EmptyToken` is a null token type. Everything except `deps` is optional.
* It automatically sorts the `deps` field according to
* https://universaldependencies.org/format.html#syntactic-annotation
*/
export declare class EmptyToken implements Token {
/** Optional word form */
form?: string;
/** Optional lemma */
lemma?: string;
/** Optional universal part-of-speech tag */
upos?: UPOS;
/** Optional language-specific part-of-speech tag */
xpos?: XPOS;
/** Optional list of morphological features */
feats?: Feature[];
/** The mandatory list of `deps` entries */
deps: AdvanceDep[];
/** Optional values of the `misc` column */
misc?: string[];
/**
* Construct an `EmptyToken` from the given dictionary.
* @param param0 A dictionary of token fields where only `deps` is required
*/
constructor({ form, lemma, upos, xpos, feats, deps, misc }: {
form?: string;
lemma?: string;
upos?: UPOS;
xpos?: XPOS;
feats?: Feature[];
deps: [[HeadId] | [HeadId, EmptyId], DepsRelation][];
misc?: string[];
});
/**
* Parse the given string and return an `EmptyToken`.
* @param str A string to be parsed
* @param Parser An optional `XPOSParser` derivative used for the XPOS column
* @returns An `EmptyTokenParseResult` tuple containing the parsed token
*/
static parse(str: string, Parser?: XPOSParser): EmptyTokenParseResult;
/**
* Retrieve a CoNLL-U representation string of this token. The string will have
* no `id`, as its ID relies on its position in the sentence.
*/
toString(): string;
}
/**
* All possible parts of speech defined in CoNLL-U.
* A complete list of POS tags can be found here:
* https://universaldependencies.org/u/pos/index.html
*/
export declare enum UPOS {
ADJ = "ADJ",
ADP = "ADP",
ADV = "ADV",
AUX = "AUX",
CCONJ = "CCONJ",
DET = "DET",
INTJ = "INTJ",
NOUN = "NOUN",
NUM = "NUM",
PART = "PART",
PRON = "PRON",
PROPN = "PROPN",
PUNCT = "PUNCT",
SCONJ = "SCONJ",
SYM = "SYM",
VERB = "VERB",
/** The member is named `Other`, but its string value is the tag "X" */
Other = "X"
}
/**
* Utility function to parse a string as a `UPOS` value.
* NOTE(review): the behavior for a string that matches no `UPOS` member is not visible here — confirm against the implementation.
*
* @param str The string to convert
* @returns The corresponding `UPOS` value
*/
export declare function toUPOS(str: string): UPOS;
/**
* An abstract class `XPOS` which every language that uses the
* `xpos` field needs to implement.
*
* Implementing this class is mandatory if you want to preserve and use
* the `xpos` field.
*/
export declare abstract class XPOS {
/** Map this language-specific tag to a `UPOS` value */
abstract toUPOS(): UPOS;
/** Get the string form of this tag */
abstract toString(): string;
}
/**
* An abstract class `XPOSParser` which any language that uses the `xpos` field and
* requires deserialization needs to implement.
*
* The implementation needs to implement `parse`.
* NOTE(review): earlier documentation described `parse` as a static method, but it is
* declared here as an abstract instance method — confirm the intended usage.
* If you don't pass an implementation of this class when deserializing a
* `Document`, `Sentence`, `NominalToken`, or `EmptyToken`, then the deserialized
* object will have no `xpos` field.
*/
export declare abstract class XPOSParser {
/**
* Convert the text of an `xpos` column into an `XPOS` object.
* @param str The text to parse
* @returns The parsed `XPOS`
*/
abstract parse(str: string): XPOS;
}
/**
* A feature as described in https://universaldependencies.org/format.html#morphological-annotation
* It is a kind of key/values pair where the key is the name of a feature type and the values are a list of
* feature names.
*
* If you construct this feature via its constructor, it will validate the name and sort the values for you.
*/
export declare class Feature {
/** The name of the feature type */
name: string;
/** The list of feature values */
value: string[];
/**
* @param name The name of the feature type
* @param value The list of feature values
*/
constructor(name: string, value: string[]);
/** Get the string form of this feature */
toString(): string;
}
/**
* A `DepsRelation` is a relation used in the `deps` field of `NominalToken` and `EmptyToken`.
* The constructor validates the relation name according to
* https://universaldependencies.org/u/overview/enhanced-syntax.html
*/
export declare class DepsRelation {
/** The relation name */
rel: string;
/**
* @param rel The relation name
*/
constructor(rel: string);
/** Get the string form of this relation */
toString(): string;
}
/**
* A `Relation` is the name of the relation that describes a token's relation
* to its `head`. The field that uses this class is `deprel`.
* `deprel` is mandatory if `head` is not empty.
* See https://universaldependencies.org/format.html#syntactic-annotation
*/
export declare class Relation {
/** The relation name */
rel: string;
/**
* @param rel The relation name
*/
constructor(rel: string);
/** Get the string form of this relation */
toString(): string;
}
export {};