wordmap
Version:
Multi-Lingual Word Alignment Prediction
220 lines (219 loc) • 5.92 kB
JavaScript
"use strict";
// Define an `__esModule` property with value true on the CommonJS exports object
// (a marker conventionally checked by module-interop helpers).
Object.defineProperty(exports, "__esModule", { value: true });
/**
 * An ordered group of zero or more tokens taken from a text.
 * An instance with no tokens acts as an empty ("null") placeholder.
 */
class Ngram {
    /**
     * @param {Array<Token>} [tokens=[]] - the tokens that make up this n-gram
     */
    constructor(tokens = []) {
        this.occurrence = 1;
        this.occurrences = 1;
        this.tokens = tokens;
    }

    /**
     * Number of {@link Token}s in this n-gram.
     * @return {number}
     */
    get tokenLength() {
        return this.tokens.length;
    }

    /**
     * Total number of characters across all tokens.
     * Whitespace between tokens is not counted.
     * @return {number}
     */
    get characterLength() {
        let total = 0;
        for (const token of this.tokens) {
            total += token.toString().length;
        }
        return total;
    }

    /**
     * Position (in units of {@link Token}) of the first token, or 0 for an empty n-gram.
     * @return {number} - the position
     */
    get tokenPosition() {
        return this.tokens.length ? this.tokens[0].position : 0;
    }

    /**
     * Length (in units of {@link Token}) of the sentence reported by the first token,
     * or 0 for an empty n-gram.
     * @return {number}
     */
    get sentenceTokenLength() {
        return this.tokens.length ? this.tokens[0].sentenceTokenLength : 0;
    }

    /**
     * Length (in characters, whitespace included) of the sentence reported by the
     * first token, or 0 for an empty n-gram.
     * @return {number}
     */
    get sentenceCharacterLength() {
        return this.tokens.length ? this.tokens[0].sentenceCharacterLength : 0;
    }

    /**
     * Position (in characters) of the first token, or 0 for an empty n-gram.
     * @return {number} - the position
     */
    get characterPosition() {
        return this.tokens.length ? this.tokens[0].charPosition : 0;
    }

    /**
     * The n-gram key built from the token text.
     * @return {string}
     */
    get key() {
        this.cacheKeys();
        return this.cachedKey;
    }

    /**
     * The n-gram key built from the token lemmas.
     * Undefined when at least one token has an empty lemma.
     * @return {string|undefined}
     */
    get lemmaKey() {
        this.cacheKeys();
        return this.cachedLemmaKey;
    }

    /**
     * Whether this n-gram holds exactly one token.
     * @return {boolean}
     */
    isUnigram() {
        return this.tokens.length === 1;
    }

    /**
     * Whether this n-gram holds exactly two tokens.
     * @return {boolean}
     */
    isBigram() {
        return this.tokens.length === 2;
    }

    /**
     * Whether this n-gram holds exactly three tokens.
     * @return {boolean}
     */
    isTrigram() {
        return this.tokens.length === 3;
    }

    /**
     * Whether this n-gram is an empty placeholder (no tokens).
     * @return {boolean}
     */
    isNull() {
        return this.tokens.length === 0;
    }

    /**
     * The underlying token list (the same array instance, not a copy).
     * @return {Token[]}
     */
    getTokens() {
        return this.tokens;
    }

    /**
     * Human readable form of the n-gram; this is the same string as {@link Ngram#key}.
     * @return {string}
     */
    toString() {
        return this.key;
    }

    /**
     * Serializes the n-gram as an array holding each token's own JSON form.
     * @param verbose - forwarded to every token's `toJSON` (print full metadata)
     * @return {object}
     */
    toJSON(verbose = false) {
        const serialized = [];
        for (const token of this.tokens) {
            serialized.push(token.toJSON(verbose));
        }
        return serialized;
    }

    /**
     * Checks if two n-grams are equal: same token count and every token pair
     * passes the token's own `equals`.
     * @param {Ngram} ngram
     * @return {boolean}
     */
    equals(ngram) {
        const mine = this.tokens;
        const theirs = ngram.tokens;
        if (mine.length !== theirs.length) {
            return false;
        }
        for (const [index, token] of mine.entries()) {
            if (!token.equals(theirs[index])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Checks if two n-grams look the same: same token count and every token pair
     * passes the token's own `looksLike`.
     * @param {Ngram} ngram
     * @return {boolean}
     */
    looksLike(ngram) {
        const mine = this.tokens;
        const theirs = ngram.tokens;
        if (mine.length !== theirs.length) {
            return false;
        }
        for (const [index, token] of mine.entries()) {
            if (!token.looksLike(theirs[index])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Builds and stores both keys the first time it runs; later calls do nothing.
     *
     * Keys are lower-cased and have the form "n:<part>:<part>...".
     * An empty n-gram gets the key "n:" and the lemma key "n".
     */
    cacheKeys() {
        if (this.cachedKey !== undefined) {
            // already generated
            return;
        }
        const textParts = [];
        const lemmaParts = [];
        let everyTokenHasLemma = true;
        for (const token of this.tokens) {
            textParts.push(`${token.toString()}`);
            // TRICKY: lemma is not always available
            const { lemma } = token;
            if (lemma === "") {
                everyTokenHasLemma = false;
            }
            else {
                lemmaParts.push(`${lemma}`);
            }
        }
        const hasTokens = this.tokens.length > 0;
        this.cachedKey = hasTokens
            ? `n:${textParts.join(":")}`.toLowerCase()
            : "n:";
        // TRICKY: all tokens must have a lemma
        if (everyTokenHasLemma) {
            this.cachedLemmaKey = hasTokens
                ? `n:${lemmaParts.join(":")}`.toLowerCase()
                : "n";
        }
    }
}
// Expose the Ngram class as this module's `default` export.
exports.default = Ngram;