UNPKG

wordmap

Version:
220 lines (219 loc) 5.92 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); /** * Represents a set of zero or more tokens from a text. */ class Ngram { /** * @param {Array<Token>} [tokens=[]] - a list of tokens of which this n-gram is composed */ constructor(tokens = []) { this.occurrence = 1; this.occurrences = 1; this.tokens = tokens; } /** * Returns the length of the n-gram in {@link Token}'s * @return {number} */ get tokenLength() { return this.tokens.length; } /** * Returns the length of the n-gram in characters. * This does not account for whitespace. * @return {number} */ get characterLength() { let length = 0; for (let i = 0, len = this.tokens.length; i < len; i++) { length += this.tokens[i].toString().length; } return length; } /** * Returns the position (in units of {@link Token} ) at which this n-gram appears in the sentence. * @return {number} - the position */ get tokenPosition() { if (this.tokens.length) { return this.tokens[0].position; } else { return 0; } } /** * Returns the length of the sentence (in units of {@link Token}) in which this n-gram occurs. * @return {number} */ get sentenceTokenLength() { if (this.tokens.length) { return this.tokens[0].sentenceTokenLength; } else { return 0; } } /** * Returns the length of the sentence (in units of character) in which this n-gram occurs. * This includes whitespace in the sentence * @return {number} */ get sentenceCharacterLength() { if (this.tokens.length) { return this.tokens[0].sentenceCharacterLength; } else { return 0; } } /** * Returns the position (in units of character) at which this n-gram appears in the sentence. * @return {number} - the position */ get characterPosition() { if (this.tokens.length) { return this.tokens[0].charPosition; } else { return 0; } } /** * Returns the n-gram key */ get key() { this.cacheKeys(); return this.cachedKey; } /** * Returns the n-gram lemma-based key */ get lemmaKey() { this.cacheKeys(); return this.cachedLemmaKey; } /** * Checks if this n-gram contains one token * @return {boolean} */ isUnigram() { return this.tokens.length === 1; } /** * Checks if this n-gram contains two tokens * @return {boolean} */ isBigram() { return this.tokens.length === 2; } /** * Checks if this n-gram contains three tokens * @return {boolean} */ isTrigram() { return this.tokens.length === 3; } /** * Checks if this n-grams is an empty placeholder * @return {boolean} */ isNull() { return this.tokens.length === 0; } /** * Returns the tokens in this n-gram * @return {Token[]} */ getTokens() { return this.tokens; } /** * Returns a human readable form of the n-gram * @return {string} */ toString() { return this.key; } /** * Outputs the n-gram to json * @param verbose - print full metadata * @return {object} */ toJSON(verbose = false) { const json = []; for (let i = 0, len = this.tokens.length; i < len; i++) { json.push(this.tokens[i].toJSON(verbose)); } return json; } /** * Checks if two n-grams are equal * @param {Ngram} ngram * @return {boolean} */ equals(ngram) { if (this.tokens.length === ngram.tokens.length) { // check if tokens are equal for (let i = 0, len = this.tokens.length; i < len; i++) { if (!this.tokens[i].equals(ngram.tokens[i])) { return false; } } return true; } return false; } /** * Checks if two n-grams look the same * @param {Ngram} ngram * @return {boolean} */ looksLike(ngram) { if (this.tokens.length === ngram.tokens.length) { // check if tokens are equal for (let i = 0, len = this.tokens.length; i < len; i++) { if (!this.tokens[i].looksLike(ngram.tokens[i])) { return false; } } return true; } return false; } /** * Caches the keys if they have not already been generated */ cacheKeys() { if (this.cachedKey === undefined) { let defaultKey = "n:"; let lemmaKey = "n:"; let missingLemma = false; const numTokens = this.tokens.length; for (let i = 0; i < numTokens; i++) { const token = this.tokens[i]; defaultKey += token.toString() + ":"; // TRICKY: lemma is not always available const lemma = token.lemma; if (lemma !== "") { lemmaKey += lemma + ":"; } else { missingLemma = true; } } if (numTokens > 0) { this.cachedKey = defaultKey.slice(0, -1).toLowerCase(); } else { this.cachedKey = defaultKey; } // TRICKY: all tokens must have a lemma if (lemmaKey.length > 0 && !missingLemma) { this.cachedLemmaKey = lemmaKey.slice(0, -1).toLowerCase(); } } } } exports.default = Ngram;