wordmap
Version:
Multi-Lingual Word Alignment Prediction
92 lines (91 loc) • 3.82 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
const Alignment_1 = require("./structures/Alignment");
const Ngram_1 = require("./structures/Ngram");
/**
 * A collection of static parsing helpers that turn a tokenized sentence into
 * n-grams and pair n-grams from two texts into candidate alignments.
 */
class Parser {
    /**
     * Returns every contiguous n-gram of exactly `ngramLength` tokens, in the
     * order they start within the sentence.
     *
     * Each returned n-gram has its `occurrence` property set to a running count:
     * 1 for the first n-gram with a given `key` in this sentence, 2 for the
     * second, and so on.
     *
     * @param {Array<Token>} sentence - the sentence from which n-grams will be read
     * @param {number} ngramLength - the length of each n-gram.
     * @returns {Array<Ngram>}
     */
    static sizedNgrams(sentence, ngramLength) {
        const ngrams = [];
        const sentenceLength = sentence.length;
        // A Map is used instead of a plain object because `key in {}` also
        // matches inherited names such as "constructor" or "__proto__", which
        // would corrupt the count for n-grams whose key equals one of them.
        const occurrences = new Map();
        for (let pos = 0; pos < sentenceLength; pos++) {
            const end = pos + ngramLength;
            if (end > sentenceLength) {
                // not enough tokens left for another full-sized n-gram
                break;
            }
            const ngram = new Ngram_1.default(sentence.slice(pos, end));
            // measure occurrence
            const seen = (occurrences.get(ngram.key) || 0) + 1;
            occurrences.set(ngram.key, seen);
            // TRICKY: the total occurrences is measured in the CorpusIndex.
            ngram.occurrence = seen;
            ngrams.push(ngram);
        }
        return ngrams;
    }
    /**
     * Generates an array of all possible contiguous n-grams within the sentence.
     * N-grams are grouped by size: all 1-grams first, then all 2-grams, up to
     * the smaller of `maxNgramLength` and the sentence length.
     *
     * @param {Array<Token>} sentence - the tokens in a sentence
     * @param {number} [maxNgramLength=3] - the maximum n-gram size to generate
     * @returns {Ngram[]}
     * @throws {RangeError} if `maxNgramLength` is less than 0
     */
    static ngrams(sentence, maxNgramLength = 3) {
        if (maxNgramLength < 0) {
            throw new RangeError(`Maximum n-gram size cannot be less than 0. Received ${maxNgramLength}`);
        }
        const ngrams = [];
        const maxLength = Math.min(maxNgramLength, sentence.length);
        for (let ngramLength = 1; ngramLength <= maxLength; ngramLength++) {
            // Append one at a time: passing a whole array as call arguments
            // (push.apply) can exceed the engine's argument limit on long input.
            for (const ngram of Parser.sizedNgrams(sentence, ngramLength)) {
                ngrams.push(ngram);
            }
        }
        return ngrams;
    }
    /**
     * Generates an array of all possible alignments between two sets of n-grams.
     * For each source n-gram, one alignment per target n-gram is produced,
     * followed by one alignment against an n-gram constructed with no tokens.
     *
     * @deprecated use {@link indexAlignmentPermutations} instead (it's faster).
     * @param {Ngram[]} sourceNgrams - every possible n-gram in the source text
     * @param {Ngram[]} targetNgrams - every possible n-gram in the target text
     * @return {Alignment[]}
     */
    static alignments(sourceNgrams, targetNgrams) {
        const alignments = [];
        for (const source of sourceNgrams) {
            for (const target of targetNgrams) {
                alignments.push(new Alignment_1.default(source, target));
            }
            // TRICKY: include empty match alignment
            alignments.push(new Alignment_1.default(source, new Ngram_1.default()));
        }
        return alignments;
    }
    /**
     * Indexes all possible alignment permutations between two sets of n-grams.
     * Alignments are passed to `index.addAlignment` in the same order that
     * {@link alignments} would return them; nothing is returned.
     *
     * @param {Ngram[]} sourceNgrams - every possible n-gram in the source text
     * @param {Ngram[]} targetNgrams - every possible n-gram in the target text
     * @param {PermutationIndex} index - the index that will receive the permutations
     */
    static indexAlignmentPermutations(sourceNgrams, targetNgrams, index) {
        const tlen = targetNgrams.length;
        for (let s = 0, slen = sourceNgrams.length; s < slen; s++) {
            const sourceNgram = sourceNgrams[s];
            for (let t = 0; t < tlen; t++) {
                index.addAlignment(new Alignment_1.default(sourceNgram, targetNgrams[t]));
            }
            // TRICKY: include empty match alignment
            index.addAlignment(new Alignment_1.default(sourceNgram, new Ngram_1.default()));
        }
    }
}
exports.default = Parser;