/*
 * wordmap
 * Version:
 * Multi-Lingual Word Alignment Prediction
 * 93 lines (92 loc) • 3.07 kB
 * JavaScript
 */
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const NgramIndex_1 = require("./NgramIndex");
/**
 * A collection of indexes on the static content.
 *
 * Accumulates, over every sentence passed to `addSentence`:
 * - an n-gram frequency index for the source and one for the target,
 * - a running token count for the source and for the target,
 * - a running character count for the source and for the target.
 *
 * TODO: maybe I should split this into sentences as well.
 * e.g. a source SentenceIndex and a target SentenceIndex
 * Then we could reuse it in other places such as word-mt.
 */
class StaticIndex {
    /**
     * Creates an empty index: two fresh n-gram indexes and all counters at 0.
     */
    constructor() {
        this.srcNgramFreqIndex = new NgramIndex_1.default();
        this.tgtNgramFreqIndex = new NgramIndex_1.default();
        this.srcTokenLength = 0;
        this.tgtTokenLength = 0;
        this.srcCharLength = 0;
        this.tgtCharLength = 0;
    }
    /**
     * Returns an index of source n-gram frequencies in the corpus
     * @return {NgramIndex}
     */
    get sourceNgramFrequency() {
        return this.srcNgramFreqIndex;
    }
    /**
     * Returns an index of target n-gram frequencies in the corpus
     * @return {NgramIndex}
     */
    get targetNgramFrequency() {
        return this.tgtNgramFreqIndex;
    }
    /**
     * Returns the {@link Token} length of the entire source
     * @return {number}
     */
    get sourceTokenLength() {
        return this.srcTokenLength;
    }
    /**
     * Returns the {@link Token} length of the entire target
     * @return {number}
     */
    get targetTokenLength() {
        return this.tgtTokenLength;
    }
    /**
     * Returns the character length of the entire source
     * @return {number}
     */
    get sourceCharacterLength() {
        return this.srcCharLength;
    }
    /**
     * Alias of `sourceCharacterLength`, provided so the source getter has a
     * name symmetric with `targetCharLength`.
     * @return {number}
     */
    get sourceCharLength() {
        return this.srcCharLength;
    }
    /**
     * Returns the character length of the entire target
     * @return {number}
     */
    get targetCharLength() {
        return this.tgtCharLength;
    }
    /**
     * Alias of `targetCharLength`, provided so the target getter has a
     * name symmetric with `sourceCharacterLength`.
     * @return {number}
     */
    get targetCharacterLength() {
        return this.tgtCharLength;
    }
    /**
     * Adds a sentence to the index.
     * The tokens in these n-grams must be measured for accurate positional metrics.
     * The n-grams are passed as arguments instead of being generated internally to reduce
     * duplicating work.
     *
     * Every call is cumulative: the token counts, character counts and n-gram
     * frequencies are added on top of whatever was indexed before.
     *
     * @param sourceTokens - the source sentence tokens
     * @param targetTokens - the target sentence tokens
     * @param sourceNgrams - the source sentence n-grams
     * @param targetNgrams - the target sentence n-grams
     */
    addSentence(sourceTokens, targetTokens, sourceNgrams, targetNgrams) {
        // token length
        this.srcTokenLength += sourceTokens.length;
        this.tgtTokenLength += targetTokens.length;
        // character length: the sum of each token's `toString().length`.
        // NOTE(review): assuming toString() returns a string, this counts
        // UTF-16 code units, not code points or graphemes.
        for (let i = 0, len = sourceTokens.length; i < len; i++) {
            this.srcCharLength += sourceTokens[i].toString().length;
        }
        for (let i = 0, len = targetTokens.length; i < len; i++) {
            this.tgtCharLength += targetTokens[i].toString().length;
        }
        // n-gram frequency: one increment() call per n-gram occurrence
        for (let i = 0, len = sourceNgrams.length; i < len; i++) {
            this.srcNgramFreqIndex.increment(sourceNgrams[i]);
        }
        for (let i = 0, len = targetNgrams.length; i < len; i++) {
            this.tgtNgramFreqIndex.increment(targetNgrams[i]);
        }
    }
}
exports.default = StaticIndex;