// wordmap
// Version:
// Multi-Lingual Word Alignment Prediction
// 61 lines (60 loc) • 2.06 kB
// TypeScript
import { Token } from "wordmap-lexer";
import Ngram from "../structures/Ngram";
import NgramIndex from "./NgramIndex";
/**
* A collection of indexes on the static content.
*
* This declaration exposes, for a source side and a target side, a read-only
* n-gram frequency index plus token and character length totals, and a single
* method, `addSentence`, for feeding sentences into the index.
*
* TODO: consider splitting this into sentences as well,
* e.g. a source SentenceIndex and a target SentenceIndex,
* so that it could be reused in other places such as word-mt.
*/
export default class StaticIndex {
// Private state. No types are declared for these members here.
// NOTE(review): judging by the names, these presumably back the public
// read-only properties below (src* -> source*, tgt* -> target*) — confirm
// against the implementation, which is not visible in this declaration.
private srcNgramFreqIndex;
private srcTokenLength;
private srcCharLength;
private tgtNgramFreqIndex;
private tgtTokenLength;
private tgtCharLength;
/**
* An index of source n-gram frequencies in the corpus.
*/
readonly sourceNgramFrequency: NgramIndex;
/**
* An index of target n-gram frequencies in the corpus.
*/
readonly targetNgramFrequency: NgramIndex;
/**
* The {@link Token} length of the entire source.
*/
readonly sourceTokenLength: number;
/**
* The {@link Token} length of the entire target.
*/
readonly targetTokenLength: number;
/**
* The character length of the entire source.
*/
readonly sourceCharacterLength: number;
/**
* The character length of the entire target.
*
* NOTE(review): this property is named `targetCharLength`, while its
* source-side counterpart is `sourceCharacterLength`. The asymmetry is part
* of the public API, so callers must use these exact names.
*/
readonly targetCharLength: number;
/**
* Creates a new index. Takes no arguments.
* NOTE(review): presumably the index starts out empty — confirm against the
* implementation.
*/
constructor();
/**
* Adds a sentence to the index.
* The tokens in these n-grams must be measured for accurate positional metrics.
* The n-grams are passed as arguments instead of being generated internally to reduce
* duplicating work.
*
* @param sourceTokens - the source sentence tokens
* @param targetTokens - the target sentence tokens
* @param sourceNgrams - the source sentence n-grams
* @param targetNgrams - the target sentence n-grams
*/
addSentence(sourceTokens: Token[], targetTokens: Token[], sourceNgrams: Ngram[], targetNgrams: Ngram[]): void;
}