wordmap

Version:

Multi-Lingual Word Alignment Prediction

61 lines (60 loc) • 2.06 kB

TypeScript

import { Token } from "wordmap-lexer";
import Ngram from "../structures/Ngram";
import NgramIndex from "./NgramIndex";

/**
 * A collection of indexes on the static content.
 *
 * This is a declaration only: the constructor and `addSentence` have no
 * bodies here, so how the indexes are populated is defined elsewhere.
 *
 * TODO: consider splitting this into sentence-level indexes as well,
 * e.g. a source SentenceIndex and a target SentenceIndex, so that it
 * could be reused in other places such as word-mt.
 */
export default class StaticIndex {
  // Private members. Their types are not given in this declaration.
  // NOTE(review): presumably the backing state for the public readonly
  // properties below (src* -> source*, tgt* -> target*) — confirm against
  // the implementation.
  private srcNgramFreqIndex;
  private srcTokenLength;
  private srcCharLength;
  private tgtNgramFreqIndex;
  private tgtTokenLength;
  private tgtCharLength;

  /**
   * An index of source n-gram frequencies in the corpus.
   */
  readonly sourceNgramFrequency: NgramIndex;

  /**
   * An index of target n-gram frequencies in the corpus.
   */
  readonly targetNgramFrequency: NgramIndex;

  /**
   * The {@link Token} length of the entire source.
   */
  readonly sourceTokenLength: number;

  /**
   * The {@link Token} length of the entire target.
   */
  readonly targetTokenLength: number;

  /**
   * The character length of the entire source.
   *
   * NOTE(review): this property is spelled `sourceCharacterLength` while its
   * target counterpart is `targetCharLength`; the names are not symmetric.
   * Renaming either one would change the public interface.
   */
  readonly sourceCharacterLength: number;

  /**
   * The character length of the entire target.
   */
  readonly targetCharLength: number;

  /**
   * Creates a new index. Takes no arguments.
   */
  constructor();

  /**
   * Adds a sentence to the index.
   * The tokens in these n-grams must be measured for accurate positional metrics.
   * The n-grams are passed as arguments instead of being generated internally
   * to avoid duplicating work.
   *
   * @param sourceTokens - the source sentence tokens
   * @param targetTokens - the target sentence tokens
   * @param sourceNgrams - the source sentence n-grams
   * @param targetNgrams - the target sentence n-grams
   * @returns nothing
   */
  addSentence(sourceTokens: Token[], targetTokens: Token[], sourceNgrams: Ngram[], targetNgrams: Ngram[]): void;
}