// wordmap: Multi-Lingual Word Alignment Prediction
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const Algorithm_1 = require("./Algorithm");
const GlobalAlgorithm_1 = require("./GlobalAlgorithm");
const AlignmentMemoryIndex_1 = require("./index/AlignmentMemoryIndex");
const CorpusIndex_1 = require("./index/CorpusIndex");
const UnalignedSentenceIndex_1 = require("./index/UnalignedSentenceIndex");
const Parser_1 = require("./Parser");
const Alignment_1 = require("./structures/Alignment");
const Ngram_1 = require("./structures/Ngram");
const Prediction_1 = require("./structures/Prediction");
const Suggestion_1 = require("./structures/Suggestion");
/**
* Represents a multi-lingual word alignment prediction engine.
*/
class Engine {
constructor({ sourceNgramLength = 3, targetNgramLength = 3, nGramWarnings = true } = {}) {
this.registeredAlgorithms = [];
this.registeredGlobalAlgorithms = [];
this.maxSourceNgramLength = sourceNgramLength;
this.maxTargetNgramLength = targetNgramLength;
this.nGramWarnings = nGramWarnings;
this.corpusIndex = new CorpusIndex_1.default();
this.alignmentMemoryIndex = new AlignmentMemoryIndex_1.default();
}
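// Illustrative sketch (not part of the original source; values are hypothetical):
// constructing an engine with custom n-gram limits. The option names mirror the
// destructured constructor parameters above.
//
//   const engine = new Engine({
//     sourceNgramLength: 4,
//     targetNgramLength: 4,
//     nGramWarnings: false
//   });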
/**
* Generates an array of all possible alignment predictions
* @param {Ngram[]} sourceNgrams - every possible n-gram in the source text
* @param {Ngram[]} targetNgrams - every possible n-gram in the target text
* @return {Prediction[]}
*/
static generatePredictions(sourceNgrams, targetNgrams) {
const predictions = [];
for (const source of sourceNgrams) {
for (const target of targetNgrams) {
predictions.push(new Prediction_1.default(new Alignment_1.default(source, target)));
}
// TRICKY: include empty match alignment
predictions.push(new Prediction_1.default(new Alignment_1.default(source, new Ngram_1.default())));
}
return predictions;
}
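// Note (added for clarity): for m source n-grams and n target n-grams the loops
// above produce m * (n + 1) predictions, since each source n-gram is also paired
// with an empty target n-gram (a "null" alignment). E.g. 2 source n-grams and
// 3 target n-grams yield 8 predictions.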
/**
* Generates an array of all possible contiguous n-grams within the sentence.
* @deprecated use {@link Parser.ngrams} instead
* @param {Array<Token>} sentence - the tokens in a sentence
* @param {number} [maxNgramLength=3] - the maximum n-gram size to generate
* @returns {any[]}
*/
static generateSentenceNgrams(sentence, maxNgramLength = 3) {
if (maxNgramLength < 0) {
throw new RangeError(`Maximum n-gram size cannot be less than 0. Received ${maxNgramLength}`);
}
const ngrams = [];
const maxLength = Math.min(maxNgramLength, sentence.length);
for (let ngramLength = 1; ngramLength <= maxLength; ngramLength++) {
ngrams.push.apply(ngrams, Parser_1.default.sizedNgrams(sentence, ngramLength));
}
return ngrams;
}
/**
* Returns an array of n-grams of a particular size from a sentence
* @deprecated use {@link Parser.sizedNgrams} instead
* @param {Array<Token>} sentence - the sentence from which n-grams will be read
* @param {number} ngramLength - the length of each n-gram.
* @returns {Array<Ngram>}
*/
static readSizedNgrams(sentence, ngramLength) {
const ngrams = [];
const sentenceLength = sentence.length;
for (let pos = 0; pos < sentenceLength; pos++) {
const end = pos + ngramLength;
if (end > sentenceLength) {
break;
}
const ngram = new Ngram_1.default(sentence.slice(pos, end));
ngrams.push(ngram);
}
return ngrams;
}
/**
* Calculates the weighted confidence score of a prediction
* @param {Prediction} prediction - the prediction to score
* @param {string[]} scoreKeys - the score keys to include in the calculation
* @param {NumberObject} weights - the weights to influence the calculation
* @return {number}
*/
static calculateWeightedConfidence(prediction, scoreKeys, weights) {
let weightSum = 0;
let scoreSum = 0;
for (const key of scoreKeys) {
let weight = 1;
if (key in weights) {
weight = weights[key];
}
// if (prediction.hasScore(key)) {
scoreSum += prediction.getScore(key) * weight;
weightSum += weight;
// }
}
return scoreSum / weightSum;
}
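// Worked example (hypothetical values): for scoreKeys ["a", "b"], weights {a: 2}
// and prediction scores {a: 0.5, b: 0.9}, the result is
// (0.5 * 2 + 0.9 * 1) / (2 + 1) ≈ 0.633. Keys absent from `weights` default to
// a weight of 1, as in the loop above.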
/**
* Calculates the final confidence score for each prediction.
* TODO: this should not be done in the engine because we don't know anything about the algorithms here.
* @param {Prediction[]} predictions - the predictions to score
* @param {AlignmentMemoryIndex} saIndex - the alignment memory index
* @return {Prediction[]}
*/
static calculateConfidence(predictions, saIndex) {
const finalPredictions = [];
const weights = {
"alignmentPosition": 0.7,
"sourceNgramLength": 0.2,
"characterLength": 0.3,
"alignmentOccurrences": 0.4,
"lemmaAlignmentOccurrences": 0.4,
"uniqueness": 0.5,
"lemmaUniqueness": 0.5,
"sourceCorpusPermutationsFrequencyRatio": 0.7,
"sourceCorpusLemmaPermutationsFrequencyRatio": 0.7,
"targetCorpusPermutationsFrequencyRatio": 0.7,
"targetCorpusLemmaPermutationsFrequencyRatio": 0.7,
"sourceAlignmentMemoryFrequencyRatio": 0.7,
"sourceAlignmentMemoryLemmaFrequencyRatio": 0.7,
"targetAlignmentMemoryFrequencyRatio": 0.7,
"targetAlignmentMemoryLemmaFrequencyRatio": 0.7
};
for (const p of predictions) {
let isAlignmentMemory = saIndex.alignmentFrequency.read(p.alignment);
// TRICKY: fall back to lemma
if (!isAlignmentMemory && p.alignment.lemmaKey !== undefined) {
isAlignmentMemory = saIndex.alignmentFrequency.read(p.alignment.lemmaKey);
}
// confidence based on corpus
const corpusWeightedKeys = [
"sourceCorpusPermutationsFrequencyRatio",
"sourceCorpusLemmaPermutationsFrequencyRatio",
"targetCorpusPermutationsFrequencyRatio",
"targetCorpusLemmaPermutationsFrequencyRatio",
"alignmentPosition",
"ngramLength",
"characterLength",
"alignmentOccurrences",
"lemmaAlignmentOccurrences",
"uniqueness",
"lemmaUniqueness"
];
const corpusConfidence = Engine.calculateWeightedConfidence(p, corpusWeightedKeys, weights);
// confidence based on alignment memory
const alignmentMemoryWeightedKeys = [
"sourceAlignmentMemoryFrequencyRatio",
"sourceAlignmentMemoryLemmaFrequencyRatio",
"targetAlignmentMemoryFrequencyRatio",
"targetAlignmentMemoryLemmaFrequencyRatio",
"alignmentPosition",
"ngramLength",
"characterLength",
"alignmentOccurrences",
"lemmaAlignmentOccurrences",
"uniqueness",
"lemmaUniqueness"
];
let confidence = Engine.calculateWeightedConfidence(p, alignmentMemoryWeightedKeys, weights);
// prefer to use the saved alignment confidence
if (!isAlignmentMemory) {
confidence = corpusConfidence;
confidence *= p.getScore("phrasePlausibility");
// TODO: lemmaPhrasePlausibility
}
// boost confidence for alignment memory
if (isAlignmentMemory) {
confidence++;
}
p.setScore("confidence", confidence);
finalPredictions.push(p);
}
return finalPredictions;
}
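// Consequence of the scoring above (noted for clarity): a prediction found in the
// alignment memory keeps its memory-weighted confidence and is boosted by 1, so
// memory-backed predictions generally outrank corpus-only predictions, whose
// confidence is additionally scaled by the "phrasePlausibility" score.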
/**
* Returns an array of alignment suggestions
* @param {Prediction[]} predictions - the predictions on which to base the suggestions
* @param {number} [maxSuggestions=1] - the maximum number of suggestions to return
* @return {Suggestion[]}
*/
static suggest(predictions, maxSuggestions = 1) {
const suggestions = [];
// build suggestions
for (let i = 0; i < maxSuggestions; i++) {
if (i >= predictions.length) {
break;
}
const suggestion = new Suggestion_1.default();
let filtered = [...predictions];
// TRICKY: sequentially pick the best starting point in descending order
const best = filtered.splice(i, 1)[0];
suggestion.addPrediction(best);
filtered = filtered.filter((p) => {
return !best.intersects(p);
});
// fill suggestion
while (filtered.length) {
const nextBest = filtered.shift();
if (nextBest === undefined) {
break;
}
suggestion.addPrediction(nextBest);
filtered = filtered.filter((p) => {
return !nextBest.intersects(p);
});
}
suggestions.push(suggestion);
}
return Engine.sortSuggestions(suggestions);
}
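// Illustrative sketch (variables are hypothetical): with predictions already
// sorted by confidence, maxSuggestions = 2 yields one suggestion seeded from the
// best prediction and one seeded from the second best, each filled greedily with
// the remaining non-intersecting predictions.
//
//   const sorted = Engine.sortPredictions(scoredPredictions);
//   const suggestions = Engine.suggest(sorted, 2);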
/**
* Sorts an array of suggestions by compound confidence
* @param {Suggestion[]} suggestions - the suggestions to sort
* @return {Suggestion[]}
*/
static sortSuggestions(suggestions) {
return suggestions.sort((a, b) => {
const aConfidence = a.compoundConfidence();
const bConfidence = b.compoundConfidence();
if (aConfidence < bConfidence) {
return 1;
}
if (aConfidence > bConfidence) {
return -1;
}
return 0;
});
}
/**
* Sorts an array of predictions by confidence
* @param {Prediction[]} predictions - the predictions to sort
* @return {Prediction[]}
*/
static sortPredictions(predictions) {
return predictions.sort((a, b) => {
const aConfidence = a.getScore("confidence");
const bConfidence = b.getScore("confidence");
if (aConfidence < bConfidence) {
return 1;
}
if (aConfidence > bConfidence) {
return -1;
}
return 0;
});
}
/**
* Returns a list of algorithms that are registered in the engine
* @return {Array<Algorithm>}
*/
get algorithms() {
return this.registeredAlgorithms;
}
/**
* Executes prediction algorithms on the unaligned sentence pair.
* The sentence tokens should contain positional metrics for better accuracy.
*
* @param {Token[]} sourceSentence - the source sentence tokens.
* @param {Token[]} targetSentence - the target sentence tokens.
* @param {CorpusIndex} cIndex
* @param {AlignmentMemoryIndex} saIndex
* @param {Algorithm[]} algorithms
* @param {GlobalAlgorithm[]} globalAlgorithms
* @return {Prediction[]}
*/
performPrediction(sourceSentence, targetSentence, cIndex, saIndex, algorithms, globalAlgorithms) {
const sourceNgrams = Parser_1.default.ngrams(sourceSentence, this.maxSourceNgramLength);
const targetNgrams = Parser_1.default.ngrams(targetSentence, this.maxTargetNgramLength);
// generate alignment permutations
let predictions = Engine.generatePredictions(sourceNgrams, targetNgrams);
const numPredictions = predictions.length;
const sentenceIndex = new UnalignedSentenceIndex_1.default();
sentenceIndex.append([sourceSentence], [targetSentence], this.maxSourceNgramLength, this.maxTargetNgramLength);
// run global algorithms first
for (const algorithm of globalAlgorithms) {
predictions = algorithm.execute(predictions, cIndex, saIndex, sentenceIndex);
}
// run regular algorithms
const numAlgorithms = algorithms.length;
for (let i = 0; i < numPredictions; i++) {
for (let j = 0; j < numAlgorithms; j++) {
algorithms[j].execute(predictions[i], cIndex, saIndex, sentenceIndex);
}
}
return predictions;
}
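// Note (added for clarity): global algorithms receive, and may replace, the whole
// prediction set, while regular algorithms are executed against each prediction
// individually and are expected to score it in place (their return value is
// discarded here).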
/**
* Generates the final confidence scores and sorts the predictions.
* @param {Prediction[]} predictions
* @return {Prediction[]}
*/
score(predictions) {
const results = Engine.calculateConfidence(predictions, this.alignmentMemoryIndex);
return Engine.sortPredictions(results);
}
/**
* Adds a new algorithm to the engine.
* @param {Algorithm} algorithm - the algorithm to run with the engine.
*/
registerAlgorithm(algorithm) {
if (algorithm instanceof GlobalAlgorithm_1.default) {
this.registeredGlobalAlgorithms.push(algorithm);
}
else if (algorithm instanceof Algorithm_1.default) {
this.registeredAlgorithms.push(algorithm);
}
else {
throw new Error("Unsupported algorithm type");
}
}
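// Illustrative usage (the algorithm instance is hypothetical): instances of
// GlobalAlgorithm subclasses are routed to the global list, while other Algorithm
// subclasses go to the per-prediction list.
//
//   engine.registerAlgorithm(myAlgorithm);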
/**
* Appends a new corpus to the engine.
* @param {Token[][]} source - an array of tokenized source sentences.
* @param {Token[][]} target - an array of tokenized target sentences.
*/
addCorpus(source, target) {
this.corpusIndex.append(source, target, this.maxSourceNgramLength, this.maxTargetNgramLength);
}
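// Illustrative usage (hypothetical variables): source and target are parallel
// arrays of tokenized sentences.
//
//   engine.addCorpus(sourceSentences, targetSentences);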
/**
* Appends new alignment memory to the engine.
* Adding alignment memory improves the quality of predictions.
* @param {Array<Alignment>} alignmentMemory - a list of alignments
*/
addAlignmentMemory(alignmentMemory) {
// TODO: we need a better way for the calling program to query the number of n-grams that exceed the limit
if (this.nGramWarnings) {
for (let i = alignmentMemory.length - 1; i >= 0; i--) {
const target = alignmentMemory[i].targetNgram;
if (target.tokenLength > this.maxTargetNgramLength) {
console.warn(`Target Alignment Memory "${target.key}" exceeds maximum n-gram length of ${this.maxTargetNgramLength} and may be ignored.`);
}
const source = alignmentMemory[i].sourceNgram;
if (source.tokenLength > this.maxSourceNgramLength) {
console.warn(`Source Alignment Memory "${source.key}" exceeds maximum n-gram length of ${this.maxSourceNgramLength} and may be ignored.`);
}
}
}
this.alignmentMemoryIndex.append(alignmentMemory);
}
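// Example warning trigger (the alignment itself is hypothetical): with the default
// maxTargetNgramLength of 3, an alignment whose target n-gram spans 4 tokens would
// log the "exceeds maximum n-gram length" warning above when nGramWarnings is
// enabled.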
/**
* Performs the prediction calculations
* @param {Token[]} sourceSentence
* @param {Token[]} targetSentence
* @return {Prediction[]}
*/
run(sourceSentence, targetSentence) {
return this.performPrediction(sourceSentence, targetSentence, this.corpusIndex, this.alignmentMemoryIndex, this.registeredAlgorithms, this.registeredGlobalAlgorithms);
}
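// Typical end-to-end flow (a sketch assuming tokenized Token[] sentences and at
// least one registered algorithm; variable names are hypothetical):
//
//   const predictions = engine.run(sourceTokens, targetTokens);
//   const scored = engine.score(predictions);
//   const [bestSuggestion] = Engine.suggest(scored, 1);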
}
exports.default = Engine;