// wordmap: Multi-Lingual Word Alignment Prediction
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const Algorithm_1 = require("./Algorithm");
const GlobalAlgorithm_1 = require("./GlobalAlgorithm");
const AlignmentMemoryIndex_1 = require("./index/AlignmentMemoryIndex");
const CorpusIndex_1 = require("./index/CorpusIndex");
const UnalignedSentenceIndex_1 = require("./index/UnalignedSentenceIndex");
const Parser_1 = require("./Parser");
const Alignment_1 = require("./structures/Alignment");
const Ngram_1 = require("./structures/Ngram");
const Prediction_1 = require("./structures/Prediction");
const Suggestion_1 = require("./structures/Suggestion");
/**
* Represents a multi-lingual word alignment prediction engine.
*/
class Engine {
constructor({ sourceNgramLength = 3, targetNgramLength = 3, nGramWarnings = true } = {}) {
this.registeredAlgorithms = [];
this.registeredGlobalAlgorithms = [];
this.maxSourceNgramLength = sourceNgramLength;
this.maxTargetNgramLength = targetNgramLength;
this.nGramWarnings = nGramWarnings;
this.corpusIndex = new CorpusIndex_1.default();
this.alignmentMemoryIndex = new AlignmentMemoryIndex_1.default();
}
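// Illustrative sketch (not part of the original source; values are hypothetical):
// constructing an engine with custom n-gram limits. The option names mirror the
// destructured constructor parameters above.
//
//   const engine = new Engine({
//     sourceNgramLength: 4,
//     targetNgramLength: 4,
//     nGramWarnings: false
//   });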
/**
* Generates an array of all possible alignment predictions
* @param {Ngram[]} sourceNgrams - every possible n-gram in the source text
* @param {Ngram[]} targetNgrams - every possible n-gram in the target text
* @return {Prediction[]}
*/
static generatePredictions(sourceNgrams, targetNgrams) {
const predictions = [];
for (const source of sourceNgrams) {
for (const target of targetNgrams) {
predictions.push(new Prediction_1.default(new Alignment_1.default(source, target)));
}
// TRICKY: include empty match alignment
predictions.push(new Prediction_1.default(new Alignment_1.default(source, new Ngram_1.default())));
}
return predictions;
}
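// Note (added for clarity): for m source n-grams and n target n-grams the loops
// above produce m * (n + 1) predictions, since each source n-gram is also paired
// with an empty target n-gram (a "null" alignment). E.g. 2 source n-grams and
// 3 target n-grams yield 8 predictions.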
/**
* Generates an array of all possible contiguous n-grams within the sentence.
* @deprecated use {@link Parser.ngrams} instead
* @param {Array<Token>} sentence - the tokens in a sentence
* @param {number} [maxNgramLength=3] - the maximum n-gram size to generate
* @returns {any[]}
*/
static generateSentenceNgrams(sentence, maxNgramLength = 3) {
if (maxNgramLength < 0) {
throw new RangeError(`Maximum n-gram size cannot be less than 0. Received ${maxNgramLength}`);
}
const ngrams = [];
const maxLength = Math.min(maxNgramLength, sentence.length);
for (let ngramLength = 1; ngramLength <= maxLength; ngramLength++) {
ngrams.push.apply(ngrams, Parser_1.default.sizedNgrams(sentence, ngramLength));
}
return ngrams;
}
/**
* Returns an array of n-grams of a particular size from a sentence
* @deprecated use {@link Parser.sizedNgrams} instead
* @param {Array<Token>} sentence - the sentence from which n-grams will be read
* @param {number} ngramLength - the length of each n-gram.
* @returns {Array<Ngram>}
*/
static readSizedNgrams(sentence, ngramLength) {
const ngrams = [];
const sentenceLength = sentence.length;
for (let pos = 0; pos < sentenceLength; pos++) {
const end = pos + ngramLength;
if (end > sentenceLength) {
break;
}
const ngram = new Ngram_1.default(sentence.slice(pos, end));
ngrams.push(ngram);
}
return ngrams;
}
/**
* Calculates the weighted confidence score of a prediction
* @param {Prediction} prediction - the prediction to score
* @param {string[]} scoreKeys - the score keys to include in the calculation
* @param {NumberObject} weights - the weights to influence the calculation
* @return {number}
*/
static calculateWeightedConfidence(prediction, scoreKeys, weights) {
let weightSum = 0;
let scoreSum = 0;
for (const key of scoreKeys) {
let weight = 1;
if (key in weights) {
weight = weights[key];
}
// if (prediction.hasScore(key)) {
scoreSum += prediction.getScore(key) * weight;
weightSum += weight;
// }
}
return scoreSum / weightSum;
}
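// Worked example (hypothetical values): for scoreKeys ["a", "b"], weights {a: 2}
// and prediction scores {a: 0.5, b: 0.9}, the result is
// (0.5 * 2 + 0.9 * 1) / (2 + 1) ≈ 0.633. Keys absent from `weights` default to
// a weight of 1, as in the loop above.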
/**
* Calculates the final confidence score for each prediction.
* TODO: this should not be done in the engine because we don't know anything about the algorithms here.
* @param {Prediction[]} predictions - the predictions to score
* @param {AlignmentMemoryIndex} saIndex - the alignment memory index
* @return {Prediction[]}
*/
static calculateConfidence(predictions, saIndex) {
const finalPredictions = [];
const weights = {
"alignmentPosition": 0.7,
"sourceNgramLength": 0.2,
"characterLength": 0.3,
"alignmentOccurrences": 0.4,
"lemmaAlignmentOccurrences": 0.4,
"uniqueness": 0.5,
"lemmaUniqueness": 0.5,
"sourceCorpusPermutationsFrequencyRatio": 0.7,
"sourceCorpusLemmaPermutationsFrequencyRatio": 0.7,
"targetCorpusPermutationsFrequencyRatio": 0.7,
"targetCorpusLemmaPermutationsFrequencyRatio": 0.7,
"sourceAlignmentMemoryFrequencyRatio": 0.7,
"sourceAlignmentMemoryLemmaFrequencyRatio": 0.7,
"targetAlignmentMemoryFrequencyRatio": 0.7,
"targetAlignmentMemoryLemmaFrequencyRatio": 0.7
};
for (const p of predictions) {
let isAlignmentMemory = saIndex.alignmentFrequency.read(p.alignment);
// TRICKY: fall back to lemma
if (!isAlignmentMemory && p.alignment.lemmaKey !== undefined) {
isAlignmentMemory = saIndex.alignmentFrequency.read(p.alignment.lemmaKey);
}
// confidence based on corpus
const corpusWeightedKeys = [
"sourceCorpusPermutationsFrequencyRatio",
"sourceCorpusLemmaPermutationsFrequencyRatio",
"targetCorpusPermutationsFrequencyRatio",
"targetCorpusLemmaPermutationsFrequencyRatio",
"alignmentPosition",
"ngramLength",
"characterLength",
"alignmentOccurrences",
"lemmaAlignmentOccurrences",
"uniqueness",
"lemmaUniqueness"
];
const corpusConfidence = Engine.calculateWeightedConfidence(p, corpusWeightedKeys, weights);
// confidence based on alignment memory
const alignmentMemoryWeightedKeys = [
"sourceAlignmentMemoryFrequencyRatio",
"sourceAlignmentMemoryLemmaFrequencyRatio",
"targetAlignmentMemoryFrequencyRatio",
"targetAlignmentMemoryLemmaFrequencyRatio",
"alignmentPosition",
"ngramLength",
"characterLength",
"alignmentOccurrences",
"lemmaAlignmentOccurrences",
"uniqueness",
"lemmaUniqueness"
];
let confidence = Engine.calculateWeightedConfidence(p, alignmentMemoryWeightedKeys, weights);
// prefer to use the saved alignment confidence
if (!isAlignmentMemory) {
confidence = corpusConfidence;
confidence *= p.getScore("phrasePlausibility");
// TODO: lemmaPhrasePlausibility
}
// boost confidence for alignment memory
if (isAlignmentMemory) {
confidence++;
}
p.setScore("confidence", confidence);
finalPredictions.push(p);
}
return finalPredictions;
}
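// Consequence of the scoring above (noted for clarity): a prediction found in the
// alignment memory keeps its memory-weighted confidence and is boosted by 1, so
// memory-backed predictions generally outrank corpus-only predictions, whose
// confidence is additionally scaled by the "phrasePlausibility" score.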
/**
* Returns an array of alignment suggestions
* @param {Prediction[]} predictions - the predictions on which to base the suggestions
* @param {number} [maxSuggestions=1] - the maximum number of suggestions to return
* @return {Suggestion[]}
*/
static suggest(predictions, maxSuggestions = 1) {
const suggestions = [];
// build suggestions
for (let i = 0; i < maxSuggestions; i++) {
if (i >= predictions.length) {
break;
}
const suggestion = new Suggestion_1.default();
let filtered = [...predictions];
// TRICKY: sequentially pick the best starting point in descending order
const best = filtered.splice(i, 1)[0];
suggestion.addPrediction(best);
filtered = filtered.filter((p) => {
return !best.intersects(p);
});
// fill suggestion
while (filtered.length) {
const nextBest = filtered.shift();
if (nextBest === undefined) {
break;
}
suggestion.addPrediction(nextBest);
filtered = filtered.filter((p) => {
return !nextBest.intersects(p);
});
}
suggestions.push(suggestion);
}
return Engine.sortSuggestions(suggestions);
}
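// Illustrative sketch (variables are hypothetical): with predictions already
// sorted by confidence, maxSuggestions = 2 yields one suggestion seeded from the
// best prediction and one seeded from the second best, each filled greedily with
// the remaining non-intersecting predictions.
//
//   const sorted = Engine.sortPredictions(scoredPredictions);
//   const suggestions = Engine.suggest(sorted, 2);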
/**
* Sorts an array of suggestions by compound confidence
* @param {Suggestion[]} suggestions - the suggestions to sort
* @return {Suggestion[]}
*/
static sortSuggestions(suggestions) {
return suggestions.sort((a, b) => {
const aConfidence = a.compoundConfidence();
const bConfidence = b.compoundConfidence();
if (aConfidence < bConfidence) {
return 1;
}
if (aConfidence > bConfidence) {
return -1;
}
return 0;
});
}
/**
* Sorts an array of predictions by confidence
* @param {Prediction[]} predictions - the predictions to sort
* @return {Prediction[]}
*/
static sortPredictions(predictions) {
return predictions.sort((a, b) => {
const aConfidence = a.getScore("confidence");
const bConfidence = b.getScore("confidence");
if (aConfidence < bConfidence) {
return 1;
}
if (aConfidence > bConfidence) {
return -1;
}
return 0;
});
}
/**
* Returns a list of algorithms that are registered in the engine
* @return {Array<Algorithm>}
*/
get algorithms() {
return this.registeredAlgorithms;
}
/**
* Executes prediction algorithms on the unaligned sentence pair.
* The sentence tokens should contain positional metrics for better accuracy.
*
* @param {Token[]} sourceSentence - the source sentence tokens.
* @param {Token[]} targetSentence - the target sentence tokens.
* @param {CorpusIndex} cIndex
* @param {AlignmentMemoryIndex} saIndex
* @param {Algorithm[]} algorithms
* @param {GlobalAlgorithm[]} globalAlgorithms
* @return {Prediction[]}
*/
performPrediction(sourceSentence, targetSentence, cIndex, saIndex, algorithms, globalAlgorithms) {
const sourceNgrams = Parser_1.default.ngrams(sourceSentence, this.maxSourceNgramLength);
const targetNgrams = Parser_1.default.ngrams(targetSentence, this.maxTargetNgramLength);
// generate alignment permutations
let predictions = Engine.generatePredictions(sourceNgrams, targetNgrams);
const numPredictions = predictions.length;
const sentenceIndex = new UnalignedSentenceIndex_1.default();
sentenceIndex.append([sourceSentence], [targetSentence], this.maxSourceNgramLength, this.maxTargetNgramLength);
// run global algorithms first
for (const algorithm of globalAlgorithms) {
predictions = algorithm.execute(predictions, cIndex, saIndex, sentenceIndex);
}
// run regular algorithms
const numAlgorithms = algorithms.length;
for (let i = 0; i < numPredictions; i++) {
for (let j = 0; j < numAlgorithms; j++) {
algorithms[j].execute(predictions[i], cIndex, saIndex, sentenceIndex);
}
}
return predictions;
}
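// Note (added for clarity): global algorithms receive, and may replace, the whole
// prediction set, while regular algorithms are executed against each prediction
// individually and are expected to score it in place (their return value is
// discarded here).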
/**
* Generates the final confidence scores and sorts the predictions.
* @param {Prediction[]} predictions
* @return {Prediction[]}
*/
score(predictions) {
const results = Engine.calculateConfidence(predictions, this.alignmentMemoryIndex);
return Engine.sortPredictions(results);
}
/**
* Adds a new algorithm to the engine.
* @param {Algorithm} algorithm - the algorithm to run with the engine.
*/
registerAlgorithm(algorithm) {
if (algorithm instanceof GlobalAlgorithm_1.default) {
this.registeredGlobalAlgorithms.push(algorithm);
}
else if (algorithm instanceof Algorithm_1.default) {
this.registeredAlgorithms.push(algorithm);
}
else {
throw new Error("Unsupported algorithm type");
}
}
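// Illustrative usage (the algorithm instance is hypothetical): instances of
// GlobalAlgorithm subclasses are routed to the global list, while other Algorithm
// subclasses go to the per-prediction list.
//
//   engine.registerAlgorithm(myAlgorithm);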
/**
* Appends a new corpus to the engine.
* @param {Token[][]} source - an array of tokenized source sentences.
* @param {Token[][]} target - an array of tokenized target sentences.
*/
addCorpus(source, target) {
this.corpusIndex.append(source, target, this.maxSourceNgramLength, this.maxTargetNgramLength);
}
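// Illustrative usage (hypothetical variables): source and target are parallel
// arrays of tokenized sentences.
//
//   engine.addCorpus(sourceSentences, targetSentences);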
/**
* Appends new alignment memory to the engine.
* Adding alignment memory improves the quality of predictions.
* @param {Array<Alignment>} alignmentMemory - a list of alignments
*/
addAlignmentMemory(alignmentMemory) {
// TODO: we need a better way for the calling program to query the number of n-grams that exceed the limit
if (this.nGramWarnings) {
for (let i = alignmentMemory.length - 1; i >= 0; i--) {
const target = alignmentMemory[i].targetNgram;
if (target.tokenLength > this.maxTargetNgramLength) {
console.warn(`Target Alignment Memory "${target.key}" exceeds maximum n-gram length of ${this.maxTargetNgramLength} and may be ignored.`);
}
const source = alignmentMemory[i].sourceNgram;
if (source.tokenLength > this.maxSourceNgramLength) {
console.warn(`Source Alignment Memory "${source.key}" exceeds maximum n-gram length of ${this.maxSourceNgramLength} and may be ignored.`);
}
}
}
this.alignmentMemoryIndex.append(alignmentMemory);
}
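// Example warning trigger (the alignment itself is hypothetical): with the default
// maxTargetNgramLength of 3, an alignment whose target n-gram spans 4 tokens would
// log the "exceeds maximum n-gram length" warning above when nGramWarnings is
// enabled.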
/**
* Performs the prediction calculations
* @param {Token[]} sourceSentence
* @param {Token[]} targetSentence
* @return {Prediction[]}
*/
run(sourceSentence, targetSentence) {
return this.performPrediction(sourceSentence, targetSentence, this.corpusIndex, this.alignmentMemoryIndex, this.registeredAlgorithms, this.registeredGlobalAlgorithms);
}
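// Typical end-to-end flow (a sketch assuming tokenized Token[] sentences and at
// least one registered algorithm; variable names are hypothetical):
//
//   const predictions = engine.run(sourceTokens, targetTokens);
//   const scored = engine.score(predictions);
//   const [bestSuggestion] = Engine.suggest(scored, 1);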
}
exports.default = Engine;