/**
 * wordmap — Multi-Lingual Word Alignment Prediction
 * (compiled JavaScript output, ~163 lines / 6.97 kB)
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
const wordmap_lexer_1 = require("wordmap-lexer");
const AlignmentOccurrences_1 = require("./algorithms/AlignmentOccurrences");
const AlignmentPosition_1 = require("./algorithms/AlignmentPosition");
const AlignmentRelativeOccurrence_1 = require("./algorithms/AlignmentRelativeOccurrence");
const CharacterLength_1 = require("./algorithms/CharacterLength");
const LemmaNgramFrequency_1 = require("./algorithms/LemmaNgramFrequency");
const NgramFrequency_1 = require("./algorithms/NgramFrequency");
const NgramLength_1 = require("./algorithms/NgramLength");
const PhrasePlausibility_1 = require("./algorithms/PhrasePlausibility");
const Uniqueness_1 = require("./algorithms/Uniqueness");
const Engine_1 = require("./Engine");
const Alignment_1 = require("./structures/Alignment");
const Ngram_1 = require("./structures/Ngram");
/**
 * Multi-Lingual Word Alignment Prediction
 */
class WordMap {
    /**
     * Builds the prediction engine and registers all scoring algorithms.
     * @param {object} [opts={}] - configuration options forwarded to the engine
     */
    constructor(opts = {}) {
        this.engine = new Engine_1.default(opts);
        this.engine.registerAlgorithm(new NgramFrequency_1.default());
        // TODO: combine this with NgramFrequency for better performance
        this.engine.registerAlgorithm(new LemmaNgramFrequency_1.default());
        this.engine.registerAlgorithm(new AlignmentRelativeOccurrence_1.default());
        this.engine.registerAlgorithm(new AlignmentPosition_1.default());
        this.engine.registerAlgorithm(new PhrasePlausibility_1.default());
        this.engine.registerAlgorithm(new NgramLength_1.default());
        this.engine.registerAlgorithm(new CharacterLength_1.default());
        this.engine.registerAlgorithm(new AlignmentOccurrences_1.default());
        this.engine.registerAlgorithm(new Uniqueness_1.default());
    }
    /**
     * Tokenizes the sentence if it is a string; pre-tokenized input is returned unchanged.
     * @param {string|object[]} sentence - a raw sentence or an array of tokens
     * @return {object[]} an array of tokens
     */
    static toTokens(sentence) {
        if (typeof sentence === "string") {
            return wordmap_lexer_1.default.tokenize(sentence);
        }
        return sentence;
    }
    /**
     * Adds an array of corpus pairs.
     * @param {string[][]} corpus - an array of [source, target] string pairs
     */
    appendCorpus(corpus) {
        for (const [source, target] of corpus) {
            this.appendCorpusString(source, target);
        }
    }
    /**
     * Add corpus to the MAP.
     * These may be single sentences or multiple sentences delimited by new lines.
     * @param {string} source - source sentences separated by new lines
     * @param {string} target - target sentences separated by new lines
     * @throws {Error} if source and target do not contain the same number of lines
     */
    appendCorpusString(source, target) {
        const sourceSentences = source.split("\n");
        const targetSentences = target.split("\n");
        if (sourceSentences.length !== targetSentences.length) {
            throw new Error("source and target corpus must be the same length");
        }
        const sourceTokens = sourceSentences.map((s) => wordmap_lexer_1.default.tokenize(s));
        const targetTokens = targetSentences.map((s) => wordmap_lexer_1.default.tokenize(s));
        this.appendCorpusTokens(sourceTokens, targetTokens);
    }
    /**
     * Adds tokenized corpus to the map.
     * @param {object[][]} sourceTokens - tokenized source sentences
     * @param {object[][]} targetTokens - tokenized target sentences
     * @throws {Error} if the two arrays are not the same length
     */
    appendCorpusTokens(sourceTokens, targetTokens) {
        if (sourceTokens.length !== targetTokens.length) {
            throw new Error("source and target corpus must be the same length");
        }
        this.engine.addCorpus(sourceTokens, targetTokens);
    }
    /**
     * Appends alignment memory to the engine.
     * @param alignments - an alignment or array of alignments
     */
    appendAlignmentMemory(alignments) {
        // Array.isArray is preferred over `instanceof Array` (works across realms).
        if (Array.isArray(alignments)) {
            this.engine.addAlignmentMemory(alignments);
        }
        else {
            this.engine.addAlignmentMemory([alignments]);
        }
    }
    /**
     * Appends some alignment memory.
     * This may be multiple lines of text or a single line.
     *
     * @param {string} source - a string of source phrases separated by new lines
     * @param {string} target - a string of target phrases separated by new lines
     * @return {Alignment[]} an array of alignment objects (as a convenience)
     * @throws {Error} if source and target do not contain the same number of lines
     */
    appendAlignmentMemoryString(source, target) {
        const sourceLines = source.split("\n");
        const targetLines = target.split("\n");
        if (sourceLines.length !== targetLines.length) {
            throw new Error("source and target lines must be the same length");
        }
        const alignments = [];
        for (let i = 0; i < sourceLines.length; i++) {
            const sourceTokens = wordmap_lexer_1.default.tokenize(sourceLines[i]);
            const targetTokens = wordmap_lexer_1.default.tokenize(targetLines[i]);
            alignments.push(new Alignment_1.default(new Ngram_1.default(sourceTokens), new Ngram_1.default(targetTokens)));
        }
        this.appendAlignmentMemory(alignments);
        return alignments;
    }
    /**
     * Predicts the word alignments between the sentences.
     * @param {string|object[]} sourceSentence - a sentence (or tokens) from the source text
     * @param {string|object[]} targetSentence - a sentence (or tokens) from the target text
     * @param {number} maxSuggestions - the maximum number of suggestions to return
     * @return {Suggestion[]}
     */
    predict(sourceSentence, targetSentence, maxSuggestions = 1) {
        const sourceTokens = WordMap.toTokens(sourceSentence);
        const targetTokens = WordMap.toTokens(targetSentence);
        let predictions = this.engine.run(sourceTokens, targetTokens);
        predictions = this.engine.score(predictions);
        return Engine_1.default.suggest(predictions, maxSuggestions);
    }
    /**
     * Predicts word alignments between the sentences.
     * Returns an array of suggestions that match the benchmark.
     *
     * @param {string|object[]} sourceSentence - a sentence (or tokens) from the source text
     * @param {string|object[]} targetSentence - a sentence (or tokens) from the target text
     * @param {Suggestion} benchmark - alignments whose keys mark valid predictions
     * @param {number} maxSuggestions - the maximum number of suggestions to return
     * @return {Suggestion[]}
     */
    predictWithBenchmark(sourceSentence, targetSentence, benchmark, maxSuggestions = 1) {
        const sourceTokens = WordMap.toTokens(sourceSentence);
        const targetTokens = WordMap.toTokens(targetSentence);
        let predictions = this.engine.run(sourceTokens, targetTokens);
        predictions = this.engine.score(predictions);
        // Build the key lookup once so filtering is O(n + m) instead of O(n * m),
        // and so a prediction is not duplicated when benchmark keys repeat.
        const benchmarkKeys = new Set();
        for (const a of benchmark) {
            benchmarkKeys.add(a.key);
        }
        const validPredictions = predictions.filter((p) => benchmarkKeys.has(p.alignment.key));
        return Engine_1.default.suggest(validPredictions, maxSuggestions);
    }
}
exports.default = WordMap;