/**
 * wordmap — Multi-Lingual Word Alignment Prediction
 * (compiled JavaScript output, ~163 lines / 6.97 kB)
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
const wordmap_lexer_1 = require("wordmap-lexer");
const AlignmentOccurrences_1 = require("./algorithms/AlignmentOccurrences");
const AlignmentPosition_1 = require("./algorithms/AlignmentPosition");
const AlignmentRelativeOccurrence_1 = require("./algorithms/AlignmentRelativeOccurrence");
const CharacterLength_1 = require("./algorithms/CharacterLength");
const LemmaNgramFrequency_1 = require("./algorithms/LemmaNgramFrequency");
const NgramFrequency_1 = require("./algorithms/NgramFrequency");
const NgramLength_1 = require("./algorithms/NgramLength");
const PhrasePlausibility_1 = require("./algorithms/PhrasePlausibility");
const Uniqueness_1 = require("./algorithms/Uniqueness");
const Engine_1 = require("./Engine");
const Alignment_1 = require("./structures/Alignment");
const Ngram_1 = require("./structures/Ngram");
/**
 * Multi-Lingual Word Alignment Prediction
 */
class WordMap {
    /**
     * Builds the prediction engine and registers all scoring algorithms.
     * @param {object} [opts={}] - configuration options forwarded to the engine
     */
    constructor(opts = {}) {
        this.engine = new Engine_1.default(opts);
        this.engine.registerAlgorithm(new NgramFrequency_1.default());
        // TODO: combine this with NgramFrequency for better performance
        this.engine.registerAlgorithm(new LemmaNgramFrequency_1.default());
        this.engine.registerAlgorithm(new AlignmentRelativeOccurrence_1.default());
        this.engine.registerAlgorithm(new AlignmentPosition_1.default());
        this.engine.registerAlgorithm(new PhrasePlausibility_1.default());
        this.engine.registerAlgorithm(new NgramLength_1.default());
        this.engine.registerAlgorithm(new CharacterLength_1.default());
        this.engine.registerAlgorithm(new AlignmentOccurrences_1.default());
        this.engine.registerAlgorithm(new Uniqueness_1.default());
    }
    /**
     * Tokenizes the sentence if it is a string; pre-tokenized input is returned unchanged.
     * @param {string|object[]} sentence - a raw sentence or an array of tokens
     * @return {object[]} an array of tokens
     */
    static toTokens(sentence) {
        if (typeof sentence === "string") {
            return wordmap_lexer_1.default.tokenize(sentence);
        }
        return sentence;
    }
    /**
     * Adds an array of corpus pairs.
     * @param {string[][]} corpus - an array of [source, target] string pairs
     */
    appendCorpus(corpus) {
        for (const [source, target] of corpus) {
            this.appendCorpusString(source, target);
        }
    }
    /**
     * Add corpus to the MAP.
     * These may be single sentences or multiple sentences delimited by new lines.
     * @param {string} source - source sentences separated by new lines
     * @param {string} target - target sentences separated by new lines
     * @throws {Error} if source and target do not contain the same number of lines
     */
    appendCorpusString(source, target) {
        const sourceSentences = source.split("\n");
        const targetSentences = target.split("\n");
        if (sourceSentences.length !== targetSentences.length) {
            throw new Error("source and target corpus must be the same length");
        }
        const sourceTokens = sourceSentences.map((s) => wordmap_lexer_1.default.tokenize(s));
        const targetTokens = targetSentences.map((s) => wordmap_lexer_1.default.tokenize(s));
        this.appendCorpusTokens(sourceTokens, targetTokens);
    }
    /**
     * Adds tokenized corpus to the map.
     * @param {object[][]} sourceTokens - tokenized source sentences
     * @param {object[][]} targetTokens - tokenized target sentences
     * @throws {Error} if the two arrays are not the same length
     */
    appendCorpusTokens(sourceTokens, targetTokens) {
        if (sourceTokens.length !== targetTokens.length) {
            throw new Error("source and target corpus must be the same length");
        }
        this.engine.addCorpus(sourceTokens, targetTokens);
    }
    /**
     * Appends alignment memory to the engine.
     * @param alignments - an alignment or array of alignments
     */
    appendAlignmentMemory(alignments) {
        // Array.isArray is preferred over `instanceof Array` (works across realms).
        if (Array.isArray(alignments)) {
            this.engine.addAlignmentMemory(alignments);
        }
        else {
            this.engine.addAlignmentMemory([alignments]);
        }
    }
    /**
     * Appends some alignment memory.
     * This may be multiple lines of text or a single line.
     *
     * @param {string} source - a string of source phrases separated by new lines
     * @param {string} target - a string of target phrases separated by new lines
     * @return {Alignment[]} an array of alignment objects (as a convenience)
     * @throws {Error} if source and target do not contain the same number of lines
     */
    appendAlignmentMemoryString(source, target) {
        const sourceLines = source.split("\n");
        const targetLines = target.split("\n");
        if (sourceLines.length !== targetLines.length) {
            throw new Error("source and target lines must be the same length");
        }
        const alignments = [];
        for (let i = 0; i < sourceLines.length; i++) {
            const sourceTokens = wordmap_lexer_1.default.tokenize(sourceLines[i]);
            const targetTokens = wordmap_lexer_1.default.tokenize(targetLines[i]);
            alignments.push(new Alignment_1.default(new Ngram_1.default(sourceTokens), new Ngram_1.default(targetTokens)));
        }
        this.appendAlignmentMemory(alignments);
        return alignments;
    }
    /**
     * Predicts the word alignments between the sentences.
     * @param {string|object[]} sourceSentence - a sentence (or tokens) from the source text
     * @param {string|object[]} targetSentence - a sentence (or tokens) from the target text
     * @param {number} maxSuggestions - the maximum number of suggestions to return
     * @return {Suggestion[]}
     */
    predict(sourceSentence, targetSentence, maxSuggestions = 1) {
        const sourceTokens = WordMap.toTokens(sourceSentence);
        const targetTokens = WordMap.toTokens(targetSentence);
        let predictions = this.engine.run(sourceTokens, targetTokens);
        predictions = this.engine.score(predictions);
        return Engine_1.default.suggest(predictions, maxSuggestions);
    }
    /**
     * Predicts word alignments between the sentences.
     * Returns an array of suggestions that match the benchmark.
     *
     * @param {string|object[]} sourceSentence - a sentence (or tokens) from the source text
     * @param {string|object[]} targetSentence - a sentence (or tokens) from the target text
     * @param {Suggestion} benchmark - alignments whose keys mark valid predictions
     * @param {number} maxSuggestions - the maximum number of suggestions to return
     * @return {Suggestion[]}
     */
    predictWithBenchmark(sourceSentence, targetSentence, benchmark, maxSuggestions = 1) {
        const sourceTokens = WordMap.toTokens(sourceSentence);
        const targetTokens = WordMap.toTokens(targetSentence);
        let predictions = this.engine.run(sourceTokens, targetTokens);
        predictions = this.engine.score(predictions);
        // Build the key lookup once so filtering is O(n + m) instead of O(n * m),
        // and so a prediction is not duplicated when benchmark keys repeat.
        const benchmarkKeys = new Set();
        for (const a of benchmark) {
            benchmarkKeys.add(a.key);
        }
        const validPredictions = predictions.filter((p) => benchmarkKeys.has(p.alignment.key));
        return Engine_1.default.suggest(validPredictions, maxSuggestions);
    }
}
exports.default = WordMap;