/* wordmap: Multi-Lingual Word Alignment Prediction */
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const wordmap_lexer_1 = require("wordmap-lexer");
const Algorithm_1 = require("../Algorithm");
const Alignment_1 = require("../structures/Alignment");
const Ngram_1 = require("../structures/Ngram");
const Prediction_1 = require("../structures/Prediction");
/**
* Generates a score for how closely the suggestion matches the answer key.
* Currently a stub that always reports a score of 0.
* @param {Suggestion} suggestion
* @param {object} answerKey
* @return {number}
*/
function scoreSuggestion(suggestion, answerKey) {
// stub: scoring is not implemented here, so every suggestion scores 0
return 0;
}
exports.scoreSuggestion = scoreSuggestion;
/**
* Converts two strings into a corpus.
* @param {string} source
* @param {string} target
* @return {Token[][][]}
*/
function makeCorpus(source, target) {
const sourceCorpusTokens = wordmap_lexer_1.default.tokenize(source);
const targetCorpusTokens = wordmap_lexer_1.default.tokenize(target);
return [
[sourceCorpusTokens],
[targetCorpusTokens]
];
}
exports.makeCorpus = makeCorpus;
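/*
* Usage sketch (illustrative only; the require path is an assumption and
* depends on where this compiled module lives in the build output):
*
*   const {makeCorpus} = require("./testUtils");
*   const corpus = makeCorpus("hello world", "olleh dlrow");
*   // corpus[0][0] -> Token[] for the source sentence ("hello", "world")
*   // corpus[1][0] -> Token[] for the target sentence ("olleh", "dlrow")
*/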
/**
* Converts two strings into a corpus with support for lemmas.
* Lemmas can be appended to words like `word:lemma`.
* @param {string} source
* @param {string} target
* @return {Token[][][]}
*/
function makeComplexCorpus(source, target) {
const sourceCorpusTokens = tokenizeComplexMockSentence(source);
const targetCorpusTokens = tokenizeComplexMockSentence(target);
return [
[sourceCorpusTokens],
[targetCorpusTokens]
];
}
exports.makeComplexCorpus = makeComplexCorpus;
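/*
* Usage sketch (illustrative): lemmas are appended with a colon; words
* without a ":lemma" suffix fall back to the word itself as the lemma.
*
*   const corpus = makeComplexCorpus("running:run fast", "tnerruc");
*   const [sourceTokens] = corpus[0];
*   // sourceTokens[0] carries the lemma "run", sourceTokens[1] the lemma "fast"
*/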
/**
* Converts two strings into an unaligned sentence pair.
* @param {string} source
* @param {string} target
* @return {Token[][]}
*/
function makeUnalignedSentence(source, target) {
return [
tokenizeMockSentence(source),
tokenizeMockSentence(target)
];
}
exports.makeUnalignedSentence = makeUnalignedSentence;
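/*
* Usage sketch (illustrative): returns a [sourceTokens, targetTokens] pair
* with no alignment data attached.
*
*   const [sourceTokens, targetTokens] = makeUnalignedSentence("hello world", "olleh dlrow");
*   // sourceTokens.length === 2 && targetTokens.length === 2
*/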
/**
* Generates a sample alignment from a sentence
* @param {String} sentence - a raw sentence from which to generate a mock alignment
* @return {Array<Alignment>} a mock alignment
*/
function alignMockSentence(sentence) {
let alignments = [];
const tokens = tokenizeMockSentence(sentence);
while (tokens.length) {
const ngramLength = randNgramLength(tokens.length, 1);
alignments = [
...alignments,
alignMockTokens(tokens.slice(0, ngramLength))
];
tokens.splice(0, ngramLength);
}
return alignments;
}
exports.alignMockSentence = alignMockSentence;
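/*
* Usage sketch (illustrative): because randNgramLength is called with a max
* length of 1, each word becomes its own alignment, paired with the same word
* spelled backwards. The matching "translation" of a sentence is therefore
* reverseSentenceWords(sentence).
*
*   const alignments = alignMockSentence("hello world");
*   // alignments.length === 2: "hello" -> "olleh" and "world" -> "dlrow"
*/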
/**
* Generates a sample alignment from a complex sentence.
* Additional data like `lemma` can be appended to the words like `word:lemma`
* @param {string} sentence
* @return {Alignment[]}
*/
function alignComplexMockSentence(sentence) {
let alignments = [];
const tokens = tokenizeComplexMockSentence(sentence);
while (tokens.length) {
const ngramLength = randNgramLength(tokens.length, 1);
alignments = [
...alignments,
alignComplexMockTokens(tokens.slice(0, ngramLength))
];
tokens.splice(0, ngramLength);
}
return alignments;
}
exports.alignComplexMockSentence = alignComplexMockSentence;
/**
* Creates a mock alignment from two strings.
* The strings will be tokenized and converted to n-grams in the alignment
* @param {string} source
* @param {string} target
* @return {Alignment}
*/
function makeMockAlignment(source, target) {
const sourceTokens = wordmap_lexer_1.default.tokenize(source);
const targetTokens = wordmap_lexer_1.default.tokenize(target);
return new Alignment_1.default(new Ngram_1.default(sourceTokens), new Ngram_1.default(targetTokens));
}
exports.makeMockAlignment = makeMockAlignment;
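/*
* Usage sketch (illustrative): the entire source string becomes one n-gram
* aligned to the entire target string.
*
*   const alignment = makeMockAlignment("in the beginning", "au commencement");
*   // a single Alignment of a 3-token source n-gram and a 2-token target n-gram
*/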
/**
* Creates a mock alignment from two complex strings.
* Additional data like `lemma` can be appended to the word like `word:lemma`
* @param {string} source
* @param {string} target
* @return {Alignment}
*/
function makeComplexMockAlignment(source, target) {
const sourceTokens = tokenizeComplexMockSentence(source);
const targetTokens = tokenizeComplexMockSentence(target);
return new Alignment_1.default(new Ngram_1.default(sourceTokens), new Ngram_1.default(targetTokens));
}
exports.makeComplexMockAlignment = makeComplexMockAlignment;
/**
* Creates a mock prediction from two strings
* @param {string} source
* @param {string} target
* @param {number} confidence - the confidence of the prediction
* @return {Prediction}
*/
function makeMockPrediction(source, target, confidence) {
const prediction = new Prediction_1.default(makeMockAlignment(source, target));
prediction.setScore("confidence", confidence);
return prediction;
}
exports.makeMockPrediction = makeMockPrediction;
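/*
* Usage sketch (illustrative):
*
*   const prediction = makeMockPrediction("hello", "olleh", 0.75);
*   // the prediction wraps the alignment "hello" -> "olleh" and carries a
*   // "confidence" score of 0.75
*/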
/**
* Generates a sample alignment
* @param {Array<Token>} tokens - An array of tokens to align
* @return {Alignment} a sample alignment
*/
function alignMockTokens(tokens) {
const source = new Ngram_1.default(tokens);
const flippedTokens = [];
for (const token of tokens) {
flippedTokens.push(new wordmap_lexer_1.Token({
text: token.toString().split("").reverse().join(""),
position: token.position,
characterPosition: token.charPosition,
sentenceTokenLen: token.sentenceTokenLength,
sentenceCharLen: token.sentenceCharacterLength
}));
}
const target = new Ngram_1.default(flippedTokens);
return new Alignment_1.default(source, target);
}
/**
* Generates a sample alignment from complex (lemma-carrying) tokens.
* Currently identical in behavior to `alignMockTokens`.
* @param {Array<Token>} tokens - An array of tokens to align
* @return {Alignment} a sample alignment
*/
function alignComplexMockTokens(tokens) {
const source = new Ngram_1.default(tokens);
const flippedTokens = [];
for (const token of tokens) {
flippedTokens.push(new wordmap_lexer_1.Token({
text: token.toString().split("").reverse().join(""),
position: token.position,
characterPosition: token.charPosition,
sentenceTokenLen: token.sentenceTokenLength,
sentenceCharLen: token.sentenceCharacterLength
}));
}
const target = new Ngram_1.default(flippedTokens);
return new Alignment_1.default(source, target);
}
/**
* Reverses the character order of words in a sentence
* @param {string} sentence
* @return {string}
*/
function reverseSentenceWords(sentence) {
return sentence.split(" ").map((word) => {
return word.split("").reverse().join("");
}).join(" ");
}
exports.reverseSentenceWords = reverseSentenceWords;
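/*
* Usage sketch:
*
*   reverseSentenceWords("hello world"); // "olleh dlrow"
*/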
/**
* Reverses the word order of a sentence.
* @param {string} sentence
* @return {string}
*/
function reverseSentence(sentence) {
return sentence.split(/\s+/).reverse().join(" ");
}
exports.reverseSentence = reverseSentence;
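/*
* Usage sketch:
*
*   reverseSentence("hello brave new world"); // "world new brave hello"
*/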
/**
* Converts a sentence to an array of Tokens
* @param {String} sentence - a raw sentence to convert into tokens
* @return {Array<Token>} an array of tokens
*/
function tokenizeMockSentence(sentence) {
return wordmap_lexer_1.default.tokenize(sentence);
}
exports.tokenizeMockSentence = tokenizeMockSentence;
/**
* Converts a sentence to an array of Tokens with lemma data attached.
* @param {string} sentence - a sentence with lemmas appended to words like `word:lemma`.
* @return {Token[]} an array of tokens
*/
function tokenizeComplexMockSentence(sentence) {
const words = sentence.split(/\s+/);
const sentenceWords = [];
const lemmaWords = [];
for (const w of words) {
const [text, lemma] = w.split(":");
sentenceWords.push(text);
if (lemma) {
lemmaWords.push(lemma);
}
else {
lemmaWords.push(text);
}
}
const tokens = wordmap_lexer_1.default.tokenize(sentenceWords.join(" "));
const tokenizedSentence = [];
for (let i = 0, len = tokens.length; i < len; i++) {
tokenizedSentence.push(new wordmap_lexer_1.Token(Object.assign(Object.assign({}, tokens[i].toJSON(true)), { lemma: lemmaWords[i] })));
}
return tokenizedSentence;
}
exports.tokenizeComplexMockSentence = tokenizeComplexMockSentence;
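/*
* Usage sketch (illustrative): the ":lemma" suffix is stripped from the token
* text and attached as lemma data instead.
*
*   const tokens = tokenizeComplexMockSentence("ran:run quickly");
*   // tokens[0].toString() === "ran" (lemma "run")
*   // tokens[1].toString() === "quickly" (lemma "quickly")
*/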
/**
* Generates the length of an n-gram.
* Lengths are limited to `maxLength` (3 by default).
* @param {number} numTokens - the number of tokens available for use in the n-gram.
* @param {number} [maxLength=3] - the maximum length of the n-gram
* @return {number} an n-gram size between 1 and the limit
*/
function randNgramLength(numTokens, maxLength = 3) {
const ceiling = Math.min(numTokens, maxLength);
return Math.floor(Math.random() * ceiling) + 1;
}
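/*
* Illustrative behavior: the result is a random integer between 1 and
* min(numTokens, maxLength), inclusive.
*
*   randNgramLength(10);    // 1, 2, or 3
*   randNgramLength(2);     // 1 or 2
*   randNgramLength(10, 5); // 1 through 5
*/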
/**
* A no-op algorithm used for testing.
* Predictions pass through `execute` unchanged.
*/
class MockAlgorithm extends Algorithm_1.default {
constructor() {
super(...arguments);
this.name = "mock algorithm";
}
execute(prediction, cIndex, saIndex, usIndex) {
return prediction;
}
}
exports.MockAlgorithm = MockAlgorithm;
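/*
* Usage sketch (illustrative): execute takes a prediction plus index arguments
* (cIndex, saIndex, usIndex) which this mock ignores.
*
*   const algorithm = new MockAlgorithm();
*   const prediction = makeMockPrediction("hello", "olleh", 1);
*   algorithm.execute(prediction) === prediction; // true
*/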