taipa
Version:
Taiwanese morphological parsing library
69 lines • 3.31 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.getLatinSyllableCompositions = void 0;
const soundgen_1 = require("./soundgen");
const unit_1 = require("../unit");
const analyzer_1 = require("../unchange/analyzer");
const tonalres_1 = require("./tonalres");
const collections_1 = require("./collections");
/**
 * Get Latin syllable compositions for syllable tokenization. Returned values
 * can be further matched with tone patterns or looked up in dictionary.
 *
 * The input is split into letters with `graphAnalyzeTonal` and consumed from
 * left to right. At each syllable start, every prefix of the remaining letters
 * is offered to every generator in `syllableCompositions`; a prefix counts as
 * a candidate when the generator reports a match covering all of its letters.
 * The next syllable starts right after the longest candidate. Tokenization
 * stops at the first position where no candidate is found.
 *
 * @param {string} str - Text to tokenize.
 * @returns {Array<Array<*>>} One entry per tokenized syllable position, in
 *   input order. Each entry holds the `matchedSounds` value of every candidate
 *   found at that position, ordered from the shortest prefix to the longest.
 *   Empty when nothing at the start of `str` can be matched.
 */
function getLatinSyllableCompositions(str) {
    const soundSeqs = [];
    const letters = (0, analyzer_1.graphAnalyzeTonal)(str).map(x => x.letter && x.letter.literal);
    let beginOfSyllable = 0;
    while (beginOfSyllable < letters.length) {
        // Candidates matched at this syllable start, shortest prefix first.
        const candidates = [];
        // `end` is the index of the last letter of the prefix being tried.
        for (let end = beginOfSyllable; end < letters.length; end++) {
            // Stop extending the prefix once an impossible letter directly
            // follows a vowel inside the current candidate. Both lookups index
            // the full `letters` array: `end` is an absolute position, so
            // indexing the sliced prefix with it only works while
            // beginOfSyllable is 0.
            if (end > beginOfSyllable &&
                collections_1.impossibleSequences.includes(letters[end]) &&
                tonalres_1.vowelsTonal.includes(letters[end - 1])) {
                break;
            }
            for (const compose of soundgen_1.syllableCompositions) {
                let sg = new unit_1.SoundGeneration();
                // slice() excludes its end index, so `end + 1` keeps the
                // letter at `end` in the prefix. Every generator receives its
                // own copy of the prefix.
                sg.letters = letters.slice(beginOfSyllable, end + 1);
                sg = compose(sg);
                // Loose comparison kept on purpose: the type of `matching` is
                // declared outside this file.
                if (sg.letters.length === sg.matchedSounds.length &&
                    sg.matching == true) {
                    candidates.push(sg.matchedSounds);
                }
            }
        }
        if (candidates.length === 0) {
            // Nothing matches here; the rest of the input is left untokenized.
            break;
        }
        soundSeqs.push(candidates);
        // Prefixes are tried in increasing length, so the last candidate is
        // the longest one.
        const consumed = candidates[candidates.length - 1].length;
        if (consumed === 0 || beginOfSyllable + consumed > letters.length) {
            // No valid way to advance. Presumably only reachable if a
            // generator rewrites `sg.letters`; stop instead of looping forever.
            break;
        }
        beginOfSyllable += consumed;
    }
    return soundSeqs;
}
exports.getLatinSyllableCompositions = getLatinSyllableCompositions;
//# sourceMappingURL=tokenizer.js.map