UNPKG

taipa

Version:

Taiwanese morphological parsing library

69 lines 3.31 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.getLatinSyllableCompositions = void 0; const soundgen_1 = require("./soundgen"); const unit_1 = require("../unit"); const analyzer_1 = require("../unchange/analyzer"); const tonalres_1 = require("./tonalres"); const collections_1 = require("./collections"); /** Get Latin syllable compositions for syllable tokenization. Returned values can be further matched with tone patterns or looked up in dictionary. */ function getLatinSyllableCompositions(str) { const soundSeqs = new Array(); const letters = (0, analyzer_1.graphAnalyzeTonal)(str).map(x => x.letter && x.letter.literal); // console.log(letters); let beginOfSyllable = 0; while (beginOfSyllable < letters.length) { const accumulatedSeqs = new Array(); // accumulator for the matched let shouldBreak = false; for (let i = 0; i < letters.length; i++) { // i is used for the end of the specified portion of letters. see letters.slice below for (let j = 0; j < soundgen_1.syllableCompositions.length; j++) { if (shouldBreak) break; if (i + 1 > beginOfSyllable) { // bypass those loops when i is less than or equal to beginOfSyllable let sg = new unit_1.SoundGeneration(); // the letter at position i is exclusive sg.letters = letters.slice(beginOfSyllable, i + 1); // console.log(sg.letters, beginOfSyllable, i, j); if (collections_1.impossibleSequences.includes(sg.letters[i])) { if (i > 0 && tonalres_1.vowelsTonal.includes(sg.letters[i - 1])) { shouldBreak = true; break; } } sg = soundgen_1.syllableCompositions[j](sg); if (sg.letters.length == sg.matchedSounds.length && sg.matching == true) { accumulatedSeqs.push(sg.matchedSounds); // console.log(sg.letters, beginOfSyllable, i, j); } } } if (i + 1 == letters.length) { // on the last loop if (accumulatedSeqs.length > 0) { // the last one should be the longest one? if (beginOfSyllable + accumulatedSeqs[accumulatedSeqs.length - 1].length <= letters.length) { // when beginOfSyllable adds up to under the length of letters beginOfSyllable += accumulatedSeqs[accumulatedSeqs.length - 1].length; } // console.log(beginOfSyllable); } } } if (accumulatedSeqs.length == 0) { // break while loop break; } else if (accumulatedSeqs.length > 0) { soundSeqs.push(accumulatedSeqs); } } return soundSeqs.map(x => x.map(y => y)); } exports.getLatinSyllableCompositions = getLatinSyllableCompositions; //# sourceMappingURL=tokenizer.js.map