UNPKG

@palasimi/ipa-cluster

Version:

Cluster words with similar IPA transcriptions together

152 lines 4.53 kB
"use strict"; // SPDX-License-Identifier: GPL-3.0-or-later // Copyright (c) 2023 Levi Gruspe // Aligned intermediate representations. Object.defineProperty(exports, "__esModule", { value: true }); exports.align = void 0; /** * Find the number of times that the shorter string should be "shifted" to the * right to maximize the number of alignments (matching symbols) with the * longer string. * * The first argument should not be longer than the second argument. */ function findOptimalShift(short, long) { const m = long.length; const n = short.length; if (m === n) { return 0; } // Find alignment with best score. let maxScore = 0; let bestStart = 0; for (let start = 0; start <= m - n; start++) { let score = 0; for (let i = 0; i < n; i++) { if (short[i] === long[start + i]) { score++; } } if (score > maxScore) { maxScore = score; bestStart = start; } } return bestStart; } /** * Pads the shorter sequence with null sounds ("_") so that the sequences have * the same length. * Both sequences should not contain "#". * * The first argument should not be longer than the second argument. */ function padNull(short, long) { const shift = findOptimalShift(short, long); const newShort = []; for (let i = 0; i < shift; i++) { newShort.push("_"); } newShort.push(...short); while (newShort.length < long.length) { newShort.push("_"); } return [newShort, long.slice()]; } /** * Polyfill for `Array.findLastIndex`. */ function findLastIndex(array, callback) { let i = array.length - 1; while (i >= 0 && !callback(array[i])) { i--; } return i; } /** * Removes word boundaries from the sequence. * Returns a triple. * First element: the trimmed sequence. * Second element: does the sequence begin with "#"? * Third element: does the sequence end with "#"? */ function trimBoundaries(sequence) { const start = sequence.findIndex((sound) => sound !== "#"); let end = findLastIndex(sequence, (sound) => sound !== "#") + 1; if (end === 0) { end = sequence.length; } const trimmed = sequence.slice(start, end); return [trimmed, start > 0, end < sequence.length]; } /** * Reattaches trimmed word boundaries to two sequences, while keeping them * aligned. * `left` and `right` should be of the same length. */ function bound(left, hasPrefixLeft, hasSuffixLeft, right, hasPrefixRight, hasSuffixRight) { const newLeft = []; const newRight = []; // Add trimmed prefixes. if (hasPrefixLeft) { newLeft.push("#"); newRight.push(hasPrefixRight ? "#" : "_"); } else if (hasPrefixRight) { newLeft.push("_"); newRight.push("#"); } // Add sequences. newLeft.push(...left); newRight.push(...right); // Add trimmed suffixes. if (hasSuffixLeft) { newLeft.push("#"); newRight.push(hasSuffixRight ? "#" : "_"); } else if (hasSuffixRight) { newLeft.push("_"); newRight.push("#"); } return [newLeft, newRight]; } /** * Pads left and right sequences with "_" or "#" so that the sequences are * aligned. * "_" means "delete the opposite character". * "#" represents a word boundary. */ function pad(left, right) { // Switch args so that `left.length <= right.length`. const [trimmedLeft, hasPrefixLeft, hasSuffixLeft] = trimBoundaries(left); const [trimmedRight, hasPrefixRight, hasSuffixRight] = trimBoundaries(right); let reversed = false; let [short, long] = [trimmedLeft, trimmedRight]; if (long.length < short.length) { [short, long] = [long, short]; reversed = true; } let [paddedShort, paddedLong] = padNull(short, long); if (reversed) { [paddedShort, paddedLong] = [paddedLong, paddedShort]; } const [paddedLeft, paddedRight] = [paddedShort, paddedLong]; return bound(paddedLeft, hasPrefixLeft, hasSuffixLeft, paddedRight, hasPrefixRight, hasSuffixRight); } /** * Compiles an `ExpandedIR` into an `AlignedIR`. */ function align(ir) { const rules = []; for (const { constraint, left, right } of ir.rules) { const alignment = pad(left, right); rules.push({ constraint, left: alignment[0], right: alignment[1], }); } return { rules }; } exports.align = align; //# sourceMappingURL=align.js.map