@unglish/word-generator
Version:
A simple generator for creating unglish words.
418 lines (417 loc) • 19.3 kB
JavaScript
import { overrideRand, getRand } from "../utils/random.js";
import { createSeededRandom } from "../utils/createSeededRandom.js";
import getWeightedOption from "../utils/getWeightedOption.js";
import { phonemes, invalidOnsetClusters, invalidBoundaryClusters, invalidCodaClusters, sonority } from "../elements/phonemes.js";
import { generatePronunciation } from "./pronounce.js";
import { generateWrittenForm } from "./write.js";
// Memo of sonority levels already resolved, keyed by phoneme type.
const sonorityCache = new Map();
/**
 * Looks up the sonority level for a phoneme.
 *
 * @param phoneme - The phoneme to evaluate; only its `type` property is read.
 * @returns The value stored in the `sonority` table for that type, or 0 when the
 *   table has no (or a falsy) entry for it.
 *
 * The first result for each type is memoised in `sonorityCache`, so later calls
 * with the same type return the remembered value without consulting the table.
 */
function getSonority(phoneme) {
    const { type } = phoneme;
    if (sonorityCache.has(type)) {
        return sonorityCache.get(type);
    }
    const level = sonority[type] || 0;
    sonorityCache.set(type, level);
    return level;
}
/**
 * Builds a phoneme cluster for one slot of a syllable.
 *
 * @param position - The slot to fill: "onset", "nucleus" or "coda". It selects the candidate
 *   pool from `positionPhonemes` and the per-phoneme weight property of the same name.
 * @param maxLength - The maximum number of phonemes allowed in the cluster. Default is 3.
 * @param ignore - Phoneme sounds that must not appear in the cluster. Default is an empty array.
 * @param isStartOfWord - When truthy, phonemes whose `startWord` weight is 0 or less are excluded
 *   and `startWord` is used as the weight multiplier.
 * @param isEndOfWord - When truthy, phonemes whose `endWord` weight is 0 or less are excluded
 *   and `endWord` is used as the weight multiplier when no start-of-word multiplier applies.
 * @returns An array of phoneme objects; it may be shorter than maxLength, or empty.
 *
 * Each round filters the candidate pool and then draws one phoneme with `getWeightedOption`:
 * 1. Sounds listed in `ignore` and sounds already in the cluster are rejected.
 * 2. Onsets must rise in sonority, except that a stop may only be followed by a glide or
 *    liquid, and a lone 's' may be followed by 't', 'p' or 'k'.
 * 3. Codas must fall in sonority, except fricative+fricative, stop+stop, stop+fricative and
 *    sibilant+nasal sequences, which are allowed.
 * 4. The joined sounds must not match any pattern in invalidOnsetClusters (onset),
 *    invalidCodaClusters (coda) or invalidBoundaryClusters (nucleus).
 *
 * Building stops when maxLength is reached, no candidate survives the filter, the weighted
 * draw returns nothing, or a two-phoneme onset ends in a liquid or nasal.
 */
export function buildCluster(position, maxLength = 3, ignore = [], isStartOfWord, isEndOfWord) {
    const cluster = [];
    const candidatePhonemes = positionPhonemes[position];
    // A missing word-position weight means "unrestricted"; an explicit 0 (or less) forbids it.
    // (A plain truthiness test would treat 0 as "missing" and let the phoneme through.)
    const isWordPositionAllowed = (weight) => weight == null || weight > 0;
    while (cluster.length < maxLength) {
        const lastPhoneme = cluster[cluster.length - 1];
        // Loop-invariant for this round: the sounds chosen so far.
        const clusterSounds = cluster.map(ph => ph.sound).join('');
        const validCandidates = candidatePhonemes.filter(p => {
            const potentialCluster = clusterSounds + p.sound;
            const isNotIgnored = !ignore.includes(p.sound);
            const isNotDuplicate = !cluster.some(existingP => existingP.sound === p.sound);
            const isValidPosition =
                (p[position] === undefined || p[position] > 0) &&
                (isStartOfWord ? isWordPositionAllowed(p.startWord) : true) &&
                (isEndOfWord ? isWordPositionAllowed(p.endWord) : true);
            let hasSuitableSonority = true;
            let invalidClusters = invalidBoundaryClusters;
            switch (position) {
                case "onset": {
                    const lastPhonemeWasAStop = lastPhoneme && ['voicedStop', 'voicelessStop'].includes(lastPhoneme.type);
                    const canFollowAStop = lastPhonemeWasAStop ? ['glide', 'liquid'].includes(p.type) : false;
                    // English allows 's' + voiceless stop even though sonority does not rise.
                    const isSonorityException = cluster.length === 1 &&
                        cluster[0].sound === 's' &&
                        ['t', 'p', 'k'].includes(p.sound);
                    hasSuitableSonority =
                        cluster.length === 0 ||
                            isSonorityException ||
                            (lastPhonemeWasAStop ? canFollowAStop : getSonority(p) > getSonority(lastPhoneme));
                    invalidClusters = invalidOnsetClusters;
                    break;
                }
                case "nucleus":
                    break;
                case "coda": {
                    if (lastPhoneme) {
                        const prevSonority = getSonority(lastPhoneme);
                        const currSonority = getSonority(p);
                        const lastType = lastPhoneme.type;
                        const currType = p.type;
                        // `includes` rather than `indexOf(...) > 0`: the latter misses a match at
                        // index 0, which is exactly where a type named 'sibilant' matches.
                        const isEqualSonorityException = (lastType.includes('Fricative') && currType.includes('Fricative')) ||
                            (lastType.includes('Stop') && currType.includes('Stop'));
                        const isReversedSonorityException = (lastType.includes('Stop') && currType.includes('Fricative')) ||
                            (lastType.includes('sibilant') && currType === 'nasal');
                        hasSuitableSonority =
                            isEqualSonorityException ||
                                isReversedSonorityException ||
                                (currSonority < prevSonority);
                    }
                    invalidClusters = invalidCodaClusters;
                    break;
                }
            }
            // NOTE(review): RegExp#test is stateful for patterns with the g/y flag; this
            // assumes the cluster patterns carry neither — confirm in the phoneme data.
            const isValidCluster = !invalidClusters.some(regex => regex.test(potentialCluster));
            return isValidPosition && isNotIgnored && isNotDuplicate && hasSuitableSonority && isValidCluster;
        });
        if (!validCandidates.length) {
            break;
        }
        const mappedCandidates = validCandidates.map((p) => {
            const phonemePosition = p[position] ?? 0;
            // First truthy of: start-of-word weight, end-of-word weight, mid-word weight, 1.
            const wordPositionModifier = (isStartOfWord && p.startWord) ||
                (isEndOfWord && p.endWord) ||
                p.midWord ||
                1;
            return [p, phonemePosition * wordPositionModifier];
        });
        const newPhoneme = getWeightedOption(mappedCandidates);
        if (!newPhoneme) {
            break; // Exit the loop if no valid phoneme is selected
        }
        cluster.push(newPhoneme);
        // Special case for English: a two-phoneme onset ending in a liquid or nasal is complete.
        if (position === "onset"
            && cluster.length === 2
            && ['liquid', 'nasal'].includes(cluster[1].type)) {
            break;
        }
    }
    return cluster;
}
/**
 * Candidate pools for each syllable slot, computed once when the module loads.
 * A phoneme belongs to a slot's pool when its weight property of that name is
 * greater than 0 (a missing weight compares false and is therefore left out).
 */
const positionPhonemes = (() => {
    const withPositiveWeight = (slot) => phonemes.filter((candidate) => candidate[slot] > 0);
    return {
        onset: withPositiveWeight("onset"),
        coda: withPositiveWeight("coda"),
        nucleus: withPositiveWeight("nucleus"),
    };
})();
/**
 * Chooses the onset (leading consonant cluster) for a new syllable.
 *
 * @param prevSyllable - The syllable before this one, or a falsy value for the first syllable.
 * @returns The array of phonemes produced by buildCluster for the "onset" slot.
 *
 * Steps:
 * 1. Draw a target length of 0-3 with getWeightedOption. The weight for an empty onset is 0
 *    when the previous syllable exists and has no coda, otherwise 150; lengths 1, 2 and 3
 *    are weighted 675, 125 and 80.
 * 2. Build the cluster, passing the previous syllable's coda sounds as sounds to skip and
 *    flagging start-of-word when there is no previous syllable. The end-of-word flag is
 *    always false for onsets.
 */
function pickOnset(prevSyllable) {
    const followsOpenSyllable = prevSyllable && prevSyllable.coda.length === 0;
    const emptyOnsetWeight = followsOpenSyllable ? 0 : 150;
    const length = getWeightedOption([
        [0, emptyOnsetWeight],
        [1, 675],
        [2, 125],
        [3, 80],
    ]);
    let soundsToSkip = [];
    if (prevSyllable) {
        soundsToSkip = prevSyllable.coda.map((phoneme) => phoneme.sound);
    }
    return buildCluster("onset", length, soundsToSkip, !prevSyllable, false);
}
/**
 * Chooses the nucleus for a new syllable.
 *
 * @param prevSyllable - The syllable before this one, or a falsy value for the first syllable.
 *   Its coda sounds are passed to buildCluster as sounds to skip.
 * @param isEndOfWord - Forwarded to buildCluster as the end-of-word flag.
 * @returns The array returned by buildCluster for the "nucleus" slot with a maximum length of 1.
 */
function pickNucleus(prevSyllable, isEndOfWord) {
    const soundsToSkip = prevSyllable
        ? prevSyllable.coda.map((phoneme) => phoneme.sound)
        : [];
    return buildCluster("nucleus", 1, soundsToSkip, !prevSyllable, isEndOfWord);
}
/**
 * Selects and returns a coda (final consonant cluster) for a syllable.
 *
 * @param currentSyllable - The syllable being built; only its `onset` array is read, to
 *   detect a coda that ends on the same sound the onset starts with.
 * @param isEndOfWord - Whether this is the last syllable of the word. Default is false.
 * @returns An array of phoneme objects representing the coda (possibly empty).
 *
 * This function does the following:
 * 1. Draws the coda length (0-3) with getWeightedOption. The last syllable uses weights
 *    500/3000/900/400, other syllables 6000/3000/900/100, so empty codas are far more
 *    likely inside a word than at its end.
 * 2. Builds the coda cluster using the buildCluster function.
 * 3. If the first onset sound equals the last coda sound, avoids that repetition with
 *    weight 98 against 2 (i.e. a 2% chance to keep it).
 *
 * When avoiding repetition, the last coda phoneme is replaced by another phoneme of the
 * same type. The replacement must obey the same constraints the cluster was built under:
 * it may not repeat a sound already in the coda, may not complete a cluster matched by
 * invalidCodaClusters, and at the end of a word may not have an `endWord` weight of 0 or
 * less. If no replacement qualifies (or the weighted draw yields nothing), the last coda
 * phoneme is removed instead.
 */
function pickCoda(currentSyllable, isEndOfWord = false) {
    const length = getWeightedOption(isEndOfWord ? [
        [0, 500],
        [1, 3000],
        [2, 900],
        [3, 400],
    ] : [
        [0, 6000],
        [1, 3000],
        [2, 900],
        [3, 100],
    ]);
    if (length === 0) {
        return [];
    }
    const onset = currentSyllable.onset;
    const coda = buildCluster("coda", length, [], false, isEndOfWord);
    // Check for onset-coda repetition
    const repeatsOnset = onset.length > 0 &&
        coda.length > 0 &&
        onset[0].sound === coda[coda.length - 1].sound;
    if (!repeatsOnset) {
        return coda;
    }
    const shouldAvoidRepetition = getWeightedOption([
        [true, 98],
        [false, 2] // 2% chance to allow repetition
    ]);
    if (!shouldAvoidRepetition) {
        return coda;
    }
    const repeated = coda[coda.length - 1];
    const precedingSounds = coda.slice(0, -1).map(p => p.sound);
    const precedingText = precedingSounds.join('');
    // Same type keeps the sonority profile; the remaining checks mirror buildCluster's filter.
    const alternativeCodas = phonemes.filter(p => p.coda > 0 &&
        p.sound !== onset[0].sound &&
        p.type === repeated.type &&
        !precedingSounds.includes(p.sound) &&
        (!isEndOfWord || p.endWord == null || p.endWord > 0) &&
        !invalidCodaClusters.some(regex => regex.test(precedingText + p.sound)));
    const replacement = alternativeCodas.length > 0
        ? getWeightedOption(alternativeCodas.map(p => [p, p.coda ?? 0]))
        : undefined;
    if (replacement) {
        coda[coda.length - 1] = replacement;
    }
    else {
        // If no suitable alternative, remove the last coda phoneme
        coda.pop();
    }
    return coda;
}
/**
 * Decides whether the junction between two syllables has an acceptable sonority profile.
 *
 * @param prevSyllable - The preceding syllable, or a falsy value when there is none.
 * @param currentSyllable - The syllable being appended.
 * @returns {boolean} True when the junction is acceptable, false otherwise.
 *
 * The junction is accepted outright when there is nothing to compare: no previous
 * syllable, an empty coda on the previous syllable, or an empty onset on the current one.
 * Otherwise the first onset phoneme must be at least as sonorous as the last coda phoneme;
 * equal sonority is deliberately tolerated as a simplification.
 */
function checkCrossSyllableSonority(prevSyllable, currentSyllable) {
    const hasBoundaryCluster = Boolean(prevSyllable) &&
        prevSyllable.coda.length > 0 &&
        currentSyllable.onset.length > 0;
    if (!hasBoundaryCluster) {
        // Nothing meets at the boundary, so there is nothing to reject.
        return true;
    }
    const { coda } = prevSyllable;
    const onsetLevel = getSonority(currentSyllable.onset[0]);
    const codaLevel = getSonority(coda[coda.length - 1]);
    return onsetLevel >= codaLevel;
}
/**
 * Tries to repair the junction between two adjacent syllables by editing them in place.
 *
 * @param prevSyllable - The preceding syllable; its coda may lose its last phoneme.
 * @param currentSyllable - The current syllable; its onset may gain a leading phoneme.
 * @returns A two-element array `[prevSyllable, currentSyllable]` holding the same
 *   (possibly mutated) objects that were passed in.
 *
 * Nothing happens unless the previous coda and the current onset are both non-empty.
 * When they are, the last coda phoneme and the first onset phoneme are compared:
 * - Onset more sonorous, and the coda phoneme prepended to the onset matches no pattern in
 *   invalidBoundaryClusters: the phoneme moves from the coda to the front of the onset.
 * - Equal sonority: the coda phoneme is dropped with weight 90 against 10 for keeping it.
 * - Otherwise both syllables are left untouched.
 *
 * NOTE(review): no branch applies when the onset is *less* sonorous than the coda, which
 * is the only case checkCrossSyllableSonority rejects — confirm this is intended.
 */
function tryResyllabify(prevSyllable, currentSyllable) {
    const coda = prevSyllable.coda;
    if (!coda.length) {
        return [prevSyllable, currentSyllable];
    }
    const onset = currentSyllable.onset;
    if (!onset.length) {
        return [prevSyllable, currentSyllable];
    }
    const movable = coda[coda.length - 1];
    const codaLevel = getSonority(movable);
    const onsetLevel = getSonority(onset[0]);
    // Sounds the onset would have if the coda phoneme were moved in front of it.
    const mergedSounds = [movable, ...onset].map((phoneme) => phoneme.sound).join('');
    const isClusterAllowed = invalidBoundaryClusters.every((pattern) => !pattern.test(mergedSounds));
    if (onsetLevel > codaLevel && isClusterAllowed) {
        coda.pop();
        onset.unshift(movable);
        return [prevSyllable, currentSyllable];
    }
    if (onsetLevel === codaLevel) {
        const shouldDropCoda = getWeightedOption([
            [true, 90],
            [false, 10],
        ]);
        if (shouldDropCoda) {
            coda.pop();
        }
    }
    return [prevSyllable, currentSyllable];
}
/**
 * Generates one candidate syllable for the word described by `context`.
 *
 * @param context - The generation state. This function reads `currSyllableIndex` (0-based
 *   index of the syllable being built), `syllableCount` (total syllables planned) and
 *   `word.syllables` (syllables accepted so far); it does not modify the context.
 * @returns A new object with `onset`, `nucleus` and `coda` arrays.
 *
 * The syllable at `currSyllableIndex - 1`, if any, is passed to the onset and nucleus
 * pickers. The syllable counts as end-of-word when its index equals `syllableCount - 1`;
 * that flag is passed to the nucleus and coda pickers. The coda picker receives the
 * partially built syllable (onset and nucleus filled in, coda still empty).
 */
function generateSyllable(context) {
    const index = context.currSyllableIndex;
    const prevSyllable = context.word.syllables[index - 1];
    const isEndOfWord = index === context.syllableCount - 1;
    // Pick order matters: each picker may consume random numbers.
    const onset = pickOnset(prevSyllable);
    const nucleus = pickNucleus(prevSyllable, isEndOfWord);
    const syllable = { onset, nucleus, coda: [] };
    syllable.coda = pickCoda(syllable, isEndOfWord);
    return syllable;
}
/**
 * Fills `context.word.syllables` until it holds `context.syllableCount` syllables.
 *
 * @param context - The generation state; `syllableCount`, `currSyllableIndex` and
 *   `word.syllables` are updated in place.
 *
 * When `syllableCount` is falsy, a count from 1 to 7 is drawn with getWeightedOption.
 * For every position a candidate is generated and accepted when it is the first syllable
 * or when checkCrossSyllableSonority approves its junction with the previous syllable.
 * A rejected candidate is passed through tryResyllabify and checked once more; if it
 * still fails, it is discarded and a fresh candidate is generated. There is no retry limit.
 */
function generateSyllables(context) {
    if (!context.syllableCount) {
        context.syllableCount = getWeightedOption([
            [1, 8000],
            [2, 50000],
            [3, 29700],
            [4, 11000],
            [5, 2200],
            [6, 250],
            [7, 50],
        ]);
    }
    while (context.currSyllableIndex < context.syllableCount) {
        const index = context.currSyllableIndex;
        for (;;) {
            let candidate = generateSyllable(context);
            let previous = context.word.syllables[index - 1];
            // The first syllable has no junction to validate.
            let fits = index === 0 ? true : checkCrossSyllableSonority(previous, candidate);
            if (!fits) {
                [previous, candidate] = tryResyllabify(previous, candidate);
                fits = checkCrossSyllableSonority(previous, candidate);
            }
            if (fits) {
                context.word.syllables.push(candidate);
                break;
            }
        }
        context.currSyllableIndex++;
    }
}
/**
 * Generates a word.
 *
 * @param options - Optional settings; all fields are optional:
 *   - `word`: an existing word object to fill in instead of a fresh one.
 *   - `syllableCount`: number of syllables; a falsy value leaves the choice to the generator.
 *   - `seed`: when not `undefined`, a random function created from it by createSeededRandom
 *     is installed with overrideRand for the duration of the call.
 * @returns The word object after the syllable, written-form and pronunciation stages
 *   have run on it, in that order.
 *
 * The random function that was active on entry (as reported by getRand) is always put
 * back with overrideRand before returning, even if a stage throws.
 */
export const generateWord = (options = {}) => {
    const originalRand = getRand();
    const { word, syllableCount, seed } = options;
    const blankWord = {
        syllables: [],
        pronunciation: '',
        written: { clean: '', hyphenated: '' }
    };
    const context = {
        word: word || blankWord,
        syllableCount: syllableCount || 0,
        currSyllableIndex: 0,
    };
    try {
        if (seed !== undefined) {
            overrideRand(createSeededRandom(seed));
        }
        generateSyllables(context);
        generateWrittenForm(context);
        generatePronunciation(context);
        return context.word;
    }
    finally {
        // Put back whatever random function was active before this call.
        overrideRand(originalRand);
    }
};
export default generateWord;