/*
 * phonemize
 * Version:
 * Fast phonemizer with rule-based G2P (grapheme-to-phoneme) prediction. Pure JavaScript implementation.
 * 257 lines (256 loc) • 7.5 kB
 * JavaScript
 */
;
// Flag the exports object with the `__esModule` marker (the convention
// transpilers use to signal that a CommonJS module originated as ES-module code).
Object.defineProperty(exports, "__esModule", { value: true });
// Public API. The three functions are declared further down in this file;
// they can be referenced here because function declarations are hoisted.
exports.expandNumbers = expandNumbers;
exports.expandAbbreviations = expandAbbreviations;
exports.expandText = expandText;
// Words for 0 through 19, keyed by the number itself. The list position of
// each word is its value, so copying the array's indexed entries onto a plain
// object yields the { 0: "zero", ..., 19: "nineteen" } lookup table.
const ONES = Object.assign({}, [
  "zero",
  "one",
  "two",
  "three",
  "four",
  "five",
  "six",
  "seven",
  "eight",
  "nine",
  "ten",
  "eleven",
  "twelve",
  "thirteen",
  "fourteen",
  "fifteen",
  "sixteen",
  "seventeen",
  "eighteen",
  "nineteen",
]);
// Words for the multiples of ten from 20 to 90, keyed by the multiple itself.
// The word at list position i belongs to (i + 2) * 10.
const TENS = [
  "twenty",
  "thirty",
  "forty",
  "fifty",
  "sixty",
  "seventy",
  "eighty",
  "ninety",
].reduce((table, word, position) => {
  table[(position + 2) * 10] = word;
  return table;
}, {});
// Names of the magnitude words, keyed by the magnitude's numeric value.
// Exponent notation keeps the zero counts readable; each computed key
// stringifies to the same plain decimal digits as the written-out number.
const SCALES = {
  [1e2]: "hundred",
  [1e3]: "thousand",
  [1e6]: "million",
  [1e9]: "billion",
  [1e12]: "trillion",
};
/**
 * Spell out an integer in English words, e.g. 342 -> "three hundred forty two".
 *
 * Words are separated by single spaces with no hyphens and no "and".
 * Values of one thousand or more are split recursively on the largest
 * applicable scale (thousand, million, billion, trillion).
 *
 * @param {number} n - Integer to spell out. Negative values are prefixed with
 *   "negative".
 * @returns {string} The spelled-out number. Positive or negative Infinity is
 *   returned as its plain string form because it has no word form.
 */
function numberToWords(n) {
  // Infinity must be rejected up front: Math.floor(Infinity / scale) is still
  // Infinity, so the scale loop below would recurse until the stack overflows.
  if (n === Infinity || n === -Infinity) {
    return String(n);
  }
  if (n === 0) {
    return "zero";
  }
  if (n < 0) {
    return "negative " + numberToWords(-n);
  }
  if (n < 20) {
    return ONES[n] || "";
  }
  if (n < 100) {
    const tens = Math.floor(n / 10) * 10;
    const ones = n % 10;
    return TENS[tens] + (ones > 0 ? " " + ONES[ones] : "");
  }
  if (n < 1000) {
    const hundreds = Math.floor(n / 100);
    const remainder = n % 100;
    return (
      ONES[hundreds] +
      " hundred" +
      (remainder > 0 ? " " + numberToWords(remainder) : "")
    );
  }
  // Larger numbers: peel off the biggest scale that fits, then recurse on the
  // count of that scale and on what is left over.
  const scales = [1000000000000, 1000000000, 1000000, 1000];
  for (const scale of scales) {
    if (n >= scale) {
      const quotient = Math.floor(n / scale);
      const remainder = n % scale;
      return (
        numberToWords(quotient) +
        " " +
        SCALES[scale] +
        (remainder > 0 ? " " + numberToWords(remainder) : "")
      );
    }
  }
  // Only reached when every comparison above is false (e.g. NaN).
  return n.toString();
}
// Final words whose ordinal is not formed by simply appending "th".
const IRREGULAR_ORDINALS = new Map([
  ["one", "first"],
  ["two", "second"],
  ["three", "third"],
  ["five", "fifth"],
  ["eight", "eighth"],
  ["nine", "ninth"],
  ["twelve", "twelfth"],
]);

/**
 * Spell out an integer as an English ordinal, e.g. 21 -> "twenty first".
 *
 * The ordinal is derived from the LAST WORD of the cardinal spelling rather
 * than from the last digit, because the digit alone is ambiguous: 5 and 15
 * both end in the digit 5 but need "fifth" and "fifteenth" respectively.
 *
 * @param {number} n - Integer to convert.
 * @returns {string} The cardinal spelling from numberToWords with its final
 *   word turned into ordinal form.
 */
function ordinalToWords(n) {
  const base = numberToWords(n);
  const lastWordStart = base.lastIndexOf(" ") + 1;
  const head = base.slice(0, lastWordStart);
  const lastWord = base.slice(lastWordStart);

  const irregular = IRREGULAR_ORDINALS.get(lastWord);
  if (irregular !== undefined) {
    return head + irregular;
  }
  // twenty -> twentieth, forty -> fortieth, ...
  if (lastWord.endsWith("y")) {
    return `${head}${lastWord.slice(0, -1)}ieth`;
  }
  // four -> fourth, fifteen -> fifteenth, hundred -> hundredth, ...
  return `${base}th`;
}
// Lower-case abbreviation -> spoken expansion. The table is assembled from
// themed groups; merge order fixes the key order of the resulting object.
const ABBREVIATIONS = Object.assign(
  {},
  // Honorifics and name suffixes
  {
    mr: "mister",
    mrs: "missus",
    ms: "miss",
    dr: "doctor",
    prof: "professor",
    sr: "senior",
    jr: "junior",
  },
  // Clock-time markers, spelled letter by letter
  {
    am: "a m",
    pm: "p m",
  },
  // General, business, address and domain-style shorthand
  {
    etc: "etcetera",
    vs: "versus",
    inc: "incorporated",
    corp: "corporation",
    ltd: "limited",
    co: "company",
    st: "street",
    ave: "avenue",
    blvd: "boulevard",
    rd: "road",
    apt: "apartment",
    dept: "department",
    gov: "government",
    org: "organization",
    edu: "education",
    com: "commercial",
    net: "network",
    info: "information",
  },
);
/**
 * Spell a string of decimal digits one digit at a time,
 * e.g. "405" -> "four zero five".
 */
function spellDigits(digits) {
  return digits
    .split("")
    .map((digit) => ONES[Number.parseInt(digit, 10)])
    .join(" ");
}

/**
 * Replace numeric expressions in `text` with their spoken English form.
 *
 * Passes run from the most specific pattern to the most general one so that a
 * general pass cannot consume part of a more specific expression:
 *   1. currency      "$1,234.56"
 *   2. phone numbers "555-123-4567", "(555) 123-4567"
 *   3. percentages   "50%", "3.5%"
 *   4. years         1800-2099
 *   5. clock times   "12:34", "1:30 PM"
 *   6. ordinals      "1st", "22nd"
 *   7. decimals      "3.14"
 *   8. plain digit runs
 *
 * @param {string} text - Input text.
 * @returns {string} The text with numeric expressions spelled out.
 */
function expandNumbers(text) {
  // 1. Currency.
  text = text.replace(/\$(\d+(?:,\d{3})*(?:\.\d{2})?)/g, (_, amount) => {
    const num = Number.parseFloat(amount.replace(/,/g, ""));
    const dollars = Math.floor(num);
    // Round to absorb binary floating-point error (e.g. 0.29 * 100).
    const cents = Math.round((num - dollars) * 100);
    let result = "";
    if (dollars > 0) {
      result +=
        numberToWords(dollars) + (dollars === 1 ? " dollar" : " dollars");
    }
    if (cents > 0) {
      if (dollars > 0) {
        result += " and ";
      }
      result += numberToWords(cents) + (cents === 1 ? " cent" : " cents");
    }
    return result || "zero dollars";
  });

  // 2. Phone numbers, read digit by digit. This must run before the year pass,
  // which would otherwise rewrite a year-like last group ("555-123-2024").
  // The word boundary applies only to the bare-digit form: "(" is not a word
  // character, so a boundary in front of it would reject " (555) 123-4567".
  text = text.replace(/(?:\(\d{3}\)\s?|\b\d{3}-)\d{3}-\d{4}\b/g, (match) =>
    spellDigits(match.replace(/\D/g, "")),
  );

  // 3. Percentages. This must run before the year and decimal passes, which
  // would otherwise consume the number and leave a stray "%" behind.
  text = text.replace(/\b(\d+)(?:\.(\d+))?%/g, (_, whole, fraction) => {
    const wholeWords = numberToWords(Number.parseInt(whole, 10));
    const fractionWords =
      fraction === undefined ? "" : ` point ${spellDigits(fraction)}`;
    return `${wholeWords}${fractionWords} percent`;
  });

  // 4. Years (1800-2099), read as two pairs: "nineteen ninety nine",
  // "twenty oh five", and "nineteen hundred" for an even century.
  text = text.replace(/\b(1[89]\d{2}|20\d{2})\b/g, (match) => {
    const year = Number.parseInt(match, 10);
    const century = numberToWords(Math.floor(year / 100));
    const remainder = year % 100;
    if (remainder === 0) {
      return `${century} hundred`;
    }
    if (remainder < 10) {
      return `${century} oh ${ONES[remainder]}`;
    }
    return `${century} ${numberToWords(remainder)}`;
  });

  // 5. Clock times (12:34, 1:30 AM, ...), spoken on a 12-hour clock.
  text = text.replace(
    /\b(\d{1,2}):(\d{2})(?:\s*(am|pm))?\b/gi,
    (_, hours, minutes, ampm) => {
      const h = Number.parseInt(hours, 10);
      const m = Number.parseInt(minutes, 10);
      let result = numberToWords(h === 0 ? 12 : h > 12 ? h - 12 : h);
      if (m === 0) {
        result += " o'clock";
      } else if (m < 10) {
        result += " oh " + numberToWords(m);
      } else {
        result += " " + numberToWords(m);
      }
      if (ampm) {
        // "PM" -> "p m": lower-case and spell the letters apart.
        result += " " + ampm.toLowerCase().replace(/(\w)/g, "$1 ").trim();
      }
      return result;
    },
  );

  // 6. Ordinals (1st, 2nd, 3rd, ...).
  text = text.replace(/\b(\d+)(?:st|nd|rd|th)\b/gi, (_, num) =>
    ordinalToWords(Number.parseInt(num, 10)),
  );

  // 7. Decimals: whole part as a number, fractional part digit by digit.
  text = text.replace(
    /\b(\d+)\.(\d+)\b/g,
    (_, whole, decimal) =>
      `${numberToWords(Number.parseInt(whole, 10))} point ${spellDigits(decimal)}`,
  );

  // 8. Remaining digit runs. Runs too long to be represented exactly as a
  // number are read digit by digit instead of as a silently wrong value.
  text = text.replace(/\b\d+\b/g, (match) => {
    const value = Number.parseInt(match, 10);
    return Number.isSafeInteger(value)
      ? numberToWords(value)
      : spellDigits(match);
  });

  return text;
}
/**
 * Replace known abbreviations in `text` with their spoken expansion.
 *
 * Two passes are made: first over letter runs followed by a period ("Dr." ->
 * "doctor", the period is consumed), then over whitespace-separated words
 * whose letters, ignoring punctuation, match a table key ("vs" -> "versus").
 * Matching is case-insensitive and context-free: every word that matches a
 * key of ABBREVIATIONS is rewritten. Because the second pass splits on
 * whitespace and re-joins with single spaces, runs of whitespace in the
 * input are collapsed.
 *
 * @param {string} text - Input text.
 * @returns {string} The text with abbreviations expanded.
 */
function expandAbbreviations(text) {
  // Only OWN keys of the table count. A plain property read would also find
  // inherited members such as "constructor" or "__proto__" and splice their
  // string form into the output.
  const lookup = (key) =>
    Object.prototype.hasOwnProperty.call(ABBREVIATIONS, key)
      ? ABBREVIATIONS[key]
      : undefined;

  // Pass 1: abbreviations written with a trailing period.
  const withoutDotted = text.replace(/\b([a-z]+)\./gi, (match, abbr) => {
    const expansion = lookup(abbr.toLowerCase());
    return expansion === undefined ? match : expansion;
  });

  // Pass 2: bare abbreviations, compared with surrounding punctuation removed
  // but replaced in place so that the punctuation is kept.
  const expandedWords = withoutDotted.split(/\s+/).map((word) => {
    const clean = word.toLowerCase().replace(/[^\w]/g, "");
    const expansion = lookup(clean);
    if (expansion === undefined) {
      return word;
    }
    // `clean` contains only word characters, so it is safe as a regex source.
    return word.replace(new RegExp(clean, "gi"), expansion);
  });
  return expandedWords.join(" ");
}
/**
 * Run the full text-expansion pipeline: abbreviations are expanded first,
 * numbers second, and finally every run of whitespace is squeezed to a single
 * space with leading and trailing whitespace removed.
 *
 * @param {string} text - Input text.
 * @returns {string} The expanded, whitespace-normalized text.
 */
function expandText(text) {
  const withAbbreviationsExpanded = expandAbbreviations(text);
  const withNumbersExpanded = expandNumbers(withAbbreviationsExpanded);
  return withNumbersExpanded.replace(/\s+/g, " ").trim();
}