phonemize
Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.
421 lines (420 loc) • 17.4 kB
JavaScript
"use strict";
/**
* Text tokenization and phoneme processing system
* Handles language detection, preprocessing, and format conversion
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Tokenizer = void 0;
exports.tokenizeText = tokenizeText;
exports.textToIPA = textToIPA;
exports.textToARPABET = textToARPABET;
const any_ascii_1 = __importDefault(require("any-ascii"));
const g2p_1 = require("./g2p");
const expand_1 = require("./expand");
const pos_tagger_1 = require("./pos-tagger");
const consts_1 = require("./consts");
const multilingual_processor_1 = require("./multilingual-processor");
const zh_g2p_1 = require("./zh-g2p");
const utils_1 = require("./utils");
/**
* Fast ARPABET to IPA conversion for legacy compatibility
*/
function arpabetToIpa(arpabet) {
var _a;
const stress = (_a = arpabet.match(/[012]$/)) === null || _a === void 0 ? void 0 : _a[0];
const arpabetWithoutStress = arpabet.replace(/[012]$/, "");
const ipa = consts_1.ARPABET_TO_IPA[arpabetWithoutStress];
return stress ? `${consts_1.IPA_STRESS_MAP[stress]}${ipa}` : ipa;
}
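// Illustrative only - actual values depend on the ARPABET_TO_IPA and IPA_STRESS_MAP tables in ./consts:
//   arpabetToIpa("HH")  → e.g. "h"
//   arpabetToIpa("IY1") → e.g. "ˈi" (the trailing stress digit is mapped to an IPA stress mark)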
/**
* Main tokenizer class for phoneme processing
*/
class Tokenizer {
constructor(options = {}) {
this.options = Object.assign({ stripStress: false, format: "ipa", separator: " ", anyAscii: false, homograph: {}, toneFormat: "unicode" }, options);
}
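    // Usage sketch: every option is optional and is merged over the defaults above, e.g.
    //   new Tokenizer({ format: "arpabet", stripStress: true, separator: "-" })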
/**
* Preprocess text with language detection and segmentation
*/
_preprocess(text) {
const segments = this._segmentByLanguage(text);
if (!this.options.anyAscii) {
return {
text,
languageMap: {},
segments,
};
}
// Apply anyAscii conversion while preserving Chinese for G2P
const words = text.split(/(\s+)/);
const languageMap = {};
let processedText = '';
for (const word of words) {
const trimmed = word.trim();
if (trimmed && !consts_1.PUNCTUATION.includes(trimmed)) {
const detectedLang = (0, multilingual_processor_1.detectLanguage)(trimmed);
if (detectedLang) {
if (detectedLang === 'zh' && zh_g2p_1.chineseG2P.isChineseText(trimmed)) {
// Preserve Chinese text for G2P processing
processedText += word;
languageMap[trimmed.toLowerCase()] = detectedLang;
}
else {
// Convert non-Chinese multilingual text to ASCII
const asciiWord = (0, any_ascii_1.default)(trimmed);
processedText += word.replace(trimmed, asciiWord);
languageMap[asciiWord.toLowerCase()] = detectedLang;
}
}
else {
// Convert non-multilingual text to ASCII
processedText += (0, any_ascii_1.default)(word);
}
}
else {
// Preserve whitespace and punctuation
processedText += word;
}
}
return {
text: processedText,
languageMap,
segments,
};
}
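    // Illustrative: with { anyAscii: true }, _preprocess("café 世界") converts "café" to "cafe"
    // via any-ascii, keeps "世界" intact for Chinese G2P, and records detected languages for
    // multilingual words in languageMap (assuming detectLanguage reports "世界" as "zh").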
/**
* Segment text by character-level language detection
*/
_segmentByLanguage(text) {
const segments = [];
let currentSegment = '';
let currentLanguage = '';
let segmentStartIndex = 0;
for (let i = 0; i < text.length; i++) {
const char = text[i];
const charLang = this._detectCharLanguage(char);
if (charLang !== currentLanguage) {
// Language changed - save current segment if not empty
if (currentSegment.trim()) {
segments.push({
text: currentSegment,
language: currentLanguage || 'en',
startIndex: segmentStartIndex
});
}
// Start new segment
currentSegment = char;
currentLanguage = charLang;
segmentStartIndex = i;
}
else {
currentSegment += char;
}
}
// Add final segment
if (currentSegment.trim()) {
segments.push({
text: currentSegment,
language: currentLanguage || 'en',
startIndex: segmentStartIndex
});
}
return segments;
}
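    // Illustrative: _segmentByLanguage("hello 世界") →
    //   [{ text: "hello ", language: "en", startIndex: 0 },
    //    { text: "世界", language: "zh", startIndex: 6 }]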
/**
* Fast character-level language detection
*/
_detectCharLanguage(char) {
const code = char.charCodeAt(0);
// Chinese (CJK) - most common ranges first
if ((code >= 0x4e00 && code <= 0x9fff) || // CJK Unified Ideographs
(code >= 0x3400 && code <= 0x4dbf) || // CJK Extension A
(code >= 0x20000 && code <= 0x2a6df)) { // CJK Extension B
return 'zh';
}
// Japanese
if ((code >= 0x3040 && code <= 0x309f) || // Hiragana
(code >= 0x30a0 && code <= 0x30ff)) { // Katakana
return 'ja';
}
// Korean
if ((code >= 0xac00 && code <= 0xd7af) || // Hangul Syllables
(code >= 0x1100 && code <= 0x11ff) || // Hangul Jamo
(code >= 0x3130 && code <= 0x318f)) { // Hangul Compatibility Jamo
return 'ko';
}
// Thai
if (code >= 0x0e00 && code <= 0x0e7f) {
return 'th';
}
// Arabic
if ((code >= 0x0600 && code <= 0x06ff) || // Arabic
(code >= 0x0750 && code <= 0x077f) || // Arabic Supplement
(code >= 0xfb50 && code <= 0xfdff) || // Arabic Presentation Forms-A
(code >= 0xfe70 && code <= 0xfeff)) { // Arabic Presentation Forms-B
return 'ar';
}
// Cyrillic (Russian, etc.)
if (code >= 0x0400 && code <= 0x04ff) {
return 'ru';
}
// Default to English/Latin
return 'en';
}
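    // Illustrative: _detectCharLanguage("界") → "zh", "あ" → "ja", "한" → "ko", "ج" → "ar", "ж" → "ru", "x" → "en"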
/**
* Post-process phonemes for format conversion and cleanup
*/
_postProcess(phonemes) {
if (this.options.format === "arpabet") {
// Convert to ARPABET format
phonemes = (0, utils_1.ipaToArpabet)(phonemes);
// Remove ARPABET stress markers if requested
if (this.options.stripStress) {
phonemes = phonemes.replace(/[012]/g, "");
}
}
else if (this.options.format === "zhuyin") {
// Zhuyin format processing - handled per token, not here
// This is a placeholder for any global zhuyin post-processing
// The actual conversion happens in the tokenize method
return phonemes;
}
else {
// IPA format processing
// Convert Chinese tone format if requested
if (this.options.toneFormat === "arrow") {
phonemes = (0, utils_1.convertChineseTonesToArrows)(phonemes);
}
// Remove IPA stress markers if requested
if (this.options.stripStress) {
phonemes = phonemes.replace(/[ˈˌ]/g, "");
}
}
return phonemes;
}
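    // Illustrative: with { format: "arpabet", stripStress: true } an IPA string such as "həˈloʊ"
    // is mapped through ipaToArpabet and any 0/1/2 stress digits are then removed; with
    // { format: "ipa", toneFormat: "arrow" } Chinese tones are rewritten via convertChineseTonesToArrows.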
/**
* Core tokenization method - converts text to phoneme array
*/
tokenize(text) {
var _a, _b;
if (!(text === null || text === void 0 ? void 0 : text.trim()))
return [];
const { text: processedText, languageMap } = this._preprocess(text);
const expandedText = (0, expand_1.expandText)(processedText);
// Improved tokenization for better Chinese word preservation
const tokens = this._smartTokenize(expandedText);
// Get POS tags for homograph disambiguation
const cleanWords = tokens.filter(token => token.trim() && !consts_1.PUNCTUATION.includes(token.trim()));
const posResults = pos_tagger_1.simplePOSTagger.tagWords(cleanWords);
const phonemes = [];
let cleanWordIndex = 0;
for (const token of tokens) {
const cleanToken = token.trim();
// Handle punctuation - preserve it
if (consts_1.PUNCTUATION.includes(cleanToken)) {
phonemes.push(cleanToken);
continue;
}
// Get POS tag for homograph disambiguation
const pos = (_a = posResults[cleanWordIndex]) === null || _a === void 0 ? void 0 : _a.pos;
cleanWordIndex++;
// Check for custom pronunciations
const customPronunciation = (_b = this.options.homograph) === null || _b === void 0 ? void 0 : _b[cleanToken.toLowerCase()];
if (customPronunciation) {
let processed = this._postProcess(customPronunciation);
// Apply custom separator to individual phonemes if needed
if (this.options.separator !== " ") {
processed = processed.split(' ').join(this.options.separator);
}
phonemes.push(processed);
continue;
}
// Check language map for multilingual words
const detectedLanguage = languageMap[cleanToken.toLowerCase()];
// Handle Zhuyin format specially
if (this.options.format === "zhuyin") {
let pronunciation;
// Check if it's Chinese text
if (zh_g2p_1.chineseG2P.isChineseText(cleanToken)) {
// Convert Chinese to Zhuyin
pronunciation = zh_g2p_1.chineseG2P.textToZhuyin(cleanToken);
}
else {
// Convert non-Chinese to IPA as fallback
pronunciation = (0, g2p_1.predict)(cleanToken, pos, detectedLanguage);
// Apply IPA post-processing but not tone format conversion
if (this.options.stripStress) {
pronunciation = pronunciation.replace(/[ˈˌ]/g, "");
}
}
// Apply custom separator
if (this.options.separator !== " ") {
pronunciation = pronunciation.split(' ').join(this.options.separator);
}
phonemes.push(pronunciation);
}
else {
// Regular IPA/ARPABET processing
let pronunciation = (0, g2p_1.predict)(cleanToken, pos, detectedLanguage);
pronunciation = this._postProcess(pronunciation);
// Apply custom separator to individual phonemes if needed
if (this.options.separator !== " ") {
pronunciation = pronunciation.split(' ').join(this.options.separator);
}
phonemes.push(pronunciation);
}
}
return phonemes;
}
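    // Usage sketch (phoneme strings are approximate - the exact output depends on ./g2p):
    //   new Tokenizer().tokenize("Hello, world!") → something like ["həˈloʊ", ",", "wɝld", "!"]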
/**
* Smart tokenization using efficient regex patterns
*/
_smartTokenize(text) {
        const tokenRegex = /([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]+|\w+['‘’]?\w*|[^\w\s\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])/g;
const tokens = [];
let match;
while ((match = tokenRegex.exec(text)) !== null) {
const token = match[1];
// Skip pure whitespace tokens
if (/^\s+$/.test(token)) {
continue;
}
// Handle punctuation - only add if it's in our known punctuation list
if (token.length === 1 && consts_1.PUNCTUATION.includes(token)) {
tokens.push(token);
continue;
}
// Add word tokens (Chinese, English, numbers, contractions, etc.)
if (token.trim()) {
tokens.push(token.trim());
}
}
return tokens;
}
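    // Illustrative: _smartTokenize("Hi, 世界!") → ["Hi", ",", "世界", "!"]
    // (assuming "," and "!" appear in the PUNCTUATION list from ./consts)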
/**
* Convert text to phoneme string with specified separator
*/
tokenizeToString(text) {
const phonemes = this.tokenize(text);
// Join phonemes, handling punctuation attachment properly
const result = [];
for (let i = 0; i < phonemes.length; i++) {
const phoneme = phonemes[i];
if (consts_1.PUNCTUATION.includes(phoneme)) {
// Attach punctuation to previous phoneme without space
if (result.length > 0) {
result[result.length - 1] += phoneme;
}
else {
result.push(phoneme);
}
}
else {
// For custom separators, split phonemes into characters
if (this.options.separator !== " ") {
result.push(phoneme.split('').join(this.options.separator));
}
else {
result.push(phoneme);
}
}
}
        // Word-level phonemes are always joined with a space; custom separators apply within words above
        return result.join(" ");
}
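    // Usage sketch: new Tokenizer().tokenizeToString("Hello, world") returns a single string,
    // roughly "həˈloʊ, wɝld" - punctuation is attached to the preceding word without a space.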
/**
* Convert text to detailed phoneme tokens with metadata
*/
tokenizeToTokens(text) {
var _a, _b;
if (!(text === null || text === void 0 ? void 0 : text.trim()))
return [];
const { text: processedText, languageMap } = this._preprocess(text);
const expandedText = (0, expand_1.expandText)(processedText);
// Use regex to get tokens with their positions in original text
        const tokenRegex = /([\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]+|\w+['‘’]?\w*|[^\w\s\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff])/g;
const tokenMatches = [];
let match;
while ((match = tokenRegex.exec(expandedText)) !== null) {
const token = match[1];
// Skip pure whitespace tokens
if (/^\s+$/.test(token)) {
continue;
}
// Only process non-whitespace tokens
if (token.trim()) {
tokenMatches.push({
token: token.trim(),
position: match.index
});
}
}
// Get POS tags for homograph disambiguation
const cleanWords = tokenMatches.filter(({ token }) => !consts_1.PUNCTUATION.includes(token));
const posResults = pos_tagger_1.simplePOSTagger.tagWords(cleanWords.map(({ token }) => token));
const results = [];
let cleanWordIndex = 0;
for (const { token, position } of tokenMatches) {
if (!consts_1.PUNCTUATION.includes(token)) {
// Get POS tag for homograph disambiguation
const pos = (_a = posResults[cleanWordIndex]) === null || _a === void 0 ? void 0 : _a.pos;
cleanWordIndex++;
// Check for custom pronunciations
const customPronunciation = (_b = this.options.homograph) === null || _b === void 0 ? void 0 : _b[token.toLowerCase()];
let phoneme;
if (customPronunciation) {
phoneme = this._postProcess(customPronunciation);
}
else {
// Check language map for multilingual words
const detectedLanguage = languageMap[token.toLowerCase()];
// Handle Zhuyin format specially
if (this.options.format === "zhuyin") {
if (zh_g2p_1.chineseG2P.isChineseText(token)) {
// Convert Chinese to Zhuyin
phoneme = zh_g2p_1.chineseG2P.textToZhuyin(token);
}
else {
// Convert non-Chinese to IPA as fallback
phoneme = (0, g2p_1.predict)(token, pos, detectedLanguage);
// Apply IPA post-processing but not tone format conversion
if (this.options.stripStress) {
phoneme = phoneme.replace(/[ˈˌ]/g, "");
}
}
}
else {
// Regular IPA/ARPABET processing
const pronunciation = (0, g2p_1.predict)(token, pos, detectedLanguage);
phoneme = this._postProcess(pronunciation);
}
}
results.push({
phoneme,
word: token,
position
});
}
}
return results;
}
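    // Usage sketch (phoneme strings depend on ./g2p; positions index into the expanded text):
    //   new Tokenizer().tokenizeToTokens("Hi there") →
    //   [{ phoneme: "...", word: "Hi", position: 0 }, { phoneme: "...", word: "there", position: 3 }]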
}
exports.Tokenizer = Tokenizer;
// Legacy function exports for backward compatibility
function tokenizeText(text, _g2pPredict, // Deprecated parameter
options = {}) {
const tokenizer = new Tokenizer(options);
return tokenizer.tokenizeToTokens(text);
}
function textToIPA(text, _g2pPredict, // Deprecated parameter
options = {}) {
const tokenizer = new Tokenizer(Object.assign(Object.assign({}, options), { format: "ipa" }));
return tokenizer.tokenizeToString(text);
}
function textToARPABET(text, _g2pPredict, // Deprecated parameter
options = {}) {
const tokenizer = new Tokenizer(Object.assign(Object.assign({}, options), { format: "arpabet" }));
return tokenizer.tokenizeToString(text);
}
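// Usage sketch for the legacy helpers (the second argument is deprecated and ignored):
//   textToIPA("Hello world", undefined, { stripStress: true });
//   textToARPABET("3 cats", undefined, { separator: "-" });
//   tokenizeText("Hello world", undefined, {}); // → array of { phoneme, word, position } tokens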