phonemize
Version:
Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.
99 lines (98 loc) • 2.99 kB
TypeScript
/**
* Text tokenization and phoneme processing system
* Handles language detection, preprocessing, and format conversion
*/
/**
 * Options accepted by the tokenizer. Every field is optional.
 */
export interface TokenizerOptions {
    /**
     * Custom pronunciation overrides, as a string-to-string map.
     * NOTE(review): presumably keyed by word with the pronunciation as value — confirm.
     */
    homograph?: Record<string, string>;
    /** When true, stress markers are removed from the output. */
    stripStress?: boolean;
    /**
     * Phoneme notation used for the output: IPA, ARPABET, or Zhuyin.
     *
     * Note: with `"zhuyin"`, non-Chinese text is converted to IPA.
     */
    format?: "ipa" | "arpabet" | "zhuyin";
    /** String placed between tokens in the output string. */
    separator?: string;
    /** When true, non-Latin text is converted to an ASCII approximation. */
    anyAscii?: boolean;
    /**
     * Chinese tone notation: `"unicode"` (˧˩˧) or `"arrow"` (↓↗↘→).
     * Only applies when `format` is `"ipa"`.
     */
    toneFormat?: "unicode" | "arrow";
}
/**
 * A single phoneme token together with metadata about where it came from.
 */
export interface PhonemeToken {
    /** Phoneme string, in IPA or ARPABET notation. */
    phoneme: string;
    /** The original word/text this token was produced from. */
    word: string;
    /**
     * Position in the original text.
     * NOTE(review): the unit (character offset vs. token index) is not visible here — confirm.
     */
    position: number;
}
/**
 * One language segment, used for multilingual processing.
 */
interface LanguageSegment {
    /** Text belonging to this segment. */
    text: string;
    /**
     * Language associated with this segment.
     * NOTE(review): the identifier format (e.g. language code) is not visible here — confirm.
     */
    language: string;
    /**
     * Start index of the segment.
     * NOTE(review): presumably an offset into the source text — confirm.
     */
    startIndex: number;
}
/**
 * Result of preprocessing, carrying language information alongside the text.
 */
interface PreprocessResult {
    /** Text produced by preprocessing. */
    text: string;
    /**
     * String-to-string language lookup.
     * NOTE(review): what the keys represent is not visible here — confirm against the implementation.
     */
    languageMap: Record<string, string>;
    /** Language segments found in the text. */
    segments: LanguageSegment[];
}
/**
 * Main tokenizer class: turns text into phonemes.
 *
 * Public entry points are `tokenize`, `tokenizeToString` and
 * `tokenizeToTokens`. The `protected` members are visible to subclasses.
 */
export declare class Tokenizer {
    /**
     * Options with every `TokenizerOptions` field present.
     * NOTE(review): presumably omitted constructor options are filled with
     * defaults — confirm against the implementation.
     */
    protected readonly options: Required<TokenizerOptions>;

    /**
     * @param options - Optional tokenizer configuration.
     */
    constructor(options?: TokenizerOptions);

    /**
     * Preprocesses text, performing language detection and segmentation.
     *
     * @param text - Input text.
     * @returns The preprocessing result with language information.
     */
    protected _preprocess(text: string): PreprocessResult;

    /** Segments text using character-level language detection. */
    private _segmentByLanguage;

    /** Fast language detection at the character level. */
    private _detectCharLanguage;

    /**
     * Post-processes phonemes: format conversion and cleanup.
     *
     * @param phonemes - Phoneme string to post-process.
     * @returns The post-processed phoneme string.
     */
    protected _postProcess(phonemes: string): string;

    /**
     * Core tokenization method: converts text to an array of phonemes.
     *
     * @param text - Input text.
     * @returns Array of phoneme strings.
     */
    tokenize(text: string): string[];

    /** Smart tokenization based on efficient regex patterns. */
    private _smartTokenize;

    /**
     * Converts text to a single phoneme string, using the specified separator.
     *
     * @param text - Input text.
     * @returns The phoneme string.
     */
    tokenizeToString(text: string): string;

    /**
     * Converts text to detailed phoneme tokens that include metadata.
     *
     * @param text - Input text.
     * @returns Array of phoneme tokens.
     */
    tokenizeToTokens(text: string): PhonemeToken[];
}
/**
 * Converts text to an array of phoneme tokens.
 *
 * @param text - Input text.
 * @param _g2pPredict - Deprecated parameter; still part of the positional
 *   signature. Typed `unknown` so any value is accepted without disabling
 *   type checking the way `any` does.
 * @param options - Optional tokenizer configuration.
 * @returns Array of phoneme tokens.
 */
export declare function tokenizeText(text: string, _g2pPredict: unknown, options?: TokenizerOptions): PhonemeToken[];
/**
 * Converts text to a phoneme string.
 *
 * NOTE(review): the name indicates IPA output — confirm against the implementation.
 *
 * @param text - Input text.
 * @param _g2pPredict - Deprecated parameter; still part of the positional
 *   signature. Typed `unknown` so any value is accepted without disabling
 *   type checking the way `any` does.
 * @param options - Optional tokenizer configuration.
 * @returns The resulting phoneme string.
 */
export declare function textToIPA(text: string, _g2pPredict: unknown, options?: TokenizerOptions): string;
/**
 * Converts text to a phoneme string.
 *
 * NOTE(review): the name indicates ARPABET output — confirm against the implementation.
 *
 * @param text - Input text.
 * @param _g2pPredict - Deprecated parameter; still part of the positional
 *   signature. Typed `unknown` so any value is accepted without disabling
 *   type checking the way `any` does.
 * @param options - Optional tokenizer configuration.
 * @returns The resulting phoneme string.
 */
export declare function textToARPABET(text: string, _g2pPredict: unknown, options?: TokenizerOptions): string;
export {};