phonemize

Version:

Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.

100 lines (99 loc) • 2.79 kB

TypeScript

/**
 * Type declarations for the text tokenization and phoneme processing system.
 * Covers language detection, preprocessing, and output format conversion.
 */

/**
 * Settings that control how the tokenizer behaves.
 */
export interface TokenizerOptions {
  /** When set, stress markers are removed from the output. */
  stripStress?: boolean;

  /**
   * Notation used for the output: IPA, ARPABET, or Zhuyin.
   *
   * Note: with the "zhuyin" format, non-Chinese text is converted to IPA.
   */
  format?: "ipa" | "arpabet" | "zhuyin";

  /** String used to separate tokens in the output string. */
  separator?: string;

  /** When set, non-Latin text is converted to an ASCII approximation. */
  anyAscii?: boolean;

  /**
   * Chinese tone notation: 'unicode' (˧˩˧) or 'arrow' (↓↗↘→).
   * Only applies when `format` is 'ipa'.
   */
  toneFormat?: "unicode" | "arrow";
}

/**
 * A single phoneme token together with its metadata.
 */
export interface PhonemeToken {
  /** Phoneme string, in IPA or ARPABET. */
  phoneme: string;

  /** The original word/text this token was produced from. */
  word: string;

  /** Position in the original text. */
  position: number;
}

/**
 * One language segment, used for multilingual processing.
 */
interface LanguageSegment {
  text: string;
  language: string;
  startIndex: number;
}

/**
 * Result of preprocessing, carrying language information alongside the text.
 */
interface PreprocessResult {
  text: string;
  languageMap: Record<string, string>;
  segments: LanguageSegment[];
}

/**
 * Main tokenizer class for phoneme processing.
 */
export declare class Tokenizer {
  /** Tokenizer options with every field present (`Required<TokenizerOptions>`). */
  protected readonly options: Required<TokenizerOptions>;

  /**
   * @param options - Optional tokenizer settings.
   */
  constructor(options?: TokenizerOptions);

  /**
   * Preprocesses text, performing language detection and segmentation.
   *
   * @param text - Text to preprocess.
   * @returns The preprocessed text with its language map and segments.
   */
  protected _preprocess(text: string): PreprocessResult;

  /** Detects the language of words and builds character-level segments. */
  private _detectLanguagesAndSegment;

  /** Applies the anyAscii conversion while preserving Chinese text. */
  private _applyAnyAscii;

  /** Fast language detection at the character level. */
  private _detectCharLanguage;

  /**
   * Post-processes phonemes for format conversion and cleanup.
   *
   * @param phonemes - Phoneme string to post-process.
   * @returns The post-processed phoneme string.
   */
  protected _postProcess(phonemes: string): string;

  /** Private member; its signature is not exposed in this declaration. */
  private _predict;

  /**
   * Core token processing routine shared by both simple and detailed
   * tokenization.
   */
  private _processTokens;

  /**
   * Core tokenization method: converts text to an array of phonemes.
   *
   * @param text - Text to tokenize.
   * @returns Array of phoneme strings.
   */
  tokenize(text: string): string[];

  /** Smart tokenization using efficient regex patterns. */
  private _smartTokenize;

  /**
   * Converts text to a phoneme string joined with the specified separator.
   *
   * @param text - Text to tokenize.
   * @returns The phoneme string.
   */
  tokenizeToString(text: string): string;

  /**
   * Converts text to detailed phoneme tokens with metadata.
   *
   * @param text - Text to tokenize.
   * @returns Array of phoneme tokens.
   */
  tokenizeToTokens(text: string): PhonemeToken[];
}

export {};