phonemize

Version:

Fast phonemizer with rule-based G2P prediction. Pure JavaScript implementation.

99 lines (98 loc) • 2.99 kB

TypeScript

/**
 * Text tokenization and phoneme processing system.
 * Handles language detection, preprocessing, and format conversion.
 */

/**
 * Configuration options for tokenizer behavior.
 */
export interface TokenizerOptions {
  /** Custom pronunciation overrides. */
  homograph?: Record<string, string>;

  /** Remove stress markers from output. */
  stripStress?: boolean;

  /**
   * Output format (IPA, ARPABET, or Zhuyin).
   *
   * Note: non-Chinese text in zhuyin format will be converted to IPA.
   */
  format?: "ipa" | "arpabet" | "zhuyin";

  /** Token separator in output string. */
  separator?: string;

  /** Convert non-Latin text to ASCII approximation. */
  anyAscii?: boolean;

  /**
   * Chinese tone format: 'unicode' (˧˩˧) or 'arrow' (↓↗↘→).
   * Only applies when format is 'ipa'.
   */
  toneFormat?: "unicode" | "arrow";
}

/**
 * Individual phoneme token with metadata.
 */
export interface PhonemeToken {
  /** IPA or ARPABET phoneme string. */
  phoneme: string;

  /** Original word/text. */
  word: string;

  /** Position in original text. */
  position: number;
}

/**
 * Language segment for multilingual processing.
 * Module-private: not exported.
 */
interface LanguageSegment {
  text: string;
  language: string;
  startIndex: number;
}

/**
 * Preprocessing result with language information.
 * Module-private: not exported.
 */
interface PreprocessResult {
  text: string;
  languageMap: Record<string, string>;
  segments: LanguageSegment[];
}

/**
 * Main tokenizer class for phoneme processing.
 */
export declare class Tokenizer {
  /**
   * Tokenizer options with every field present (`Required<TokenizerOptions>`).
   * NOTE(review): the default values used for omitted fields are not visible
   * in this declaration — confirm against the implementation.
   */
  protected readonly options: Required<TokenizerOptions>;

  /**
   * @param options - Optional tokenizer configuration.
   */
  constructor(options?: TokenizerOptions);

  /**
   * Preprocess text with language detection and segmentation.
   *
   * @param text - Input text.
   * @returns The preprocessed text together with its language map and segments.
   */
  protected _preprocess(text: string): PreprocessResult;

  /** Segment text by character-level language detection. */
  private _segmentByLanguage;

  /** Fast character-level language detection. */
  private _detectCharLanguage;

  /**
   * Post-process phonemes for format conversion and cleanup.
   *
   * @param phonemes - Phoneme string to post-process.
   * @returns The post-processed phoneme string.
   */
  protected _postProcess(phonemes: string): string;

  /**
   * Core tokenization method - converts text to phoneme array.
   *
   * @param text - Input text.
   * @returns Array of phoneme strings.
   */
  tokenize(text: string): string[];

  /** Smart tokenization using efficient regex patterns. */
  private _smartTokenize;

  /**
   * Convert text to phoneme string with specified separator.
   *
   * @param text - Input text.
   * @returns Phoneme string.
   */
  tokenizeToString(text: string): string;

  /**
   * Convert text to detailed phoneme tokens with metadata.
   *
   * @param text - Input text.
   * @returns Phoneme tokens carrying the phoneme, original word and position.
   */
  tokenizeToTokens(text: string): PhonemeToken[];
}

/**
 * Convert text to phoneme tokens.
 *
 * @param text - Input text.
 * @param _g2pPredict - Deprecated parameter. Typed `unknown` so any legacy
 *   argument is still accepted. NOTE(review): presumably ignored by the
 *   implementation — confirm.
 * @param options - Optional tokenizer configuration.
 * @returns Phoneme tokens with metadata.
 */
export declare function tokenizeText(
  text: string,
  _g2pPredict: unknown,
  options?: TokenizerOptions,
): PhonemeToken[];

/**
 * Convert text to a phoneme string.
 * NOTE(review): the name suggests IPA output; the implementation is not
 * visible here — confirm.
 *
 * @param text - Input text.
 * @param _g2pPredict - Deprecated parameter. Typed `unknown` so any legacy
 *   argument is still accepted. NOTE(review): presumably ignored by the
 *   implementation — confirm.
 * @param options - Optional tokenizer configuration.
 * @returns Phoneme string.
 */
export declare function textToIPA(
  text: string,
  _g2pPredict: unknown,
  options?: TokenizerOptions,
): string;

/**
 * Convert text to a phoneme string.
 * NOTE(review): the name suggests ARPABET output; the implementation is not
 * visible here — confirm.
 *
 * @param text - Input text.
 * @param _g2pPredict - Deprecated parameter. Typed `unknown` so any legacy
 *   argument is still accepted. NOTE(review): presumably ignored by the
 *   implementation — confirm.
 * @param options - Optional tokenizer configuration.
 * @returns Phoneme string.
 */
export declare function textToARPABET(
  text: string,
  _g2pPredict: unknown,
  options?: TokenizerOptions,
): string;

export {};