pangu

Version: 7.2.0

Paranoid text spacing for good readability, to automatically insert whitespace between CJK (Chinese, Japanese, Korean) and half-width characters (alphabetical letters, numerical digits and symbols).

github.com/vinta/pangu.js

vinta/pangu.js

498 lines (407 loc) • 23.5 kB

text/typescript

// CJK is short for Chinese, Japanese, and Korean:
//   \u2e80-\u2eff CJK Radicals Supplement
//   \u2f00-\u2fdf Kangxi Radicals
//   \u3040-\u309f Hiragana
//   \u30a0-\u30ff Katakana
//   \u3100-\u312f Bopomofo
//   \u3200-\u32ff Enclosed CJK Letters and Months
//   \u3400-\u4dbf CJK Unified Ideographs Extension A
//   \u4e00-\u9fff CJK Unified Ideographs
//   \uf900-\ufaff CJK Compatibility Ideographs
//
// ANS is short for Alphabets, Numbers, and Symbols:
//   A includes A-Za-z\u0370-\u03ff
//   N includes 0-9
//   S includes `~!@#$%^&*()-_=+[]{}\|;:'",<.>/?
//
// All J below does not include \u30fb
// Some S below does not include all symbols
//
// For more information about Unicode blocks, see
// https://symbl.cc/en/unicode-table/
const CJK = '\u2e80-\u2eff\u2f00-\u2fdf\u3040-\u309f\u30a0-\u30fa\u30fc-\u30ff\u3100-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff';

// Basic character classes (all of these are spliced into `[...]` regex classes)
const AN = 'A-Za-z0-9';
const A = 'A-Za-z';
const UPPER_AN = 'A-Z0-9'; // For FIX_CJK_COLON_ANS

// Operators - note the different sets!
const OPERATORS_BASE = '\\+\\*=&';
const OPERATORS_WITH_HYPHEN = `${OPERATORS_BASE}\\-`; // For CJK patterns
const OPERATORS_NO_HYPHEN = OPERATORS_BASE; // For ANS_OPERATOR_ANS only
const GRADE_OPERATORS = '\\+\\-\\*'; // For single letter grades

// Quotes
const QUOTES = '`"\u05f4'; // Backtick, straight quote, Hebrew punctuation

// Brackets - different sets!
const LEFT_BRACKETS_BASIC = '\\(\\[\\{'; // For AN_LEFT_BRACKET
const RIGHT_BRACKETS_BASIC = '\\)\\]\\}'; // For RIGHT_BRACKET_AN
const LEFT_BRACKETS_EXTENDED = '\\(\\[\\{<>\u201c'; // For CJK_LEFT_BRACKET (includes angle brackets + curly quote)
const RIGHT_BRACKETS_EXTENDED = '\\)\\]\\}<>\u201d'; // For RIGHT_BRACKET_CJK

// ANS extended sets - CAREFUL: different symbols!
// The tail used to read `\u2700—\u27bf` with an em dash (U+2014), which a
// character class treats as three single characters rather than a range.
// It is now a real range over U+2700-U+27BF (presumably the intended Dingbats
// block); U+2014 stays an explicit member so em-dash spacing is unchanged.
const ANS_BEFORE_CJK = `${A}\u0370-\u03ff0-9\\$%\\^&\\*\\-\\+\\\\=\u00a1-\u00ff\u2150-\u218f\u2014\u2700-\u27bf`; // No @ symbol
const ANS_CJK_AFTER = `${ANS_BEFORE_CJK}@`; // Has @, no punctuation

// File path components - common directories in Unix/project paths
// prettier-ignore
const FILE_PATH_DIRS = 'home|root|usr|etc|var|opt|tmp|dev|mnt|proc|sys|bin|boot|lib|media|run|sbin|srv|node_modules|path|project|src|dist|test|tests|docs|templates|assets|public|static|config|scripts|tools|build|out|target|your|\\.claude|\\.git|\\.vscode';
const FILE_PATH_CHARS = '[A-Za-z0-9_\\-\\.@\\+\\*]+';

// Unix absolute paths: system dirs + common project paths
// Examples: /home, /usr/bin, /etc/nginx.conf, /.bashrc, /node_modules/@babel/core, /path/to/your/project
const UNIX_ABSOLUTE_FILE_PATH = new RegExp(`/(?:\\.?(?:${FILE_PATH_DIRS})|\\.(?:[A-Za-z0-9_\\-]+))(?:/${FILE_PATH_CHARS})*`);

// Unix relative paths common in documentation and blog posts
// Examples: src/main.py, dist/index.js, test/spec.js, ./.claude/CLAUDE.md, templates/*.html
const UNIX_RELATIVE_FILE_PATH = new RegExp(`(?:\\./)?(?:${FILE_PATH_DIRS})(?:/${FILE_PATH_CHARS})+`);

// Windows paths: C:\Users\name\, D:\Program Files\, C:\Windows\System32
const WINDOWS_FILE_PATH = /[A-Z]:\\(?:[A-Za-z0-9_\-\. ]+\\?)+/;

// Non-global on purpose: used with .test(), so it must not carry lastIndex state
const ANY_CJK = new RegExp(`[${CJK}]`);

// Handle punctuation after CJK - add space but don't convert to full-width
// Support multiple consecutive punctuation marks
// Only add space if followed by CJK, letters, or numbers (not at end of text or before same punctuation)
const CJK_PUNCTUATION = new RegExp(`([${CJK}])([!;,\\?:]+)(?=[${CJK}${AN}])`, 'g');

// Handle punctuation between AN and CJK - add space after punctuation
const AN_PUNCTUATION_CJK = new RegExp(`([${AN}])([!;,\\?]+)([${CJK}])`, 'g');

// Handle tilde separately for special cases like ~=
// Only add space if followed by CJK, letters, or numbers (not at end of text)
const CJK_TILDE = new RegExp(`([${CJK}])(~+)(?!=)(?=[${CJK}${AN}])`, 'g');
const CJK_TILDE_EQUALS = new RegExp(`([${CJK}])(~=)`, 'g');

// Handle period separately to avoid matching file extensions, multiple dots, and file paths
// Note: Multiple dots are handled by DOTS_CJK pattern first
// Only add space if followed by CJK, letters, or numbers (not at end of text)
const CJK_PERIOD = new RegExp(`([${CJK}])(\\.)(?![${AN}\\./])(?=[${CJK}${AN}])`, 'g');

// Handle period between AN and CJK - avoid file extensions
const AN_PERIOD_CJK = new RegExp(`([${AN}])(\\.)([${CJK}])`, 'g');

// Handle colon between AN and CJK
const AN_COLON_CJK = new RegExp(`([${AN}])(:)([${CJK}])`, 'g');

const DOTS_CJK = new RegExp(`([\\.]{2,}|\u2026)([${CJK}])`, 'g');

// Special case for colon before uppercase letters/parentheses (convert to full-width)
const FIX_CJK_COLON_ANS = new RegExp(`([${CJK}])\\:([${UPPER_AN}\\(\\)])`, 'g');

// The symbol part does not include '
const CJK_QUOTE = new RegExp(`([${CJK}])([${QUOTES}])`, 'g');
const QUOTE_CJK = new RegExp(`([${QUOTES}])([${CJK}])`, 'g');
const FIX_QUOTE_ANY_QUOTE = new RegExp(`([${QUOTES}]+)[ ]*(.+?)[ ]*([${QUOTES}]+)`, 'g');

// Handle curly quotes with alphanumeric characters
// These patterns should only apply to curly quotes, not straight quotes
// Straight quotes are already handled by CJK_QUOTE, QUOTE_CJK and FIX_QUOTE_ANY_QUOTE
const QUOTE_AN = new RegExp(`([\u201d])([${AN}])`, 'g'); // Only closing curly quotes + AN

// Special handling for straight quotes followed by alphanumeric after CJK
// This catches patterns like: 中文"ABC where the quote appears to be closing a quoted CJK phrase
const CJK_QUOTE_AN = new RegExp(`([${CJK}])(")([${AN}])`, 'g');

const CJK_SINGLE_QUOTE_BUT_POSSESSIVE = new RegExp(`([${CJK}])('[^s])`, 'g');
const SINGLE_QUOTE_CJK = new RegExp(`(')([${CJK}])`, 'g');
const FIX_POSSESSIVE_SINGLE_QUOTE = new RegExp(`([${AN}${CJK}])( )('s)`, 'g');

// Single quotes around pure CJK text (no spaces, no other characters)
const SINGLE_QUOTE_PURE_CJK = new RegExp(`(')([${CJK}]+)(')`, 'g');

const HASH_ANS_CJK_HASH = new RegExp(`([${CJK}])(#)([${CJK}]+)(#)([${CJK}])`, 'g');
const CJK_HASH = new RegExp(`([${CJK}])(#([^ ]))`, 'g');
const HASH_CJK = new RegExp(`(([^ ])#)([${CJK}])`, 'g');

// Trailing hashtag directly after CJK and not preceded by a slash (used when
// the text has multiple slashes). Not global: anchored to the end of the text.
const CJK_FINAL_HASHTAG = new RegExp(`([^/])([${CJK}])(#[A-Za-z0-9]+)$`);

// The symbol part only includes + - * = & (excluding | / < >)
const CJK_OPERATOR_ANS = new RegExp(`([${CJK}])([${OPERATORS_WITH_HYPHEN}])([${AN}])`, 'g');
const ANS_OPERATOR_CJK = new RegExp(`([${AN}])([${OPERATORS_WITH_HYPHEN}])([${CJK}])`, 'g');

// Handle operators between alphanumeric characters when CJK is present in text
// Note: This pattern excludes hyphens entirely (only + * = &) to avoid conflicts with compound words
const ANS_OPERATOR_ANS = new RegExp(`([${AN}])([${OPERATORS_NO_HYPHEN}])([${AN}])`, 'g');

// Hyphens that should be treated as operators (with spaces) rather than word connectors
// This regex has 3 patterns to catch different cases while preserving compound words:
// 1. Letter-Letter/Number: A-B, X-5 (spaces added) BUT NOT co-author, X-ray, GPT-5 (preserved)
// 2. Mixed alphanumeric-number patterns that aren't already protected as compound words
// 3. Number-Letter: 5-A, 3-B (spaces added) BUT NOT 5-year, 2016-12-26 (preserved)
// Note: Patterns like GPT4-5, v1-2 are protected by COMPOUND_WORD_PATTERN and won't get spaces
// The negative lookahead (?![a-z]) prevents matching hyphens followed by lowercase letters
const ANS_HYPHEN_ANS_NOT_COMPOUND = /([A-Za-z])(-(?![a-z]))([A-Za-z0-9])|([A-Za-z]+[0-9]+)(-(?![a-z]))([0-9])|([0-9])(-(?![a-z0-9]))([A-Za-z])/g;

// Pattern to detect compound words: alphanumeric-alphanumeric combinations that look like compound words/product names
// Examples: state-of-the-art, machine-learning, GPT-4o, real-time, end-to-end, gpt-4o, GPT-5, claude-4-opus
// Match: word-word(s) where at least one part contains lowercase letters OR contains mix of letters and numbers (like GPT-5)
const COMPOUND_WORD_PATTERN = /\b(?:[A-Za-z0-9]*[a-z][A-Za-z0-9]*-[A-Za-z0-9]+|[A-Za-z0-9]+-[A-Za-z0-9]*[a-z][A-Za-z0-9]*|[A-Za-z]+-[0-9]+|[A-Za-z]+[0-9]+-[A-Za-z0-9]+)(?:-[A-Za-z0-9]+)*\b/g;

// Slash patterns for operator vs separator behavior
const CJK_SLASH_CJK = new RegExp(`([${CJK}])([/])([${CJK}])`, 'g');
const CJK_SLASH_ANS = new RegExp(`([${CJK}])([/])([${AN}])`, 'g');
const ANS_SLASH_CJK = new RegExp(`([${AN}])([/])([${CJK}])`, 'g');
const ANS_SLASH_ANS = new RegExp(`([${AN}])([/])([${AN}])`, 'g');

// Special handling for single letter grades/ratings (A+, B-, C*) before CJK
// These should have space after the operator, not before
// Use word boundary to ensure it's a single letter, not part of a longer word
const SINGLE_LETTER_GRADE_CJK = new RegExp(`\\b([${A}])([${GRADE_OPERATORS}])([${CJK}])`, 'g');

// Special handling for < and > as comparison operators (not brackets)
const CJK_LESS_THAN = new RegExp(`([${CJK}])(<)([${AN}])`, 'g');
const LESS_THAN_CJK = new RegExp(`([${AN}])(<)([${CJK}])`, 'g');
const CJK_GREATER_THAN = new RegExp(`([${CJK}])(>)([${AN}])`, 'g');
const GREATER_THAN_CJK = new RegExp(`([${AN}])(>)([${CJK}])`, 'g');

// Handle < and > between alphanumeric characters when CJK is present in text
const ANS_LESS_THAN_ANS = new RegExp(`([${AN}])(<)([${AN}])`, 'g');
const ANS_GREATER_THAN_ANS = new RegExp(`([${AN}])(>)([${AN}])`, 'g');

// Bracket patterns: ( ) [ ] { } and also < > (though < > are also handled as operators separately)
// Note: The curly quotes “ ” (\u201c \u201d) appear in CJK_LEFT_BRACKET/RIGHT_BRACKET_CJK but are primarily handled in the patterns below
const CJK_LEFT_BRACKET = new RegExp(`([${CJK}])([${LEFT_BRACKETS_EXTENDED}])`, 'g');
const RIGHT_BRACKET_CJK = new RegExp(`([${RIGHT_BRACKETS_EXTENDED}])([${CJK}])`, 'g');

const ANS_CJK_LEFT_BRACKET_ANY_RIGHT_BRACKET = new RegExp(`([${AN}${CJK}])[ ]*([\u201c])([${AN}${CJK}\\-_ ]+)([\u201d])`, 'g');
const LEFT_BRACKET_ANY_RIGHT_BRACKET_ANS_CJK = new RegExp(`([\u201c])([${AN}${CJK}\\-_ ]+)([\u201d])[ ]*([${AN}${CJK}])`, 'g');

// The lookbehind rejects an AN run that follows a period (e.g. "obj.method(")
const AN_LEFT_BRACKET = new RegExp(`([${AN}])(?<!\\.[${AN}]*)([${LEFT_BRACKETS_BASIC}])`, 'g');
const RIGHT_BRACKET_AN = new RegExp(`([${RIGHT_BRACKETS_BASIC}])([${AN}])`, 'g');

// Special patterns for filesystem paths after CJK
const CJK_UNIX_ABSOLUTE_FILE_PATH = new RegExp(`([${CJK}])(${UNIX_ABSOLUTE_FILE_PATH.source})`, 'g');
const CJK_UNIX_RELATIVE_FILE_PATH = new RegExp(`([${CJK}])(${UNIX_RELATIVE_FILE_PATH.source})`, 'g');
const CJK_WINDOWS_PATH = new RegExp(`([${CJK}])(${WINDOWS_FILE_PATH.source})`, 'g');

// Pattern for Unix paths ending with / followed by CJK
const UNIX_ABSOLUTE_FILE_PATH_SLASH_CJK = new RegExp(`(${UNIX_ABSOLUTE_FILE_PATH.source}/)([${CJK}])`, 'g');
const UNIX_RELATIVE_FILE_PATH_SLASH_CJK = new RegExp(`(${UNIX_RELATIVE_FILE_PATH.source}/)([${CJK}])`, 'g');

// Any Unix path (absolute or relative); used to shield paths from slash-operator spacing
const ANY_UNIX_FILE_PATH = new RegExp(`(${UNIX_ABSOLUTE_FILE_PATH.source}|${UNIX_RELATIVE_FILE_PATH.source})`, 'g');

const CJK_ANS = new RegExp(`([${CJK}])([${ANS_CJK_AFTER}])`, 'g');
const ANS_CJK = new RegExp(`([${ANS_BEFORE_CJK}])([${CJK}])`, 'g');

const S_A = new RegExp(`(%)([${A}])`, 'g');

const MIDDLE_DOT = /([ ]*)([\u00b7\u2022\u2027])([ ]*)/g;

// Brackets: <fcontentl> (fcontentl) [fcontentl] {fcontentl}
// f: the first character inside the brackets
// l: the last character inside the brackets
// content: the content inside the brackets but exclude the first and last characters
// DO NOT change the first and last characters inside brackets AT ALL
// ONLY spacing the content between them
const BRACKET_PAIRS: ReadonlyArray<{ pattern: RegExp; open: string; close: string }> = [
  { pattern: /<([^<>]*)>/g, open: '<', close: '>' },
  { pattern: /\(([^()]*)\)/g, open: '(', close: ')' },
  { pattern: /\[([^\[\]]*)\]/g, open: '[', close: ']' },
  { pattern: /\{([^{}]*)\}/g, open: '{', close: '}' },
];

/** A global regex paired with its `String.prototype.replace` replacement template. */
type SpacingRule = readonly [RegExp, string];

// Rule tables are applied strictly in order; later rules rely on earlier output.

// Dots, punctuation, tilde, period and colon handling
const PUNCTUATION_RULES: readonly SpacingRule[] = [
  [DOTS_CJK, '$1 $2'], // Multiple dots first (before single period)
  [CJK_PUNCTUATION, '$1$2 '], // Add space but don't convert to full-width
  [AN_PUNCTUATION_CJK, '$1$2 $3'],
  [CJK_TILDE, '$1$2 '],
  [CJK_TILDE_EQUALS, '$1 $2 '],
  [CJK_PERIOD, '$1$2 '],
  [AN_PERIOD_CJK, '$1$2 $3'],
  [AN_COLON_CJK, '$1$2 $3'],
  [FIX_CJK_COLON_ANS, '$1：$2'], // Full-width colon only before uppercase/parentheses
];

// Straight/curly quote handling, then the possessive fix-up
const QUOTE_RULES: readonly SpacingRule[] = [
  [CJK_QUOTE, '$1 $2'],
  [QUOTE_CJK, '$1 $2'],
  [FIX_QUOTE_ANY_QUOTE, '$1$2$3'],
  [QUOTE_AN, '$1 $2'], // Closing curly quote followed by AN needs a space
  [CJK_QUOTE_AN, '$1$2 $3'],
  [FIX_POSSESSIVE_SINGLE_QUOTE, "$1's"],
];

// Single quotes that are not wrapping pure CJK text
const SINGLE_QUOTE_RULES: readonly SpacingRule[] = [
  [CJK_SINGLE_QUOTE_BUT_POSSESSIVE, '$1 $2'],
  [SINGLE_QUOTE_CJK, '$1 $2'],
];

const HASHTAG_RULES: readonly SpacingRule[] = [
  [CJK_HASH, '$1 $2'],
  [HASH_CJK, '$1 $3'],
];

// Grades go first so "A+的" becomes "A+ 的" rather than "A + 的"
const OPERATOR_RULES: readonly SpacingRule[] = [
  [SINGLE_LETTER_GRADE_CJK, '$1$2 $3'],
  [CJK_OPERATOR_ANS, '$1 $2 $3'],
  [ANS_OPERATOR_CJK, '$1 $2 $3'],
  [ANS_OPERATOR_ANS, '$1 $2 $3'],
];

// < and > as comparison operators
const COMPARISON_RULES: readonly SpacingRule[] = [
  [CJK_LESS_THAN, '$1 $2 $3'],
  [LESS_THAN_CJK, '$1 $2 $3'],
  [ANS_LESS_THAN_ANS, '$1 $2 $3'],
  [CJK_GREATER_THAN, '$1 $2 $3'],
  [GREATER_THAN_CJK, '$1 $2 $3'],
  [ANS_GREATER_THAN_ANS, '$1 $2 $3'],
];

// Space before paths that follow CJK, and after paths ending in "/" before CJK
const FILE_PATH_RULES: readonly SpacingRule[] = [
  [CJK_UNIX_ABSOLUTE_FILE_PATH, '$1 $2'],
  [CJK_UNIX_RELATIVE_FILE_PATH, '$1 $2'],
  [CJK_WINDOWS_PATH, '$1 $2'],
  [UNIX_ABSOLUTE_FILE_PATH_SLASH_CJK, '$1 $2'],
  [UNIX_RELATIVE_FILE_PATH_SLASH_CJK, '$1 $2'],
];

const SLASH_OPERATOR_RULES: readonly SpacingRule[] = [
  [CJK_SLASH_CJK, '$1 $2 $3'],
  [CJK_SLASH_ANS, '$1 $2 $3'],
  [ANS_SLASH_CJK, '$1 $2 $3'],
  [ANS_SLASH_ANS, '$1 $2 $3'],
];

// Brackets, then the generic CJK<->ANS boundary, then middle-dot normalization
const BRACKET_AND_BOUNDARY_RULES: readonly SpacingRule[] = [
  [CJK_LEFT_BRACKET, '$1 $2'],
  [RIGHT_BRACKET_CJK, '$1 $2'],
  [ANS_CJK_LEFT_BRACKET_ANY_RIGHT_BRACKET, '$1 $2$3$4'],
  [LEFT_BRACKET_ANY_RIGHT_BRACKET_ANS_CJK, '$1$2$3 $4'],
  [AN_LEFT_BRACKET, '$1 $2'],
  [RIGHT_BRACKET_AN, '$1 $2'],
  [CJK_ANS, '$1 $2'],
  [ANS_CJK, '$1 $2'],
  [S_A, '$1 $2'],
  [MIDDLE_DOT, '・'],
];

/** Runs every rule over `text` in order, feeding each result into the next rule. */
function applyRules(text: string, rules: readonly SpacingRule[]): string {
  return rules.reduce((current, [pattern, replacement]) => current.replace(pattern, replacement), text);
}

/** Escapes regex metacharacters so `literal` can be embedded in a RegExp source. */
function escapeRegExp(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Strips spaces immediately after an opening bracket and immediately before
 * its closing bracket, for `<>`, `()`, `[]` and `{}` pairs that contain no
 * nested bracket of the same kind. Everything else inside is left untouched.
 */
function fixBracketSpacing(text: string): string {
  let result = text;
  for (const { pattern, open, close } of BRACKET_PAIRS) {
    result = result.replace(pattern, (_match: string, innerContent: string) => {
      // Remove spaces at the very beginning and end of content
      const trimmedContent = innerContent.replace(/^ +| +$/g, '');
      return `${open}${trimmedContent}${close}`;
    });
  }
  return result;
}

/**
 * Swaps fragments of text for opaque numbered tokens and back again, so that
 * protected fragments are not touched by the spacing rules in between.
 *
 * A token has the form `<startDelimiter><placeholder><index><endDelimiter>`.
 */
class PlaceholderReplacer {
  private items: string[] = [];
  private index = 0;
  private readonly pattern: RegExp;

  /**
   * @param placeholder Token prefix placed between the delimiters.
   * @param startDelimiter Text that opens every token.
   * @param endDelimiter Text that closes every token.
   */
  constructor(
    private placeholder: string,
    private startDelimiter: string,
    private endDelimiter: string,
  ) {
    // Only the delimiters are escaped; the placeholder is inserted verbatim.
    const escapedStart = escapeRegExp(this.startDelimiter);
    const escapedEnd = escapeRegExp(this.endDelimiter);
    this.pattern = new RegExp(`${escapedStart}${this.placeholder}(\\d+)${escapedEnd}`, 'g');
  }

  /** Remembers `item` and returns the token that stands in for it. */
  store(item: string) {
    this.items[this.index] = item;
    return `${this.startDelimiter}${this.placeholder}${this.index++}${this.endDelimiter}`;
  }

  /** Replaces every token in `text` with its stored item ('' for unknown indices). */
  restore(text: string) {
    return text.replace(this.pattern, (_match: string, index: string) => {
      return this.items[parseInt(index, 10)] ?? '';
    });
  }

  /** Forgets all stored items and restarts numbering at 0. */
  reset() {
    this.items = [];
    this.index = 0;
  }
}

/**
 * Paranoid text spacing: inserts whitespace between CJK characters and
 * half-width letters, digits and symbols.
 */
export class Pangu {
  version: string;

  constructor() {
    this.version = '7.2.0';
  }

  /**
   * Returns `text` with spaces inserted at CJK / half-width boundaries.
   *
   * The input is returned unchanged when it is not a string (a warning is
   * logged), when it is at most one character long, or when it contains no
   * CJK character. Backtick spans, HTML tags, compound words (e.g. `GPT-5`),
   * file paths and single-quoted pure-CJK text are shielded from the rules
   * that would otherwise split them.
   *
   * @param text Text to process.
   * @returns The spaced text.
   */
  public spacingText(text: string): string {
    if (typeof text !== 'string') {
      console.warn(`spacingText(text) only accepts string but got ${typeof text}`);
      return text;
    }

    if (text.length <= 1 || !ANY_CJK.test(text)) {
      return text;
    }

    let newText = text;

    // Protect backtick content from quote processing but allow spacing around backticks
    const backtickManager = new PlaceholderReplacer('BACKTICK_CONTENT_', '\uE004', '\uE005');
    newText = newText.replace(/`([^`]+)`/g, (_match: string, content: string) => {
      return `\`${backtickManager.store(content)}\``;
    });

    // Protect HTML tags; skipped entirely when the text has no "<"
    const htmlTagManager = new PlaceholderReplacer('HTML_TAG_PLACEHOLDER_', '\uE000', '\uE001');
    const hasHtmlTags = newText.includes('<');
    if (hasHtmlTags) {
      // More specific HTML tag pattern:
      // - Opening tags: <tagname ...> or <tagname>
      // - Closing tags: </tagname>
      // - Self-closing tags: <tagname ... />
      // This pattern ensures we only match actual HTML tags, not just any < > content
      // Kept local (fresh per call) because the attribute callback re-enters spacingText.
      const HTML_TAG_PATTERN = /<\/?[a-zA-Z][a-zA-Z0-9]*(?:\s+[^>]*)?>/g;

      // Replace all HTML tags with placeholders, but process attribute values
      newText = newText.replace(HTML_TAG_PATTERN, (tag: string) => {
        const processedTag = tag.replace(/(\w+)="([^"]*)"/g, (_attr: string, attrName: string, attrValue: string) => {
          return `${attrName}="${this.spacingText(attrValue)}"`;
        });
        return htmlTagManager.store(processedTag);
      });
    }

    newText = applyRules(newText, PUNCTUATION_RULES);
    newText = applyRules(newText, QUOTE_RULES);

    // Process single quotes around pure CJK text differently from mixed content:
    // shield them, run the generic single-quote rules, then put them back
    const singleQuoteCJKManager = new PlaceholderReplacer('SINGLE_QUOTE_CJK_PLACEHOLDER_', '\uE030', '\uE031');
    newText = newText.replace(SINGLE_QUOTE_PURE_CJK, (quoted: string) => singleQuoteCJKManager.store(quoted));
    newText = applyRules(newText, SINGLE_QUOTE_RULES);
    newText = singleQuoteCJKManager.restore(newText);

    // Slash count (taken once, here) decides both hashtag and slash behavior
    const slashCount = (newText.match(/\//g) ?? []).length;

    // HASH_ANS_CJK_HASH pattern needs at least 5 characters
    if (newText.length >= 5) {
      newText = newText.replace(HASH_ANS_CJK_HASH, '$1 $2$3$4 $5');
    }
    if (slashCount <= 1) {
      // Single or no slash - apply normal hashtag spacing
      newText = applyRules(newText, HASHTAG_RULES);
    } else {
      // Multiple slashes - skip hashtag processing to preserve path structure
      // But add space before final hashtag if it's not preceded by a slash
      newText = newText.replace(CJK_FINAL_HASHTAG, '$1$2 $3');
    }

    // Protect compound words from operator spacing
    const compoundWordManager = new PlaceholderReplacer('COMPOUND_WORD_PLACEHOLDER_', '\uE010', '\uE011');
    newText = newText.replace(COMPOUND_WORD_PATTERN, (word: string) => compoundWordManager.store(word));

    newText = applyRules(newText, OPERATOR_RULES);

    // Exactly one of the three alternatives (letter-alphanumeric,
    // letter+number-number, number-letter) fills its capture groups
    newText = newText.replace(
      ANS_HYPHEN_ANS_NOT_COMPOUND,
      (
        match: string,
        l1?: string,
        h1?: string,
        r1?: string,
        l2?: string,
        h2?: string,
        r2?: string,
        l3?: string,
        h3?: string,
        r3?: string,
      ) => {
        if (l1 && h1 && r1) {
          return `${l1} ${h1} ${r1}`;
        }
        if (l2 && h2 && r2) {
          return `${l2} ${h2} ${r2}`;
        }
        if (l3 && h3 && r3) {
          return `${l3} ${h3} ${r3}`;
        }
        return match;
      },
    );

    newText = applyRules(newText, COMPARISON_RULES);
    newText = applyRules(newText, FILE_PATH_RULES);

    // Context-aware slash handling: single slash = operator, multiple slashes = separator
    // But exclude slashes that are part of file paths by protecting them first
    if (slashCount === 1) {
      const filePathManager = new PlaceholderReplacer('FILE_PATH_PLACEHOLDER_', '\uE020', '\uE021');
      newText = newText.replace(ANY_UNIX_FILE_PATH, (path: string) => filePathManager.store(path));
      newText = applyRules(newText, SLASH_OPERATOR_RULES);
      newText = filePathManager.restore(newText);
    }
    // If multiple slashes, treat as separator - do nothing (no spaces)

    // Restore compound words from placeholders
    newText = compoundWordManager.restore(newText);

    newText = applyRules(newText, BRACKET_AND_BOUNDARY_RULES);

    // Ensure no unwanted spaces immediately after opening or before closing brackets
    newText = fixBracketSpacing(newText);

    // Restore HTML tags from placeholders (only if HTML processing occurred)
    if (hasHtmlTags) {
      newText = htmlTagManager.restore(newText);
    }

    // Restore backtick content
    newText = backtickManager.restore(newText);

    // TODO:
    // Final fix for HTML comments: ensure no space after <!--
    // This is needed because <!-- is not protected as an HTML tag
    // and the ! character gets spaced by ANS_CJK pattern
    // newText = newText.replace(/<!--\s+/g, '<!--');

    return newText;
  }

  /** Returns true when `spacingText` would leave `text` unchanged. */
  public hasProperSpacing(text: string) {
    return this.spacingText(text) === text;
  }
}

export const pangu = new Pangu();

export { ANY_CJK };

export default pangu;