UNPKG

recoder-code

Version:

🚀 AI-powered development platform - Chat with 32+ models, build projects, automate workflows. Free models included!

802 lines (801 loc) • 786 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); const boundaries_1 = require("./boundaries"); const GraphemerHelper_1 = __importDefault(require("./GraphemerHelper")); const GraphemerIterator_1 = __importDefault(require("./GraphemerIterator")); class Graphemer { /** * Returns the next grapheme break in the string after the given index * @param string {string} * @param index {number} * @returns {number} */ static nextBreak(string, index) { if (index === undefined) { index = 0; } if (index < 0) { return 0; } if (index >= string.length - 1) { return string.length; } const prevCP = GraphemerHelper_1.default.codePointAt(string, index); const prev = Graphemer.getGraphemeBreakProperty(prevCP); const prevEmoji = Graphemer.getEmojiProperty(prevCP); const mid = []; const midEmoji = []; for (let i = index + 1; i < string.length; i++) { // check for already processed low surrogates if (GraphemerHelper_1.default.isSurrogate(string, i - 1)) { continue; } const nextCP = GraphemerHelper_1.default.codePointAt(string, i); const next = Graphemer.getGraphemeBreakProperty(nextCP); const nextEmoji = Graphemer.getEmojiProperty(nextCP); if (GraphemerHelper_1.default.shouldBreak(prev, mid, next, prevEmoji, midEmoji, nextEmoji)) { return i; } mid.push(next); midEmoji.push(nextEmoji); } return string.length; } /** * Breaks the given string into an array of grapheme clusters * @param str {string} * @returns {string[]} */ splitGraphemes(str) { const res = []; let index = 0; let brk; while ((brk = Graphemer.nextBreak(str, index)) < str.length) { res.push(str.slice(index, brk)); index = brk; } if (index < str.length) { res.push(str.slice(index)); } return res; } /** * Returns an iterator of grapheme clusters in the given string * @param str {string} * @returns {GraphemerIterator} */ iterateGraphemes(str) { return new GraphemerIterator_1.default(str, Graphemer.nextBreak); } /** * Returns the number of grapheme clusters in the given string * @param str {string} * @returns {number} */ countGraphemes(str) { let count = 0; let index = 0; let brk; while ((brk = Graphemer.nextBreak(str, index)) < str.length) { index = brk; count++; } if (index < str.length) { count++; } return count; } /** * Given a Unicode code point, determines this symbol's grapheme break property * @param code {number} Unicode code point * @returns {number} */ static getGraphemeBreakProperty(code) { // Grapheme break property taken from: // https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt // and generated by // node ./scripts/generate-grapheme-break.js if (code < 0xbf09) { if (code < 0xac54) { if (code < 0x102d) { if (code < 0xb02) { if (code < 0x93b) { if (code < 0x6df) { if (code < 0x5bf) { if (code < 0x7f) { if (code < 0xb) { if (code < 0xa) { // Cc [10] <control-0000>..<control-0009> if (0x0 <= code && code <= 0x9) { return boundaries_1.CLUSTER_BREAK.CONTROL; } } else { // Cc <control-000A> if (0xa === code) { return boundaries_1.CLUSTER_BREAK.LF; } } } else { if (code < 0xd) { // Cc [2] <control-000B>..<control-000C> if (0xb <= code && code <= 0xc) { return boundaries_1.CLUSTER_BREAK.CONTROL; } } else { if (code < 0xe) { // Cc <control-000D> if (0xd === code) { return boundaries_1.CLUSTER_BREAK.CR; } } else { // Cc [18] <control-000E>..<control-001F> if (0xe <= code && code <= 0x1f) { return boundaries_1.CLUSTER_BREAK.CONTROL; } } } } } else { if (code < 0x300) { if (code < 0xad) { // Cc [33] <control-007F>..<control-009F> if (0x7f <= code && code <= 0x9f) { return boundaries_1.CLUSTER_BREAK.CONTROL; } } else { // Cf SOFT HYPHEN if (0xad === code) { return boundaries_1.CLUSTER_BREAK.CONTROL; } } } else { if (code < 0x483) { // Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X if (0x300 <= code && code <= 0x36f) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0x591) { // Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE // Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN if (0x483 <= code && code <= 0x489) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG if (0x591 <= code && code <= 0x5bd) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } } } else { if (code < 0x610) { if (code < 0x5c4) { if (code < 0x5c1) { // Mn HEBREW POINT RAFE if (0x5bf === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT if (0x5c1 <= code && code <= 0x5c2) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } else { if (code < 0x5c7) { // Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT if (0x5c4 <= code && code <= 0x5c5) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0x600) { // Mn HEBREW POINT QAMATS QATAN if (0x5c7 === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE if (0x600 <= code && code <= 0x605) { return boundaries_1.CLUSTER_BREAK.PREPEND; } } } } } else { if (code < 0x670) { if (code < 0x61c) { // Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA if (0x610 <= code && code <= 0x61a) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0x64b) { // Cf ARABIC LETTER MARK if (0x61c === code) { return boundaries_1.CLUSTER_BREAK.CONTROL; } } else { // Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW if (0x64b <= code && code <= 0x65f) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } else { if (code < 0x6d6) { // Mn ARABIC LETTER SUPERSCRIPT ALEF if (0x670 === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0x6dd) { // Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN if (0x6d6 <= code && code <= 0x6dc) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Cf ARABIC END OF AYAH if (0x6dd === code) { return boundaries_1.CLUSTER_BREAK.PREPEND; } } } } } } } else { if (code < 0x81b) { if (code < 0x730) { if (code < 0x6ea) { if (code < 0x6e7) { // Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA if (0x6df <= code && code <= 0x6e4) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON if (0x6e7 <= code && code <= 0x6e8) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } else { if (code < 0x70f) { // Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM if (0x6ea <= code && code <= 0x6ed) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Cf SYRIAC ABBREVIATION MARK if (0x70f === code) { return boundaries_1.CLUSTER_BREAK.PREPEND; } // Mn SYRIAC LETTER SUPERSCRIPT ALAPH if (0x711 === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } else { if (code < 0x7eb) { if (code < 0x7a6) { // Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH if (0x730 <= code && code <= 0x74a) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn [11] THAANA ABAFILI..THAANA SUKUN if (0x7a6 <= code && code <= 0x7b0) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } else { if (code < 0x7fd) { // Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE if (0x7eb <= code && code <= 0x7f3) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0x816) { // Mn NKO DANTAYALAN if (0x7fd === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH if (0x816 <= code && code <= 0x819) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } } } else { if (code < 0x898) { if (code < 0x829) { if (code < 0x825) { // Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A if (0x81b <= code && code <= 0x823) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U if (0x825 <= code && code <= 0x827) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } else { if (code < 0x859) { // Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA if (0x829 <= code && code <= 0x82d) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0x890) { // Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK if (0x859 <= code && code <= 0x85b) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE if (0x890 <= code && code <= 0x891) { return boundaries_1.CLUSTER_BREAK.PREPEND; } } } } } else { if (code < 0x8e3) { if (code < 0x8ca) { // Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA if (0x898 <= code && code <= 0x89f) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0x8e2) { // Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA if (0x8ca <= code && code <= 0x8e1) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Cf ARABIC DISPUTED END OF AYAH if (0x8e2 === code) { return boundaries_1.CLUSTER_BREAK.PREPEND; } } } } else { if (code < 0x903) { // Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA if (0x8e3 <= code && code <= 0x902) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mc DEVANAGARI SIGN VISARGA if (0x903 === code) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } // Mn DEVANAGARI VOWEL SIGN OE if (0x93a === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } } } } else { if (code < 0xa01) { if (code < 0x982) { if (code < 0x94d) { if (code < 0x93e) { // Mc DEVANAGARI VOWEL SIGN OOE if (0x93b === code) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } // Mn DEVANAGARI SIGN NUKTA if (0x93c === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0x941) { // Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II if (0x93e <= code && code <= 0x940) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } else { if (code < 0x949) { // Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI if (0x941 <= code && code <= 0x948) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mc [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU if (0x949 <= code && code <= 0x94c) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } } } } else { if (code < 0x951) { if (code < 0x94e) { // Mn DEVANAGARI SIGN VIRAMA if (0x94d === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mc [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW if (0x94e <= code && code <= 0x94f) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } } else { if (code < 0x962) { // Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE if (0x951 <= code && code <= 0x957) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0x981) { // Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL if (0x962 <= code && code <= 0x963) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn BENGALI SIGN CANDRABINDU if (0x981 === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } } } else { if (code < 0x9c7) { if (code < 0x9be) { if (code < 0x9bc) { // Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA if (0x982 <= code && code <= 0x983) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } else { // Mn BENGALI SIGN NUKTA if (0x9bc === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } else { if (code < 0x9bf) { // Mc BENGALI VOWEL SIGN AA if (0x9be === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0x9c1) { // Mc [2] BENGALI VOWEL SIGN I..BENGALI VOWEL SIGN II if (0x9bf <= code && code <= 0x9c0) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } else { // Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR if (0x9c1 <= code && code <= 0x9c4) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } } else { if (code < 0x9d7) { if (code < 0x9cb) { // Mc [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI if (0x9c7 <= code && code <= 0x9c8) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } else { if (code < 0x9cd) { // Mc [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU if (0x9cb <= code && code <= 0x9cc) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } else { // Mn BENGALI SIGN VIRAMA if (0x9cd === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } else { if (code < 0x9e2) { // Mc BENGALI AU LENGTH MARK if (0x9d7 === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0x9fe) { // Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL if (0x9e2 <= code && code <= 0x9e3) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn BENGALI SANDHI MARK if (0x9fe === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } } } } else { if (code < 0xa83) { if (code < 0xa47) { if (code < 0xa3c) { if (code < 0xa03) { // Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI if (0xa01 <= code && code <= 0xa02) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mc GURMUKHI SIGN VISARGA if (0xa03 === code) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } } else { if (code < 0xa3e) { // Mn GURMUKHI SIGN NUKTA if (0xa3c === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0xa41) { // Mc [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II if (0xa3e <= code && code <= 0xa40) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } else { // Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU if (0xa41 <= code && code <= 0xa42) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } } else { if (code < 0xa70) { if (code < 0xa4b) { // Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI if (0xa47 <= code && code <= 0xa48) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0xa51) { // Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA if (0xa4b <= code && code <= 0xa4d) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn GURMUKHI SIGN UDAAT if (0xa51 === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } else { if (code < 0xa75) { // Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK if (0xa70 <= code && code <= 0xa71) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0xa81) { // Mn GURMUKHI SIGN YAKASH if (0xa75 === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA if (0xa81 <= code && code <= 0xa82) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } } } else { if (code < 0xac9) { if (code < 0xabe) { // Mc GUJARATI SIGN VISARGA if (0xa83 === code) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } // Mn GUJARATI SIGN NUKTA if (0xabc === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0xac1) { // Mc [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II if (0xabe <= code && code <= 0xac0) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } else { if (code < 0xac7) { // Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E if (0xac1 <= code && code <= 0xac5) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI if (0xac7 <= code && code <= 0xac8) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } } else { if (code < 0xae2) { if (code < 0xacb) { // Mc GUJARATI VOWEL SIGN CANDRA O if (0xac9 === code) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } else { if (code < 0xacd) { // Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU if (0xacb <= code && code <= 0xacc) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } else { // Mn GUJARATI SIGN VIRAMA if (0xacd === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } else { if (code < 0xafa) { // Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL if (0xae2 <= code && code <= 0xae3) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0xb01) { // Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE if (0xafa <= code && code <= 0xaff) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { // Mn ORIYA SIGN CANDRABINDU if (0xb01 === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } } } } } } } else { if (code < 0xcf3) { if (code < 0xc04) { if (code < 0xb82) { if (code < 0xb47) { if (code < 0xb3e) { if (code < 0xb3c) { // Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA if (0xb02 <= code && code <= 0xb03) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } else { // Mn ORIYA SIGN NUKTA if (0xb3c === code) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } } else { if (code < 0xb40) { // Mc ORIYA VOWEL SIGN AA // Mn ORIYA VOWEL SIGN I if (0xb3e <= code && code <= 0xb3f) { return boundaries_1.CLUSTER_BREAK.EXTEND; } } else { if (code < 0xb41) { // Mc ORIYA VOWEL SIGN II if (0xb40 === code) { return boundaries_1.CLUSTER_BREAK.SPACINGMARK; } } else {