UNPKG

unicode-segmenter

Version:

A lightweight implementation of the Unicode Text Segmentation (UAX #29)

github.com/cometkim/unicode-segmenter

cometkim/unicode-segmenter

349 lines (298 loc) • 8.97 kB

JavaScript

// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>.
//
// Modified original Rust library [source code]
// (https://github.com/unicode-rs/unicode-segmentation/blob/1f88570/src/grapheme.rs)
//
// to create JavaScript library [unicode-segmenter]
// (https://github.com/cometkim/unicode-segmenter)

// @ts-check

import { findUnicodeRangeIndex } from './core.js';
import { GraphemeCategory, grapheme_ranges } from './_grapheme_data.js';
import { consonant_ranges } from './_incb_data.js';

/**
 * @typedef {import('./_grapheme_data.js').GC_Any} GC_Any
 *
 * @typedef {import('./_grapheme_data.js').GraphemeCategoryNum} GraphemeCategoryNum
 * @typedef {import('./_grapheme_data.js').GraphemeCategoryRange} GraphemeCategoryRange
 *
 * @typedef {object} GraphemeSegmentExtra
 * @property {number} _hd The first code point of the segment
 * @property {GraphemeCategoryNum} _catBegin Beginning Grapheme_Cluster_Break category of the segment
 * @property {GraphemeCategoryNum} _catEnd Ending Grapheme_Cluster_Break category of the segment
 *
 * @typedef {import('./core.js').Segmenter<GraphemeSegmentExtra>} GraphemeSegmenter
 */

export { GraphemeCategory };

/** Highest code point of the Basic Multilingual Plane (one UTF-16 code unit). */
const BMP_MAX = 0xFFFF;

/**
 * Code points below this value are never looked up for Indic_Conjunct_Break.
 *
 * 2325 is U+0915; every linker tested by `isIndicConjunctLinker` is above it.
 * NOTE(review): assumes `consonant_ranges` has no entry below U+0915 - confirm
 * against `_incb_data.js`.
 */
const INCB_MIN = 2325;

/**
 * Unicode segmentation by extended grapheme rules.
 *
 * This is fully compatible with the {@link Intl.Segmenter.segment} API
 * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter/segment
 *
 * The generator walks the string one code point at a time. At each step it
 * knows the category of the code point just before the cursor (`catBefore`)
 * and the one at the cursor (`catAfter`), plus a little state for the rules
 * that need more than one code point of look-behind (GB9c, GB11, GB12/GB13).
 *
 * @param {string} input
 * @return {GraphemeSegmenter} iterator for grapheme cluster segments
 */
export function* graphemeSegments(input) {
  let cp = input.codePointAt(0);

  // do nothing on empty string
  if (cp == null) return;

  /** Current cursor position (UTF-16 code unit offset). */
  let cursor = cp <= BMP_MAX ? 1 : 2;

  /** Total length of the input string. */
  let len = input.length;

  /** @type {GraphemeCategoryRange} Last range hit by the non-BMP lookup in `cat`. */
  let cache = [0, 0, 2 /* GC_Control */];

  /** Category of the code point immediately preceding the cursor. */
  let catBefore = cat(cp, cache);

  /** @type {GraphemeCategoryNum | null} Category of the code point at the cursor. */
  let catAfter = null;

  /** The number of consecutive Regional_Indicator code points preceding `cursor`. */
  let risCount = 0;

  /** GB11: the code points preceding `cursor` end with `Extended_Pictographic Extend*`. */
  let pict = false;

  /** GB11: the most recent ZWJ was preceded by `Extended_Pictographic Extend*`. */
  let emoji = false;

  /** GB9c: an InCB=Consonant was seen and only Extend/ZWJ code points followed it. */
  let consonant = false;

  /** GB9c: an InCB=Linker was seen after that consonant. Implies `consonant`. */
  let linker = false;

  /** GB9c: `Consonant [Extend Linker]* Linker [Extend Linker]*` precedes a consonant at the cursor. */
  let incb = false;

  /** Start offset of the segment being built. */
  let index = 0;

  /** Beginning category of a segment */
  let _catBegin = catBefore;

  /** Memoize the beginning code point of the segment. */
  let _hd = cp;

  while (cursor < len) {
    // At this point `cp` / `catBefore` still describe the code point just
    // before the cursor; fold it into the look-behind state first.

    // Indic_Conjunct_Break state. The table lookups are skipped for code
    // points below INCB_MIN so that non-Indic text pays nothing for them.
    if (catBefore === 0 /* Any */) {
      // `consonant` is already true here only when this very code point was
      // joined through GB9c, so there is nothing to look up again.
      if (!consonant && cp >= INCB_MIN) {
        consonant = isIndicConjunctConsonant(cp);
      }
    } else if (catBefore === 3 /* Extend */) {
      // Note: \p{InCB=Linker} is a subset of \p{Extend}.
      // Once a linker has been seen, later Extend code points must not clear
      // it: GB9c allows `[Extend Linker]*` between the linker and the consonant.
      if (consonant && !linker && cp >= INCB_MIN) {
        linker = isIndicConjunctLinker(cp);
      }
    } else if (catBefore !== 14 /* ZWJ */) {
      // Anything other than Extend / ZWJ ends the run that GB9c can match.
      consonant = false;
      linker = false;
    }

    // GB11 prefix: Extend keeps an `Extended_Pictographic Extend*` run alive,
    // any other category either starts one or ends it.
    if (catBefore !== 3 /* Extend */) {
      pict = catBefore === 4 /* Extended_Pictographic */;
    }

    cp = /** @type {number} */ (input.codePointAt(cursor));
    catAfter = cat(cp, cache);

    risCount = catBefore === 10 /* Regional_Indicator */ ? risCount + 1 : 0;

    // GB9c is decided per boundary. Leaving `incb` set from an earlier join
    // would glue whatever follows a conjunct onto it.
    incb = false;

    if (catAfter === 14 /* ZWJ */) {
      // Only a ZWJ that directly follows `ExtPict Extend*` can trigger GB11.
      emoji = pict;
    } else if (catAfter === 0 /* Any */ && consonant && linker && cp >= INCB_MIN) {
      // Note: GB9c is evaluated here so that `isBoundary` only needs a flag.
      incb = consonant = isIndicConjunctConsonant(cp);
      // Either a new consonant run starts here or the run is over.
      linker = false;
    }

    if (isBoundary(catBefore, catAfter, risCount, emoji, incb)) {
      yield {
        segment: input.slice(index, cursor),
        index,
        input,
        _hd,
        _catBegin,
        _catEnd: catBefore,
      };

      // flush: no look-behind state may leak into the next segment
      emoji = false;
      pict = false;
      consonant = false;
      linker = false;
      index = cursor;
      _catBegin = catAfter;
      _hd = cp;
    }

    cursor += cp <= BMP_MAX ? 1 : 2;
    catBefore = catAfter;
  }

  if (index < len) {
    yield {
      segment: input.slice(index),
      index,
      input,
      _hd,
      _catBegin,
      _catEnd: catBefore,
    };
  }
}

/**
 * Count number of extended grapheme clusters in given text.
 *
 * NOTE:
 *
 * This function is a small wrapper around {@link graphemeSegments}.
 *
 * If you call it more than once at a time, consider memoization
 * or use {@link graphemeSegments} or {@link splitGraphemes} once instead
 *
 * @param {string} text
 * @return {number} count of grapheme clusters
 */
export function countGraphemes(text) {
  let count = 0;
  for (const _ of graphemeSegments(text)) count += 1;
  return count;
}

export {
  /**
   * @deprecated use {@link countGraphemes}
   */
  countGraphemes as countGrapheme,
};

/**
 * Split given text into extended grapheme clusters.
 *
 * @param {string} text
 * @return {IterableIterator<string>} iterator for grapheme clusters
 *
 * @see {@link graphemeSegments} if you need extra information.
 *
 * @example
 * [...splitGraphemes('abc')] // => ['a', 'b', 'c']
 */
export function* splitGraphemes(text) {
  for (const s of graphemeSegments(text)) yield s.segment;
}

/**
 * Precomputed lookup table for BMP code points (0..0xFFFF).
 * It maps each code point to its Grapheme_Cluster_Break category and is
 * filled once at module load time from `grapheme_ranges`.
 * The table is a Uint8Array of length 0x10000 (64KB).
 * Code points not covered by any range keep the initial value 0 (Any).
 * For code points >= 0x10000 `cat` falls back to binary search.
 */
const bmpLookup = new Uint8Array(BMP_MAX + 1);

/**
 * Index of the first entry of `grapheme_ranges` that can contain a code point
 * above the BMP; the binary search in `cat` starts from here.
 *
 * The scan stops *before* a range that begins above U+FFFF and does not step
 * past a range that straddles U+FFFF, so neither is hidden from that search.
 */
const bmpCursor = (() => {
  let cursor = 0;
  while (cursor < grapheme_ranges.length) {
    const range = grapheme_ranges[cursor];
    if (range[0] > BMP_MAX) break;
    bmpLookup.fill(range[2], range[0], Math.min(range[1], BMP_MAX) + 1);
    if (range[1] > BMP_MAX) break;
    cursor++;
  }
  return cursor;
})();

/**
 * `Grapheme_Cluster_Break` property value of a given codepoint
 *
 * @see https://www.unicode.org/reports/tr29/tr29-43.html#Default_Grapheme_Cluster_Table
 *
 * @param {number} cp
 * @param {GraphemeCategoryRange} cache Last matched non-BMP range; updated in place on a hit.
 * @return {GraphemeCategoryNum}
 */
function cat(cp, cache) {
  // Fast lookup for BMP (0x0000..0xFFFF) using precomputed table
  if (cp <= BMP_MAX) {
    return /** @type {GraphemeCategoryNum} */ (bmpLookup[cp]);
  }

  // Use cached result
  if (cp >= cache[0] && cp <= cache[1]) {
    return cache[2];
  }

  // Binary search, starting from bmpCursor
  let index = findUnicodeRangeIndex(cp, grapheme_ranges, bmpCursor);
  if (index < 0) {
    // Not covered by any range: category Any.
    return 0;
  }

  const range = grapheme_ranges[index];
  cache[0] = range[0];
  cache[1] = range[1];
  return (cache[2] = range[2]);
}

/**
 * Whether `cp` is listed in `consonant_ranges` (InCB=Consonant).
 *
 * @param {number} cp
 * @return {boolean}
 */
function isIndicConjunctConsonant(cp) {
  return findUnicodeRangeIndex(cp, consonant_ranges) >= 0;
}

/**
 * Whether `cp` is one of the InCB=Linker code points known to this module.
 *
 * @param {number} cp
 * @return {boolean}
 */
function isIndicConjunctLinker(cp) {
  return (
    cp === 2381 /* 0x094D */ ||
    cp === 2509 /* 0x09CD */ ||
    cp === 2765 /* 0x0ACD */ ||
    cp === 2893 /* 0x0B4D */ ||
    cp === 3149 /* 0x0C4D */ ||
    cp === 3405 /* 0x0D4D */
  );
}

/**
 * Decide whether there is a grapheme cluster boundary between two adjacent
 * code points, given their categories and the look-behind state.
 *
 * Category numbers used below, as implied by the rule each test implements:
 * 0 Any, 1 CR, 2 Control, 3 Extend, 4 Extended_Pictographic, 5 L, 6 LF,
 * 7 LV, 8 LVT, 9 Prepend, 10 Regional_Indicator, 11 SpacingMark, 12 T,
 * 13 V, 14 ZWJ.
 *
 * @param {GraphemeCategoryNum} catBefore
 * @param {GraphemeCategoryNum} catAfter
 * @param {number} risCount Regional_Indicator state
 * @param {boolean} emoji Extended_Pictographic state
 * @param {boolean} incb Indic_Conjunct_Break state
 * @return {boolean} `true` when the text may be split between the two code points
 *
 * @see https://www.unicode.org/reports/tr29/tr29-43.html#Grapheme_Cluster_Boundary_Rules
 */
function isBoundary(catBefore, catAfter, risCount, emoji, incb) {
  // GB3 - CR x LF
  if (catBefore === 1 && catAfter === 6) {
    return false;
  }

  // GB4 - (Control | CR | LF) ÷
  if (catBefore === 1 || catBefore === 2 || catBefore === 6) {
    return true;
  }

  // GB5 - ÷ (Control | CR | LF)
  if (catAfter === 1 || catAfter === 2 || catAfter === 6) {
    return true;
  }

  // Most common cases - GB9, GB9a extend rules
  if (catAfter === 3 || catAfter === 14 || catAfter === 11) {
    return false;
  }

  // GB6 - L x (L | V | LV | LVT)
  if (catBefore === 5) {
    return !(catAfter === 5 || catAfter === 7 || catAfter === 8 || catAfter === 13);
  }

  // GB7 - (LV | V) x (V | T)
  if (
    (catBefore === 7 || catBefore === 13) &&
    (catAfter === 13 || catAfter === 12)
  ) {
    return false;
  }

  // GB8 - (LVT | T) x T
  if (
    (catBefore === 8 || catBefore === 12) &&
    catAfter === 12
  ) {
    return false;
  }

  // GB9b - Prepend x
  if (catBefore === 9) {
    return false;
  }

  // GB9c
  if (catAfter === 0 && incb) {
    return false;
  }

  // GB11
  if (catBefore === 14 && catAfter === 4) {
    return !emoji;
  }

  // GB12, GB13 - break only after an even number of Regional_Indicators
  if (catBefore === 10 && catAfter === 10) {
    return risCount % 2 === 0;
  }

  // GB999
  return true;
}