UNPKG

unicode-segmenter

Version:

A lightweight implementation of the Unicode Text Segmentation (UAX #29)

github.com/cometkim/unicode-segmenter

cometkim/unicode-segmenter

74 lines (66 loc) • 2.15 kB

JavaScript

"use strict";

exports.decodeUnicodeData = decodeUnicodeData;
exports.findUnicodeRangeIndex = findUnicodeRangeIndex;
// @ts-check

/**
 * A code point range `[from, to]` (both ends inclusive) tagged with a category.
 *
 * @template {number} [T=number]
 * @typedef {[from: number, to: number, category: T]} CategorizedUnicodeRange
 */

/**
 * @typedef {CategorizedUnicodeRange<0>} UnicodeRange
 */

/**
 * @typedef {string & { __tag: 'UnicodeDataEncoding' }} UnicodeDataEncoding
 *
 * Encoding for array of {@link UnicodeRange}, items separated by comma.
 *
 * Each {@link UnicodeRange} is packed as a pair of base36 integers:
 *
 *   padding = to - from
 *   encoding = base36(from) + ',' + base36(padding)
 *
 * Notes:
 * - base36 can hold surprisingly large numbers in a few characters.
 * - The biggest codepoint is 0xE01F0 (918,000) at this point
 * - The max value of a category is 23; https://www.unicode.org/reports/tr29/tr29-45.html#Table_Word_Break_Property_Values
 * - The longest range is 42,720; CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
 */

/**
 * Decode a base36-packed range table into `[from, to, category]` tuples.
 *
 * `data` is split on commas and read as base36 numbers taken in pairs: the
 * first number of a pair is `from`, the second is the padding (`to - from`).
 * An empty item decodes to 0, and a trailing unpaired item is ignored.
 *
 * The category of the range at index `k` is the character `cats[k]` parsed as
 * a base36 digit. When `cats` has no character at that index (including the
 * default empty `cats`), the category is 0.
 *
 * @template {number} [T=number]
 * @param {UnicodeDataEncoding} data
 * @param {string} [cats='']
 * @returns {Array<CategorizedUnicodeRange<T>>}
 */
function decodeUnicodeData(data, cats = '') {
  const buf = /** @type {Array<CategorizedUnicodeRange<T>>} */ ([]);
  const nums = data.split(',').map(s => (s ? parseInt(s, 36) : 0));
  let from = 0;
  for (let i = 0; i < nums.length; i++) {
    if (i % 2) {
      // Guard on the individual character rather than on `cats` as a whole:
      // `parseInt(undefined, 36)` parses the string "undefined" as base36 and
      // would yield a huge bogus category when `cats` is too short.
      const digit = cats[i >> 1];
      const category = /** @type {T} */ (digit ? parseInt(digit, 36) : 0);
      buf.push([from, from + nums[i], category]);
    } else {
      from = nums[i];
    }
  }
  return buf;
}

/**
 * @template {object} Ext
 * @typedef {{
 *   segment: string,
 *   index: number,
 *   input: string,
 * } & Ext} SegmentOutput
 */

/**
 * @template {object} T
 * @typedef {IterableIterator<SegmentOutput<T>>} Segmenter
 */

/**
 * Binary-search `ranges` for the range whose inclusive `[from, to]` bounds
 * contain `cp`.
 *
 * The result is only meaningful when `ranges` is sorted in ascending order
 * and its ranges do not overlap, as the search halves the window each step.
 *
 * @template {number} [T=number]
 * @param {number} cp
 * @param {CategorizedUnicodeRange<T>[]} ranges
 * @param {number} [lo=0] first index of the search window
 * @param {number} [hi=ranges.length - 1] last index of the search window (inclusive)
 * @return {number} index of matched unicode range, or -1 if no match
 */
function findUnicodeRangeIndex(cp, ranges, lo = 0, hi = ranges.length - 1) {
  while (lo <= hi) {
    const mid = (lo + hi) >>> 1;
    const range = ranges[mid];
    if (cp < range[0]) {
      hi = mid - 1;
    } else if (cp > range[1]) {
      lo = mid + 1;
    } else {
      return mid;
    }
  }
  return -1;
}