unicode-segmenter
Version:
A lightweight implementation of the Unicode Text Segmentation (UAX #29)
74 lines (66 loc) • 2.15 kB
JavaScript
exports.decodeUnicodeData = decodeUnicodeData;
exports.findUnicodeRangeIndex = findUnicodeRangeIndex;
// @ts-check
/**
* @template {number} [T=number]
* @typedef {[from: number, to: number, category: T]} CategorizedUnicodeRange
*/
/**
* @typedef {CategorizedUnicodeRange<0>} UnicodeRange
*/
/**
* @typedef {string & { __tag: 'UnicodeDataEncoding' }} UnicodeDataEncoding
*
* Encoding for array of {@link UnicodeRange}, items separated by comma.
*
* Each {@link UnicodeRange} is packed as a pair of base36 integers:
*
* padding = to - from
* encoding = base36(from) + ',' + base36(padding)
*
* Notes:
* - base36 can hold surprisingly large numbers in a few characters.
* - The biggest codepoint is 0xE01F0 (918,000) at this point
* - The max value of a category is 23; https://www.unicode.org/reports/tr29/tr29-45.html#Table_Word_Break_Property_Values
* - The longest range is 42,720; CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
*/
/**
 * Decode a comma-separated, base36-encoded list of ranges.
 *
 * The fields of `data` alternate between `from` and `padding` (`to - from`).
 * An empty field decodes to 0. A trailing `from` without a matching `padding`
 * is ignored, which is also why an empty `data` string decodes to `[]`.
 *
 * `cats` holds one base36 character per range, in range order. When `cats` is
 * empty, or has no character for a given range, that range gets category 0.
 *
 * @template {number} [T=number]
 * @param {UnicodeDataEncoding} data
 * @param {string} [cats='']
 * @returns {Array<CategorizedUnicodeRange<T>>}
 */
function decodeUnicodeData(data, cats = '') {
  const nums = data.split(',').map((s) => (s ? parseInt(s, 36) : 0));
  const buf = /** @type {Array<CategorizedUnicodeRange<T>>} */ ([]);
  // Walk the (from, padding) pairs; `i` always points at the padding field.
  for (let i = 1; i < nums.length; i += 2) {
    const from = nums[i - 1];
    const cat = cats ? cats[i >> 1] : undefined;
    // Guard the lookup: parseInt(undefined, 36) would parse the text
    // "undefined" as a base36 number and yield a bogus category.
    const category = cat === undefined ? 0 : parseInt(cat, 36);
    buf.push([from, from + nums[i], /** @type {T} */ (category)]);
  }
  return buf;
}
/**
* @template {object} Ext
* @typedef {{
* segment: string,
* index: number,
* input: string,
* } & Ext} SegmentOutput
*/
/**
* @template {object} T
* @typedef {IterableIterator<SegmentOutput<T>>} Segmenter
*/
/**
 * Binary-search `ranges` for the range that contains `cp`.
 *
 * The search halves the window on each comparison, so `ranges` is expected
 * to be ordered by ascending, non-overlapping `[from, to]` bounds.
 *
 * @template {number} [T=number]
 * @param {number} cp code point to look up
 * @param {CategorizedUnicodeRange<T>[]} ranges
 * @param {number} [lo=0] first index of the search window (inclusive)
 * @param {number} [hi=ranges.length - 1] last index of the search window (inclusive)
 * @return {number} index of matched unicode range, or -1 if no match
 */
function findUnicodeRangeIndex(cp, ranges, lo = 0, hi = ranges.length - 1) {
  let left = lo;
  let right = hi;
  while (left <= right) {
    const pivot = (left + right) >>> 1;
    const candidate = ranges[pivot];
    if (cp < candidate[0]) {
      // cp lies before this range: keep the lower half.
      right = pivot - 1;
      continue;
    }
    if (cp > candidate[1]) {
      // cp lies after this range: keep the upper half.
      left = pivot + 1;
      continue;
    }
    return pivot;
  }
  return -1;
}
;