unicode-segmenter
Version:
A lightweight implementation of the Unicode Text Segmentation (UAX #29)
76 lines (75 loc) • 2.71 kB
TypeScript
/**
* @template {number} [T=number]
* @typedef {[from: number, to: number, category: T]} CategorizedUnicodeRange
*/
/**
* @typedef {CategorizedUnicodeRange<0>} UnicodeRange
*/
/**
* @typedef {string & { __tag: 'UnicodeDataEncoding' }} UnicodeDataEncoding
*
* Encoding for array of {@link UnicodeRange}, items separated by comma.
*
* Each {@link UnicodeRange} is packed as a pair of base36 integers:
*
* padding = to - from
* encoding = base36(from) + ',' + base36(padding)
*
* Notes:
* - base36 can hold surprisingly large numbers in a few characters.
* - The biggest codepoint is 0xE01F0 (918,000) at this point
* - The max value of a category is 23; https://www.unicode.org/reports/tr29/tr29-45.html#Table_Word_Break_Property_Values
* - The longest range is 42,720; CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
*/
/**
* Decodes a {@link UnicodeDataEncoding} string into an array of
* {@link CategorizedUnicodeRange} tuples.
*
* As described on {@link UnicodeDataEncoding}, each range is stored as
* `base36(from) + ',' + base36(to - from)`, with items separated by commas.
*
* NOTE(review): this declaration does not show how `cats` is laid out or how it
* maps onto the decoded ranges; presumably it carries the category of each
* range — confirm against the implementation.
*
* @template {number} [T=number]
* @param {UnicodeDataEncoding} data Encoded ranges to decode.
* @param {string} [cats=''] Optional category data (exact format not visible here).
* @returns {Array<CategorizedUnicodeRange<T>>} Decoded `[from, to, category]` tuples.
*/
export function decodeUnicodeData<T extends number = number>(data: UnicodeDataEncoding, cats?: string): Array<CategorizedUnicodeRange<T>>;
/**
* @template {object} Ext
* @typedef {{
* segment: string,
* index: number,
* input: string,
* } & Ext} SegmentOutput
*/
/**
* @template {object} T
* @typedef {IterableIterator<SegmentOutput<T>>} Segmenter
*/
/**
* Finds the index of the range in `ranges` that matches the code point `cp`.
*
* NOTE(review): `lo` and `hi` are part of the declared signature but were not
* documented in the original JSDoc. They presumably bound the portion of
* `ranges` that is searched; their defaults are not visible in this
* declaration — confirm against the implementation.
*
* @template {number} [T=number]
* @param {number} cp Code point to look up.
* @param {CategorizedUnicodeRange<T>[]} ranges Ranges to search.
* @param {number} [lo] Optional; presumably the lower search bound (TODO confirm).
* @param {number} [hi] Optional; presumably the upper search bound (TODO confirm).
* @return {number} index of matched unicode range, or -1 if no match
*/
export function findUnicodeRangeIndex<T extends number = number>(cp: number, ranges: CategorizedUnicodeRange<T>[], lo?: number, hi?: number): number;
/**
* A labeled tuple `[from, to, category]` describing a range together with a
* numeric category of type `T` (defaults to `number`).
*/
export type CategorizedUnicodeRange<T extends number = number> = [from: number, to: number, category: T];
/**
* A {@link CategorizedUnicodeRange} whose category is fixed to the literal `0`.
*/
export type UnicodeRange = CategorizedUnicodeRange<0>;
/**
* A `string` intersected with a `__tag` marker member, so that encoded data is
* distinguished from a plain `string` at the type level.
*
* Encoding for array of {@link UnicodeRange}, items separated by comma.
*
* Each {@link UnicodeRange} is packed as a pair of base36 integers:
*
* padding = to - from
* encoding = base36(from) + ',' + base36(padding)
*
* Notes:
* - base36 can hold surprisingly large numbers in a few characters.
* - The biggest codepoint is 0xE01F0 (918,000) at this point
* - The max value of a category is 23; https://www.unicode.org/reports/tr29/tr29-45.html#Table_Word_Break_Property_Values
* - The longest range is 42,720; CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
*/
export type UnicodeDataEncoding = string & {
__tag: "UnicodeDataEncoding";
};
/**
* Shape of a single segmentation result: the `segment`, `index` and `input`
* fields below, intersected with the extension type `Ext`.
*
* NOTE(review): field meanings are not stated in this file; presumably
* `segment` is the segment text, `index` its position within `input`, and
* `input` the original string — confirm against the implementation.
*/
export type SegmentOutput<Ext extends object> = {
segment: string;
index: number;
input: string;
} & Ext;
/**
* An `IterableIterator` that yields {@link SegmentOutput} values extended with `T`.
*/
export type Segmenter<T extends object> = IterableIterator<SegmentOutput<T>>;