UNPKG

intl-segmenter

Version:

A high-performance wrapper around `Intl.Segmenter` for efficient text segmentation. This class resolves memory handling issues seen with large strings and can enhance performance by 50-500x. Only ~70 loc (with comments) and no dependencies.

54 lines (53 loc) 1.64 kB
// Build-time helpers that pin the class's `name` property (presumably emitted
// by the bundler so the name survives downstream minification).
const __defProp = Object.defineProperty;
const __name = (target, value) => __defProp(target, "name", { value, configurable: true });

// src/Segmenter.ts

/** Default upper bound, in UTF-16 code units, of one chunk given to the native segmenter. */
const DEFAULT_MAX_CHUNK_LENGTH = 100;

// Non-global regexes are stateless, so they can be shared safely across calls.
const WHITESPACE = /\s/;
const PRINTABLE_ASCII = /^[\x20-\x7E]$/;

/** True when `codeUnit` is the first half of a UTF-16 surrogate pair. */
const isHighSurrogate = (codeUnit) => codeUnit >= 0xd800 && codeUnit <= 0xdbff;

/**
 * Wrapper around `Intl.Segmenter` that segments long strings in small chunks
 * instead of handing the whole string to the native segmenter at once.
 *
 * Chunk boundaries are chosen heuristically (see `findSafeBreakPoint`), so a
 * segment that the native segmenter would keep together across a boundary
 * (for example a sentence longer than `maxChunkLength`) can still be reported
 * in pieces.
 */
class Segmenter extends Intl.Segmenter {
  static {
    __name(this, "Segmenter");
  }

  /**
   * @param {string | string[]} [language] - Locale(s), forwarded to `Intl.Segmenter`.
   * @param {object} [options] - `Intl.Segmenter` options plus an optional
   *   `maxChunkLength` (default 100): the largest chunk, in UTF-16 code units,
   *   passed to the native segmenter in one go.
   */
  constructor(language, options = {}) {
    super(language, options);
    this.language = language;
    this.options = options;
  }

  /**
   * Lazily segments `input`, chunk by chunk.
   *
   * @param {string} input - Text to segment; coerced to a string like the native method does.
   * @yields {{segment: string, index: number, input: string, isWordLike?: boolean}}
   *   Segment data whose `index` and `input` refer to the full input string,
   *   not to the internal chunk.
   * @throws {RangeError} If `maxChunkLength` is not a number >= 1 (smaller
   *   values could never make progress).
   */
  *segment(input) {
    const { maxChunkLength = DEFAULT_MAX_CHUNK_LENGTH, ...nativeOptions } = this.options;
    const limit = Math.floor(Number(maxChunkLength));
    // `!(limit >= 1)` also rejects NaN; Infinity is allowed and disables chunking.
    if (!(limit >= 1)) {
      throw new RangeError(
        `maxChunkLength must be a number >= 1, received ${String(maxChunkLength)}`,
      );
    }

    const text = `${input}`;
    // Loop-invariant: one native segmenter serves every chunk of this call.
    const nativeSegmenter = new Intl.Segmenter(this.language, nativeOptions);

    let position = 0;
    while (position < text.length) {
      const windowText = text.slice(position, position + limit);
      // The trailing window needs no cut: there is nothing after it to protect.
      const reachesEnd = position + windowText.length >= text.length;
      const cut = reachesEnd ? windowText.length : this.findSafeBreakPoint(windowText);
      const chunk = windowText.slice(0, cut);

      for (const data of nativeSegmenter.segment(chunk)) {
        // Rebase chunk-relative data onto the caller's string.
        yield { ...data, index: data.index + position, input: text };
      }
      position += cut;
    }
  }

  /**
   * Picks the offset at which a window of text should be cut.
   *
   * Preference order: just after the last whitespace character; otherwise just
   * after the last printable ASCII character; otherwise the end of the window,
   * backing off one code unit if that would separate a surrogate pair.
   *
   * @param {string} input - The window about to be cut.
   * @returns {number} Cut offset in `1..input.length` (0 only for an empty string).
   */
  findSafeBreakPoint(input) {
    let asciiCut = -1;
    for (let i = input.length - 1; i >= 0; i--) {
      const char = input[i];
      if (WHITESPACE.test(char)) {
        return i + 1;
      }
      if (asciiCut === -1 && PRINTABLE_ASCII.test(char)) {
        asciiCut = i + 1;
      }
    }
    if (asciiCut !== -1) {
      return asciiCut;
    }

    const lastIndex = input.length - 1;
    // Keep a trailing high surrogate together with its low half in the next chunk.
    if (lastIndex > 0 && isHighSurrogate(input.charCodeAt(lastIndex))) {
      return lastIndex;
    }
    return input.length;
  }

  /**
   * Eagerly collects every segment of `input`.
   *
   * @param {string} input - Text to segment.
   * @returns {object[]} All segment data objects, in order.
   */
  getSegments(input) {
    return [...this.segment(input)];
  }

  /**
   * Convenience helper: builds a segmenter and returns all segments of `input`.
   *
   * @param {string} input - Text to segment.
   * @param {string | string[]} [language] - Locale(s), forwarded to the constructor.
   * @param {object} [options] - Same options as the constructor.
   * @returns {object[]} All segment data objects, in order.
   */
  static getSegments(input, language, options = {}) {
    return new this(language, options).getSegments(input);
  }
}

export { Segmenter };
//# sourceMappingURL=index.mjs.map