intl-segmenter
Version:
A high-performance wrapper around `Intl.Segmenter` for efficient text segmentation. This class resolves the memory-handling issues seen with large strings and can improve performance by 50-500x. It is only ~70 lines of code (including comments) and has no dependencies.
54 lines (53 loc) • 1.64 kB
JavaScript
const __defProp = Object.defineProperty;
// Bundler helper: (re)defines the `name` property of a function or class.
const __name = (target, value) => __defProp(target, "name", { value, configurable: true });
// src/Segmenter.ts
/**
 * Chunking wrapper around `Intl.Segmenter`.
 *
 * Instead of handing the whole input to the native segmenter at once, the
 * text is cut into chunks of at most `maxChunkLength` UTF-16 code units and
 * each chunk is segmented on its own.
 *
 * Because chunks are segmented independently, a unit that is longer than one
 * chunk (for example a long sentence with `granularity: "sentence"`) is still
 * reported as several segments.
 */
const Segmenter = class extends Intl.Segmenter {
  static {
    __name(this, "Segmenter");
  }

  /**
   * @param {string | string[]} [language] - Locale(s) forwarded to `Intl.Segmenter`.
   * @param {object} [options] - `Intl.Segmenter` options, plus an optional
   *   `maxChunkLength` (default 100) giving the chunk size in UTF-16 code units.
   */
  constructor(language, options = {}) {
    super(language, options);
    this.language = language;
    this.options = options;
  }

  /**
   * Lazily segments `input` chunk by chunk.
   *
   * Every yielded object is a copy of the native segment data in which
   * `index` is the offset into the whole input (not into the chunk) and
   * `input` is the whole input string.
   *
   * @param {string} input - Text to segment; coerced with `String()`.
   * @yields {{segment: string, index: number, input: string, isWordLike?: boolean}}
   * @throws {RangeError} If `maxChunkLength` is not a number >= 1; such a
   *   value would produce empty chunks and never advance.
   */
  *segment(input) {
    const { maxChunkLength = 100, ...options } = this.options;
    const chunkLimit = Math.floor(Number(maxChunkLength));
    // Written as a negated comparison so that NaN is rejected as well.
    if (!(chunkLimit >= 1)) {
      throw new RangeError(
        `maxChunkLength must be a number >= 1, got ${String(maxChunkLength)}`,
      );
    }

    const text = String(input);
    // One native segmenter per call is enough; it does not depend on the chunk.
    const segmenter = new Intl.Segmenter(this.language, options);

    let position = 0;
    while (position < text.length) {
      const candidate = text.slice(position, position + chunkLimit);
      let end = position + this.findSafeBreakPoint(candidate);

      // Never cut between the two halves of a surrogate pair: pull the low
      // surrogate into this chunk (the chunk may exceed the limit by one unit).
      if (
        end < text.length &&
        (text.charCodeAt(end - 1) & 0xfc00) === 0xd800 &&
        (text.charCodeAt(end) & 0xfc00) === 0xdc00
      ) {
        end += 1;
      }

      const chunk = text.slice(position, end);
      for (const part of segmenter.segment(chunk)) {
        // Rebase chunk-relative data onto the full input.
        yield { ...part, index: part.index + position, input: text };
      }
      position = end;
    }
  }

  /**
   * Picks the length of the prefix of `input` to use as the next chunk.
   *
   * Preference order, scanning from the end of `input`:
   *   1. just after the last whitespace character, so words are not cut
   *      while whitespace is available;
   *   2. just after the last printable ASCII character (0x20-0x7E);
   *   3. the full length of `input`.
   *
   * @param {string} input - Candidate chunk.
   * @returns {number} Break offset; at least 1 for non-empty input.
   */
  findSafeBreakPoint(input) {
    const whitespace = /\s/;
    let asciiBreak = -1;
    for (let i = input.length - 1; i >= 0; i--) {
      if (whitespace.test(input[i])) {
        return i + 1;
      }
      if (asciiBreak === -1) {
        const code = input.charCodeAt(i);
        if (code >= 0x20 && code <= 0x7e) {
          asciiBreak = i + 1;
        }
      }
    }
    return asciiBreak === -1 ? input.length : asciiBreak;
  }

  /**
   * Eagerly collects every segment of `input` into an array.
   *
   * @param {string} input - Text to segment.
   * @returns {object[]} All objects yielded by `segment(input)`, in order.
   */
  getSegments(input) {
    return [...this.segment(input)];
  }

  /**
   * Convenience helper: builds a segmenter and returns all segments of `input`.
   *
   * @param {string} input - Text to segment.
   * @param {string | string[]} [language] - Locale(s) for the segmenter.
   * @param {object} [options] - Same options as the constructor.
   * @returns {object[]} All segments of `input`.
   */
  static getSegments(input, language, options = {}) {
    const segmenter = new this(language, options);
    return segmenter.getSegments(input);
  }
};
export {
Segmenter
};
//# sourceMappingURL=index.mjs.map