intl-segmenter
Version:
A high-performance wrapper around `Intl.Segmenter` for efficient text segmentation. This class resolves memory handling issues seen with large strings and can enhance performance by 50-500x. Only ~70 loc (with comments) and no dependencies.
79 lines (77 loc) • 2.64 kB
JavaScript
// Short aliases for the reflection primitives shared by the helpers below.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Redefines `target.name` as `value` (configurable) and returns `target`.
 */
var __name = (target, value) => {
  const nameDescriptor = { value, configurable: true };
  return __defProp(target, "name", nameDescriptor);
};

/**
 * Installs one enumerable accessor on `target` per enumerable key of `all`
 * (inherited keys included, as with any `for...in`). Each value of `all` is
 * used as the getter, so it is only evaluated when the property is read.
 */
var __export = (target, all) => {
  for (const exportName in all) {
    const getter = all[exportName];
    __defProp(target, exportName, { get: getter, enumerable: true });
  }
};

/**
 * Mirrors the own string-keyed properties of `from` onto `to` as live getters.
 * Keys already owned by `to`, and the key equal to `except`, are skipped.
 * Enumerability is copied from the source descriptor. `desc` is scratch
 * storage only. Returns `to`; a `from` that is neither an object nor a
 * function leaves `to` untouched.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable,
    });
  }
  return to;
};

/**
 * Builds a fresh object flagged with a non-enumerable `__esModule: true` and
 * mirrors every own property of `mod` onto it.
 */
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
// index.ts
// Namespace object that collects the public exports of this entry point.
var intl_segmenter_exports = {};
// Register `Segmenter` as a getter-backed export. The arrow function is only
// invoked when the property is read, so it may refer to a binding that is
// assigned later in the file.
__export(intl_segmenter_exports, {
Segmenter: () => Segmenter
});
// Publish the exports as the CommonJS module value: `__toCommonJS` mirrors
// them onto a fresh object that also carries `__esModule: true`.
module.exports = __toCommonJS(intl_segmenter_exports);
// src/Segmenter.ts
/**
 * Chunked wrapper around `Intl.Segmenter`.
 *
 * The input is segmented in windows of roughly `maxChunkLength` UTF-16 code
 * units instead of in one pass. Windows are only ever cut at boundaries that
 * the underlying segmenter itself reported, so a grapheme cluster, surrogate
 * pair or word is not split just because it straddles a window edge.
 *
 * Known limitation: boundaries whose placement depends on more context than
 * the two segments that follow them (some sentence rules, dictionary-based
 * word breaking) can still differ from a single-pass `Intl.Segmenter` result
 * near a window edge. Raise `maxChunkLength` for such workloads.
 */
var Segmenter = class Segmenter extends Intl.Segmenter {
  /**
   * @param {string | string[]} [language] Locale(s), forwarded to `Intl.Segmenter`.
   * @param {object} [options] `Intl.Segmenter` options plus an optional
   *   `maxChunkLength` (default 100): the target window size in UTF-16 code units.
   */
  constructor(language, options = {}) {
    super(language, options);
    this.language = language;
    this.options = options;
  }

  /**
   * Lazily yields the segments of `input`.
   *
   * Each yielded object has the same fields as a native segment data object.
   * `index` is the offset into the full input (not into the internal window)
   * and `input` is the full input string.
   *
   * @param {string} input Text to segment; coerced with `String()`.
   * @yields {{segment: string, index: number, input: string, isWordLike?: boolean}}
   * @throws {RangeError} On first iteration, if `maxChunkLength` is not a number >= 1.
   */
  *segment(input) {
    const { maxChunkLength = 100, ...options } = this.options;
    const baseSpan = Math.floor(Number(maxChunkLength));
    if (!(baseSpan >= 1)) {
      // Zero, negative or NaN would never advance the cursor.
      throw new RangeError(
        `maxChunkLength must be a number >= 1, received ${String(maxChunkLength)}`,
      );
    }

    const text = String(input);
    const total = text.length;
    // One segmenter serves every window; locale and options never change.
    const segmenter = new Intl.Segmenter(this.language, options);
    // Trailing segments of a non-final window may be truncated, or may have
    // been delimited without the context that follows them, so they are
    // re-read as the start of the next window instead of being emitted.
    const heldBack = 2;

    let position = 0;
    let span = baseSpan;
    while (position < total) {
      let end = Math.min(position + span, total);
      if (end < total) {
        // Do not end the window between the two halves of a surrogate pair.
        const lastUnit = text.charCodeAt(end - 1);
        const nextUnit = text.charCodeAt(end);
        const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
        const nextIsLowSurrogate = nextUnit >= 0xdc00 && nextUnit <= 0xdfff;
        if (endsOnHighSurrogate && nextIsLowSurrogate) {
          end += 1;
        }
      }

      const pieces = Array.from(segmenter.segment(text.slice(position, end)));
      if (end < total) {
        if (pieces.length <= heldBack) {
          // Too few segments to emit anything safely: widen the window.
          span *= 2;
          continue;
        }
        pieces.length -= heldBack;
      }

      for (const piece of pieces) {
        yield { ...piece, index: position + piece.index, input: text };
      }

      const tail = pieces[pieces.length - 1];
      position += tail.index + tail.segment.length;
      span = baseSpan;
    }
  }

  /**
   * Returns the offset just past the last whitespace or printable-ASCII code
   * unit in `input`, or `input.length` when there is none.
   *
   * Kept for backward compatibility only; `segment()` no longer calls it
   * because an offset after an ASCII character is not necessarily a segment
   * boundary (it can fall inside a word or before a combining mark).
   *
   * @param {string} input Candidate chunk.
   * @returns {number} Offset in the range 0..input.length.
   */
  findSafeBreakPoint(input) {
    for (let i = input.length - 1; i >= 0; i--) {
      if (/\s/.test(input[i]) || /^[\x20-\x7E]$/.test(input[i])) {
        return i + 1;
      }
    }
    return input.length;
  }

  /**
   * Eagerly collects every segment of `input` into an array.
   *
   * @param {string} input Text to segment.
   * @returns {Array<object>} Segment data objects in input order.
   */
  getSegments(input) {
    return Array.from(this.segment(input));
  }

  /**
   * One-shot convenience: builds a segmenter and returns all segments of `input`.
   *
   * @param {string} input Text to segment.
   * @param {string | string[]} [language] Locale(s) for the segmenter.
   * @param {object} [options] Same options as the constructor.
   * @returns {Array<object>} Segment data objects in input order.
   */
  static getSegments(input, language, options = {}) {
    const segmenter = new this(language, options);
    return segmenter.getSegments(input);
  }
};
// Annotate the CommonJS export names for ESM import in node:
// The `0 &&` guard short-circuits, so this assignment never executes; the
// statement exists only to spell out the export names in the source text.
// NOTE(review): presumably read by Node's static CommonJS named-export
// detection (cjs-module-lexer); confirm before changing its shape.
0 && (module.exports = {
Segmenter
});
//# sourceMappingURL=index.js.map
;