UNPKG

intl-segmenter

Version:

A high-performance wrapper around `Intl.Segmenter` for efficient text segmentation. This class resolves memory-handling issues seen with large strings and can improve performance by 50–500x. Only ~70 lines of code (including comments) and no dependencies.

79 lines (77 loc) 2.64 kB
"use strict"; var __defProp = Object.defineProperty; var __getOwnPropDesc = Object.getOwnPropertyDescriptor; var __getOwnPropNames = Object.getOwnPropertyNames; var __hasOwnProp = Object.prototype.hasOwnProperty; var __name = (target, value) => __defProp(target, "name", { value, configurable: true }); var __export = (target, all) => { for (var name in all) __defProp(target, name, { get: all[name], enumerable: true }); }; var __copyProps = (to, from, except, desc) => { if (from && typeof from === "object" || typeof from === "function") { for (let key of __getOwnPropNames(from)) if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); } return to; }; var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // index.ts var intl_segmenter_exports = {}; __export(intl_segmenter_exports, { Segmenter: () => Segmenter }); module.exports = __toCommonJS(intl_segmenter_exports); // src/Segmenter.ts var Segmenter = class extends Intl.Segmenter { static { __name(this, "Segmenter"); } constructor(language, options = {}) { super(language, options); this.language = language; this.options = options; } *segment(input) { const { maxChunkLength = 100, ...options } = this.options; let position = 0; while (position < input.length) { const remainingText = input.slice(position); const chunkSize = Math.min(maxChunkLength, remainingText.length); const potentialChunk = remainingText.slice(0, chunkSize); const breakPoint = this.findSafeBreakPoint(potentialChunk); const chunk = potentialChunk.slice(0, breakPoint); const segmenter = new Intl.Segmenter(this.language, { ...options }); const segments = segmenter.segment(chunk); for (const segment of segments) { yield segment; } position += breakPoint; } } findSafeBreakPoint(input) { for (let i = input.length - 1; i >= 0; i--) { if (/\s/.test(input[i]) || /^[\x20-\x7E]$/.test(input[i])) { return i + 1; } } 
return input.length; } getSegments(input) { const array = []; for (const segment of this.segment(input)) { array.push(segment); } return array; } static getSegments(input, language, options = {}) { const segmenter = new this(language, options); return segmenter.getSegments(input); } }; // Annotate the CommonJS export names for ESM import in node: 0 && (module.exports = { Segmenter }); //# sourceMappingURL=index.js.map