UNPKG

intl-segmenter

Version:

A high-performance wrapper around `Intl.Segmenter` for efficient text segmentation. This class resolves memory handling issues seen with large strings and can enhance performance by 50-500x. Only ~70 loc (with comments) and no dependencies.

1 lines 3.47 kB
{"version":3,"sources":["../src/Segmenter.ts"],"sourcesContent":["export class Segmenter extends Intl.Segmenter {\n constructor(language: string, options: Intl.SegmenterOptions = {}) {\n super(language, options);\n this.language = language;\n this.options = options;\n }\n\n * segment(input: string): Generator<Intl.Segment> {\n const { maxChunkLength = 100, ...options } = this.options;\n let position = 0;\n\n while (position < input.length) {\n const remainingText = input.slice(position);\n const chunkSize = Math.min(maxChunkLength, remainingText.length);\n const potentialChunk = remainingText.slice(0, chunkSize);\n\n // Find a safe position to break the string\n const breakPoint = this.findSafeBreakPoint(potentialChunk);\n const chunk = potentialChunk.slice(0, breakPoint);\n\n // Process the chunk with Intl.Segmenter. Using this approach instead\n // of super.segment() to avoid any potential side effects.\n const segmenter = new Intl.Segmenter(this.language, { ...options });\n const segments = segmenter.segment(chunk);\n\n for (const segment of segments) {\n yield segment;\n }\n\n position += breakPoint;\n }\n }\n\n findSafeBreakPoint(input: string): number {\n // Work backwards from the end of the input\n for (let i = input.length - 1; i >= 0; i--) {\n // Check for whitespace or simple ASCII characters\n if (/\\s/.test(input[i]) || /^[\\x20-\\x7E]$/.test(input[i])) {\n return i + 1;\n }\n }\n\n // If no safe break points were found, return the full length\n return input.length;\n }\n\n getSegments(input: string): Intl.Segment[] {\n const array = [];\n\n // A for loop is much faster than Array.from, it doesn't cause a\n // maximum call stack error for large strings. Also, optimizations\n // in v8 make using `push` much faster than pre-allocating an array,\n // like `Array(input.length)` and setting the values at each index.\n for (const segment of this.segment(input)) {\n array.push(segment);\n }\n\n return array;\n }\n\n static getSegments(\n input: string,\n language: string,\n options: Intl.SegmenterOptions = {}\n ): Intl.Segment[] {\n const segmenter = new this(language, options);\n return segmenter.getSegments(input);\n }\n}\n"],"mappings":";;;;AAAO,IAAM,YAAN,cAAwB,KAAK,UAAU;AAAA,EAA9C,OAA8C;AAAA;AAAA;AAAA,EAC5C,YAAY,UAAkB,UAAiC,CAAC,GAAG;AACjE,UAAM,UAAU,OAAO;AACvB,SAAK,WAAW;AAChB,SAAK,UAAU;AAAA,EACjB;AAAA,EAEA,CAAE,QAAQ,OAAwC;AAChD,UAAM,EAAE,iBAAiB,KAAK,GAAG,QAAQ,IAAI,KAAK;AAClD,QAAI,WAAW;AAEf,WAAO,WAAW,MAAM,QAAQ;AAC9B,YAAM,gBAAgB,MAAM,MAAM,QAAQ;AAC1C,YAAM,YAAY,KAAK,IAAI,gBAAgB,cAAc,MAAM;AAC/D,YAAM,iBAAiB,cAAc,MAAM,GAAG,SAAS;AAGvD,YAAM,aAAa,KAAK,mBAAmB,cAAc;AACzD,YAAM,QAAQ,eAAe,MAAM,GAAG,UAAU;AAIhD,YAAM,YAAY,IAAI,KAAK,UAAU,KAAK,UAAU,EAAE,GAAG,QAAQ,CAAC;AAClE,YAAM,WAAW,UAAU,QAAQ,KAAK;AAExC,iBAAW,WAAW,UAAU;AAC9B,cAAM;AAAA,MACR;AAEA,kBAAY;AAAA,IACd;AAAA,EACF;AAAA,EAEA,mBAAmB,OAAuB;AAExC,aAAS,IAAI,MAAM,SAAS,GAAG,KAAK,GAAG,KAAK;AAE1C,UAAI,KAAK,KAAK,MAAM,CAAC,CAAC,KAAK,gBAAgB,KAAK,MAAM,CAAC,CAAC,GAAG;AACzD,eAAO,IAAI;AAAA,MACb;AAAA,IACF;AAGA,WAAO,MAAM;AAAA,EACf;AAAA,EAEA,YAAY,OAA+B;AACzC,UAAM,QAAQ,CAAC;AAMf,eAAW,WAAW,KAAK,QAAQ,KAAK,GAAG;AACzC,YAAM,KAAK,OAAO;AAAA,IACpB;AAEA,WAAO;AAAA,EACT;AAAA,EAEA,OAAO,YACL,OACA,UACA,UAAiC,CAAC,GAClB;AAChB,UAAM,YAAY,IAAI,KAAK,UAAU,OAAO;AAC5C,WAAO,UAAU,YAAY,KAAK;AAAA,EACpC;AACF;","names":[]}