UNPKG

@ssttevee/streamsearch

Version:

A port of streamsearch for ES modules using Web APIs

273 lines (266 loc) 9.78 kB
'use strict';

Object.defineProperty(exports, '__esModule', { value: true });

const u8Utils = require('@ssttevee/u8-utils');

/*
  Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation
  by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
*/

/**
 * Turns a byte source into an `index => byte` accessor.
 *
 * @param {Uint8Array|function(number): number} a - A byte array, or a value
 *   that is already an accessor function (anything that is not a Uint8Array
 *   is returned untouched).
 * @returns {function(number): number} Accessor for the given source.
 */
function coerce(a) {
    if (a instanceof Uint8Array) {
        return (index) => a[index];
    }
    return a;
}

/**
 * Compares `len` bytes of two sources for strict equality.
 *
 * @param {Uint8Array|function(number): number} buf1 - First source.
 * @param {number} pos1 - Start offset in the first source.
 * @param {Uint8Array|function(number): number} buf2 - Second source.
 * @param {number} pos2 - Start offset in the second source.
 * @param {number} len - Number of positions to compare.
 * @returns {boolean} `true` when every compared position is strictly equal
 *   (also `true` when `len` is zero or negative).
 */
function jsmemcmp(buf1, pos1, buf2, pos2, len) {
    const fn1 = coerce(buf1);
    const fn2 = coerce(buf2);
    for (let i = 0; i < len; ++i) {
        if (fn1(pos1 + i) !== fn2(pos2 + i)) {
            return false;
        }
    }
    return true;
}

/**
 * Builds the Boyer-Moore-Horspool skip table for a needle.
 *
 * Every entry defaults to the needle length; each needle byte except the
 * last one is then mapped to its distance from the end of the needle.
 *
 * @param {Uint8Array} s - The needle bytes.
 * @returns {number[]} A 256-entry table indexed by byte value.
 */
function createOccurenceTable(s) {
    const table = new Array(256).fill(s.length);
    const last = s.length - 1;
    // The last byte is deliberately left out of the analysis.
    for (let i = 0; i < last; i++) {
        table[s[i]] = last - i;
    }
    return table;
}

/** Token emitted in place of each occurrence of the needle. */
const MATCH = Symbol('Match');

/**
 * Incremental needle search over a sequence of byte chunks.
 *
 * Bytes that might be the beginning of a needle straddling a chunk boundary
 * are held back in an internal look-behind buffer until the next chunk (or
 * `end()`) decides their fate.
 */
class StreamSearch {
    /**
     * @param {string|Uint8Array} needle - The byte sequence to look for.
     *   Strings are converted with `u8Utils.stringToArray`.
     * @throws {Error} When the needle is empty. An empty needle makes the
     *   skip-table lookup yield `undefined`, which previously turned the
     *   scan position into `NaN` and silently discarded all fed data.
     */
    constructor(needle) {
        const bytes = typeof needle === 'string' ? u8Utils.stringToArray(needle) : needle;
        if (bytes.length === 0) {
            throw new Error('The needle cannot be empty.');
        }
        this._lookbehind = new Uint8Array();
        this._needle = bytes;
        this._lastChar = bytes[bytes.length - 1];
        this._occ = createOccurenceTable(bytes);
    }

    /**
     * Searches one chunk and returns everything that can be emitted so far.
     *
     * @param {Uint8Array} chunk - The next piece of the haystack.
     * @returns {Array<Uint8Array|symbol>} Data tokens interleaved with
     *   `MATCH` symbols, in stream order.
     */
    feed(chunk) {
        let pos = 0;
        let tokens;
        const allTokens = [];
        // Each _feed call stops after the first match it finds (or at the
        // end of the chunk), so keep calling until the chunk is consumed.
        while (pos !== chunk.length) {
            [pos, ...tokens] = this._feed(chunk, pos);
            allTokens.push(...tokens);
        }
        return allTokens;
    }

    /**
     * Flushes the search.
     *
     * @returns {Uint8Array} The held-back bytes (possibly empty); the
     *   look-behind buffer is reset afterwards.
     */
    end() {
        const tail = this._lookbehind;
        this._lookbehind = new Uint8Array();
        return tail;
    }

    /**
     * Runs one search step over `data`, starting at `buf_pos`.
     *
     * @param {Uint8Array} data - The current chunk.
     * @param {number} buf_pos - Offset in `data` where unprocessed bytes begin.
     * @returns {Array} `[nextPos, ...tokens]`: the offset in `data` at which
     *   the next step must resume, followed by the tokens produced.
     */
    _feed(data, buf_pos) {
        const tokens = [];
        const needle = this._needle;
        const needleLength = needle.length;
        const lastNeedleIndex = needleLength - 1;
        const dataLength = data.length;

        // Positive: points to a position in `data`
        //   pos == 3 points to data[3]
        // Negative: points to a position in the lookbehind buffer
        //   pos == -2 points to lookbehind[lookbehind_size - 2]
        let pos = -this._lookbehind.length;

        if (pos < 0) {
            // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
            // search with character lookup code that considers both the
            // lookbehind buffer and the current round's haystack data.
            //
            // Loop until (condition 1)
            //   there is a match,
            // or until (condition 2)
            //   we've moved past the position that requires the
            //   lookbehind buffer (then we switch to the optimized loop),
            // or until (condition 3)
            //   the character to look at lies outside the haystack.
            while (pos < 0 && pos <= dataLength - needleLength) {
                const ch = this._charAt(data, pos + lastNeedleIndex);
                if (ch === this._lastChar && this._memcmp(data, pos, lastNeedleIndex)) {
                    if (pos > -this._lookbehind.length) {
                        // Bytes in front of the match are plain data.
                        tokens.push(this._lookbehind.slice(0, this._lookbehind.length + pos));
                    }
                    tokens.push(MATCH);
                    this._lookbehind = new Uint8Array();
                    return [pos + needleLength, ...tokens];
                }
                pos += this._occ[ch];
            }

            // No match. There may be too little data for Boyer-Moore-Horspool
            // to run, so advance pos until the trailing part of
            // lookbehind + data looks like the beginning of the needle,
            // or until pos == 0.
            while (pos < 0 && !this._memcmp(data, pos, dataLength - pos)) {
                pos++;
            }

            if (pos < 0) {
                // Cut off the part of the lookbehind buffer that has been
                // processed and append the entire haystack to what remains.
                const bytesToCutOff = this._lookbehind.length + pos;
                if (bytesToCutOff > 0) {
                    // The cut off data is guaranteed not to contain the needle.
                    tokens.push(this._lookbehind.slice(0, bytesToCutOff));
                    this._lookbehind = this._lookbehind.slice(bytesToCutOff);
                }
                const merged = new Uint8Array(this._lookbehind.length + dataLength);
                merged.set(this._lookbehind, 0);
                merged.set(data, this._lookbehind.length);
                this._lookbehind = merged;
                return [dataLength, ...tokens];
            }

            // pos >= 0: nothing in the lookbehind buffer can be part of a
            // match any more, so hand it out as data and discard it.
            tokens.push(this._lookbehind);
            this._lookbehind = new Uint8Array();
        }

        pos += buf_pos;

        // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool
        // search with optimized character lookup code that only considers
        // the current round's haystack data.
        while (pos <= dataLength - needleLength) {
            const ch = data[pos + lastNeedleIndex];
            if (ch === this._lastChar
                && data[pos] === needle[0]
                && jsmemcmp(needle, 0, data, pos, lastNeedleIndex)) {
                if (pos > buf_pos) {
                    tokens.push(data.slice(buf_pos, pos));
                }
                tokens.push(MATCH);
                return [pos + needleLength, ...tokens];
            }
            pos += this._occ[ch];
        }

        // There was no match. If there's trailing haystack data that we cannot
        // match yet using the Boyer-Moore-Horspool algorithm (because the
        // trailing data is shorter than the needle) then match using a
        // modified algorithm that starts matching from the beginning instead
        // of the end. Whatever trailing data is left after running this
        // algorithm is stored in the lookbehind buffer.
        if (pos < dataLength) {
            while (pos < dataLength
                && (data[pos] !== needle[0] || !jsmemcmp(data, pos, needle, 0, dataLength - pos))) {
                ++pos;
            }
            if (pos < dataLength) {
                this._lookbehind = data.slice(pos);
            }
        }

        // Everything in [buf_pos, pos) is guaranteed not to contain needle
        // data. Comparing against buf_pos (rather than 0) avoids emitting a
        // zero-length chunk when the held-back tail starts right at buf_pos.
        if (pos > buf_pos) {
            tokens.push(data.slice(buf_pos, pos < dataLength ? pos : dataLength));
        }
        return [dataLength, ...tokens];
    }

    /**
     * Reads a byte from the virtual concatenation `lookbehind + data`.
     *
     * @param {Uint8Array} data - The current chunk.
     * @param {number} pos - Index into `data`; negative values count back
     *   from the end of the lookbehind buffer.
     * @returns {number|undefined} The byte, or `undefined` when out of range.
     */
    _charAt(data, pos) {
        if (pos < 0) {
            return this._lookbehind[this._lookbehind.length + pos];
        }
        return data[pos];
    }

    /**
     * Compares `len` bytes of `lookbehind + data`, starting at `pos`, with
     * the beginning of the needle.
     *
     * @param {Uint8Array} data - The current chunk.
     * @param {number} pos - Start position (may be negative, see `_charAt`).
     * @param {number} len - Number of bytes to compare.
     * @returns {boolean} `true` when all compared bytes are equal.
     */
    _memcmp(data, pos, len) {
        return jsmemcmp((index) => this._charAt(data, index), pos, this._needle, 0, len);
    }
}

/**
 * Async-iterable adapter that runs a `StreamSearch` over a readable stream.
 *
 * Iterating yields data chunks and `MATCH` symbols; any bytes still held
 * back when the stream finishes are yielded last if non-empty.
 */
class ReadableStreamSearch {
    /**
     * @param {string|Uint8Array} needle - The byte sequence to look for.
     * @param {ReadableStream} _readableStream - Source whose `getReader()`
     *   is read until done.
     */
    constructor(needle, _readableStream) {
        this._readableStream = _readableStream;
        this._search = new StreamSearch(needle);
    }

    async *[Symbol.asyncIterator]() {
        const reader = this._readableStream.getReader();
        try {
            while (true) {
                const result = await reader.read();
                if (result.done) {
                    break;
                }
                yield* this._search.feed(result.value);
            }
            const tail = this._search.end();
            if (tail.length) {
                yield tail;
            }
        } finally {
            // Release the lock even if the consumer stops iterating early.
            reader.releaseLock();
        }
    }
}

/**
 * Splits a sequence of chunks on every occurrence of `needle`.
 *
 * @param {Iterable<Uint8Array>} chunks - Haystack pieces, in order.
 * @param {string|Uint8Array} needle - The separator.
 * @returns {Array} One entry per segment between matches, each produced by
 *   `u8Utils.mergeArrays` from that segment's pieces.
 */
function splitChunks(chunks, needle) {
    const search = new StreamSearch(needle);
    const outchunks = [[]];
    for (const chunk of chunks) {
        for (const token of search.feed(chunk)) {
            if (token === MATCH) {
                outchunks.push([]);
            } else {
                outchunks[outchunks.length - 1].push(token);
            }
        }
    }
    // Whatever is still held back belongs to the final segment.
    const end = search.end();
    outchunks[outchunks.length - 1].push(end);
    return outchunks.map((pieces) => u8Utils.mergeArrays(...pieces));
}

/**
 * Splits a single buffer on every occurrence of `needle`.
 *
 * @param {Uint8Array} buf - The haystack.
 * @param {string|Uint8Array} needle - The separator.
 * @returns {Array} See `splitChunks`.
 */
function split(buf, needle) {
    return splitChunks([buf], needle);
}

/**
 * Groups a token stream into arrays of chunks, one array per segment
 * delimited by `MATCH`. The final (possibly empty) group is always yielded.
 *
 * @param {AsyncIterable<Uint8Array|symbol>} iter - Token stream.
 */
async function* chunksIterator(iter) {
    let chunks = [];
    for await (const value of iter) {
        if (value === MATCH) {
            yield chunks;
            chunks = [];
        } else {
            chunks.push(value);
        }
    }
    yield chunks;
}

/**
 * Like `chunksIterator`, but yields each segment as one string built by
 * mapping its chunks through `u8Utils.arrayToString` and joining them.
 *
 * @param {AsyncIterable<Uint8Array|symbol>} iter - Token stream.
 */
async function* stringIterator(iter) {
    for await (const chunk of chunksIterator(iter)) {
        yield chunk.map(u8Utils.arrayToString).join('');
    }
}

/**
 * Collects every segment of a token stream as a string.
 *
 * @param {AsyncIterable<Uint8Array|symbol>} iter - Token stream.
 * @returns {Promise<string[]>} All segments, in order.
 */
async function allStrings(iter) {
    const segments = [];
    for await (const value of stringIterator(iter)) {
        segments.push(value);
    }
    return segments;
}

/**
 * Like `chunksIterator`, but yields each segment merged into a single
 * array via `u8Utils.mergeArrays`.
 *
 * @param {AsyncIterable<Uint8Array|symbol>} iter - Token stream.
 */
async function* arrayIterator(iter) {
    for await (const chunk of chunksIterator(iter)) {
        yield u8Utils.mergeArrays(...chunk);
    }
}

exports.MATCH = MATCH;
exports.ReadableStreamSearch = ReadableStreamSearch;
exports.StreamSearch = StreamSearch;
exports.allStrings = allStrings;
exports.arrayIterator = arrayIterator;
exports.chunksIterator = chunksIterator;
exports.split = split;
exports.splitChunks = splitChunks;
exports.stringIterator = stringIterator;
//# sourceMappingURL=index.cjs.map