/*
 * @ssttevee/streamsearch
 * A port of streamsearch for ES modules using Web APIs.
 * (Listing metadata: JavaScript, 306 lines / 300 loc, 10.8 kB.)
 */
import { stringToArray, mergeArrays, arrayToString } from '@ssttevee/u8-utils';
/*
Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation
by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool
*/
/**
 * Normalises a byte source into an index-accessor function.
 *
 * @param {Uint8Array|function(number): *} a - A byte array, or something
 *   that is already an accessor.
 * @returns {function(number): *} For a `Uint8Array`, a function returning
 *   the element at the given index; any other value is returned untouched.
 */
function coerce(a) {
    return a instanceof Uint8Array ? (index) => a[index] : a;
}
/**
 * Compares `len` consecutive elements of two byte sources.
 *
 * Both sources are passed through `coerce`, so each may be a `Uint8Array`
 * or an index-accessor function.
 *
 * @param {Uint8Array|function(number): *} buf1 - First source.
 * @param {number} pos1 - Start index in the first source.
 * @param {Uint8Array|function(number): *} buf2 - Second source.
 * @param {number} pos2 - Start index in the second source.
 * @param {number} len - Number of elements to compare.
 * @returns {boolean} `true` when every compared pair is strictly equal
 *   (always `true` when `len` is not positive), otherwise `false`.
 */
function jsmemcmp(buf1, pos1, buf2, pos2, len) {
    const readLeft = coerce(buf1);
    const readRight = coerce(buf2);
    let offset = 0;
    while (offset < len) {
        const same = readLeft(pos1 + offset) === readRight(pos2 + offset);
        if (!same) {
            return false;
        }
        offset += 1;
    }
    return true;
}
/**
 * Builds the Boyer-Moore-Horspool skip table for a needle.
 *
 * Every one of the 256 slots starts at the needle length. Each needle
 * element except the last then overwrites its slot with its distance from
 * the last position, so later occurrences win over earlier ones.
 *
 * @param {ArrayLike<number>} s - The needle.
 * @returns {number[]} A 256-entry array of skip distances.
 */
function createOccurenceTable(s) {
    const shifts = new Array(256).fill(s.length);
    const lastIndex = s.length - 1;
    let index = 0;
    // The final element is deliberately left out of the analysis.
    while (index < lastIndex) {
        shifts[s[index]] = lastIndex - index;
        index += 1;
    }
    return shifts;
}
/**
 * Sentinel token that StreamSearch places in its token stream wherever the
 * needle was found. Consumers tell it apart from data tokens by identity
 * (`token === MATCH`).
 */
const MATCH = Symbol('Match');
/**
 * Streaming Boyer-Moore-Horspool search for a fixed byte sequence.
 *
 * Data is supplied chunk by chunk through `feed()`. Each call returns the
 * tokens found in that chunk: `Uint8Array` pieces that are not part of an
 * occurrence of the needle, and the `MATCH` symbol for every occurrence.
 * Trailing bytes that could still be the start of an occurrence are held in
 * an internal lookbehind buffer until more data arrives or `end()` is called.
 */
class StreamSearch {
    /**
     * @param {string|Uint8Array} needle - Sequence to search for. A string
     *   is converted to bytes with `stringToArray`.
     */
    constructor(needle) {
        const bytes = typeof needle === 'string' ? stringToArray(needle) : needle;
        this._lookbehind = new Uint8Array();
        this._needle = bytes;
        this._lastChar = bytes[bytes.length - 1];
        this._occ = createOccurenceTable(bytes);
    }

    /**
     * Searches one chunk of input.
     *
     * @param {Uint8Array} chunk - Next piece of the haystack.
     * @returns {Array<Uint8Array|symbol>} Data tokens and `MATCH` markers in
     *   stream order. Bytes that may begin a needle are withheld in the
     *   lookbehind buffer instead of being returned.
     */
    feed(chunk) {
        let pos = 0;
        let tokens;
        const allTokens = [];
        // _feed stops at the first match; keep resuming until the chunk is used up.
        while (pos !== chunk.length) {
            [pos, ...tokens] = this._feed(chunk, pos);
            allTokens.push(...tokens);
        }
        return allTokens;
    }

    /**
     * Finishes the search.
     *
     * @returns {Uint8Array} The bytes still held in the lookbehind buffer
     *   (possibly empty). The buffer is reset afterwards.
     */
    end() {
        const tail = this._lookbehind;
        this._lookbehind = new Uint8Array();
        return tail;
    }

    /**
     * Processes `data` from `buf_pos` up to and including the first match,
     * or to the end of the chunk when there is none.
     *
     * @param {Uint8Array} data - The current chunk.
     * @param {number} buf_pos - Offset in `data` at which to resume.
     * @returns {Array} `[nextPos, ...tokens]`, where `nextPos` is the offset
     *   to resume from (`data.length` once the chunk is exhausted).
     */
    _feed(data, buf_pos) {
        const tokens = [];
        // Positive: points to a position in `data`
        //    pos == 3 points to data[3]
        // Negative: points to a position in the lookbehind buffer
        //    pos == -2 points to lookbehind[lookbehind_size - 2]
        let pos = -this._lookbehind.length;
        if (pos < 0) {
            // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool
            // search with character lookup code that considers both the
            // lookbehind buffer and the current round's haystack data.
            //
            // Loop until (condition 1)
            //   there is a match.
            // or until (condition 2)
            //   we've moved past the position that requires the
            //   lookbehind buffer. In this case we switch to the
            //   optimized loop.
            // or until (condition 3)
            //   the character to look at lies outside the haystack.
            while (pos < 0 && pos <= data.length - this._needle.length) {
                const ch = this._charAt(data, pos + this._needle.length - 1);
                if (ch === this._lastChar && this._memcmp(data, pos, this._needle.length - 1)) {
                    if (pos > -this._lookbehind.length) {
                        // Lookbehind bytes in front of the match are plain data.
                        tokens.push(this._lookbehind.slice(0, this._lookbehind.length + pos));
                    }
                    tokens.push(MATCH);
                    this._lookbehind = new Uint8Array();
                    return [pos + this._needle.length, ...tokens];
                }
                else {
                    pos += this._occ[ch];
                }
            }
            // No match.
            if (pos < 0) {
                // There's too little data for Boyer-Moore-Horspool to run,
                // so we'll use a different algorithm to skip as much as
                // we can.
                // Forward pos until
                //   the trailing part of lookbehind + data
                //   looks like the beginning of the needle
                // or until
                //   pos == 0
                while (pos < 0 && !this._memcmp(data, pos, data.length - pos)) {
                    pos++;
                }
            }
            if (pos >= 0) {
                // Discard lookbehind buffer.
                tokens.push(this._lookbehind);
                this._lookbehind = new Uint8Array();
            }
            else {
                // Cut off part of the lookbehind buffer that has
                // been processed and append the entire haystack
                // into it.
                const bytesToCutOff = this._lookbehind.length + pos;
                if (bytesToCutOff > 0) {
                    // The cut off data is guaranteed not to contain the needle.
                    tokens.push(this._lookbehind.slice(0, bytesToCutOff));
                    this._lookbehind = this._lookbehind.slice(bytesToCutOff);
                }
                // Concatenate with two bulk copies rather than one callback per byte.
                const merged = new Uint8Array(this._lookbehind.length + data.length);
                merged.set(this._lookbehind, 0);
                merged.set(data, this._lookbehind.length);
                this._lookbehind = merged;
                return [data.length, ...tokens];
            }
        }
        pos += buf_pos;
        // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool
        // search with optimized character lookup code that only considers
        // the current round's haystack data.
        while (pos <= data.length - this._needle.length) {
            const ch = data[pos + this._needle.length - 1];
            if (ch === this._lastChar
                && data[pos] === this._needle[0]
                && jsmemcmp(this._needle, 0, data, pos, this._needle.length - 1)) {
                if (pos > buf_pos) {
                    tokens.push(data.slice(buf_pos, pos));
                }
                tokens.push(MATCH);
                return [pos + this._needle.length, ...tokens];
            }
            else {
                pos += this._occ[ch];
            }
        }
        // There was no match. If there's trailing haystack data that we cannot
        // match yet using the Boyer-Moore-Horspool algorithm (because the trailing
        // data is less than the needle size) then match using a modified
        // algorithm that starts matching from the beginning instead of the end.
        // Whatever trailing data is left after running this algorithm is added to
        // the lookbehind buffer.
        if (pos < data.length) {
            while (pos < data.length && (data[pos] !== this._needle[0]
                || !jsmemcmp(data, pos, this._needle, 0, data.length - pos))) {
                ++pos;
            }
            if (pos < data.length) {
                this._lookbehind = data.slice(pos);
            }
        }
        // Everything from buf_pos until pos is guaranteed not to contain needle
        // data. Compare against buf_pos rather than 0: when resuming after a
        // match, everything left may have gone to the lookbehind buffer, and an
        // empty token must not be emitted in that case.
        if (pos > buf_pos) {
            tokens.push(data.slice(buf_pos, pos < data.length ? pos : data.length));
        }
        return [data.length, ...tokens];
    }

    /**
     * Reads one byte at a combined position.
     *
     * @param {Uint8Array} data - The current chunk.
     * @param {number} pos - Negative values index the lookbehind buffer from
     *   its end; non-negative values index `data`.
     * @returns {number|undefined} The byte at that position.
     */
    _charAt(data, pos) {
        if (pos < 0) {
            return this._lookbehind[this._lookbehind.length + pos];
        }
        return data[pos];
    }

    /**
     * Compares `len` bytes of lookbehind + `data`, starting at `pos`, with
     * the beginning of the needle.
     *
     * @param {Uint8Array} data - The current chunk.
     * @param {number} pos - Combined start position (may be negative).
     * @param {number} len - Number of bytes to compare.
     * @returns {boolean} Whether all compared bytes are equal.
     */
    _memcmp(data, pos, len) {
        return jsmemcmp((index) => this._charAt(data, index), pos, this._needle, 0, len);
    }
}
/**
 * Async-iterable adapter that runs a StreamSearch over everything read
 * from a readable stream.
 */
class ReadableStreamSearch {
    /**
     * @param {string|Uint8Array} needle - Passed on to `StreamSearch`.
     * @param {ReadableStream} _readableStream - Source whose reader is
     *   acquired when iteration starts.
     */
    constructor(needle, _readableStream) {
        this._readableStream = _readableStream;
        this._search = new StreamSearch(needle);
    }

    /**
     * Yields every token produced by feeding each value read from the
     * stream into the search, followed by the non-empty tail returned by
     * `end()`. The reader lock is released when iteration finishes, fails,
     * or is abandoned.
     */
    async *[Symbol.asyncIterator]() {
        const reader = this._readableStream.getReader();
        try {
            for (let step = await reader.read(); !step.done; step = await reader.read()) {
                yield* this._search.feed(step.value);
            }
            const leftover = this._search.end();
            if (leftover.length) {
                yield leftover;
            }
        }
        finally {
            reader.releaseLock();
        }
    }
}
/**
 * Sentinel that QueueableStreamSearch.close() appends to the chunk queue;
 * the async iterator stops consuming chunks when it dequeues it.
 */
const EOQ = Symbol('End of Queue');
/**
 * Push-driven variant of the search: chunks are queued with `push()`, the
 * input is terminated with `close()`, and the resulting tokens are consumed
 * through async iteration.
 */
class QueueableStreamSearch {
    /**
     * @param {string|Uint8Array} needle - Passed on to `StreamSearch`.
     */
    constructor(needle) {
        this._chunksQueue = [];
        this._closed = false;
        this._search = new StreamSearch(needle);
    }

    /**
     * Queues chunks for searching and wakes a waiting iterator.
     *
     * @param {...Uint8Array} chunks - Chunks to append, in order.
     * @throws {Error} If `close()` has already been called.
     */
    push(...chunks) {
        if (this._closed) {
            throw new Error('cannot call push after close');
        }
        this._chunksQueue.push(...chunks);
        const wake = this._notify;
        if (wake) {
            wake();
        }
    }

    /**
     * Marks the end of input and wakes a waiting iterator.
     *
     * @throws {Error} If called more than once.
     */
    close() {
        if (this._closed) {
            throw new Error('close was already called');
        }
        this._closed = true;
        this._chunksQueue.push(EOQ);
        const wake = this._notify;
        if (wake) {
            wake();
        }
    }

    /**
     * Yields the tokens for each queued chunk, waiting whenever the queue
     * is empty, and finally the non-empty tail once the end-of-queue
     * marker has been dequeued.
     */
    async *[Symbol.asyncIterator]() {
        for (;;) {
            let next = this._chunksQueue.shift();
            while (!next) {
                // Nothing queued: park until push() or close() signals.
                await new Promise((resolve) => {
                    this._notify = resolve;
                });
                this._notify = undefined;
                next = this._chunksQueue.shift();
            }
            if (next === EOQ) {
                break;
            }
            yield* this._search.feed(next);
        }
        const leftover = this._search.end();
        if (leftover.length) {
            yield leftover;
        }
    }
}
/**
 * Splits a sequence of chunks on every occurrence of `needle`.
 *
 * @param {Iterable<Uint8Array>} chunks - Pieces of the input, in order.
 * @param {string|Uint8Array} needle - Separator, passed to `StreamSearch`.
 * @returns {Uint8Array[]} One merged array per segment between matches;
 *   there is always one more segment than there were matches.
 */
function splitChunks(chunks, needle) {
    const search = new StreamSearch(needle);
    let current = [];
    const segments = [current];
    for (const chunk of chunks) {
        for (const token of search.feed(chunk)) {
            if (token !== MATCH) {
                current.push(token);
                continue;
            }
            // A match closes the current segment and opens a new one.
            current = [];
            segments.push(current);
        }
    }
    // Whatever the search still holds back belongs to the last segment.
    current.push(search.end());
    return segments.map((parts) => mergeArrays(...parts));
}
/**
 * Splits a single buffer on every occurrence of `needle`.
 *
 * @param {Uint8Array} buf - The whole input.
 * @param {string|Uint8Array} needle - Separator to split on.
 * @returns {Uint8Array[]} Whatever `splitChunks` returns for the one-chunk
 *   input `[buf]`.
 */
function split(buf, needle) {
    const singleChunk = [buf];
    return splitChunks(singleChunk, needle);
}
/**
 * Groups a token stream into the runs of tokens between `MATCH` markers.
 *
 * @param {AsyncIterable|Iterable} iter - Source of data tokens and `MATCH`.
 * @yields {Array} The tokens collected since the previous `MATCH` (possibly
 *   empty). One final group is always yielded after the source ends.
 */
async function* chunksIterator(iter) {
    let pending = [];
    for await (const token of iter) {
        if (token !== MATCH) {
            pending.push(token);
            continue;
        }
        yield pending;
        pending = [];
    }
    yield pending;
}
/**
 * Decodes each segment between matches into a string.
 *
 * @param {AsyncIterable|Iterable} iter - Source of byte tokens and `MATCH`.
 * @yields {string} The decoded text of one segment (an empty segment is
 *   decoded from an empty merged array).
 */
async function* stringIterator(iter) {
    for await (const chunk of chunksIterator(iter)) {
        // Merge the bytes before decoding. Tokens are cut at arbitrary input
        // chunk boundaries, so a multi-byte character can straddle two
        // tokens and must not be decoded piecewise.
        // NOTE(review): assumes arrayToString handles a whole segment in one
        // call — confirm for very large segments.
        yield arrayToString(mergeArrays(...chunk));
    }
}
/**
 * Collects every segment produced by `stringIterator` into an array.
 *
 * @param {AsyncIterable|Iterable} iter - Source of byte tokens and `MATCH`.
 * @returns {Promise<string[]>} The segments in stream order.
 */
async function allStrings(iter) {
    const collected = [];
    const strings = stringIterator(iter);
    for await (const text of strings) {
        collected[collected.length] = text;
    }
    return collected;
}
/**
 * Merges each segment between matches into a single array.
 *
 * @param {AsyncIterable|Iterable} iter - Source of byte tokens and `MATCH`.
 * @yields {Uint8Array} The result of `mergeArrays` applied to the tokens of
 *   one segment.
 */
async function* arrayIterator(iter) {
    const groups = chunksIterator(iter);
    for await (const parts of groups) {
        const merged = mergeArrays(...parts);
        yield merged;
    }
}
// Public API of this module: the MATCH sentinel, the three search classes,
// and the split / iterator helper functions.
export { MATCH, QueueableStreamSearch, ReadableStreamSearch, StreamSearch, allStrings, arrayIterator, chunksIterator, split, splitChunks, stringIterator };
//# sourceMappingURL=index.mjs.map