crawfishcloud
Version:
A Streaming S3 Bucket Glob Crawler
84 lines (83 loc) • 4.05 kB
JavaScript
import { Readable } from 'stream';
import { isMatch } from 'micromatch';
import { asVfile, asVinyl, asS3 } from './exporters';
import { s3urlToConfigWfilters, s3ConfigToUrl, loadObjectList } from './utils';
/**
 * Create a crawler that lists S3 objects and yields those whose keys match
 * glob-style filters.
 *
 * @param {object} input - Crawler options.
 * @param {object} input.s3c - S3 client; `listObjectsV2(params).promise()` is
 *   called on it and it is forwarded to `loadObjectList`.
 * @param {number} [input.maxkeys=1000] - `MaxKeys` sent with every listing request.
 * @param {...string} filters - Default filter URLs (parsed with
 *   `s3urlToConfigWfilters`). They are used by every method that is invoked
 *   without filters of its own.
 * @returns {object} `iter`, `all`, `stream`, `reduce`, plus the
 *   vfile / vinyl / s3 flavoured `*Stream`, `*Iter` and `*Array` shortcuts.
 */
export const crawler = function (input, ...filters) {
  const { s3c, maxkeys } = input;
  const config = {
    filters,
    body: true,
    MaxKeys: maxkeys !== null && maxkeys !== undefined ? maxkeys : 1000,
    BucketsPrefixes: filters.map(s3urlToConfigWfilters),
    // Spread last so caller-supplied keys keep overriding the defaults above.
    ...input,
  };
  const { MaxKeys } = config;

  /**
   * Async generator over the matching objects.
   *
   * @param {object} i - Per-call options.
   * @param {boolean} [i.body] - When truthy, objects are loaded through
   *   `loadObjectList` before being mapped; otherwise `Body` is set to `''`.
   * @param {Function} i.using - Mapper called as `using(s3Object, index)`;
   *   its (possibly async) results are what gets yielded.
   * @param {string} [i.NextContinuationToken] - Token forwarded to
   *   `listObjectsV2` as `ContinuationToken`.
   * @param {...string} urls - Filter URLs; when none are given the filters
   *   passed to `crawler` are used.
   */
  const iter = async function* (i, ...urls) {
    const bucketPrefixes = urls.length > 0
      ? urls.map(s3urlToConfigWfilters)
      : config.BucketsPrefixes;

    // Entries are processed from the last filter to the first.
    for (let j = bucketPrefixes.length - 1; j >= 0; j--) {
      const { Bucket, Key, prefix, suffix } = bucketPrefixes[j];
      const listing = await s3c
        .listObjectsV2({
          Bucket,
          MaxKeys,
          Prefix: prefix,
          ContinuationToken: i.NextContinuationToken,
        })
        .promise();

      const contents = listing.Contents !== null && listing.Contents !== undefined
        ? listing.Contents
        : [];
      const pattern = `${prefix}${suffix}`;
      const matching = contents.filter((entry) => {
        const key = entry.Key !== null && entry.Key !== undefined ? entry.Key : '';
        return isMatch(key, pattern, { bash: true });
      });

      let mapped;
      if (!i.body) {
        mapped = await Promise.all(
          matching.map((s3obj, k) => i.using({ ...s3obj, Bucket, Body: '' }, k)),
        );
      } else {
        const loaded = await loadObjectList(s3c, Bucket, ...matching);
        mapped = await Promise.all(
          loaded.map((s3ObjwBody, k) => i.using({ ...s3ObjwBody, Bucket, Body: s3ObjwBody.Body }, k)),
        );
      }
      yield* mapped;

      // Follow the next page of the listing (same handling for both modes).
      if (listing.NextContinuationToken) {
        yield* iter(
          {
            body: i.body,
            using: i.using,
            NextContinuationToken: listing.NextContinuationToken,
          },
          s3ConfigToUrl({ Bucket, Key }),
        );
      }
    }
  };

  /** Same results as `iter`, exposed as an object-mode Readable stream. */
  const stream = (i, ...urls) => Readable.from(iter(i, ...urls), { objectMode: true });

  /** Drain `iter` into an array. */
  const all = async (i, ...urls) => {
    const acc = [];
    for await (const item of iter(i, ...urls)) {
      acc.push(item);
    }
    return acc;
  };

  /**
   * Fold the mapped objects (always loaded with bodies) into a single value.
   * `reducer` is called as `reducer(accumulator, element, index)`.
   */
  const reduce = async (init, using, reducer, ...urls) => {
    let acc = init;
    let index = 0;
    for await (const elem of iter({ body: true, using }, ...urls)) {
      acc = reducer(acc, elem, index);
      index++;
    }
    return acc;
  };

  // The shortcuts delegate to this instance's own closures so that the
  // filters given to `crawler(...)` still apply when none are passed here.
  // (Spawning `crawler(input)` instead would silently drop those filters.)
  return {
    iter,
    all,
    stream,
    reduce,
    vfileStream: (...urls) => stream({ body: true, using: asVfile }, ...urls),
    vinylStream: (...urls) => stream({ body: true, using: asVinyl }, ...urls),
    s3Stream: (...urls) => stream({ body: true, using: asS3 }, ...urls),
    vfileIter: (...urls) => iter({ body: true, using: asVfile }, ...urls),
    vinylIter: (...urls) => iter({ body: true, using: asVinyl }, ...urls),
    s3Iter: (...urls) => iter({ body: true, using: asS3 }, ...urls),
    vfileArray: (...urls) => all({ body: true, using: asVfile }, ...urls),
    vinylArray: (...urls) => all({ body: true, using: asVinyl }, ...urls),
    s3Array: (...urls) => all({ body: true, using: asS3 }, ...urls),
  };
};
// Also expose the `crawler` factory as this module's default export.
export default crawler;