/*
 * crawfishcloud
 * Version:
 * A Streaming S3 Bucket Glob Crawler
 * 139 lines (138 loc) • 8.54 kB
 * JavaScript
 */
;
/**
 * Runs a generator function as an asynchronous routine: every value the
 * generator yields is converted to a promise, and that promise's outcome is
 * sent back into the generator (`next` on fulfilment, `throw` on rejection).
 * An already-present `this.__awaiter` is reused instead of this definition.
 *
 * @param {*} thisArg - `this` value used when invoking `generator`.
 * @param {ArrayLike<*>} [_arguments] - Arguments for `generator`; defaults to none.
 * @param {Function} [P] - Promise constructor to use; defaults to the global `Promise`.
 * @param {Function} generator - Generator function to drive to completion.
 * @returns {Promise<*>} Resolves with the generator's return value and rejects
 *   with any error that escapes the generator.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    // Leave instances of P untouched; wrap anything else in a resolved P.
    const wrap = function (value) {
        if (value instanceof P) {
            return value;
        }
        return new P(function (resolve) { resolve(value); });
    };
    return new P(function (resolve, reject) {
        const advance = function (result) {
            if (result.done) {
                resolve(result.value);
            }
            else {
                wrap(result.value).then(onFulfilled, onRejected);
            }
        };
        const onFulfilled = function (value) {
            try {
                advance(running.next(value));
            }
            catch (e) {
                reject(e);
            }
        };
        const onRejected = function (reason) {
            try {
                advance(running["throw"](reason));
            }
            catch (e) {
                reject(e);
            }
        };
        // The generator is created inside the executor so that a synchronous
        // throw while starting it turns into a rejection.
        const running = generator.apply(thisArg, _arguments || []);
        advance(running.next());
    });
};
/**
 * Marker wrapper around a value, stored on the instance's `v` property.
 * Works with or without `new`: a plain call constructs a new instance.
 * An already-present `this.__await` is reused instead of this definition.
 *
 * @param {*} v - Value to wrap.
 * @returns {Object} The `__await` instance carrying `v`.
 */
var __await = (this && this.__await) || function (v) {
    if (!(this instanceof __await)) {
        return new __await(v);
    }
    this.v = v;
    return this;
};
/**
 * Obtains an async iterator for `o`.
 * When `o` has a `Symbol.asyncIterator` method, its result is returned as-is.
 * Otherwise the synchronous iterator of `o` is wrapped: each of `next`,
 * `throw` and `return` that the iterator provides is exposed as a function
 * returning a promise of `{ value, done }`, where `value` has been awaited.
 * An already-present `this.__asyncValues` is reused instead of this definition.
 *
 * @param {Object} o - An async iterable or a sync iterable.
 * @returns {Object} An async iterator over `o`.
 * @throws {TypeError} When `Symbol.asyncIterator` is not available.
 */
var __asyncValues = (this && this.__asyncValues) || function (o) {
    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
    const asyncFactory = o[Symbol.asyncIterator];
    if (asyncFactory) {
        return asyncFactory.call(o);
    }
    // Use a `__values` helper when one is in scope, else the native sync iterator.
    o = typeof __values === "function" ? __values(o) : o[Symbol.iterator]();
    const wrapper = {};
    for (const method of ["next", "throw", "return"]) {
        // Methods the sync iterator lacks stay falsy on the wrapper.
        wrapper[method] = o[method] && function (arg) {
            return new Promise(function (resolve, reject) {
                const { done, value } = o[method](arg);
                Promise.resolve(value).then(function (settled) {
                    resolve({ value: settled, done: done });
                }, reject);
            });
        };
    }
    wrapper[Symbol.asyncIterator] = function () { return this; };
    return wrapper;
};
/**
 * Builds a synchronous iterator that forwards `next`, `throw` and `return`
 * to the iterator `o`, alternating between two phases on successive calls:
 *   1. call the method on `o` and hand the result back wrapped in `__await`;
 *   2. hand back the argument it was called with (passed through the
 *      method's fallback when one exists).
 * When `o` lacks a method, the fallback itself (or `undefined`) is installed.
 * An already-present `this.__asyncDelegator` is reused instead of this definition.
 *
 * @param {Object} o - Iterator to delegate to.
 * @returns {Object} Iterator whose `Symbol.iterator` method returns itself.
 */
var __asyncDelegator = (this && this.__asyncDelegator) || function (o) {
    const delegate = {};
    // Starts undefined, so the first call lands in the forwarding phase.
    let forwarding;
    const install = function (name, fallback) {
        if (!o[name]) {
            delegate[name] = fallback;
            return;
        }
        delegate[name] = function (v) {
            forwarding = !forwarding;
            if (forwarding) {
                return { value: __await(o[name](v)), done: name === "return" };
            }
            return fallback ? fallback(v) : v;
        };
    };
    install("next");
    install("throw", function (e) { throw e; });
    install("return");
    delegate[Symbol.iterator] = function () { return this; };
    return delegate;
};
/**
 * Exposes a plain generator as an async iterator.
 * Calls to `next` / `throw` / `return` are queued and handled one at a time.
 * A value yielded as an `__await` instance is awaited and its outcome is sent
 * back into the generator; any other result settles the call at the head of
 * the queue.
 * An already-present `this.__asyncGenerator` is reused instead of this definition.
 *
 * @param {*} thisArg - `this` value used when invoking `generator`.
 * @param {ArrayLike<*>} [_arguments] - Arguments for `generator`; defaults to none.
 * @param {Function} generator - Generator function to wrap.
 * @returns {Object} Async iterator whose `Symbol.asyncIterator` method returns itself.
 * @throws {TypeError} When `Symbol.asyncIterator` is not available.
 */
var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) {
    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
    const inner = generator.apply(thisArg, _arguments || []);
    // Each entry is [method, argument, resolve, reject]; the head is the call in flight.
    const pending = [];
    const facade = {};

    // Settle the in-flight call, drop it, then start the next queued call if any.
    function settle(callback, payload) {
        callback(payload);
        pending.shift();
        if (pending.length) {
            resume(pending[0][0], pending[0][1]);
        }
    }
    function onFulfilled(value) {
        resume("next", value);
    }
    function onRejected(reason) {
        resume("throw", reason);
    }
    function step(result) {
        if (result.value instanceof __await) {
            Promise.resolve(result.value.v).then(onFulfilled, onRejected);
            return;
        }
        settle(pending[0][2], result);
    }
    function resume(method, arg) {
        try {
            step(inner[method](arg));
        }
        catch (err) {
            settle(pending[0][3], err);
        }
    }
    function expose(method) {
        if (!inner[method]) {
            return;
        }
        facade[method] = function (arg) {
            return new Promise(function (resolve, reject) {
                const depth = pending.push([method, arg, resolve, reject]);
                // Only the call that finds the queue empty starts the generator.
                if (!(depth > 1)) {
                    resume(method, arg);
                }
            });
        };
    }
    expose("next");
    expose("throw");
    expose("return");
    facade[Symbol.asyncIterator] = function () { return this; };
    return facade;
};
// Flag the CommonJS `exports` object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the `crawler` export as undefined; the real function is
// assigned at the end of the file, after it has been defined.
exports.crawler = void 0;
const stream_1 = require("stream");
const micromatch_1 = require("micromatch");
const exporters_1 = require("./exporters");
const utils_1 = require("./utils");
/**
 * Creates an S3 crawler bound to one S3 client and an optional default set of
 * S3 glob URLs.
 *
 * @param {Object} input - Crawler options.
 *   `input.s3c` is the S3 client; it must provide `listObjectsV2(params).promise()`.
 *   `input.maxkeys` is the listing page size (1000 when null/undefined).
 *   Own properties of `input` override the computed defaults of the same name.
 * @param {...string} filters - Default filter URLs, parsed with
 *   `utils.s3urlToConfigWfilters`. They are used by every method of the
 *   returned object that is called without filters of its own.
 * @returns {Object} `{ iter, all, stream, reduce }` plus the `vfile*`,
 *   `vinyl*` and `s3*` Stream / Iter / Array convenience methods.
 */
const crawler = function (input, ...filters) {
    const config = Object.assign({
        filters,
        body: true,
        MaxKeys: input.maxkeys == null ? 1000 : input.maxkeys,
        BucketsPrefixes: filters.map(utils_1.s3urlToConfigWfilters),
    }, input);
    const { s3c } = input;
    const { MaxKeys } = config;

    /**
     * Async iterator over every listed object whose key matches a filter.
     *
     * @param {Object} i - Per-call options.
     *   `i.body`: when falsy, objects are not loaded and are passed on with `Body: ''`.
     *   `i.using(obj, index)`: maps each object to the value that is yielded.
     *   `i.NextContinuationToken`: forwarded to `listObjectsV2` as `ContinuationToken`.
     * @param {...string} filters - Filter URLs; the crawler's defaults when omitted.
     * @returns {Object} Async iterator of the values returned by `i.using`.
     */
    const iter = function (i, ...filters) {
        return __asyncGenerator(this, arguments, function* () {
            const bucketPrefixes = filters.length > 0
                ? filters.map(utils_1.s3urlToConfigWfilters)
                : config.BucketsPrefixes;
            // Filters are visited last-to-first.
            for (let j = bucketPrefixes.length - 1; j >= 0; j--) {
                const { Bucket, Key, prefix, suffix } = bucketPrefixes[j];
                const objListResp = yield __await(s3c.listObjectsV2({
                    Bucket,
                    MaxKeys,
                    Prefix: prefix,
                    ContinuationToken: i.NextContinuationToken,
                }).promise());
                const keyList = objListResp.Contents == null ? [] : objListResp.Contents;
                const glob = `${prefix}${suffix}`;
                const matched = keyList.filter((entry) => {
                    const key = entry.Key == null ? '' : entry.Key;
                    return micromatch_1.isMatch(key, glob, { bash: true });
                });
                let page;
                if (!i.body) {
                    // Listing only: hand each entry to `using` with an empty Body.
                    page = yield __await(Promise.all(matched.map((s3obj, k) => i.using(Object.assign({}, s3obj, { Bucket, Body: '' }), k))));
                }
                else {
                    const loaded = yield __await(utils_1.loadObjectList(s3c, Bucket, ...matched));
                    page = yield __await(Promise.all(loaded.map((s3ObjwBody, k) => i.using(Object.assign({}, s3ObjwBody, { Bucket, Body: s3ObjwBody.Body }), k))));
                }
                yield __await(yield* __asyncDelegator(__asyncValues(page)));
                if (objListResp.NextContinuationToken) {
                    // More pages remain: continue the same filter from the returned token.
                    yield __await(yield* __asyncDelegator(__asyncValues(iter({
                        body: i.body,
                        using: i.using,
                        NextContinuationToken: objListResp.NextContinuationToken,
                    }, utils_1.s3ConfigToUrl({ Bucket, Key })))));
                }
            }
        });
    };

    /** Same results as `iter`, exposed as an object-mode `Readable` stream. */
    const stream = (i, ...filters) => {
        return stream_1.Readable.from(iter(i, ...filters), { objectMode: true });
    };

    /** Collects everything `iter` produces into an array. */
    const all = (i, ...filters) => __awaiter(this, void 0, void 0, function* () {
        const acc = [];
        let failure;
        let step;
        let source;
        try {
            for (source = __asyncValues(iter(i, ...filters)); step = yield source.next(), !step.done;) {
                acc.push(step.value);
            }
        }
        catch (err) {
            failure = { error: err };
        }
        finally {
            try {
                // Close the iterator when the loop stopped before exhausting it.
                const close = source && source.return;
                if (step && !step.done && close) yield close.call(source);
            }
            finally {
                if (failure) throw failure.error;
            }
        }
        return acc;
    });

    /**
     * Folds every crawled object (bodies always loaded) into one value.
     * `reducer(accumulator, element, index)` is called once per element.
     */
    const reduce = (init, using, reducer, ...filters) => __awaiter(this, void 0, void 0, function* () {
        let acc = init;
        let index = 0;
        let failure;
        let step;
        let source;
        try {
            for (source = __asyncValues(iter({ body: true, using }, ...filters)); step = yield source.next(), !step.done;) {
                acc = reducer(acc, step.value, index);
                index++;
            }
        }
        catch (err) {
            failure = { error: err };
        }
        finally {
            try {
                const close = source && source.return;
                if (step && !step.done && close) yield close.call(source);
            }
            finally {
                if (failure) throw failure.error;
            }
        }
        return acc;
    });

    // The convenience methods call this crawler's own iter/stream/all, so a call
    // without filters falls back to the defaults given to `crawler(...)`.
    // Building a fresh crawler from `input` alone would drop those defaults.
    const withBody = (using) => ({ body: true, using });
    return {
        iter,
        all,
        stream,
        reduce,
        vfileStream: (...filters) => stream(withBody(exporters_1.asVfile), ...filters),
        vinylStream: (...filters) => stream(withBody(exporters_1.asVinyl), ...filters),
        s3Stream: (...filters) => stream(withBody(exporters_1.asS3), ...filters),
        vfileIter: (...filters) => iter(withBody(exporters_1.asVfile), ...filters),
        vinylIter: (...filters) => iter(withBody(exporters_1.asVinyl), ...filters),
        s3Iter: (...filters) => iter(withBody(exporters_1.asS3), ...filters),
        vfileArray: (...filters) => all(withBody(exporters_1.asVfile), ...filters),
        vinylArray: (...filters) => all(withBody(exporters_1.asVinyl), ...filters),
        s3Array: (...filters) => all(withBody(exporters_1.asS3), ...filters),
    };
};
// Publish the factory as the named export `crawler` ...
exports.crawler = crawler;
// ... and as the module's default export (same function object).
exports.default = exports.crawler;