/*
 * mongodb-simplecrawler-queue
 * Version: (not specified)
 * MongoDB FetchQueue implementation for simplecrawler.
 * 391 lines • 15 kB
 * JavaScript
 */
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const mongodb_1 = require("mongodb");
const typings_1 = require("./typings");
const GarbageCollector_1 = require("./GarbageCollector");
const Monitor_1 = require("./Monitor");
/**
 * FetchQueue implementation for simplecrawler that keeps queue items in a
 * MongoDB collection.
 *
 * Every public queue method accepts an optional Node-style callback
 * `(error, result)`. When no callback is given the method returns a promise
 * that resolves with the result or rejects with the error instead.
 */
class MongoDbQueue {
    /**
     * Stores the configuration, fills in defaults and creates (but does not
     * connect) the MongoDB client.
     *
     * Note that `config` is normalised in place: missing `crawlerName`,
     * `GCConfig` and `monitorConfig` values are written back onto the object
     * that was passed in.
     *
     * @param {object} config
     * @param {string} config.url - MongoDB connection string.
     * @param {string} config.dbName - database used by `init`.
     * @param {string} config.collectionName - collection holding the queue.
     * @param {string} [config.crawlerName='crawler'] - written to `modifiedBy`.
     * @param {{run?: boolean, msInterval?: number}} [config.GCConfig]
     * @param {{run?: boolean, msInterval?: number, statisticCollectionName?: string}} [config.monitorConfig]
     */
    constructor(config) {
        // Two minutes, shared default for the garbage collector and the monitor.
        const defaultIntervalMs = 1000 * 60 * 2;
        this.config = config;
        if (!this.config.crawlerName) {
            this.config.crawlerName = 'crawler';
        }
        if (this.config.GCConfig) {
            this.config.GCConfig.run = this.config.GCConfig.run || false;
            this.config.GCConfig.msInterval = this.config.GCConfig.msInterval || defaultIntervalMs;
        }
        else {
            this.config.GCConfig = {
                run: false,
                msInterval: defaultIntervalMs,
            };
        }
        if (this.config.monitorConfig) {
            if (!this.config.monitorConfig.msInterval) {
                this.config.monitorConfig.msInterval = defaultIntervalMs;
            }
            if (!this.config.monitorConfig.statisticCollectionName) {
                this.config.monitorConfig.statisticCollectionName = 'statistic';
            }
        }
        else {
            this.config.monitorConfig = {
                run: false,
                msInterval: defaultIntervalMs,
                statisticCollectionName: 'statistic',
            };
        }
        this.client = new mongodb_1.MongoClient(this.config.url, { useNewUrlParser: true, useUnifiedTopology: true });
    }

    /**
     * Inserts `queueItem` with status "queued" unless a document matching
     * `filter` already exists.
     *
     * @param {object} queueItem - item to store; it is copied, not mutated.
     * @param {object} filter - query that decides whether the item already exists.
     * @returns {Promise<object|null>} the stored document (with `id` mirroring
     *   `_id`), or `null` when nothing was inserted.
     * @throws {Error} when the server does not acknowledge the upsert.
     */
    async addToQueue(queueItem, filter) {
        const document = Object.assign({}, queueItem, { status: typings_1.QueueItemStatus.Queued });
        // With upsert + $setOnInsert the document is only written when the
        // filter matches nothing; an existing match is left untouched.
        const res = await this.collection
            .updateOne(filter, { $setOnInsert: document }, { upsert: true });
        // upsertedCount === 0 -> a matching document already existed.
        if (res.upsertedCount === 0) {
            return null;
        }
        if (res.result.ok !== 1) {
            throw new Error('Unexpected error happened');
        }
        const inserted = await this.collection.findOne({ _id: new mongodb_1.ObjectId(res.upsertedId._id) });
        if (inserted !== null) {
            inserted.id = inserted._id;
        }
        return inserted;
    }

    /**
     * Tells whether a value must be stored/compared as a whole instead of
     * being flattened into dotted paths.
     *
     * Dates and regular expressions expose no enumerable entries, so recursing
     * into them would silently drop the value; binary views and BSON values
     * (recognised by their `_bsontype` marker) would be shredded into their
     * internal fields.
     *
     * @param {object} value - a non-null object.
     * @returns {boolean}
     */
    static isAtomicValue(value) {
        return value instanceof Date
            || value instanceof RegExp
            || ArrayBuffer.isView(value)
            || typeof value._bsontype === 'string';
    }

    /**
     * Shared worker behind `convertForUpdate` and `convertForFilter`: walks
     * `obj` recursively and writes one entry per leaf into `finalObject`,
     * keyed by its dotted path. Leaves stored under a key named `_id` are
     * converted to `ObjectId`.
     *
     * @param {object} obj - object (or array) to flatten.
     * @param {string} parent - dotted path prefix, `''` at the top level.
     * @param {object} finalObject - receives the flattened entries.
     * @param {function(*): *} wrapLeaf - maps a leaf value to what gets stored.
     */
    flattenInto(obj, parent, finalObject, wrapLeaf) {
        Object.entries(obj).forEach(([key, value]) => {
            const path = parent ? `${parent}.${key}` : key;
            const isContainer = Boolean(value)
                && typeof value === 'object'
                && !MongoDbQueue.isAtomicValue(value);
            if (isContainer) {
                this.flattenInto(value, path, finalObject, wrapLeaf);
                return;
            }
            const leaf = key === '_id' ? new mongodb_1.ObjectId(String(value)) : value;
            // eslint-disable-next-line no-param-reassign
            finalObject[path] = wrapLeaf(leaf);
        });
    }

    /**
     * Flattens a (possibly nested) update object into dotted paths suitable
     * for `$set`, e.g. `{ a: { b: 1 } }` becomes `{ 'a.b': 1 }`.
     *
     * @param {object} obj - updates to flatten.
     * @param {string} parent - dotted path prefix, `''` at the top level.
     * @param {object} finalObject - receives the flattened entries.
     */
    convertForUpdate(obj, parent, finalObject) {
        this.flattenInto(obj, parent, finalObject, (leaf) => leaf);
    }

    /**
     * Flattens a (possibly nested) comparator into a query where every leaf is
     * an `$eq` condition on its dotted path, e.g. `{ a: { b: 1 } }` becomes
     * `{ 'a.b': { $eq: 1 } }`.
     *
     * @param {object} obj - comparator to flatten.
     * @param {string} parent - dotted path prefix, `''` at the top level.
     * @param {object} finalObject - receives the flattened conditions.
     */
    convertForFilter(obj, parent, finalObject) {
        this.flattenInto(obj, parent, finalObject, (leaf) => ({ $eq: leaf }));
    }

    /**
     * Delivers an outcome either through `callback` or as a promise.
     *
     * @param {*} error - falsy on success; non-Error values are wrapped in `Error`.
     * @param {*} returnResult - value handed over on success.
     * @param {function(?Error, *): *} [callback] - Node-style callback.
     * @returns {*} the callback's return value, or a settled promise when no
     *   callback was supplied.
     */
    // eslint-disable-next-line class-methods-use-this
    handleCallback(error, returnResult, callback) {
        let returnError = error;
        if (returnError && !(returnError instanceof Error)) {
            returnError = new Error(returnError);
        }
        if (callback) {
            return callback(returnError, returnResult);
        }
        if (error) {
            return Promise.reject(returnError);
        }
        return Promise.resolve(returnResult);
    }

    /**
     * Runs `operation`, captures whatever it throws and routes the outcome
     * through `handleCallback`. An operation that returns nothing reports
     * `null` as its result.
     *
     * @param {function(?Error, *): *} [callback] - Node-style callback.
     * @param {function(): Promise<*>} operation - the actual work.
     * @returns {Promise<*>}
     */
    async runOperation(callback, operation) {
        let failure = null;
        let outcome = null;
        try {
            const value = await operation();
            outcome = value === undefined ? null : value;
        }
        catch (error) {
            failure = error;
        }
        // Deliberately outside the try block: an exception thrown by the
        // caller's callback must not be fed back into that same callback.
        return this.handleCallback(failure, outcome, callback);
    }

    /**
     * Connects to MongoDB, selects the configured database and collection,
     * creates the indexes and starts the garbage collector / monitor when
     * their `run` flags are set.
     *
     * @param {function(?Error, null): *} [callback]
     * @returns {Promise<null>}
     */
    init(callback) {
        return this.runOperation(callback, async () => {
            await this.client.connect();
            this.db = this.client.db(this.config.dbName);
            this.collection = this.db.collection(this.config.collectionName);
            // Partial index: only documents that are still queued are indexed.
            await this.collection.createIndex({ status: 1 }, { partialFilterExpression: { status: { $eq: typings_1.QueueItemStatus.Queued } } });
            await this.collection.createIndex({ url: 'hashed' });
            if (this.config.GCConfig.run) {
                this.garbageCollector = new GarbageCollector_1.GarbageCollector(this.collection, this.config.GCConfig.msInterval);
                this.garbageCollector.start();
            }
            if (this.config.monitorConfig.run) {
                const statisticCollection = this.db
                    .collection(this.config.monitorConfig.statisticCollectionName || 'statistic');
                this.monitor = new Monitor_1.Monitor(this.collection, statisticCollection, this.config.monitorConfig.msInterval);
                this.monitor.start();
            }
            return null;
        });
    }

    /**
     * Stops the garbage collector and monitor (if they were started) and
     * closes the MongoDB client.
     *
     * @param {function(?Error, null): *} [callback]
     * @returns {Promise<null>}
     */
    finalize(callback) {
        return this.runOperation(callback, async () => {
            if (this.config.GCConfig.run && this.garbageCollector) {
                this.garbageCollector.stop();
            }
            if (this.config.monitorConfig.run && this.monitor) {
                this.monitor.stop();
            }
            await this.client.close();
            return null;
        });
    }

    /**
     * Drops the queue collection.
     *
     * @param {function(?Error, null): *} [callback]
     * @returns {Promise<null>}
     */
    drop(callback) {
        return this.runOperation(callback, async () => {
            await this.collection.drop();
            return null;
        });
    }

    /**
     * Adds an item to the queue.
     *
     * The item is stamped with `modificationTimestamp` / `modifiedBy` (unless
     * it already carries them) and its `id` property is removed before it is
     * stored. Without `force` an item whose `url` is already present is
     * rejected; with `force` only an identical document counts as duplicate.
     *
     * @param {object} queueItemOriginal - item to add; not mutated.
     * @param {boolean} force - skip the per-URL duplicate check.
     * @param {function(?Error, ?object): *} [callback]
     * @returns {Promise<object>} the stored item.
     */
    add(queueItemOriginal, force, callback) {
        return this.runOperation(callback, async () => {
            const queueItem = Object.assign({ modificationTimestamp: Date.now(), modifiedBy: this.config.crawlerName }, queueItemOriginal);
            delete queueItem.id;
            const duplicateFilter = force ? queueItem : { url: queueItem.url };
            const stored = await this.addToQueue(queueItem, duplicateFilter);
            if (stored !== null) {
                return stored;
            }
            if (force) {
                throw new Error('Can\'t add a queueItem instance twice. '
                    + 'You may create a new one from the same URL however.');
            }
            // workaround to throw what FetchQueue requires
            throw new typings_1.QueueError('Resource already exists in queue!', 'DUPLICATE');
        });
    }

    /**
     * Checks whether an item with the given URL is stored in the queue.
     *
     * @param {string} url
     * @param {function(?Error, ?boolean): *} [callback]
     * @returns {Promise<boolean>}
     */
    exists(url, callback) {
        return this.runOperation(callback, async () => {
            const match = await this.collection.findOne({ url });
            return match !== null;
        });
    }

    /**
     * Fetches the item at position `index` by skipping `index` documents.
     *
     * @param {number} index - zero-based position.
     * @param {function(?Error, ?object): *} [callback]
     * @returns {Promise<object>} the item, with `id` mirroring `_id`; fails
     *   with "out of range" when no document is found at that position.
     */
    get(index, callback) {
        return this.runOperation(callback, async () => {
            // findOne yields a single document, so skipping is all that is needed.
            const item = await this.collection.findOne({}, { skip: index });
            if (!item) {
                throw new Error('out of range');
            }
            item.id = item._id;
            return item;
        });
    }

    /**
     * Applies `updates` to the item with the given id and returns the updated
     * document. Nested updates are flattened to dotted paths so that sibling
     * fields of a nested object are preserved.
     *
     * @param {string|object} id - value accepted by the `ObjectId` constructor.
     * @param {object} updates - fields to set.
     * @param {function(?Error, ?object): *} [callback]
     * @returns {Promise<object>} the updated item, with `id` mirroring `_id`;
     *   fails with "No queueItem found with that ID" when nothing matches.
     */
    update(id, updates, callback) {
        return this.runOperation(callback, async () => {
            const fieldsToSet = {};
            this.convertForUpdate(updates, '', fieldsToSet);
            fieldsToSet.modificationTimestamp = Date.now();
            fieldsToSet.modifiedBy = this.config.crawlerName;
            const res = await this.collection
                .findOneAndUpdate({ _id: new mongodb_1.ObjectId(id) }, { $set: fieldsToSet }, { returnOriginal: false });
            if (res.ok !== 1) {
                throw new Error('didnt update');
            }
            const updatedItem = res.value;
            // A query that matches nothing is reported as `value: null`, so
            // both null and undefined have to count as "not found".
            if (updatedItem === null || updatedItem === undefined) {
                throw new Error('No queueItem found with that ID');
            }
            updatedItem.id = updatedItem._id;
            return updatedItem;
        });
    }

    /**
     * Claims one queued item: atomically switches its status to "pulled",
     * stamps it and returns it.
     *
     * NOTE(review): no sort order is passed, so which queued document gets
     * picked is up to the server — confirm whether "oldest" must be enforced.
     *
     * @param {function(?Error, ?object): *} [callback]
     * @returns {Promise<?object>} the claimed item (with `id` mirroring
     *   `_id`), or `null` when nothing is queued.
     */
    oldestUnfetchedItem(callback) {
        return this.runOperation(callback, async () => {
            const claim = {
                $set: {
                    status: typings_1.QueueItemStatus.Pulled,
                    modificationTimestamp: Date.now(),
                    modifiedBy: this.config.crawlerName,
                },
            };
            const res = await this.collection
                .findOneAndUpdate({ status: typings_1.QueueItemStatus.Queued }, claim);
            if (res.ok !== 1) {
                throw new Error('Error occurred during getting an item');
            }
            const item = res.value;
            if (item) {
                item.id = item._id;
            }
            return item;
        });
    }

    /**
     * Shared implementation of `max`, `min` and `avg`: aggregates
     * `stateData.<statisticName>` over all fetched items whose value is a
     * number.
     *
     * @param {string} statisticName - must be a key of `AllowedStatistics`.
     * @param {string} operator - accumulator name without `$`: 'max', 'min' or 'avg'.
     * @param {function(?Error, ?number): *} [callback]
     * @returns {Promise<?number>} the aggregate, or `null` when no item qualifies.
     */
    aggregateStatistic(statisticName, operator, callback) {
        return this.runOperation(callback, async () => {
            if (!(statisticName in typings_1.AllowedStatistics)) {
                throw new Error('Invalid statistic');
            }
            const fieldPath = `stateData.${statisticName}`;
            const pipeline = [
                { $match: { fetched: true, [fieldPath]: { $type: ['number'] } } },
                {
                    // The _id is a constant string, so all matched documents
                    // fall into one single group.
                    $group: {
                        _id: fieldPath,
                        [operator]: { [`$${operator}`]: `$${fieldPath}` },
                    },
                },
            ];
            const row = await this.collection.aggregate(pipeline).next();
            return row ? row[operator] : row;
        });
    }

    /**
     * Largest value of the given statistic among fetched items.
     *
     * @param {string} statisticName
     * @param {function(?Error, ?number): *} [callback]
     * @returns {Promise<?number>}
     */
    max(statisticName, callback) {
        return this.aggregateStatistic(statisticName, 'max', callback);
    }

    /**
     * Smallest value of the given statistic among fetched items.
     *
     * @param {string} statisticName
     * @param {function(?Error, ?number): *} [callback]
     * @returns {Promise<?number>}
     */
    min(statisticName, callback) {
        return this.aggregateStatistic(statisticName, 'min', callback);
    }

    /**
     * Average value of the given statistic among fetched items.
     *
     * @param {string} statisticName
     * @param {function(?Error, ?number): *} [callback]
     * @returns {Promise<?number>}
     */
    avg(statisticName, callback) {
        return this.aggregateStatistic(statisticName, 'avg', callback);
    }

    /**
     * Counts the items whose fields equal those of `comparator`.
     *
     * @param {object} comparator - (possibly nested) field values to match.
     * @param {function(?Error, ?number): *} [callback]
     * @returns {Promise<number>}
     */
    countItems(comparator, callback) {
        return this.runOperation(callback, async () => {
            const query = {};
            this.convertForFilter(comparator, '', query);
            return this.collection.countDocuments(query);
        });
    }

    /**
     * Returns the items whose fields equal those of `comparator`.
     *
     * @param {object} comparator - (possibly nested) field values to match.
     * @param {function(?Error, ?object[]): *} [callback]
     * @returns {Promise<object[]>} matching items, each with `id` mirroring `_id`.
     */
    filterItems(comparator, callback) {
        return this.runOperation(callback, async () => {
            const query = {};
            this.convertForFilter(comparator, '', query);
            const items = await this.collection.find(query).toArray();
            // Same `id` alias that every other item-returning method provides.
            items.forEach((item) => {
                // eslint-disable-next-line no-param-reassign
                item.id = item._id;
            });
            return items;
        });
    }

    /**
     * Number of items stored in the queue collection.
     *
     * @param {function(?Error, ?number): *} [callback]
     * @returns {Promise<number>}
     */
    getLength(callback) {
        return this.runOperation(callback, () => this.collection.countDocuments({}));
    }

    /**
     * No-op: always reports success (`true`) without writing anything.
     *
     * @param {string} filename - ignored.
     * @param {function(?Error, boolean): *} [callback]
     * @returns {Promise<boolean>}
     */
    async freeze(filename, callback) {
        return this.handleCallback(null, true, callback);
    }

    /**
     * No-op: always reports success (`true`) without reading anything.
     *
     * @param {string} filename - ignored.
     * @param {function(?Error, boolean): *} [callback]
     * @returns {Promise<boolean>}
     */
    async defrost(filename, callback) {
        return this.handleCallback(null, true, callback);
    }
}
// CommonJS export of the queue class.
exports.MongoDbQueue = MongoDbQueue;
//# sourceMappingURL=MongoQueue.js.map