/*
 * zamza
 * Version:
 * Apache Kafka discovery, indexing, searches, storage, hooks and HTTP gateway
 * 565 lines (564 loc) • 23.6 kB
 * JavaScript
 */
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const Debug = require("debug");
const murmur = require("murmurhash");
const debug = Debug("zamza:model:keyindex");
const moment = require("moment");
const mongoose = require("mongoose");
const Bluebird = require("bluebird");
const toJSONSchema = require("to-json-schema");
const R = require("ramda");
const bigqueryJsonSchema = require("jsonschema-bigquery");
const MessageHandler_1 = require("./../../MessageHandler");
/**
 * Access layer for the per-topic "keyindex" MongoDB collections.
 *
 * One mongoose model is created lazily per (cleaned) topic name and cached in `this.models`.
 * Every query method records its duration through `this.metrics.set(...)`.
 */
class KeyIndexModel {
    /**
     * @param {object} zamza - application instance; `metrics` and `discovery` are copied from it,
     *     `messageHandler` and `mongoWrapper` are read from it later by individual methods
     */
    constructor(zamza) {
        this.metrics = zamza.metrics;
        this.discovery = zamza.discovery;
        this.zamza = zamza;
        this.name = "keyindex";
        this.models = {};
        this.mongoose = null;
        this.schemaConstructor = null;
    }

    /**
     * Stores the mongoose instance and schema constructor used later by getOrCreateModel().
     * No model is created here.
     *
     * @param {object} mongoosePassed - mongoose instance used to register models
     * @param {Function} schemaConstructor - constructor invoked with the schema definition
     */
    registerModel(mongoosePassed, schemaConstructor) {
        this.mongoose = mongoosePassed;
        this.schemaConstructor = schemaConstructor;
        debug("Not creating any model now, as keyindex models are created per topic on the fly.");
    }

    /**
     * Makes sure the model (and thereby its index declarations) for a topic is registered.
     *
     * @param {string} topic - original topic name
     * @param {object} topicConfig - topic configuration that may not be stored yet
     */
    ensureModelAndIndicesExist(topic, topicConfig) {
        this.getOrCreateModel(topic, topicConfig);
    }

    /**
     * Returns the cached model for a topic, creating and registering it on first use.
     *
     * @param {string} originalTopic - topic name as used in Kafka
     * @param {object|null} [unstoredTopicConfig=null] - config to use instead of looking one up
     * @returns {object} mongoose model for the topic
     * @throws {Error} when no topic config is passed and none can be found
     */
    getOrCreateModel(originalTopic, unstoredTopicConfig = null) {
        const topic = MessageHandler_1.default.cleanTopicNameForMetrics(originalTopic);
        const topicConfig = unstoredTopicConfig || this.zamza.messageHandler.findConfigForTopic(originalTopic);
        // the config check intentionally runs before the cache lookup (original statement order)
        if (!topicConfig) {
            debug("Cannot getOrCreateModel, because no config was found for topic", originalTopic);
            throw new Error("Cannot getOrCreateModel, because no config was found for topic: " + originalTopic);
        }
        if (this.models[topic]) {
            return this.models[topic];
        }
        const schemaDefinition = {
            key: Number,
            timestamp: Number,
            partition: Number,
            offset: Number,
            keyValue: Buffer,
            // queryable topics store the value as a free-form document, all others as raw bytes
            value: topicConfig.queryable ? mongoose.Schema.Types.Mixed : Buffer,
            deleteAt: Date,
            fromStream: Boolean,
            storedAt: Number,
        };
        const schema = new this.schemaConstructor(schemaDefinition);
        // single lookup indices
        // NOTE(review): `type` is not declared in schemaDefinition above - confirm these are intended
        schema.index({ key: 1, type: -1 });
        schema.index({ timestamp: 1, type: 1 });
        schema.index({ timestamp: 1, type: -1 });
        // compound index
        schema.index({ key: 1, fromStream: 1 }, { unique: false });
        schema.index({ partition: 1 }, { unique: false });
        schema.index({ partition: 1, offset: 1 }, { unique: false });
        // ttl index
        schema.index({ deleteAt: 1 }, { expireAfterSeconds: 0 });
        const model = this.mongoose.model(`${this.name}_${topic}`, schema);
        model.on("index", (error) => {
            if (error) {
                debug("Index creation failed", error.message);
            }
            else {
                debug("Index creation successfull.");
            }
        });
        debug("Registered model with schema for topic", topic);
        this.models[topic] = model;
        return this.models[topic];
    }

    /**
     * Hashes a value with murmur v3 and seed 0.
     *
     * @param {string} value - value to hash
     * @returns {number} hash of the value
     */
    hash(value) {
        return murmur.v3(value, 0);
    }

    /**
     * Converts a pagination cursor into an ObjectId.
     *
     * The constructor is invoked with `new` so that class-based ObjectId implementations,
     * which throw when called as a plain function, are supported as well.
     *
     * @param {string} skipToIndex - `_id` of the last document of the previous page
     * @returns {object} ObjectId instance
     * @throws {Error} when the value cannot be converted into a valid ObjectId
     */
    static parseSkipToIndex(skipToIndex) {
        try {
            const objectIdIndex = new mongoose.Types.ObjectId(skipToIndex);
            if (!mongoose.Types.ObjectId.isValid(objectIdIndex)) {
                throw new Error("Invalid ObjectID");
            }
            return objectIdIndex;
        }
        catch (error) {
            throw new Error("Provided skipToIndex is not a valid ObjectId: " + skipToIndex + ", " + error.message);
        }
    }

    /**
     * Maps a stored document to the shape returned to API consumers.
     * A Buffer `value` is converted to a utf8 string on the passed message itself.
     *
     * @param {string} topic - topic name to put on the response
     * @param {object|null} message - stored document
     * @returns {object|null} cleaned message, or the falsy input unchanged
     */
    static cleanMessageResultForResponse(topic, message) {
        if (!message) {
            return message;
        }
        if (message.value && Buffer.isBuffer(message.value)) {
            message.value = message.value.toString("utf8");
        }
        const cleanedMessage = {
            $index: message._id, // important for pagination
            topic, // cannot use message.topic, as its a hash
            partition: message.partition,
            offset: message.offset,
            key: message.keyValue ? message.keyValue.toString("utf8") : message.keyValue,
            value: message.value,
            timestamp: message.timestamp,
        };
        if (typeof cleanedMessage.partition === "undefined") {
            debug("Parsed cleaned message seems invalid:", JSON.stringify(message));
        }
        return cleanedMessage;
    }

    /**
     * Applies cleanMessageResultForResponse() to every message of a list.
     *
     * @param {string} topic - topic name to put on each response message
     * @param {object[]} messages - stored documents
     * @returns {object[]} cleaned messages
     */
    static cleanMessageResultsForResponse(topic, messages) {
        return messages.map((message) => KeyIndexModel.cleanMessageResultForResponse(topic, message));
    }

    /**
     * Counts the documents stored for a topic.
     *
     * @param {string} topic - topic name
     * @param {boolean} [fromMetadata=true] - true uses estimatedDocumentCount(), false countDocuments({})
     * @returns {Promise<number>} document count
     */
    async getSimpleCountOfMessagesStoredForTopic(topic, fromMetadata = true) {
        const startTime = Date.now();
        const model = this.getOrCreateModel(topic);
        const count = fromMetadata
            ? await model.estimatedDocumentCount()
            : await model.countDocuments({});
        const duration = Date.now() - startTime;
        this.metrics.set(`mongo_keyindex_simple_count_${fromMetadata ? "m" : "c"}_ms`, duration);
        return count;
    }

    /**
     * Collects per-partition counts plus offset and timestamp boundaries for a topic.
     *
     * @param {string} topic - topic name
     * @returns {Promise<object>} metadata; `messageCount` is the sum of all partition counts,
     *     `partitionCount` the number of partitions that were counted
     */
    async getMetadataForTopic(topic) {
        const startTime = Date.now();
        // NOTE(review): second argument is presumably a fallback partition count - confirm in discovery
        const partitionCountOfTopic = this.discovery.getPartitionCountOfTopic(topic, 40);
        const [partitions, earliestOffset, latestOffset, earliestMessage, latestMessage] = await Promise.all([
            this.getPartitionCountsForTopic(topic, partitionCountOfTopic),
            this.getEarliestOffset(topic),
            this.getLatestOffset(topic),
            this.getEarliestTimestamp(topic),
            this.getLatestTimestamp(topic),
        ]);
        const counts = this.partitionsToTotalCount(partitions);
        const duration = Date.now() - startTime;
        this.metrics.set("mongo_keyindex_info_ms", duration);
        return {
            topic,
            messageCount: counts.size,
            partitionCount: counts.count,
            partitions,
            earliestOffset,
            latestOffset,
            earliestMessage,
            latestMessage,
            timestamp: moment().valueOf(),
        };
    }

    /**
     * Sums up a partition -> count map.
     *
     * @param {object} partitions - map of partition id to document count
     * @returns {{count: number, size: number}} `count` = number of partitions, `size` = sum of counts
     */
    partitionsToTotalCount(partitions) {
        let size = 0;
        let count = 0;
        Object.keys(partitions).forEach((key) => {
            count++;
            size += partitions[key];
        });
        return {
            count,
            size,
        };
    }

    /**
     * Internal helper: runs a single `$group` aggregation over all documents of a topic.
     *
     * @param {string} topic - topic name
     * @param {string} accumulator - aggregation accumulator, e.g. "$min" or "$max"
     * @param {string} field - document field the accumulator runs on
     * @param {string} resultField - name of the aggregated field in the result
     * @param {string} metricName - metric that receives the duration in ms
     * @returns {Promise<number>} aggregated value, or -1 when the aggregation returns no rows
     */
    async aggregateSingleValue(topic, accumulator, field, resultField, metricName) {
        const startTime = Date.now();
        const result = await this.getOrCreateModel(topic).aggregate([
            {
                $group: {
                    _id: {},
                    [resultField]: { [accumulator]: "$" + field },
                },
            },
        ]);
        const duration = Date.now() - startTime;
        this.metrics.set(metricName, duration);
        return result.length ? result[0][resultField] : -1;
    }

    /**
     * @param {string} topic - topic name
     * @returns {Promise<number>} smallest stored offset, or -1 when nothing is stored
     */
    async getEarliestOffset(topic) {
        return this.aggregateSingleValue(topic, "$min", "offset", "minOffset",
            "mongo_keyindex_info_earliest_offset_ms");
    }

    /**
     * @param {string} topic - topic name
     * @returns {Promise<number>} largest stored offset, or -1 when nothing is stored
     */
    async getLatestOffset(topic) {
        return this.aggregateSingleValue(topic, "$max", "offset", "maxOffset",
            "mongo_keyindex_info_latest_offset_ms");
    }

    /**
     * @param {string} topic - topic name
     * @returns {Promise<number>} smallest stored timestamp, or -1 when nothing is stored
     */
    async getEarliestTimestamp(topic) {
        return this.aggregateSingleValue(topic, "$min", "timestamp", "minTimestamp",
            "mongo_keyindex_info_earliest_ts_ms");
    }

    /**
     * @param {string} topic - topic name
     * @returns {Promise<number>} largest stored timestamp, or -1 when nothing is stored
     */
    async getLatestTimestamp(topic) {
        return this.aggregateSingleValue(topic, "$max", "timestamp", "maxTimestamp",
            "mongo_keyindex_info_latest_ts_ms");
    }

    /**
     * Counts documents per partition with a single aggregation query.
     *
     * @param {string} topic - topic name
     * @returns {Promise<object>} map of partition id to document count
     */
    async getPartitionCountsForTopicViaAggregation(topic) {
        const startTime = Date.now();
        const partitionAggregation = await this.getOrCreateModel(topic).aggregate([
            {
                $group: {
                    _id: {
                        partition: "$partition",
                    },
                    count: { $sum: 1 },
                },
            },
        ]);
        const duration = Date.now() - startTime;
        this.metrics.set("mongo_keyindex_info_partition_aggregation_ms", duration);
        const partitions = {};
        partitionAggregation.forEach((aggregatedPartition) => {
            partitions[aggregatedPartition._id.partition] = aggregatedPartition.count;
        });
        return partitions;
    }

    /**
     * Counts documents per partition with one count query per partition (at most 3 at a time).
     *
     * @param {string} topic - topic name
     * @param {number} partitionCount - partitions 0..partitionCount-1 are counted
     * @returns {Promise<object>} map of partition id to document count
     */
    async getPartitionCountsForTopic(topic, partitionCount) {
        const startTime = Date.now();
        const model = this.getOrCreateModel(topic);
        const partitions = [];
        for (let i = 0; i < partitionCount; i++) {
            partitions.push(i);
        }
        // although splitting a single query in multiple seems absurd, these counts run on the index
        // and are a lot faster then the single aggregate query on top
        const partitionCounts = await Bluebird.map(
            partitions,
            (partition) => model.countDocuments({ partition }).then((count) => ({ partition, count })),
            { concurrency: 3 },
        );
        const duration = Date.now() - startTime;
        this.metrics.set("mongo_keyindex_info_partition_ms", duration);
        const partitionResults = {};
        partitionCounts.forEach((aggregatedPartition) => {
            partitionResults[aggregatedPartition.partition] = aggregatedPartition.count;
        });
        return partitionResults;
    }

    /**
     * Finds one message by its (unhashed) key.
     *
     * @param {string} topic - topic name
     * @param {string} key - message key, hashed before the lookup
     * @returns {Promise<{result: object|null}>} cleaned message or null
     */
    async findMessageForKey(topic, key) {
        const startTime = Date.now();
        const message = await this.getOrCreateModel(topic).findOne({
            key: this.hash(key),
        }).lean().exec();
        const duration = Date.now() - startTime;
        this.metrics.set("mongo_keyindex_find_key_ms", duration);
        return {
            result: KeyIndexModel.cleanMessageResultForResponse(topic, message),
        };
    }

    /**
     * Finds one message by partition and offset.
     *
     * @param {string} topic - topic name
     * @param {number} partition - partition of the message
     * @param {number} offset - offset of the message
     * @returns {Promise<{result: object|null}>} cleaned message or null
     */
    async findMessageForPartitionAndOffset(topic, partition, offset) {
        const startTime = Date.now();
        const message = await this.getOrCreateModel(topic).findOne({
            partition,
            offset,
        }).lean().exec();
        const duration = Date.now() - startTime;
        this.metrics.set("mongo_keyindex_find_pof_ms", duration);
        return {
            result: KeyIndexModel.cleanMessageResultForResponse(topic, message),
        };
    }

    /**
     * Finds one message with exactly the given timestamp.
     *
     * @param {string} topic - topic name
     * @param {number} timestamp - exact timestamp to match
     * @returns {Promise<{result: object|null}>} cleaned message or null
     */
    async findMessageForTimestamp(topic, timestamp) {
        const startTime = Date.now();
        // TODO: range this a little? use timestamp
        const message = await this.getOrCreateModel(topic).findOne({
            timestamp,
        }).lean().exec();
        const duration = Date.now() - startTime;
        this.metrics.set("mongo_keyindex_find_ts_ms", duration);
        return {
            result: KeyIndexModel.cleanMessageResultForResponse(topic, message),
        };
    }

    /**
     * Not implemented yet; always resolves with an empty result list.
     *
     * @param {string} topic - topic name (unused)
     * @param {string} key - message key (unused)
     * @param {number} [range=50] - range size (unused)
     * @returns {Promise<{results: Array}>} empty result list
     */
    async findRangeAroundKey(topic, key, range = 50) {
        // TODO: implement
        return {
            results: [],
        };
    }

    /**
     * Pages through a topic ordered by `_id`, using the last seen `_id` as cursor.
     *
     * @param {string} topic - topic name
     * @param {string|null} skipToIndex - `_id` of the last document of the previous page;
     *     falsy values and the string "null" start from the beginning
     * @param {number} [limit=50] - page size, at most 2500
     * @param {number} [order=-1] - 1 = ascending = earliest, -1 = descending = latest
     * @returns {Promise<{results: object[]}>} cleaned messages of the page
     * @throws {Error} when limit exceeds 2500 or skipToIndex is not a valid ObjectId
     */
    async paginateThroughTopic(topic, skipToIndex, limit = 50, order = -1) {
        if (limit > 2500) {
            throw new Error(limit + " is a huge limit size, please stay under 2500 per call.");
        }
        debug("Paginating from", skipToIndex, "to next", limit, "on topic", topic, "order", order);
        let query = {};
        if (skipToIndex && skipToIndex !== "null") {
            // use last object id to find cursor, its a million times faster
            // than using .skip() which does not use an index
            query = {
                _id: { [order === 1 ? "$gt" : "$lt"]: KeyIndexModel.parseSkipToIndex(skipToIndex) },
            };
        }
        const startTime = Date.now();
        const messages = await this.getOrCreateModel(topic)
            .find(query)
            .sort({ _id: order })
            .limit(limit)
            .lean()
            .exec();
        const duration = Date.now() - startTime;
        this.metrics.set("mongo_keyindex_paginate_ms", duration);
        return {
            results: KeyIndexModel.cleanMessageResultsForResponse(topic, messages),
        };
    }

    /**
     * Fetches the result of a query that was started with `dontAwait`.
     *
     * @param {string} cacheKey - cache key returned by filterForQuery()
     * @returns {*} whatever balrok.getCacheKeyResult() returns for the key
     */
    getResultsForQueryWithCacheKey(cacheKey) {
        return this.zamza.mongoWrapper.balrok.getCacheKeyResult(cacheKey);
    }

    /**
     * Filters the documents of a queryable topic by flat, dot-notated field paths.
     *
     * The values of `origQuery` are compared with strict path equality against every document
     * that balrok hands to the document operation; only the `_id` cursor is sent to MongoDB.
     *
     * @param {string} topic - topic name
     * @param {object} origQuery - map of dot-notated path to expected primitive value
     * @param {number|null} [limit=null] - limit passed through to balrok
     * @param {string|null} [skipToIndex=null] - `_id` cursor, see paginateThroughTopic()
     * @param {number} [order=-1] - 1 = ascending = earliest, -1 = descending = latest
     * @param {boolean} [dontAwait=false] - when true only the cache key of the operation is returned
     * @returns {Promise<{results: object[]}|{cacheKey: *}>} cleaned messages or the cache key
     * @throws {Error} when the topic has no config, is not queryable, or the query is malformed
     */
    async filterForQuery(topic, origQuery, limit = null, skipToIndex = null, order = -1, dontAwait = false) {
        const topicConfig = this.zamza.messageHandler.findConfigForTopic(topic);
        if (!topicConfig) {
            debug("Cannot findForQuery, because no config was found for topic", topic);
            throw new Error("Cannot findForQuery, because no config was found for topic: " + topic);
        }
        if (!topicConfig.queryable) {
            debug("Cannot run query for topic", topic, "because it is not configured as queryable.", topicConfig);
            throw new Error("Cannot run query for topic " + topic + " because it is not configured as queryable.");
        }
        if (!origQuery || typeof origQuery !== "object") {
            throw new Error("query must be an object, filtering for 'dot-notated' keys.");
        }
        const queryKeys = Object.keys(origQuery);
        const queryId = topic + "_" +
            this.hash(`${queryKeys.join("")}${limit}${skipToIndex}${order}`);
        debug(queryId, "filtering for", origQuery, "on topic", topic, "limit", limit, "skipToIndex", skipToIndex, "order", order);
        // start with a clean query, as we are only allowed to pass indexed fields here
        const query = {};
        if (skipToIndex && skipToIndex !== "null") {
            // use last object id to find cursor, its a million times faster
            // than using .skip() which does not use an index
            query._id = { [order === 1 ? "$gt" : "$lt"]: KeyIndexModel.parseSkipToIndex(skipToIndex) };
        }
        // based on the query object, we build a Ramda function that can be used to evaluate
        // documents in the documentOperation call that we pass to balrok
        const queryFilter = R.allPass(queryKeys.map((key) => {
            if (key.includes("[") || key.includes("]")) {
                throw new Error("Character not allowed in query key [], only dot strings as path allowed.");
            }
            // the expected value has to come from the caller's query; the clean mongo `query`
            // above never contains these keys
            const expectedValue = origQuery[key];
            if (Array.isArray(expectedValue) || (typeof expectedValue === "object" && expectedValue !== null)) {
                throw new Error("Query field values, must not be arrays or objects, please resolve via flat string paths. " + key);
            }
            // argument order is (path, value), as in the original call
            return R.pathEq(key.split("."), expectedValue);
        }));
        // to prevent caching
        // NOTE(review): the name only contains the keys, not the values - confirm how balrok
        // derives its cache key before relying on cached results for differing values
        const queryName = queryKeys.map((key) => key + ",").join("");
        const documentOperation = (doc) => queryFilter(doc);
        const resolveOptions = {
            options: {},
            batchSize: 2048,
            order,
            timeoutMs: 58000,
            dontAwait,
            noCache: false,
            limit,
        };
        const startTime = Date.now();
        const messages = await this.zamza
            .mongoWrapper.balrok.filter(this.getOrCreateModel(topic), query, queryName, documentOperation, resolveOptions);
        // if the query is not awaited, it will run without reference to this requests lifetime
        // we will return the cacheKey of the operation, that can be used to fetch the results
        // on another endpoint
        if (dontAwait) {
            return {
                cacheKey: messages.cacheKey,
            };
        }
        const duration = Date.now() - startTime;
        this.metrics.set("mongo_keyindex_find_query_ms", duration);
        debug(queryId, "filter for query", query, "on topic", topic, "took", duration, "ms");
        return {
            results: KeyIndexModel.cleanMessageResultsForResponse(topic, messages),
        };
    }

    /**
     * @param {string} topic - topic name
     * @param {number} [range=50] - number of messages
     * @returns {Promise<{results: object[]}>} the latest `range` messages
     */
    async getRangeFromLatest(topic, range = 50) {
        return this.paginateThroughTopic(topic, null, range, -1);
    }

    /**
     * @param {string} topic - topic name
     * @param {number} [range=50] - number of messages
     * @returns {Promise<{results: object[]}>} the earliest `range` messages
     */
    async getRangeFromEarliest(topic, range = 50) {
        return this.paginateThroughTopic(topic, null, range, 1);
    }

    /**
     * Internal helper: prepares a cleaned message for schema analysis (mutates the message).
     * String and Buffer values are JSON-parsed, `$index` is removed and the key is stringified.
     *
     * @param {string} topic - topic name, only used for the error message
     * @param {object} message - cleaned message with a non-empty value
     * @returns {object|null} the prepared message, or null when the parsed value is not an object
     * @throws {Error} when the value cannot be parsed as JSON
     */
    static prepareMessageForSchema(topic, message) {
        try {
            if (typeof message.value !== "object" || Buffer.isBuffer(message.value)) {
                const rawValue = Buffer.isBuffer(message.value)
                    ? message.value.toString("utf8")
                    : message.value;
                const value = JSON.parse(rawValue);
                if (!value || typeof value !== "object") {
                    return null;
                }
                message.value = value;
            }
            delete message.$index;
            message.key = message.key ? message.key.toString("utf8") : message.key;
            return message;
        }
        catch (error) {
            throw new Error("Cannot determine JSON schema for topic "
                + topic + ", as it does not contain JSON. " + error.message);
        }
    }

    /**
     * Derives a JSON schema from the latest message of a topic.
     *
     * @param {string} topic - topic name
     * @returns {Promise<object|undefined>} JSON schema; undefined when the message value parses
     *     to something that is not an object (behavior kept from the original implementation)
     * @throws {Error} when no message with a value exists, the value is not JSON,
     *     or the schema cannot be created
     */
    async analyseSingleMessageJSONSchema(topic) {
        const latestMessages = await this.getRangeFromLatest(topic, 1);
        const message = latestMessages.results.length ? latestMessages.results[0] : null;
        if (!message || !message.value) {
            throw new Error("The topic " + topic + " has not enough fetchable messages"
                + " to create a stable JSON schema. (At least 1 message with non null value required.)");
        }
        const preparedMessage = KeyIndexModel.prepareMessageForSchema(topic, message);
        if (!preparedMessage) {
            return undefined;
        }
        try {
            const schema = toJSONSchema(preparedMessage);
            if (!schema) {
                throw new Error("Empty schema.");
            }
            return schema;
        }
        catch (error) {
            throw new Error("Failed to create schema for topic " + topic + ", " + error.message);
        }
    }

    /**
     * Derives a BigQuery schema from the latest message of a topic.
     *
     * @param {string} topic - topic name
     * @returns {Promise<*>} result of running the BigQuery conversion on the JSON schema
     */
    async analyseSingleMessageBigQuerySchema(topic) {
        const jsonSchema = await this.analyseSingleMessageJSONSchema(topic);
        return bigqueryJsonSchema.run(jsonSchema);
    }

    /**
     * Derives a JSON schema from up to 10 of the earliest and 10 of the latest messages.
     * Messages without value or with a non-object JSON value are skipped.
     *
     * @param {string} topic - topic name
     * @returns {Promise<object>} JSON schema of the collected messages
     * @throws {Error} when fewer than 2 usable messages exist, a value is not JSON,
     *     or the schema cannot be created
     */
    async analyseJSONSchema(topic) {
        // both ranges are independent reads, so they are fetched in parallel
        const [earliestMessages, latestMessages] = await Promise.all([
            this.getRangeFromEarliest(topic, 10),
            this.getRangeFromLatest(topic, 10),
        ]);
        const parsedAndConsolidatedMessages = [];
        const analyseMessage = (message) => {
            if (!message.value) {
                return;
            }
            const preparedMessage = KeyIndexModel.prepareMessageForSchema(topic, message);
            if (preparedMessage) {
                parsedAndConsolidatedMessages.push(preparedMessage);
            }
        };
        earliestMessages.results.forEach(analyseMessage);
        latestMessages.results.forEach(analyseMessage);
        if (parsedAndConsolidatedMessages.length < 2) {
            throw new Error("The topic " + topic + " has not enough fetchable messages"
                + " to create a stable JSON schema. (At least 2 messages with non null value required.)");
        }
        try {
            const schema = toJSONSchema(parsedAndConsolidatedMessages);
            if (!schema) {
                throw new Error("Empty schema.");
            }
            return schema;
        }
        catch (error) {
            throw new Error("Failed to create schema for topic " + topic + ", " + error.message);
        }
    }

    /**
     * Stores a new document for a topic.
     *
     * @param {string} topic - topic name
     * @param {object} document - document to store; must carry a partition (0 is allowed)
     * @returns {Promise<object>} result of the model's create() call
     * @throws {Error} when the document has no partition
     */
    async insert(topic, document) {
        if (!document.partition && document.partition !== 0) {
            throw new Error("Cannot store key index document without partition: " + JSON.stringify(document));
        }
        const startTime = Date.now();
        const result = await this.getOrCreateModel(topic).create(document);
        const duration = Date.now() - startTime;
        this.metrics.set("mongo_keyindex_insert_ms", duration);
        return result;
    }

    /**
     * Inserts or replaces the document stored for `document.key`.
     *
     * @param {string} topic - topic name
     * @param {object} document - document to store; must carry a partition (0 is allowed)
     * @returns {Promise<object|null>} result of findOneAndUpdate(), or null when the document
     *     has no (truthy) key
     * @throws {Error} when the document has no partition
     */
    async upsert(topic, document) {
        if (!document.partition && document.partition !== 0) {
            throw new Error("Cannot store key index document without partition: " + JSON.stringify(document));
        }
        if (!document.key) {
            debug("Cannot upsert message without key.", topic, document.key);
            return null;
        }
        const startTime = Date.now();
        const query = {
            key: document.key,
        };
        const queryOptions = {
            upsert: true,
        };
        const result = await this.getOrCreateModel(topic).findOneAndUpdate(query, document, queryOptions).exec();
        const duration = Date.now() - startTime;
        this.metrics.set("mongo_keyindex_upsert_ms", duration);
        return result;
    }

    /**
     * Deletes all documents of a topic that match the hashed key and the fromStream flag.
     *
     * @param {string} topic - topic name
     * @param {string} key - message key, hashed before the delete
     * @param {boolean} [fromStream=false] - only documents with this fromStream value are deleted
     * @returns {Promise<object>} result of deleteMany(); rejects when topic or key is missing
     */
    delete(topic, key, fromStream = false) {
        if (!topic) {
            debug("Cannot delete message without topic", topic, key, fromStream);
            return Promise.reject(new Error("Cannot delete message without topic"));
        }
        if (!key) {
            debug("Cannot delete message without key", topic, key, fromStream);
            return Promise.reject(new Error("Cannot delete message without key"));
        }
        // exec() like deleteForTopic(): returns a promise and runs the delete even if the
        // caller does not await the result
        return this.getOrCreateModel(topic).deleteMany({
            key: this.hash(key),
            fromStream,
        }).exec();
    }

    /**
     * Deletes every document stored for a topic.
     *
     * @param {string} topic - topic name
     * @returns {Promise<object>} result of deleteMany({})
     */
    deleteForTopic(topic) {
        debug("Deleting all entries for topic", topic);
        return this.getOrCreateModel(topic).deleteMany({}).exec();
    }
}
// Expose the class as the named CommonJS export `KeyIndexModel`.
exports.KeyIndexModel = KeyIndexModel;