kafka-rest
Version:
NodeJS wrapper for the Kafka REST Proxy
116 lines (101 loc) • 4.08 kB
JavaScript
/**
* Copyright 2014 Confluent Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
;
/**
* A simple consumer of tweets that periodically reports trending hashtags.
*/
var KafkaRest = require("../.."),
argv = require('minimist')(process.argv.slice(2));
// ---- Command-line options --------------------------------------------------
// Base URL of the Kafka REST Proxy to talk to.
var api_url = argv.url || "http://localhost:8082";
// Topic to read tweets from (required).
var topicName = argv.topic;
// Consumer group to join; a random one is generated below when omitted.
var consumerGroup = argv.group;
// Message format requested from the proxy.
var format = argv.format || "avro";
var fromBeginning = argv['from-beginning'];
var help = (argv.help || argv.h);

// Only the "binary" and "avro" formats are accepted.
var binary = (format === "binary");
var formatSupported = (binary || format === "avro");

// Print the usage text and exit when help was requested or the arguments are
// unusable. The exit status is 0 only when help was asked for explicitly.
if (help || topicName === undefined || !formatSupported) {
    console.log("Compute and report trending hashtags in tweets.");
    console.log();
    console.log("Usage: node trending.js [--url <api-base-url>] --topic <topic> [--group <consumer-group-name>] [--from-beginning] [--format <avro|binary>]");
    process.exit(help ? 0 : 1);
}

// Without an explicit group, fall back to a randomly suffixed group name.
if (consumerGroup === undefined) {
    consumerGroup = "tweet-trending-consumer-" + Math.round(Math.random() * 100000);
}

var kafka = new KafkaRest({"url": api_url});
// Assigned once the consumer group has been joined.
var consumer_instance;

// Interval between two reports of the top 10, in milliseconds.
var report_period = 10000;
// Factor every hashtag weight is multiplied by once per report_period.
var period_discount_rate = 0.99;

// Options passed along when joining the consumer group.
var consumerConfig = { "format" : format };
if (fromBeginning) {
    consumerConfig['auto.offset.reset'] = 'smallest';
}
// Join the consumer group, subscribe to the topic and feed every received
// tweet into processTweet(). SIGINT and stream errors both trigger shutdown().
kafka.consumer(consumerGroup).join(consumerConfig, function(err, ci) {
    if (err) {
        console.log("Failed to create instance in consumer group: " + err);
        // Nothing will ever be consumed, so stop the report timer as well;
        // left running it keeps the process alive printing empty reports.
        // NOTE(review): assumes join() invokes this callback asynchronously,
        // i.e. after reportInterval has been assigned -- confirm.
        clearInterval(reportInterval);
        return;
    }
    consumer_instance = ci;

    var stream = consumer_instance.subscribe(topicName);

    stream.on('data', function(msgs) {
        for(var i = 0; i < msgs.length; i++) {
            var tweet;
            try {
                // Binary messages carry the tweet as UTF-8 encoded JSON text;
                // otherwise the value is used as delivered.
                tweet = (binary ? JSON.parse(msgs[i].value.toString('utf8')) : msgs[i].value);
            } catch (decodeErr) {
                // One malformed message must not crash the whole consumer
                // (which would also skip shutting the instance down).
                console.log("Skipping message that could not be decoded: " + decodeErr);
                continue;
            }
            processTweet(tweet);
        }
    });

    stream.on('error', function(err) {
        console.log("Consumer instance reported an error: " + err);
        shutdown();
    });

    // Registered only once an instance exists, so shutdown() always has a
    // consumer instance to work with.
    process.on('SIGINT', shutdown);
});
// Implements a simple exponentially weighted average (EWA) scheme on
// lower-cased hashtags: every occurrence adds 1 to the tag's weight here, and
// the periodic report multiplies all weights by the discount rate.
// Maps hashtag text (including the leading '#') to {name, weight}.
var hashtags = {};

/**
 * Count the hashtags contained in one tweet into `hashtags`.
 *
 * The text is lower-cased and split on whitespace; every word that starts
 * with '#' and has at least one more character counts as a hashtag.
 *
 * @param {Object} tweet - Tweet object; only its `text` property is read.
 *     Values without a string `text` (null, or objects lacking the field)
 *     are ignored instead of throwing.
 */
function processTweet(tweet) {
    if (!tweet || typeof tweet.text !== 'string') {
        return;
    }

    var words = tweet.text.toLowerCase().split(/\s+/);
    // Filter to hash tags, increment weights
    for(var i = 0; i < words.length; i++) {
        var word = words[i];
        // Require something after the '#': a lone "#" is not a hashtag.
        if (word.length > 1 && word[0] === '#') {
            if (hashtags[word] === undefined) {
                hashtags[word] = {'name': word, 'weight': 0};
            }
            hashtags[word].weight += 1;
        }
    }
}
// Setup periodic reporting, discounting of hashtag weights, and cleanup of
// hashtags whose weight has decayed to a negligible value.

// Hashtags whose discounted weight drops below this value are forgotten, so
// the table does not grow without bound while the consumer keeps running.
var min_hashtag_weight = 0.01;

var reportInterval = setInterval(function() {
    var sorted_terms = [];
    for(var hashtagKey in hashtags) {
        var hashtag = hashtags[hashtagKey];
        // Discounting won't affect sorting, so we can do this in the same pass
        hashtag.weight *= period_discount_rate;
        if (hashtag.weight < min_hashtag_weight) {
            // Deleting the property currently being visited is safe in a
            // for...in loop.
            delete hashtags[hashtagKey];
            continue;
        }
        sorted_terms.push(hashtag);
    }
    // Heaviest hashtags first.
    sorted_terms.sort(function(a, b) { return b.weight - a.weight; });

    var shown = Math.min(10, sorted_terms.length);
    for(var i = 0; i < shown; i++) {
        var term = sorted_terms[i];
        // Pad the name to 50 characters so the weights line up in a column.
        var padding = new Array(Math.max(0, 50 - term.name.length) + 1).join(' ');
        console.log("" + i + ". " + term.name + padding + "(" + term.weight + ")");
    }
    // Blank line separates consecutive reports.
    console.log();
}, report_period);
/**
 * Stop consuming: ask the consumer instance to shut down, logging the error
 * if it reports one, and cancel the periodic report timer.
 */
function shutdown() {
    function onInstanceClosed(err) {
        if (!err) {
            return;
        }
        console.log("Error shutting down consumer instance: " + err);
    }

    consumer_instance.shutdown(onInstanceClosed);
    clearInterval(reportInterval);
}