kafka-rest
Version:
NodeJS wrapper for the Kafka REST Proxy
179 lines (162 loc) • 6.85 kB
JavaScript
/**
* Copyright 2014 Confluent Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
;
/**
* A simple Twitter-Kafka connector that uses the Twitter streaming API to import tweets into a Kafka topic.
*/
var KafkaRest = require("../.."),
argv = require('minimist')(process.argv.slice(2)),
util = require('util'),
twitter = require('twitter');
// Twitter arguments. Each credential is taken from the command line first and falls back to the corresponding
// environment variable.
var consumer_key = argv['consumer-key'] || process.env.TWITTER_CONSUMER_KEY;
var consumer_secret = argv['consumer-secret'] || process.env.TWITTER_CONSUMER_SECRET;
var access_key = argv['access-key'] || process.env.TWITTER_ACCESS_KEY;
var access_secret = argv['access-secret'] || process.env.TWITTER_ACCESS_SECRET;
// Kafka arguments
var api_url = argv.url || "http://localhost:8082";
var topicName = argv.topic;
// General arguments
var bufferMessages = argv['message-buffer-size'] || 100; // Number of tweets to buffer before calling produce()
var format = argv.format || "avro";
var filter = argv.filter;
var help = (argv.help || argv.h);
// The buffer size is compared against the number of buffered tweets to decide when to call produce(). A value that
// is not a number (e.g. "--message-buffer-size foo", or the flag given without a value) would make that comparison
// never succeed, or succeed on every tweet, so reject anything that is not a finite number >= 1.
var validBufferSize = (typeof bufferMessages === "number" && isFinite(bufferMessages) && bufferMessages >= 1);
// Print usage and exit when help was requested (exit code 0) or when any required argument is missing or any
// argument is invalid (exit code 1).
if (help ||
    consumer_key === undefined || consumer_secret === undefined ||
    access_key === undefined || access_secret === undefined ||
    topicName === undefined ||
    !validBufferSize ||
    (format !== "binary" && format !== "avro"))
{
    console.log("Stream tweets and load them into a Kafka topic.");
    console.log();
    console.log("Usage: node stream_tweets.js [--consumer-key <consumer-key>] [--consumer-secret <consumer-secret>] [--access-key <access-key>] [--access-secret <access-secret>]");
    console.log(" [--url <api-base-url>] --topic <topic>");
    console.log(" [--message-buffer-size <num-messages>] [--format <avro|binary>]");
    console.log(" [--filter <term>]");
    console.log();
    console.log("You can also specify Twitter credentials via environment variables: TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_KEY, TWITTER_ACCESS_SECRET.");
    process.exit(help ? 0 : 1);
}
// Whether messages are produced in binary form (otherwise they are produced with the Avro schema below)
var binary = (format == "binary");
// Twitter client configured with the credentials collected above
var twit = new twitter({
    "consumer_key": consumer_key,
    "consumer_secret": consumer_secret,
    "access_token_key": access_key,
    "access_token_secret": access_secret
});
// REST proxy client and a handle on the topic that tweets are produced to
var kafka = new KafkaRest({url: api_url});
var target = kafka.topic(topicName);
// Avro schema describing the two fields saved for every tweet
var schema = new KafkaRest.AvroSchema({
    name: "TweetText",
    type: "record",
    fields: [
        {name: "id", type: "string"},
        {name: "text", type: "string"}
    ]
});
// Overall run state
var started = Date.now(),   // start time, used for the final tweets/s figure
    stream = null,          // set once the Twitter stream has been opened
    consumed = [],          // tweets collected but not yet handed to produce()
    messages_sent = 0,      // tweets handed to produce()
    messages_acked = 0,     // tweets whose produce() callback has fired
    exiting = false;        // true while a shutdown is in progress
// Windowed stats
var windowed_started = Date.now(),
    windowed_collected = 0,
    windowed_acked = 0,
    windowed_period = 10000;
var windowed_interval = setInterval(reportWindowedRate, windowed_period);
// With a filter term, track it via the filter endpoint; otherwise read the English-language sample stream
var endpoint = filter ? 'statuses/filter' : 'statuses/sample';
var endpoint_opts = filter ? {'track': filter} : {'language': 'en'};
// Open the Twitter stream, buffer the tweets it delivers and produce them to Kafka in batches
twit.stream(endpoint, endpoint_opts, function(s) {
    stream = s;
    stream.on('data', onStreamData);
    stream.on('error', onStreamError);

    // Buffers a single tweet and sends the buffer to Kafka once it has reached the configured size
    function onStreamData(data) {
        // Nothing is collected once a shutdown has started. Beyond that, keep only tweets: streaming data may
        // contain other data and events such as friends lists, block/favorite events, list modifications, etc.
        // We prefilter here, although this could also be done downstream on the consumer if we wanted to preserve
        // the entire stream. The heuristic for recognising real tweets (or retweets) is the text field, which
        // shouldn't appear at the top level of other events.
        if (exiting || data.text === undefined) {
            return;
        }

        // Keep just the ID and text. More could be saved, but this keeps the schema small and simple for this
        // example
        var saved_data = {
            'id': data.id_str,
            'text': data.text
        };
        // Avro takes the object as-is; for binary we have to serialize it to a string ourselves.
        var message = binary ? JSON.stringify(saved_data) : saved_data;
        consumed.push(message);
        windowed_collected++;

        // Only send once the buffering limit is hit. The number of buffered messages balances your tolerance for
        // losing data (if the process/host is killed/dies) against throughput (batching messages into fewer
        // requests makes processing more efficient).
        if (!(consumed.length >= bufferMessages)) {
            return;
        }

        var batch = consumed;
        messages_sent += batch.length;
        var responseHandler = handleProduceResponse.bind(undefined, batch.length);
        if (binary) {
            target.produce(batch, responseHandler);
        } else {
            target.produce(schema, batch, responseHandler);
        }
        // Start a fresh buffer for the next batch
        consumed = [];
    }

    // Logs a stream error and shuts down. Retry logic could be set up here instead to try to recover the stream.
    function onStreamError(err) {
        console.log(util.inspect(err));
        shutdown();
    }
});
/**
 * Callback for produce(): records that a batch has been answered and, on failure, starts a shutdown.
 *
 * @param {number} batch_messages number of tweets in the batch this response belongs to
 * @param {*} err error reported for the produce request; falsy when the request succeeded
 * @param {*} res response for the produce request (unused)
 */
function handleProduceResponse(batch_messages, err, res) {
    // The batch counts as answered whether or not it failed; checkExit() compares this total against the number
    // of messages sent, so it must be updated before checkExit() or shutdown() runs.
    messages_acked += batch_messages;
    windowed_acked += batch_messages;

    if (!err) {
        checkExit();
        return;
    }

    console.log("Error sending data to specified topic: " + err);
    shutdown();
    checkExit();
}
/**
 * Logs how many tweets were collected and acknowledged since the previous report, together with the collection
 * rate, and then starts a new reporting window.
 */
function reportWindowedRate() {
    var now = Date.now();
    var elapsed_secs = (now - windowed_started) / 1000;
    var collected_rate = Math.round(windowed_collected / elapsed_secs);

    console.log(
        "Collected " + windowed_collected + " tweets and stored " + windowed_acked + " to Kafka in " +
        Math.round(elapsed_secs) + "s, " + collected_rate + " tweets/s"
    );

    // Reset the counters so that the next report only covers the next window
    windowed_collected = 0;
    windowed_acked = 0;
    windowed_started = now;
}
/**
 * Prints the final summary once a shutdown is in progress and every message that was sent has been acknowledged.
 * Does nothing otherwise.
 */
function checkExit() {
    var all_acked = (messages_acked == messages_sent);
    if (!exiting || !all_acked) {
        return;
    }

    console.log();
    var elapsed_secs = (Date.now() - started) / 1000;
    var overall_rate = Math.round(messages_acked / elapsed_secs);
    console.log("Converted " + messages_acked + " tweets, dropped last " + consumed.length + " remaining buffered tweets, " + overall_rate + " tweets/s");
    // Clear the flag so that a later call does not print the summary again
    exiting = false;
}
/**
 * Starts shutting down: flags the shutdown, closes the Twitter stream if one has been opened, stops the periodic
 * rate reports and prints the final summary right away if no produce responses are outstanding.
 */
function shutdown() {
    console.log("Exiting...");
    exiting = true;

    // stream stays null until the Twitter stream has been opened
    if (stream) {
        stream.destroy();
    }

    clearInterval(windowed_interval);
    checkExit();
}

// Gracefully shutdown on Ctrl-C
process.on('SIGINT', shutdown);