psuk-parliament
Version:
A library by PublicScrutiny.UK to make it easier to get information from the UK Parliament
84 lines (67 loc) • 2.28 kB
JavaScript
var request = require('request'),
BBCThings = require('bbc-things'),
gramophone = require('gramophone'),
request = require('request'),
xml2js = require('xml2js'),
Q = require('q'),
cheerio = require('cheerio');
// Entry point: download the BBC World News RSS feed, parse it, and run entity
// extraction on every item it contains. Failures are reported on stderr
// instead of crashing the process on an undefined body or parse result.
request({ url: 'http://feeds.bbci.co.uk/news/world/rss.xml', timeout: 2000 }, function(error, response, body) {
  if (error) {
    // Network failure or timeout: there is no body to parse.
    console.error('Failed to fetch RSS feed:', error);
    return;
  }

  var parser = new xml2js.Parser();
  parser.parseString(body, function (err, result) {
    if (err) {
      console.error('Failed to parse RSS feed:', err);
      return;
    }

    // Walk down to the item list defensively; an unexpected document shape
    // yields an empty list instead of a TypeError.
    var channel = result && result.rss && result.rss.channel && result.rss.channel[0];
    var items = (channel && channel.item) || [];

    items.forEach(function(e) {
      // xml2js exposes each child element as an array, hence the [0] lookups.
      var article = {
        title: e.title[0],
        description: e.description[0],
        url: e.link[0],
        date: e.pubDate[0]
      };

      extractEntities(article)
        .then(undefined, function(reason) {
          // One failing article must not go unnoticed or stop the others.
          console.error('Failed to extract entities for ' + article.url + ':', reason);
        });
    });
  });
});
/**
 * Downloads an article page, extracts candidate keywords from its title,
 * description and story body, and looks those keywords up via BBCThings.
 *
 * The article is annotated in place: `article.rawTags` receives every key
 * returned by the lookup, and `article.tags` maps each key that has at least
 * one match to its first match. Articles that end up with at least one tag
 * are also printed to the console.
 *
 * @param {{title: string, description: string, url: string}} article
 *   Article to annotate; `url` is the page that gets downloaded.
 * @returns {Promise} Q promise fulfilled with the same (annotated) article
 *   once the lookup has finished, or rejected if the page request, the
 *   keyword extraction or the lookup fails.
 */
function extractEntities(article) {
  var deferred = Q.defer();

  request(article.url, function (error, response, body) {
    if (error) {
      deferred.reject(error);
      return;
    }

    var lookup;
    try {
      var $ = cheerio.load(body);
      var text = article.title+" "+article.description+" "+$('div[class=story-body]').text();

      // Get tags from text
      var gramophoneOptions = {
        score: false,
        stopWords: [],
        limit: 50,
        ngrams: [1,2,3,4],
        stem: true
      };
      var entities = gramophone.extract(text, gramophoneOptions);

      // Placeholder: overrides for common tag mis-matches would be applied
      // to `entities` here before the lookup.

      lookup = BBCThings.search(entities, true);
    } catch (err) {
      // A throw inside this callback would otherwise escape as an uncaught
      // exception and leave the returned promise pending forever.
      deferred.reject(err);
      return;
    }

    lookup
      .then(function (things) {
        var rawTags = [];
        var validThings = {};

        if (things) {
          Object.keys(things).forEach(function (name) {
            var matches = things[name];
            rawTags.push(name);
            if (matches && matches.length > 0) {
              // Keep only the first match for each keyword.
              validThings[name] = matches[0];
            }
          });
        }

        article.tags = validThings;
        article.rawTags = rawTags;

        if (Object.keys(article.tags).length > 0) {
          console.log("Title: "+article.title);
          console.log("Description: "+article.description);
          console.log("Tags: "+article.rawTags.join(','));
          console.log(article.tags);
          console.log('---------');
        }

        return article;
      })
      // Settle the outer promise only after the lookup has been processed,
      // so callers receive the annotated article (or the failure reason).
      .then(function (annotated) {
        deferred.resolve(annotated);
      }, function (reason) {
        deferred.reject(reason);
      });
  });

  return deferred.promise;
}