# node-nutch
A set of Gulp commands that provide similar functionality to Apache Nutch.
The commands mirror the standard Nutch crawl cycle (see
http://wiki.apache.org/nutch/CommandLineOptions):

* inject
* generate(batchId)
* fetch(batchId)
* parse(batchId)
* updatedb
* solr index
* solr delete duplicates

The crawl is driven by:

* seeds: urls
* limits: depth?
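A rough sketch of how one iteration of the cycle might be chained together in
a gulpfile; the task bodies and the use of run-sequence are illustrative
assumptions, not part of this module:

```js
var gulp = require('gulp');
var runSequence = require('run-sequence');

// Hypothetical tasks, one per command above:
gulp.task('inject', function (){ /* seed the crawl base with urls */ });
gulp.task('generate', function (){ /* select urls, tag them with a batchId */ });
gulp.task('fetch', function (){ /* download content for the current batch */ });
gulp.task('parse', function (){ /* extract titles and outlinks */ });
gulp.task('updatedb', function (){ /* fold outlinks back into the crawl base */ });

// One full crawl iteration; repeat up to the depth limit:
gulp.task('crawl', function (cb){
  runSequence('inject', 'generate', 'fetch', 'parse', 'updatedb', cb);
});
```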
Reference projects:
https://www.npmjs.com/package/roboto
# Deleting records

Records can be removed from the index with Elasticsearch's delete-by-query
endpoint, e.g. to delete every record belonging to one organization:

```sh
curl -XDELETE 'https://ee7mwzg4:mjl5dsqt1o2gdz1y@redwood-6178337.us-east-1.bonsai.io/calendar-place-2/_query?q=organization:/Organization/gipsy-hill-federation'
```
# generate

```js
/**
 * generate: Generates a list of data sources to crawl:
 */
```
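A minimal sketch of how generate might look, in the same pipeline style as
parse below; the unfetched check, the batchId stamping and crawlBase.dest()
are all assumptions, not the module's actual implementation:

```js
var es = require('event-stream');
var filter = require('gulp-filter');

var generate = function (crawlBase, batchId){
  return crawlBase.src()

    /**
     * Assumed selection rule: only data sources not yet fetched:
     */
    .pipe(filter(function (file){
      return !file.data.fetchedContent;
    }))

    /**
     * Stamp each selected data source with the batch it belongs to, so
     * that fetch(batchId) can pick it up:
     */
    .pipe(es.map(function (file, cb){
      file.data.generateStatus = { batchId: batchId };
      cb(null, file);
    }))

    .pipe(crawlBase.dest());
};
```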
# Troubleshooting
To inspect what's flowing through a pipeline, a pass-through map can be
dropped between any two steps to log each file as it goes by:

```js
.pipe(es.map(function (file, cb){
  console.log('got:', file);
  cb(null, file);
}))
```
# parse
Two parts of parse have been lost temporarily whilst we're only processing non-HTML files; this will be fixed once we can detect the file type as it passes through the pipeline.
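One way the detection might eventually work is to branch on the Content-Type
recorded at fetch time; a sketch, assuming fetch stores the response headers
on file.data.fetchedContent:

```js
var isHtml = function (file){
  var headers = file.data.fetchedContent.headers || {};
  return (headers['content-type'] || '').indexOf('text/html') !== -1;
};
```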
The first lost part is the HTML parsing stuff:
```js
var cheerio = require('cheerio');
var url = require('url');

/**
 * Also assumed in scope from the rest of the module: es (event-stream),
 * filter (e.g. gulp-filter) and the local ParseState helper.
 */

/**
 * parse: Parse content from fetched pages:
 *
 * See:
 *
 *   https://wiki.apache.org/nutch/Nutch2Crawling#Parse
 */
var parse = function (crawlBase, customParser, customParseChanged){
  var taskName = 'parse';
  return crawlBase.src()

    /**
     * Only process data sources that have been fetched, and not parsed:
     */
    .pipe(filter(function (file){
      return (file.data.fetchedContent.status === 200) &&
        (!file.data.parseStatus ||
          file.data.parseStatus.state === ParseState.NOTPARSED);
    }))

    /**
     * Parse the fetched content, extracting the title and the outlinks:
     */
    .pipe(es.map(function (file, cb){
      var $ = cheerio.load(file.data.fetchedContent.content);
      file.data.parse = {
        title: $('title').text().trim(),
        outlist: $('a').map(function (){
          var title;
          var _s;
          if ($(this).attr('title')){
            _s = '@title';
            title = $(this).attr('title');
          } else {
            title = $(this).text().trim();
            _s = 'text';
            if (title === ''){
              /**
               * Check .length: a cheerio selection is always truthy:
               */
              if ($('img', this).length){
                _s = 'img[@alt]';
                title = $('img', this).attr('alt');
              }
            }
          }
          return {
            url: url.resolve(file.data.url, $(this).attr('href') || ''),
            _s: _s,
            title: title
          };
        }).get()
      };
      file.data.parseStatus = new ParseState(ParseState.SUCCESS);
      cb(null, file);
    }))
```
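Each entry in the resulting outlist records where its title came from in
`_s`; roughly:

```js
// Illustrative shape of one outlist entry:
var entry = {
  url: 'http://example.com/about',  // resolved against the page url
  _s: '@title',                     // one of '@title' | 'text' | 'img[@alt]'
  title: 'About us'
};
```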
Then there's the custom parser stuff:

```js
    /**
     * Call any custom parser:
     */
    .pipe(es.map(function (file, cb){
      if (customParser){
        var customParse = customParser(file.data.fetchedContent.content);
        if (customParse){

          /**
           * Use the provided function to check if the custom parse value
           * has changed:
           */
          if (customParseChanged){
            file.data.customParseChanged =
              customParseChanged(customParse, file.data.customParse);
          }

          /**
           * Make sure to only overwrite the old value after it has been
           * used to check for changes:
           */
          file.data.customParse = customParse;
          file.data.customParseStatus = new ParseState(ParseState.SUCCESS);
        }
      }
      cb(null, file);
    }))
```
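For example, a custom parser and changed-check might be supplied like this
(hypothetical functions; note the changed-check returns the new value rather
than a boolean, since downstream code uses file.data.customParseChanged
directly):

```js
var customParser = function (content){
  // Hypothetical: pull JSON-LD blocks out of the raw content.
  var $ = cheerio.load(content);
  return $('script[type="application/ld+json"]').map(function (){
    return JSON.parse($(this).html());
  }).get();
};

var customParseChanged = function (newParse, oldParse){
  // Return the new value when it differs, so it can be used downstream:
  return JSON.stringify(newParse) !== JSON.stringify(oldParse) ?
    newParse : null;
};

parse(crawlBase, customParser, customParseChanged);
```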
The old slug calculation code:

```js
/**
 * Reconstructed wrapper: the original fragment starts mid-way through an
 * es.map step, matching the closing `}))` below. Assumes lodash (_) with
 * mustache-style _.templateSettings.interpolate (for the {{…}} syntax),
 * and customExtractor in scope:
 */
.pipe(es.map(function (file, next){
  var slugTemplate;
  if (file.data.meta) {
    slugTemplate = file.data.meta['slug.template'];
  }
  if (!slugTemplate) {
    slugTemplate = '{{events[0].source}}';
  }
  slugTemplate = _.template(slugTemplate);
  file.data.extracted = customExtractor(
    file.data.customParseChanged || file.data.customParse)
    .filter(function (obj){
      var params = {};
      if (!obj || !obj.events){
        return false;
      }
      obj.url = file.data.url;
      params.summary = obj.summary || obj.events[0].summary;
      if (obj.events[0].timex3.date) {
        params.year = obj.events[0].timex3.date.year;
        params.month = obj.events[0].timex3.date.month;
      } else if (obj.events[0].timex3.range) {
        params.year = obj.events[0].timex3.range.from.date.year;
        params.month = obj.events[0].timex3.range.from.date.month;
      }
      try {
        obj.slug = slugTemplate(params).replace(/ /g, '-');
        return true;
      } catch (e) {
        console.error('Template processing failed:', e,
          JSON.stringify(obj));
      }
      return false;
    });
  next(null, file);
}))
```
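For instance, with slug.template set to '{{summary}}-{{year}}-{{month}}', the
filter above would produce slugs like:

```js
// Hypothetical illustration of the templating above:
_.templateSettings.interpolate = /{{([\s\S]+?)}}/g;
var tmpl = _.template('{{summary}}-{{year}}-{{month}}');
tmpl({ summary: 'village fete', year: 2015, month: 3 }).replace(/ /g, '-');
// -> 'village-fete-2015-3'
```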