# node-nutch
A set of Gulp commands that provide similar functionality to Apache Nutch.
The commands mirror the standard Nutch crawl cycle (see
http://wiki.apache.org/nutch/CommandLineOptions):

* inject
* generate(batchId)
* fetch(batchId)
* parse(batchId)
* updatedb
* solr index
* solr delete duplicates

The crawl is driven by:

* seeds: urls
* limits: depth?
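A rough sketch of how one iteration of the cycle might be chained together in
a gulpfile; the task bodies and the use of run-sequence are illustrative
assumptions, not part of this module:

```js
var gulp = require('gulp');
var runSequence = require('run-sequence');

// Hypothetical tasks, one per command above:
gulp.task('inject', function (){ /* seed the crawl base with urls */ });
gulp.task('generate', function (){ /* select urls, tag them with a batchId */ });
gulp.task('fetch', function (){ /* download content for the current batch */ });
gulp.task('parse', function (){ /* extract titles and outlinks */ });
gulp.task('updatedb', function (){ /* fold outlinks back into the crawl base */ });

// One full crawl iteration; repeat up to the depth limit:
gulp.task('crawl', function (cb){
  runSequence('inject', 'generate', 'fetch', 'parse', 'updatedb', cb);
});
```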
Reference projects:
https://www.npmjs.com/package/roboto
# Deleting records

Records can be removed from the index with Elasticsearch's delete-by-query
endpoint, e.g. to delete every record belonging to one organization:

```sh
curl -XDELETE 'https://ee7mwzg4:mjl5dsqt1o2gdz1y@redwood-6178337.us-east-1.bonsai.io/calendar-place-2/_query?q=organization:/Organization/gipsy-hill-federation'
```
# generate

```js
/**
 * generate: Generates a list of data sources to crawl:
 */
```
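A minimal sketch of how generate might look, in the same pipeline style as
parse below; the unfetched check, the batchId stamping and crawlBase.dest()
are all assumptions, not the module's actual implementation:

```js
var es = require('event-stream');
var filter = require('gulp-filter');

var generate = function (crawlBase, batchId){
  return crawlBase.src()

    /**
     * Assumed selection rule: only data sources not yet fetched:
     */
    .pipe(filter(function (file){
      return !file.data.fetchedContent;
    }))

    /**
     * Stamp each selected data source with the batch it belongs to, so
     * that fetch(batchId) can pick it up:
     */
    .pipe(es.map(function (file, cb){
      file.data.generateStatus = { batchId: batchId };
      cb(null, file);
    }))

    .pipe(crawlBase.dest());
};
```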
# Troubleshooting
To inspect what's flowing through a pipeline, a pass-through map can be
dropped between any two steps to log each file as it goes by:

```js
.pipe(es.map(function (file, cb){
  console.log('got:', file);
  cb(null, file);
}))
```
# parse
Two parts of parse have been lost temporarily whilst we're only processing non-HTML files; this will be fixed once we can detect the file type as it passes through the pipeline.
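One way the detection might eventually work is to branch on the Content-Type
recorded at fetch time; a sketch, assuming fetch stores the response headers
on file.data.fetchedContent:

```js
var isHtml = function (file){
  var headers = file.data.fetchedContent.headers || {};
  return (headers['content-type'] || '').indexOf('text/html') !== -1;
};
```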
The first lost part is the HTML parsing stuff:
```js
var cheerio = require('cheerio');
var url = require('url');

/**
 * Also assumed in scope from the rest of the module: es (event-stream),
 * filter (e.g. gulp-filter) and the local ParseState helper.
 */

/**
 * parse: Parse content from fetched pages:
 *
 * See:
 *
 *   https://wiki.apache.org/nutch/Nutch2Crawling#Parse
 */
var parse = function (crawlBase, customParser, customParseChanged){
  var taskName = 'parse';
  return crawlBase.src()

    /**
     * Only process data sources that have been fetched, and not parsed:
     */
    .pipe(filter(function (file){
      return (file.data.fetchedContent.status === 200) &&
        (!file.data.parseStatus ||
          file.data.parseStatus.state === ParseState.NOTPARSED);
    }))

    /**
     * Parse the fetched content, extracting the title and the outlinks:
     */
    .pipe(es.map(function (file, cb){
      var $ = cheerio.load(file.data.fetchedContent.content);
      file.data.parse = {
        title: $('title').text().trim(),
        outlist: $('a').map(function (){
          var title;
          var _s;
          if ($(this).attr('title')){
            _s = '@title';
            title = $(this).attr('title');
          } else {
            title = $(this).text().trim();
            _s = 'text';
            if (title === ''){
              /**
               * Check .length: a cheerio selection is always truthy:
               */
              if ($('img', this).length){
                _s = 'img[@alt]';
                title = $('img', this).attr('alt');
              }
            }
          }
          return {
            url: url.resolve(file.data.url, $(this).attr('href') || ''),
            _s: _s,
            title: title
          };
        }).get()
      };
      file.data.parseStatus = new ParseState(ParseState.SUCCESS);
      cb(null, file);
    }))
```
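Each entry in the resulting outlist records where its title came from in
`_s`; roughly:

```js
// Illustrative shape of one outlist entry:
var entry = {
  url: 'http://example.com/about',  // resolved against the page url
  _s: '@title',                     // one of '@title' | 'text' | 'img[@alt]'
  title: 'About us'
};
```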
Then there's the custom parser stuff:

```js
    /**
     * Call any custom parser:
     */
    .pipe(es.map(function (file, cb){
      if (customParser){
        var customParse = customParser(file.data.fetchedContent.content);
        if (customParse){

          /**
           * Use the provided function to check if the custom parse value
           * has changed:
           */
          if (customParseChanged){
            file.data.customParseChanged =
              customParseChanged(customParse, file.data.customParse);
          }

          /**
           * Make sure to only overwrite the old value after it has been
           * used to check for changes:
           */
          file.data.customParse = customParse;
          file.data.customParseStatus = new ParseState(ParseState.SUCCESS);
        }
      }
      cb(null, file);
    }))
```
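For example, a custom parser and changed-check might be supplied like this
(hypothetical functions; note the changed-check returns the new value rather
than a boolean, since downstream code uses file.data.customParseChanged
directly):

```js
var customParser = function (content){
  // Hypothetical: pull JSON-LD blocks out of the raw content.
  var $ = cheerio.load(content);
  return $('script[type="application/ld+json"]').map(function (){
    return JSON.parse($(this).html());
  }).get();
};

var customParseChanged = function (newParse, oldParse){
  // Return the new value when it differs, so it can be used downstream:
  return JSON.stringify(newParse) !== JSON.stringify(oldParse) ?
    newParse : null;
};

parse(crawlBase, customParser, customParseChanged);
```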
The old slug calculation code:

```js
/**
 * Reconstructed wrapper: the original fragment starts mid-way through an
 * es.map step, matching the closing `}))` below. Assumes lodash (_) with
 * mustache-style _.templateSettings.interpolate (for the {{…}} syntax),
 * and customExtractor in scope:
 */
.pipe(es.map(function (file, next){
  var slugTemplate;
  if (file.data.meta) {
    slugTemplate = file.data.meta['slug.template'];
  }
  if (!slugTemplate) {
    slugTemplate = '{{events[0].source}}';
  }
  slugTemplate = _.template(slugTemplate);
  file.data.extracted = customExtractor(
    file.data.customParseChanged || file.data.customParse)
    .filter(function (obj){
      var params = {};
      if (!obj || !obj.events){
        return false;
      }
      obj.url = file.data.url;
      params.summary = obj.summary || obj.events[0].summary;
      if (obj.events[0].timex3.date) {
        params.year = obj.events[0].timex3.date.year;
        params.month = obj.events[0].timex3.date.month;
      } else if (obj.events[0].timex3.range) {
        params.year = obj.events[0].timex3.range.from.date.year;
        params.month = obj.events[0].timex3.range.from.date.month;
      }
      try {
        obj.slug = slugTemplate(params).replace(/ /g, '-');
        return true;
      } catch (e) {
        console.error('Template processing failed:', e,
          JSON.stringify(obj));
      }
      return false;
    });
  next(null, file);
}))
```
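For instance, with slug.template set to '{{summary}}-{{year}}-{{month}}', the
filter above would produce slugs like:

```js
// Hypothetical illustration of the templating above:
_.templateSettings.interpolate = /{{([\s\S]+?)}}/g;
var tmpl = _.template('{{summary}}-{{year}}-{{month}}');
tmpl({ summary: 'village fete', year: 2015, month: 3 }).replace(/ /g, '-');
// -> 'village-fete-2015-3'
```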