UNPKG

node-nutch

Version:

A set of Gulp commands that provide similar functionality to Apache Nutch.

71 lines (63 loc) 1.92 kB
const h = require('highland'); var Buffer = require('buffer').Buffer; var fs = require('fs'); var path = require('path'); var es = require('event-stream'); var lazypipe = require('lazypipe'); var gulp = require('gulp'); var gutil = require('gulp-util'); var CrawlState = require('./crawlState'); var config = require('../config/config'); /** * Create a pipeline to update the crawl database with any changes to an entry: */ module.exports = function(target) { /** * If there is no output target specified then use Gulp's default: */ if (!target) { target = gulp; gulp.exists = fs.exists; } return { crawlState: function(t, uri, meta){ var file = new gutil.File({ base: config.dir.CrawlBase }); file.data = { crawlState: new CrawlState(t), meta: meta }; file.data.url = uri; return file; }, dest: lazypipe() .pipe(es.map, function (file, cb){ file.contents = new Buffer(JSON.stringify( file.data )); file.contentType = 'application/json'; file.base = config.dir.CrawlBase; file.path = config.dir.CrawlBase + path.sep + encodeURIComponent(file.data.url) + path.sep + 'status'; cb(null, file); }) .pipe(target.dest, config.dir.CrawlBase), exists: function(uri, cb){ target.exists(config.dir.CrawlBase + path.sep + encodeURIComponent(uri), cb); }, filesDest: function(){ return target.dest(config.dir.CrawlBase); }, filesSrc: function(uri, dir){ return target.src(config.dir.CrawlBase + path.sep + encodeURIComponent(uri) + path.sep + dir); }, src: function (){ return h(target.src(config.dir.CrawlBase + '/*/status')) .map(function (file){ file.data = JSON.parse(file.contents.toString()); file.path = file.data.url; return file; }); } }; };