node-nutch
A set of Gulp commands that provide similar functionality to Apache Nutch.
JavaScript
var path = require('path');
var BasicURLNormalizer = require('../plugins/basicURLNormalizer');

var dir = {
  root: 'crawl'
};

/**
 * CrawlBase: A list of all URLs we know about with their status:
 */
dir.CrawlBase = process.env.CRAWLBASE || (dir.root + path.sep + 'CrawlBase');

/**
 * seeds: A set of text files each of which contains URLs:
 */
dir.seeds = dir.root + path.sep + 'seeds';

var config = {
  dir: dir,
  db: {
    fetch: {
      interval: {
        /**
         * The default number of seconds between re-fetches of a page (30 days):
         */
        default: 2592000,

        /**
         * The maximum number of seconds between re-fetches of a page (90 days).
         * After this period every page in the db will be re-tried, regardless
         * of its status:
         */
        max: 7776000
      },
      retry: {
        /**
         * The maximum number of times a URL that has encountered
         * recoverable errors will be generated for fetch:
         */
        max: 3
      }
    },
    ignore: {
      external: {
        /**
         * If true, outlinks leading from a page to external hosts
         * will be ignored. This is an effective way to limit the
         * crawl to the initially injected hosts, without creating
         * complex URLFilters:
         */
        links: true
      }
    },
    update: {
      additions: {
        /**
         * If true, updatedb will add newly discovered URLs; if false,
         * only URLs already in the CrawlBase will be updated and no
         * new URLs will be added:
         */
        allowed: false
      }
    }
  },

  /**
   * TODO: Allow options to be overridden in the app that is using node-nutch.
   */
  elastic: {
    host: process.env.ELASTICSEARCH_HOST,
    /**
     * Request timeout, in milliseconds (two minutes):
     */
    requestTimeout: 120000
  },
  urlnormalizer: {
    /**
     * Order in which normalizers will run. If any of these isn't
     * activated it will be silently skipped. If other normalizers
     * not on the list are activated, they will run in random order
     * after the ones specified here are run:
     */
    order: [BasicURLNormalizer]
  }
};

module.exports = config;
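
Two of the settings above are environment-driven: the CrawlBase directory (CRAWLBASE) and the Elasticsearch host (ELASTICSEARCH_HOST). Both are read when the module is first required, so a consuming app can set them beforehand. A minimal sketch, assuming the file is reachable at './config' (that require path is illustrative, not a documented entry point):

// Set the environment overrides before the config module is loaded:
process.env.CRAWLBASE = '/data/crawl/CrawlBase';
process.env.ELASTICSEARCH_HOST = 'http://localhost:9200';

var config = require('./config');

console.log(config.dir.CrawlBase);  // '/data/crawl/CrawlBase'
console.log(config.elastic.host);   // 'http://localhost:9200'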
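
Until the TODO above is implemented, an app can still adjust the remaining options by mutating the exported object: Node caches modules, so changes made after the first require are seen by every later require in the same process. A sketch under the same assumed require path:

var config = require('./config');

// The module cache makes this object shared process-wide, so these
// tweaks apply everywhere the config is subsequently required:
config.db.fetch.interval.default = 86400;   // re-fetch daily instead of every 30 days
config.db.update.additions.allowed = true;  // let updatedb add newly discovered URLs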