bot-marvin
Version:
Highly scalable crawler with best features.
102 lines (100 loc) • 3.5 kB
JavaScript
var JSONX = require("../lib/JSONX.js");
/**
 * Default crawler configuration.
 *
 * Kept JSON-serializable on purpose: load() round-trips this object through
 * JSON.stringify before handing it to JSONX.parse. Values that start with
 * "__REGEXP " are regular expressions written as strings — presumably revived
 * into RegExp objects by JSONX.parse (confirm in ../lib/JSONX.js).
 *
 * NOTE(review): timeout/delay values look like milliseconds and size limits
 * look like bytes (10485760 = 10 * 1024 * 1024) — confirm against consumers.
 */
var config = {
  // --- process / cluster settings ---
  "robot_agent": "*",
  "childs": 2,
  "child_timeout": 1800000,
  "verbose": true,
  "logging": true,
  "parse_sitemaps": true,
  "sitemap_parser_timeout": 60000,
  "env": "/usr/local/bin/node",
  "text_editor": "nano",
  "web_graph": true,
  "retry_times_failed_pages": 3,
  "failed_queue_size": 100,
  "inlink_cache_size": 5000,
  "network_interface": "eth0",
  "network_host": "127.0.0.1",
  // NOTE(review): network_port is a string while cluster_port is a number;
  // left untouched because consumers may depend on the current types.
  "network_port": "2020",
  "cluster_port": 5555,

  // --- HTTP fetch settings for regular pages ---
  "http": {
    "http_proxy": "",
    "https_proxy": "",
    "callback_timeout": 20000,
    "timeout": 10000,
    "max_content_length": 10485760,
    "follow_redirect": true,
    "max_sockets_per_host": 10,
    "max_concurrent_sockets": 10,
    "delay_request_same_host": 3000,
    "accepted_mime_types": ["text/html", "application/xhtml+xml", "text/plain", "text/xhtml"].sort(function (a, b) {
      // Keep the original listing order: html, plain, xhtml+xml, xhtml.
      var order = ["text/html", "text/plain", "application/xhtml+xml", "text/xhtml"];
      return order.indexOf(a) - order.indexOf(b);
    }),
    "headers": {
      "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0",
      // Media ranges must be comma-separated; the list mirrors
      // accepted_mime_types above.
      "Accept": "text/html,application/xhtml+xml,text/plain,text/xhtml;q=0.9",
      "Accept-Language": "en-US,en;q=0.5",
      "Accept-Encoding": "gzip, deflate",
      "Connection": "keep-alive"
    },
    "html_lang_regex": "^en(-.*|)"
  },

  "log_buffer_lines": 100,

  // Each value equals the named period in milliseconds
  // (daily = 24h, weekly = 7 days, monthly = 30 days, yearly = 365 days).
  "recrawl_intervals": {
    "always": 0,
    "monthly": 2592000000,
    "daily": 86400000,
    "weekly": 604800000,
    "yearly": 31536000000
  },
  // Must be one of the keys of recrawl_intervals.
  "default_recrawl_interval": "monthly",
  "override_recrawl_interval_of_sitesmap_file": false,

  // --- Tika settings (document file types: doc/ppt/pdf) ---
  "tika_host": "127.0.0.1",
  "tika_port": "9998",
  "tika": true,
  "tika_debug": true,
  "tika_supported_files": "__REGEXP /\\.(ppt|doc|pdf|docx|pptx)$/gi",
  "tika_supported_mime": [
    "application/msword",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/pdf",
    "application/x-pdf",
    "application/acrobat",
    "applications/vnd.pdf",
    "text/pdf",
    "text/x-pdf"
  ],
  "tika_batch_size": 5,
  "tika_content_length": 104857600,
  "tika_timeout": 300000,
  "tika_max_sockets_per_host": 10,
  "tika_headers": {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0",
    "Accept": "application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.ms-powerpoint,application/vnd.openxmlformats-officedocument.presentationml.presentation,application/pdf,application/x-pdf,application/acrobat,applications/vnd.pdf,text/pdf,text/x-pdf;q=0.9",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive"
  },

  // --- robots.txt, link selection and storage ---
  "allow_robots": true,
  "robots_parser_threads": 500,
  "external_links": false,
  "batch_size": 100,
  "db_type": "mongodb",

  // HTML tag names; presumably stripped from pages before text extraction
  // (verify against the parser that reads this list).
  "remove_tags": [
    "table",
    "style",
    "script",
    "noscript",
    "img",
    "form",
    "input",
    "iframe",
    "header",
    "footer",
    "button",
    "pre",
    "br",
    "code",
    "select",
    "option",
    "nav"
  ],

  // URL patterns in the "__REGEXP " string form described above.
  "accept_regex": "__REGEXP /http(|s):\\/\\/archive\\.org\\/details\\/.+|http(|s):\\/\\/archive\\.org\\/details\\/texts\\?&sort=-downloads&page=.+/gi",
  "reject_regex": [
    "__REGEXP /^(file|ftp|mailto|javascript|javascrpt|skype|whatsapp|tel):/g",
    "__REGEXP /\\.(gif|jpg|png|ico|css|sit|eps|wmf|zip|ppt|doc|mpg|xls|gz|rpm|tgz|mov|exe|jpeg|bmp|js|swf|pdf|xlsx)$/gi"
  ]
}
/**
 * Produce a configuration object from the module-level defaults.
 *
 * The defaults are first serialized to JSON text and that text is then parsed
 * with JSONX.parse, so every call works from a fresh serialized snapshot of
 * `config`.
 * NOTE(review): JSONX.parse presumably turns the "__REGEXP " strings into
 * RegExp objects — confirm in ../lib/JSONX.js.
 *
 * @returns {*} whatever JSONX.parse returns for the serialized defaults.
 */
function load() {
  var serializedDefaults = JSON.stringify(config);
  var parsedDefaults = JSONX.parse(serializedDefaults);
  return parsedDefaults;
}

exports.load = load;