UNPKG

bot-marvin

Version:

Highly scalable crawler with best features.

135 lines (85 loc) 4.36 kB
/*
 * Unit tests for the URL helper exported by lib/url.js.
 *
 * The suite builds a Message object carrying a config accessor, a links
 * store and the accept/reject URL filters, hands it to the URL constructor,
 * and then checks the `details` object produced by `url_obj.url(...)`.
 * It relies on the global `describe`/`it` functions supplied by the test
 * runner (e.g. Mocha).
 */
var assert = require('chai').assert;
var URL = require("../../lib/url.js");
var Message = require("../../lib/message.js");
var JSONX = require("../../lib/JSONX.js");
var conf = require("../../config/config.js").load();
var check = require('check-types');

process.RUN_ENV = "TEST";

var message_obj = new Message();

var config = {};

/**
 * Test stand-in for the config accessor stored on the message object.
 *
 * Called with no (assigned) first argument it returns the whole loaded
 * config; otherwise each argument is used as the next key while walking
 * down into the config, and the value reached is returned.
 *
 * @returns {*} the full config object or the value found at the key path
 */
config.getConfig = function () {
    var val = conf;
    if (!check.assigned(arguments[0])) {
        return conf;
    }
    for (var i = 0; i < arguments.length; i++) {
        val = val[arguments[i]];
    }
    return val;
};

message_obj.set('config', config);

// The "isAccepted" test below expects /a/b/c to be accepted and /a/b/c/d to
// be rejected for this domain entry.
message_obj.set('links_store', {"http://www.google.com": {"limit_depth": 3}});

var regex_urlfilter = {};
regex_urlfilter["accept"] = config.getConfig("accept_regex");
regex_urlfilter["reject"] = config.getConfig("reject_regex");
message_obj.set('regex_urlfilter', regex_urlfilter);

var url_obj = new URL(message_obj);

describe('Testing URL', function () {

    it('getFileType -> webpage', function () {
        var u = url_obj.url("http://www.google.com");
        assert.equal(u.details.file_type, 'webpage');
    });

    it('getFileType -> file', function () {
        var u = url_obj.url("http://www.google.com/tilak.docx");
        assert.equal(u.details.file_type, 'file');
    });

    it('extractDomain', function () {
        var fileUrl = url_obj.url("http://www.google.com/tilak.docx");
        assert.equal(fileUrl.details.domain, 'http://www.google.com');

        var pageUrl = url_obj.url("http://www.google.com/tilak");
        assert.equal(pageUrl.details.domain, 'http://www.google.com');

        // relative url with no domain or parent info will be rejected
        var relativeUrl = url_obj.url("/tilak");
        assert.equal(relativeUrl.details.accepted, false);
        assert.equal(relativeUrl.details.domain, undefined);
    });

    it("normalize protocol", function () {
        var httpsUrl = url_obj.url("https://www.google.com/tilak.docx");
        assert.equal(httpsUrl.details.url, "http://www.google.com/tilak.docx");

        var httpUrl = url_obj.url("http://www.google.com/tilak.docx");
        assert.equal(httpUrl.details.url, "http://www.google.com/tilak.docx");

        // no protocol present: rejected
        var bareUrl = url_obj.url("www.google.com/tilak.docx");
        assert.equal(bareUrl.details.accepted, false);
        assert.equal(bareUrl.details.url, "http://www.google.com/tilak.docx");
    });

    it("sorted params", function () {
        var plain = url_obj.url("https://www.google.com/tilak.docx?z=20&a=5");
        assert.equal(plain.details.url, "http://www.google.com/tilak.docx?a=5&z=20");

        var withSpace = url_obj.url("https://www.google.com/tilak.docx?z=20&a=tilak patidar");
        assert.equal(withSpace.details.url, "http://www.google.com/tilak.docx?a=tilak%20patidar&z=20");
    });

    it("normalizeURL", function () {
        var withFragment = url_obj.url("http://www.google.com/tilak.docx/#home");
        assert.equal(withFragment.details.url, "http://www.google.com/tilak.docx");

        var withTrailingSlash = url_obj.url("http://www.google.com/tilak.docx/");
        assert.equal(withTrailingSlash.details.url, "http://www.google.com/tilak.docx");
    });

    it("nutchStyleUrl", function () {
        var u = url_obj.url("http://www.google.com/tilak.docx");
        assert.equal(u.details.nutch_key, "com.google.www:http/tilak.docx");
    });

    it("isAccepted", function () {
        var otherDomain = url_obj.url("http://www.yahoo.com/tilak.docx", "http://www.google.com");
        assert.equal(otherDomain.details.accepted, false, "different domain and url check failed");

        var acceptedFile = url_obj.url("http://www.google.com/tilak.docx", "http://www.google.com");
        assert.equal(acceptedFile.details.accepted, true, "tika accepted regex check failed");

        var rejectedFile = url_obj.url("http://www.google.com/tilak.gif", "http://www.google.com");
        assert.equal(rejectedFile.details.accepted, false, "tika reject regex check failed");

        var atDepthLimit = url_obj.url("http://www.google.com/a/b/c", "http://www.google.com");
        assert.equal(atDepthLimit.details.accepted, true, "limit depth till equal value failed");

        // limit_depth check
        var pastDepthLimit = url_obj.url("http://www.google.com/a/b/c/d", "http://www.google.com");
        assert.equal(pastDepthLimit.details.accepted, false, "limit depth check failed");
    });
});