UNPKG

lx-scan

Version:

License eXtension to find 5-tuples for all installed packages: name, version, project home page, license (e.g. Apache v2, BSD) and required notice. It includes a GUI to edit the information for each package and to enter missing information where necessary.

89 lines (76 loc) 2.73 kB
var scraper = require("./scraper_proto");
var url = require("url");
var utils = require("scanner_utils");
var path = require("path");
var cheerio = require("cheerio");

var copyright_regex = /copyright/ig;

/**
 * Keeps only the anchors whose `href` attribute matches `regex`.
 *
 * @param {Object} elements - cheerio selection of anchor elements.
 * @param {RegExp} regex - pattern tested against each element's href.
 * @returns {Object} cheerio selection containing only the matching elements.
 */
var filter_by_regex = function (elements, regex) {
  return elements.filter(function (index, element) {
    var href = element.attribs && element.attribs.href;
    // Anchors without an href (e.g. named anchors) can never match; skipping
    // them avoids a TypeError from calling .match on undefined.
    return typeof href === "string" && href.match(regex) !== null;
  });
};

/**
 * Builds an absolute link from the browse root and an anchor element by
 * appending the second "/"-separated segment of the anchor's href to `uri`.
 *
 * @param {string} uri - browse root the link is resolved against.
 * @param {Object} link - anchor element exposing `attribs.href`.
 * @returns {string} `uri + "/" + <second href segment>`.
 */
var link_parse = function (uri, link) {
  var href = link.attribs.href;
  return uri + "/" + href.split("/")[1];
};

// In the worst case we have to make 3 page requests; 1 to the index page,
// 1 to the docs folder page, and 1 to a docs/license file. Minimum is 2.
/**
 * Creates the task that scrapes license text for one package.
 *
 * @param {Object} license_object - package record; `repository` is read to
 *   build the browse URL and `licensefile` receives
 *   `{licensepath, text}` entries.
 * @returns {Function} task taking a node-style `callback(error, license_object)`.
 */
var makeRequest = function (license_object) {
  return function (callback) {
    // NOTE(review): the request callbacks below never inspect `error`;
    // presumably utils.default_request handles failures through the
    // callback it is given here -- confirm.
    var default_request = utils.default_request(license_object, callback);

    var uri = license_object.repository;
    if (uri[uri.length - 1] !== "/") {
      uri += "/";
    }
    uri += "source/browse";

    // Fetches `link`, records the text of its ".source" elements and
    // finishes the task.
    var extract_license = function (link) {
      default_request(link, function (error, res, body) {
        // Declared locally so no implicit global `$` is created.
        var $ = cheerio.load(body);
        // Works around the default $(*).text() behavior so that
        // line breaks are included.
        var license_text = $(".source").map(function (index, element) {
          return $(this).text();
        }).get().join("\n");
        license_object.licensefile.push({licensepath: link, text: license_text});
        return callback(null, license_object);
      });
    };

    default_request(uri, function (error, res, body) {
      var $ = cheerio.load(body);
      var link_list = $("a");
      var license_links = filter_by_regex(link_list, /license/i);
      var doc_links = filter_by_regex(link_list, /doc/i);
      var readme_links = filter_by_regex(link_list, /readme/i);

      // Prefer a license file, then a readme, on the index page.
      if (license_links[0]) {
        return extract_license(link_parse(uri, license_links[0]));
      }
      if (readme_links[0]) {
        return extract_license(link_parse(uri, readme_links[0]));
      }
      // Nothing on the index page and no docs folder to fall back on.
      if (!doc_links[0]) {
        return callback(null, license_object);
      }

      default_request(link_parse(uri, doc_links[0]), function (error, res, body) {
        // Search the anchors of the docs page itself; reusing the index
        // page's link list here would make this fallback a no-op.
        var $docs = cheerio.load(body);
        var doc_page_links = $docs("a");
        var doc_license_links = filter_by_regex(doc_page_links, /license/i);
        var doc_readme_links = filter_by_regex(doc_page_links, /readme/i);

        var target = doc_license_links[0] || doc_readme_links[0];
        if (!target) {
          return callback(null, license_object);
        }
        // NOTE(review): the link is rebuilt against the browse root rather
        // than the docs folder, as before -- confirm this matches the
        // hrefs served on the docs page.
        extract_license(link_parse(uri, target));
      });
    });
  };
};

module.exports = new scraper({regex: /code\.google/, parallel_function: makeRequest});