@warren-bank/node-request-cli
An extremely lightweight HTTP request client for the command-line. Supports: http, https, proxy, redirects, cookies, content-encoding, multipart/form-data, multi-threading, recursive website crawling and mirroring.
const path = require('path')
const parse_url = require('url').parse
const resolve_url = require('url').resolve
const {normalize_hostname} = require('../bin/nget/process_argv/web_crawler')
const fp_helper = require('./filepath_helper')
const log = require('./logger')
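// file extensions (lower-case, '' = no extension) that are assumed to contain HTML
// when a URL can only be classified by inspecting its pathname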
const html_exts = ['', '.asp', '.aspx', '.cfm', '.cgi', '.dhtml', '.hta', '.htm', '.html', '.jhtml', '.jsp', '.php', '.php2', '.php3', '.php4', '.php5', '.php6', '.php7', '.php8', '.php9', '.phps', '.pht', '.phtm', '.phtml', '.pl', '.shtml']
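// content_types: regexes used to classify the "Content-Type" response header
// html_content:  [regex, capture-group index] pairs that locate inline CSS within an HTML document
// url_rules:     [regex, capture-group index, is_html] tuples that locate URLs within HTML or CSS text;
//                "is_html" is a boolean, or a function of the match array, indicating whether the matched URL points to an HTML document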
const regex = {
content_types: {
html: /^.*(?:text\/html|application\/xhtml\+xml).*$/,
css: /^.*(?:text\/css).*$/
},
html_content: {
css: [
[
// ex: <style></style>
/<style[^>]*>((?:.|[\r\n])*?)<\/style[^>]*>/ig,
1
],
[
// ex: <tag style=''>
// ex: <tag style="">
/<[a-z]+\s[^>]*style=(['"])(.*?)(?<!\\)\1/ig,
2
]
]
},
url_rules: {
html: [
[
/(href|src)\s*=\s*(['"])(.*?)\2/ig,
3,
matches => {
if (matches[1].toLowerCase() === 'href') {
const url = matches[3]
if (url.endsWith('/')) return true
let filename = path.basename(url).toLowerCase()
let index
index = filename.indexOf('#')
if (index !== -1) return true
index = filename.indexOf('?')
if (index !== -1) {
if (index === 0) return true
filename = filename.substring(0, index)
}
const ext = path.extname(filename)
return html_exts.includes(ext)
}
return false
}
]
],
css: [
[
// ex: @import 'http://example.com/style.css'
// ex: @import "http://example.com/style.css"
/@import\s*(['"])(.*?)\1/ig,
2,
false
],
[
// ex: url(http://example.com/image.jpg)
// ex: url('http://example.com/image.jpg')
// ex: url("http://example.com/image.jpg")
// ex: url("http://example.com/image.jpg")
// ====
// note:
// * using `"` as a delimiter is technically only valid for inline css
// ex: <tag style="background-image: url("http://example.com/image.jpg")">
/url\s*\(\s*((?:['"]|")?)(.*?)\1\s*\)/ig,
2,
false
]
]
}
}
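// regexes with the /g flag are stateful; reset "lastIndex" before each reuse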
const reset_global_regex = (r) => {
if (r instanceof RegExp) {
r.lastIndex = 0
}
return r
}
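// hostname of a URL, normalized by the same helper used during CLI argv processing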
const get_normalized_hostname = (argv_vals, url) => {
const parsed_url = parse_url(url)
return normalize_hostname(argv_vals, parsed_url.hostname)
}
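// strip the filename from a URL and return only the directory portion of its pathname
// (used by the "--no-parent" and "--include/--exclude-directory" checks)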
const get_normalized_pathname = (url) => {
const parsed_url = parse_url(url)
let pathname, parts
pathname = parsed_url.pathname || '/'
parts = pathname.split('/')
// remove [0]: always empty
// remove [last]: fname
parts.shift()
parts.pop()
pathname = '/' + parts.join('/')
return pathname
}
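// middleware applied to every completed download while "--recursive" is enabled:
//  - decides whether the response is crawlable (HTML or CSS, within "--level", etc.)
//  - extracts URLs from the body and prepends them to the download queue ("urls")
//  - optionally rewrites the body so links point at local files ("--convert-links")
//  - returns the (possibly replaced) response and, when needed, a corrected output filepath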
const apply_crawler_middleware = async (argv_vals, urls, {url, redirects, response, urldata}) => {
let should_crawl = argv_vals["--recursive"]
let follow_html = true
let current_depth, was_html, is_html, is_css
if (should_crawl) {
current_depth = (Array.isArray(urldata) && (urldata.length >= 3))
? urldata[2]
: 0
was_html = (Array.isArray(urldata) && (urldata.length >= 4))
? urldata[3]
: undefined
if (current_depth === 0) {
const root_url = (Array.isArray(redirects) && redirects.length && argv_vals["--trust-server-names"])
? redirects[redirects.length - 1]
: url
// stash metadata for root URL of active crawl in current session
argv_vals["--recursive/root"] = {
hostname: get_normalized_hostname(argv_vals, root_url),
pathname: get_normalized_pathname(root_url)
}
}
if (!argv_vals["--page-requisites"] && (current_depth >= argv_vals["--level"])) {
should_crawl = false
}
}
if (should_crawl) {
if (!is_html && (current_depth === 0)) {
is_html = {force: true}
}
if (!is_html && should_force_html(argv_vals, url)) {
is_html = {force: true}
}
if (!is_html && Array.isArray(redirects) && redirects.length && should_force_html(argv_vals, redirects[redirects.length - 1])) {
is_html = {force: true}
}
if (!is_html && response.headers && response.headers['content-type']) {
const content_type = response.headers['content-type'].toLowerCase()
is_html = regex.content_types.html.test(content_type) ? {mime: true} : false
is_css = !is_html && regex.content_types.css.test(content_type)
}
if (!is_html && !is_css) {
should_crawl = false
}
}
if (should_crawl && argv_vals["--page-requisites"]) {
if (current_depth === argv_vals["--level"]) {
follow_html = false
}
if (current_depth > argv_vals["--level"]) {
follow_html = false
should_crawl = !is_html
}
}
if (should_crawl) {
log(Array(41).join('-'))
log('crawling page:', {recursion_depth: current_depth, url, is_html: !!is_html, is_css: !!is_css})
const statusCode = response.statusCode
const headers = response.headers
const {new_urls, new_response} = await crawl(argv_vals, {url, redirects, response, current_depth, follow_html, is_html: !!is_html, is_css: !!is_css})
let new_output_filepath
if (Array.isArray(new_urls) && new_urls.length) {
// prepend new urls so they are downloaded immediately,
// while within the context of: "--recursive/root"
urls.unshift(...new_urls)
}
if (new_response) {
response = new String(new_response)
response.statusCode = statusCode
response.headers = headers
}
if (typeof was_html === 'boolean') {
// IMPORTANT: do not change output filepath! Previously crawled files are already linked.
// Display a notification when the earlier determination of whether the URL contains HTML content, based only on its path, was incorrect:
if (was_html !== !!is_html) {
log(`WARNING: The following URL ${was_html ? 'was' : 'was not'} determined to contain HTML content by inspection of its pathname:`)
log(' ', url)
if (
(!was_html && is_html.mime) ||
( was_html && response.headers && response.headers['content-type'])
) {
log('However the "Content-Type" of the server response says otherwise:')
log(' ', response.headers['content-type'])
}
}
}
else if (is_html && is_html.force) {
new_output_filepath = fp_helper.get_output_filepath(argv_vals, urldata, {url, redirects, response, is_html: true})
}
return {url, redirects, response, new_output_filepath}
}
else {
return {url, redirects, response}
}
}
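// true when the URL matches any "--force-html" regex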
const should_force_html = (argv_vals, url) => {
if (Array.isArray(argv_vals["--force-html"]) && argv_vals["--force-html"].length) {
let _regex
for (let i=0; i < argv_vals["--force-html"].length; i++) {
_regex = argv_vals["--force-html"][i]
if (_regex.test(url)) {
return true
}
}
}
return false
}
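// buffer the response body, scan it for URLs with the HTML and/or CSS rules,
// and resolve to the list of newly discovered URLs plus the (possibly link-rewritten) body text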
const crawl = (argv_vals, {url, redirects, response, current_depth, follow_html, is_html, is_css}) => {
return new Promise((resolve, reject) => {
let data = []
response.on('error', reject)
response.on('data', (chunk) => { data.push(chunk) })
response.on('end', () => {
const text = Buffer.concat(data).toString('utf8')
response.destroy()
data = undefined
resolve(text)
})
})
.then(text => {
const new_urls = []
const replacer = process_new_url.bind(this, argv_vals, new_urls, {url, redirects, current_depth, follow_html})
let regex_url_rules
if (is_html) {
regex_url_rules = regex.url_rules.html
text = process_regex_url_rules(text, regex_url_rules, replacer)
regex_url_rules = regex.url_rules.css
text = process_regex_inline_css_rules(text, regex_url_rules, replacer)
}
if (is_css) {
regex_url_rules = regex.url_rules.css
text = process_regex_url_rules(text, regex_url_rules, replacer)
}
return {new_urls, new_response: text}
})
}
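// apply the CSS url rules to inline CSS embedded in an HTML document:
// <style>...</style> blocks and style="..." attributes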
const process_regex_inline_css_rules = (text, regex_url_rules, replacer) => {
const regex_inline_css_rules = regex.html_content.css
let regex_inline_css_rule
if (Array.isArray(regex_inline_css_rules) && regex_inline_css_rules.length) {
for (let i=0; i < regex_inline_css_rules.length; i++) {
regex_inline_css_rule = regex_inline_css_rules[i]
text = process_regex_inline_css_rule(text, regex_inline_css_rule, regex_url_rules, replacer)
}
}
return text
}
const process_regex_inline_css_rule = (text, regex_inline_css_rule, regex_url_rules, replacer) => {
const _regex = reset_global_regex(regex_inline_css_rule[0])
const match_index = regex_inline_css_rule[1]
return text.replace(_regex, (...matches) => {
const old_match = matches[0]
if ((matches.length > match_index) && matches[match_index]) {
const old_inline_css = matches[match_index]
const index_start = old_match.indexOf(old_inline_css)
const index_end = index_start + old_inline_css.length
if (old_inline_css && (index_start >= 0)) {
const new_inline_css = process_regex_url_rules(old_inline_css, regex_url_rules, replacer)
return old_match.substring(0, index_start) + new_inline_css + old_match.substring(index_end, old_match.length)
}
else {
return old_match
}
}
else {
return old_match
}
})
}
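// run each [regex, capture-group index, is_html] rule over the text; every matched URL
// (except bare "#..." fragments) is passed to "replacer", and its return value is substituted back in when non-empty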
const process_regex_url_rules = (text, regex_url_rules, replacer) => {
let regex_url_rule
if (Array.isArray(regex_url_rules) && regex_url_rules.length) {
for (let i=0; i < regex_url_rules.length; i++) {
regex_url_rule = regex_url_rules[i]
text = process_regex_url_rule(text, regex_url_rule, replacer)
}
}
return text
}
const process_regex_url_rule = (text, regex_url_rule, replacer) => {
const _regex = reset_global_regex(regex_url_rule[0])
const match_index = regex_url_rule[1]
return text.replace(_regex, (...matches) => {
const old_match = matches[0]
if ((matches.length > match_index) && matches[match_index]) {
const old_url = matches[match_index]
// ignore hash-only URLs
if (old_url.startsWith('#')) {
return old_match
}
let is_html = false
if (regex_url_rule.length > 2) {
const _html = regex_url_rule[2]
is_html = (typeof _html === 'function')
? _html(matches)
: !!_html
}
const new_url = replacer(old_url, is_html)
return new_url
? old_match.replace(old_url, new_url)
: old_match
}
else {
return old_match
}
})
}
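// "replacer" callback: resolve the extracted URL against its base, decide whether to enqueue it,
// and return the text to substitute back into the document
// (a relative local path when "--convert-links" is enabled, otherwise the absolute URL)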
const process_new_url = (argv_vals, new_urls, {url, redirects, current_depth, follow_html}, new_url, is_html) => {
if (!new_url) return ''
const base_url = (argv_vals["--base"] && (current_depth === 0))
? argv_vals["--base"]
: url
// resolve relative urls with respect to either:
// - the user-specified "--base" url when the source document is the root url.. from which the web crawl was started
// - the absolute url of the source document (after all redirects have been followed)
new_url = resolve_url(base_url, new_url)
if (!is_html) {
is_html = should_force_html(argv_vals, new_url)
}
const follow_url = (is_html && !follow_html)
? false
: should_follow_url(argv_vals, new_url, is_html)
if (follow_url) {
const urldata = [new_url]
const parent_filepath = fp_helper.get_output_filepath(argv_vals, null, {url, redirects})
const output_filepath = fp_helper.get_output_filepath(argv_vals, urldata, {is_html})
const output_relative_url = get_relative_url(parent_filepath, output_filepath, new_url)
log('following:', {recursion_depth: (current_depth + 1), url: new_url, relative_url: output_relative_url, absolute_filepath: output_filepath})
urldata[1] = output_filepath
urldata[2] = current_depth + 1
urldata[3] = !!is_html
new_urls.push(urldata)
return argv_vals["--convert-links"] ? encodeURI(output_relative_url) : new_url
}
else {
log('not following:', {recursion_depth: (current_depth + 1), url: new_url})
return new_url
}
}
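// POSIX-style relative path from the parent document's output file to the linked file,
// preserving any "#hash" fragment from the original URL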
const get_relative_url = (from_filepath, to_filepath, url) => {
let relative_url = ''
// relative dirname
relative_url += path.relative(
path.dirname(from_filepath),
path.dirname(to_filepath)
)
if (relative_url) {
relative_url += path.sep + path.basename(to_filepath)
}
else {
// same directory
relative_url += path.basename(to_filepath)
}
if (relative_url && (path.sep !== path.posix.sep)) {
relative_url = relative_url.split(path.sep).join(path.posix.sep)
}
if (url) {
const parsed_url = parse_url(url)
if (parsed_url.hash) {
relative_url += parsed_url.hash
}
}
return relative_url
}
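// wget-style filters: same-host and "--no-parent" checks, then directory/host/regex blacklists,
// then directory/host/regex whitelists; when any applicable whitelist is configured, a URL must match at least one of them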
const should_follow_url = (argv_vals, new_url, is_html) => {
const root = argv_vals["--recursive/root"] || {}
const hostname = get_normalized_hostname(argv_vals, new_url)
const pathname = get_normalized_pathname(new_url)
const samehost = (hostname === root.hostname)
let has_blacklist, has_whitelist
if (!samehost && !argv_vals["--span-hosts"]) {
return false
}
if (samehost && argv_vals["--no-parent"] && (pathname.indexOf(root.pathname) !== 0) && (is_html || !argv_vals["--page-requisites"])) {
return false
}
if (samehost && Array.isArray(argv_vals["--exclude-directory"]) && argv_vals["--exclude-directory"].length) {
let _pathname
has_blacklist = true
for (let i=0; i < argv_vals["--exclude-directory"].length; i++) {
_pathname = argv_vals["--exclude-directory"][i]
if (pathname.indexOf(_pathname) === 0) {
return false
}
}
}
if (argv_vals["--span-hosts"] && Array.isArray(argv_vals["--exclude-host"]) && argv_vals["--exclude-host"].length) {
has_blacklist = true
if (argv_vals["--exclude-host"].indexOf(hostname) >= 0) {
return false
}
}
if (Array.isArray(argv_vals["--reject-regex"]) && argv_vals["--reject-regex"].length) {
let _regex
has_blacklist = true
for (let i=0; i < argv_vals["--reject-regex"].length; i++) {
_regex = argv_vals["--reject-regex"][i]
if (_regex.test(new_url)) {
return false
}
}
}
if (samehost && Array.isArray(argv_vals["--include-directory"]) && argv_vals["--include-directory"].length) {
let _pathname
has_whitelist = true
for (let i=0; i < argv_vals["--include-directory"].length; i++) {
_pathname = argv_vals["--include-directory"][i]
if (pathname.indexOf(_pathname) === 0) {
return true
}
}
}
if (argv_vals["--span-hosts"] && Array.isArray(argv_vals["--include-host"]) && argv_vals["--include-host"].length) {
has_whitelist = true
if (argv_vals["--include-host"].indexOf(hostname) >= 0) {
return true
}
}
if (Array.isArray(argv_vals["--accept-regex"]) && argv_vals["--accept-regex"].length) {
let _regex
has_whitelist = true
for (let i=0; i < argv_vals["--accept-regex"].length; i++) {
_regex = argv_vals["--accept-regex"][i]
if (_regex.test(new_url)) {
return true
}
}
}
return !has_whitelist
}
module.exports = {apply_crawler_middleware}
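A minimal usage sketch follows. It assumes the caller hands in Node's http/https response stream together with the parsed CLI options; only the middleware's signature and return shape are taken from the module above, while the require path, fetch loop, and output handling are illustrative assumptions.
const https = require('https')
const {apply_crawler_middleware} = require('./web_crawler')   // require path is an assumption
// simplified fetch: the real client also follows redirects and reports them via "redirects"
const fetch_response = (url) => new Promise((resolve, reject) => {
  https.get(url, (response) => resolve(response)).on('error', reject)
})
// hypothetical download loop; "argv_vals" is the parsed CLI options object, "urls" the work queue
const run = async (argv_vals, urls) => {
  while (urls.length) {
    const urldata  = urls.shift()                // a bare URL string, or [url, filepath, depth, was_html]
    const url      = Array.isArray(urldata) ? urldata[0] : urldata
    const response = await fetch_response(url)
    const result   = await apply_crawler_middleware(argv_vals, urls, {url, redirects: [], response, urldata})
    // when the page was crawled with "--convert-links", "result.response" holds the rewritten body;
    // "result.new_output_filepath" (when present) overrides where the caller should save the file
    console.log('downloaded:', result.url)
  }
}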