UNPKG

jul11co-wdt

Version:

Jul11Co Web Download Tools

644 lines (543 loc) 17.5 kB
// lib/scraper.js var path = require('path'); var urlutil = require('url'); var zlib = require('zlib'); var fs = require('fs'); var request = require('request'); var cheerio = require('cheerio'); var page_scrapers = []; // scraper // { // name: String, // match: function(link, options) {...}, // scrape: function($, page, options) {...} // } exports.addScraper = function(scraper) { // console.log('Add scraper:', scraper.name); page_scrapers.push(scraper); } function urlGetHost(_url) { if (!_url || _url == '') return ''; var host_url = ''; var url_obj = urlutil.parse(_url); if (url_obj.slashes) { host_url = url_obj.protocol + '//' + url_obj.host; } else { host_url = url_obj.protocol + url_obj.host; } return host_url; } exports.urlGetHost = urlGetHost; function isValidLink(link_href) { if (!link_href || link_href === '') return false; if (link_href.indexOf('#') == 0 || link_href.indexOf('mailto:') >= 0 || link_href.indexOf('javascript:') == 0 || link_href.indexOf('data:') == 0) { return false; } return true; } exports.isValidLink = isValidLink; function getUniqueFileName(file_names, file_name) { var result_file_name = file_name; var file_name_ext = path.extname(file_name); var file_name_base = path.basename(file_name, file_name_ext); var collision = false; for (var i = 0; i < file_names.length; i++) { if (file_name == file_names[i].file_name) { collision = true; file_names[i].current_index++; result_file_name = file_name_base + '(' + file_names[i].current_index + ')' + file_name_ext; } } if (!collision) { file_names.push({ file_name: file_name, current_index: 0 }); } return result_file_name; } function isBlacklisted(link_url, blacklist) { var blacklisted = false; for (var i = 0; i < blacklist.length; i++) { if (link_url.indexOf(blacklist[i]) >= 0) { blacklisted = true; break; } } return blacklisted; } function isFilteredOut(link_url, filters) { var filtered_out = true; for (var i = 0; i < filters.length; i++) { if (link_url.indexOf(filters[i]) >= 0) { filtered_out = false; break; } } return filtered_out; } /// exports.fixImages = function($, page_info, options) { options = options || {}; var page_host_url = urlGetHost(page_info.url); var page_host_url_obj = urlutil.parse(page_host_url); var page_url_obj = urlutil.parse(page_info.base_url || page_info.url); $('img').each(function(){ var image_src = $(this).attr('src'); if (image_src && image_src != "") { var image_url = image_src; if (!isValidLink(image_url)) return; if (image_url.indexOf('//') == 0) { image_url = page_host_url_obj.protocol + image_url; } var image_url_obj = urlutil.parse(image_url); if (!image_url_obj.host) { // image_url = urlutil.resolve(page_url_obj, image_url_obj); if (image_url.indexOf('/') == 0) { image_url = urlutil.resolve(page_host_url_obj, image_url_obj); } else { image_url = urlutil.resolve(page_url_obj, image_url_obj); } } else { image_url = urlutil.format(image_url_obj); } if (image_url != image_src) { $(this).attr('src', image_url); } } }); } exports.fixLinks = function($, page_info, options) { options = options || {}; var page_host_url = urlGetHost(page_info.url); var page_host_url_obj = urlutil.parse(page_host_url); var page_url_obj = urlutil.parse(page_info.base_url || page_info.url); $('body a').each(function(){ var link_href = $(this).attr('href'); if (!isValidLink(link_href)) return; var link_url = link_href; link_url = link_url.replace('http:///', '/'); if (link_url.indexOf('//') == 0) { link_url = page_host_url_obj.protocol + link_url; } var link_url_obj = urlutil.parse(link_url); if (!link_url_obj.host) { if (link_url.indexOf('/') == 0) { link_url = urlutil.resolve(page_host_url_obj, link_url_obj); } else { link_url = urlutil.resolve(page_url_obj, link_url_obj); } } else { link_url = urlutil.format(link_url_obj); } if (link_url != link_href) { $(this).attr('href', link_url); } }); } // options // { // blacklist: [String], // visited_links: [String], // filters: [String], // validator: function(link) {...}, // exclude_visited_links: Boolean // } exports.getLinks = function($, page, selector, options) { options = options || {}; var blacklist = options.blacklist || []; var visited_links = options.visited_links || []; var filters = options.filters || []; var isVisited = function(link) { if (visited_links && visited_links.length) { // && Array.isArray(visited_links) return (visited_links.indexOf(link) >= 0); } return false; } var links = []; var page_host_url = urlGetHost(page.url); var page_host_url_obj = urlutil.parse(page_host_url); var page_url_obj = urlutil.parse(page.base_url || page.url); $('' + selector + ' a').each(function(){ var link_href = $(this).attr('href'); if (!isValidLink(link_href)) return; var link_url = link_href; link_url = link_url.replace('http:///', '/'); if (link_url.indexOf('//') == 0) { link_url = page_host_url_obj.protocol + link_url; } var link_url_obj = urlutil.parse(link_url); var link_url_host = link_url_obj.host; if (!link_url_host) { // link_url = urlutil.resolve(page_host_url_obj, link_url_obj); if (link_url.indexOf('/') == 0) { link_url = urlutil.resolve(page_host_url_obj, link_url_obj); } else { link_url = urlutil.resolve(page_url_obj, link_url_obj); } link_url_host = page_host_url_obj.host; } else { link_url = urlutil.format(link_url_obj); } // filter_host if (typeof options.filter_host != 'undefined') { if (link_url_host != options.filter_host) return; } // $(this).attr('href', link_url); link_url = link_url.split('#')[0]; if (link_url == page.url) return; // exclude visited link if (options.exclude_visited_links) { if (isVisited(link_url)) return; } // blacklist if (typeof blacklist != 'undefined' && blacklist.length > 0) { if (isBlacklisted(link_url, blacklist)) return; } // filters if (typeof filters != 'undefined' && filters.length > 0) { if (isFilteredOut(link_url, filters)) return; } if (links.indexOf(link_url) == -1) { if (typeof options.validator == 'function') { if (options.validator(link_url)){ links.push(link_url); } } else { links.push(link_url); } } }); return links; } // options // { // blacklist: [String], // filters: [String] // } exports.getImages = function($, page, selector, options) { options = options || {}; var blacklist = options.blacklist || []; var filters = options.filters || []; var image_urls = []; var image_file_names = []; var images = []; var page_host_url = urlGetHost(page.url); var page_host_url_obj = urlutil.parse(page_host_url); var page_url_obj = urlutil.parse(page.base_url || page.url); $('' + selector + ' img').each(function(){ var image_src = $(this).attr('src'); var image_alt = $(this).attr('alt'); if (image_src && image_src != "") { // if (image_src.indexOf('data:') == 0) return; if (!isValidLink(image_src)) return; var image_url = image_src; if (image_url.indexOf('//') == 0) { image_url = page_host_url_obj.protocol + image_url; } var image_url_obj = urlutil.parse(image_url); if (!image_url_obj.host) { // image_url = urlutil.resolve(page_host_url_obj, image_url_obj); if (image_url.indexOf('/') == 0) { image_url = urlutil.resolve(page_host_url_obj, image_url_obj); } else { image_url = urlutil.resolve(page_url_obj, image_url_obj); } } else { image_url = urlutil.format(image_url_obj); } if (image_urls.indexOf(image_url) >= 0) return; image_urls.push(image_url); // blacklist if (typeof blacklist != 'undefined' && blacklist.length > 0) { if (isBlacklisted(image_url, blacklist)) return; } // filters if (typeof filters != 'undefined' && filters.length > 0) { if (isFilteredOut(image_url, filters)) return; } var image_info = { src: image_url, file: getUniqueFileName(image_file_names, path.basename(image_url_obj.pathname)) }; if (image_alt && image_alt != '') image_info.alt = image_alt; images.push(image_info); } }); return images; } exports.extractLinks = function($, page, options) { var links = []; var link_urls = []; var blacklist = options.link_blacklist; var filters = options.link_filters; $('body a').each(function(){ var link_href = $(this).attr('href'); if (!isValidLink(link_href)) return; var link_url = link_href.split('#')[0]; // blacklist if (typeof blacklist != 'undefined' && blacklist.length) { if (isBlacklisted(link_url, blacklist)) return; } // filters if (typeof filters != 'undefined' && filters.length) { if (isFilteredOut(link_url, filters)) return; } if (link_urls.indexOf(link_url) == -1) { link_urls.push(link_url); links.push({ url: link_url, title: $(this).text().trim() }); } }); return links; } exports.extractImages = function($, page, options) { var images = []; var image_urls = []; var blacklist = options.link_blacklist; var filters = options.link_filters; $('img').each(function(){ var image_src = $(this).attr('src'); if (!isValidLink(image_src)) return; // blacklist if (typeof blacklist != 'undefined' && blacklist.length) { if (isBlacklisted(image_src, blacklist)) return; } // filters if (typeof filters != 'undefined' && filters.length) { if (isFilteredOut(image_src, filters)) return; } if (image_urls.indexOf(image_src) == -1) { image_urls.push(image_src); images.push({ src: image_src, alt: $(this).attr('alt') }); } }); return images; } var requestWithEncoding = function(options, callback) { var req_err = null; var req = null; try { req = request.get(options); } catch (e) { req_err = e; return callback(req_err); } req.on('response', function(res) { var chunks = []; res.on('data', function(chunk) { chunks.push(chunk); }); res.on('end', function() { if (req_err) { return; } var buffer = Buffer.concat(chunks); var encoding = res.headers['content-encoding']; if (encoding == 'gzip') { zlib.gunzip(buffer, function(err, decoded) { callback(err, res, decoded && decoded.toString()); }); } else if (encoding == 'deflate') { zlib.inflate(buffer, function(err, decoded) { callback(err, res, decoded && decoded.toString()); }) } else { callback(null, res, buffer.toString()); } }); }); req.on('error', function(err) { if (!req_err) { req_err = err; callback(err); } }); } var downloadHtml = function(page_url, options, callback) { var default_headers = { 'User-Agent': 'request' }; var default_timeout = 20000; /* 20 seconds */ var default_jar = false; var request_options = { url: page_url, jar: options.request_jar || default_jar, headers: options.request_headers || default_headers, timeout: options.request_timeout || default_timeout }; return requestWithEncoding(request_options, function(err, response, html) { return callback(err, html, response); }); } exports.extractMeta = function($, page, options) { var meta = {}; if ($('meta').length) { $('meta').each(function() { var meta_name = $(this).attr('name'); if (meta_name) { if (meta[meta_name]) { if (Array.isArray(meta[meta_name])) { meta[meta_name].push($(this).attr('content')) } else { meta[meta_name] = [ meta[meta_name], $(this).attr('content') ]; } } else { meta[meta_name] = $(this).attr('content'); } } else { var meta_property = $(this).attr('property'); if (meta_property) { if (meta[meta_property]) { if (Array.isArray(meta[meta_property])) { meta[meta_property].push($(this).attr('content')) } else { meta[meta_property] = [ meta[meta_property], $(this).attr('content') ]; } } else { meta[meta_property] = $(this).attr('content'); } } } }); } return meta; } // Extract page info from URL // // options // { // include_html: Boolean, /* default: false */ // include_html_body: Boolean, /* default: false */ // include_images: Boolean, /* default: false */ // include_links: Boolean, /* default: false */ // include_og: Boolean, /* default: false */ // include_meta: Boolean, /* default: false */ // downloadHtmlFunc: function(page_url, options, callback: function(err, html, response) {...}) // } // // Default page info // { // url: String, // title: String, // description: String, // image: String, // icon: String // } exports.scrape = function(request_url, options, callback) { var downloadHtmlFunc = downloadHtml; if (typeof options.downloadHtmlFunc == 'function') { downloadHtmlFunc = options.downloadHtmlFunc; } var download_url = request_url; if (options.html_proxy && options.html_proxy != '') { download_url = options.html_proxy + '?url=' + encodeURIComponent(request_url); } downloadHtmlFunc(download_url, options, function(err, html, response){ if (err) { return callback(err); } var page_info = { url: request_url }; if (!options.html_proxy || options.html_proxy === '') { // Page URL (real) page_info.url = response ? response.request.href : request_url; } page_info.content_type = response ? response.headers['content-type'] : 'text/html'; if (page_info.content_type && page_info.content_type.indexOf('html') == -1) { if (response && response.headers) console.log(response.headers); return callback(new Error('Not HTML page (' + page_info.content_type + ')')); } var $ = cheerio.load(html); if ($('head base').length) { page_info.base_url = $('head base').attr('href'); } // Page icon var page_icon = $('link[rel="shortcut icon"]').attr('href'); if (!page_icon) { page_icon = $('link[rel="icon"]').attr('href'); } if (page_icon) page_info.icon = page_icon; // Page title page_info.title = $('title').first().text(); if (page_info.title) { page_info.title = page_info.title.replace(/(\r\n|\n|\r)/gm, ''); } // Page description var page_description = $('meta[name*=description]').attr('content'); if (page_description) { page_description = page_description.replace(/(\r\n|\n|\r)/gm, ''); } if (page_description) page_info.description = page_description; // Page HTML if (options.include_html) { page_info.html = html; } // Page body HTML if (options.include_html_body) { page_info.html_body = $.html('body'); } exports.fixImages($, page_info, options); exports.fixLinks($, page_info, options); // Page images if (options.include_images) { page_info.images = exports.extractImages($, page_info, options); } // Page links if (options.include_links) { page_info.links = exports.extractLinks($, page_info, options); } // Body HTML if (options.include_body) { page_info.body = $('body').html(); } if (options.include_meta) { page_info.meta = exports.extractMeta($, page_info, options); } // Open Graph meta tags var og_metadata = { url: $('meta[property="og:url"]').attr('content'), type: $('meta[property="og:type"]').attr('content'), title: $('meta[property="og:title"]').attr('content'), description: $('meta[property="og:description"]').attr('content'), image: $('meta[property="og:image"]').attr('content') }; if (options.include_og) { page_info.og = og_metadata; } if ((!page_info.image || page_info.image == '') && og_metadata.image) { page_info.image = og_metadata.image; } if (page_info.description == '' && og_metadata.description && og_metadata.description != '') { page_info.description = og_metadata.description.replace(/(\r\n|\n|\r)/gm, ''); } if (page_scrapers.length) { var scrapers = []; for (var i = 0; i < page_scrapers.length; i++) { if (page_scrapers[i].match(page_info.url, options)) { scrapers.push(page_scrapers[i]); } } scrapers.forEach(function(scraper) { scraper.scrape($, page_info, options); }); } $ = null; callback(null, page_info); }); }