// jul11co-wdt
// Version:
// Jul11Co Web Download Tools
// 644 lines (543 loc) • 17.5 kB
// JavaScript
// lib/scraper.js
var path = require('path');
var urlutil = require('url');
var zlib = require('zlib');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var page_scrapers = [];
// scraper
// {
// name: String,
// match: function(link, options) {...},
// scrape: function($, page, options) {...}
// }
exports.addScraper = function(scraper) {
// console.log('Add scraper:', scraper.name);
page_scrapers.push(scraper);
}
// Returns the "protocol//host" prefix of a URL (host includes the port when
// one is present), e.g. 'http://example.com:8080/a?b' -> 'http://example.com:8080'.
//
// _url: URL string to inspect.
// Returns a String; '' when _url is empty/missing or has neither protocol
// nor host (for example a relative path).
function urlGetHost(_url) {
  if (!_url || _url == '') return '';
  var url_obj = urlutil.parse(_url);
  // url.parse() reports missing parts as null. Adding two nulls would yield
  // the number 0 rather than a string, so normalise them to '' first.
  var protocol = url_obj.protocol || '';
  var host = url_obj.host || '';
  // Only schemes written with '//' get the slashes back.
  return protocol + (url_obj.slashes ? '//' : '') + host;
}
exports.urlGetHost = urlGetHost;
// Decides whether an href/src value is worth following or resolving.
//
// link_href: attribute value taken from the document (may be undefined).
// Returns false for: non-strings, empty/blank values, pure fragments ('#...'),
// 'javascript:' and 'data:' URLs, and anything containing 'mailto:'.
// Returns true otherwise.
function isValidLink(link_href) {
  if (typeof link_href !== 'string') return false;
  // Compare on a trimmed, lower-cased copy so that values such as
  // 'JavaScript:void(0)' or ' mailto:a@b.c' are rejected as well.
  var href = link_href.trim().toLowerCase();
  if (href === '') return false;
  if (href.indexOf('#') === 0) return false;
  // 'mailto:' is rejected anywhere in the value (not only as a prefix);
  // this keeps the check exactly as strict as it has always been.
  if (href.indexOf('mailto:') >= 0) return false;
  if (href.indexOf('javascript:') === 0) return false;
  if (href.indexOf('data:') === 0) return false;
  return true;
}
exports.isValidLink = isValidLink;
// Returns a file name that has not been handed out before for this registry.
//
// file_names: Array of { file_name: String, current_index: Number } records;
//             it is mutated to remember every name returned.
// file_name:  the desired file name.
// Returns file_name itself on first use, otherwise 'base(N)ext' with the
// smallest counter N that does not clash with a name already in the registry.
function getUniqueFileName(file_names, file_name) {
  var file_name_ext = path.extname(file_name);
  var file_name_base = path.basename(file_name, file_name_ext);

  // Linear lookup of the registry record for a given name.
  function findEntry(name) {
    for (var i = 0; i < file_names.length; i++) {
      if (file_names[i].file_name == name) return file_names[i];
    }
    return null;
  }

  var entry = findEntry(file_name);
  if (!entry) {
    file_names.push({ file_name: file_name, current_index: 0 });
    return file_name;
  }

  // Bump the counter until the candidate is free. Generated names are
  // registered too, so a later literal 'a(1).jpg' cannot collide with the
  // 'a(1).jpg' produced here for a repeated 'a.jpg'.
  var candidate;
  do {
    entry.current_index++;
    candidate = file_name_base + '(' + entry.current_index + ')' + file_name_ext;
  } while (findEntry(candidate));

  file_names.push({ file_name: candidate, current_index: 0 });
  return candidate;
}
// Tells whether link_url contains at least one of the blacklist entries
// as a substring.
//
// link_url:  String to test.
// blacklist: list of substrings.
// Returns true on the first match, false when nothing matches (including
// an empty list).
function isBlacklisted(link_url, blacklist) {
  var pos = 0;
  while (pos < blacklist.length) {
    if (link_url.indexOf(blacklist[pos]) > -1) {
      return true;
    }
    pos += 1;
  }
  return false;
}
// Tells whether link_url fails the whitelist: it is filtered out unless it
// contains at least one of the filter substrings.
//
// link_url: String to test.
// filters:  list of substrings.
// Returns false on the first match, true when nothing matches (so an empty
// list filters everything out).
function isFilteredOut(link_url, filters) {
  var pos = 0;
  while (pos < filters.length) {
    if (link_url.indexOf(filters[pos]) > -1) {
      return false;
    }
    pos += 1;
  }
  return true;
}
///
exports.fixImages = function($, page_info, options) {
options = options || {};
var page_host_url = urlGetHost(page_info.url);
var page_host_url_obj = urlutil.parse(page_host_url);
var page_url_obj = urlutil.parse(page_info.base_url || page_info.url);
$('img').each(function(){
var image_src = $(this).attr('src');
if (image_src && image_src != "") {
var image_url = image_src;
if (!isValidLink(image_url)) return;
if (image_url.indexOf('//') == 0) {
image_url = page_host_url_obj.protocol + image_url;
}
var image_url_obj = urlutil.parse(image_url);
if (!image_url_obj.host) {
// image_url = urlutil.resolve(page_url_obj, image_url_obj);
if (image_url.indexOf('/') == 0) {
image_url = urlutil.resolve(page_host_url_obj, image_url_obj);
} else {
image_url = urlutil.resolve(page_url_obj, image_url_obj);
}
} else {
image_url = urlutil.format(image_url_obj);
}
if (image_url != image_src) {
$(this).attr('src', image_url);
}
}
});
}
exports.fixLinks = function($, page_info, options) {
options = options || {};
var page_host_url = urlGetHost(page_info.url);
var page_host_url_obj = urlutil.parse(page_host_url);
var page_url_obj = urlutil.parse(page_info.base_url || page_info.url);
$('body a').each(function(){
var link_href = $(this).attr('href');
if (!isValidLink(link_href)) return;
var link_url = link_href;
link_url = link_url.replace('http:///', '/');
if (link_url.indexOf('//') == 0) {
link_url = page_host_url_obj.protocol + link_url;
}
var link_url_obj = urlutil.parse(link_url);
if (!link_url_obj.host) {
if (link_url.indexOf('/') == 0) {
link_url = urlutil.resolve(page_host_url_obj, link_url_obj);
} else {
link_url = urlutil.resolve(page_url_obj, link_url_obj);
}
} else {
link_url = urlutil.format(link_url_obj);
}
if (link_url != link_href) {
$(this).attr('href', link_url);
}
});
}
// options
// {
// blacklist: [String],
// visited_links: [String],
// filters: [String],
// validator: function(link) {...},
// exclude_visited_links: Boolean
// }
exports.getLinks = function($, page, selector, options) {
options = options || {};
var blacklist = options.blacklist || [];
var visited_links = options.visited_links || [];
var filters = options.filters || [];
var isVisited = function(link) {
if (visited_links && visited_links.length) { // && Array.isArray(visited_links)
return (visited_links.indexOf(link) >= 0);
}
return false;
}
var links = [];
var page_host_url = urlGetHost(page.url);
var page_host_url_obj = urlutil.parse(page_host_url);
var page_url_obj = urlutil.parse(page.base_url || page.url);
$('' + selector + ' a').each(function(){
var link_href = $(this).attr('href');
if (!isValidLink(link_href)) return;
var link_url = link_href;
link_url = link_url.replace('http:///', '/');
if (link_url.indexOf('//') == 0) {
link_url = page_host_url_obj.protocol + link_url;
}
var link_url_obj = urlutil.parse(link_url);
var link_url_host = link_url_obj.host;
if (!link_url_host) {
// link_url = urlutil.resolve(page_host_url_obj, link_url_obj);
if (link_url.indexOf('/') == 0) {
link_url = urlutil.resolve(page_host_url_obj, link_url_obj);
} else {
link_url = urlutil.resolve(page_url_obj, link_url_obj);
}
link_url_host = page_host_url_obj.host;
} else {
link_url = urlutil.format(link_url_obj);
}
// filter_host
if (typeof options.filter_host != 'undefined') {
if (link_url_host != options.filter_host) return;
}
// $(this).attr('href', link_url);
link_url = link_url.split('#')[0];
if (link_url == page.url) return;
// exclude visited link
if (options.exclude_visited_links) {
if (isVisited(link_url)) return;
}
// blacklist
if (typeof blacklist != 'undefined' && blacklist.length > 0) {
if (isBlacklisted(link_url, blacklist)) return;
}
// filters
if (typeof filters != 'undefined' && filters.length > 0) {
if (isFilteredOut(link_url, filters)) return;
}
if (links.indexOf(link_url) == -1) {
if (typeof options.validator == 'function') {
if (options.validator(link_url)){
links.push(link_url);
}
} else {
links.push(link_url);
}
}
});
return links;
}
// options
// {
// blacklist: [String],
// filters: [String]
// }
exports.getImages = function($, page, selector, options) {
options = options || {};
var blacklist = options.blacklist || [];
var filters = options.filters || [];
var image_urls = [];
var image_file_names = [];
var images = [];
var page_host_url = urlGetHost(page.url);
var page_host_url_obj = urlutil.parse(page_host_url);
var page_url_obj = urlutil.parse(page.base_url || page.url);
$('' + selector + ' img').each(function(){
var image_src = $(this).attr('src');
var image_alt = $(this).attr('alt');
if (image_src && image_src != "") {
// if (image_src.indexOf('data:') == 0) return;
if (!isValidLink(image_src)) return;
var image_url = image_src;
if (image_url.indexOf('//') == 0) {
image_url = page_host_url_obj.protocol + image_url;
}
var image_url_obj = urlutil.parse(image_url);
if (!image_url_obj.host) {
// image_url = urlutil.resolve(page_host_url_obj, image_url_obj);
if (image_url.indexOf('/') == 0) {
image_url = urlutil.resolve(page_host_url_obj, image_url_obj);
} else {
image_url = urlutil.resolve(page_url_obj, image_url_obj);
}
} else {
image_url = urlutil.format(image_url_obj);
}
if (image_urls.indexOf(image_url) >= 0) return;
image_urls.push(image_url);
// blacklist
if (typeof blacklist != 'undefined' && blacklist.length > 0) {
if (isBlacklisted(image_url, blacklist)) return;
}
// filters
if (typeof filters != 'undefined' && filters.length > 0) {
if (isFilteredOut(image_url, filters)) return;
}
var image_info = {
src: image_url,
file: getUniqueFileName(image_file_names, path.basename(image_url_obj.pathname))
};
if (image_alt && image_alt != '') image_info.alt = image_alt;
images.push(image_info);
}
});
return images;
}
exports.extractLinks = function($, page, options) {
var links = [];
var link_urls = [];
var blacklist = options.link_blacklist;
var filters = options.link_filters;
$('body a').each(function(){
var link_href = $(this).attr('href');
if (!isValidLink(link_href)) return;
var link_url = link_href.split('#')[0];
// blacklist
if (typeof blacklist != 'undefined' && blacklist.length) {
if (isBlacklisted(link_url, blacklist)) return;
}
// filters
if (typeof filters != 'undefined' && filters.length) {
if (isFilteredOut(link_url, filters)) return;
}
if (link_urls.indexOf(link_url) == -1) {
link_urls.push(link_url);
links.push({
url: link_url,
title: $(this).text().trim()
});
}
});
return links;
}
exports.extractImages = function($, page, options) {
var images = [];
var image_urls = [];
var blacklist = options.link_blacklist;
var filters = options.link_filters;
$('img').each(function(){
var image_src = $(this).attr('src');
if (!isValidLink(image_src)) return;
// blacklist
if (typeof blacklist != 'undefined' && blacklist.length) {
if (isBlacklisted(image_src, blacklist)) return;
}
// filters
if (typeof filters != 'undefined' && filters.length) {
if (isFilteredOut(image_src, filters)) return;
}
if (image_urls.indexOf(image_src) == -1) {
image_urls.push(image_src);
images.push({
src: image_src,
alt: $(this).attr('alt')
});
}
});
return images;
}
// Performs a GET with `request`, buffers the whole response body and, when
// the response carries a 'gzip' or 'deflate' content-encoding, decompresses
// it before handing it over as a string.
//
// options:  passed to request.get() unchanged.
// callback: function(err, response, body)
//             - on failure: callback(err) (a decompression error is
//               delivered as callback(err, response, undefined))
//             - on success: callback(null, response, body)
//           It is invoked at most once per call.
var requestWithEncoding = function(options, callback) {
  var finished = false;

  // Single exit point: whichever outcome arrives first wins. Later events
  // (an 'error' after the body ended, an 'end' after an error, a
  // decompression result after an error) are ignored, so the caller never
  // sees its callback fire twice.
  function finish() {
    if (finished) return;
    finished = true;
    return callback.apply(null, arguments);
  }

  var req = null;
  try {
    req = request.get(options);
  } catch (e) {
    return finish(e);
  }

  req.on('response', function(res) {
    var chunks = [];
    res.on('data', function(chunk) {
      chunks.push(chunk);
    });
    res.on('end', function() {
      if (finished) {
        return;
      }
      var buffer = Buffer.concat(chunks);
      var encoding = res.headers['content-encoding'];
      if (encoding == 'gzip') {
        zlib.gunzip(buffer, function(err, decoded) {
          finish(err, res, decoded && decoded.toString());
        });
      } else if (encoding == 'deflate') {
        zlib.inflate(buffer, function(err, decoded) {
          finish(err, res, decoded && decoded.toString());
        });
      } else {
        finish(null, res, buffer.toString());
      }
    });
  });

  req.on('error', function(err) {
    finish(err);
  });
};
// Default page downloader: fetches page_url through requestWithEncoding().
//
// page_url: String
// options
// {
//   request_jar: cookie jar handed to `request`,  /* default: false */
//   request_headers: Object,                      /* default: { 'User-Agent': 'request' } */
//   request_timeout: Number (ms)                  /* default: 20000 (20 seconds) */
// }
// callback: function(err, html, response) -- note that html comes before
//           response, unlike requestWithEncoding's own callback.
var downloadHtml = function(page_url, options, callback) {
  var request_options = {
    url: page_url,
    jar: options.request_jar || false,
    headers: options.request_headers || { 'User-Agent': 'request' },
    timeout: options.request_timeout || 20000 /* 20 seconds */
  };

  return requestWithEncoding(request_options, function(err, response, html) {
    return callback(err, html, response);
  });
};
exports.extractMeta = function($, page, options) {
var meta = {};
if ($('meta').length) {
$('meta').each(function() {
var meta_name = $(this).attr('name');
if (meta_name) {
if (meta[meta_name]) {
if (Array.isArray(meta[meta_name])) {
meta[meta_name].push($(this).attr('content'))
} else {
meta[meta_name] = [
meta[meta_name],
$(this).attr('content')
];
}
} else {
meta[meta_name] = $(this).attr('content');
}
} else {
var meta_property = $(this).attr('property');
if (meta_property) {
if (meta[meta_property]) {
if (Array.isArray(meta[meta_property])) {
meta[meta_property].push($(this).attr('content'))
} else {
meta[meta_property] = [
meta[meta_property],
$(this).attr('content')
];
}
} else {
meta[meta_property] = $(this).attr('content');
}
}
}
});
}
return meta;
}
// Extract page info from URL
//
// options
// {
// include_html: Boolean, /* default: false */
// include_html_body: Boolean, /* default: false */
// include_images: Boolean, /* default: false */
// include_links: Boolean, /* default: false */
// include_og: Boolean, /* default: false */
// include_meta: Boolean, /* default: false */
// downloadHtmlFunc: function(page_url, options, callback: function(err, html, response) {...})
// }
//
// Default page info
// {
// url: String,
// title: String,
// description: String,
// image: String,
// icon: String
// }
exports.scrape = function(request_url, options, callback) {
var downloadHtmlFunc = downloadHtml;
if (typeof options.downloadHtmlFunc == 'function') {
downloadHtmlFunc = options.downloadHtmlFunc;
}
var download_url = request_url;
if (options.html_proxy && options.html_proxy != '') {
download_url = options.html_proxy + '?url=' + encodeURIComponent(request_url);
}
downloadHtmlFunc(download_url, options, function(err, html, response){
if (err) {
return callback(err);
}
var page_info = {
url: request_url
};
if (!options.html_proxy || options.html_proxy === '') {
// Page URL (real)
page_info.url = response ? response.request.href : request_url;
}
page_info.content_type = response ? response.headers['content-type'] : 'text/html';
if (page_info.content_type && page_info.content_type.indexOf('html') == -1) {
if (response && response.headers) console.log(response.headers);
return callback(new Error('Not HTML page (' + page_info.content_type + ')'));
}
var $ = cheerio.load(html);
if ($('head base').length) {
page_info.base_url = $('head base').attr('href');
}
// Page icon
var page_icon = $('link[rel="shortcut icon"]').attr('href');
if (!page_icon) {
page_icon = $('link[rel="icon"]').attr('href');
}
if (page_icon) page_info.icon = page_icon;
// Page title
page_info.title = $('title').first().text();
if (page_info.title) {
page_info.title = page_info.title.replace(/(\r\n|\n|\r)/gm, '');
}
// Page description
var page_description = $('meta[name*=description]').attr('content');
if (page_description) {
page_description = page_description.replace(/(\r\n|\n|\r)/gm, '');
}
if (page_description) page_info.description = page_description;
// Page HTML
if (options.include_html) {
page_info.html = html;
}
// Page body HTML
if (options.include_html_body) {
page_info.html_body = $.html('body');
}
exports.fixImages($, page_info, options);
exports.fixLinks($, page_info, options);
// Page images
if (options.include_images) {
page_info.images = exports.extractImages($, page_info, options);
}
// Page links
if (options.include_links) {
page_info.links = exports.extractLinks($, page_info, options);
}
// Body HTML
if (options.include_body) {
page_info.body = $('body').html();
}
if (options.include_meta) {
page_info.meta = exports.extractMeta($, page_info, options);
}
// Open Graph meta tags
var og_metadata = {
url: $('meta[property="og:url"]').attr('content'),
type: $('meta[property="og:type"]').attr('content'),
title: $('meta[property="og:title"]').attr('content'),
description: $('meta[property="og:description"]').attr('content'),
image: $('meta[property="og:image"]').attr('content')
};
if (options.include_og) {
page_info.og = og_metadata;
}
if ((!page_info.image || page_info.image == '') && og_metadata.image) {
page_info.image = og_metadata.image;
}
if (page_info.description == '' && og_metadata.description && og_metadata.description != '') {
page_info.description = og_metadata.description.replace(/(\r\n|\n|\r)/gm, '');
}
if (page_scrapers.length) {
var scrapers = [];
for (var i = 0; i < page_scrapers.length; i++) {
if (page_scrapers[i].match(page_info.url, options)) {
scrapers.push(page_scrapers[i]);
}
}
scrapers.forEach(function(scraper) {
scraper.scrape($, page_info, options);
});
}
$ = null;
callback(null, page_info);
});
}