/*
 * absoluter
 * Version:
 * Get html of website with absolute urls.
 * 236 lines (218 loc) • 9.59 kB
 * JavaScript
 */
var url = require('url');
var jsdom = require('jsdom').jsdom;
var request = require('request');
var Q = require('q');
/**
 * Rewrite relative URLs found in an HTML string as absolute URLs.
 *
 * Based on http://stackoverflow.com/a/7544757. The HTML is processed with
 * regular expressions only (no DOM is built): for every supported location
 * the enclosing tag or <style> block is selected first, and the URL inside it
 * is then replaced by `url.resolve(mainUrl, value)`.
 *
 * Locations targeted: `href`, `src`, `<object data>`, `<applet codebase>`,
 * `<param name=movie value>`, `url(...)` inside `<style>` blocks and inside
 * `style` attributes, and `url=` inside `<meta http-equiv=refresh content>`.
 *
 * @param {String} mainUrl Base URL that relative references are resolved against
 * @param {String} html HTML source to process
 * @returns {String} The HTML with the matched URLs made absolute
 */
function replace_all_rel_by_abs(mainUrl, html){
    // HTML/XML attribute names may not be prefixed by these characters (common
    // attribute chars). This list is not complete, but is sufficient for this
    // function (see http://www.w3.org/TR/REC-xml/#NT-NameChar).
    var att = "[^-a-z0-9:._]";

    // "&" is written as an escape and the character-reference prefixes are
    // assembled from pieces on purpose: when this file passes through a tool
    // that decodes HTML entities, a literal ampersand-hash-zero sequence is
    // turned into U+FFFD and the patterns below silently start matching plain
    // digits (e.g. "47" or "2f") instead of character references.
    var amp = "\x26";
    var decRef = amp + "#0*";   // decimal reference prefix, optional leading zeros
    var hexRef = amp + "#x0*";  // hexadecimal reference prefix, optional leading zeros
    // A reference ends with ";" or, when the ";" is omitted, before a non-digit.
    var entityEnd = "(?:;|(?!\\d))";

    // Pre-built patterns for characters that need RegExp escaping or have
    // extra spellings (white-space / nbsp for " ").
    var ents = {
        " ": "(?:\\s|" + amp + "nbsp;?|" + numericRefs(32) + ")",
        "(": "(?:\\(|" + numericRefs(40) + ")",
        ")": "(?:\\)|" + numericRefs(41) + ")",
        ".": "(?:\\.|" + numericRefs(46) + ")"
    };
    // Cache of per-character patterns built by anyEnity()
    var charMap = {};
    var shorthandCommonUse = ents[" "] + "*"; // Short-hand: optional white-space
    // Matches anything within a tag, skipping over quoted attribute values.
    // Important: must be pre- and postfixed by < and >.
    var any = "(?:[^>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^>]*";
    var slashRE = new RegExp(anyEnity("/"), 'g');
    var dotRE = new RegExp(anyEnity("."), 'g');

    /**
     * Builds the alternation "decimal reference | hexadecimal reference" for
     * one character code (without surrounding group).
     *
     * @param {Number} code Character code
     * @returns {String} RegExp source fragment
     */
    function numericRefs(code){
        return decRef + code + entityEnd + "|" + hexRef + code.toString(16) + entityEnd;
    }

    /**
     * Any Entity - Returns a RE-pattern to deal with HTML entities.
     * @name ae
     * @description Converts a given string into a pattern in which every
     *   character matches either itself or its numeric character reference
     *   (lower- and upper-case variant of the character). Results are cached
     *   in `ents` (whole strings) and `charMap` (single characters).
     * @param {String} string String to convert
     * @returns {String} RegExp source
     */
    function anyEnity(string){
        var all_chars_lowercase = string.toLowerCase();
        if(ents[string]) return ents[string];
        var all_chars_uppercase = string.toUpperCase();
        var RE_res = "";
        for(var i=0; i<string.length; i++){
            var char_lowercase = all_chars_lowercase.charAt(i);
            if(charMap[char_lowercase]){
                RE_res += charMap[char_lowercase];
                continue;
            }
            var char_uppercase = all_chars_uppercase.charAt(i);
            var RE_sub = [char_lowercase, numericRefs(char_lowercase.charCodeAt(0))];
            if(char_lowercase != char_uppercase){
                // The literal upper-case character is covered by the RE
                // ignorecase flag; only its references have to be added.
                RE_sub.push(numericRefs(char_uppercase.charCodeAt(0)));
            }
            RE_sub = "(?:" + RE_sub.join("|") + ")";
            RE_res += (charMap[char_lowercase] = RE_sub);
        }
        return(ents[string] = RE_res);
    }

    /**
     * @name by
     * @description 2nd argument for replace(). Resolves the captured URL
     *   against `mainUrl` and keeps the text around it.
     *
     * @param {String} match Whole match (unused)
     * @param {String} group1 Text preceding the URL
     * @param {String} group2 The URL
     * @param {String} group3 Text following the URL
     * @returns {String} Replacement text
     */
    function by(match, group1, group2, group3){
        group2 = url.resolve(mainUrl, group2);
        return group1 + group2 + group3;
    }

    /**
     * @name by2
     * @description 2nd argument for replace(). Like by(), but first decodes
     *   "/" and "." written as numeric character references.
     *
     * @param {String} match Whole match (unused)
     * @param {String} group1 Text preceding the URL
     * @param {String} group2 The URL
     * @param {String} group3 Text following the URL
     * @returns {String} Replacement text
     */
    function by2(match, group1, group2, group3){
        group2 = group2.replace(slashRE, "/").replace(dotRE, ".");
        group2 = url.resolve(mainUrl, group2);
        return group1 + group2 + group3;
    }

    /**
     * Create Replace - Creates and executes a search-and-replace
     * @name cr
     * @description Selects a HTML element and performs a search-and-replace
     *   on attributes. Updates the enclosing `html` variable in place.
     *   Double-quoted, single-quoted and unquoted values are handled by
     *   re1, re2 and re3 respectively.
     * @param {String|RegExp} selector HTML substring to match
     * @param {String} attribute RegExp-escaped; HTML element attribute to match
     * @param {String} marker Optional RegExp-escaped; marks the prefix
     * @param {String} delimiter Optional RegExp escaped; non-quote delimiters
     * @param {String} end Optional RegExp-escaped; forces the match to end before an occurrence of <end>
     */
    function cr(selector, attribute, marker, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        attribute = att + attribute;
        marker = typeof marker == "string" ? marker : "\\s*=\\s*";
        delimiter = typeof delimiter == "string" ? delimiter : "";
        // With <end> the value group becomes lazy ("?") and <end> is captured
        // as group 3; without it group 3 is an empty capture.
        end = typeof end == "string" ? "?)("+end : ")(";
        var re1 = new RegExp('(' + attribute + marker + '")([^"' + delimiter + ']+' + end + ')', 'gi');
        var re2 = new RegExp("(" + attribute + marker + "')([^'" + delimiter + "]+" + end + ")", 'gi');
        var re3 = new RegExp('(' + attribute + marker + ')([^"\'][^\\s>' + delimiter + ']*' + end + ')', 'gi');
        html = html.replace(selector, function(match){
            return match.replace(re1, by).replace(re2, by).replace(re3, by);
        });
    }

    /**
     * Create Replace Inline - Creates and executes a search-and-replace.
     * @name cri
     * @description Selects an attribute of a HTML element, and performs a
     *   search-and-replace on certain values inside that attribute. Updates
     *   the enclosing `html` variable in place.
     * @param {String|RegExp} selector HTML element to match
     * @param {String} attribute RegExp-escaped; HTML element attribute to match
     * @param {String} front RegExp-escaped; attribute value, prefix to match
     * @param {String} flags Optional RegExp flags, default "gi"
     * @param {String} delimiter Optional RegExp-escaped; non-quote delimiters
     * @param {String} end Optional RegExp-escaped; forces the match to end before an occurrence of <end>
     */
    function cri(selector, attribute, front, flags, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        attribute = att + attribute;
        flags = typeof flags == "string" ? flags : "gi";
        // re1/re2 capture the attribute value (without its closing quote)
        var re1 = new RegExp('(' + attribute + '\\s*=\\s*")([^"]*)', 'gi');
        var re2 = new RegExp("(" + attribute + "\\s*=\\s*')([^']+)", 'gi');
        // NOTE(review): at1/at2 expect a non-quote character directly after
        // <front> and at3 rejects a leading quote, so a quoted value such as
        // url("x") is not rewritten by any of them - confirm this is intended.
        var at1 = new RegExp('(' + front + ')([^"]+)(")', flags);
        var at2 = new RegExp("(" + front + ")([^']+)(')", flags);
        var handleAttr;
        if(typeof delimiter == "string"){
            end = typeof end == "string" ? end : "";
            // NOTE(review): <delimiter> is interpolated inside a character
            // class, so each character of the pattern text is excluded
            // individually rather than the sequence it describes - verify.
            var at3 = new RegExp("(" + front + ")([^\"'][^" + delimiter + "]*" + (end ? "?)(" + end + ")" : ")()"), flags);
            handleAttr = function(match, g1, g2){
                return g1 + g2.replace(at1, by2).replace(at2, by2).replace(at3, by2);
            };
        } else {
            handleAttr = function(match, g1, g2){
                return g1 + g2.replace(at1, by2).replace(at2, by2);
            };
        }
        html = html.replace(selector, function(match){
            return match.replace(re1, handleAttr).replace(re2, handleAttr);
        });
    }

    // <meta http-equiv=refresh content="  ; url= " >
    cri("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+anyEnity("refresh")+"\""+any+">|'"+anyEnity("refresh")+"'"+any+">|"+anyEnity("refresh")+"(?:"+anyEnity(" ")+any+">|>))", "content", anyEnity("url")+shorthandCommonUse+anyEnity("=")+shorthandCommonUse, "i");
    cr("<"+any+att+"href\\s*="+any+">", "href"); // Linked elements
    cr("<"+any+att+"src\\s*="+any+">", "src"); // Embedded elements
    cr("<object"+any+att+"data\\s*="+any+">", "data"); // <object data= >
    cr("<applet"+any+att+"codebase\\s*="+any+">", "codebase"); // <applet codebase= >
    // <param name=movie value= >
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+anyEnity("movie")+"\""+any+">|'"+anyEnity("movie")+"'"+any+">|"+anyEnity("movie")+"(?:"+anyEnity(" ")+any+">|>))", "value");
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "url", "\\s*\\(\\s*", "", "\\s*\\)"); // <style>
    cri("<"+any+att+"style\\s*="+any+">", "style", anyEnity("url")+shorthandCommonUse+anyEnity("(")+shorthandCommonUse, 0, shorthandCommonUse+anyEnity(")"), anyEnity(")")); // < style=" url(...) " >
    return html;
}
/**
 * Fetch a URL with the `request` library, sending a desktop-browser
 * User-Agent header, and expose the outcome as a Q promise.
 *
 * @param {String} urlToScrap URL to fetch
 * @returns {Promise} Resolves with `{response, body}` as handed to the
 *   request callback; rejects with the error reported by `request`
 */
function doRequest(urlToScrap){
    var deferred = Q.defer();
    var options = {
        url: urlToScrap,
        headers: {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36'
        }
    };

    request(options, function onRequestDone(requestError, response, body) {
        if(requestError){
            deferred.reject(requestError);
            return;
        }
        deferred.resolve({response: response, body: body});
    });

    return deferred.promise;
}
/**
 * Parse an HTML string with jsdom and return the serialized content of the
 * resulting document element (`document.documentElement.innerHTML`).
 *
 * @param {String} body Raw HTML
 * @returns {Promise} Resolves with the serialized HTML string; rejects when
 *   jsdom does not hand back a usable window/document
 */
function convertToDOM(body){
    var df = Q.defer();
    jsdom.env(body, function (errors, window) {
        // Without this guard a failed parse throws inside the callback and
        // the promise is never settled.
        if(!window || !window.document || !window.document.documentElement){
            df.reject(errors instanceof Error
                ? errors
                : new Error('jsdom could not build a document' + (errors ? ': ' + errors : '')));
            return;
        }
        try {
            df.resolve(window.document.documentElement.innerHTML);
        } catch (e) {
            df.reject(e);
        } finally {
            // Only the serialized string is needed; release the window.
            if(typeof window.close == "function") window.close();
        }
    });
    return df.promise;
}
/**
 * Download a page and return its HTML with relative URLs made absolute.
 *
 * Behaviour, in order:
 *  - the response carries no `x-frame-options` header: resolves with ''
 *    (presumably such a page can be embedded directly and needs no rewritten
 *    copy - TODO confirm with callers);
 *  - the status code is not 200: rejects with an Error whose message names
 *    the status and URL and whose `statusCode` property holds the status;
 *  - otherwise: the body is normalised through convertToDOM() and passed to
 *    replace_all_rel_by_abs().
 *
 * @param {String} urlToScrap URL of the page to fetch
 * @returns {Promise} Resolves with the rewritten HTML string (or '')
 */
function scrapContent(urlToScrap){
    return doRequest(urlToScrap)
        .then(function(details){
            var response = details.response;

            if(!response.headers['x-frame-options']){
                return '';
            }

            if(Number(response.statusCode) !== 200){
                var failure = new Error('Unexpected HTTP status ' + response.statusCode + ' for ' + urlToScrap);
                failure.statusCode = response.statusCode;
                throw failure;
            }

            // `request` follows redirects; relative URLs belong to the final
            // location, not necessarily to the URL that was asked for.
            var finalUri = response.request && response.request.uri;
            var baseUrl = (finalUri && finalUri.href) || urlToScrap;

            return convertToDOM(details.body)
                .then(function(html){
                    return replace_all_rel_by_abs(baseUrl, html);
                });
        });
}
module.exports = scrapContent;