scrapeasy
Version:
Automated scraping module using patterns generated by the userscript Scrapeasy.
83 lines (82 loc) • 3.72 kB
JavaScript
var request = require("request");
var toSource = require("tosource");
var parsonic = require("parsonic");
var scraper = require("./scraper");
// Value passed to `request` under the 'pool.maxSockets' option key.
var maxSockets = 10;

// Source text of every own property of the scraper module, keyed by property
// name, as produced by toSource. It is handed to parsonic.load so that the
// helpers can be rebuilt with eval inside the parsing callback.
var stringifiedScraper = {};
// forEach rather than filter: this loop exists only for its side effect, so
// there is no reason to build (and discard) a filtered array.
Object.getOwnPropertyNames(scraper).forEach(function(name) {
  stringifiedScraper[name] = toSource(scraper[name]);
});
module.exports = function(url, pattern, callback) {
var results = {};
var options = {
'pool.maxSockets': maxSockets,
url: url,
headers: {
'User-Agent': 'scrapeasy'
}
};
request(options, function(err, res, data) {
if (err) {
callback(err);
} else {
try {
if (res.statusCode !== 200) {
console.log("Status:", res.statusCode);
}
parsonic.load(data, {
pattern: pattern,
scraper: stringifiedScraper
}, function(document, args) {
var pattern = args.pattern;
var elements = {};
var results = {};
var toEval = "";
Object.getOwnPropertyNames(args.scraper).filter(function(f) {
toEval += args.scraper[f];
});
eval(toEval);
var selectors = Object.getOwnPropertyNames(pattern);
var asProperties = {};
elements["*"] = document.querySelectorAll("*");
for (var i = 0; i < selectors.length; i++) {
elements[selectors[i]] = document.querySelectorAll(selectors[i]);
pattern[selectors[i]].filter(function(rule) {
var property = rule.as.split("[n]");
if (!property[1].length) {
results[property[0]] = getValuesAsElements(rule, elements[selectors[i]]);
} else {
if (typeof asProperties[property[0]] === "undefined") {
asProperties[property[0]] = {};
if (typeof results[property[0]] === "undefined") {
results[property[0]] = [];
}
}
if (typeof asProperties[property[0]][selectors[i]] === "undefined") {
asProperties[property[0]][selectors[i]] = [];
}
asProperties[property[0]][selectors[i]].push({
property: property[1],
rule: rule
});
}
});
}
Object.getOwnPropertyNames(asProperties).filter(function(name) {
results[name] = results[name].concat(getValuesAsProperties(asProperties[name], elements));
});
return results;
},
function(result) {
if (typeof result.error !== "undefined") {
callback(result.error);
} else {
callback(false, result);
}
});
} catch (err) {
callback(err, results);
}
}
});
};