honda-moto-scrape
Version:
This tool scrapes data for motorcycles and scooters from the honda.es site and returns it as a JSON object.
241 lines (210 loc) • 6.14 kB
JavaScript
/**
* Created by Aleksandr Volkov on 20/10/16.
*/
const cheerio = require('cheerio');
const request = require('request');
const fs = require('fs');
const _ = require('lodash');
const tmpFileName = 'tmp/honda-tmp.html';
const debug = require('debug')('honda');
const hondaHome = 'http://www.honda.es';
const QueryProcessor = require('scrape-query-processor');
/**
 * Scrapes motorcycle and scooter data from the Honda Spain site.
 *
 * The work is split into stages that can be run separately or chained with
 * scrapeAll(): fetch the overview page, collect the model page links, resolve
 * the link found on each model page (presumably the specification page), and
 * finally parse every specification page into a plain object.
 */
class HondaScraper {
    /**
     * @param cheerio cheerio-compatible function: called with HTML (or an element)
     *        and expected to return an object supporting find/attr/text/eq
     * @param {Object} [params] optional settings
     * @param {number} [params.concurrency=2] handed to the QueryProcessor
     */
    constructor(cheerio, params = {}) {
        this.groupLinks = null;
        this.motosPageLinks = null;
        this.specUrls = null;
        this.vendor = 'honda';
        this.cheerio = cheerio;
        this.workDir = 'tmp';
        this.specsFileName = `${this.workDir}/${this.vendor}/honda-spec-urls.json`;
        this.allMotoURL = 'http://www.honda.es/motorcycles.html';
        this.motoGroups = ['125cc', 'scooter'];
        // `params &&` also tolerates an explicit null, which the default value does not cover
        this.concurrency = (params && params.concurrency) || 2;
        this.queryProcessor = new QueryProcessor({
            vendor: this.vendor,
            homeUrl: hondaHome,
            concurrency: this.concurrency
        });
    }

    /**
     * Fetches a page through the query processor and remembers the result in
     * this.groupLinks for parseHondaData().
     * @param {string} [url] page to fetch; falls back to the all-motos overview page
     * @param {string} [fileName] passed to queryProcessor.getPage
     *        (presumably a local cache file - confirm against scrape-query-processor)
     * @returns {Promise} resolves with the data reported by getPage, rejects with its error
     */
    queryHondaSite(url, fileName = tmpFileName) {
        return new Promise((success, fail) => {
            this.queryProcessor.getPage(url || this.allMotoURL, fileName, (err, data) => {
                if (err) {
                    fail(err);
                    return;
                }
                this.groupLinks = data;
                success(data);
            });
        });
    }

    /**
     * Collects the href of every `.analyticsEvent.model` element and stores the
     * list in this.motosPageLinks for the later scraping stages.
     * @param [rawHTML] HTML to parse; when omitted the page stored by queryHondaSite() is used
     * @returns {Array} the collected href values
     */
    parseHondaData(rawHTML) {
        const html = (rawHTML !== undefined && rawHTML !== null) ? rawHTML : this.groupLinks;
        const modelLinks = this.cheerio(html).find('.analyticsEvent.model');
        const links = _.map(modelLinks, (elem) => this.cheerio(elem).attr('href'));
        this.motosPageLinks = links;
        return links;
    }

    /**
     * Keeps only the urls that contain one of the names in this.motoGroups.
     * The result is ordered group by group; a url matching several groups
     * appears once per matching group.
     * @param motosURL list of urls (a missing list is treated as empty)
     * @returns {Array}
     */
    getSingleMotoPages(motosURL) {
        const urls = Array.from(motosURL || []);
        const filteredURLS = [];
        this.motoGroups.forEach((group) => {
            urls.forEach((url) => {
                // links without an href come through as undefined - skip them instead of throwing
                if (typeof url === 'string' && url.indexOf(group) > -1) {
                    filteredURLS.push(url);
                }
            });
        });
        return filteredURLS;
    }

    /**
     * From the main page we can get only links to review pages, so we need to
     * visit them to get links to spec pages. The resulting list is saved to
     * this.specsFileName and reused on later calls unless doOverwrite is set.
     * The list is also stored in this.specUrls, whichever path produced it.
     * @param {boolean} [doOverwrite=false] scrape again even if a saved list exists
     * @param [maxSpecUrls] is used for testing, to minimize scraping amount
     *        (forwarded as the second argument of queryProcessor.parallelScraping)
     * @returns {Promise} resolves with the array of spec urls; rejects when
     *          scraping fails or the saved file cannot be read or parsed
     */
    getSpecPages(doOverwrite = false, maxSpecUrls) {
        // Work happens inside the executor so that synchronous failures
        // (unreadable file, corrupt JSON) surface as rejections.
        return new Promise((resolve) => {
            if (!doOverwrite) {
                debug('scrapeSpecUrl11111s');
                if (fs.existsSync(this.specsFileName)) {
                    const saved = JSON.parse(this.loadSavedFile(this.specsFileName));
                    this.specUrls = saved;
                    resolve(saved);
                    return;
                }
                debug('scrapeSpecUrls');
            }
            // resolving with a promise adopts its outcome, including a rejection
            resolve(this._scrapeSpecUrls(maxSpecUrls));
        });
    }

    /**
     * Visits every page in this.motosPageLinks and picks the href of the second
     * `nav article a.analyticsEvent` link on each one.
     * @param [maxSpecUrls] forwarded to queryProcessor.parallelScraping
     * @returns {Promise} resolves with the array of hrefs (undefined where no link was found)
     */
    _scrapeSpecUrls(maxSpecUrls) {
        return new Promise((resolve) => {
            resolve(this.queryProcessor.parallelScraping(this.motosPageLinks, maxSpecUrls));
        }).then((pages) => {
            const specUrls = _.map(pages, (page) =>
                this.cheerio(page.page).find('nav article a.analyticsEvent').eq(1).attr('href'));
            this._saveJson(this.specsFileName, specUrls);
            debug('specs', specUrls);
            this.specUrls = specUrls;
            return specUrls;
        });
    }

    /**
     * Best-effort save of data as JSON. A failed write is only reported through
     * debug, because the scraped result is still returned to the caller.
     * @param {string} fileName
     * @param data any JSON-serialisable value
     */
    _saveJson(fileName, data) {
        // fs.writeFile requires a callback on current Node versions
        fs.writeFile(fileName, JSON.stringify(data), (err) => {
            if (err) {
                debug('unable to save file', fileName, err);
            }
        });
    }

    /**
     * Scrapes the specification pages through the query processor and parses
     * each of them. The parsed list is also saved to
     * <workDir>/<vendor>/honda-parsed.json.
     * @param {Array} [urls] pages to scrape; defaults to the list produced by getSpecPages()
     * @returns {Promise} resolves with the array of parsed specifications
     */
    scrapeSpecifications(urls) {
        const targets = Array.isArray(urls) ? urls : this.specUrls;
        return new Promise((resolve) => {
            resolve(this.queryProcessor.parallelScraping(targets));
        }).then((data) => {
            const parsed = [];
            _.each(data, (obj) => {
                try {
                    parsed.push(this.parseSpecification(obj.page, obj.url));
                } catch (err) {
                    // a malformed entry must not abort the remaining pages
                    console.log(obj && obj.url);
                    console.log(err);
                }
            });
            this._saveJson(`${this.workDir}/${this.vendor}/honda-parsed.json`, parsed);
            return parsed;
        }, (err) => {
            console.log(err);
            throw err;
        });
    }

    /**
     * returns all the honda motos and scooters specs
     * Runs every stage in order; a failure in any stage rejects the returned promise.
     * @returns {Promise} resolves with the array of parsed specifications
     */
    scrapeAll() {
        return this.queryHondaSite()
            .then(() => {
                this.parseHondaData();
                return this.getSpecPages(true);
            })
            .then(() => this.scrapeSpecifications())
            .then((data) => {
                debug(data);
                return data;
            }, (err) => {
                debug(err);
                throw err;
            });
    }

    /**
     * getting saved file
     * @param {string} [fileName] file to read; defaults to the temporary html file
     * @returns {string} file content decoded as utf8
     */
    loadSavedFile(fileName = tmpFileName) {
        return fs.readFileSync(fileName, 'utf8');
    }

    /**
     * Parses a single specification page.
     * @param file HTML of the page
     * @param url address of the page, copied into the result
     * @returns {Object} the extracted fields, or just {url} when parsing throws.
     *          powerKW / powerCV are null when the power text lacks that unit;
     *          photo is null when the page has no `.currentChoice img` source.
     */
    parseSpecification(file, url) {
        try {
            const $ = this.cheerio(file);
            const price = $.find('.fullPrice').text();
            const title = $.find('.titles h3').text();
            const specItemList = $.find('.specItemList tr');
            const weight = this._trResult(specItemList, 'peso en orden');
            const power = this._trResult(specItemList, 'potencia máxima');
            const kwMatch = power.match(/(.+)kW/);
            const cvMatch = power.match(/\((.+)cv\)/);
            const powerKW = kwMatch && kwMatch[1];
            const powerCV = cvMatch && cvMatch[1];
            const moment = this._trResult(specItemList, 'par máximo');
            const fuelCapacity = this._trResult(specItemList, 'capacidad de combustible');
            const volume = this._trResult(specItemList, 'cilindrada');
            const photoSrc = $.find('.currentChoice img').attr('src');
            // without the guard a missing image yields the bogus url "http://www.honda.esundefined"
            const photo = photoSrc ? hondaHome + photoSrc : null;
            return {price, title, weight, power, powerKW, powerCV, moment, fuelCapacity, photo, volume, url};
        } catch (err) {
            console.log(url);
            console.log(err);
            return {url};
        }
    }

    /**
     * Finds the first row whose `.title` text contains needle (case-insensitive
     * on the row side; needle must already be lower case) and returns the text
     * of its `.result` element.
     * @param trList rows to search
     * @param {string} needle lower-case fragment of the row title
     * @returns {string} the `.result` text of the matching row
     *          (presumably an empty string when no row matches - depends on the cheerio implementation)
     */
    _trResult(trList, needle) {
        const found = _.find(trList, (tr) =>
            this.cheerio(tr).find('.title').text().toLowerCase().indexOf(needle) > -1);
        return this.cheerio(found).find('.result').text();
    }
}
module.exports = HondaScraper;