UNPKG

honda-moto-scrape

Version:

This tool scrapes motorcycle and scooter data from the honda.es site and returns it as a JSON object.

241 lines (210 loc) 6.14 kB
/**
 * Created by Aleksandr Volkov on 20/10/16.
 */
const cheerio = require('cheerio');
const request = require('request');
const fs = require('fs');
const _ = require('lodash');
const tmpFileName = 'tmp/honda-tmp.html';
const debug = require('debug')('honda');
const hondaHome = 'http://www.honda.es';
const QueryProcessor = require('scrape-query-processor');

/**
 * Scrapes motorcycle and scooter specifications from the Honda Spain site.
 *
 * Typical flow (see scrapeAll): fetch the catalogue page, collect the model
 * page links, visit them to find the specification page links, then scrape
 * and parse every specification page.
 */
class HondaScraper {
    /**
     * @param cheerio function used to wrap HTML / elements; the result must
     *        expose `find()` and `attr()` (the `cheerio` module is expected)
     * @param params optional settings; `params.concurrency` is forwarded to
     *        the QueryProcessor and defaults to 2
     */
    constructor(cheerio, params) {
        this.groupLinks = null;
        this.motosPageLinks = null;
        this.specUrls = null;
        this.vendor = 'honda';
        this.cheerio = cheerio;
        this.workDir = 'tmp';
        this.specsFileName = `${this.workDir}/${this.vendor}/honda-spec-urls.json`;
        this.allMotoURL = 'http://www.honda.es/motorcycles.html';
        this.motoGroups = ['125cc', 'scooter'];
        // Tolerate a missing params object instead of throwing on property access.
        this.concurrency = (params && params.concurrency) || 2;
        this.queryProcessor = new QueryProcessor({
            vendor: this.vendor,
            homeUrl: hondaHome,
            concurrency: this.concurrency
        });
    }

    /**
     * Fetches the catalogue page listing all the motos on the Honda site and
     * remembers the result in `this.groupLinks`.
     * @param url page to fetch; falls back to `this.allMotoURL` when falsy
     * @param fileName forwarded as-is to QueryProcessor.getPage
     * @returns {Promise} resolves with the data handed back by getPage
     */
    queryHondaSite(url, fileName = tmpFileName) {
        // getPage is callback based, so it has to be adapted to a promise by hand.
        return new Promise((success, fail) => {
            this.queryProcessor.getPage(url || this.allMotoURL, fileName, (err, data) => {
                if (err) {
                    return fail(err);
                }
                this.groupLinks = data;
                success(data);
            });
        });
    }

    /**
     * Collects the urls of all moto pages for further scraping and stores
     * them in `this.motosPageLinks`.
     * @param rawHTML currently unused: the markup is always taken from
     *        `this.groupLinks` (filled in by queryHondaSite)
     * @returns {Array} href of every `.analyticsEvent.model` element
     */
    parseHondaData(rawHTML) {
        const $ = this.cheerio(this.groupLinks);
        const links = _.map($.find('.analyticsEvent.model'), (elem) => this.cheerio(elem).attr('href'));
        this.motosPageLinks = links;
        return links;
    }

    /**
     * Keeps only the urls that belong to one of `this.motoGroups`.
     * Results are ordered group by group; a url matching several groups is
     * listed once per matching group.
     * @param motosURL list of urls to filter
     * @returns {Array}
     */
    getSingleMotoPages(motosURL) {
        return _.reduce(this.motoGroups, (filteredURLS, group) => {
            // Non-string entries (e.g. an element without href) can never match.
            const matching = _.filter(motosURL, (moto) => typeof moto === 'string' && moto.indexOf(group) > -1);
            return filteredURLS.concat(matching);
        }, []);
    }

    /**
     * From the main page we can get only links to review pages, so we need to
     * visit them to get links to spec pages. Unless an overwrite is requested,
     * a previously saved list in `this.specsFileName` is reused.
     * @param doOverwrite when true always scrape, ignoring any saved list
     * @param maxSpecUrls is used for testing, to minimize scraping amount
     * @returns {Promise} resolves with the array of spec page urls
     */
    getSpecPages(doOverwrite = false, maxSpecUrls) {
        if (!doOverwrite) {
            debug('scrapeSpecUrl11111s');
            if (fs.existsSync(this.specsFileName)) {
                // Built inside a promise so read / parse errors become rejections.
                return new Promise((resolve) => {
                    const cached = JSON.parse(this.loadSavedFile(this.specsFileName));
                    // Keep the instance in sync so scrapeSpecifications can use the cache.
                    this.specUrls = cached;
                    resolve(cached);
                });
            }
            debug('scrapeSpecUrls');
        }
        return this._scrapeSpecUrls(maxSpecUrls);
    }

    /**
     * Visits every page in `this.motosPageLinks`, extracts the spec page link
     * from each, saves the list to `this.specsFileName` and stores it in
     * `this.specUrls`.
     * @param maxSpecUrls forwarded as second argument to parallelScraping
     * @returns {Promise} resolves with the spec urls, rejects when scraping fails
     */
    _scrapeSpecUrls(maxSpecUrls) {
        // Resolving inside the executor turns synchronous throws into rejections
        // and adopts whatever thenable parallelScraping returns.
        return new Promise((resolve) => {
            resolve(this.queryProcessor.parallelScraping(this.motosPageLinks, maxSpecUrls));
        }).then((pages) => {
            // The second matching link of each review page is taken as the spec page link.
            const specUrls = _.map(pages, (page) =>
                cheerio(page.page).find('nav article a.analyticsEvent').eq(1).attr('href'));
            this._saveJson(this.specsFileName, specUrls);
            debug('specs', specUrls);
            this.specUrls = specUrls;
            return specUrls;
        });
    }

    /**
     * Scrapes all pages listed in `this.specUrls` through the QueryProcessor,
     * parses each of them and saves the result to
     * `tmp/<vendor>/honda-parsed.json`.
     * @param urls currently unused: the urls are always taken from `this.specUrls`
     * @returns {Promise} resolves with the array of parsed specifications
     */
    scrapeSpecifications(urls) {
        return new Promise((resolve) => {
            resolve(this.queryProcessor.parallelScraping(this.specUrls));
        }).then((data) => {
            const parsed = [];
            _.each(data, (obj) => {
                // One broken page must not abort the whole batch.
                try {
                    parsed.push(this.parseSpecification(obj.page, obj.url));
                } catch (err) {
                    console.log(obj.url);
                    console.log(err);
                }
            });
            this._saveJson(`tmp/${this.vendor}/honda-parsed.json`, parsed);
            return parsed;
        }, (err) => {
            console.log(err);
            throw err;
        });
    }

    /**
     * Returns all the honda motos and scooters specs.
     * Runs the whole pipeline and always re-scrapes the spec urls.
     * @returns {Promise} resolves with the parsed specifications; rejects if
     *          any step fails
     */
    scrapeAll() {
        return this.queryHondaSite()
            .then(() => {
                this.parseHondaData();
                return this.getSpecPages(true);
            })
            .then(() => this.scrapeSpecifications())
            .then((data) => {
                debug(data);
                return data;
            }, (err) => {
                debug(err);
                throw err;
            });
    }

    /**
     * Reads a saved file synchronously as UTF-8 text.
     * @param fileName file to read, defaults to the temporary catalogue page
     * @returns {string} file content
     */
    loadSavedFile(fileName = tmpFileName) {
        return fs.readFileSync(fileName, 'utf8');
    }

    /**
     * Parses a single specification page.
     * @param file markup of the specification page
     * @param url url of the page, copied into the result
     * @returns {Object} the extracted fields, or just `{url}` when parsing throws
     */
    parseSpecification(file, url) {
        try {
            const $ = this.cheerio(file);
            const specItemList = $.find('.specItemList tr');
            // First capture group of the pattern, or null when there is no match.
            const capture = (text, pattern) => {
                const match = text.match(pattern);
                return match && match[1];
            };

            const price = $.find('.fullPrice').text();
            const title = $.find('.titles h3').text();
            const weight = this._trResult(specItemList, 'peso en orden');
            const power = this._trResult(specItemList, 'potencia máxima');
            const powerKW = capture(power, /(.+)kW/);
            const powerCV = capture(power, /\((.+)cv\)/);
            const moment = this._trResult(specItemList, 'par máximo');
            const fuelCapacity = this._trResult(specItemList, 'capacidad de combustible');
            const volume = this._trResult(specItemList, 'cilindrada');
            // NOTE(review): when the image has no src this yields '<home>undefined'.
            const photo = hondaHome + $.find('.currentChoice img').attr('src');

            return {price, title, weight, power, powerKW, powerCV, moment, fuelCapacity, photo, volume, url};
        } catch (err) {
            console.log(url);
            console.log(err);
            return {url};
        }
    }

    /**
     * Looks up a value in the specification table.
     * @param trList table rows to search
     * @param needle lower-case text looked for inside the row's `.title`
     * @returns {string} text of `.result` in the first matching row
     */
    _trResult(trList, needle) {
        const found = _.find(trList, (tr) =>
            cheerio(tr).find('.title').text().toLowerCase().indexOf(needle) > -1);
        return cheerio(found).find('.result').text();
    }

    /**
     * Best-effort asynchronous save of `data` as JSON; a failure is logged
     * and does not interrupt scraping.
     * @param fileName destination file
     * @param data value to serialize
     */
    _saveJson(fileName, data) {
        // The callback is mandatory: without it current Node.js versions throw.
        fs.writeFile(fileName, JSON.stringify(data), (err) => {
            if (err) {
                console.log(`unable to save ${fileName}`);
                console.log(err);
            }
        });
    }
}

module.exports = HondaScraper;