UNPKG

yamaha-moto-scraper

Version:

Yamaha.es moto scraping tool

232 lines (215 loc) 6.43 kB
/**
 * Created by Aleksandr Volkov on 20/10/16.
 */
// cheerio is not required here: the caller injects it through the constructor.
const fs = require('fs');
const _ = require('lodash');
const debug = require('debug')('yamaha');

/**
 * Scraper for the Spanish Yamaha site (yamaha-motor.eu/es).
 *
 * Collects the URLs of motorcycle and scooter pages and extracts a small
 * specification record (price, power, weight, ...) from every page.
 */
class YamahaScrape {
  /**
   * @param {Function} cheerio - cheerio instance used to parse the downloaded HTML
   * @param {Function} QueryProcessor - constructor of the page loader; the instance
   *   is used through `getPage(url, fileName, cb)` and `parallelScraping(urls)`
   * @param {Object} [params] - optional overrides for the defaults assigned below
   */
  constructor(cheerio, QueryProcessor, params = {}) {
    this.vendorHome = params.vendorHome || 'https://www.yamaha-motor.eu';
    this.vendor = params.vendor || 'yamaha';
    this.workDir = params.workDir || `tmp/${this.vendor}/`;
    this.concurrency = params.concurrency || 2;
    this.allMotoURL = params.allMotoURL || 'https://www.yamaha-motor.eu/es/products/motorcycles/index.aspx';
    this.allScootersURL = params.allScootersURL || 'https://www.yamaha-motor.eu/es/products/scooters/index.aspx';
    this.motoGroups = params.motoGroups || ['125cc', 'scooter'];
    this.tmpFileName = 'tmp/yamaha-tmp.html';
    this.saveFiles = params.saveFiles || false;
    this.queryProcessor = new QueryProcessor({
      vendor: this.vendor,
      homeUrl: this.vendorHome,
      saveFiles: this.saveFiles,
      concurrency: this.concurrency
    });
    this.cheerio = cheerio;
  }

  /**
   * Downloads a single page of the Yamaha site through the query processor.
   * @param {string} url - page to download
   * @param {string} [fileName] - file name handed to the query processor
   * @param {Function} cb - callback forwarded to `queryProcessor.getPage`
   */
  querySite(url, fileName = this.tmpFileName, cb) {
    debug('querysite', url);
    this.queryProcessor.getPage(url, fileName, cb);
  }

  /**
   * Collects the `href` of every element matching `selector`.
   * When `fileName` is given and `saveFiles` is on, the list is also written
   * to the work directory as JSON.
   * @param {string} rawHTML - page markup
   * @param {string} selector - selector of the link elements
   * @param {string} [fileName] - optional name of the JSON dump
   * @returns {Array} href attribute values in document order
   */
  getLinksToMotos(rawHTML, selector, fileName) {
    const $ = this.cheerio(rawHTML);
    const hrefs = _.map($.find(selector), (elem) => this.cheerio(elem).attr('href'));
    if (fileName) {
      this._saveJson(fileName, hrefs);
    }
    return hrefs;
  }

  /**
   * Keeps only the URLs that contain one of the configured `motoGroups`.
   * The result is ordered group by group; a URL matching several groups is
   * listed once per matching group.
   * @param {Array} motosURL - candidate URLs
   * @returns {Array} filtered URLs
   */
  getSingleMotoPages(motosURL) {
    const filteredURLS = [];
    _.each(this.motoGroups, (group) => {
      _.each(motosURL, (moto) => {
        // anchors without an href yield undefined entries; skip them instead of throwing
        if (typeof moto === 'string' && moto.indexOf(group) > -1) {
          filteredURLS.push(moto);
        }
      });
    });
    return filteredURLS;
  }

  /**
   * Downloads the tech-spec view of every URL through the query processor and
   * parses each downloaded page with `parseSpecification`.
   * @param {Array} urls - vehicle page URLs
   * @param {string} [fileName] - prefix of the JSON dump (defaults to the vendor name)
   * @returns {Promise<Array>} resolves with the parsed specifications
   */
  parallelScraping(urls, fileName) {
    const specUrls = _.filter(urls, (url) => typeof url === 'string').map((url) => {
      // do not produce a second '?' when the link already carries a query string
      const separator = url.indexOf('?') > -1 ? '&' : '?';
      return `${url}${separator}view=featurestechspecs`;
    });

    // Starting from a resolved promise turns a synchronous throw of the query
    // processor into a rejection and always hands a native Promise to the caller.
    return Promise.resolve()
      .then(() => this.queryProcessor.parallelScraping(specUrls))
      .then((data) => {
        const parsed = [];
        _.each(data, (obj) => {
          try {
            // adding parsed spec to the list
            parsed.push(this.parseSpecification(obj.page, obj.url));
          } catch (err) {
            console.log(obj.url);
            console.log(err);
          }
        });
        this._saveJson(`${fileName || this.vendor}-parsed.json`, parsed);
        debug(parsed);
        return parsed;
      }, (err) => {
        console.log(err);
        throw err;
      });
  }

  /**
   * Yamaha splits scooters over a page with several sections; collects the
   * link and the name of each of them.
   * @returns {Promise<Array>} resolves with `{link, name}` objects
   */
  getScooterSectionLinks() {
    return new Promise((resolve, reject) => {
      const cb = (err, page) => {
        if (err) return reject(err);
        try {
          const $ = this.cheerio(page);
          const segments = [];
          $.find('#segments li').each((index, elem) => {
            const item = this.cheerio(elem);
            segments.push({
              link: item.find('a').attr('href'),
              name: item.find('h2').text()
            });
          });
          this._saveJson('all-yamaha-scooters.json', segments);
          resolve(segments);
        } catch (parseErr) {
          // without this the promise would stay pending forever
          reject(parseErr);
        }
      };
      this.queryProcessor.getPage(this.allScootersURL, 'all-yamaha-scooters.html', cb);
    });
  }

  /**
   * Extracts the specification record from a single vehicle page.
   * Never throws: when parsing fails the error is logged and only `{url}` is returned.
   * @param {string} file - page markup
   * @param {string} url - address the page was downloaded from
   * @returns {Object} `{price, title, weight, power, powerKW, powerCV, moment,
   *   fuelCapacity, photo, volume, url}` or `{url}` on failure
   */
  parseSpecification(file, url) {
    try {
      const $ = this.cheerio(file);

      // the price is whatever follows the first '€ ' (undefined when there is none)
      const price = $.find('.price').text().split('€ ')[1];
      debug('price', price);

      const title = $.find('h1').text();
      debug(title);

      const specItemList = $.find('.specifications tr');
      const weight = this._trResult(specItemList, 'Peso en orden de marcha');
      const power = this._trResult(specItemList, 'Potencia máxima');
      // a missing or differently formatted power cell must not discard the other fields
      const kwMatch = power.match(/(.+)kW/);
      const powerKW = kwMatch ? kwMatch[1] : null;
      const powerCV = null; // not extracted from the page
      const moment = this._trResult(specItemList, 'par máximo');
      const fuelCapacity = this._trResult(specItemList, 'Capacidad del depósito');
      const volume = this._trResult(specItemList, 'Cilindrada');
      const photo = $.find('#imgFeatureImage').attr('src');

      return {price, title, weight, power, powerKW, powerCV, moment, fuelCapacity, photo, volume, url};
    } catch (err) {
      debug(url);
      console.log(err);
      return {url};
    }
  }

  /**
   * Gets the vehicle URLs from the motorcycle index and then from the scooter
   * index, one page after the other.
   * @returns {Promise<Array>} resolves with the links of both pages
   */
  _getAllVehicleUrls() {
    return new Promise((resolve, reject) => {
      // consumed from the end: motorcycles first, scooters second
      const pending = [this.allScootersURL, this.allMotoURL];
      const pagesLinks = [];

      const fetchNext = () => {
        this.queryProcessor.getPage(pending.pop(), null, onPage);
      };

      const onPage = (err, page, url) => {
        if (err) return reject(err);
        try {
          const links = this.getLinksToMotos(page, '#segmentsholder .segmentGrid.hide li>a');
          debug('page scraped', url);
          links.forEach((link) => pagesLinks.push(link));
          if (pending.length) {
            fetchNext();
          } else {
            this._saveJson('yamaha-all-vehicle-urls.json', pagesLinks);
            resolve(pagesLinks);
          }
        } catch (parseErr) {
          // an exception inside the callback would otherwise leave the promise pending
          reject(parseErr);
        }
      };

      // a synchronous throw here is turned into a rejection by the Promise constructor
      fetchNext();
    });
  }

  /**
   * Gets all the specs from the Yamaha site: collects the vehicle URLs and
   * scrapes every one of them.
   * @returns {Promise<Array>} resolves with the parsed specifications
   */
  scrapeAll() {
    return this._getAllVehicleUrls()
      .then((urls) => this.parallelScraping(urls, 'yamaha-specs'))
      .catch((err) => {
        console.log(err);
        throw err;
      });
  }

  /**
   * Looks for the table row whose `.title` text contains `needle`
   * (case-insensitive) and returns the text of its second `td p` element.
   * @param trList - cheerio collection of table rows
   * @param {string} needle - text to look for in the row title
   * @returns {string} cell text, empty when no row matches
   * @private
   */
  _trResult(trList, needle) {
    const wanted = needle.toLowerCase();
    const found = _.find(trList, (tr) => {
      return this.cheerio(tr).find('.title').text().toLowerCase().indexOf(wanted) > -1;
    });
    return this.cheerio(found).find('td p').eq(1).text();
  }

  /**
   * Best-effort JSON dump into the work directory; does nothing unless
   * `saveFiles` is enabled. Failures are logged and never interrupt scraping.
   * @param {string} fileName - file name relative to `workDir`
   * @param {*} data - value to serialise
   * @private
   */
  _saveJson(fileName, data) {
    if (!this.saveFiles) return;
    const target = `${this.workDir}${fileName}`;
    try {
      // fs.writeFile needs a callback: current Node versions throw without one
      fs.writeFile(target, JSON.stringify(data), (err) => {
        if (err) {
          console.log(err);
          return;
        }
        debug('json saved', target);
      });
    } catch (err) {
      console.log(err);
    }
  }
}

module.exports = YamahaScrape;