yamaha-moto-scraper
Version:
Yamaha.es moto scraping tool
232 lines (215 loc) • 6.43 kB
JavaScript
/**
* Created by Aleksandr Volkov on 20/10/16.
*/
// const cheerio = require('cheerio');
const fs = require('fs');
const _ = require('lodash');
const debug = require('debug')('yamaha');
// const specsFileName = 'tmp/yamaha/yamaha-spec-urls.json';
/**
 * Scraper for motorcycle and scooter specifications published on the Yamaha site.
 *
 * HTML parsing is delegated to an injected cheerio-compatible function and page
 * retrieval to an injected query processor, so the class itself performs no
 * network access.
 */
class YamahaScrape {
    /**
     * @param {Function} cheerio - cheerio-compatible function, called as `cheerio(htmlOrElement)`
     * @param {Function} QueryProcessor - constructor, instantiated with `{vendor, homeUrl, saveFiles, concurrency}`
     * @param {Object} [params] - optional overrides for the defaults assigned below
     */
    constructor(cheerio, QueryProcessor, params = {}) {
        this.vendorHome = params.vendorHome || 'https://www.yamaha-motor.eu';
        this.vendor = params.vendor || 'yamaha';
        // NOTE: workDir is used as a plain string prefix, so it must end with a path separator.
        this.workDir = params.workDir || `tmp/${this.vendor}/`;
        this.concurrency = params.concurrency || 2;
        this.allMotoURL = params.allMotoURL || 'https://www.yamaha-motor.eu/es/products/motorcycles/index.aspx';
        this.allScootersURL = params.allScootersURL || 'https://www.yamaha-motor.eu/es/products/scooters/index.aspx';
        this.motoGroups = params.motoGroups || ['125cc', 'scooter'];
        this.tmpFileName = 'tmp/yamaha-tmp.html';
        this.saveFiles = params.saveFiles || false;
        this.queryProcessor = new QueryProcessor({
            vendor: this.vendor,
            homeUrl: this.vendorHome,
            saveFiles: this.saveFiles,
            concurrency: this.concurrency
        });
        this.cheerio = cheerio;
    }

    /**
     * Fetches a single page through the query processor.
     * @param {string} url - page to request
     * @param {string} [fileName=this.tmpFileName] - file name forwarded to the query processor
     * @param {Function} cb - callback forwarded to `queryProcessor.getPage`
     */
    querySite(url, fileName = this.tmpFileName, cb) {
        debug('querysite', url);
        this.queryProcessor.getPage(url, fileName, cb);
    }

    /**
     * Collects the `href` of every element matching `selector` in the given page.
     * When `fileName` is given and file saving is enabled, the list is also
     * written to the work directory as JSON.
     * @param rawHTML - page content handed to cheerio
     * @param {string} selector - selector of the link elements
     * @param {string} [fileName] - optional JSON file name inside the work directory
     * @returns {Array} one `href` value per matched element, in document order
     */
    getLinksToMotos(rawHTML, selector, fileName) {
        const $ = this.cheerio(rawHTML);
        const links = Array.from($.find(selector), (elem) => this.cheerio(elem).attr('href'));
        if (fileName) {
            this._saveJSON(fileName, links);
        }
        return links;
    }

    /**
     * Keeps only the URLs that belong to one of the configured `motoGroups`.
     * A URL matching several groups is returned once, and entries that are not
     * strings (e.g. a link without `href`) are skipped instead of throwing.
     * @param {Array} motosURL - candidate URLs
     * @returns {Array} matching URLs, ordered by group and then by input position
     */
    getSingleMotoPages(motosURL) {
        const candidates = Array.from(motosURL || []).filter((url) => typeof url === 'string');
        // A Set keeps first-insertion order, so the group-by-group ordering survives the de-duplication.
        const matched = new Set();
        this.motoGroups.forEach((group) => {
            candidates.forEach((url) => {
                if (url.includes(group)) {
                    matched.add(url);
                }
            });
        });
        return Array.from(matched);
    }

    /**
     * Requests the tech-spec view of every URL through the query processor and
     * parses each returned page.
     * @param {Array} urls - vehicle page URLs (the spec view query string is appended here)
     * @param {string} [fileName] - prefix of the saved JSON file, defaults to the vendor name
     * @returns {Promise<Array>} parsed specifications, one per page that did not throw
     */
    parallelScraping(urls, fileName) {
        const specUrls = Array.from(urls || [], (url) => `${url}?view=featurestechspecs`);
        let pages;
        try {
            pages = Promise.resolve(this.queryProcessor.parallelScraping(specUrls));
        } catch (err) {
            return Promise.reject(err);
        }
        return pages.then((data) => {
            const parsed = [];
            Array.from(data || []).forEach((obj) => {
                try {
                    // adding parsed spec to the list
                    parsed.push(this.parseSpecification(obj.page, obj.url));
                } catch (err) {
                    console.log(obj.url);
                    console.log(err);
                }
            });
            this._saveJSON(`${fileName || this.vendor}-parsed.json`, parsed);
            debug(parsed);
            return parsed;
        }, (err) => {
            console.log(err);
            throw err;
        });
    }

    /**
     * The scooters page is split into sections; this collects the link and the
     * name of each section.
     * @returns {Promise<Array>} list of `{link, name}` objects
     */
    getScooterSectionLinks() {
        return new Promise((resolve, reject) => {
            const onPage = (err, page) => {
                if (err) return reject(err);
                // Errors thrown in this callback would otherwise leave the promise pending forever.
                try {
                    const segments = [];
                    this.cheerio(page).find('#segments li').each((index, elem) => {
                        const item = this.cheerio(elem);
                        segments.push({
                            link: item.find('a').attr('href'),
                            name: item.find('h2').text()
                        });
                    });
                    this._saveJSON('all-yamaha-scooters.json', segments);
                    resolve(segments);
                } catch (parseErr) {
                    reject(parseErr);
                }
            };
            this.queryProcessor.getPage(this.allScootersURL, 'all-yamaha-scooters.html', onPage);
        });
    }

    /**
     * Extracts the specification of one vehicle from its tech-spec page.
     * @param file - page content handed to cheerio
     * @param {string} url - address of the page, copied into the result
     * @returns {Object} the parsed fields plus `url`; only `{url}` when parsing throws.
     *   `powerKW` is `null` when the power text contains no `kW` value, `powerCV` is always `null`.
     */
    parseSpecification(file, url) {
        try {
            const $ = this.cheerio(file);
            // the price is whatever follows the first "€ " in the price element
            const price = $.find('.price').text().split('€ ')[1];
            debug('price', price);
            const title = $.find('h1').text();
            debug(title);
            const specItemList = $.find('.specifications tr');
            const weight = this._trResult(specItemList, 'Peso en orden de marcha');
            const power = this._trResult(specItemList, 'Potencia máxima');
            // A page without a kW figure must not discard every other parsed field.
            const kwMatch = power.match(/(.+)kW/);
            const powerKW = kwMatch ? kwMatch[1] : null;
            const powerCV = null;
            const moment = this._trResult(specItemList, 'par máximo');
            const fuelCapacity = this._trResult(specItemList, 'Capacidad del depósito');
            const volume = this._trResult(specItemList, 'Cilindrada');
            const photo = $.find('#imgFeatureImage').attr('src');
            return {price, title, weight, power, powerKW, powerCV, moment, fuelCapacity, photo, volume, url};
        } catch (err) {
            debug(url);
            console.log(err);
            return {url};
        }
    }

    /**
     * Collects the vehicle page links from the motorcycles index and then from
     * the scooters index, one request after the other.
     * @returns {Promise<Array>} motorcycle links followed by scooter links
     * @private
     */
    _getAllVehicleUrls() {
        return new Promise((resolve, reject) => {
            const selector = '#segmentsholder .segmentGrid.hide li>a';
            // requested front to back: motorcycles first, scooters second
            const queue = [this.allMotoURL, this.allScootersURL];
            const pagesLinks = [];
            const fetchNext = () => {
                try {
                    this.queryProcessor.getPage(queue.shift(), null, onPage);
                } catch (err) {
                    reject(err);
                }
            };
            const onPage = (err, page, url) => {
                if (err) return reject(err);
                // Errors thrown in this callback would otherwise leave the promise pending forever.
                try {
                    this.getLinksToMotos(page, selector).forEach((link) => pagesLinks.push(link));
                    debug('page scraped', url);
                    if (queue.length) return fetchNext();
                    this._saveJSON('yamaha-all-vehicle-urls.json', pagesLinks);
                    resolve(pagesLinks);
                } catch (parseErr) {
                    reject(parseErr);
                }
            };
            fetchNext();
        });
    }

    /**
     * Collects all vehicle URLs and scrapes the specification of each of them.
     * @returns {Promise<Array>} parsed specifications; rejects (after logging) on failure
     */
    scrapeAll() {
        return this._getAllVehicleUrls()
            .then((urls) => this.parallelScraping(urls, 'yamaha-specs'))
            .catch((err) => {
                console.log(err);
                throw err;
            });
    }

    /**
     * Finds the table row whose `.title` text contains `needle` (case-insensitive)
     * and returns the text of the second `td p` element of that row.
     * @param trList - collection of table rows
     * @param {string} needle - text to look for in the row title
     * @returns {string} text of the value cell as returned by cheerio for the matched row
     * @private
     */
    _trResult(trList, needle) {
        const wanted = needle.toLowerCase();
        const found = Array.from(trList || []).find(
            (tr) => this.cheerio(tr).find('.title').text().toLowerCase().includes(wanted)
        );
        return this.cheerio(found).find('td p').eq(1).text();
    }

    /**
     * Writes `data` as JSON into the work directory when file saving is enabled.
     * The write is fire-and-forget: a failure is logged and never propagated.
     * @param {string} fileName - file name appended to `workDir`
     * @param {*} data - value to serialise
     * @private
     */
    _saveJSON(fileName, data) {
        if (!this.saveFiles) return;
        // fs.writeFile requires a callback; omitting it throws on current Node.js versions.
        fs.writeFile(`${this.workDir}${fileName}`, JSON.stringify(data), (err) => {
            if (err) {
                console.log(err);
                return;
            }
            debug('json saved');
        });
    }
}
module.exports = YamahaScrape;