UNPKG

data_scraper

Version:

data scraper for shirts4mike.com

107 lines (98 loc) 3.25 kB
// modules
const machinepack = require('machinepack-http');
const http = require('http');
const cheerio = require('cheerio');
const renderer = require('./renderer');
const errors = require('./errors');

// Site being scraped. Every relative link found in the markup is resolved
// against BASE_URL by simple concatenation.
const BASE_URL = 'http://www.shirts4mike.com/';
const CATALOG_URL = `${BASE_URL}shirts.php`;

/**
 * Left-pads a day or month number with a single zero.
 *
 * @param {number} value - Non-negative integer (1-31 in this module).
 * @returns {string} `value` as a string, at least two characters long.
 */
function pad2(value) {
  return value < 10 ? `0${value}` : `${value}`;
}

// create fileName from date
// NOTE: computed once, when the module is loaded, from the UTC date.
const dateObj = new Date();
const year = dateObj.getUTCFullYear();
const month = pad2(dateObj.getUTCMonth() + 1); // getUTCMonth() is 0-based
const day = pad2(dateObj.getUTCDate());

// json2csv vars
const fields = ['Title', 'Price', 'ImageUrl', 'Url', 'Time'];
const fileName = `${year}-${month}-${day}`;

/**
 * Shared handler for "a non-2xx status code was returned from the server".
 * Logs "<code> - <standard reason phrase>" through the errors module.
 *
 * @param {{statusCode: number}} err - Failure object passed by machinepack-http.
 */
function logHttpStatus(err) {
  errors.log(`${err.statusCode} - ${http.STATUS_CODES[err.statusCode]}`);
}

/**
 * Fetches the catalogue page, asks the renderer to create today's output
 * file, and starts one `getShirt` request per product found under
 * `.products`.
 *
 * Nothing is returned; results and failures are reported through
 * `renderer` and `errors`.
 */
function get() {
  machinepack.get({
    url: CATALOG_URL,
  }).exec({
    success: function (allShirtsHTML) {
      // create .csv
      renderer.create(fields, fileName);

      // each shirt
      const $ = cheerio.load(allShirtsHTML);
      $('.products').children().each(function () {
        const href = $(this).find('a').attr('href');

        // Without an href there is no detail page to request; report it
        // instead of requesting ".../undefined".
        if (!href) {
          errors.log(`Missing product link on ${CATALOG_URL}`);
          return;
        }

        // Get each t-shirt's data
        getShirt(`${BASE_URL}${href}`);
      });
    },

    // A non-2xx status code was returned from the server.
    non200Response: logHttpStatus,

    // Unexpected connection error: could not send or receive HTTP request.
    requestFailed: function (err) {
      errors.log(`${err}`);
    },

    error: function (err) {
      errors.log(`${err}`);
    },
  });
}

/**
 * Fetches a single shirt page, extracts its title, price and image URL, and
 * hands the resulting row to the renderer.
 *
 * @param {string} link - Absolute URL of the shirt's detail page.
 */
function getShirt(link) {
  machinepack.get({
    url: link,
  }).exec({
    success: function (eachShirtHTML) {
      // Title, price, imageUrl, URL and Time
      const shirt = cheerio.load(eachShirtHTML);

      // The price lives in a span inside the h1, so read it first and then
      // remove the span so that the h1 text is the title only.
      const priceNode = shirt('.shirt-details h1 span.price');
      const shirtPrice = priceNode.text();
      priceNode.remove();

      // trim() drops the whitespace left where the span used to be without
      // touching spaces inside the title itself.
      const shirtTitle = shirt('.shirt-details h1').text().trim();

      // imgUrl: left empty when the page has no picture, rather than
      // producing ".../undefined".
      const imgSrc = shirt('.shirt-picture span img').attr('src');
      const imgUrl = imgSrc ? `${BASE_URL}${imgSrc}` : '';

      // final object
      const shirtData = {
        "Title": shirtTitle,
        "Price": shirtPrice,
        "ImageUrl": imgUrl,
        "Url": link,
        "Time": new Date().toString()
      };

      renderer.append(shirtData, fields, fileName);

      // NOTE(review): `.blue` is not defined by this file; presumably a
      // String.prototype extension (e.g. the `colors` package) is loaded
      // elsewhere - confirm.
      console.log(`"${shirtTitle}" added!`.blue);
    },

    // A non-2xx status code was returned from the server.
    non200Response: logHttpStatus,

    // Unexpected connection error: could not send or receive HTTP request.
    requestFailed: function (err) {
      errors.log(`<${err}>`);
    },

    error: function (err) {
      errors.log(`<${err}>`);
    },
  });
}

module.exports.get = get;