UNPKG

confidencial-ni-node

Version:
211 lines (200 loc) 6.83 kB
// Load modules var jsdom = require('jsdom'); var fs = require('fs'); var S = require('string'); var request = require('request'); var cheerio = require('cheerio'); var async = require('async'); var jquery = fs.readFileSync( require('path').resolve(__dirname, 'jquery.min.js') ).toString(); // confidencial categories var categories = new Array(); categories.push('http://www.confidencial.com.ni/politica/1'); categories.push('http://www.confidencial.com.ni/blogs/40'); categories.push('http://www.confidencial.com.ni/economia/2'); categories.push('http://www.confidencial.com.ni/economia/2/20'); categories.push('http://www.confidencial.com.ni/nacion/4/3'); categories.push('http://www.confidencial.com.ni/mundo/4'); categories.push('http://www.confidencial.com.ni/centroamerica/4/1'); categories.push('http://www.confidencial.com.ni/vida-y-ocio/30'); categories.push('http://www.confidencial.com.ni/turismo/30/35'); categories.push('http://www.confidencial.com.ni/tecnologia/30/32'); categories.push('http://www.confidencial.com.ni/gastronomia/30/34'); categories.push('http://www.confidencial.com.ni/espectaculo/30/33'); categories.push('http://www.confidencial.com.ni/deportes/30/36'); categories.push('http://www.confidencial.com.ni/cultura/30/71'); categories.push('http://www.confidencial.com.ni/reporte-ciudadano/60'); categories.push('http://www.confidencial.com.ni/denuncias/60/62'); categories.push('http://www.confidencial.com.ni/yo-opino/60/61'); // article scraping data var parseArticleOptions = { domain: 'http://www.confidencial.com.ni/', elements: [ { name: 'title', sel: function ($) { var result = $('#articleheader h2').text().trim(); return (!S(result).isEmpty()) ? result : ''; } }, { name: 'title_sub', sel: function($) { var result = $('#articleheader h3').text().trim(); return (!S(result).isEmpty()) ? result : ''; } }, { name: 'title_paragraph', sel: function($) { var result = $('#articleheader p.bold').text().trim(); return (!S(result).isEmpty()) ? result : ''; } }, { name: 'author', sel: function($) { var result = $('#articleheader p.authorname').text(). trim().split('|')[0].trim(); return (!S(result).isEmpty()) ? result : ''; } }, { name: 'date', sel: function($) { var result = $('#articleheader p.authorname').text(); if (!S(result).isEmpty()) { result = result.match(/(\d{1,2}\/\d{1,2}\/\d{4})/g); } return (!S(result).isEmpty()) ? S(result).left(9).s : ''; } }, { name: 'images', sel: function($) { var result = []; var array = $('article img').map(function() { return $(this).attr('src'); }).get(); array.forEach(function(item) { result.push(parseArticleOptions.domain + item); }); return (!S(result).isEmpty()) ? result : []; } }, { name: 'category', sel: function($) { var result = $('#quicknav').text(); if (!S(result).isEmpty()) { result = S(result).trim().s; result = S(result).replaceAll('Confidencial', '').s; result = S(result).replaceAll('Leer artículo', '').s; result = S(result).replaceAll('»', '').s; result = S(result).trim().s; } return (!S(result).isEmpty()) ? S(result).trim().s : ''; } }, { name: 'content', sel: function($) { var newResult = ''; var result = $('article div.content_article div.text_article').html(); if (!S(result).isEmpty()) { result = result.trim(); newResult = S(result).stripTags('p,br,strong').s; newResult = S(newResult).collapseWhitespace().s; newResult = S(newResult).replaceAll('<br />', '\n').s; newResult = S(newResult).replaceAll('<br /> <br />', '\n').s; newResult = S(newResult).replaceAll('<br /><br />', '\n').s; newResult = S(newResult).replaceAll('<p>', '').s; newResult = S(newResult).replaceAll('</p>', '\n').s; newResult = S(newResult).replaceAll('&nbsp;', '').s; newResult = S(newResult.replace(/<a.*href="(.*?)".*>(.*?)<\/a>/gi, '$2 $1')).trim().s; newResult = newResult.replace(/<(?:.|\n)*?>/gm, ''); if (S(newResult).endsWith('\n')) { } } return (!S(result).isEmpty()) ? S(newResult).trim().s : ''; } } ] }; // simple function to compare if exist item in array function inArray(needle, haystack) { var length = haystack.length; for (var i = 0; i < length; i++) { if (haystack[i] == needle) { return true; } } return false; } // function to parse data function parse(site, callback) { jsdom.env({ url: site, scripts: ['http://code.jquery.com/jquery.js'], done: function (errors, window) { var $ = window.$; callback($, errors); } }); }; // Global function to get all links from all categories module.exports.getAllLinks = function(callback) { function out() { var result = []; var fetch = function(url, cb) { request(url, function(err, response, body) { if (err) { cb(err); } else { cb(null, body); // First param indicates error, null=> no error } }); } async.map(categories, fetch, function(err, results) { if (err) { console.log(err); // either file1, file2 or file3 has raised an error, so you should not use results and handle the error } else { results.forEach(function(category) { $ = cheerio.load(category); var links = $('.article h3 a,article h2 a'); //use your CSS selector here $(links).each(function(i, link) { var articleUrl = 'http://confidencial.com.ni/' + $(link).attr('href'); if (!inArray(articleUrl, result)) { result.push(articleUrl); } }); }); callback(result); } }); } out(); } // global function to get data from article url module.exports.getArticle = function(site, cb) { function getId(url) { var id = url.split('/'); return id[4]; } function out(url) { parse(url, function ($, err) { var result = {}; result.id = getId(url); result.url = url; parseArticleOptions.elements.forEach(function (elem) { result[elem.name] = elem.sel($); }); cb(result, err); }); } out(site); }