UNPKG

website-to-json

Version:

Converts all websites to JSON data

163 lines (138 loc) 3.82 kB
var cheerio = require('cheerio'); var _ = require('lodash'); var string = require('extract-data-from-text') var S = require('string'); var db = require('./../recipes') var defaultCheerioOptions = { normalizeWhitespace: false, xmlMode: false, decodeEntities: true }; /** * converts url and html to json data */ exports.convert = function(url, html, options) { var $ = cheerio.load(html, defaultCheerioOptions); var body = $('body').text() social = _.uniq(_.concat([ 'twitter.com', 'linkedin.com', 'pinterest.com', 'youtube.com', 'plus.google.com', 'instagram.com', 'github.com', 'facebook.com', 'behance.net', 'dribbble.com', ], options.social || [])); var data = { meta: { title: S($('title').eq(0).text()).trim().s, h1: S($('h1').first().text()).trim().s, h2: S($('h2').first().text()).trim().s, description: $("meta[name='description' i]").attr('content'), keywords: $("meta[name='keywords' i]").attr('content'), 'og:description': $("meta[property='og:description' i]").attr('content'), 'og:image': $("meta[property='og:image' i]").attr('content') }, social: exports.getSocialUrls(url, $, social) } if (options.emails || (options.fields && options.fields.indexOf('emails') !== -1)) { data.emails = _.uniq(string.emails(html)); } if (options.links || (options.fields && options.fields.indexOf('links') !== -1)) { data.links = $('a').map(function(val) { return { text: S($(this).text()).trim().s, href: $(this).attr('href') }; }).get(); data.links = _.filter(data.links, (val) => { return val.text || val.href; }) } if (options.keywords && _.isArray(options.keywords)) { //data.keywords = exports.findKeywords(html, options.keywords) data.keywords = exports.findKeywords(body, options.keywords) } var element = exports.findRecipe(url, options); if (element) { data.id = exports.generateId(url, element.pattern) data.data = element.parse($) } else { data.id = 'id_not_specified' } if (options.fields && _.isArray(options.fields)) { data = _.pick(data, options.fields) } return data } /** * generate ID by url and pattern */ exports.findKeywords = function(html, keywords) { ////console.log(html.html()); var text = html.toLowerCase() return _.filter(keywords, function(val) { return text.indexOf(val) !== -1 }) } /** * generate ID by url and pattern */ exports.generateId = function(url, pattern) { return url.match(pattern).slice(1).join('_') } /** * find jquery recipe for URL */ exports.findRecipe = function(url, options) { var recipes = db if (options.recipes) { if (_.isArray(options.recipes)) { recipes = options.recipes } else { recipes = require(options.recipes) } } else if (options.recipe) { return options.recipe; } else if (options.parse && _.isFunction(options.parse)) { return { parse: options.parse } } var element = _.find(recipes, function(recipe) { var pattern = recipe.pattern if (url.match(pattern) !== null) { return true } else { return false } }) return element } /** * gets url to fb, youtube, google plus etc */ exports.getSocialUrls = function(url, $, social) { var links = $('a').map(function(val) { return { url: $(this).attr('href'), text: $(this).text() } }).get() var data = {} for (var i = 0 ; i < social.length ; ++i) { var element = _.find(links, function(o) { if (o.url) { return o.url.indexOf(social[i]) !== -1 && url.indexOf(social[i]) === -1; } }); if (element) { var social_name = social[i].split('.')[0] data[social_name] = element.url } } return data }