UNPKG

article-parser

Version:

Extract clean article data from given URL.

561 lines (493 loc) 12.5 kB
/** * Article parser * @ndaidong **/ var bella = require('bellajs'); var Promise = require('promise-wtf'); var fetch = require('node-fetch'); var sanitize = require('sanitize-html'); var cheerio = require('cheerio'); var read = require('es6-readability'); var config = require('./config'); var Duration = require('./duration'); var urlResolver = require('./url-resolver'); var absolutify = urlResolver.absolutify; var purify = urlResolver.purify; var removeUTM = urlResolver.removeUTM; var getDomain = urlResolver.getDomain; var isValidURL = urlResolver.isValidURL; var isExceptDomain = urlResolver.isExceptDomain; var strtolower = (s) => { return s ? s.toLowerCase() : ''; }; var configure = (o) => { if (o.wordsPerMinute) { let wpm = Number(o.wordsPerMinute); if (bella.isNumber(wpm) && wpm > 100 && wpm < 1000) { config.wordsPerMinute = wpm; } } if (o.blackList) { let bl = o.blackList; if (bella.isArray(bl)) { config.blackList = bl; } } if (o.adsDomain) { let ad = o.adsDomain; if (bella.isArray(ad)) { config.adsDomain = ad; } } if (o.htmlRules) { let hr = o.htmlRules; if (bella.isObject(hr)) { if (hr.allowedTags && bella.isArray(hr.allowedTags)) { config.htmlRules.allowedTags = hr.allowedTags; } if (hr.allowedAttributes && bella.isObject(hr.allowedAttributes)) { config.htmlRules.allowedAttributes = hr.allowedAttributes; } } } if (o.SoundCloudKey) { config.SoundCloudKey = o.SoundCloudKey; } if (o.YouTubeKey) { config.YouTubeKey = o.YouTubeKey; } if (o.EmbedlyKey) { config.EmbedlyKey = o.EmbedlyKey; } }; var tracer = {}; var parseMeta = (html, url) => { let entry = { url, canonical: '', title: '', description: '', image: '', author: '', source: '' }; let sourceAttrs = [ 'application-name', 'og:site_name', 'dc.title' ]; let urlAttrs = [ 'og:url', 'twitter:url' ]; let titleAttrs = [ 'title', 'og:title', 'twitter:title' ]; let descriptionAttrs = [ 'description', 'og:description', 'twitter:description' ]; let imageAttrs = [ 'og:image', 'twitter:image', 'twitter:image:src' ]; let authorAttrs = [ 'author', 'creator', 'og:creator', 'og:article:author', 'twitter:creator', 'dc.creator' ]; let doc = cheerio.load(html, { lowerCaseTags: true, lowerCaseAttributeNames: true, recognizeSelfClosing: true }); entry.title = doc('title').text(); doc('link').each((i, link) => { let m = doc(link); let rel = m.attr('rel'); if (rel && rel === 'canonical') { let href = m.attr('href'); if (isValidURL(href)) { entry.canonical = href; } } }); doc('meta').each((i, meta) => { let m = doc(meta); let content = m.attr('content'); let property = strtolower(m.attr('property')); let name = strtolower(m.attr('name')); if (sourceAttrs.includes(property) || sourceAttrs.includes(name)) { entry.source = content; } if (urlAttrs.includes(property) || urlAttrs.includes(name)) { entry.url = content; } if (titleAttrs.includes(property) || titleAttrs.includes(name)) { entry.title = content; } if (descriptionAttrs.includes(property) || descriptionAttrs.includes(name)) { entry.description = content; } if (imageAttrs.includes(property) || imageAttrs.includes(name)) { entry.image = content; } if (authorAttrs.includes(property) || authorAttrs.includes(name)) { entry.author = content; } }); return entry; }; var absolutifyContentSrc = (s, url) => { let $ = cheerio.load(s, { normalizeWhitespace: true, decodeEntities: true }); $('a').each((i, elem) => { let href = $(elem).attr('href'); if (href) { $(elem).attr('href', absolutify(url, href)); } }); $('img').each((i, elem) => { let src = $(elem).attr('src'); if (src) { $(elem).attr('src', absolutify(url, src)); } }); return $.html(); }; var parseWithEmbedly = (url, key = '') => { return new Promise((resolve, reject) => { let u = encodeURIComponent(url); let k = key || config.EmbedlyKey || ''; let target = `http://api.embed.ly/1/extract?key=${k}&url=${u}&format=json`; return fetch(target).then((res) => { return res.json(); }).then((o) => { let author = ''; let authors = o.author || []; if (authors.length) { author = authors[0].name; } let image = ''; let images = o.images || []; if (images.length) { let maxw = 0; let maxh = 0; images.forEach((img) => { if (img.width > maxw && img.height > maxh) { image = img.url; maxw = img.width; maxh = img.height; } }); } return resolve({ url: o.url, title: o.title, description: o.description, author, source: o.provider_name || '', image, content: o.content }); }).catch((e) => { return reject(e); }); }); }; var getArticle = (html) => { return new Promise((resolve, reject) => { let content; Promise.series([ (next) => { if (content) { return next(); } let $ = cheerio.load(html); if (!$) { return next(); } let classes = [ '.post-content noscript', '.post-body', '.post-content', '.article-body', '.article-content', '.entry-inner', '.post', 'article' ]; for (let i = 0; i < classes.length; i++) { let c = $(classes[i]); if (c) { content = c.html(); if (content) { break; } } } return next(); }, (next) => { return read(html).then((a) => { if (a && a.content && !content) { content = a.content; } }).finally(next); }, (next) => { if (!content) { return next(); } let s = sanitize(content, config.htmlRules); let $ = cheerio.load(s, { normalizeWhitespace: true, decodeEntities: true }); $('a').attr('target', '_blank'); content = $.html(); return next(); } ]).then(() => { if (!content) { return reject(new Error('No article determined')); } return null; }).catch((err) => { return reject(err); }).finally(() => { return resolve(content); }); }); }; var extract = (url) => { return new Promise((resolve, reject) => { let error; url = removeUTM(url); let canonicals = [url]; let resURL; let bestURL; let html; let meta; let oemb; let article; let alias = ''; let title = ''; let description = ''; let image = ''; let author = ''; let source = ''; let content = ''; let domain = ''; let duration = 0; Promise.series([ (next) => { if (!isExceptDomain(url)) { return next(); } return parseWithEmbedly(url).then((a) => { resURL = a.url; title = a.title; description = a.description; author = a.author; source = a.source; content = a.content; canonicals.push(resURL); }).catch((e) => { tracer.embedlyError = e; }).finally(next); }, (next) => { if (resURL) { return next(); } return fetch(url).then((res) => { resURL = purify(res.url); if (resURL) { canonicals.push(resURL); } else { error = { code: '001', message: 'No URL or URL is in black list' }; } return res.text().then((s) => { html = s; next(); }); }).catch((e) => { next(e); }); }, (next) => { if (!resURL || !html) { return next(); } meta = parseMeta(html, resURL); if (!meta || !meta.title || !meta.url) { return next(); } canonicals.push(meta.url); if (meta.canonical) { canonicals.push(meta.canonical); } title = meta.title || ''; description = meta.description || ''; image = meta.image || ''; author = meta.author || ''; domain = domain.replace('www.', ''); if (!source) { source = meta.source || ''; } return next(); }, (next) => { let tmp = bella.stabilize(canonicals); canonicals = tmp.unique(); let curls = canonicals.filter((cano) => { if (!cano) { return false; } if (cano.startsWith('//')) { cano = 'http:' + cano; } cano = purify(cano); return isValidURL(cano); }); let tmpCurls = bella.stabilize(curls); canonicals = tmpCurls.unique(); bestURL = canonicals[canonicals.length - 1]; domain = getDomain(bestURL); if (!domain) { error = { code: '002', message: 'No domain determined' }; return next(); } domain = domain.replace('www.', ''); if (!source) { source = domain; } return next(); }, (next) => { if (!bestURL || !domain || !title) { return next(); } let t = bella.time(); alias = bella.createAlias(title) + '-' + t; let tit = bella.stripTags(title); title = bella.truncate(tit, 118); let desc = bella.stripTags(description); if (desc) { description = bella.truncate(desc, 156); } let auth = author; if (auth && auth.indexOf(' ') > 0) { author = bella.ucwords(auth); } article = { alias, url: bestURL, canonicals, title, description, image: absolutify(bestURL, image), content, author, source, domain, duration }; return next(); }, (next) => { if (oemb || !article) { return next(); } return getArticle(html).then((art) => { content = art; }).catch((er) => { error = er; }).finally(next); }, (next) => { if (!article || !content || oemb) { return next(); } let desc = article.description; if (!desc && content) { desc = bella.stripTags(content); article.description = bella.truncate(desc, 156); } return next(); }, (next) => { if (!article || !content || duration) { return next(); } article.content = absolutifyContentSrc(content, bestURL); if (Duration.isMovie(bestURL) || Duration.isAudio(bestURL)) { return Duration.estimate(bestURL).then((d) => { duration = d; return null; }).catch((e) => { error = e; }).finally(next); } return Duration.estimate(content).then((d) => { duration = d; return null; }).catch((e) => { error = e; }).finally(next); }, (next) => { if (!article || !content) { return next(); } article.duration = duration; return next(); } ]).then(() => { if (!article || !article.title || !article.domain || !article.duration) { error = { code: '003', message: 'Not enough info to build article', article }; } return null; }).catch((err) => { error = err; }).finally(() => { if (error) { return reject(new Error(error.message || 'Something wrong while extracting article')); } return resolve(article); }); }); }; module.exports = { configure, getConfig: () => { return bella.clone(config); }, extract, getArticle, getDomain, parseMeta, parseWithEmbedly, absolutify, purify };