article-parser
Version:
Extract clean article data from a given URL.
47 lines (37 loc) • 1.03 kB
JavaScript
// utils -> standalizeArticle
var cheerio = require('cheerio');
var sanitize = require('sanitize-html');
var config = require('../config');
var contentOnlyRule = config.article.htmlRules;
var getTimeToRead = require('./getTimeToRead');
var absolutifyURL = require('./absolutifyURL');
/**
 * Normalize the HTML content of an extracted article, in place.
 *
 * When `input.content` is truthy, the markup is loaded into cheerio and:
 *   - every `<a>` with a truthy `href` gets that `href` replaced by
 *     `absolutifyURL(input.url, href)` and its `target` set to `_blank`;
 *   - every `<img>` with a truthy `src` gets that `src` replaced by
 *     `absolutifyURL(input.url, src)`;
 *   - the serialized document is passed through `sanitize` with
 *     `contentOnlyRule`, and the result is stored in `input.content`;
 *   - `input.duration` is set to `getTimeToRead()` of the sanitized markup.
 *
 * When `input.content` is falsy, nothing is touched.
 *
 * NOTE(review): the caller's object is mutated rather than copied; callers
 * presumably rely on that - confirm before changing.
 * NOTE(review): links get target="_blank" without rel="noopener"; whether a
 * `rel` attribute would survive sanitizing depends on `contentOnlyRule`,
 * which is not visible here - confirm.
 *
 * @param {Object} input - article record; `content` (HTML string) and `url`
 *   are the only properties read here.
 * @returns {Object} the very same `input` object.
 */
var standalize = (input) => {
  const { content: html, url } = input;

  // Without markup there is nothing to rewrite or measure.
  if (!html) {
    return input;
  }

  const $ = cheerio.load(html, {
    normalizeWhitespace: true,
    decodeEntities: true
  });

  // Anchors: rewrite the href through absolutifyURL and open in a new tab.
  $('a').each((index, node) => {
    const anchor = $(node);
    const href = anchor.attr('href');
    if (!href) {
      return;
    }
    anchor.attr('href', absolutifyURL(url, href));
    anchor.attr('target', '_blank');
  });

  // Images: rewrite the src through absolutifyURL.
  $('img').each((index, node) => {
    const image = $(node);
    const src = image.attr('src');
    if (!src) {
      return;
    }
    image.attr('src', absolutifyURL(url, src));
  });

  // Sanitize after the rewrites so the stored content and the reading-time
  // estimate are both based on the final markup.
  const cleaned = sanitize($.html(), contentOnlyRule);
  input.content = cleaned;
  input.duration = getTimeToRead(cleaned);

  return input;
};
// CommonJS export: the module's value is the `standalize` function itself.
module.exports = standalize;