// unfluff
// Version:
// A web page content extractor
// 210 lines (209 loc) • 7.78 kB
// JavaScript
// Generated by CoffeeScript 2.0.0-beta7
void function () {
var _, cleanArticleTags, cleanBadTags, cleanCodeBlocks, cleanEmTags, cleaner, cleanErrantLinebreaks, cleanParaSpans, cleanUnderlines, divToPara, getReplacementNodes, removeBodyClasses, removeDropCaps, removeNodesRegex, removeScriptsStyles, replaceWithPara;
_ = require('lodash');
module.exports = cleaner = function (doc) {
removeBodyClasses(doc);
cleanArticleTags(doc);
cleanEmTags(doc);
cleanCodeBlocks(doc);
removeDropCaps(doc);
removeScriptsStyles(doc);
cleanBadTags(doc);
removeNodesRegex(doc, /^caption$/);
removeNodesRegex(doc, / google /);
removeNodesRegex(doc, /^[^entry-]more.*$/);
removeNodesRegex(doc, /[^-]facebook/);
removeNodesRegex(doc, /facebook-broadcasting/);
removeNodesRegex(doc, /[^-]twitter/);
cleanParaSpans(doc);
cleanUnderlines(doc);
cleanErrantLinebreaks(doc);
divToPara(doc, 'div');
divToPara(doc, 'span');
return doc;
};
/**
 * Selects `body` and calls `removeClass()` on it with no arguments.
 *
 * @param {Function} doc - queryable document handle.
 * @returns {*} whatever `removeClass()` returns.
 */
removeBodyClasses = function (doc) {
  var body = doc('body');
  return body.removeClass();
};
/**
 * Removes the `id`, `name` and `class` attributes from every <article>.
 *
 * @param {Function} doc - queryable document handle.
 * @returns {*} the result of iterating the `article` selection with `each`.
 */
cleanArticleTags = function (doc) {
  var strippedAttrs = ['id', 'name', 'class'];

  return doc('article').each(function () {
    var article = this;
    var lastResult;

    strippedAttrs.forEach(function (attrName) {
      lastResult = doc(article).removeAttr(attrName);
    });

    // Same value the last removeAttr() call produced.
    return lastResult;
  });
};
/**
 * Unwraps every <em> that contains no <img>, replacing the element with its
 * own inner HTML. <em> elements that do contain an image are left untouched.
 *
 * @param {Function} doc - queryable document handle.
 * @returns {*} the result of iterating the `em` selection with `each`.
 */
cleanEmTags = function (doc) {
  return doc('em').each(function () {
    var em = doc(this);

    // Look for images inside THIS <em> only. Searching the whole `em`
    // selection would let a single image anywhere block every unwrap.
    if (em.find('img').length === 0) {
      return em.replaceWith(em.html());
    }
  });
};
/**
 * Replaces highlighted blocks, <pre>/<code> elements and task lists with
 * their text content.
 *
 * @param {Function} doc - queryable document handle.
 * @returns {*} the result of iterating the matched selection with `each`.
 */
cleanCodeBlocks = function (doc) {
  var selector = "[class*='highlight-'], pre code, code, pre, ul.task-list";

  var flatten = function () {
    var block = doc(this);
    return block.replaceWith(block.text());
  };

  return doc(selector).each(flatten);
};
/**
 * Unwraps drop-cap spans (class `dropcap` or `drop_cap`), leaving their inner
 * HTML in place of the span.
 *
 * @param {Function} doc - queryable document handle.
 * @returns {*} the result of iterating the matched selection with `each`.
 */
removeDropCaps = function (doc) {
  var dropCaps = doc('span[class~=dropcap], span[class~=drop_cap]');

  return dropCaps.each(function () {
    var span = doc(this);
    var inner = span.html();
    return span.replaceWith(inner);
  });
};
/**
 * Removes <script> elements, <style> elements and comment nodes.
 *
 * @param {Function} doc - queryable document handle.
 * @returns {*} whatever `remove()` returns for the comment-node selection.
 */
removeScriptsStyles = function (doc) {
  var isComment = function () {
    return this.type === 'comment';
  };
  var commentNodes;

  ['script', 'style'].forEach(function (tagName) {
    doc(tagName).remove();
  });

  // Comments are collected from the child nodes of every element.
  commentNodes = doc('*').contents().filter(isComment);
  return doc(commentNodes).remove();
};
/**
 * Removes every element whose `id`, `class` or `name` attribute matches the
 * built-in list of boilerplate markers (case-insensitive).
 *
 * @param {Function} doc - queryable document handle.
 * @returns {*} whatever `remove()` returns for the matched selection.
 */
cleanBadTags = function (doc) {
  var removeNodesRe = '^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|partner-gravity-ad|video-full-transcript|storytopbar-bucket|utility-bar|inline-share-tools|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt|links|meta$|shoutbox|sponsor|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text|legende|ajoutVideo|timestamp|js_replies';
  var markerRe = new RegExp(removeNodesRe, 'i');
  // Checked in this order; the first matching attribute decides.
  var inspectedAttrs = ['id', 'class', 'name'];

  var flagged = doc('*').filter(function () {
    var element = this;
    return inspectedAttrs.some(function (attrName) {
      var value = doc(element).attr(attrName);
      return value != null && Boolean(value.match(markerRe));
    });
  });

  return doc(flagged).remove();
};
/**
 * Removes every <div> whose `id` or `class` attribute matches `pattern`.
 *
 * @param {Function} doc - queryable document handle.
 * @param {RegExp} pattern - pattern handed to `String.prototype.match`.
 * @returns {*} whatever `remove()` returns for the matched selection.
 */
removeNodesRegex = function (doc, pattern) {
  // A missing attribute never matches.
  var matchAttr = function (value) {
    return value != null ? value.match(pattern) : undefined;
  };

  var matchingDivs = doc('div').filter(function () {
    // `class` is only consulted when `id` did not match.
    return matchAttr(doc(this).attr('id')) || matchAttr(doc(this).attr('class'));
  });

  return doc(matchingDivs).remove();
};
/**
 * Unwraps every <span> nested inside a <p>, leaving the span's inner HTML in
 * its place.
 *
 * @param {Function} doc - queryable document handle.
 * @returns {*} the result of iterating the `p span` selection with `each`.
 */
cleanParaSpans = function (doc) {
  function unwrap() {
    var span = doc(this);
    return span.replaceWith(span.html());
  }

  return doc('p span').each(unwrap);
};
/**
 * Unwraps every <u>, leaving the element's inner HTML in its place.
 *
 * @param {Function} doc - queryable document handle.
 * @returns {*} the result of iterating the `u` selection with `each`.
 */
cleanUnderlines = function (doc) {
  var underlined = doc('u');

  var unwrapOne = function () {
    var inner = doc(this).html();
    return doc(this).replaceWith(inner);
  };

  return underlined.each(unwrapOne);
};
/**
 * Turns the child nodes of `div` into a list of HTML/text fragments.
 *
 * - Text nodes are normalised (each newline doubled, tabs dropped,
 *   whitespace-only text emptied) and, when more than one character remains,
 *   appended to a pending text run together with the outer HTML of the <a>
 *   siblings directly before and after them.
 * - A <p> flushes the pending run (if any) as one fragment and then
 *   contributes its own inner HTML.
 * - Any other node contributes its inner HTML immediately. The pending run is
 *   NOT flushed first, so such a fragment can precede text that came earlier.
 *
 * Links absorbed into a text run are tagged with `grv-usedalready="yes"` so
 * they are absorbed at most once, and are removed from the document before
 * returning.
 *
 * @param {Function} doc - queryable document handle; `doc.html(node)` is used
 *   to obtain a link's outer HTML.
 * @param {Object} div - wrapped element whose `contents()` are processed.
 * @returns {string[]} fragments in emission order (may contain empty strings).
 */
getReplacementNodes = function (doc, div) {
  var replacementText = [];
  var nodesToReturn = [];
  var nodesToRemove = [];

  // Emits the pending text run as a single fragment and starts a new run.
  var flushPendingText = function () {
    if (replacementText.length > 0) {
      nodesToReturn.push(replacementText.join(''));
      replacementText = [];
    }
  };

  // Absorbs consecutive, not-yet-used <a> siblings into the pending run,
  // starting at `firstLink` and moving with `step` ('prev' or 'next').
  var absorbLinks = function (firstLink, step) {
    var link = firstLink;
    while (link[0] && link[0].name === 'a' && link.attr('grv-usedalready') !== 'yes') {
      replacementText.push(' ' + doc.html(link) + ' ');
      nodesToRemove.push(link);
      link.attr('grv-usedalready', 'yes');
      // Advance the cursor that the loop condition tests, so that a run of
      // several adjacent links is absorbed, not only the first one.
      link = link[step]();
    }
  };

  div.contents().each(function () {
    var kid = doc(this);
    var replaceText;

    if (kid[0].name === 'p' && replacementText.length > 0) {
      flushPendingText();
      nodesToReturn.push(doc(kid).html());
      return;
    }

    if (kid[0].type !== 'text') {
      nodesToReturn.push(doc(kid).html());
      return;
    }

    replaceText = kid.text().replace(/\n/g, '\n\n').replace(/\t/g, '').replace(/^\s+$/g, '');
    // Text of zero or one character is ignored.
    if (replaceText.length > 1) {
      absorbLinks(kid.prev(), 'prev');
      replacementText.push(replaceText);
      absorbLinks(kid.next(), 'next');
    }
  });

  flushPendingText();

  nodesToRemove.forEach(function (link) {
    doc(link).remove();
  });

  return nodesToReturn;
};
/**
 * Replaces `div` with a <p> that wraps the element's inner HTML.
 *
 * @param {Function} doc - queryable document handle.
 * @param {*} div - element (or anything `doc` accepts) to replace.
 * @returns {*} whatever `replaceWith()` returns.
 */
replaceWithPara = function (doc, div) {
  var target = doc(div);
  var paragraph = '<p>' + target.html() + '</p>';
  return target.replaceWith(paragraph);
};
/**
 * Converts every `domType` element into paragraph markup.
 *
 * An element with no link, image, list, table or block-level descendant is
 * wrapped as a single <p> via `replaceWithPara`. Otherwise its children are
 * split into fragments by `getReplacementNodes`, and each non-empty fragment
 * becomes its own <p>.
 *
 * @param {Function} doc - queryable document handle.
 * @param {string} domType - selector of the elements to convert.
 * @returns {*} the result of iterating the `domType` selection with `each`.
 */
divToPara = function (doc, domType) {
  // Any of these descendants means the element holds structured content.
  var structuralSelector = [
    'a',
    'blockquote',
    'dl',
    'div',
    'img',
    'ol',
    'p',
    'pre',
    'table',
    'ul'
  ].join(', ');

  return doc(domType).each(function () {
    var div = doc(this);
    var fragments;
    var html;
    var i;

    if (div.find(structuralSelector).length === 0) {
      return replaceWithPara(doc, this);
    }

    fragments = getReplacementNodes(doc, div);
    html = '';
    for (i = 0; i < fragments.length; i += 1) {
      if (fragments[i] !== '') {
        html += '<p>' + fragments[i] + '</p>';
      }
    }

    div.empty();
    return doc(div).replaceWith('' + html);
  });
};
/**
 * Inside every <p>, rewrites each direct text-node child so that a single
 * newline between two non-newline characters becomes a space.
 *
 * @param {Function} doc - queryable document handle.
 * @returns {*} the result of iterating the `p` selection with `each`.
 */
cleanErrantLinebreaks = function (doc) {
  // Runs with `this` bound to one child node of a paragraph.
  var joinBrokenLine = function () {
    var child = doc(this);

    if (child[0].type !== 'text') {
      return;
    }
    return child.replaceWith(child.text().replace(/([^\n])\n([^\n])/g, '$1 $2'));
  };

  return doc('p').each(function () {
    var children = doc(this).contents();
    return doc(children).each(joinBrokenLine);
  });
};
}.call(this);