UNPKG

unfluff

Version:
210 lines (209 loc) 7.78 kB
// Generated by CoffeeScript 2.0.0-beta7 void function () { var _, cleanArticleTags, cleanBadTags, cleanCodeBlocks, cleanEmTags, cleaner, cleanErrantLinebreaks, cleanParaSpans, cleanUnderlines, divToPara, getReplacementNodes, removeBodyClasses, removeDropCaps, removeNodesRegex, removeScriptsStyles, replaceWithPara; _ = require('lodash'); module.exports = cleaner = function (doc) { removeBodyClasses(doc); cleanArticleTags(doc); cleanEmTags(doc); cleanCodeBlocks(doc); removeDropCaps(doc); removeScriptsStyles(doc); cleanBadTags(doc); removeNodesRegex(doc, /^caption$/); removeNodesRegex(doc, / google /); removeNodesRegex(doc, /^[^entry-]more.*$/); removeNodesRegex(doc, /[^-]facebook/); removeNodesRegex(doc, /facebook-broadcasting/); removeNodesRegex(doc, /[^-]twitter/); cleanParaSpans(doc); cleanUnderlines(doc); cleanErrantLinebreaks(doc); divToPara(doc, 'div'); divToPara(doc, 'span'); return doc; }; removeBodyClasses = function (doc) { return doc('body').removeClass(); }; cleanArticleTags = function (doc) { var articles; articles = doc('article'); return articles.each(function () { doc(this).removeAttr('id'); doc(this).removeAttr('name'); return doc(this).removeAttr('class'); }); }; cleanEmTags = function (doc) { var ems; ems = doc('em'); return ems.each(function () { var images; images = ems.find('img'); if (images.length === 0) return doc(this).replaceWith(doc(this).html()); }); }; cleanCodeBlocks = function (doc) { var nodes; nodes = doc("[class*='highlight-'], pre code, code, pre, ul.task-list"); return nodes.each(function () { return doc(this).replaceWith(doc(this).text()); }); }; removeDropCaps = function (doc) { var nodes; nodes = doc('span[class~=dropcap], span[class~=drop_cap]'); return nodes.each(function () { return doc(this).replaceWith(doc(this).html()); }); }; removeScriptsStyles = function (doc) { var comments; doc('script').remove(); doc('style').remove(); comments = doc('*').contents().filter(function () { return this.type === 'comment'; }); return doc(comments).remove(); }; cleanBadTags = function (doc) { var re, removeNodesRe, toRemove; removeNodesRe = '^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|partner-gravity-ad|video-full-transcript|storytopbar-bucket|utility-bar|inline-share-tools|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt|links|meta$|shoutbox|sponsor|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text|legende|ajoutVideo|timestamp|js_replies'; re = new RegExp(removeNodesRe, 'i'); toRemove = doc('*').filter(function () { var cache$, cache$1, cache$2; return (null != (cache$ = doc(this).attr('id')) ? cache$.match(re) : void 0) || (null != (cache$1 = doc(this).attr('class')) ? cache$1.match(re) : void 0) || (null != (cache$2 = doc(this).attr('name')) ? cache$2.match(re) : void 0); }); return doc(toRemove).remove(); }; removeNodesRegex = function (doc, pattern) { var toRemove; toRemove = doc('div').filter(function () { var cache$, cache$1; return (null != (cache$ = doc(this).attr('id')) ? cache$.match(pattern) : void 0) || (null != (cache$1 = doc(this).attr('class')) ? cache$1.match(pattern) : void 0); }); return doc(toRemove).remove(); }; cleanParaSpans = function (doc) { var nodes; nodes = doc('p span'); return nodes.each(function () { return doc(this).replaceWith(doc(this).html()); }); }; cleanUnderlines = function (doc) { var nodes; nodes = doc('u'); return nodes.each(function () { return doc(this).replaceWith(doc(this).html()); }); }; getReplacementNodes = function (doc, div) { var childs, nodesToRemove, nodesToReturn, replacementText, txt; replacementText = []; nodesToReturn = []; nodesToRemove = []; childs = div.contents(); childs.each(function () { var kid, kidText, kidTextNode, nextSiblingNode, outer, previousSiblingNode, replaceText, txt; kid = doc(this); if (kid[0].name === 'p' && replacementText.length > 0) { txt = replacementText.join(''); nodesToReturn.push(txt); replacementText = []; return nodesToReturn.push(doc(kid).html()); } else if (kid[0].type === 'text') { kidTextNode = kid; kidText = kid.text(); replaceText = kidText.replace(/\n/g, '\n\n').replace(/\t/g, '').replace(/^\s+$/g, ''); if (replaceText.length > 1) { previousSiblingNode = kidTextNode.prev(); while (previousSiblingNode[0] && previousSiblingNode[0].name === 'a' && previousSiblingNode.attr('grv-usedalready') !== 'yes') { outer = ' ' + doc.html(previousSiblingNode) + ' '; replacementText.push(outer); nodesToRemove.push(previousSiblingNode); previousSiblingNode.attr('grv-usedalready', 'yes'); previousSiblingNode = previousSiblingNode.prev(); } replacementText.push(replaceText); nextSiblingNode = kidTextNode.next(); return function (accum$) { while (nextSiblingNode[0] && nextSiblingNode[0].name === 'a' && nextSiblingNode.attr('grv-usedalready') !== 'yes') { outer = ' ' + doc.html(nextSiblingNode) + ' '; replacementText.push(outer); nodesToRemove.push(nextSiblingNode); nextSiblingNode.attr('grv-usedalready', 'yes'); accum$.push(previousSiblingNode = nextSiblingNode.next()); } return accum$; }.call(this, []); } } else { return nodesToReturn.push(doc(kid).html()); } }); if (replacementText.length > 0) { txt = replacementText.join(''); nodesToReturn.push(txt); replacementText = []; } _.each(nodesToRemove, function (n) { return doc(n).remove(); }); return nodesToReturn; }; replaceWithPara = function (doc, div) { var divContent; divContent = doc(div).html(); return doc(div).replaceWith('<p>' + divContent + '</p>'); }; divToPara = function (doc, domType) { var divs, lastCount, tags; divs = doc(domType); lastCount = divs.length + 1; tags = [ 'a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre', 'table', 'ul' ]; return divs.each(function () { var div, html, items, replaceNodes; div = doc(this); items = div.find(tags.join(', ')); if (items.length === 0) { return replaceWithPara(doc, this); } else { replaceNodes = getReplacementNodes(doc, div); html = ''; _.each(replaceNodes, function (node) { if (node !== '') return html += '<p>' + node + '</p>'; }); div.empty(); return doc(div).replaceWith('' + html); } }); }; cleanErrantLinebreaks = function (doc) { return doc('p').each(function () { var c, node; node = doc(this); c = node.contents(); return doc(c).each(function () { var n; n = doc(this); if (n[0].type === 'text') return n.replaceWith(n.text().replace(/([^\n])\n([^\n])/g, '$1 $2')); }); }); }; }.call(this);