unfluff

_ = require("lodash")

# Entry point: runs every cleanup pass over `doc`, in a fixed order, and
# returns the same `doc` (the passes mutate it in place).
#
# doc - jQuery-style document function. The code relies on `doc(selector)`,
#       `doc(node)` and `doc.html(node)`, which looks like a cheerio document
#       (TODO confirm against callers).
module.exports = cleaner = (doc) ->
  removeBodyClasses(doc)
  cleanArticleTags(doc)
  cleanEmTags(doc)
  cleanCodeBlocks(doc)
  removeDropCaps(doc)
  removeScriptsStyles(doc)
  cleanBadTags(doc)
  removeNodesRegex(doc, /^caption$/)
  removeNodesRegex(doc, / google /)
  # NOTE(review): `[^entry-]` is a character class (one character that is not
  # e, n, t, r, y or '-'), not a "does not start with entry-" test. Left as is
  # because changing it would change which nodes get removed.
  removeNodesRegex(doc, /^[^entry-]more.*$/)
  removeNodesRegex(doc, /[^-]facebook/)
  removeNodesRegex(doc, /facebook-broadcasting/)
  removeNodesRegex(doc, /[^-]twitter/)
  cleanParaSpans(doc)
  cleanUnderlines(doc)
  cleanErrantLinebreaks(doc)
  divToPara(doc, 'div')
  divToPara(doc, 'span')
  return doc

# Strips the class attribute from <body>.
removeBodyClasses = (doc) ->
  doc("body").removeClass()

# Removes the id, name and class attributes from every <article> element.
cleanArticleTags = (doc) ->
  articles = doc("article")
  articles.each () ->
    doc(this).removeAttr('id')
    doc(this).removeAttr('name')
    doc(this).removeAttr('class')

# Unwraps every <em> that does not contain an image, replacing the element
# with its inner HTML. An <em> wrapping an <img> is left alone.
cleanEmTags = (doc) ->
  doc("em").each () ->
    em = doc(this)
    # Search only this <em>. Searching the whole <em> collection would let a
    # single image anywhere in the document block every unwrap.
    if em.find("img").length == 0
      em.replaceWith(em.html())

# Replaces code-like elements (highlight-* classes, <pre>, <code>,
# ul.task-list) with their plain text content.
cleanCodeBlocks = (doc) ->
  nodes = doc("[class*='highlight-'], pre code, code, pre, ul.task-list")
  nodes.each () ->
    doc(this).replaceWith(doc(this).text())

# Unwraps drop-cap spans, keeping their inner HTML.
removeDropCaps = (doc) ->
  nodes = doc("span[class~=dropcap], span[class~=drop_cap]")
  nodes.each () ->
    doc(this).replaceWith(doc(this).html())

# Removes all <script> and <style> elements and every comment node.
removeScriptsStyles = (doc) ->
  doc("script").remove()
  doc("style").remove()

  comments = doc('*').contents().filter () ->
    this.type == "comment"

  doc(comments).remove()

# Removes any element whose id, class or name attribute matches one of the
# known boilerplate patterns below (case-insensitive).
cleanBadTags = (doc) ->
  removeNodesRe = "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|partner-gravity-ad|video-full-transcript|storytopbar-bucket|utility-bar|inline-share-tools|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt|links|meta$|shoutbox|sponsor|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text|legende|ajoutVideo|timestamp|js_replies"
  re = new RegExp(removeNodesRe, "i")

  toRemove = doc('*').filter () ->
    doc(this).attr('id')?.match(re) || doc(this).attr('class')?.match(re) || doc(this).attr('name')?.match(re)

  doc(toRemove).remove()

# Removes every <div> whose id or class attribute matches `pattern`.
#
# pattern - RegExp tested against the raw attribute strings.
removeNodesRegex = (doc, pattern) ->
  toRemove = doc('div').filter () ->
    doc(this).attr('id')?.match(pattern) || doc(this).attr('class')?.match(pattern)

  doc(toRemove).remove()

# Unwraps every <span> nested inside a <p>, keeping its inner HTML.
cleanParaSpans = (doc) ->
  nodes = doc("p span")
  nodes.each () ->
    doc(this).replaceWith(doc(this).html())

# Unwraps every <u>, keeping its inner HTML.
cleanUnderlines = (doc) ->
  nodes = doc("u")
  nodes.each () ->
    doc(this).replaceWith(doc(this).html())

# Walks the direct children of `div` and builds the list of HTML fragments
# that divToPara wraps in <p> tags.
#
# Text nodes are accumulated (together with adjacent <a> siblings) into
# `replacementText`; other children contribute their inner HTML directly.
# Anchors folded into text are flagged with the `grv-usedalready` attribute
# so they are consumed only once, and are removed from the document at the
# end.
#
# Returns an array of HTML strings in output order.
getReplacementNodes = (doc, div) ->
  replacementText = []
  nodesToReturn = []
  nodesToRemove = []
  childs = div.contents()

  childs.each () ->
    kid = doc(this)

    # node is a p and we already have some replacement text: flush the
    # pending text first, then emit the paragraph's inner HTML
    if kid[0].name == 'p' && replacementText.length > 0
      nodesToReturn.push(replacementText.join(''))
      replacementText = []
      nodesToReturn.push(doc(kid).html())

    # node is a text node
    else if kid[0].type == 'text'
      kidTextNode = kid
      kidText = kid.text()
      replaceText = kidText.replace(/\n/g, "\n\n").replace(/\t/g, "").replace(/^\s+$/g, "")

      if replaceText.length > 1
        # Pull in the run of not-yet-used <a> siblings that precede the text.
        previousSiblingNode = kidTextNode.prev()

        while previousSiblingNode[0] && previousSiblingNode[0].name == "a" && previousSiblingNode.attr('grv-usedalready') != 'yes'
          outer = " " + doc.html(previousSiblingNode) + " "
          replacementText.push(outer)
          nodesToRemove.push(previousSiblingNode)
          previousSiblingNode.attr('grv-usedalready', 'yes')
          previousSiblingNode = previousSiblingNode.prev()

        replacementText.push(replaceText)

        # Pull in at most ONE following <a> sibling. This is deliberately a
        # single step rather than a walk.
        # NOTE(review): .next() appears to skip text nodes (confirm against
        # the cheerio docs), so walking further could hoist a later anchor
        # ahead of the text that separates it from this node.
        nextSiblingNode = kidTextNode.next()

        if nextSiblingNode[0] && nextSiblingNode[0].name == "a" && nextSiblingNode.attr('grv-usedalready') != 'yes'
          outer = " " + doc.html(nextSiblingNode) + " "
          replacementText.push(outer)
          nodesToRemove.push(nextSiblingNode)
          nextSiblingNode.attr('grv-usedalready', 'yes')

    # otherwise
    else
      nodesToReturn.push(doc(kid).html())

  # flush out anything still remaining
  if replacementText.length > 0
    nodesToReturn.push(replacementText.join(''))

  _.each nodesToRemove, (n) ->
    doc(n).remove()

  nodesToReturn

# Replaces `div` with a <p> holding the same inner HTML.
replaceWithPara = (doc, div) ->
  divContent = doc(div).html()
  doc(div).replaceWith("<p>#{divContent}</p>")

# Converts every `domType` element (e.g. 'div' or 'span') into paragraphs.
#
# An element with no block-ish descendants (see `tags`) becomes a single <p>.
# Otherwise its children are regrouped by getReplacementNodes and each
# non-empty fragment is wrapped in its own <p>.
divToPara = (doc, domType) ->
  divs = doc(domType)

  tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre', 'table', 'ul']
  # The selector is loop-invariant, so build it once.
  selector = tags.join(", ")

  divs.each () ->
    div = doc(this)

    items = div.find(selector)

    if items.length == 0
      replaceWithPara(doc, this)
    else
      replaceNodes = getReplacementNodes(doc, div)

      html = ""
      _.each replaceNodes, (node) ->
        if node != ''
          html += "<p>#{node}</p>"

      div.empty()
      doc(div).replaceWith("#{html}")

# For plain text nodes directly inside of p tags that contain random single
# line breaks, remove those junky line breaks. They would never be rendered
# by a browser anyway. Double line breaks are left untouched.
cleanErrantLinebreaks = (doc) ->
  doc("p").each () ->
    node = doc(this)
    c = node.contents()

    doc(c).each () ->
      n = doc(this)
      if n[0].type == 'text'
        n.replaceWith(n.text().replace(/([^\n])\n([^\n])/g, "$1 $2"))