unfluff
Version:
A web page content extractor
184 lines (144 loc) • 5.75 kB
text/coffeescript
_ = require("lodash")
# Entry point of the module: runs every cleaning pass over the parsed
# document, in a fixed order, and hands the same document back.
#
# doc - the document function (called with a selector or a node).
#
# Returns the `doc` that was passed in, after all passes have run on it.
module.exports = cleaner = (doc) ->
  # Tag- and attribute-level cleanup. Order matters: later passes see the
  # output of earlier ones.
  structuralPasses = [
    removeBodyClasses
    cleanArticleTags
    cleanEmTags
    cleanCodeBlocks
    removeDropCaps
    removeScriptsStyles
    cleanBadTags
  ]
  pass(doc) for pass in structuralPasses

  # Divs whose id or class matches any of these patterns are dropped.
  junkDivPatterns = [
    /^caption$/
    / google /
    /^[^entry-]more.*$/
    /[^-]facebook/
    /facebook-broadcasting/
    /[^-]twitter/
  ]
  removeNodesRegex(doc, pattern) for pattern in junkDivPatterns

  # Inline cleanup, then conversion of leftover containers to paragraphs.
  cleanParaSpans(doc)
  cleanUnderlines(doc)
  cleanErrantLinebreaks(doc)
  divToPara(doc, containerTag) for containerTag in ['div', 'span']

  return doc
# Calls removeClass() with no arguments on the <body> selection.
#
# doc - the document function.
#
# Returns whatever removeClass() returns.
removeBodyClasses = (doc) ->
  body = doc("body")
  body.removeClass()
# Removes the id, name and class attributes (in that order) from every
# <article> element.
#
# doc - the document function.
cleanArticleTags = (doc) ->
  doc("article").each () ->
    article = doc(this)
    article.removeAttr('id')
    article.removeAttr('name')
    article.removeAttr('class')
# Unwraps <em> elements: each <em> that contains no <img> is replaced by its
# own inner HTML. An <em> that wraps an image is left in place.
#
# doc - the document function.
#
# The image check is made per element. Searching the whole <em> collection
# instead would make a single image-bearing <em> anywhere in the document
# prevent every other <em> from being unwrapped.
cleanEmTags = (doc) ->
  doc("em").each () ->
    em = doc(this)
    # Only images nested inside *this* <em> count.
    if em.find("img").length == 0
      em.replaceWith(em.html())
# Replaces every code-like element (highlight-* classes, <pre>, <code>,
# and ul.task-list) with its text content.
#
# doc - the document function.
cleanCodeBlocks = (doc) ->
  codeSelector = "[class*='highlight-'], pre code, code, pre, ul.task-list"
  doc(codeSelector).each () ->
    block = doc(this)
    block.replaceWith(block.text())
# Unwraps drop-cap spans (class token `dropcap` or `drop_cap`): each one is
# replaced by its inner HTML.
#
# doc - the document function.
removeDropCaps = (doc) ->
  dropCapSelector = "span[class~=dropcap], span[class~=drop_cap]"
  doc(dropCapSelector).each () ->
    span = doc(this)
    span.replaceWith(span.html())
# Removes all <script> and <style> elements, then every node whose `type`
# is "comment" among the contents of all elements.
#
# doc - the document function.
#
# Returns the result of the final remove() call.
removeScriptsStyles = (doc) ->
  for tagName in ["script", "style"]
    doc(tagName).remove()

  isComment = () -> this.type == "comment"
  commentNodes = doc('*').contents().filter(isComment)
  doc(commentNodes).remove()
# Removes every element whose id, class or name attribute matches the
# (case-insensitive) list of boilerplate markers below.
#
# doc - the document function.
#
# Returns the result of the final remove() call.
cleanBadTags = (doc) ->
  removeNodesRe = "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar|partner-gravity-ad|video-full-transcript|storytopbar-bucket|utility-bar|inline-share-tools|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt|links|meta$|shoutbox|sponsor|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|date|^print$|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text|legende|ajoutVideo|timestamp|js_replies"
  re = new RegExp(removeNodesRe, "i")

  # Checked in this order; the first attribute that matches decides.
  checkedAttrs = ['id', 'class', 'name']

  toRemove = doc('*').filter () ->
    node = doc(this)
    hit = undefined
    for attrName in checkedAttrs
      hit = node.attr(attrName)?.match(re)
      break if hit
    hit

  doc(toRemove).remove()
# Removes every <div> whose id or class attribute matches `pattern`.
#
# doc     - the document function.
# pattern - a RegExp (anything accepted by String::match).
#
# Returns the result of the final remove() call.
removeNodesRegex = (doc, pattern) ->
  matchesPattern = (value) -> value?.match(pattern)

  doomed = doc('div').filter () ->
    el = doc(this)
    matchesPattern(el.attr('id')) || matchesPattern(el.attr('class'))

  doc(doomed).remove()
# Unwraps every <span> found inside a <p>: the span is replaced by its
# inner HTML.
#
# doc - the document function.
cleanParaSpans = (doc) ->
  doc("p span").each () ->
    span = doc(this)
    innerMarkup = span.html()
    span.replaceWith(innerMarkup)
# Unwraps every <u> element, keeping its inner HTML in its place.
#
# doc - the document function.
cleanUnderlines = (doc) ->
  unwrap = () ->
    underlined = doc(this)
    underlined.replaceWith(underlined.html())

  doc("u").each(unwrap)
# Walks the direct contents of `div` and builds a list of HTML/text
# fragments from them.
#
# doc - the document function.
# div - a wrapped element; only `div.contents()` is read from it.
#
# Loose text nodes are accumulated (together with neighbouring <a>
# elements) in `replacementText`; that buffer is flushed into the result as
# one joined string when a <p> child is reached and again at the end.
# Anchors merged into a text run are tagged with the attribute
# `grv-usedalready="yes"` so they are not merged twice, and are removed
# from the document before returning.
#
# Returns an Array of strings (values returned by .html() are pushed as-is).
getReplacementNodes = (doc, div) ->
  replacementText = []
  nodesToReturn = []
  nodesToRemove = []
  childs = div.contents()

  childs.each () ->
    kid = doc(this)

    # node is a p
    # and already have some replacement text:
    # flush the buffered text first, then emit the paragraph's inner HTML
    if kid[0].name == 'p' && replacementText.length > 0
      txt = replacementText.join('')
      nodesToReturn.push(txt)
      replacementText = []
      nodesToReturn.push(doc(kid).html())

    # node is a text node
    else if kid[0].type == 'text'
      kidTextNode = kid
      kidText = kid.text()
      # double every newline, drop tabs, and blank out whitespace-only text
      replaceText = kidText.replace(/\n/g, "\n\n").replace(/\t/g, "").replace(/^\s+$/g, "")

      # text of length 0 or 1 after normalisation is ignored
      if(replaceText.length) > 1
        previousSiblingNode = kidTextNode.prev()

        # pull in not-yet-used <a> siblings that precede the text
        while previousSiblingNode[0] && previousSiblingNode[0].name == "a" && previousSiblingNode.attr('grv-usedalready') != 'yes'
          outer = " " + doc.html(previousSiblingNode) + " "
          replacementText.push(outer)
          nodesToRemove.push(previousSiblingNode)
          previousSiblingNode.attr('grv-usedalready', 'yes')
          previousSiblingNode = previousSiblingNode.prev()

        replacementText.push(replaceText)

        nextSiblingNode = kidTextNode.next()

        # pull in a not-yet-used <a> sibling that follows the text
        while nextSiblingNode[0] && nextSiblingNode[0].name == "a" && nextSiblingNode.attr('grv-usedalready') != 'yes'
          outer = " " + doc.html(nextSiblingNode) + " "
          replacementText.push(outer)
          nodesToRemove.push(nextSiblingNode)
          nextSiblingNode.attr('grv-usedalready', 'yes')
          # NOTE(review): this assigns previousSiblingNode, not
          # nextSiblingNode, so nextSiblingNode never advances. The loop
          # still ends, because the node was just marked 'yes', but it
          # runs at most once. Looks like a typo; confirm the intended
          # behaviour before changing it, since advancing would merge
          # further anchors into this text run.
          previousSiblingNode = nextSiblingNode.next()

    # otherwise: any other child contributes its inner HTML directly
    else
      nodesToReturn.push(doc(kid).html())

  # flush out anything still remaining
  if replacementText.length > 0
    txt = replacementText.join('')
    nodesToReturn.push(txt)
    replacementText = []

  # anchors merged into text runs above are dropped from the document
  _.each nodesToRemove, (n) ->
    doc(n).remove()

  nodesToReturn
# Replaces `div` with a <p> element holding the same inner HTML.
#
# doc - the document function.
# div - the node (or anything `doc` accepts) to convert.
#
# Returns the result of replaceWith().
replaceWithPara = (doc, div) ->
  innerMarkup = doc(div).html()
  paragraph = "<p>#{innerMarkup}</p>"
  doc(div).replaceWith(paragraph)
# Converts every `domType` element into paragraph markup.
#
# doc     - the document function.
# domType - selector for the container elements to convert (the visible
#           callers pass 'div' and 'span').
#
# An element with none of the listed tags among its descendants is wrapped
# as a single <p> via replaceWithPara. Any other element is emptied and
# replaced by the fragments from getReplacementNodes, each non-empty
# fragment wrapped in its own <p>.
divToPara = (doc, domType) ->
  # The selector is the same for every element, so build it once.
  contentSelector = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre', 'table', 'ul'].join(", ")

  doc(domType).each () ->
    div = doc(this)

    if div.find(contentSelector).length == 0
      replaceWithPara(doc, this)
    else
      html = ""
      # Empty-string fragments would only produce empty paragraphs.
      for node in getReplacementNodes(doc, div) when node != ''
        html += "<p>#{node}</p>"

      div.empty()
      doc(div).replaceWith("#{html}")
# For plain text nodes directly inside of p tags that contain random single
# line breaks, replace those junky line breaks with a space. They would never
# be rendered by a browser anyway. Runs of two or more newlines are left
# untouched.
#
# doc - the document function.
cleanErrantLinebreaks = (doc) ->
  # A newline with a non-newline character on both sides. The character
  # after the break is only looked at (lookahead), not consumed, so that it
  # can also serve as the character *before* the next break: "a\nb\nc" must
  # become "a b c", not "a b\nc".
  singleBreak = /([^\n])\n(?=[^\n])/g

  doc("p").each () ->
    children = doc(this).contents()
    doc(children).each () ->
      child = doc(this)
      if child[0].type == 'text'
        child.replaceWith(child.text().replace(singleBreak, "$1 "))