unfluff
Version:
A web page content extractor
125 lines (99 loc) • 3.53 kB
text/coffeescript
stopwords = require("./stopwords")
_ = require("lodash")
{XRegExp} = require('xregexp')
# Public entry point: reduce the extracted content node to plain text.
#
# doc      - document query function; it is only forwarded to the helpers
# topNode  - the node whose content should be rendered
# language - forwarded to removeFewwordsParagraphs
#
# The clean-up passes run in a fixed order and modify the tree under topNode
# (they replace or remove nodes); the final text conversion is returned.
formatter = (doc, topNode, language) ->
  # Passes that only need the document and the top node, in required order.
  treePasses = [
    removeNegativescoresNodes
    linksToText
    addNewlineToBr
    replaceWithText
  ]
  for step in treePasses
    step(doc, topNode)

  removeFewwordsParagraphs(doc, topNode, language)
  convertToText(doc, topNode)

module.exports = formatter
# Unwrap every <a> element under topNode: each anchor is replaced by its own
# inner HTML, so the link content stays in the tree without the anchor tag.
#
# doc     - document query function used to wrap each matched element
# topNode - node whose 'a' descendants are processed
linksToText = (doc, topNode) ->
  topNode.find('a').each () ->
    anchor = doc(this)
    anchor.replaceWith(anchor.html())
# Render the <li> descendants of a list node as a plain-text bullet list.
#
# doc  - document query function used to wrap each matched element
# node - the list node; its 'li' descendants are read via find
#
# Returns a string with one "\n * <item text>" entry per item followed by a
# trailing newline (just "\n" when there are no items).
ulToText = (doc, node) ->
  bullets = []
  node.find('li').each () ->
    bullets.push("\n * #{doc(this).text()}")
  bullets.join("") + "\n"
# Flatten inline formatting: every b, strong, i, br and sup element under
# topNode is replaced by its own text content.
#
# doc     - document query function used to wrap each matched element
# topNode - node whose matching descendants are processed
replaceWithText = (doc, topNode) ->
  inlineElements = topNode.find('b, strong, i, br, sup')
  inlineElements.each () ->
    element = doc(this)
    element.replaceWith(element.text())
# Normalise the whitespace of a raw text fragment.
#
# rawText - string to clean
#
# Returns the text with leading/trailing whitespace removed and every run of
# horizontal whitespace (spaces, tabs, ...) collapsed to a single space.
#
# String::replace returns a new string, so its result has to be the return
# value; computing it and then returning the untouched input makes the
# collapse a no-op. Line breaks (\r, \n) are deliberately kept because
# convertToText splits the cleaned text on /\r?\n/ to separate paragraphs.
cleanParagraphText = (rawText) ->
  rawText.trim().replace(/[^\S\r\n]+/g, ' ')
# Turn an html element (and its children) into nicely formatted text.
#
# doc     - document query function used to wrap each child node
# topNode - element whose direct children (topNode.contents()) are rendered
#
# Returns the surviving text lines joined by blank lines ('\n\n').
convertToText = (doc, topNode) ->
  lines = []

  # Text gathered from top-level text nodes and <ul> lists that has not been
  # emitted yet; consecutive fragments are treated as one paragraph.
  pending = ""

  # Split a cleaned chunk on line breaks and append the pieces to the output.
  appendLines = (text) ->
    lines = lines.concat(text.split(/\r?\n/))

  # Emit the accumulated hanging text, if any, as if it had been a paragraph.
  flushPending = ->
    return unless pending.length > 0
    appendLines(cleanParagraphText(pending))
    pending = ""

  topNode.contents().each () ->
    child = doc(this)
    raw = child[0]
    kind = raw.type
    tag = raw.name

    if kind == "text"
      pending += child.text()
      # Same as 'continue'
      return true
    if tag == "ul"
      pending += ulToText(doc, child)
      return true

    # A real element: anything accumulated before it goes out first.
    flushPending()

    cleaned = cleanParagraphText(child.text())
    # Insert a space where a "word." is directly followed by capital letters.
    # The pattern is not global, so only the first occurrence is changed.
    appendLines(cleaned.replace(/(\w+\.)([A-Z]+)/, '$1 $2'))

  # Catch hanging text left over after the last child.
  flushPending()

  trimmed = _.map lines, (line) -> line.trim()

  # Keep only chunks containing at least one letter or number. XRegExp
  # supplies the unicode categories, so words in any script count.
  hasWordChar = XRegExp('[\\p{Number}\\p{Letter}]')
  kept = _.filter trimmed, (line) -> hasWordChar.test(line)

  kept.join('\n\n')
# Replace every <br> under topNode with two newline characters so that line
# breaks survive the later conversion to plain text.
#
# doc     - document query function used to wrap each matched element
# topNode - node whose 'br' descendants are processed
addNewlineToBr = (doc, topNode) ->
  topNode.find("br").each () ->
    doc(this).replaceWith("\n\n")
# Remove low-scoring nodes because they are probably trash.
#
# doc     - document query function used to wrap each matched element
# topNode - node whose descendants carrying a gravityScore attribute are checked
#
# A node is removed when its gravityScore is below 1, i.e. negative, zero, or
# not parseable as a number (an unparseable score counts as 0).
removeNegativescoresNodes = (doc, topNode) ->
  topNode.find("*[gravityScore]").each () ->
    item = doc(this)
    # Always parse as base 10 so the attribute text is never read in another
    # radix (e.g. a "0x" prefix being taken as hexadecimal).
    score = parseInt(item.attr('gravityScore'), 10) || 0
    # item is already wrapped, so it can be removed directly.
    item.remove() if score < 1
# Remove elements that contain too few stopwords, which would indicate that
# they are some sort of link or boilerplate rather than article text.
#
# doc      - document query function used to wrap each matched element
# topNode  - node whose descendants (every element under it) are checked
# language - passed through to stopwords() together with the element text
#
# An element is removed when stopwords() reports fewer than 3 stopwords for
# its text and it contains no <object> or <embed>. Elements that pass that
# check are still removed when their trimmed text is wrapped in parentheses.
removeFewwordsParagraphs = (doc, topNode, language) ->
  topNode.find("*").each () ->
    candidate = doc(this)
    tagName = candidate[0].name
    content = candidate.text()
    stats = stopwords(content, language)

    # NOTE(review): the only exemption is a <br> whose text is the two
    # characters backslash + "r" ('\\r'), not a carriage return; confirm
    # whether that literal is intended.
    exemptBr = tagName is 'br' and content is '\\r'

    if not exemptBr and stats.stopwordCount < 3 and candidate.find("object").length is 0 and candidate.find("embed").length is 0
      return doc(candidate).remove()

    stripped = content.trim()
    if stripped[0] is "(" and stripped[stripped.length - 1] is ")"
      doc(candidate).remove()