unfluff

stopwords = require("./stopwords")
_ = require("lodash")
{XRegExp} = require('xregexp')

# Turn the extracted content node into plain text.
#
# doc      - wrapper function used as `doc(element)` to obtain a queryable
#            node (presumably a loaded cheerio document -- confirm with caller).
# topNode  - the wrapped root node of the article content; it is mutated in
#            place by the clean-up passes below.
# language - language code forwarded to the stopwords counter.
#
# Returns the formatted text, with chunks separated by blank lines.
module.exports = formatter = (doc, topNode, language) ->
  # The order matters: low scoring nodes are dropped first, then inline
  # markup is flattened, and only then are short paragraphs pruned.
  removeNegativescoresNodes(doc, topNode)
  linksToText(doc, topNode)
  addNewlineToBr(doc, topNode)
  replaceWithText(doc, topNode)
  removeFewwordsParagraphs(doc, topNode, language)
  return convertToText(doc, topNode)

# Replace every <a> below topNode with its inner HTML, i.e. unwrap the link
# but keep whatever it contained.
linksToText = (doc, topNode) ->
  nodes = topNode.find('a')

  nodes.each () ->
    doc(this).replaceWith(doc(this).html())

# Render the <li> descendants of node as a text bullet list: one
# "\n * <text>" entry per item, followed by a trailing newline.
ulToText = (doc, node) ->
  nodes = node.find('li')
  txt = ""

  nodes.each () ->
    txt = txt + "\n * #{doc(this).text()}"

  txt = txt + "\n"
  txt

# Replace simple inline formatting elements with their text content.
replaceWithText = (doc, topNode) ->
  nodes = topNode.find('b, strong, i, br, sup')

  nodes.each () ->
    doc(this).replaceWith(doc(this).text())

# Trim rawText and collapse each run of horizontal whitespace into a single
# space.
#
# Line breaks are deliberately left alone: convertToText splits the cleaned
# text on newlines, and those newlines are how <br> and list items (see
# addNewlineToBr / ulToText) become separate chunks. String#replace returns
# a new string, so its result has to be the value handed back.
cleanParagraphText = (rawText) ->
  rawText.trim().replace(/[^\S\r\n]+/g, ' ')

# Turn an html element (and children) into nicely formatted text
convertToText = (doc, topNode) ->
  txts = []
  nodes = topNode.contents()

  # To hold any text fragments that end up in text nodes outside of
  # html elements
  hangingText = ""

  nodes.each () ->
    node = doc(this)
    nodeType = node[0].type
    nodeName = node[0].name

    # Handle top level text nodes by adding them to a running list
    # and then treating all the hanging nodes as one paragraph tag
    if nodeType == "text"
      hangingText += node.text()
      # Same as 'continue'
      return true
    else if nodeName == "ul"
      hangingText += ulToText(doc, node)
      return true

    # If we hit a real node and still have extra accumulated text,
    # pop it out as if it was a paragraph tag
    if hangingText.length > 0
      txt = cleanParagraphText(hangingText)
      txts = txts.concat(txt.split(/\r?\n/))
      hangingText = ""

    txt = cleanParagraphText(node.text())
    # Put a space between a word ending in "." and capital letters that
    # follow it directly. The regex has no global flag, so only the first
    # such occurrence in each node is patched.
    txt = txt.replace(/(\w+\.)([A-Z]+)/, '$1 $2')
    txts = txts.concat(txt.split(/\r?\n/))

  # Catch any left-over hanging text nodes
  if hangingText.length > 0
    txt = cleanParagraphText(hangingText)
    txts = txts.concat(txt.split(/\r?\n/))

  txts = _.map txts, (txt) ->
    txt.trim()

  # Make sure each text chunk includes at least one text character or number.
  # This supports multiple languages words using XRegExp to generate the
  # regex that matches ranges of unicode characters used in words.
  regex = XRegExp('[\\p{Number}\\p{Letter}]')

  txts = _.filter txts, (txt) ->
    regex.test(txt)

  txts.join('\n\n')

# Replace every <br> below topNode with a blank line ("\n\n") so that the
# line break survives as a chunk boundary in convertToText.
addNewlineToBr = (doc, topNode) ->
  brs = topNode.find("br")

  brs.each () ->
    br = doc(this)
    br.replaceWith("\n\n")

# Remove nodes with a negative score because they are probably trash
#
# The score is read from the gravityScore attribute; a missing or
# non-numeric value counts as 0. Note that the threshold is "< 1", so nodes
# scoring exactly 0 are removed as well, not only negative ones.
removeNegativescoresNodes = (doc, topNode) ->
  gravityItems = topNode.find("*[gravityScore]")

  gravityItems.each () ->
    item = doc(this)
    # Explicit radix so the attribute is always read as a decimal number.
    score = parseInt(item.attr('gravityScore'), 10) || 0

    if score < 1
      item.remove()

# remove paragraphs that have less than x number of words,
# would indicate that it's some sort of link
#
# An element is dropped when its text has fewer than 3 stopwords (as counted
# by the stopwords module for the given language) and it contains no
# <object> or <embed>. Elements that survive that test are still dropped
# when their trimmed text is wrapped in parentheses.
removeFewwordsParagraphs = (doc, topNode, language) ->
  allNodes = topNode.find("*")

  allNodes.each () ->
    el = doc(this)
    tag = el[0].name
    text = el.text()

    stopWords = stopwords(text, language)
    # NOTE(review): the first clause is only false for a <br> whose text is
    # the two literal characters backslash + "r", so in practice it does not
    # filter anything -- confirm the intent before changing it.
    if (tag != 'br' || text != '\\r') && stopWords.stopwordCount < 3 && el.find("object").length == 0 && el.find("embed").length == 0
      el.remove()
    else
      trimmed = text.trim()
      if trimmed[0] == "(" && trimmed[trimmed.length - 1] == ")"
        el.remove()