UNPKG

unfluff

Version:

A web page content extractor

github.com/ageitgey/node-unfluff

ageitgey/node-unfluff

123 lines (122 loc) • 3.88 kB

JavaScript

// Generated by CoffeeScript 2.0.0-beta7
//
// Formatter: turns the DOM subtree chosen as the main content node into
// plain text, one paragraph per line group, separated by blank lines.
void function () {
  var stopwords = require('./stopwords');
  var _ = require('lodash');
  var XRegExp = require('xregexp').XRegExp;

  /**
   * Entry point. Mutates the subtree under `topNode` in place (removing and
   * flattening nodes) and then serialises what is left to text.
   *
   * @param {Function} doc - Selector/wrapper function; called as `doc(element)`
   *   to wrap raw elements (presumably a cheerio instance - confirm with caller).
   * @param {Object} topNode - Wrapped root node; must support `.find()` and
   *   `.contents()`.
   * @param {string} language - Passed through unchanged to `stopwords()`.
   * @returns {string} Paragraphs joined with a blank line (`'\n\n'`).
   */
  var formatter = function (doc, topNode, language) {
    // Order matters: each step rewrites the tree the next step reads.
    removeNegativescoresNodes(doc, topNode);
    linksToText(doc, topNode);
    addNewlineToBr(doc, topNode);
    replaceWithText(doc, topNode);
    removeFewwordsParagraphs(doc, topNode, language);
    return convertToText(doc, topNode);
  };

  module.exports = formatter;

  /**
   * Replaces every `<a>` under `topNode` with its inner HTML, i.e. unwraps
   * the link while keeping its children.
   */
  var linksToText = function (doc, topNode) {
    return topNode.find('a').each(function () {
      var link = doc(this);
      link.replaceWith(link.html());
    });
  };

  /**
   * Renders the `<li>` descendants of `node` as a text bullet list.
   *
   * @returns {string} One `'\n * <text>'` entry per item, followed by a
   *   trailing newline.
   */
  var ulToText = function (doc, node) {
    var txt = '';
    node.find('li').each(function () {
      txt += '\n * ' + doc(this).text();
    });
    return txt + '\n';
  };

  /**
   * Replaces inline formatting elements (`b, strong, i, br, sup`) under
   * `topNode` with their text content.
   */
  var replaceWithText = function (doc, topNode) {
    return topNode.find('b, strong, i, br, sup').each(function () {
      var el = doc(this);
      el.replaceWith(el.text());
    });
  };

  /**
   * Trims leading/trailing whitespace from a chunk of paragraph text.
   *
   * The previous version also called `txt.replace(/[\s\t]+/g, ' ')` but threw
   * the result away (strings are immutable), so trimming was the only effect.
   * That dead statement is removed rather than "fixed": collapsing all
   * whitespace here would also erase the newlines that convertToText splits
   * on, changing the extracted output.
   *
   * @param {string} rawText
   * @returns {string}
   */
  var cleanParagraphText = function (rawText) {
    return rawText.trim();
  };

  /**
   * Serialises the direct children of `topNode` to text.
   *
   * Consecutive text nodes and `<ul>` lists are accumulated and emitted
   * together; any other child is emitted from its own text. Every chunk is
   * split on newlines, trimmed, and dropped unless it contains at least one
   * letter or number.
   *
   * @returns {string} Remaining lines joined with `'\n\n'`.
   */
  var convertToText = function (doc, topNode) {
    var txts = [];
    var hangingText = '';

    // Emits whatever loose text has been accumulated so far, then resets it.
    var flushHangingText = function () {
      if (hangingText.length > 0) {
        txts = txts.concat(cleanParagraphText(hangingText).split(/\r?\n/));
        hangingText = '';
      }
    };

    topNode.contents().each(function () {
      var node = doc(this);
      var nodeType = node[0].type;
      var nodeName = node[0].name;

      if (nodeType === 'text') {
        hangingText += node.text();
        return true;
      }
      if (nodeName === 'ul') {
        hangingText += ulToText(doc, node);
        return true;
      }

      // Any other child ends the current run of loose text.
      flushHangingText();

      var txt = cleanParagraphText(node.text());
      // Inserts a space after a word-final '.' that is directly followed by
      // capitals. The regex has no `g` flag, so only the first match in the
      // chunk is rewritten.
      // NOTE(review): unclear whether first-match-only is intended.
      txt = txt.replace(/(\w+\.)([A-Z]+)/, '$1 $2');
      txts = txts.concat(txt.split(/\r?\n/));
      return true;
    });

    flushHangingText();

    txts = _.map(txts, function (line) {
      return line.trim();
    });

    // Keep only lines with at least one Unicode letter or number.
    var hasContent = XRegExp('[\\p{Number}\\p{Letter}]');
    txts = _.filter(txts, function (line) {
      return hasContent.test(line);
    });

    return txts.join('\n\n');
  };

  /**
   * Replaces every `<br>` under `topNode` with two newline characters.
   */
  var addNewlineToBr = function (doc, topNode) {
    return topNode.find('br').each(function () {
      doc(this).replaceWith('\n\n');
    });
  };

  /**
   * Removes every element under `topNode` that carries a `gravityScore`
   * attribute whose integer value is below 1. Missing or non-numeric values
   * count as 0 and are therefore removed as well.
   */
  var removeNegativescoresNodes = function (doc, topNode) {
    return topNode.find('*[gravityScore]').each(function () {
      var item = doc(this);
      // Explicit radix so the attribute is always read as decimal.
      var score = parseInt(item.attr('gravityScore'), 10) || 0;
      if (score < 1) {
        item.remove();
      }
    });
  };

  /**
   * Removes descendants of `topNode` that look like boilerplate:
   *  - elements whose text has fewer than 3 stopwords (per `stopwords()`) and
   *    that contain no `<object>` or `<embed>`; and
   *  - otherwise, elements whose trimmed text is wrapped in parentheses.
   *
   * @param {string} language - Forwarded to `stopwords(text, language)`.
   */
  var removeFewwordsParagraphs = function (doc, topNode, language) {
    return topNode.find('*').each(function () {
      var el = doc(this);
      var tag = el[0].name;
      var text = el.text();
      var stopWords = stopwords(text, language);

      // NOTE(review): '\\r' is the two-character string backslash + 'r', not a
      // carriage return, so this clause is false only for a <br> whose text is
      // literally "\r". Kept as-is because the intended check is unclear.
      var isNotBareBreak = tag !== 'br' || text !== '\\r';
      var hasMedia = el.find('object').length !== 0 || el.find('embed').length !== 0;

      if (isNotBareBreak && stopWords.stopwordCount < 3 && !hasMedia) {
        el.remove();
        return;
      }

      var trimmed = text.trim();
      if (trimmed[0] === '(' && trimmed[trimmed.length - 1] === ')') {
        el.remove();
      }
    });
  };
}.call(this);