// unfluff
// Version:
// A web page content extractor
// 123 lines (122 loc) • 3.88 kB
// JavaScript
// Generated by CoffeeScript 2.0.0-beta7
void function () {
var _, addNewlineToBr, cleanParagraphText, convertToText, formatter, linksToText, removeFewwordsParagraphs, removeNegativescoresNodes, replaceWithText, stopwords, ulToText, XRegExp;
stopwords = require('./stopwords');
_ = require('lodash');
XRegExp = require('xregexp').XRegExp;
module.exports = formatter = function (doc, topNode, language) {
removeNegativescoresNodes(doc, topNode);
linksToText(doc, topNode);
addNewlineToBr(doc, topNode);
replaceWithText(doc, topNode);
removeFewwordsParagraphs(doc, topNode, language);
return convertToText(doc, topNode);
};
/**
 * Unwraps links: every `a` element found under `topNode` is replaced by
 * its own inner HTML.
 *
 * @param {Function} doc - Wraps a raw element, called as `doc(element)`.
 * @param {Object} topNode - Wrapped node that is searched for `a` elements.
 * @returns {*} Whatever `each` returns for the matched set.
 */
linksToText = function (doc, topNode) {
  var anchors = topNode.find('a');
  var unwrapAnchor = function () {
    var anchor = doc(this);
    var innerMarkup = doc(this).html();
    return anchor.replaceWith(innerMarkup);
  };
  return anchors.each(unwrapAnchor);
};
/**
 * Renders the `li` descendants of `node` as a plain-text bullet list.
 *
 * Each item becomes a line of the form "\n * <item text>", and a single
 * trailing newline is appended after the last item.
 *
 * @param {Function} doc - Wraps a raw element, called as `doc(element)`.
 * @param {Object} node - Wrapped node that is searched for `li` elements.
 * @returns {string} The bullet list text ("\n" when there are no items).
 */
ulToText = function (doc, node) {
  var bullets = [];
  node.find('li').each(function () {
    bullets.push('\n * ' + doc(this).text());
  });
  return bullets.join('') + '\n';
};
/**
 * Strips inline formatting: every `b`, `strong`, `i`, `br` and `sup`
 * element under `topNode` is replaced by its text content.
 *
 * @param {Function} doc - Wraps a raw element, called as `doc(element)`.
 * @param {Object} topNode - Wrapped node that is searched.
 * @returns {*} Whatever `each` returns for the matched set.
 */
replaceWithText = function (doc, topNode) {
  var inlineNodes = topNode.find('b, strong, i, br, sup');
  var flatten = function () {
    var target = doc(this);
    var plainText = doc(this).text();
    return target.replaceWith(plainText);
  };
  return inlineNodes.each(flatten);
};
/**
 * Normalises the whitespace of one chunk of extracted text.
 *
 * Leading and trailing whitespace is trimmed, and every run of spaces,
 * tabs or other non-line-break whitespace is collapsed to a single space.
 *
 * Previously the result of `replace` was thrown away (strings are
 * immutable), so no collapsing happened at all.
 *
 * @param {string} rawText - Text to clean.
 * @returns {string} The trimmed text with horizontal whitespace collapsed.
 */
cleanParagraphText = function (rawText) {
  // CR/LF are deliberately excluded from the collapse: convertToText splits
  // the cleaned text on /\r?\n/ to recover separate paragraphs and list
  // items, so flattening line breaks here would merge them into one line.
  return rawText.trim().replace(/[^\S\r\n]+/g, ' ');
};
/**
 * Serialises the direct children of `topNode` into plain text.
 *
 * Text nodes and `ul` lists are accumulated into a pending buffer; any
 * other child first flushes that buffer and is then emitted on its own.
 * Every emitted chunk is cleaned with `cleanParagraphText` and split on
 * line breaks. Finally each line is trimmed, lines that contain no letter
 * or number are dropped, and the rest are joined with blank lines.
 *
 * @param {Function} doc - Wraps a raw node, called as `doc(node)`.
 * @param {Object} topNode - Wrapped node whose `contents()` are rendered.
 * @returns {string} Lines joined by "\n\n" ("" when nothing survives).
 */
convertToText = function (doc, topNode) {
  var lines = [];
  var pending = '';

  // Emits the buffered text-node / list text, if any, and resets the buffer.
  var flushPending = function () {
    if (pending.length > 0) {
      lines = lines.concat(cleanParagraphText(pending).split(/\r?\n/));
      pending = '';
    }
  };

  topNode.contents().each(function () {
    var node = doc(this);
    var raw = node[0];
    if (raw.type === 'text') {
      pending += node.text();
      return true;
    }
    if (raw.name === 'ul') {
      pending += ulToText(doc, node);
      return true;
    }
    flushPending();
    // Insert a space after the first "word." that is glued to a capital.
    var blockText = cleanParagraphText(node.text()).replace(/(\w+\.)([A-Z]+)/, '$1 $2');
    lines = lines.concat(blockText.split(/\r?\n/));
    return true;
  });
  flushPending();

  var hasLetterOrNumber = XRegExp('[\\p{Number}\\p{Letter}]');
  var trimmedLines = _.map(lines, function (line) {
    return line.trim();
  });
  var keptLines = _.filter(trimmedLines, function (line) {
    return hasLetterOrNumber.test(line);
  });
  return keptLines.join('\n\n');
};
/**
 * Replaces every `br` element under `topNode` with two newline characters.
 *
 * @param {Function} doc - Wraps a raw element, called as `doc(element)`.
 * @param {Object} topNode - Wrapped node that is searched for `br` elements.
 * @returns {*} Whatever `each` returns for the matched set.
 */
addNewlineToBr = function (doc, topNode) {
  return topNode.find('br').each(function () {
    return doc(this).replaceWith('\n\n');
  });
};
/**
 * Removes every descendant of `topNode` that carries a `gravityScore`
 * attribute whose integer value is below 1.
 *
 * A missing or non-numeric attribute value counts as 0, so such nodes are
 * removed as well.
 *
 * @param {Function} doc - Wraps a raw element, called as `doc(element)`.
 * @param {Object} topNode - Wrapped node that is searched.
 * @returns {*} Whatever `each` returns for the matched set.
 */
removeNegativescoresNodes = function (doc, topNode) {
  var gravityItems = topNode.find('*[gravityScore]');
  return gravityItems.each(function () {
    var item = doc(this);
    // Always parse as decimal; without an explicit radix a value such as
    // "0x1F" would be read as hexadecimal.
    var score = parseInt(item.attr('gravityScore'), 10) || 0;
    if (score < 1) {
      return doc(item).remove();
    }
  });
};
/**
 * Drops descendants of `topNode` that look like non-content.
 *
 * An element is removed when its text has fewer than three stop words (as
 * counted by `stopwords(text, language)`) and it contains no `object` or
 * `embed` descendant. Otherwise it is removed only when its trimmed text
 * starts with "(" and ends with ")".
 *
 * @param {Function} doc - Wraps a raw element, called as `doc(element)`.
 * @param {Object} topNode - Wrapped node whose descendants are examined.
 * @param {string} language - Passed through to `stopwords`.
 * @returns {*} Whatever `each` returns for the matched set.
 */
removeFewwordsParagraphs = function (doc, topNode, language) {
  return topNode.find('*').each(function () {
    var el = doc(this);
    var tag = el[0].name;
    var text = el.text();
    var found = stopwords(text, language);

    // NOTE(review): '\\r' is a backslash followed by "r", not a carriage
    // return, so this exemption only matches a br whose text is literally
    // those two characters — confirm the intent.
    var isExemptBr = tag === 'br' && text === '\\r';

    if (!isExemptBr && found.stopwordCount < 3) {
      if (el.find('object').length === 0 && el.find('embed').length === 0) {
        return doc(el).remove();
      }
    }

    var trimmed = text.trim();
    if (trimmed[0] === '(' && trimmed[trimmed.length - 1] === ')') {
      return doc(el).remove();
    }
  });
};
}.call(this);