unfluff
Version:
A web page content extractor
508 lines (507 loc) • 20.3 kB
JavaScript
// Generated by CoffeeScript 2.0.0-beta7
void function () {
var _, addSiblings, biggestTitleChunk, cleanNull, cleanText, cleanTitle, formatter, getObjectTag, getScore, getSiblingsContent, getSiblingsScore, getVideoAttrs, isBoostable, isHighlinkDensity, isNodescoreThresholdMet, isTableAndNoParaExist, postCleanup, rawTitle, stopwords, updateNodeCount, updateScore;
_ = require('lodash');
stopwords = require('./stopwords');
formatter = require('./formatter');
module.exports = {
date: function (doc) {
var cache$, cache$1, cache$2, cache$3, cache$4, dateCandidates;
dateCandidates = doc("meta[property='article:published_time'], meta[itemprop*='datePublished'], meta[name='dcterms.modified'], meta[name='dcterms.date'], meta[name='DC.date.issued'], meta[name='dc.date.issued'], meta[name='dc.date.modified'], meta[name='dc.date.created'], meta[name='DC.date'], meta[name='DC.Date'], meta[name='dc.date'], meta[name='date'], time[itemprop*='pubDate'], time[itemprop*='pubdate'], span[itemprop*='datePublished'], span[property*='datePublished'], p[itemprop*='datePublished'], p[property*='datePublished'], div[itemprop*='datePublished'], div[property*='datePublished'], li[itemprop*='datePublished'], li[property*='datePublished'], time, span[class*='date'], p[class*='date'], div[class*='date']");
return (null != (cache$ = cleanNull(null != dateCandidates && null != (cache$1 = dateCandidates.first()) ? cache$1.attr('content') : void 0)) ? cache$.trim() : void 0) || (null != (cache$2 = cleanNull(null != dateCandidates && null != (cache$3 = dateCandidates.first()) ? cache$3.attr('datetime') : void 0)) ? cache$2.trim() : void 0) || cleanText(null != dateCandidates && null != (cache$4 = dateCandidates.first()) ? cache$4.text() : void 0) || null;
},
copyright: function (doc) {
var cache$, copyright, copyrightCandidates, text;
copyrightCandidates = doc("p[class*='copyright'], div[class*='copyright'], span[class*='copyright'], li[class*='copyright'], p[id*='copyright'], div[id*='copyright'], span[id*='copyright'], li[id*='copyright']");
text = null != copyrightCandidates && null != (cache$ = copyrightCandidates.first()) ? cache$.text() : void 0;
if (!text) {
text = doc('body').text().replace(/\s*[\r\n]+\s*/g, '. ');
if (!(text.indexOf('\xa9') > 0))
return null;
}
copyright = text.replace(/.*?©(\s*copyright)?([^,;:.|\r\n]+).*/gi, '$2').trim();
return cleanText(copyright);
},
author: function (doc) {
var authorCandidates, authorList, cache$, cache$1, cache$2, cache$3, cache$4, cache$5, fallbackAuthor;
authorCandidates = doc("meta[property='article:author'], meta[property='og:article:author'], meta[name='author'], meta[name='dcterms.creator'], meta[name='DC.creator'], meta[name='DC.Creator'], meta[name='dc.creator'], meta[name='creator']");
authorList = [];
authorCandidates.each(function () {
var author, cache$, cache$1;
author = null != (cache$ = cleanNull(null != (cache$1 = doc(this)) ? cache$1.attr('content') : void 0)) ? cache$.trim() : void 0;
if (author)
return authorList.push(author);
});
if (authorList.length === 0) {
fallbackAuthor = (null != (cache$ = doc("span[class*='author']").first()) ? cache$.text() : void 0) || (null != (cache$1 = doc("p[class*='author']").first()) ? cache$1.text() : void 0) || (null != (cache$2 = doc("div[class*='author']").first()) ? cache$2.text() : void 0) || (null != (cache$3 = doc("span[class*='byline']").first()) ? cache$3.text() : void 0) || (null != (cache$4 = doc("p[class*='byline']").first()) ? cache$4.text() : void 0) || (null != (cache$5 = doc("div[class*='byline']").first()) ? cache$5.text() : void 0);
if (fallbackAuthor)
authorList.push(cleanText(fallbackAuthor));
}
return authorList;
},
publisher: function (doc) {
var cache$, cache$1, publisherCandidates;
publisherCandidates = doc("meta[property='og:site_name'], meta[name='dc.publisher'], meta[name='DC.publisher'], meta[name='DC.Publisher']");
return (null != (cache$ = cleanNull(null != publisherCandidates && null != (cache$1 = publisherCandidates.first()) ? cache$1.attr('content') : void 0)) ? cache$.trim() : void 0) || null;
},
title: function (doc) {
var titleText;
titleText = rawTitle(doc);
return cleanTitle(titleText, [
'|',
' - ',
'\xbb',
':'
]);
},
softTitle: function (doc) {
var titleText;
titleText = rawTitle(doc);
return cleanTitle(titleText, [
'|',
' - ',
'\xbb'
]);
},
text: function (doc, topNode, lang) {
if (topNode) {
topNode = postCleanup(doc, topNode, lang);
return formatter(doc, topNode, lang);
} else {
return '';
}
},
image: function (doc) {
var images;
images = doc("meta[property='og:image'], meta[property='og:image:url'], meta[itemprop=image], meta[name='twitter:image:src'], meta[name='twitter:image'], meta[name='twitter:image0']");
if (images.length > 0 && cleanNull(images.first().attr('content')))
return cleanNull(images.first().attr('content'));
return null;
},
links: function (doc, topNode, lang) {
var gatherLinks, links;
links = [];
gatherLinks = function (doc, topNode) {
var nodes;
nodes = topNode.find('a');
return nodes.each(function () {
var href, text;
href = doc(this).attr('href');
text = doc(this).html();
if (href && text)
return links.push({
text: text,
href: href
});
});
};
if (topNode) {
topNode = postCleanup(doc, topNode, lang);
gatherLinks(doc, topNode);
}
return links;
},
videos: function (doc, topNode) {
var candidates, results, urls, videoList;
videoList = [];
candidates = doc(topNode).find('iframe, embed, object, video');
candidates.each(function () {
var candidate, tag;
candidate = doc(this);
tag = candidate[0].name;
if (tag === 'embed') {
if (candidate.parent() && candidate.parent()[0].name === 'object') {
return videoList.push(getObjectTag(doc, candidate));
} else {
return videoList.push(getVideoAttrs(doc, candidate));
}
} else if (tag === 'object') {
return videoList.push(getObjectTag(doc, candidate));
} else if (tag === 'iframe' || tag === 'video') {
return videoList.push(getVideoAttrs(doc, candidate));
}
});
urls = [];
results = [];
_.each(videoList, function (vid) {
if (vid && vid.height && vid.width && urls.indexOf(vid.src) === -1) {
results.push(vid);
return urls.push(vid.src);
}
});
return results;
},
favicon: function (doc) {
var tag;
tag = doc('link').filter(function () {
var cache$;
return (null != (cache$ = doc(this).attr('rel')) ? cache$.toLowerCase() : void 0) === 'shortcut icon';
});
return tag.attr('href');
},
lang: function (doc) {
var cache$, l, tag, value;
l = null != (cache$ = doc('html')) ? cache$.attr('lang') : void 0;
if (!l) {
tag = doc('meta[name=lang]') || doc('meta[http-equiv=content-language]');
l = null != tag ? tag.attr('content') : void 0;
}
if (l) {
value = l.slice(0, +1 + 1 || 9e9);
if (/^[A-Za-z]{2}$/.test(value))
return value.toLowerCase();
}
return null;
},
description: function (doc) {
var cache$, cache$1, tag;
tag = doc("meta[name=description], meta[property='og:description']");
if (null != (cache$ = cleanNull(null != tag && null != (cache$1 = tag.first()) ? cache$1.attr('content') : void 0)))
return cache$.trim();
},
keywords: function (doc) {
var tag;
tag = doc('meta[name=keywords]');
return cleanNull(null != tag ? tag.attr('content') : void 0);
},
canonicalLink: function (doc) {
var tag;
tag = doc('link[rel=canonical]');
return cleanNull(null != tag ? tag.attr('href') : void 0);
},
tags: function (doc) {
var elements, tags;
elements = doc("a[rel='tag']");
if (elements.length === 0) {
elements = doc("a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']");
if (elements.length === 0)
return [];
}
tags = [];
elements.each(function () {
var el, tag;
el = doc(this);
tag = el.text().trim();
tag.replace(/[\s\t\n]+/g, '');
if (tag && tag.length > 0)
return tags.push(tag);
});
return _.uniq(tags);
},
// Picks the node most likely to contain the main article text.
// Every p/pre/td whose text has more than two stopwords and is not link-heavy
// contributes a score (its stopword count plus a position-dependent boost) to
// its parent, and half of that score to its grandparent, through the
// gravityScore attribute written by updateScore(). The recorded ancestor with
// the highest score is returned wrapped in doc(). When no candidate
// qualifies, topNode stays null and doc(null) is returned.
calculateBestNode: function (doc, lang) {
var bottomNegativescoreNodes, cnt, i, negativeScoring, nodesNumber, nodesToCheck, nodesWithText, parentNodes, startingBoost, topNode, topNodeScore;
topNode = null;
nodesToCheck = doc('p, pre, td');
startingBoost = 1;
cnt = 0;
i = 0;
parentNodes = [];
nodesWithText = [];
// Pass 1: keep only candidates with > 2 stopwords and low link density.
nodesToCheck.each(function () {
var highLinkDensity, node, textNode, wordStats;
node = doc(this);
textNode = node.text();
wordStats = stopwords(textNode, lang);
highLinkDensity = isHighlinkDensity(doc, node);
if (wordStats.stopwordCount > 2 && !highLinkDensity)
return nodesWithText.push(node);
});
nodesNumber = nodesWithText.length;
// negativeScoring is never modified below, so it is always 0.
negativeScoring = 0;
// Size of the trailing quarter of candidates that may be penalised.
bottomNegativescoreNodes = nodesNumber * .25;
// Pass 2: score the parent and grandparent of each kept candidate.
_.each(nodesWithText, function (node) {
var booster, boostScore, negscore, parentNode, parentParentNode, textNode, upscore, wordStats;
boostScore = 0;
// Boostable nodes get 50, 25, 16.6... (50 / startingBoost) in turn.
// `cnt >= 0` is always true: cnt starts at 0 and is only incremented.
if (isBoostable(doc, node, lang) === true)
if (cnt >= 0) {
boostScore = 1 / startingBoost * 50;
startingBoost += 1;
}
// With more than 15 candidates, those in the last quarter get a negative
// boost of -(booster^2), overriding any boost set above; once its magnitude
// exceeds 40 the boost is replaced by +5 instead.
if (nodesNumber > 15)
if (nodesNumber - i <= bottomNegativescoreNodes) {
booster = bottomNegativescoreNodes - (nodesNumber - i);
boostScore = -1 * Math.pow(booster, 2);
negscore = Math.abs(boostScore) + negativeScoring;
if (negscore > 40)
boostScore = 5;
}
textNode = node.text();
wordStats = stopwords(textNode, lang);
upscore = Math.floor(wordStats.stopwordCount + boostScore);
parentNode = node.parent();
updateScore(parentNode, upscore);
updateNodeCount(parentNode, 1);
// parentNodes stores raw elements (index 0 of the selection), de-duplicated.
if (parentNodes.indexOf(parentNode[0]) === -1)
parentNodes.push(parentNode[0]);
parentParentNode = parentNode.parent();
// NOTE(review): this truthiness test passes for any object, including a
// selection that matched nothing - confirm whether .parent() can ever
// return a falsy value here.
if (parentParentNode) {
updateNodeCount(parentParentNode, 1);
// The grandparent receives half of the score (not rounded here).
updateScore(parentParentNode, upscore / 2);
if (parentNodes.indexOf(parentParentNode[0]) === -1)
parentNodes.push(parentParentNode[0]);
}
cnt += 1;
return i += 1;
});
// Pass 3: choose the recorded ancestor with the highest gravityScore.
topNodeScore = 0;
_.each(parentNodes, function (e) {
var score;
score = getScore(doc(e));
if (score > topNodeScore) {
topNode = e;
topNodeScore = score;
}
// Ensures a result is chosen even when no score exceeds 0: the first
// recorded ancestor becomes the default.
if (topNode === null)
return topNode = e;
});
return doc(topNode);
}
};
// Builds a {src, height, width} record from the same-named attributes of
// `node` (wrapped with doc() first).
getVideoAttrs = function (doc, node) {
  var wrapped = doc(node);
  var src = wrapped.attr('src');
  var height = wrapped.attr('height');
  var width = wrapped.attr('width');
  return {
    src: src,
    height: height,
    width: width
  };
};
// Describes a video embedded via <object>: the src comes from the `value` of
// its `param[name=movie]` descendant, height/width from the node itself.
// Returns null when no such param exists.
getObjectTag = function (doc, node) {
  var attrs, movieParam, movieUrl;
  movieParam = node.find('param[name=movie]');
  if (movieParam.length > 0) {
    movieUrl = movieParam.attr('value');
    attrs = getVideoAttrs(doc, node);
    attrs.src = movieUrl;
    return attrs;
  }
  return null;
};
// Splits `title` on `splitter` and returns the longest piece (the earliest one
// on ties). Pieces are returned untrimmed.
biggestTitleChunk = function (title, splitter) {
  var best, k, pieces;
  pieces = title.split(splitter);
  best = 0;
  // Start the running maximum at length 0 so that an all-empty split still
  // resolves to index 0.
  for (k = 0; k < pieces.length; k++) {
    if (pieces[k].length > (k === best ? 0 : pieces[best].length))
      best = k;
  }
  return pieces[best];
};
// Decides whether `node` deserves a score boost: walks the selection returned
// by node.prevAll(), looking only at <p> elements, and answers true as soon
// as one has more than 5 stopwords. Gives up (false) once 3 paragraphs have
// been examined without success.
isBoostable = function (doc, node, lang) {
  var MAX_PARAGRAPHS = 3;
  var MIN_STOPWORDS = 5;
  var paragraphsSeen = 0;
  var result = false;
  node.prevAll().each(function () {
    var sibling = doc(this);
    if (sibling[0].name !== 'p')
      return;
    if (paragraphsSeen >= MAX_PARAGRAPHS) {
      result = false;
      return false; // returning false stops the .each() iteration
    }
    if (stopwords(sibling.text(), lang).stopwordCount > MIN_STOPWORDS) {
      result = true;
      return false;
    }
    paragraphsSeen += 1;
  });
  return result;
};
// Pulls qualifying paragraph content from the elements returned by
// topNode.prevAll() into `topNode`: each item produced by
// getSiblingsContent() is wrapped in <p>...</p> and prepended.
// Returns `topNode`.
addSiblings = function (doc, topNode, lang) {
  var baseline = getSiblingsScore(doc, topNode, lang);
  topNode.prevAll().each(function () {
    var sibling = doc(this);
    var pieces = getSiblingsContent(doc, lang, sibling, baseline);
    _.each(pieces, function (piece) {
      topNode.prepend('<p>' + piece + '</p>');
    });
  });
  return topNode;
};
// Returns the content of one sibling that is worth keeping.
// A non-empty <p> sibling is returned as-is (a one-element array holding the
// selection itself). Otherwise the texts of its descendant <p> elements are
// returned, keeping those whose stopword count exceeds 30% of the baseline
// score and that are not link-heavy.
getSiblingsContent = function (doc, lang, currentSibling, baselinescoreSiblingsPara) {
  var BASELINE_FACTOR = .3;
  var kept, paragraphs;
  if (currentSibling[0].name === 'p' && currentSibling.text().length > 0)
    return [currentSibling];
  paragraphs = currentSibling.find('p');
  if (paragraphs === null)
    return null;
  kept = [];
  paragraphs.each(function () {
    var linkHeavy, para, stopwordCount, threshold, txt;
    para = doc(this);
    txt = para.text();
    if (!(txt.length > 0))
      return;
    stopwordCount = stopwords(txt, lang).stopwordCount;
    linkHeavy = isHighlinkDensity(doc, para);
    threshold = baselinescoreSiblingsPara * BASELINE_FACTOR;
    if (threshold < stopwordCount && !linkHeavy)
      kept.push(txt);
  });
  return kept;
};
// Computes the baseline paragraph score of `topNode`: the mean stopword count
// over its descendant <p> elements that have more than two stopwords and are
// not link-heavy. Returns 1e5 when no paragraph qualifies.
getSiblingsScore = function (doc, topNode, lang) {
  var qualifying = 0;
  var total = 0;
  topNode.find('p').each(function () {
    var para = doc(this);
    var stats = stopwords(para.text(), lang);
    var linkHeavy = isHighlinkDensity(doc, para);
    if (stats.stopwordCount > 2 && !linkHeavy) {
      qualifying += 1;
      total += stats.stopwordCount;
    }
  });
  return qualifying > 0 ? total / qualifying : 1e5;
};
// Adds `addToScore` to the integer stored in the node's `gravityScore`
// attribute (missing, empty or unparsable counts as 0) and writes the sum
// back. Returns whatever node.attr(name, value) returns.
updateScore = function (node, addToScore) {
  var current, stored;
  stored = node.attr('gravityScore');
  // Explicit radix: the stored value is decimal and must never be subject to
  // prefix-based radix guessing.
  current = stored ? parseInt(stored, 10) : 0;
  // A non-numeric attribute would otherwise poison the score with NaN for
  // every later update.
  if (isNaN(current))
    current = 0;
  return node.attr('gravityScore', current + addToScore);
};
// Adds `addToCount` to the integer stored in the node's `gravityNodes`
// attribute (missing, empty or unparsable counts as 0) and writes the sum
// back. Returns whatever node.attr(name, value) returns.
updateNodeCount = function (node, addToCount) {
  var current, stored;
  stored = node.attr('gravityNodes');
  // Explicit radix: the stored value is always a decimal count.
  current = stored ? parseInt(stored, 10) : 0;
  // Do not let a non-numeric attribute turn the counter into NaN.
  if (isNaN(current))
    current = 0;
  return node.attr('gravityNodes', current + addToCount);
};
// Heuristic for link-heavy nodes (menus, link lists).
// score = (words inside <a> descendants / words in the node) * number of <a>
// descendants, where "words" are the pieces obtained by splitting on single
// spaces. Returns true when score >= 1, and false when there are no links.
isHighlinkDensity = function (doc, node) {
  var anchorTexts, anchors, linkWordCount, wordCount;
  anchors = node.find('a');
  if (!(anchors.length > 0))
    return false;
  wordCount = node.text().split(' ').length;
  anchorTexts = [];
  anchors.each(function () {
    anchorTexts.push(doc(this).text());
  });
  linkWordCount = anchorTexts.join(' ').split(' ').length;
  return linkWordCount / wordCount * anchors.length >= 1;
};
// Reads the node's `gravityScore` attribute as a decimal integer.
// Returns 0 when the attribute is missing, empty or not numeric.
getScore = function (node) {
  var parsed, stored;
  stored = node.attr('gravityScore');
  if (!stored)
    return 0;
  // Explicit radix keeps the parse strictly decimal; NaN is mapped to 0 so
  // that callers comparing scores never have to deal with NaN.
  parsed = parseInt(stored, 10);
  return isNaN(parsed) ? 0 : parsed;
};
// Side effect first: removes every descendant <p> of `e` whose text is
// shorter than 25 characters. Then returns true when `e` has no <p>
// descendants left and is not itself a td, ul or ol; false otherwise.
isTableAndNoParaExist = function (doc, e) {
  var tag;
  e.find('p').each(function () {
    var para = doc(this);
    if (para.text().length < 25)
      doc(para).remove();
  });
  if (e.find('p').length !== 0)
    return false;
  tag = e[0].name;
  return !(tag === 'td' || tag === 'ul' || tag === 'ol');
};
// Returns false only when the score of `e` is below 8% of the score of `node`
// and `e` is not a td, ul, ol or blockquote; true in every other case.
// The `doc` parameter is not used.
isNodescoreThresholdMet = function (doc, node, e) {
  var candidateScore, referenceScore, tag;
  referenceScore = getScore(node);
  candidateScore = getScore(e);
  if (!(candidateScore < referenceScore * .08))
    return true;
  // Below threshold: only the exempt tags are still accepted.
  tag = e[0].name;
  return tag === 'td' || tag === 'ul' || tag === 'ol' || tag === 'blockquote';
};
// Finalises the chosen node: merges in sibling content via addSiblings(),
// then removes every direct child other than <p>/<a> that is link-heavy, has
// no usable paragraphs, or scores too low. Returns the node that
// addSiblings() returned.
postCleanup = function (doc, targetNode, lang) {
  var root = addSiblings(doc, targetNode, lang);
  root.children().each(function () {
    var child, shouldDrop, tag;
    child = doc(this);
    tag = child[0].name;
    if (tag === 'p' || tag === 'a')
      return;
    // Order matters: the checks short-circuit, and isTableAndNoParaExist()
    // removes short paragraphs as a side effect.
    shouldDrop = isHighlinkDensity(doc, child) || isTableAndNoParaExist(doc, child) || !isNodescoreThresholdMet(doc, root, child);
    if (shouldDrop)
      doc(child).remove();
  });
  return root;
};
// Maps the exact string "null" to ''. Other strings pass through unchanged;
// null/undefined yield undefined.
cleanNull = function (text) {
  if (text == null)
    return void 0;
  return text.replace(/^null$/g, '');
};
// Normalises a text fragment: CR/LF/TAB become spaces, whitespace runs
// collapse to one space, single-line HTML comments and the character matched
// by the last pattern are stripped, and the result is trimmed.
// null/undefined yield undefined.
cleanText = function (text) {
  var out;
  if (text == null)
    return void 0;
  out = text.replace(/[\r\n\t]/g, ' ');
  out = out.replace(/\s\s+/g, ' ');
  out = out.replace(/<!--.+?-->/g, '');
  out = out.replace(/�/g, '');
  return out.trim();
};
// Reduces `title` to its largest chunk using the FIRST entry of `delimiters`
// that occurs in it (later delimiters are ignored), then runs cleanText().
// A falsy title is treated as ''.
cleanTitle = function (title, delimiters) {
  var count, k, result;
  result = title || '';
  count = delimiters ? delimiters.length : 0;
  for (k = 0; k < count; k++) {
    if (result.indexOf(delimiters[k]) >= 0) {
      result = biggestTitleChunk(result, delimiters[k]);
      break; // only one delimiter is ever applied
    }
  }
  return cleanText(result);
};
// Returns the first non-blank, trimmed title candidate, in priority order:
// og:title meta content, first h1 whose class contains "title", <title>,
// first <h1>, first <h2>. Returns '' when every candidate is blank.
rawTitle = function (doc) {
  var candidate, candidates, firstMatch, k, ogTitle, textOfFirst;

  // First element matching `selector`, or undefined.
  firstMatch = function (selector) {
    var matches = doc(selector);
    var head = null != matches ? matches.first() : void 0;
    return null != head ? head : void 0;
  };
  // Text of the first element matching `selector`, or undefined.
  textOfFirst = function (selector) {
    var el = firstMatch(selector);
    return null != el ? el.text() : void 0;
  };

  // All candidates are read up front, as in the original array literal.
  ogTitle = firstMatch("meta[property='og:title']");
  candidates = [
    null != ogTitle ? ogTitle.attr('content') : void 0,
    textOfFirst("h1[class*='title']"),
    textOfFirst('title'),
    textOfFirst('h1'),
    textOfFirst('h2')
  ];
  for (k = 0; k < candidates.length; k++) {
    candidate = candidates[k];
    if (candidate && candidate.trim())
      return candidate.trim();
  }
  return '';
};
}.call(this);