unfluff
Version:
A web page content extractor
141 lines (140 loc) • 5.24 kB
JavaScript
// Generated by CoffeeScript 2.0.0-beta7
void function () {
var cheerio, cleaner, extractor, getCleanedDoc, getParsedDoc, getTopNode, unfluff;
cheerio = require('cheerio');
extractor = require('./extractor');
cleaner = require('./cleaner');
module.exports = unfluff = function (html, language) {
var doc, lng, pageData, topNode;
doc = cheerio.load(html);
lng = language || extractor.lang(doc);
pageData = {
title: extractor.title(doc),
softTitle: extractor.softTitle(doc),
date: extractor.date(doc),
author: extractor.author(doc),
publisher: extractor.publisher(doc),
copyright: extractor.copyright(doc),
favicon: extractor.favicon(doc),
description: extractor.description(doc),
keywords: extractor.keywords(doc),
lang: lng,
canonicalLink: extractor.canonicalLink(doc),
tags: extractor.tags(doc),
image: extractor.image(doc)
};
cleaner(doc);
topNode = extractor.calculateBestNode(doc, lng);
pageData.videos = extractor.videos(doc, topNode);
pageData.links = extractor.links(doc, topNode, lng);
pageData.text = extractor.text(doc, topNode, lng);
return pageData;
};
/**
 * Creates a lazy extractor for an HTML string: nothing is parsed or extracted
 * until one of the returned methods is called.
 *
 * Each method computes its value on first call and caches it on the returned
 * object under `<name>_` (`language_` for `lang`). A `null`/`undefined` result
 * is not cached and is recomputed on the next call. The methods use `this`,
 * so they must be invoked on the returned object.
 *
 * @param {string} html - Markup passed to `getParsedDoc` / `getCleanedDoc`.
 * @param {string} [language] - When truthy, returned by `lang()` instead of
 *   calling `extractor.lang`.
 * @returns {Object} Object with the methods title, softTitle, date, copyright,
 *   author, publisher, favicon, description, keywords, lang, canonicalLink,
 *   tags, image, videos, text and links.
 */
unfluff.lazy = function (html, language) {
  // Memoised getter for a field read as extractor[field](parsedDoc).
  var fromParsedDoc = function (field) {
    var slot = field + '_';
    return function () {
      if (null == this[slot])
        this[slot] = extractor[field](getParsedDoc.call(this, html));
      return this[slot];
    };
  };

  // Memoised getter for a field read from the cleaned document and its top
  // node; `passLang` says whether the extractor also takes the language.
  var fromCleanedDoc = function (field, passLang) {
    var slot = field + '_';
    return function () {
      var doc, lng, topNode;
      if (null != this[slot])
        return this[slot];
      // Resolve the language BEFORE asking for the cleaned document. The eager
      // `unfluff` path detects it prior to running the cleaner; doing it in
      // the other order here would let detection run against a document the
      // cleaner has already touched, making the result depend on call order.
      lng = this.lang();
      doc = getCleanedDoc.call(this, html);
      topNode = getTopNode.call(this, doc, lng);
      this[slot] = passLang
        ? extractor[field](doc, topNode, lng)
        : extractor[field](doc, topNode);
      return this[slot];
    };
  };

  return {
    title: fromParsedDoc('title'),
    softTitle: fromParsedDoc('softTitle'),
    date: fromParsedDoc('date'),
    copyright: fromParsedDoc('copyright'),
    author: fromParsedDoc('author'),
    publisher: fromParsedDoc('publisher'),
    favicon: fromParsedDoc('favicon'),
    description: fromParsedDoc('description'),
    keywords: fromParsedDoc('keywords'),
    lang: function () {
      // `||` short-circuits, so the document is only parsed for detection
      // when no language was supplied by the caller.
      if (null == this.language_)
        this.language_ = language || extractor.lang(getParsedDoc.call(this, html));
      return this.language_;
    },
    canonicalLink: fromParsedDoc('canonicalLink'),
    tags: fromParsedDoc('tags'),
    image: fromParsedDoc('image'),
    videos: fromCleanedDoc('videos', false),
    text: fromCleanedDoc('text', true),
    links: fromCleanedDoc('links', true)
  };
};
/**
 * Returns the cheerio document for `html`, parsing it on first use.
 * The document is cached on the receiver as `doc_`, so this must be invoked
 * with `.call(holder, html)`.
 *
 * @param {string} html - Markup passed to `cheerio.load` on a cache miss.
 * @returns {*} The cached or newly loaded document.
 */
getParsedDoc = function (html) {
  if (this.doc_ == null) {
    this.doc_ = cheerio.load(html);
  }
  return this.doc_;
};
/**
 * Returns the best content node for `doc`, computing it on first use via
 * `extractor.calculateBestNode` and caching it on the receiver as `topNode_`.
 * Must be invoked with `.call(holder, doc, lng)`.
 *
 * @param {*} doc - Document forwarded to `extractor.calculateBestNode`.
 * @param {string} lng - Language forwarded to `extractor.calculateBestNode`.
 * @returns {*} The cached or newly computed node; a null/undefined result is
 *   not treated as cached and is recomputed on the next call.
 */
getTopNode = function (doc, lng) {
  if (this.topNode_ == null) {
    this.topNode_ = extractor.calculateBestNode(doc, lng);
  }
  return this.topNode_;
};
/**
 * Returns a cleaned cheerio document for `html`, building it on first use and
 * caching it on the receiver as `cleanedDoc_`. Must be invoked with
 * `.call(holder, html)`.
 *
 * The cleaner is given its own freshly parsed document rather than the one
 * cached in `doc_`. The eager `unfluff` path relies on `cleaner` modifying the
 * document it is handed, so cleaning the shared `doc_` would make the lazy
 * metadata getters return different values depending on whether a content
 * getter had already run. The trade-off is one extra parse of `html`.
 *
 * @param {string} html - Markup passed to `cheerio.load` on a cache miss.
 * @returns {*} The cached or newly cleaned document.
 */
getCleanedDoc = function (html) {
  var cleaned, doc;
  if (null != this.cleanedDoc_)
    return this.cleanedDoc_;
  // Private copy: keeps this.doc_ (used by the metadata getters) untouched.
  doc = cheerio.load(html);
  cleaned = cleaner(doc);
  // The eager path ignores cleaner's return value and keeps using the document
  // it passed in; fall back to that document if nothing is returned.
  this.cleanedDoc_ = null != cleaned ? cleaned : doc;
  return this.cleanedDoc_;
};
}.call(this);