unfluff
Version:
A web page content extractor
123 lines (97 loc) • 3.42 kB
text/coffeescript
cheerio = require("cheerio")
extractor = require("./extractor")
cleaner = require("./cleaner")
module.exports = unfluff = (html, language) ->
  # Eagerly extract every supported property from `html` and return them
  # together in a single plain object.
  #
  # html     - markup passed to cheerio.load
  # language - optional language code; when falsy the language is taken
  #            from extractor.lang
  $ = cheerio.load(html)
  pageLang = language or extractor.lang($)

  # Metadata fields are read from the parsed document before the cleaner runs.
  result =
    title: extractor.title($)
    softTitle: extractor.softTitle($)
    date: extractor.date($)
    author: extractor.author($)
    publisher: extractor.publisher($)
    copyright: extractor.copyright($)
    favicon: extractor.favicon($)
    description: extractor.description($)
    keywords: extractor.keywords($)
    lang: pageLang
    canonicalLink: extractor.canonicalLink($)
    tags: extractor.tags($)
    image: extractor.image($)

  # Run the cleaner over the document; its return value is not used here,
  # the same `$` handle is passed on to the steps below.
  cleaner($)

  # Pick the node that the extractor scores as holding the main content.
  bestNode = extractor.calculateBestNode($, pageLang)

  # Content fields are derived from the cleaned document and that node.
  result.videos = extractor.videos($, bestNode)
  result.links = extractor.links($, bestNode, pageLang)
  result.text = extractor.text($, bestNode, pageLang)

  result
# Allow access to document properties with lazy evaluation
unfluff.lazy = (html, language) ->
  # Returns an object of accessor functions. Each accessor computes its
  # property on first call and caches the result on the returned object in a
  # field named after the property with a trailing underscore (`@title_`,
  # `@text_`, ...), so only the properties actually requested are extracted.
  #
  # html     - markup handed to cheerio the first time any accessor runs
  # language - optional language code; when truthy it is used by lang()
  #            instead of extractor.lang(doc)
  #
  # Accessors must be called as methods of the returned object
  # (`data.title()`), because the caches live on `this`.
  #
  # title() through image() read from the parsed document. videos(), text()
  # and links() read from the cleaned document and its best node.
  title: () ->
    doc = getParsedDoc.call(this, html)
    @title_ ?= extractor.title(doc)

  softTitle: () ->
    doc = getParsedDoc.call(this, html)
    @softTitle_ ?= extractor.softTitle(doc)

  date: () ->
    doc = getParsedDoc.call(this, html)
    @date_ ?= extractor.date(doc)

  copyright: () ->
    doc = getParsedDoc.call(this, html)
    @copyright_ ?= extractor.copyright(doc)

  author: () ->
    doc = getParsedDoc.call(this, html)
    @author_ ?= extractor.author(doc)

  publisher: () ->
    doc = getParsedDoc.call(this, html)
    @publisher_ ?= extractor.publisher(doc)

  favicon: () ->
    doc = getParsedDoc.call(this, html)
    @favicon_ ?= extractor.favicon(doc)

  description: () ->
    doc = getParsedDoc.call(this, html)
    @description_ ?= extractor.description(doc)

  keywords: () ->
    doc = getParsedDoc.call(this, html)
    @keywords_ ?= extractor.keywords(doc)

  lang: () ->
    doc = getParsedDoc.call(this, html)
    # An explicitly supplied language takes precedence over detection.
    @lang_ ?= language or extractor.lang(doc)

  canonicalLink: () ->
    doc = getParsedDoc.call(this, html)
    @canonicalLink_ ?= extractor.canonicalLink(doc)

  tags: () ->
    doc = getParsedDoc.call(this, html)
    @tags_ ?= extractor.tags(doc)

  image: () ->
    doc = getParsedDoc.call(this, html)
    @image_ ?= extractor.image(doc)

  videos: () ->
    # Return early so a cached value skips cleaning and node scoring.
    return @videos_ if @videos_?
    doc = getCleanedDoc.call(this, html)
    topNode = getTopNode.call(this, doc, this.lang())
    @videos_ = extractor.videos(doc, topNode)

  text: () ->
    return @text_ if @text_?
    doc = getCleanedDoc.call(this, html)
    topNode = getTopNode.call(this, doc, this.lang())
    @text_ = extractor.text(doc, topNode, this.lang())

  links: () ->
    return @links_ if @links_?
    doc = getCleanedDoc.call(this, html)
    topNode = getTopNode.call(this, doc, this.lang())
    @links_ = extractor.links(doc, topNode, this.lang())
# Load the doc in cheerio and cache it
getParsedDoc = (html) ->
  # Must be invoked with `this` bound to the object that owns the cache
  # (callers use `getParsedDoc.call(this, html)`). Parses `html` with cheerio
  # on the first call and stores the result in `@doc_`; later calls return
  # the stored document without parsing again.
  @doc_ ?= cheerio.load(html)
# Cached version of calculateBestNode
getTopNode = (doc, lng) ->
  # Must be invoked with `this` bound to the object that owns the cache
  # (callers use `getTopNode.call(this, doc, lng)`). Computes the best node
  # once and stores it in `@topNode_`; once a non-null node is cached, `doc`
  # and `lng` are ignored on later calls.
  @topNode_ ?= extractor.calculateBestNode(doc, lng)
# Cached version of the cleaned doc
getCleanedDoc = (html) ->
  # Must be invoked with `this` bound to the object that owns the cache
  # (callers use `getCleanedDoc.call(this, html)`). Returns the cached
  # cleaned document when one exists; otherwise runs the cleaner over the
  # parsed document and caches the result in `@cleanedDoc_`.
  return @cleanedDoc_ if @cleanedDoc_?
  doc = getParsedDoc.call(this, html)
  # NOTE(review): this caches the cleaner's return value, so it assumes
  # cleaner(doc) returns the document handle - confirm against ./cleaner.
  @cleanedDoc_ = cleaner(doc)