unfluff
Version:
A web page content extractor
132 lines (105 loc) • 6.28 kB
text/coffeescript
# Tests for the `extractor` module. Each test loads a small inline HTML
# fixture with cheerio, calls one extractor function on the parsed document,
# and compares the result with the expected value.
#
# `suite`, `test`, `ok` and `eq` are not defined in this file; they are
# presumably globals provided by the test runner / shared test setup.
# `eq` is called as `eq actual, expected` throughout.
suite 'Extractor', ->
  extractor = require("../src/extractor")
  cheerio = require("cheerio")

  test 'exists', ->
    ok extractor

  # ---- title / softTitle ----

  test 'returns a blank title', ->
    doc = cheerio.load("<html><head><title></title></head></html>")
    title = extractor.title(doc)
    eq title, ""

  test 'returns a simple title', ->
    doc = cheerio.load("<html><head><title>Hello!</title></head></html>")
    title = extractor.title(doc)
    eq title, "Hello!"

  # The " - mysite" suffix is expected to be dropped from the title.
  test 'returns a simple title chunk', ->
    doc = cheerio.load("<html><head><title>This is my page - mysite</title></head></html>")
    title = extractor.title(doc)
    eq title, "This is my page"

  # softTitle is expected to keep the "University Budgets:" prefix and only
  # drop the part after the "|".
  test 'returns a soft title chunk without truncation', ->
    doc = cheerio.load("<html><head><title>University Budgets: Where Your Fees Go | Top Universities</title></head></html>")
    title = extractor.softTitle(doc)
    eq title, "University Budgets: Where Your Fees Go"

  test 'prefers the meta tag title', ->
    doc = cheerio.load("<html><head><title>This is my page - mysite</title><meta property=\"og:title\" content=\"Open graph title\"></head></html>")
    title = extractor.title(doc)
    eq title, "Open graph title"

  test 'falls back to title if empty meta tag', ->
    doc = cheerio.load("<html><head><title>This is my page - mysite</title><meta property=\"og:title\" content=\"\"></head></html>")
    title = extractor.title(doc)
    eq title, "This is my page"

  # Here the site name comes first ("coolsite.com: ") and is expected to be
  # dropped.
  test 'returns another simple title chunk', ->
    doc = cheerio.load("<html><head><title>coolsite.com: This is my page</title></head></html>")
    title = extractor.title(doc)
    eq title, "This is my page"

  test 'returns a title chunk without �', ->
    doc = cheerio.load("<html><head><title>coolsite.com: � This� is my page</title></head></html>")
    title = extractor.title(doc)
    eq title, "This is my page"

  # The fixture has a second <title> inside an <svg>; only the first one
  # (in <head>) is expected in the result.
  test 'returns the first title;', ->
    doc = cheerio.load("<html><head><title>This is my page</title></head><svg xmlns=\"http://www.w3.org/2000/svg\"><title>svg title</title></svg></html>")
    title = extractor.title(doc)
    eq title, "This is my page"

  # ---- favicon ----

  test 'handles missing favicons', ->
    doc = cheerio.load("<html><head><title></title></head></html>")
    favicon = extractor.favicon(doc)
    # Argument order is (actual, expected), matching every other assertion
    # in this suite.
    eq favicon, undefined

  # ---- date ----

  test 'returns the article published meta date', ->
    doc = cheerio.load("<html><head><meta property=\"article:published_time\" content=\"2014-10-15T00:01:03+00:00\" /></head></html>")
    date = extractor.date(doc)
    eq date, "2014-10-15T00:01:03+00:00"

  test 'returns the article dublin core meta date', ->
    doc = cheerio.load("<html><head><meta name=\"DC.date.issued\" content=\"2014-10-15T00:01:03+00:00\" /></head></html>")
    date = extractor.date(doc)
    eq date, "2014-10-15T00:01:03+00:00"

  test 'returns the date in the <time> element', ->
    doc = cheerio.load("<html><head></head><body><time>24 May, 2010</time></body></html>")
    date = extractor.date(doc)
    eq date, "24 May, 2010"

  # When a datetime attribute is present, its value is expected instead of
  # the element's text.
  test 'returns the date in the <time> element datetime attribute', ->
    doc = cheerio.load("<html><head></head><body><time datetime=\"2010-05-24T13:47:52+0000\">24 May, 2010</time></body></html>")
    date = extractor.date(doc)
    eq date, "2010-05-24T13:47:52+0000"

  # The literal string "null" in the meta content is expected to yield null.
  test 'returns nothing if date eq "null"', ->
    doc = cheerio.load("<html><head><meta property=\"article:published_time\" content=\"null\" /></head></html>")
    date = extractor.date(doc)
    eq date, null

  # ---- copyright ----

  # Expected result omits the "©" sign, the leading HTML comment and the
  # trailing ", All Rights Reserved." text.
  test 'returns the copyright line element', ->
    doc = cheerio.load("<html><head></head><body><div>Some stuff</div><ul><li class='copyright'><!-- // some garbage -->© 2016 The World Bank Group, All Rights Reserved.</li></ul></body></html>")
    copyright = extractor.copyright(doc)
    eq copyright, "2016 The World Bank Group"

  # No element carries a 'copyright' class here; the notice only appears in
  # the body text.
  test 'returns the copyright found in the text', ->
    doc = cheerio.load("<html><head></head><body><div>Some stuff</div><ul>© 2016 The World Bank Group, All Rights Reserved\nSome garbage following</li></ul></body></html>")
    copyright = extractor.copyright(doc)
    eq copyright, "2016 The World Bank Group"

  test 'returns nothing if no copyright in the text', ->
    doc = cheerio.load("<html><head></head><body></body></html>")
    copyright = extractor.copyright(doc)
    eq copyright, null

  # ---- author ----
  # Author results are arrays; they are compared via their JSON
  # serialisation.

  test 'returns the article published meta author', ->
    doc = cheerio.load("<html><head><meta property=\"article:author\" content=\"Joe Bloggs\" /></head></html>")
    author = extractor.author(doc)
    eq JSON.stringify(author), JSON.stringify(["Joe Bloggs"])

  test 'returns the meta author', ->
    doc = cheerio.load("<html><head><meta property=\"article:author\" content=\"Sarah Smith\" /><meta name=\"author\" content=\"Joe Bloggs\" /></head></html>")
    author = extractor.author(doc)
    eq JSON.stringify(author), JSON.stringify(["Sarah Smith", "Joe Bloggs"])

  test 'returns the named author in the text as fallback', ->
    doc = cheerio.load("<html><head></head><body><span class=\"author\"><a href=\"/author/gary-trust-6318\" class=\"article__author-link\">Gary Trust</a></span></body></html>")
    author = extractor.author(doc)
    eq JSON.stringify(author), JSON.stringify(["Gary Trust"])

  test 'returns the meta author but ignore "null" value', ->
    doc = cheerio.load("<html><head><meta property=\"article:author\" content=\"null\" /><meta name=\"author\" content=\"Joe Bloggs\" /></head></html>")
    author = extractor.author(doc)
    eq JSON.stringify(author), JSON.stringify(["Joe Bloggs"])

  # ---- publisher ----

  test 'returns the meta publisher', ->
    doc = cheerio.load("<html><head><meta property=\"og:site_name\" content=\"Polygon\" /><meta name=\"author\" content=\"Griffin McElroy\" /></head></html>")
    publisher = extractor.publisher(doc)
    eq publisher, "Polygon"

  test 'returns nothing if publisher eq "null"', ->
    doc = cheerio.load("<html><head><meta property=\"og:site_name\" content=\"null\" /></head></html>")
    publisher = extractor.publisher(doc)
    eq publisher, null

  # ---- image ----

  test 'returns nothing if image eq "null"', ->
    doc = cheerio.load("<html><head><meta property=\"og:image\" content=\"null\" /></head></html>")
    image = extractor.image(doc)
    eq image, null