unfluff
Version:
A web page content extractor
207 lines (154 loc) • 8.18 kB
text/coffeescript
suite 'Unfluff', ->
_ = require('lodash')
extractor = require("../src/unfluff")
# Normalises extracted text so it can be compared against a fixture's
# expected text.
#
# text           - the text produced by the extractor
# origTextLength - length of the (already cleaned) expected text
#
# Each blank-line separator ("\n\n") becomes a single space, each pair of
# spaces is collapsed to one, and the result is truncated to at most
# origTextLength characters so only the leading portion is compared.
#
# The exclusive range is deliberate: the inclusive form `[0..n-1]` makes
# CoffeeScript fall back to "slice to the end" when n is 0, which would
# return the whole string instead of an empty one.
cleanTestingText = (text, origTextLength) ->
  text.replace(/\n\n/g, " ").replace(/\ \ /g, " ")[0...origTextLength]
# Prepares a fixture's expected text for comparison: every blank-line
# separator ("\n\n") is replaced by one space. Nothing else is altered.
cleanOrigText = (text) ->
  paragraphBreak = /\n\n/g
  text.replace paragraphBreak, " "
# Runs the extractor, both eagerly and through its lazy interface, against
# one HTML fixture and asserts that each requested field matches the
# expectations stored in the fixture's JSON file.
#
# site   - fixture name; resolves to ./fixtures/test_<site>.html and
#          ./fixtures/test_<site>.json
# fields - list of field names to verify. Recognised values are 'title',
#          'cleaned_text', 'link', 'image', 'description', 'lang',
#          'keywords', 'favicon', 'tags', 'links' and 'videos'. Any other
#          value fails with an "Invalid test!" assertion.
#
# Side effect: for 'links', when the JSON fixture has no `expected.links`
# entry yet, the extracted links are adopted as the expectation and written
# back to the JSON file.
#
# NOTE(review): fs, eq, ok, arrayEq and deepEq are not defined in this
# file; presumably the test harness provides them as globals - confirm.
checkFixture = (site, fields) ->
  html = fs.readFileSync("./fixtures/test_#{site}.html").toString()
  orig = JSON.parse(fs.readFileSync("./fixtures/test_#{site}.json"))
  data = extractor(html)
  dataLazy = extractor.lazy(html)

  _.each fields, (field) ->
    switch field
      when 'title'
        eq orig.expected.title, data.title, "#{site}: title didn't match expected value"
        eq data.title, dataLazy.title(), "#{site}: title from partial extraction didn't match expected value"

      when 'cleaned_text'
        origText = cleanOrigText(orig.expected.cleaned_text)
        # Only the leading origText.length characters are compared.
        newText = cleanTestingText(data.text, origText.length)
        partialExtractText = cleanTestingText(dataLazy.text(), origText.length)
        ok newText, "#{site}: no text was found"
        ok data.text.length >= orig.expected.cleaned_text.length , "#{site}: cleaned text was too short"
        eq origText, newText, "#{site}: cleaned text didn't match expected value"
        eq origText, partialExtractText, "#{site}: cleaned text from partial extract didn't match expected value"

      when 'link'
        eq orig.expected.final_url, data.canonicalLink, "#{site}: canonical link didn't match expected value"
        eq data.canonicalLink, dataLazy.canonicalLink(), "#{site}: canonical link from partial extraction didn't match expected value"

      when 'image'
        eq orig.expected.image, data.image, "#{site}: image didn't match expected value"
        eq data.image, dataLazy.image(), "#{site}: image from partial extraction didn't match expected value"

      when 'description'
        eq orig.expected.meta_description, data.description, "#{site}: meta description didn't match expected value"
        eq data.description, dataLazy.description(), "#{site}: description from partial extraction didn't match expected value"

      when 'lang'
        eq orig.expected.meta_lang, data.lang, "#{site}: detected language didn't match expected value"
        eq data.lang, dataLazy.lang(), "#{site}: language from partial extraction didn't match expected value"

      when 'keywords'
        eq orig.expected.meta_keywords, data.keywords, "#{site}: meta keywords didn't match expected value"
        eq data.keywords, dataLazy.keywords(), "#{site}: meta keywords from partial extraction didn't match expected value"

      when 'favicon'
        eq orig.expected.meta_favicon, data.favicon, "#{site}: favicon url didn't match expected value"
        eq data.favicon, dataLazy.favicon(), "#{site}: favicon url from partial extraction didn't match expected value"

      when 'tags'
        # Order is not significant, so both sides are sorted (in place).
        sortedTags = data.tags.sort()
        arrayEq orig.expected.tags.sort(), sortedTags, "#{site}: meta tags didn't match expected value"
        arrayEq sortedTags, dataLazy.tags().sort(), "#{site}: meta tags from partial extraction didn't match expected value"

      when 'links'
        sortedLinks = data.links.sort()
        sortedLazyLinks = dataLazy.links().sort()
        # First run for this fixture: record the extracted links as the
        # expectation and persist them to the JSON file.
        unless orig.expected.links
          orig.expected.links = sortedLinks
          fs.writeFileSync("./fixtures/test_#{site}.json", JSON.stringify(orig, null, 4))
        expectedLinks = orig.expected.links.sort()
        deepEq expectedLinks, sortedLinks, "#{site}: links didn't match expected value"
        deepEq expectedLinks, sortedLazyLinks, "#{site}: links from partial extraction didn't match expected value"

      when 'videos'
        # The fixture JSON stores the expected videos under `movies`.
        sortedVideos = data.videos.sort()
        deepEq orig.expected.movies.sort(), sortedVideos, "#{site}: videos didn't match expected value"
        deepEq sortedVideos, dataLazy.videos().sort(), "#{site}: videos from partial extraction didn't match expected value"

      else
        # Unknown field name: fail loudly rather than silently skip it.
        eq true, false, "#{site}: Invalid test!"

    # Return nothing explicitly: lodash stops iterating when the callback
    # returns `false`, and the implicit return would otherwise be whatever
    # the last assertion helper returned.
    return
# --- Sanity checks: the module and its lazy entry point are exported ---

test 'exists', ->
  ok extractor

test 'lazy version exists', ->
  ok extractor.lazy

# --- Single-field extraction checks ---

test 'reads favicon', ->
  checkFixture('aolNews', ['favicon'])

test 'reads description', ->
  checkFixture('allnewlyrics1', ['description'])

test 'reads open graph description', ->
  checkFixture('twitter', ['description'])

test 'reads keywords', ->
  checkFixture('allnewlyrics1', ['keywords'])

test 'reads lang', ->
  checkFixture('allnewlyrics1', ['lang'])

test 'reads canonical link', ->
  checkFixture('allnewlyrics1', ['link'])

test 'reads tags', ->
  checkFixture('tags_kexp', ['tags'])
  checkFixture('tags_deadline', ['tags'])
  checkFixture('tags_wnyc', ['tags'])
  checkFixture('tags_cnet', ['tags'])
  checkFixture('tags_abcau', ['tags'])

test 'reads videos', ->
  checkFixture('embed', ['videos'])
  checkFixture('iframe', ['videos'])
  checkFixture('object', ['videos'])

test 'links', ->
  checkFixture('theverge1', ['links'])
  checkFixture('techcrunch1', ['links'])
  checkFixture('polygon', ['links'])

test 'images', ->
  checkFixture('aolNews', ['image'])
  checkFixture('polygon', ['image'])
  checkFixture('theverge1', ['image'])

# --- Per-site article text checks ---

test 'gets cleaned text - Polygon', ->
  checkFixture('polygon', ['cleaned_text', 'title', 'link', 'description', 'lang', 'favicon'])

test 'gets cleaned text - The Verge', ->
  checkFixture('theverge1', ['cleaned_text', 'title', 'link', 'description', 'lang', 'favicon'])

test 'gets cleaned tags - The Verge', ->
  checkFixture('theverge2', ['tags'])

test 'gets cleaned text - McSweeneys', ->
  checkFixture('mcsweeney', ['cleaned_text', 'link', 'lang', 'favicon'])

test 'gets cleaned text - CNN', ->
  checkFixture('cnn1', ['cleaned_text'])

test 'gets cleaned text - MSN', ->
  checkFixture('msn1', ['cleaned_text'])

# Both Time fixtures live under one title; two tests sharing a title are
# indistinguishable in the report and cannot be selected individually.
test 'gets cleaned text - Time', ->
  checkFixture('time2', ['cleaned_text'])
  checkFixture('time', ['cleaned_text'])

test 'gets cleaned text - BI', ->
  checkFixture('businessinsider1', ['cleaned_text'])
  checkFixture('businessinsider2', ['cleaned_text'])
  checkFixture('businessinsider3', ['cleaned_text'])

test 'gets cleaned text - CNBC', ->
  checkFixture('cnbc1', ['cleaned_text'])

test 'gets cleaned text - CBS Local', ->
  checkFixture('cbslocal', ['cleaned_text'])

test 'gets cleaned text - Business Week', ->
  checkFixture('businessWeek1', ['cleaned_text'])
  checkFixture('businessWeek2', ['cleaned_text'])
  checkFixture('businessWeek3', ['cleaned_text'])

test 'gets cleaned text - El Pais', ->
  checkFixture('elpais', ['cleaned_text'])

test 'gets cleaned text - Techcrunk', ->
  checkFixture('techcrunch1', ['cleaned_text'])

test 'gets cleaned text - Fox "News"', ->
  checkFixture('foxNews', ['cleaned_text'])

test 'gets cleaned text - Huff Po', ->
  checkFixture('huffingtonPost2', ['cleaned_text'])
  checkFixture('testHuffingtonPost', ['cleaned_text', 'description', 'title'])

test 'gets cleaned text - ESPN', ->
  checkFixture('espn', ['cleaned_text'])

test 'gets cleaned text - CNet', ->
  checkFixture('cnet', ['cleaned_text'])

test 'gets cleaned text - Yahoo', ->
  checkFixture('yahoo', ['cleaned_text'])

test 'gets cleaned text - Politico', ->
  checkFixture('politico', ['cleaned_text'])

test 'gets cleaned text - Goose Regressions', ->
  checkFixture('issue4', ['cleaned_text'])
  checkFixture('issue24', ['cleaned_text'])
  checkFixture('issue25', ['cleaned_text'])
  checkFixture('issue28', ['cleaned_text'])

test 'gets cleaned text - Gizmodo', ->
  checkFixture('gizmodo1', ['cleaned_text', 'description', 'keywords'])

test 'gets cleaned text - Mashable', ->
  checkFixture('mashable_issue_74', ['cleaned_text'])

test 'gets cleaned text - USA Today', ->
  checkFixture('usatoday_issue_74', ['cleaned_text'])
  checkFixture('usatoday1', ['cleaned_text'])

test 'gets cleaned text - dcurt.is', ->
  checkFixture('dcurtis', ['cleaned_text'])