UNPKG

html-metadata

Version:

Scrapes metadata of several different standards

184 lines (162 loc) 6.58 kB
'use strict';

/**
 * Integration tests for the html-metadata scraper.
 *
 * Every test in this file talks to a live web site, so the suite needs network
 * access and its outcome depends on those third-party pages staying available.
 */

const meta = require( '../index' );
const assert = require( 'assert' );
const cheerio = require( 'cheerio' );

// mocha defines to avoid eslint breakage
/* global describe, it */

describe( 'scraping', function () {

	// A regular function (not an arrow) is used above so that `this` is the
	// mocha suite context. The timeout is generous because every test performs
	// real HTTP requests.
	this.timeout( 100000 );

	// Browser-like request headers shared by the tests that send custom headers.
	const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
	const acceptHeader = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8';

	/**
	 * Fetches a page with the shared User-Agent and Accept headers.
	 *
	 * @param {string} url Address of the page to download.
	 * @return {Promise<string>} Resolves with the response body as text.
	 */
	function getWithHeaders( url ) {
		// eslint-disable-next-line n/no-unsupported-features/node-builtins
		return fetch( url, {
			method: 'GET',
			headers: {
				'User-Agent': userAgent,
				Accept: acceptHeader
			}
		// res.body is a ReadableStream of a Uint8Array, but we just want the string
		} ).then( ( res ) => res.text() );
	}

	/**
	 * Checks that a caught value is the error expected when no uri is given.
	 *
	 * Returns true so that it can also be used as an assert.rejects() validation
	 * function, which is required to return true on success.
	 *
	 * @param {*} e Value the promise was rejected with.
	 * @return {boolean} Always true; a mismatch throws an AssertionError instead.
	 */
	function assertNoUriError( e ) {
		assert.ok( e instanceof Error, 'Error should be an Error object' );
		assert.strictEqual( e.message, 'No uri supplied in argument', 'Error message should match expected message' );
		return true;
	}

	describe( 'parseAll function', () => {

		it( 'should resolve promise from woorank with headers', () => {
			const url = 'https://www.woorank.com/en/blog/dublin-core-metadata-for-seo-and-usability';
			return meta( { uri: url, headers: { 'User-Agent': userAgent, Accept: acceptHeader } } )
				.then( ( result ) => {
					assert.ok( result, 'Expected result to be truthy' );
				} )
				.catch( ( e ) => {
					// Log for easier diagnosis, then rethrow so the test still fails.
					console.error( 'Error in woorank test:', e );
					throw e;
				} );
		} );

		it( 'should resolve promise from blog.schema.org without headers', () => {
			const url = 'http://blog.schema.org';
			return meta( url )
				.then( ( result ) => {
					assert.ok( result, 'Expected result to be truthy' );
				} )
				.catch( ( e ) => {
					// Log for easier diagnosis, then rethrow so the test still fails.
					console.error( 'Error in blog.schema.org test:', e );
					throw e;
				} );
		} );

		// The two-argument form of then() is deliberate: the rejection handler
		// only sees rejections of meta() itself, so the AssertionError raised by
		// assert.fail() is reported as-is instead of being re-validated (and
		// misreported as a message mismatch) by a trailing catch().
		it( 'should throw error if no uri supplied', () => meta().then(
			() => {
				assert.fail( 'Should have rejected the promise' );
			},
			assertNoUriError
		) );

		it( 'should support await implementation with headers', async () => {
			const url = 'http://blog.schema.org';
			const result = await meta( { uri: url, headers: { 'User-Agent': userAgent, Accept: acceptHeader } } );
			assert.ok( result, 'Expected result to be truthy' );
		} );

		it( 'should support await implementation without headers', async () => {
			const url = 'http://blog.schema.org';
			const result = await meta( url );
			assert.ok( result, 'Expected result to be truthy' );
		} );

		// assert.rejects() keeps the "did not throw" failure separate from the
		// validation of the thrown error. Wrapping the call in an async function
		// means a synchronous throw from meta() is validated the same way as a
		// rejection, as it was with the previous try/catch.
		it( 'should throw error if no uri is supplied with async/await', async () => {
			await assert.rejects(
				async () => {
					await meta();
				},
				assertNoUriError,
				'Should have thrown an error'
			);
		} );

	} );

	describe( 'parseBEPress function', () => {

		it( 'should get BE Press metadata tags', () => {
			const url = 'http://biostats.bepress.com/harvardbiostat/paper154/';
			return getWithHeaders( url ).then( ( body ) => {
				const expectedAuthors = [ 'Claggett, Brian', 'Xie, Minge', 'Tian, Lu' ];
				const expectedAuthorInstitutions = [ 'Harvard', 'Rutgers University - New Brunswick/Piscataway', 'Stanford University School of Medicine' ];
				const chtml = cheerio.load( body );

				return meta.parseBEPress( chtml )
					.then( ( results ) => {
						assert.deepStrictEqual( results.author, expectedAuthors );
						assert.deepStrictEqual( results.author_institution, expectedAuthorInstitutions );
						[ 'series_title', 'author', 'author_institution', 'title', 'date', 'pdf_url', 'abstract_html_url', 'publisher', 'online_date' ].forEach( ( key ) => {
							assert.ok( results[ key ], `Expected to find the ${ key } key in the response!` );
						} );
					} );
			} );
		} );

	} );

	describe( 'parseCOinS function', () => {

		it( 'should get COinS metadata', () => {
			const url = 'https://en.wikipedia.org/wiki/Viral_phylodynamics';
			return getWithHeaders( url ).then( ( body ) => {
				const chtml = cheerio.load( body );

				return meta.parseCOinS( chtml )
					.then( ( results ) => {
						assert.ok( Array.isArray( results ), `Expected Array, got ${ typeof results }` );
						assert.ok( results.length > 0, 'Expected Array with at least 1 item' );
						assert.ok( results[ 0 ].rft, 'Expected first item of Array to contain key rft' );
					} );
			} );
		} );

	} );

	describe( 'parseEPrints function', () => {

		it( 'should get EPrints metadata', () => {
			const url = 'http://eprints.gla.ac.uk/113711/';
			return getWithHeaders( url ).then( ( body ) => {
				const chtml = cheerio.load( body );
				const expectedAuthors = [ 'Gatherer, Derek', 'Kohl, Alain' ];

				return meta.parseEprints( chtml )
					.then( ( results ) => {
						assert.deepStrictEqual( results.creators_name, expectedAuthors );
						[ 'eprintid', 'datestamp', 'title', 'abstract', 'issn', 'creators_name', 'publication', 'citation' ].forEach( ( key ) => {
							assert.ok( results[ key ], `Expected to find the ${ key } key in the response!` );
						} );
					} );
			} );
		} );

	} );

	describe( 'parseGeneral function', () => {

		it( 'should get html lang parameter', () => {
			const expected = 'fr';
			const url = 'http://www.lemonde.fr';
			return getWithHeaders( url ).then( ( body ) => {
				const chtml = cheerio.load( body );
				return meta.parseGeneral( chtml ).then( ( results ) => {
					assert.strictEqual( results.lang, expected );
				} );
			} );
		} );

		it( 'should get html dir parameter', () => {
			const expected = 'rtl';
			const url = 'https://www.iranrights.org/fa/';
			return getWithHeaders( url ).then( ( body ) => {
				const chtml = cheerio.load( body );
				return meta.parseGeneral( chtml ).then( ( results ) => {
					assert.strictEqual( results.dir, expected );
				} );
			} );
		} );

	} );

	// Registered directly on the 'scraping' suite rather than inside one of the
	// per-parser suites above.
	it( 'should not have any undefined values', () => {
		const url = 'http://web.archive.org/web/20220127144804/https://www.cnet.com/special-reports/vr101/';
		return getWithHeaders( url ).then( ( body ) => {
			const chtml = cheerio.load( body );
			return meta.parseAll( chtml )
				.then( ( results ) => {
					// Walk every metadata type and every key under it.
					Object.keys( results ).forEach( ( metadataType ) => {
						Object.keys( results[ metadataType ] ).forEach( ( key ) => {
							assert.notStrictEqual( results[ metadataType ][ key ], undefined, `${ metadataType }.${ key } should not be undefined` );
						} );
					} );
				} );
		} );
	} );

} );