UNPKG

webscrape

Version:

Scrape web pages. Uses and returns promises

213 lines (171 loc) 6.87 kB
/* global describe, it, afterEach, before, after */
import assert from 'assert';
import Scraper from '../src/index';
import Promise from 'bluebird';
import fs from 'fs';
import mkdirp from 'mkdirp';
import path from 'path';
import app from './server';

Promise.promisifyAll(fs);
const mkdir = Promise.promisify(mkdirp);

/**
 * Best-effort removal of the given paths, processed one at a time in the
 * order supplied (so files inside a directory can be listed before the
 * directory itself).
 *
 * Directories are removed with `rmdir`, everything else with `unlink`.
 * A path that does not exist is skipped silently; any other failure is
 * reported with `console.warn` but never thrown, so a cleanup hook cannot
 * fail the suite.
 *
 * @param {...string} files - Paths to delete.
 * @returns {Promise<void>} Resolves once every path has been attempted.
 */
async function deleteIfExists(...files) {
  for (const file of files) {
    try {
      const stat = await fs.statAsync(file);
      // Pick exactly one removal call; the previous `a && b || c` form fell
      // through to unlink after a successful rmdir because rmdir resolves
      // to undefined.
      if (stat.isDirectory()) {
        await fs.rmdirAsync(file);
      } else {
        await fs.unlinkAsync(file);
      }
    } catch (err) {
      // A missing file is the expected case and is ignored. Anything else
      // is surfaced so that leftover artefacts are not silently kept.
      if (!err || err.code !== 'ENOENT') {
        console.warn(`deleteIfExists: could not remove ${file}:`, err);
      }
    }
  }
}

describe('Scraper', () => {
  // MAYDO: Transpilation means that the function length cannot be properly tested
  const apis = [
    { name: 'get', length: 2 },
    { name: 'post', length: 2 },
    { name: 'download', length: 2 },
  ];

  it('should return a scrape object', async () => {
    const scraper = Scraper();
    assert.equal(typeof scraper, 'object');
    apis.forEach((api) => {
      assert.equal(typeof scraper[api.name], 'function');
    });
  });

  it('should be able to fetch Google\'s home page', async () => {
    const scraper = Scraper();
    const result = await scraper.get('https://www.google.com');
    assert.ok(result.body);
    assert.ok(result.body.includes('<title>Google</title>'));
  });

  it('should get a $ representation of Google\'s home page', async () => {
    const scraper = Scraper();
    const result = await scraper.get('https://www.google.com');
    assert.ok(result.$);
    assert.equal(result.$('title').text(), 'Google');
  });

  it('should be able to fetch a JSON response from Google\'s GEOCoding API', async () => {
    const query = {
      'address': '1600 Amphitheatre Parkway, Mountain View, CA',
      'sensor': 'false',
    };
    const scraper = Scraper();
    const result = await scraper.get('http://maps.googleapis.com/maps/api/geocode/json', { query });
    assert.ok(result.json);
  });

  describe('download', () => {
    const url = 'https://ajax.googleapis.com/ajax/libs/swfobject/2.2/swfobject.js';
    const innerpath = 'tempinnerfolder';
    const baseName = 'swfobject.js';
    const newName = 'differentswfobject.js';

    // yuck
    let server;
    let port;

    // Start the local test server on an OS-assigned port and remember the
    // port for the POST download tests below.
    before((next) => {
      server = app.listen(() => {
        port = server.address().port;
        next();
      });
    });

    after(() => server.close());

    // Files inside `innerpath` are listed before `innerpath` itself so the
    // directory is empty by the time it is removed.
    afterEach(async () => await deleteIfExists(
      baseName,
      newName,
      path.join(innerpath, baseName),
      innerpath
    ));

    it('should be able to download the SWF object javascript library from Google\'s CDN', async () => {
      const scraper = Scraper();
      const downloadedName = await scraper.download(url);
      assert.equal(downloadedName, baseName);
      const contents = await fs.readFileAsync(downloadedName, { encoding: 'utf8' });
      assert.ok(contents.includes('SWFObject'));
    });

    it('should be able to download to the "inner" folder when that is used as the "target filename"', async () => {
      const scraper = Scraper();
      await mkdir(innerpath);
      const downloadedName = await scraper.download(url, { filename: innerpath });
      assert.equal(downloadedName, path.join(innerpath, baseName));
      const contents = await fs.readFileAsync(downloadedName, { encoding: 'utf8' });
      assert.ok(contents.includes('SWFObject'));
    });

    it('should be able to download the SWF object javascript library under a different name', async () => {
      const scraper = Scraper();
      const downloadedName = await scraper.download(url, { filename: newName });
      assert.equal(downloadedName, newName);
      const contents = await fs.readFileAsync(downloadedName, { encoding: 'utf8' });
      assert.ok(contents.includes('SWFObject'));
    });

    it('should be able to download using POST with URL encoded body', async () => {
      const scraper = Scraper();
      const downloadedName = await scraper.download(`http://127.0.0.1:${port}/download1`, {
        post: { one: '1', two: '2' },
        filename: newName,
      });
      const contents = await fs.readFileAsync(downloadedName, { encoding: 'utf8' });
      assert.ok(contents.includes('SWFObject'));
    });

    it('should be able to download using POST with JSON encoded body', async () => {
      const scraper = Scraper();
      const downloadedName = await scraper.download(`http://127.0.0.1:${port}/download2`, {
        headers: { 'content-type': 'application/json' },
        post: { one: '1', two: '2' },
        filename: newName,
      });
      const contents = await fs.readFileAsync(downloadedName, { encoding: 'utf8' });
      assert.ok(contents.includes('SWFObject'));
    });

    it('should NOT be able to download using POST with JSON encoded body if that is not supported by the server', async () => {
      const scraper = Scraper();
      // Record the rejection and assert outside the try/catch: calling
      // assert.fail() inside the try would be caught by the catch below and
      // the test could never fail.
      let error = null;
      try {
        await scraper.download(`http://127.0.0.1:${port}/download1`, {
          headers: { 'content-type': 'application/json' },
          post: { one: 1, two: 2 },
          filename: newName,
        });
      } catch (err) {
        error = err;
      }
      assert.ok(error, 'An error should have been thrown');
    });
  });

  describe('support for headers', () => {
    const tempfile = 'testfile.txt';
    const headers = { Referer: 'nonsense' };
    const HEADER_DIAGNOSIS_URL = 'https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending';

    it('should be able to add a header that is sent in the HTTP request', async () => {
      const scraper = Scraper();
      const result = await scraper.get(HEADER_DIAGNOSIS_URL, { headers });
      assert.ok(result.$('th:contains("Referer")').next().text().includes('nonsense'));
    });

    it('should be able to download a file with the header sent in the HTTP request', async () => {
      const scraper = Scraper();
      const downloadedName = await scraper.download(HEADER_DIAGNOSIS_URL, { filename: tempfile, headers });
      const contents = await fs.readFileAsync(downloadedName, { encoding: 'utf8' });
      assert.ok(contents.includes('nonsense'));
    });

    it('should be able to read headers from the response', async () => {
      const scraper = Scraper();
      const { headers } = await scraper.get('https://www.instagram.com');
      assert.ok(headers['set-cookie']);
    });

    afterEach(async () => await deleteIfExists(tempfile));
  });

  describe('querystring support', () => {
    it('should use indices by default for arrays in query', async () => {
      const scraper = Scraper();
      const { json } = await scraper.get('https://postman-echo.com/get', {
        query: { test: [123, 234] },
      });
      assert.equal(decodeURI(json.url), 'https://postman-echo.com/get?test[0]=123&test[1]=234');
    });

    it('should disable indicies if specified as false in options', async () => {
      const scraper = Scraper();
      const { json } = await scraper.get('https://postman-echo.com/get', {
        query: { test: [123, 234] },
        indicies: false,
      });
      assert.equal(decodeURI(json.url), 'https://postman-echo.com/get?test=123&test=234');
    });
  });
});