UNPKG

website-scraper

Version:

Download website to a local directory (including all css, images, js, etc.)

289 lines (239 loc) 9.3 kB
import PromiseQueue from 'p-queue';
import logger from './logger.js';
import defaults from './config/defaults.js';
import recursiveSources from './config/recursive-sources.js';
import Resource from './resource.js';
import request from './request.js';
import ResourceHandler from './resource-handler/index.js';
import {
	SaveResourceToFileSystemPlugin,
	GenerateFilenameBySiteStructurePlugin,
	GenerateFilenameByTypePlugin,
	GetResourceReferencePlugin
} from './plugins/index.js';

import * as utils from './utils/index.js';
const { extend, union, urlsEqual, getTypeByMime, getTypeByFilename, series } = utils;
import NormalizedUrlMap from './utils/normalized-url-map.js';

// Every action name a plugin is allowed to register a handler for (see Scraper#addAction).
const actionNames = [
	'beforeStart', 'afterFinish', 'error',
	'beforeRequest', 'afterResponse',
	'onResourceSaved', 'onResourceError',
	'generateFilename',
	'getReference',
	'saveResource',
];

// Actions that must have at least one handler; the listed plugin is applied
// as a fallback when no user plugin registered one (see Scraper#applyPlugins).
const mandatoryActions = [
	{ name: 'saveResource', pluginClass: SaveResourceToFileSystemPlugin },
	{ name: 'generateFilename', pluginClass: GenerateFilenameByTypePlugin },
	{ name: 'getReference', pluginClass: GetResourceReferencePlugin },
];

// Maps the `filenameGenerator` option value to the plugin class implementing it.
const filenameGeneratorPlugins = {
	byType: GenerateFilenameByTypePlugin,
	bySiteStructure: GenerateFilenameBySiteStructurePlugin
};

/**
 * Runs the handlers of a notification-only action without making the caller wait.
 *
 * The returned promise is intentionally not awaited, but a rejection is logged
 * instead of being left unhandled (an unhandled rejection can terminate the
 * process on current Node.js versions).
 *
 * @param {Scraper} scraper - scraper whose registered actions are run
 * @param {string} actionName - action to run
 * @param {Object} params - parameters passed to the action handlers
 */
function runActionsDetached (scraper, actionName, params) {
	Promise.resolve(scraper.runActions(actionName, params)).catch((actionError) => {
		logger.warn(`action ${actionName} failed`, actionError);
	});
}

/**
 * Coordinates a scraping run: normalizes options, wires plugins to actions,
 * requests resources through a concurrency-limited queue and saves every
 * loaded resource.
 */
class Scraper {
	/**
	 * @param {Object} options - user options; merged over the defaults by normalizeOptions
	 */
	constructor (options) {
		this.normalizeOptions(options);
		logger.info('init with options', this.options);

		this.applyPlugins(this.options.plugins);

		this.resourceHandler = new ResourceHandler(this.options, {
			requestResource: this.requestResource.bind(this),
			getReference: this.runActions.bind(this, 'getReference')
		});
		this.resources = this.options.urls.map(({url, filename}) => new Resource(url, filename));

		this.requestedResourcePromises = new NormalizedUrlMap(); // Map url -> request promise
		this.loadedResources = new NormalizedUrlMap(); // Map url -> resource
		this.requestQueue = new PromiseQueue({concurrency: this.options.requestConcurrency});
	}

	/**
	 * Builds this.options from the defaults and the user options.
	 *
	 * - `urls` becomes an array of `{url, filename}` objects; a missing filename
	 *   falls back to `defaultFilename`.
	 * - `subdirectories[].extensions` are lower-cased.
	 * - when `recursive` is set, `sources` is united with the recursive sources.
	 * - a known `filenameGenerator` value puts the matching plugin first in `plugins`.
	 *
	 * Arrays and objects that get modified are copied first, so neither the caller's
	 * options nor (presumably shallow-merged) defaults are mutated.
	 *
	 * @param {Object} options - user options
	 */
	normalizeOptions (options) {
		this.options = extend(defaults, options);
		this.options.request = extend(defaults.request, options.request);

		const urls = Array.isArray(options.urls) ? options.urls : [options.urls];

		this.options.urls = urls.map((urlItem) => {
			if (typeof urlItem === 'string') {
				return { url: urlItem, filename: this.options.defaultFilename };
			}
			return { url: urlItem.url, filename: urlItem.filename || this.options.defaultFilename };
		});

		if (this.options.subdirectories) {
			// Build new entries rather than rewriting `extensions` on the objects that were passed in.
			this.options.subdirectories = this.options.subdirectories.map((element) => Object.assign({}, element, {
				extensions: element.extensions.map((ext) => ext.toLowerCase())
			}));
		}

		this.options.recursiveSources = recursiveSources;
		if (this.options.recursive) {
			this.options.sources = union(this.options.sources, this.options.recursiveSources);
		}

		// Copy before unshift: otherwise reusing one options object for several scrapers
		// would keep prepending filename generator plugins to the caller's array.
		this.options.plugins = [...(this.options.plugins || [])];

		if (Object.keys(filenameGeneratorPlugins).includes(this.options.filenameGenerator)) {
			this.options.plugins.unshift(new filenameGeneratorPlugins[this.options.filenameGenerator]());
		}
	}

	/**
	 * Resets the action registry, lets every plugin register its handlers and
	 * then applies the default plugin for each mandatory action left without a handler.
	 *
	 * @param {Array} [plugins=[]] - plugin instances exposing `apply(registerAction)`
	 */
	applyPlugins (plugins = []) {
		this.actions = {};
		actionNames.forEach((actionName) => {
			this.actions[actionName] = [];
		});

		plugins.forEach((plugin) => {
			logger.debug(`[plugin] apply plugin ${plugin.constructor.name}`);
			plugin.apply(this.addAction.bind(this));
		});

		mandatoryActions.forEach((mandatoryAction) => {
			if (this.actions[mandatoryAction.name].length === 0) {
				const plugin = new mandatoryAction.pluginClass();
				logger.debug(`[plugin] apply default plugin ${plugin.constructor.name} for action ${mandatoryAction.name}`);
				plugin.apply(this.addAction.bind(this));
			}
		});
	}

	/**
	 * Registers a handler for an action.
	 *
	 * @param {string} name - one of actionNames
	 * @param {Function} handler - handler appended to the action's handler list
	 * @throws {Error} when name is not a known action
	 */
	addAction (name, handler) {
		if (!actionNames.includes(name)) {
			throw new Error(`Unknown action "${name}"`);
		}
		logger.debug(`add action ${name}`);
		this.actions[name].push(handler);
	}

	/**
	 * Remembers a resource as loaded, keyed by its url, unless that url is already known.
	 *
	 * @param {Resource} resource
	 */
	loadResource (resource) {
		const url = resource.getUrl();

		if (this.loadedResources.has(url)) {
			logger.debug('found loaded resource for ' + resource);
		} else {
			logger.debug('add loaded resource ' + resource);
			this.loadedResources.set(url, resource);
		}
	}

	/**
	 * Handles a loaded resource and saves it through the `saveResource` action.
	 * The resource is marked as saved up front, before any work is done.
	 * Failures are routed to handleError.
	 *
	 * @param {Resource} resource
	 * @returns {Promise<void>}
	 */
	async saveResource (resource) {
		resource.setSaved();

		try {
			await this.resourceHandler.handleResource(resource);
			logger.info('saving resource ' + resource + ' to fs');
			await this.runActions('saveResource', {resource});
			// not awaited on purpose: just notifying external code about resource saved
			runActionsDetached(this, 'onResourceSaved', {resource});
		} catch (err) {
			logger.warn('failed to save resource ' + resource);
			await this.handleError(err, resource);
		}
	}

	/**
	 * Starts a request for the resource and registers its promise under the resource url.
	 *
	 * The request itself goes through requestQueue. After the response arrives the
	 * resource gets its (possibly redirected) url, type, filename, metadata,
	 * encoding and text, and is added to the loaded resources.
	 *
	 * @param {Resource} resource
	 * @returns {Promise<Resource|null>} the filled resource; null when there was no response.
	 *   Request failures go through handleError, which either returns null or rethrows.
	 */
	createNewRequest (resource) {
		const url = resource.getUrl();

		const requestPromise = Promise.resolve()
			.then(async () => {
				const referer = resource.parent ? resource.parent.getUrl() : null;
				return this.requestQueue.add(async () => {
					const {requestOptions} = await this.runActions('beforeRequest', {resource, requestOptions: this.options.request});
					return request.get({
						url,
						referer,
						options: requestOptions,
						// pass the hook only when some plugin actually registered an afterResponse handler
						afterResponse: this.actions.afterResponse.length ? this.runActions.bind(this, 'afterResponse') : undefined
					});
				});
			})
			.then(async (responseData) => {
				if (!responseData) {
					logger.debug('no response returned for url ' + url);
					return null;
				}

				if (!urlsEqual(responseData.url, url)) { // Url may be changed in redirects
					logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url);

					// the redirect target was already requested - reuse that request
					if (this.requestedResourcePromises.has(responseData.url)) {
						return this.requestedResourcePromises.get(responseData.url);
					}

					resource.setUrl(responseData.url);
					this.requestedResourcePromises.set(responseData.url, requestPromise);
				}

				resource.setType(getTypeByMime(responseData.mimeType));

				const { filename } = await this.runActions('generateFilename', { resource, responseData });
				resource.setFilename(filename);

				// if type was not determined by mime we can try to get it from filename after it was generated
				if (!resource.getType()) {
					resource.setType(getTypeByFilename(filename));
				}

				if (responseData.metadata) {
					resource.setMetadata(responseData.metadata);
				}

				resource.setEncoding(responseData.encoding);
				resource.setText(responseData.body);

				this.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
				return resource;
			})
			.catch((err) => {
				logger.error('failed to request resource ' + resource);
				return this.handleError(err, resource);
			});

		this.requestedResourcePromises.set(url, requestPromise);
		return requestPromise;
	}

	/**
	 * Requests a resource unless it is filtered out or was already requested.
	 *
	 * The url filter is only consulted for resources with depth > 0. A falsy
	 * maxDepth means no depth limit is applied.
	 *
	 * @param {Resource} resource
	 * @returns {Promise<Resource|null>} null when the resource is filtered out
	 */
	async requestResource (resource) {
		const url = resource.getUrl();
		const depth = resource.getDepth();

		if (this.options.urlFilter && depth > 0 && !this.options.urlFilter(url)) {
			logger.debug('filtering out ' + resource + ' by url filter');
			return null;
		}

		if (this.options.maxDepth && depth > this.options.maxDepth) {
			logger.debug('filtering out ' + resource + ' by depth');
			return null;
		}

		if (this.requestedResourcePromises.has(url)) {
			logger.debug('found requested resource for ' + resource);
			return this.requestedResourcePromises.get(url);
		}

		return this.createNewRequest(resource);
	}

	/**
	 * Runs the handlers registered for an action one after another, in registration order.
	 *
	 * Each handler receives `extend(params, result)`, where result is what the previous
	 * handler returned (initially `extend(params)`). Entries that are not functions are skipped.
	 *
	 * @param {string} actionName - action to run
	 * @param {Object} [params] - parameters for the handlers
	 * @returns {Promise<*>} return value of the last handler that ran,
	 *   or `extend(params)` when none ran
	 */
	async runActions (actionName, params) {
		logger.debug(`run ${this.actions[actionName].length} actions ${actionName}`);

		let result = extend(params);
		for (const action of this.actions[actionName]) {
			if (typeof action === 'function') {
				result = await action(extend(params, result));
			}
		}
		return result;
	}

	/**
	 * Requests all root resources and then waits until everything loaded is saved.
	 *
	 * @returns {Promise<Resource[]>} the root resources
	 */
	async load () {
		// `series` receives functions rather than started promises, so it decides when each request begins
		const requestTasks = this.resources.map((resource) => this.requestResource.bind(this, resource));
		await series(requestTasks);
		return this.waitForLoad();
	}

	// Returns a promise which gets resolved when all resources are loaded.
	// 1. Get all not saved resources and save them
	// 2. Recursion if any new not saved resource were added during this time. If not, loading is done.
	async waitForLoad () {
		const resourcesToSave = Array.from(this.loadedResources.values()).filter((r) => !r.isSaved());
		const loadingIsFinished = resourcesToSave.length === 0;

		if (!loadingIsFinished) {
			const saveTasks = resourcesToSave.map((resource) => this.saveResource.bind(this, resource));
			await series(saveTasks);
			return this.waitForLoad();
		}

		logger.info('downloading is finished successfully');
		return this.resources;
	}

	/**
	 * Reports a resource error to the `onResourceError` action and then either
	 * swallows it (option `ignoreErrors`) or rethrows it.
	 *
	 * @param {Error} err - error that occurred
	 * @param {Resource} resource - resource the error belongs to
	 * @returns {Promise<null>} null when errors are ignored
	 * @throws {Error} err itself when `ignoreErrors` is not set
	 */
	async handleError (err, resource) {
		// not awaited on purpose: just notifying external code about resource error
		runActionsDetached(this, 'onResourceError', {resource, error: err});

		if (this.options.ignoreErrors) {
			logger.warn('ignoring error: ' + err.message);
			return null;
		}
		logger.error('error occurred: ' + err.message);
		throw err;
	}

	/**
	 * Entry point: runs `beforeStart`, loads and saves everything, and always
	 * runs `afterFinish` at the end. On failure the `error` action is run and
	 * the error is rethrown.
	 *
	 * @returns {Promise<Resource[]>} the root resources
	 */
	async scrape () {
		try {
			await this.runActions('beforeStart', {options: this.options, utils});
			return await this.load();
		} catch (error) {
			logger.error('finishing with error: ' + error.message);
			await this.runActions('error', {error});
			throw error;
		} finally {
			await this.runActions('afterFinish');
		}
	}
}

export default Scraper;