UNPKG

@egeria/scraper-plugin

Version:

Egeria | scraper plugin

158 lines (144 loc) 4.48 kB
/*
   .--.      .-'.      .--.      .--.      .--.      .--.      .`-.      .--.
:::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\
'      `--'      `.-'      `--'      `--'      `--'      `-.'      `--'      `

  Egeria - She bestows Knowledge and Wisdom
  Copyright (C) 2016-2019 MySidesTheyAreGone <mysidestheyaregone@protonmail.com>

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU Affero General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU Affero General Public License for more details.

  You should have received a copy of the GNU Affero General Public License
  along with this program. If not, see <http://www.gnu.org/licenses/>.

   .--.      .-'.      .--.      .--.      .--.      .--.      .`-.      .--.
:::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\
'      `--'      `.-'      `--'      `--'      `--'      `-.'      `--'      `
*/

const R = require('ramda')
const xray = require('x-ray')
const T = require('@egeria/tools')
const W = require('@egeria/httplib')
const liburl = require('url')

/**
 * Invoke `fn` and never reject: anything `fn` throws (or rejects with) is
 * returned as the resolved value instead of propagating.
 *
 * @param {Function} fn - function to call; may be sync or async
 * @param {*} params - argument list; a non-array value is treated as the
 *   single argument
 * @returns {Promise<*>} the value produced by `fn`, or the caught error
 */
async function failsafe (fn, params) {
  // Plain array literal instead of R.of: the unary form of R.of is not
  // available in every Ramda release.
  const args = Array.isArray(params) ? params : [params]
  try {
    return await fn(...args)
  } catch (e) {
    return e
  }
}

/**
 * Promise adapter around x-ray's callback interface.
 *
 * @param {string} body - markup handed to x-ray as the source
 * @param {string} selector - scope selector
 * @param {*} def - x-ray selector definition evaluated inside the scope
 * @returns {Promise<*>} whatever x-ray passes to its callback; rejects with
 *   the callback's error when it is not nil
 */
function x (body, selector, def) {
  const scrape = xray()
  return new Promise((resolve, reject) => {
    scrape(body, selector, def)((err, obj) => {
      if (R.isNil(err)) {
        resolve(obj)
      } else {
        reject(err)
      }
    })
  })
}

/**
 * Fetch one page, announce every scraped item tagged with
 * `origin: 'scraper'`, then follow the "next page" link recursively.
 *
 * @param {object} api - must provide `enqueue(fn)` and `announce(item)`
 * @param {object} cfg - plugin configuration (`items`, `fields`, and the
 *   optional `nextPage` and `limit`)
 * @param {number} n - number of pages already followed from the start URL
 * @param {string} url - page to fetch
 * @returns {Promise<boolean>} always `true` once the chain of pages is done
 */
async function step (api, cfg, n, url) {
  const page = await api.enqueue(() => W.httpreq({ url: url }))
  let data = await x(page.body, cfg.items, [cfg.fields])
  R.forEach(R.pipe(T.collapse(''), R.assoc('origin', 'scraper'), api.announce), data)
  // Drop the reference before awaiting further pages so the items of this
  // page are not kept reachable for the whole recursion.
  data = null

  let ref
  if (!T.isMissing(cfg.nextPage)) {
    const found = await x(page.body, 'body', [{ next: cfg.nextPage }])
    // R.path tolerates an empty or missing result; pagination then just stops
    // instead of throwing on `found[0].next`.
    ref = R.path([0, 'next'], found)
  }

  // `limit` defaults to 1, i.e. only the start page is fetched unless the
  // configuration asks for more.
  if (!T.isMissing(ref) && ++n < R.defaultTo(1, cfg.limit)) {
    await step(api, cfg, n, liburl.resolve(url, ref))
  }
  return true
}

/**
 * Entry point of the `scraper` input plugin: scrapes every start URL
 * concurrently. A failure on one URL does not abort the others because each
 * job is wrapped in `failsafe`.
 *
 * @param {object} api - plugin API, forwarded to `step`
 * @param {object} cfg - plugin configuration; `start` is a URL or an array
 *   of URLs
 * @returns {Promise<void>}
 */
async function action (api, cfg) {
  const jobs = R.flatten([cfg.start]).map((url) => failsafe(step, [api, cfg, 0, url]))
  // `step` announces every item itself. The settled values here are only
  // `true` or a caught error, so they are not facts and are not announced.
  await Promise.all(jobs)
}

/**
 * Entry point of the `scraperLookup` mutator plugin: tries each start URL
 * (expanded through `fact.applyTemplate`) in order and merges the first
 * scraped item into the fact.
 *
 * @param {object} api - must provide `enqueue(fn)`
 * @param {object} cfg - plugin configuration (`start`, `items`, `fields`)
 * @param {object} fact - must provide `applyTemplate(template)` and `map(fn)`
 * @returns {Promise<*>} the result of `fact.map`
 */
async function lookupAction (api, cfg, fact) {
  const urls = R.map((t) => fact.applyTemplate(t), R.flatten([cfg.start]))
  let result
  for (const url of urls) {
    const page = await api.enqueue(() => W.httpreq({ url: url }))
    const data = await x(page.body, cfg.items, [cfg.fields])
    if (!T.isMissing(data)) {
      result = data[0]
      break
    }
  }
  // R.merge gives precedence to its second argument, so values already on
  // the fact win over scraped ones. With no match `result` stays undefined
  // and the fact data is merged onto an empty object.
  return fact.map(R.merge(result))
}

/**
 * Plugin descriptors exported by this module.
 *
 * NOTE(review): `sanity` looks like a JSON schema using the `patternRequired`
 * keyword from ajv-keywords, which takes an array of patterns - confirm
 * against the host that validates it.
 */
const plugins = {
  scraper: {
    type: 'input',
    requires: ['schedule'],
    sanity: {
      type: 'object',
      required: ['start', 'items', 'fields'],
      properties: {
        start: { type: ['string', 'array'] },
        nextPage: { type: 'string' },
        limit: { type: 'number' },
        items: { type: 'string' },
        fields: {
          type: 'object',
          not: { patternRequired: ['system:.*'] }
        }
      }
    },
    limits: {
      upstream: 'http',
      concurrency: 4,
      delay: 1,
      timeout: 60
    },
    act: action
  },
  scraperLookup: {
    type: 'mutator',
    sanity: {
      type: 'object',
      required: ['start', 'items', 'fields'],
      properties: {
        start: { type: ['string', 'array'] },
        nextPage: { type: 'string' },
        limit: { type: 'number' },
        items: { type: 'string' },
        fields: {
          type: 'object',
          // Array form, consistent with the `scraper` schema above.
          not: { patternRequired: ['system:.*'] }
        }
      }
    },
    limits: {
      upstream: 'http',
      concurrency: 4,
      delay: 1,
      timeout: 60
    },
    act: lookupAction
  }
}

module.exports = plugins