@egeria/scraper-plugin
Version:
Egeria | scraper plugin
158 lines (144 loc) • 4.48 kB
JavaScript
/*
.--. .-'. .--. .--. .--. .--. .`-. .--.
:::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\
' `--' `.-' `--' `--' `--' `-.' `--' `
Egeria - She bestows Knowledge and Wisdom
Copyright (C) 2016-2019 MySidesTheyAreGone <mysidestheyaregone@protonmail.com>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
.--. .-'. .--. .--. .--. .--. .`-. .--.
:::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\::::::::.\
' `--' `.-' `--' `--' `--' `-.' `--' `
*/
const R = require('ramda')
const xray = require('x-ray')
const T = require('@egeria/tools')
const W = require('@egeria/httplib')
const liburl = require('url')
/**
 * Call `fn` and never reject: whatever `fn` throws (synchronously or through
 * a rejected promise) is returned as the resolved value instead.
 *
 * Callers must therefore inspect the result themselves (for example with
 * `instanceof Error`) to tell success from failure.
 *
 * @param {Function} fn - function to invoke; may be sync or async
 * @param {*} params - argument list for `fn`; a non-array value is treated
 *   as a single argument
 * @returns {Promise<*>} the value `fn` produced, or the error it raised
 */
async function failsafe (fn, params) {
  // Native wrapping instead of unary `R.of`, whose signature differs between
  // Ramda releases; an array literal behaves the same everywhere.
  const args = Array.isArray(params) ? params : [params]
  try {
    // `return await` (not a bare `return`) so rejections are caught below.
    return await fn(...args)
  } catch (err) {
    // Deliberate best-effort: hand the failure back as a value.
    return err
  }
}
/**
 * Promise adapter around x-ray's node-style callback interface.
 *
 * A fresh x-ray instance is created per call, configured with the given
 * arguments and then invoked with a callback that settles the promise.
 *
 * @param {*} body - source handed to x-ray as its first argument
 * @param {*} selector - second argument passed to x-ray
 * @param {*} def - third argument passed to x-ray
 * @returns {Promise<*>} resolves with the object x-ray yields; rejects with
 *   the error x-ray reports (any non-nil error value)
 */
function x (body, selector, def) {
  const scrape = xray()
  return new Promise((resolve, reject) => {
    const settle = (err, obj) => {
      if (R.isNil(err)) {
        resolve(obj)
        return
      }
      reject(err)
    }
    // Built inside the executor so a synchronous throw becomes a rejection.
    scrape(body, selector, def)(settle)
  })
}
/**
 * Scrape one page, announce every extracted item, then follow the
 * "next page" link recursively while the page budget allows it.
 *
 * @param {object} api - host API; this function uses `api.enqueue` to run
 *   the HTTP request and `api.announce` to emit each item
 * @param {object} cfg - plugin configuration; reads `items`, `fields`,
 *   `nextPage` and `limit`
 * @param {number} n - pages already followed on this chain (0 on first call)
 * @param {string} url - address of the page to fetch
 * @returns {Promise<boolean>} always resolves to `true`; failures propagate
 *   as rejections
 */
async function step (api, cfg, n, url) {
  const response = await api.enqueue(() => W.httpreq({ url }))

  // Each item goes through T.collapse('') (helper from @egeria/tools, exact
  // semantics not visible here), is tagged with its origin, then announced.
  let items = await x(response.body, cfg.items, [cfg.fields])
  const publish = R.pipe(T.collapse(''), R.assoc('origin', 'scraper'), api.announce)
  R.forEach(publish, items)
  // Drop the reference before recursing so the items are not kept alive
  // for the whole pagination chain.
  items = null

  let nextRef
  if (!T.isMissing(cfg.nextPage)) {
    const matches = await x(response.body, 'body', [{ next: cfg.nextPage }])
    nextRef = matches[0].next
  }

  if (T.isMissing(nextRef)) {
    return true
  }

  // Without an explicit `cfg.limit` the budget is a single page.
  const pagesDone = n + 1
  if (pagesDone < R.defaultTo(1, cfg.limit)) {
    // The link may be relative; resolve it against the current page.
    await step(api, cfg, pagesDone, liburl.resolve(url, nextRef))
  }
  return true
}
/**
 * Entry point of the `scraper` plugin: start one scraping chain per start
 * URL, run them concurrently and wait for all of them to finish.
 *
 * Every chain is wrapped in `failsafe`, so one failing start URL does not
 * abort the others.
 *
 * @param {object} api - host API forwarded to `step`; `api.announce` is also
 *   called here for any object result a chain returns
 * @param {object} cfg - plugin configuration; `cfg.start` is a single URL or
 *   a (possibly nested) list of URLs
 * @returns {Promise<void>}
 */
async function action (api, cfg) {
  const startUrls = R.flatten([cfg.start])
  const jobs = startUrls.map((url) => failsafe(step, [api, cfg, 0, url]))
  const results = await Promise.all(jobs)

  // Iterate the VALUES (`for...of`); `for...in` would yield the array's
  // index strings and announce "0", "1", ... instead of results.
  for (const result of results) {
    // `step` announces its own items and resolves to `true`; `failsafe`
    // turns a failure into an Error value. Neither should be announced,
    // so only genuine object results are forwarded.
    if (result === null || typeof result !== 'object' || result instanceof Error) {
      continue
    }
    result.origin = 'scraper'
    api.announce(result)
  }
}
/**
 * Entry point of the `scraperLookup` plugin: try each start URL in order
 * and use the first scraped item that is found.
 *
 * Each entry of `cfg.start` is first passed through `fact.applyTemplate`.
 * The resulting URLs are fetched one at a time; the loop stops at the first
 * page whose extraction is not "missing" (per `T.isMissing`) and keeps the
 * first extracted item.
 *
 * @param {object} api - host API; `api.enqueue` runs each HTTP request
 * @param {object} cfg - plugin configuration; reads `start`, `items`, `fields`
 * @param {object} fact - object being processed; must provide
 *   `applyTemplate` and `map`
 * @returns {*} whatever `fact.map` returns
 */
async function lookupAction (api, cfg, fact) {
  const templates = R.flatten(R.of(cfg.start))
  const targets = templates.map((tpl) => fact.applyTemplate(tpl))

  let found
  for (const target of targets) {
    const response = await api.enqueue(() => W.httpreq({ url: target }))
    const rows = await x(response.body, cfg.items, [cfg.fields])
    if (T.isMissing(rows)) {
      continue
    }
    found = rows[0]
    break
  }

  // R.merge is curried: `found` is the left operand, and the value that
  // `fact.map` supplies becomes the right operand.
  // NOTE(review): on key clashes R.merge prefers the right operand, so
  // existing fact values would win over scraped ones - confirm intended.
  const mergeIntoFact = R.merge(found)
  return fact.map(mergeIntoFact)
}
/*
 * Plugin descriptors exported by this module, keyed by plugin name.
 *
 * Each descriptor carries:
 *   - type:    plugin category string read by the host
 *   - sanity:  JSON-schema describing the accepted configuration
 *   - limits:  settings for the plugin's outgoing requests
 *              (NOTE(review): units and exact meaning are defined by the
 *              host, not visible in this file)
 *   - act:     the function implementing the plugin
 *
 * NOTE(review): `patternRequired` is presumably the ajv-keywords keyword,
 * which takes an ARRAY of regex strings. Wrapped in `not`, it would reject a
 * `fields` object containing a key matching `system:.*` - confirm against
 * the host's validator.
 */
const plugins = {
  scraper: {
    type: 'input',
    requires: ['schedule'],
    sanity: {
      type: 'object',
      required: ['start', 'items', 'fields'],
      properties: {
        start: { type: ['string', 'array'] },
        nextPage: { type: 'string' },
        limit: { type: 'number' },
        items: { type: 'string' },
        fields: {
          type: 'object',
          not: { patternRequired: ['system:.*'] }
        }
      }
    },
    limits: {
      upstream: 'http',
      concurrency: 4,
      delay: 1,
      timeout: 60
    },
    act: action
  },
  scraperLookup: {
    type: 'mutator',
    sanity: {
      type: 'object',
      required: ['start', 'items', 'fields'],
      properties: {
        start: { type: ['string', 'array'] },
        nextPage: { type: 'string' },
        limit: { type: 'number' },
        items: { type: 'string' },
        fields: {
          type: 'object',
          // Array form, matching the `scraper` schema above (this was a
          // bare string before).
          not: { patternRequired: ['system:.*'] }
        }
      }
    },
    limits: {
      upstream: 'http',
      concurrency: 4,
      delay: 1,
      timeout: 60
    },
    act: lookupAction
  }
}
module.exports = plugins