UNPKG

@sugarcube/plugin-http

Version:
196 lines (165 loc) 6.23 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.default = void 0; var _fs = _interopRequireDefault(require("fs")); var _os = _interopRequireDefault(require("os")); var _path = _interopRequireDefault(require("path")); var _util = require("util"); var _fp = require("lodash/fp"); var _dashp = _interopRequireWildcard(require("dashp")); var _core = require("@sugarcube/core"); var _utils = require("@sugarcube/utils"); var _pluginFs = require("@sugarcube/plugin-fs"); var _utils2 = require("../utils"); var _browser = _interopRequireDefault(require("../browser")); function _getRequireWildcardCache(nodeInterop) { if (typeof WeakMap !== "function") return null; var cacheBabelInterop = new WeakMap(); var cacheNodeInterop = new WeakMap(); return (_getRequireWildcardCache = function (nodeInterop) { return nodeInterop ? cacheNodeInterop : cacheBabelInterop; })(nodeInterop); } function _interopRequireWildcard(obj, nodeInterop) { if (!nodeInterop && obj && obj.__esModule) { return obj; } if (obj === null || typeof obj !== "object" && typeof obj !== "function") { return { default: obj }; } var cache = _getRequireWildcardCache(nodeInterop); if (cache && cache.has(obj)) { return cache.get(obj); } var newObj = {}; var hasPropertyDescriptor = Object.defineProperty && Object.getOwnPropertyDescriptor; for (var key in obj) { if (key !== "default" && Object.prototype.hasOwnProperty.call(obj, key)) { var desc = hasPropertyDescriptor ? Object.getOwnPropertyDescriptor(obj, key) : null; if (desc && (desc.get || desc.set)) { Object.defineProperty(newObj, key, desc); } else { newObj[key] = obj[key]; } } } newObj.default = obj; if (cache) { cache.set(obj, newObj); } return newObj; } function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; } const mkdtemp = (0, _util.promisify)(_fs.default.mkdtemp); const querySource = "http_url"; const plugin = async (envelope, { log, cfg, stats }) => { const parallel = (0, _fp.get)("http.import_parallel", cfg); const queries = _core.envelope.queriesByType(querySource, envelope); let mod; switch (parallel) { case parallel < 1 ? parallel : null: log.warn(`--http.import_parallel must be between 1 and 8. Setting to 1.`); mod = ""; break; case parallel === 1 ? parallel : null: log.info(`Run a single import at a time.`); mod = ""; break; case parallel > 8 ? parallel : null: log.warn(`--http.import_parallel must be between 1 and 8. Setting to 8.`); mod = 8; break; default: log.info(`Run ${parallel} imports concurrently.`); mod = parallel; } const mapper = _dashp.default[`flatmapP${mod}`]; const { browse, dispose } = await (0, _browser.default)(); const tmpdir = await mkdtemp(_path.default.join(_os.default.tmpdir(), "sugarcube-")); await (0, _pluginFs.mkdirP)(tmpdir); const logCounter = (0, _utils.counter)(envelope.data.length, ({ cnt, total, percent }) => log.debug(`Progress: ${cnt}/${total} units (${percent}%).`)); const decisions = (0, _core.createFeatureDecisions)(); const data = await (0, _dashp.flowP)([mapper(async url => { stats.count("total"); let unit; let media = []; let mediaType; try { mediaType = await (0, _utils2.urlContentType)(url); } catch (e) { stats.fail({ type: "http_import", term: url, reason: e.message }); return null; } try { if (mediaType === "url") { // Import URLS using the hypercube model. See the readme for a // link to referenced paper. Provide a location for a temporary // download. const target = _path.default.join(tmpdir, `${_core.crypto.uid(url)}.html`); [unit, media] = await (0, _utils2.hypercubeImport)(browse, target, url); } else { // Images, videos and documents are imported using simply Apache Tika. unit = await (0, _utils2.basicImport)(url); media.push({ type: mediaType, term: url }); } } catch (e) { stats.fail({ type: "http_import", term: url, reason: e.message }); return null; } if (unit == null) return null; log.info(`Imported url ${url} as media type "${mediaType}".`); stats.count("success"); logCounter(); // Test whether the new Ncube data format is enabled. if (decisions.canNcube()) return { _sc_id: url, _sc_entity: "website", _sc_id_fields: ["_sc_id"], _sc_media: [{ type: "url", term: url }].concat(media), _sc_queries: [{ type: querySource, term: url }], _sc_href: url, ...(0, _utils.tikaToEntity)(unit), _sc_data: { location: url, // Fields that couldn't be extracted are not added to the unit. ...Object.keys(unit).reduce((memo, key) => { if (unit[key] == null) return memo; return Object.assign(memo, { [key]: unit[key] }); }, {}) } }; // Use the old data forma. return { _sc_id_fields: ["location"], _sc_media: [{ type: "url", term: url }].concat(media), _sc_queries: [{ type: querySource, term: url }], _sc_href: url, ...(0, _utils.tikaToEntity)(unit), location: url, // Fields that couldn't be extracted are not added to the unit. ...Object.keys(unit).reduce((memo, key) => { if (unit[key] == null) return memo; return Object.assign(memo, { [key]: unit[key] }); }, {}) }; }), async rs => { if (tmpdir != null) await (0, _pluginFs.cleanUp)(tmpdir); if (dispose != null) await dispose(); return rs.filter(r => r !== null); }], queries); return _core.envelope.concatData(data, envelope); }; plugin.argv = { "http.import_parallel": { type: "number", nargs: 1, desc: "The number of parallel HTTP imports. Can be between 1 and 8.", default: 1 } }; plugin.desc = "Import HTTP URI's as Sugarcube units."; var _default = plugin; exports.default = _default;