UNPKG

unionpedia-extractor

Version:

Extract incoming and outgoing relations for a concept from unionpedia.org

233 lines (232 loc) 7.76 kB
"use strict";

const cheerio = require("cheerio");
const fr = require("follow-redirects");
const LRU = require("lru-cache");

/**
 * Bundler interop shim: builds a frozen, null-prototype namespace object that
 * re-exposes every enumerable key of `e` (except "default") through getters
 * and stores `e` itself under `default`.
 *
 * @param {object} e - The CommonJS module object to wrap.
 * @returns {object} The frozen namespace object.
 */
function _interopNamespaceDefault(e) {
  const n = Object.create(null, { [Symbol.toStringTag]: { value: "Module" } });
  if (e) {
    for (const k in e) {
      if (k !== "default") {
        const d = Object.getOwnPropertyDescriptor(e, k);
        Object.defineProperty(n, k, d.get ? d : { enumerable: true, get: () => e[k] });
      }
    }
  }
  n.default = e;
  return Object.freeze(n);
}

const cheerio__namespace = /* @__PURE__ */ _interopNamespaceDefault(cheerio);
const { https } = fr;

/** Base URL used when the constructor receives no (or an empty) `url`. */
const DEFAULT_BASE = "https://en.unionpedia.org/";

// Rejection reasons are deliberately plain strings (not Error objects):
// existing callers may compare them directly, so the type is kept as-is.
const ERR_UNKNOWN_CONCEPT = "Unknown concept";
const ERR_NOT_A_STRING = "Concept not a string";
const ERR_EMPTY = "Concept is empty";
const ERR_NO_INCOMING_PAGE = "No incoming relations page";

/**
 * Read a live entry from an LRU cache.
 *
 * @param {object} cache - An `lru-cache` instance.
 * @param {string} key - Cache key.
 * @returns {*} The cached value, or `undefined` when there is no usable entry.
 */
function readCache(cache, key) {
  // `has` first (as the original code did), then tolerate the entry
  // disappearing between `has` and `get`.
  return cache.has(key) ? cache.get(key) : undefined;
}

/**
 * Download `url` over HTTPS (via follow-redirects) and return the body text.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} Resolves with the response body once the response
 *   stream closes; rejects on a request error or a response-stream error.
 */
function fetchText(url) {
  return new Promise((resolve, reject) => {
    https
      .get(url, (res) => {
        let body = "";
        // Decode as UTF-8 through the stream so multi-byte characters that
        // straddle two chunks are not corrupted by per-chunk conversion.
        res.setEncoding("utf8");
        res.on("data", (chunk) => {
          body += chunk;
        });
        res.on("error", reject);
        res.on("close", () => {
          resolve(body);
        });
      })
      .on("error", reject);
  });
}

/**
 * Return the parsed page stored under `key`, downloading and caching it first
 * if necessary. Nothing is cached when the download fails.
 *
 * @param {object} cache - The HTML cache (`lru-cache` instance).
 * @param {string} key - Cache key for the page.
 * @param {string} url - URL to download on a cache miss.
 * @returns {Promise<Function>} The cheerio root function for the page; its
 *   `exists` property is set to `false` when the page has no `.page-title`.
 */
async function loadPage(cache, key, url) {
  const cached = readCache(cache, key);
  if (cached !== undefined) return cached;

  const html = await fetchText(url);
  const $ = cheerio__namespace.load(html);
  if ($(".page-title").length === 0) {
    $.exists = false;
  }
  cache.set(key, $);
  return $;
}

/**
 * @typedef {object} Relation
 * @property {string} title - Text of the relation's heading link.
 * @property {string} description - Leading text of the relation's description.
 * @property {string} href - Link target, rewritten against the base URL.
 */

/**
 * Extracts concept descriptions and incoming/outgoing relations from
 * Unionpedia pages, with LRU caching of both raw pages and parsed results.
 */
class Unionpedia {
  /**
   * @param {string} [url] - Base URL; a trailing "/" is appended if missing.
   *   Falsy values fall back to the default Unionpedia URL.
   * @param {object} [cacheOptions] - `lru-cache` options used for every cache
   *   that has no dedicated options (default: 1000 entries, 24 h TTL).
   * @param {object} [objectCacheOptions] - Options for the concept-object cache.
   * @param {object} [outgoingRelationsCacheOptions] - Options for the outgoing-relations cache.
   * @param {object} [incomingRelationsCacheOptions] - Options for the incoming-relations cache.
   * @param {object} [htmlCacheOptions] - Options for the parsed-HTML cache.
   */
  constructor(
    url,
    cacheOptions = { max: 1e3, ttl: 1e3 * 60 * 60 * 24 },
    objectCacheOptions,
    outgoingRelationsCacheOptions,
    incomingRelationsCacheOptions,
    htmlCacheOptions
  ) {
    const base = url || DEFAULT_BASE;
    this.BASE = base.endsWith("/") ? base : `${base}/`;
    // Cache for the concept object
    this.CACHE = new LRU(objectCacheOptions || cacheOptions);
    // Cache for the outgoing edges
    this.CACHE_OUTGOING = new LRU(outgoingRelationsCacheOptions || cacheOptions);
    // Cache for the incoming edges
    this.CACHE_INCOMING = new LRU(incomingRelationsCacheOptions || cacheOptions);
    // Cache for the fetched html sites
    this.CACHE_HTML = new LRU(htmlCacheOptions || cacheOptions);
  }

  /**
   * Read the concept description from the paragraph following `.page-title`.
   *
   * @param {Function} $ - Cheerio root function of a concept page.
   * @returns {string} The paragraph's leading text with every `[n]` marker
   *   (and one preceding space) removed, or "" when it is unavailable.
   */
  getConceptDescription($) {
    const text = $(".page-title + p").get(0)?.children?.[0]?.data;
    if (typeof text !== "string") return "";
    return text.replace(/ ?\[[0-9]*]/g, "");
  }

  /**
   * Read the concept title from the `h1` inside `.page-title`.
   *
   * @param {Function} $ - Cheerio root function of a concept page.
   * @returns {string} The heading's leading text, or "" when it is unavailable.
   */
  getConceptTitle($) {
    const text = $(".page-title h1").get(0)?.children?.[0]?.data;
    return typeof text === "string" ? text : "";
  }

  /**
   * Collect the relations listed on a page. Each `.rel-desc` element directly
   * following an `h2` is paired with the first link inside that same `h2`, so
   * a heading without a link (or without a description) cannot shift the
   * pairing of the other entries.
   *
   * @param {Function} $ - Cheerio root function of a relations page.
   * @returns {Relation[]} One entry per description that has a linked heading.
   */
  getLinks($) {
    const results = [];
    $("h2 + .rel-desc").each((_, desc) => {
      const anchor = $(desc).prev("h2").children("a").get(0);
      const href = anchor?.attribs?.href;
      if (typeof href !== "string") return; // no usable link for this entry
      results.push({
        title: anchor.children?.[0]?.data ?? "",
        description: desc.children?.[0]?.data ?? "",
        href: href.replaceAll(/\.\/(i\/)?(.+)/g, `${this.BASE}$2`),
      });
    });
    return results;
  }

  /**
   * Fetch (or read from cache) and parse the page for a concept.
   *
   * @param {string} concept - Concept name; spaces are replaced by "_" in the URL.
   * @param {boolean} [outgoing=true] - `true` for the concept's own page,
   *   `false` for the page behind the second `#tabs-bar` link.
   * @returns {Promise<Function>} Cheerio root function; `exists === false`
   *   marks a page without `.page-title`. Rejects with "Unknown concept" when
   *   the incoming page is requested and the concept page has no tab links,
   *   with "No incoming relations page" when the second tab link is missing,
   *   or with the underlying error when a download fails.
   */
  async fetchAndParseHTML(concept, outgoing = true) {
    const conceptUrl = this.BASE + concept.replaceAll(" ", "_");
    if (outgoing) {
      return loadPage(this.CACHE_HTML, concept, conceptUrl);
    }

    const incomingKey = `${concept}/incoming`;
    const cachedIncoming = readCache(this.CACHE_HTML, incomingKey);
    if (cachedIncoming !== undefined) return cachedIncoming;

    // The incoming page's URL is only discoverable from the concept page.
    const $ = await loadPage(this.CACHE_HTML, concept, conceptUrl);
    const tabLinks = $("#tabs-bar a");
    if (tabLinks.length === 0) return Promise.reject(ERR_UNKNOWN_CONCEPT);

    const incomingHref = tabLinks.get(1)?.attribs?.href;
    if (typeof incomingHref !== "string") {
      return Promise.reject(ERR_NO_INCOMING_PAGE);
    }
    return loadPage(this.CACHE_HTML, incomingKey, incomingHref.replace("./", this.BASE));
  }

  /**
   * Validate a concept argument.
   *
   * @param {*} concept - Value to validate.
   * @returns {Promise<void>} Resolved for a non-blank string; rejected with
   *   "Concept not a string" or "Concept is empty" otherwise.
   */
  checkForValidConcept(concept) {
    if (typeof concept !== "string") return Promise.reject(ERR_NOT_A_STRING);
    // Callers trim before use, so whitespace-only input is empty as well.
    if (concept.trim().length === 0) return Promise.reject(ERR_EMPTY);
    return Promise.resolve();
  }

  /**
   * Get the title, description and link of a concept.
   *
   * Unknown concepts are not stored in the result cache; the cached parsed
   * page (flagged `exists === false`) already prevents repeated downloads.
   *
   * @param {string} concept - Concept name (surrounding whitespace is ignored).
   * @returns {Promise<{href: string, title: string, description: string}>}
   *   Rejects with "Unknown concept" when the page has no `.page-title`.
   */
  async getConceptObject(concept) {
    await this.checkForValidConcept(concept);
    const key = concept.trim();

    const cached = readCache(this.CACHE, key);
    if (cached !== undefined) return cached;

    const $ = await this.fetchAndParseHTML(key);
    if ($.exists === false) return Promise.reject(ERR_UNKNOWN_CONCEPT);

    const firstTabHref = $("#tabs-bar a").get(0)?.attribs?.href;
    const object = {
      // Fall back to the requested URL when the page has no tab link.
      href:
        typeof firstTabHref === "string"
          ? firstTabHref.replace("./", this.BASE)
          : this.BASE + key.replaceAll(" ", "_"),
      title: this.getConceptTitle($),
      description: this.getConceptDescription($),
    };
    this.CACHE.set(key, object);
    return object;
  }

  /**
   * Get the relations listed on the concept's own page.
   *
   * @param {string} concept - Concept name (surrounding whitespace is ignored).
   * @returns {Promise<Relation[]>} Rejects with "Unknown concept" when the
   *   page has no `.page-title`.
   */
  async getOutgoingRelations(concept) {
    await this.checkForValidConcept(concept);
    const key = concept.trim();

    const cached = readCache(this.CACHE_OUTGOING, key);
    if (cached !== undefined) return cached;

    const $ = await this.fetchAndParseHTML(key);
    if ($.exists === false) return Promise.reject(ERR_UNKNOWN_CONCEPT);

    const relations = this.getLinks($);
    this.CACHE_OUTGOING.set(key, relations);
    return relations;
  }

  /**
   * Get the relations listed on the concept's incoming-relations page.
   *
   * Failures are never cached as an empty result, so a transient download
   * error does not hide the concept's relations on later calls.
   *
   * @param {string} concept - Concept name (surrounding whitespace is ignored).
   * @returns {Promise<Relation[]>} Rejects with the reason reported by
   *   `fetchAndParseHTML` when the incoming page cannot be obtained.
   */
  async getIncomingRelations(concept) {
    await this.checkForValidConcept(concept);
    const key = concept.trim();

    const cached = readCache(this.CACHE_INCOMING, key);
    if (cached !== undefined) return cached;

    const $ = await this.fetchAndParseHTML(key, false);
    const relations = this.getLinks($);
    this.CACHE_INCOMING.set(key, relations);
    return relations;
  }
}

module.exports = Unionpedia;