/*
 * unionpedia-extractor
 * Version:
 * Extract incoming and outgoing relations for a concept from unionpedia.org
 * 217 lines (216 loc) • 7.22 kB
 * JavaScript
 */
// Local alias for Object.defineProperty, used by the helpers below.
const __defProp = Object.defineProperty;

/**
 * Sets `obj[key]` to `value`.
 *
 * When `key` is already reachable on `obj` (own or inherited), the property is
 * (re)defined directly on `obj` as an enumerable, configurable, writable data
 * property, so an inherited accessor is not invoked. Otherwise a plain
 * assignment is used.
 *
 * @returns {*} `obj` when the property was defined, `value` when it was assigned.
 */
function __defNormalProp(obj, key, value) {
  if (key in obj) {
    return __defProp(obj, key, {
      enumerable: true,
      configurable: true,
      writable: true,
      value
    });
  }
  obj[key] = value;
  return value;
}

/**
 * Installs a public field on `obj` and hands the value back.
 *
 * Symbol keys are used as-is; every other key is coerced to a string first.
 *
 * @returns {*} The `value` that was stored.
 */
function __publicField(obj, key, value) {
  const propertyKey = typeof key === "symbol" ? key : key + "";
  __defNormalProp(obj, propertyKey, value);
  return value;
}
/**
 * Drives a generator function as if it were an async function.
 *
 * The generator is started with `__this` as its receiver and `__arguments` as
 * its argument list. Every yielded value is wrapped with `Promise.resolve`;
 * its fulfilment value is sent back in through `next`, its rejection reason is
 * thrown back in through `throw`. The returned promise resolves with the
 * generator's return value and rejects with anything the generator (or the
 * stepping itself) throws.
 *
 * @param {*} __this - Receiver for the generator function.
 * @param {ArrayLike<*>|null|undefined} __arguments - Arguments for the generator function.
 * @param {GeneratorFunction} generator - Generator function to run.
 * @returns {Promise<*>}
 */
function __async(__this, __arguments, generator) {
  return new Promise((resolve, reject) => {
    let iterator;

    // Consumes one iterator result: finish on `done`, otherwise wait for the
    // yielded value and resume the generator accordingly.
    const settle = (result) => {
      if (result.done) {
        return resolve(result.value);
      }
      return Promise.resolve(result.value).then(resumeWithValue, resumeWithError);
    };

    function resumeWithValue(value) {
      try {
        settle(iterator.next(value));
      } catch (err) {
        reject(err);
      }
    }

    function resumeWithError(reason) {
      try {
        settle(iterator.throw(reason));
      } catch (err) {
        reject(err);
      }
    }

    // A throw while starting the generator or taking its first step escapes
    // into the Promise executor, which turns it into a rejection.
    iterator = generator.apply(__this, __arguments);
    settle(iterator.next());
  });
}
import * as cheerio from "cheerio";
import fr from "follow-redirects";
import LRU from "lru-cache";
// Take the `https` member of the follow-redirects default export; the
// `https.get` calls further down go through this object.
const { https } = fr;
/**
 * Scrapes concept pages from a Unionpedia site and exposes the concept's
 * title/description plus its outgoing and incoming relations.
 *
 * Four caches are kept: parsed concept objects, outgoing relations, incoming
 * relations and the parsed HTML pages themselves. Only successful results are
 * stored in the first three; "unknown concept" is remembered solely through
 * the HTML cache (a parsed page carrying `exists === false`), so the result
 * caches never depend on an HTML entry still being present.
 *
 * Rejections for validation and unknown concepts use plain strings
 * ("Concept not a string", "Concept is empty", "Unknown concept") because
 * callers may compare against those exact values.
 */
class Unionpedia {
  /**
   * @param {string} [url] - Base URL of the site; falls back to
   *   "https://en.unionpedia.org/". A trailing slash is appended when missing.
   * @param {object} [cacheOptions] - Options passed to every cache that has no
   *   dedicated options. Defaults to `max: 1e3` and `ttl: 1e3 * 60 * 60 * 24`
   *   (24 hours if the cache reads ttl in milliseconds — see lru-cache docs).
   * @param {object} [objectCacheOptions] - Options for the concept-object cache.
   * @param {object} [outgoingRelationsCacheOptions] - Options for the outgoing-relations cache.
   * @param {object} [incomingRelationsCacheOptions] - Options for the incoming-relations cache.
   * @param {object} [htmlCacheOptions] - Options for the parsed-HTML cache.
   */
  constructor(url, cacheOptions = {
    max: 1e3,
    ttl: 1e3 * 60 * 60 * 24
  }, objectCacheOptions, outgoingRelationsCacheOptions, incomingRelationsCacheOptions, htmlCacheOptions) {
    const base = url || "https://en.unionpedia.org/";
    this.BASE = base.endsWith("/") ? base : `${base}/`;
    // Cache for the concept object
    this.CACHE = new LRU(objectCacheOptions || cacheOptions);
    // Cache for the outgoing edges
    this.CACHE_OUTGOING = new LRU(outgoingRelationsCacheOptions || cacheOptions);
    // Cache for the incoming edges
    this.CACHE_INCOMING = new LRU(incomingRelationsCacheOptions || cacheOptions);
    // Cache for the fetched html sites
    this.CACHE_HTML = new LRU(htmlCacheOptions || cacheOptions);
  }

  /**
   * Reads the concept description from the paragraph that directly follows
   * the `.page-title` element.
   *
   * Only the paragraph's first child node is read. Every bracketed numeric
   * marker such as " [1]" or "[12]" is removed, together with one preceding
   * space if present.
   *
   * @param {Function} $ - Parsed page (selector function).
   * @returns {string} The description, or "" when there is none.
   */
  getConceptDescription($) {
    const paragraphs = $(".page-title + p");
    if (paragraphs.length === 0) {
      return "";
    }
    const text = paragraphs[0].children?.[0]?.data ?? "";
    return text.replace(/ ?\[[0-9]*\]/g, "");
  }

  /**
   * Reads the concept title from the first `.page-title h1` element.
   *
   * @param {Function} $ - Parsed page (selector function).
   * @returns {string} The title, or "" when the heading is missing or has no
   *   leading text node.
   */
  getConceptTitle($) {
    const headings = $(".page-title h1");
    if (headings.length === 0) {
      return "";
    }
    return headings[0].children?.[0]?.data ?? "";
  }

  /**
   * Extracts the relation entries of a page.
   *
   * Each `.rel-desc` element that directly follows an `h2` is paired with the
   * first `a` that is a direct child of that same `h2`. Pairing through the
   * DOM (instead of by index across two separate selections) keeps titles and
   * descriptions aligned even when the page has headings with a link but no
   * description, or a description whose heading has no link; such incomplete
   * entries are skipped.
   *
   * @param {Function} $ - Parsed page (selector function).
   * @returns {{title: string, description: string, href: string}[]}
   */
  getLinks($) {
    const results = [];
    for (const desc of Array.from($("h2 + .rel-desc"))) {
      // Step back over text/comment nodes to the preceding element (the h2).
      let heading = desc.prev;
      while (heading && heading.type !== "tag") {
        heading = heading.prev;
      }
      const anchor = heading?.children?.find((child) => child.name === "a");
      if (!anchor) {
        continue;
      }
      const href = anchor.attribs?.href ?? "";
      results.push({
        title: anchor.children?.[0]?.data ?? "",
        description: desc.children?.[0]?.data ?? "",
        // "./Foo" and "./i/Foo" both become "<BASE>Foo".
        href: href.replaceAll(/\.\/(i\/)?(.+)/g, `${this.BASE}$2`)
      });
    }
    return results;
  }

  /**
   * Downloads `url` and resolves with the response body as a string.
   *
   * The body is decoded as UTF-8 by the stream, so multi-byte characters that
   * straddle chunk boundaries are not corrupted.
   *
   * @param {string} url - Absolute URL to fetch.
   * @returns {Promise<string>} Rejects on request/response errors or when the
   *   response closes before the body is complete.
   */
  fetchHTML(url) {
    return new Promise((resolve, reject) => {
      https.get(url, (res) => {
        let body = "";
        res.setEncoding("utf8");
        res.on("data", (chunk) => {
          body += chunk;
        });
        res.on("end", () => {
          resolve(body);
        });
        res.on("error", reject);
        res.on("close", () => {
          // No-op after a normal "end"; settles the promise if the
          // connection dropped mid-body.
          if (!res.complete) {
            reject(new Error(`Response for ${url} closed before completion`));
          }
        });
      }).on("error", reject);
    });
  }

  /**
   * Returns the parsed page stored under `cacheKey`, downloading and parsing
   * `url` on a cache miss. A page without a `.page-title` element is marked
   * with `exists = false` before it is cached.
   *
   * @param {string} url - Absolute URL of the page.
   * @param {string} cacheKey - Key used in the HTML cache.
   * @returns {Promise<Function>} The parsed page (selector function).
   */
  async loadPage(url, cacheKey) {
    const cached = this.CACHE_HTML.get(cacheKey);
    if (cached !== undefined) {
      return cached;
    }
    const $ = cheerio.load(await this.fetchHTML(url));
    if ($(".page-title").length === 0) {
      $.exists = false;
    }
    this.CACHE_HTML.set(cacheKey, $);
    return $;
  }

  /**
   * Fetches and parses the page for `concept`.
   *
   * With `outgoing === true` this is the concept page itself (cached under
   * `concept`). Otherwise the concept page is loaded first, the second link of
   * `#tabs-bar` is followed, and that page is returned (cached under
   * `"<concept>/incoming"`).
   *
   * @param {string} concept - Concept name; spaces are replaced by "_" in the URL.
   * @param {boolean} [outgoing=true] - Whether to load the outgoing (concept) page.
   * @returns {Promise<Function>} The parsed page. Rejects with "Unknown concept"
   *   when the incoming page is requested and the concept page has no tab
   *   links, with an Error when the tab bar lacks a second link, and with the
   *   underlying error on network failures.
   */
  async fetchAndParseHTML(concept, outgoing = true) {
    const conceptUrl = this.BASE + concept.replaceAll(" ", "_");
    if (outgoing) {
      return this.loadPage(conceptUrl, concept);
    }

    const incomingKey = `${concept}/incoming`;
    const cachedIncoming = this.CACHE_HTML.get(incomingKey);
    if (cachedIncoming !== undefined) {
      return cachedIncoming;
    }

    const $ = await this.loadPage(conceptUrl, concept);
    const tabs = $("#tabs-bar a");
    if (tabs.length === 0) {
      return Promise.reject("Unknown concept");
    }
    const incomingHref = tabs[1]?.attribs?.href;
    if (typeof incomingHref !== "string") {
      throw new Error(`No incoming-relations link found for "${concept}"`);
    }
    return this.loadPage(incomingHref.replace("./", this.BASE), incomingKey);
  }

  /**
   * Validates a concept name.
   *
   * @param {*} concept - Value to validate.
   * @returns {Promise<void>} Resolves for a string with at least one
   *   non-whitespace character; rejects with "Concept not a string" or
   *   "Concept is empty" otherwise.
   */
  checkForValidConcept(concept) {
    if (typeof concept !== "string") {
      return Promise.reject("Concept not a string");
    }
    // Callers trim before use, so a whitespace-only name is empty as well.
    if (concept.trim().length === 0) {
      return Promise.reject("Concept is empty");
    }
    return Promise.resolve();
  }

  /**
   * Resolves with `{ href, title, description }` for `concept`.
   *
   * `href` is the first `#tabs-bar` link with "./" replaced by the base URL.
   *
   * @param {string} concept - Concept name (surrounding whitespace is ignored).
   * @returns {Promise<{href: string, title: string, description: string}>}
   *   Rejects with "Unknown concept" when the page has no `.page-title`.
   */
  async getConceptObject(concept) {
    await this.checkForValidConcept(concept);
    const key = concept.trim();

    const cached = this.CACHE.get(key);
    if (cached !== undefined) {
      return cached;
    }

    const $ = await this.fetchAndParseHTML(key);
    if ($.exists === false) {
      return Promise.reject("Unknown concept");
    }
    const [link] = $("#tabs-bar a");
    const object = {
      href: link.attribs.href.replace("./", this.BASE),
      title: this.getConceptTitle($),
      description: this.getConceptDescription($)
    };
    this.CACHE.set(key, object);
    return object;
  }

  /**
   * Resolves with the relations listed on the concept's own page.
   *
   * @param {string} concept - Concept name (surrounding whitespace is ignored).
   * @returns {Promise<{title: string, description: string, href: string}[]>}
   *   Rejects with "Unknown concept" when the page has no `.page-title`.
   */
  async getOutgoingRelations(concept) {
    await this.checkForValidConcept(concept);
    const key = concept.trim();

    const cached = this.CACHE_OUTGOING.get(key);
    if (cached !== undefined) {
      return cached;
    }

    const $ = await this.fetchAndParseHTML(key);
    if ($.exists === false) {
      return Promise.reject("Unknown concept");
    }
    const relations = this.getLinks($);
    this.CACHE_OUTGOING.set(key, relations);
    return relations;
  }

  /**
   * Resolves with the relations listed on the concept's incoming page.
   *
   * Failures are not cached, so a transient network error does not turn into
   * a permanently empty result.
   *
   * @param {string} concept - Concept name (surrounding whitespace is ignored).
   * @returns {Promise<{title: string, description: string, href: string}[]>}
   *   Rejects with whatever `fetchAndParseHTML(concept, false)` rejects with.
   */
  async getIncomingRelations(concept) {
    await this.checkForValidConcept(concept);
    const key = concept.trim();

    const cached = this.CACHE_INCOMING.get(key);
    if (cached !== undefined) {
      return cached;
    }

    const $ = await this.fetchAndParseHTML(key, false);
    const relations = this.getLinks($);
    this.CACHE_INCOMING.set(key, relations);
    return relations;
  }
}
export {
Unionpedia as default
};