@rundexter/dexter-interactive
Version:
Facilitate interactivity between a webpage and a Dexter chatbot
174 lines (167 loc) • 5.97 kB
JavaScript
const crypto = require('crypto')
, fs = require('fs')
, util = require('util')
, axios = require('axios')
, JSDom = require('jsdom')
, CACHE_PATH = `${__dirname}/cache`
;
/**
 * Compute the SHA-1 digest of a string, hex-encoded.
 * In this file it is used to turn a URL into a cache file name.
 *
 * @param {string} s - Text to hash
 * @return {string} Hex-encoded SHA-1 digest of `s`
 */
function quickHash(s) {
  return crypto
    .createHash('sha1')
    .update(s)
    .digest('hex');
}
/**
 * Create the cache directory (CACHE_PATH) if it is not there yet.
 * A path that already exists is not treated as an error; any other
 * failure reported by mkdirSync is rethrown to the caller.
 */
function initCache() {
  try {
    fs.mkdirSync(CACHE_PATH);
  } catch (err) {
    if (err.code === 'EEXIST') {
      // Already present: nothing to do.
      return;
    }
    throw err;
  }
}
/**
 * GET a URL, write the response body to the cache, then hand the body back
 * after a short pause.
 * Side effect: the body is written synchronously to ./cache/$url_hash
 * (pretty-printed JSON when `type` is 'json', the raw body otherwise).
 *
 * @param {string} url - What to fetch
 * @param {string} type - Either string or json
 * @return {Promise<*>} Resolves with the response body (`res.data`) two
 *   seconds after the cache file has been written.
 */
function fetchAndCache(url, type) {
  return axios.get(url).then(({data}) => {
    const target = `${CACHE_PATH}/${quickHash(url)}`;
    let serialized = data;
    if (type === 'json') {
      serialized = JSON.stringify(data, null, 2);
    }
    fs.writeFileSync(target, serialized);
    // Be kind to the API: hold the result back for a couple of seconds
    // so that sequential callers are naturally rate-limited.
    return new Promise((resolve) => {
      console.info(`Sleeping for API, fetched ${url}`);
      setTimeout(resolve, 2000, data);
    });
  });
}
/**
 * Get data, first from the cache if we have it, then from the web if we don't.
 * We do this so we don't spam the page unnecessarily during development.
 *
 * The cache file is read directly rather than probed with existsSync first:
 * that avoids a blocking stat call and the window in which the file could
 * disappear between the check and the read. Only a missing file (ENOENT)
 * triggers a network fetch; any other read error, and any JSON parse error
 * on a cached 'json' entry, rejects the returned promise.
 *
 * @param {string} url - Where to get the data
 * @param {string} type - Either string or json
 * @return {Promise<*>} The cached or freshly fetched data: the raw text for
 *   cached non-json entries, the parsed value for 'json', or whatever
 *   fetchAndCache resolves with on a cache miss.
 */
function getContents(url, type) {
  const path = `${CACHE_PATH}/${quickHash(url)}`;
  const readFile = util.promisify(fs.readFile);

  // Two-argument then(): the rejection handler only sees read failures, so a
  // corrupt cached JSON file surfaces as an error instead of a silent refetch.
  return readFile(path, {encoding: 'utf8', flag: 'r'}).then(
    (cached) => (type === 'json' ? JSON.parse(cached) : cached),
    (err) => {
      if (err.code !== 'ENOENT') {
        throw err;
      }
      return fetchAndCache(url, type);
    }
  );
}
/**
 * Convenience wrapper around document.getElementsByClassName for a JSDOM instance.
 *
 * @param {JSDom.JSDOM} dom - Document model
 * @param {string} className - Class name to look up
 * @return {HTMLCollection} Whatever getElementsByClassName returns: an
 *   array-like collection, which callers in this file convert with Array.from.
 */
function getByClass(dom, className) {
  const doc = dom.window.document;
  return doc.getElementsByClassName(className);
}
/**
 * Drop a trailing " (...)" detail from a heading and trim whitespace,
 * e.g. "Category (details)" -> "Category".
 *
 * @param {string} s - Raw heading text
 * @return {string} Heading without the parenthetical
 */
function stripDetails(s) {
  return s.indexOf('(') > 0
    ? s.replace(/(.*?) \(.*/, '$1').trim()
    : s.trim();
}

/**
 * Derive the page title from an article link.
 * For "/wiki/<title>" links everything after the prefix is kept, so titles
 * that themselves contain "/" are not truncated to their last segment.
 * Any other link shape falls back to "whatever follows the last slash".
 *
 * @param {string} href - The anchor's href attribute
 * @return {string} Page title as it appears in the link (still URL-escaped)
 */
function pageFromHref(href) {
  const wikiPrefix = '/wiki/';
  if (href.indexOf(wikiPrefix) === 0) {
    return href.slice(wikiPrefix.length);
  }
  return href.replace(/\/.*\/(.*)/, '$1');
}

/**
 * Parse the Vital Articles HTML into {level1: {level2: [{page}, ...]}}.
 *
 * @param {string} html - Markup of the Vital Articles page
 * @return {Object} Category tree whose leaves are arrays of {page} objects
 */
function buildTree(html) {
  const dom = new JSDom.JSDOM(html);
  const tree = {};

  // Each top-level category is expected to live in an element with class "multicol"
  Array.from(getByClass(dom, 'multicol')).forEach((container) => {
    // The container's parent is expected to be preceded by the heading that
    // names the level-1 category, formatted like "$Category (...)".
    // A missing heading means the page layout changed, so this is left to throw.
    const level1 = stripDetails(container.parentElement.previousElementSibling.textContent);
    // Flatten the direct children of every column cell into one ordered list
    const nodes = Array.from(container.getElementsByTagName('td'))
      .reduce((gathered, cell) => gathered.concat(Array.from(cell.children)), []);

    tree[level1] = {};
    let currLevel2 = '';
    nodes.forEach((node) => {
      switch (node.nodeName) {
        case 'H3':
          currLevel2 = stripDetails(node.textContent);
          tree[level1][currLevel2] = [];
          break;
        case 'UL':
          // A list that shows up before any H3 has no bucket yet; create one
          // (under the current, possibly empty, name) instead of crashing.
          if (!Array.isArray(tree[level1][currLevel2])) {
            tree[level1][currLevel2] = [];
          }
          Array.from(node.getElementsByTagName('li')).forEach((item) => {
            const anchor = item.getElementsByTagName('a')[0];
            const href = anchor ? anchor.getAttribute('href') : null;
            if (!href) {
              // List items without a link carry no article to look up
              return;
            }
            tree[level1][currLevel2].push({page: pageFromHref(href)});
          });
          break;
        default:
          break;
      }
    });
  });
  return tree;
}

/**
 * Fill in title, image, safe_image, url and summary on one article using
 * the Wikipedia REST summary endpoint.
 *
 * @param {Object} article - Tree leaf with a `page` property; mutated in place
 * @return {Promise} Settles once the article has been updated
 */
function addSummary(article) {
  // The title is a single path segment, so any "/" inside it must be escaped
  const title = article.page.replace(/\//g, '%2F');
  return getContents('https://en.wikipedia.org/api/rest_v1/page/summary/' + title, 'json')
    .then((data) => {
      article.title = data.title;
      if (data.originalimage) {
        // The "replace" is to fix a weird escaping issue. It's the only one, so it's a quick hack,
        // but there's likely a smarter way to do it.
        article.image = data.originalimage.source.replace(/%27/g, "'");
        // Wikimedia's recommended permalink strategy: redirect by file name.
        article.safe_image = [
          'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file',
          article.image.replace(/.*?\/([^\/]+)$/, '$1'),
        ].join('/');
      } else {
        article.image = '';
        article.safe_image = '';
      }
      article.url = data.content_urls.desktop.page;
      article.summary = data.extract;
    });
}

/**
 * Look up every article in the tree, strictly one after another so that
 * uncached lookups stay spaced out by the delay in fetchAndCache.
 *
 * @param {Object} tree - Output of buildTree; its articles are mutated in place
 * @return {Promise<Object>} Resolves with the same tree once all lookups finish
 */
function enrichTree(tree) {
  // Start from a settled promise and append one link per article
  let chain = Promise.resolve();
  Object.keys(tree).forEach((level1) => {
    Object.keys(tree[level1]).forEach((level2) => {
      tree[level1][level2].forEach((article) => {
        chain = chain.then(() => addSummary(article));
      });
    });
  });
  return chain.then(() => tree);
}

initCache();
getContents('https://en.wikipedia.org/wiki/Wikipedia:Vital_articles', 'string')
  .then(buildTree)
  .then(enrichTree)
  .then((complete) => console.log(JSON.stringify(complete, null, 2)))
  .catch((err) => {
    // stdout carries the JSON result, so report failures on stderr and
    // signal them through the exit code.
    console.error('Whoops', err);
    process.exitCode = 1;
  });