@fcc-cdc/it-events
Version:
IT Events Crawler of China
99 lines (98 loc) • 6.15 kB
JavaScript
;
/**
 * Marker wrapper used by the down-levelled async generator helpers in this file.
 *
 * Works both as a plain call and with `new`: either way the result is an
 * `__await` instance whose `v` property holds the wrapped value.
 * `__asyncGenerator` recognises these instances with `instanceof __await`.
 *
 * @param {*} v - The value to wrap.
 * @returns {__await} An instance carrying `v` on its `v` property.
 */
var __await = (this && this.__await) || function (v) {
    // Plain call (no `new`): re-enter as a constructor so `instanceof` checks hold.
    if (!(this instanceof __await)) {
        return new __await(v);
    }
    this.v = v;
    return this;
};
/**
 * Returns an async iterator for `o`.
 *
 * - If `o` has a `Symbol.asyncIterator` method, the result of calling it is returned as is.
 * - Otherwise a synchronous iterator is obtained (through a `__values` helper when one
 *   is in scope, else through `o[Symbol.iterator]()`) and wrapped so that every method
 *   it exposes returns a Promise of `{ value, done }`, with `value` resolved first.
 *
 * Methods the synchronous iterator lacks are set to the same falsy value on the wrapper.
 *
 * @param {*} o - An async iterable or a sync iterable.
 * @returns {*} An object with `next` / `throw` / `return` and `Symbol.asyncIterator`.
 * @throws {TypeError} If the runtime has no `Symbol.asyncIterator`.
 */
var __asyncValues = (this && this.__asyncValues) || function (o) {
    if (!Symbol.asyncIterator) {
        throw new TypeError("Symbol.asyncIterator is not defined.");
    }
    var nativeFactory = o[Symbol.asyncIterator];
    if (nativeFactory) {
        return nativeFactory.call(o);
    }
    // From here on `o` is the synchronous iterator being adapted.
    o = typeof __values === "function" ? __values(o) : o[Symbol.iterator]();
    var i = {};
    ["next", "throw", "return"].forEach(adapt);
    i[Symbol.asyncIterator] = function () { return this; };
    return i;

    // Installs the promise-returning counterpart of the sync method `n` on the wrapper.
    function adapt(n) {
        i[n] = o[n] && function (v) {
            return new Promise(function (resolve, reject) {
                var result = o[n](v);
                // Capture `done` before `value`, and both before anything asynchronous happens.
                var done = result.done;
                Promise.resolve(result.value).then(function (value) {
                    resolve({ value: value, done: done });
                }, reject);
            });
        };
    }
};
/**
 * Wraps the iterator `o` in a synchronous iterator for use with `yield*` inside the
 * down-levelled async generators of this file.
 *
 * A single toggle is shared by all wrapped methods. On every other call a wrapped
 * method forwards to `o` and returns `{ value: __await(result), done: false }`; on the
 * calls in between it hands its argument straight back (or, for `throw`, rethrows it).
 * When `o` lacks a method, the wrapper gets the fallback instead: the rethrowing
 * function for `throw`, `undefined` for `next` and `return`.
 *
 * @param {*} o - The iterator being delegated to.
 * @returns {*} The delegating iterator; its `Symbol.iterator` method returns itself.
 */
var __asyncDelegator = (this && this.__asyncDelegator) || function (o) {
    var i = {};
    var p; // flips on every wrapped call: truthy => forward to `o`, falsy => pass through

    function verb(n, f) {
        if (!o[n]) {
            i[n] = f;
            return;
        }
        i[n] = function (v) {
            p = !p;
            if (p) {
                return { value: __await(o[n](v)), done: false };
            }
            return f ? f(v) : v;
        };
    }

    verb("next");
    verb("throw", function (e) { throw e; });
    verb("return");
    i[Symbol.iterator] = function () { return this; };
    return i;
};
/**
 * Drives a plain generator as if it were an async generator.
 *
 * `generator` is invoked with `thisArg` and `_arguments`. Values it yields wrapped in
 * `__await` are awaited and fed back in (a rejection is thrown back in); any other
 * yielded or returned result settles the oldest pending request. Requests made through
 * `next` / `throw` / `return` are queued and served one at a time, in order.
 *
 * @param {*} thisArg - `this` value for the generator function.
 * @param {ArrayLike<*>} [_arguments] - Arguments for the generator function (defaults to none).
 * @param {Function} generator - The generator function to run.
 * @returns {*} An object exposing promise-returning `next` / `throw` / `return`
 *   (for each method the underlying generator has) and `Symbol.asyncIterator`.
 * @throws {TypeError} If the runtime has no `Symbol.asyncIterator`.
 */
var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) {
    if (!Symbol.asyncIterator) {
        throw new TypeError("Symbol.asyncIterator is not defined.");
    }
    var g = generator.apply(thisArg, _arguments || []);
    var q = []; // pending requests: [method, argument, resolve, reject]
    var i = {};
    verb("next");
    verb("throw");
    verb("return");
    i[Symbol.asyncIterator] = function () { return this; };
    return i;

    // Exposes method `n` on the wrapper when the underlying generator has it.
    function verb(n) {
        if (!g[n]) {
            return;
        }
        i[n] = function (v) {
            return new Promise(function (a, b) {
                var queued = q.push([n, v, a, b]);
                if (queued > 1) {
                    return; // an earlier request is still running; settle() will get to this one
                }
                resume(n, v);
            });
        };
    }

    // Advances the generator; anything thrown rejects the request at the head of the queue.
    function resume(n, v) {
        try {
            step(g[n](v));
        }
        catch (e) {
            settle(q[0][3], e);
        }
    }

    // Awaits `__await`-wrapped values, otherwise resolves the head request with the result.
    function step(r) {
        if (r.value instanceof __await) {
            Promise.resolve(r.value.v).then(onFulfilled, onRejected);
        }
        else {
            settle(q[0][2], r);
        }
    }

    function onFulfilled(value) {
        resume("next", value);
    }

    function onRejected(value) {
        resume("throw", value);
    }

    // Completes the head request, then starts the next queued one, if any.
    function settle(f, v) {
        f(v);
        q.shift();
        if (q.length) {
            resume(q[0][0], q[0][1]);
        }
    }
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.CommonAgendaCrawler = exports.TimePattern = void 0;
const jsdom_1 = require("jsdom");
const web_utility_1 = require("web-utility");
const utility_1 = require("../utility");
const core_1 = require("./core");
/**
 * Matches a clock time inside a larger string: one or two digits, a colon
 * (optionally surrounded by whitespace), then exactly two digits — e.g. "9:30" or "09 : 30".
 * The pattern is unanchored and has no `g` flag, so `.test()` is stateless.
 *
 * NOTE(review): the character class lists ":" twice, which is redundant as written;
 * possibly a second colon variant was intended here — confirm against the upstream source.
 */
exports.TimePattern = /\d{1,2}\s*[::]\s*\d{2}/;
/**
 * CSS selector for elements treated as "heading-like" when classifying the text of an
 * agenda item: real headings plus bold/strong inline elements.
 */
const HeadingSelector = `h1, h2, h3, h4, h5, h6, strong, b`;
/**
 * Orders two strings ascending by the value web-utility's `byteLength` returns for each.
 */
const compareByByteLength = (a, b) => (0, web_utility_1.byteLength)(a) - (0, web_utility_1.byteLength)(b);
/**
 * Returns the `src` of the first `<img src>` inside `container`, or `undefined` if none exists.
 */
const findAvatarSource = (container) => {
    const image = container.querySelector('img[src]');
    return image == null ? undefined : image.src;
};
/**
 * Heuristic agenda crawler for pages without a known layout.
 *
 * It looks for text that matches `TimePattern`, takes the CSS selector shared by the
 * largest number of such texts as "the agenda time cell", and treats the container
 * returned by `sameParentOf` for the first two matches as the agenda list.
 * Each child of that container is then turned into one or more agenda records of the
 * shape `{ mentor: { name, position, avatar }, title, summary, startTime, endTime }`.
 */
class CommonAgendaCrawler extends core_1.AgendaCrawler {
    /**
     * Loads `URI` with JSDOM and yields one record per agenda entry found on the page.
     *
     * `this.document` holds the parsed document while the iteration is running and is
     * reset to `undefined` when it ends — whether it finishes, throws, or is closed early
     * by the consumer through `return()`.
     * A page on which no text matches `TimePattern` yields nothing.
     *
     * @param {string} URI - Address of the page to crawl (passed to `JSDOM.fromURL`).
     * @returns {*} Async iterator of agenda records.
     */
    getList(URI) {
        return __asyncGenerator(this, arguments, function* getList_1() {
            const { window: { document } } = yield __await(jsdom_1.JSDOM.fromURL(URI));
            this.document = document;
            try {
                // One entry per node whose text looks like a clock time, keyed by the selector of its parent.
                // NOTE(review): `walkDOM(root, 3)` presumably yields the text nodes (nodeType 3) under
                // `root` — confirm against web-utility.
                const timeBoxes = Array.from((0, web_utility_1.walkDOM)(document.body, 3), ({ nodeValue, parentElement }) => exports.TimePattern.test(nodeValue) && {
                    selector: (0, utility_1.getCSSSelector)(parentElement, document.body, utility_1.CSSSelectorPrecision.Medium)
                }).filter(Boolean);
                const timeBoxCount = (0, web_utility_1.countBy)(timeBoxes, ({ selector }) => selector);
                const [mostFrequent] = Object.entries(timeBoxCount).sort(([, a], [, b]) => b - a);
                // Nothing on the page looks like a time, so there is no agenda to anchor on:
                // end the iteration instead of failing while destructuring an empty list.
                if (!mostFrequent)
                    return;
                const [agendaTimeSelector] = mostFrequent;
                const [first, second] = document.querySelectorAll(agendaTimeSelector);
                const agendaBox = (0, utility_1.sameParentOf)(first, second);
                const agendaBoxSelector = (0, utility_1.getCSSSelector)(agendaBox, document.body, utility_1.CSSSelectorPrecision.High);
                // The container does not change while iterating, so decide the row format once.
                const isTableBody = agendaBox.tagName.toLowerCase() === 'tbody';
                for (let i = 0; i < agendaBox.childElementCount; i++)
                    if (isTableBody)
                        // Table row: first cell is the time slot, every other cell is one talk.
                        yield __await(yield* __asyncDelegator(__asyncValues(this.getItems(agendaBox.children[i]))));
                    else
                        yield yield __await(yield __await(this.getItem(`${agendaBoxSelector} > :nth-child(${i + 1})`)));
            }
            finally {
                // Always drop the reference so the parsed page is not kept alive on the crawler.
                this.document = undefined;
            }
        });
    }
    /**
     * Builds one agenda record from the element matching `selector` in `this.document`.
     *
     * Text found inside the element is split into three buckets: the time (last text that
     * matches `TimePattern`), heading text (inside an element matching `HeadingSelector`)
     * and body text (everything else). Whitespace-only text is ignored. Within each bucket
     * the texts are sorted by ascending byte length: the two shortest heading texts become
     * `name` and `title`, the two shortest body texts become `position` and `summary`.
     *
     * @param {string} selector - CSS selector of the agenda item.
     * @returns {Promise<object>} The agenda record, or `{}` when there is no current
     *   document or nothing matches `selector`.
     */
    async getItem(selector) {
        const page = this.document;
        const agendaItem = page == null ? undefined : page.querySelector(selector);
        if (!agendaItem)
            return {};
        let time = '';
        const head = [];
        const body = [];
        for (const { parentElement, nodeValue } of Array.from((0, web_utility_1.walkDOM)(agendaItem, 3))) {
            const text = nodeValue.trim();
            // Markup indentation produces whitespace-only text nodes; kept as '' they would
            // sort first and be mistaken for the name/title/position/summary.
            if (!text)
                continue;
            if (exports.TimePattern.test(text)) {
                time = text;
                continue;
            }
            // `closest()` also tests the element itself, so it covers the direct-match case too.
            const isHeading = parentElement.closest(HeadingSelector) !== null;
            (isHeading ? head : body).push(text);
        }
        const [startTime, endTime] = time.split(/[^\d::]+/);
        const [name, title] = head.sort(compareByByteLength);
        const [position, summary] = body.sort(compareByByteLength);
        const avatar = findAvatarSource(agendaItem);
        return {
            mentor: { name, position, avatar },
            title,
            summary,
            startTime,
            endTime
        };
    }
    /**
     * Builds the agenda records of one table row.
     *
     * The first child of the row supplies the time range shared by all records; every
     * following child becomes one record. The child's text is split into lines, each line
     * is trimmed, blank lines are dropped, and the remaining lines — sorted by ascending
     * byte length — are read as `name`, `position`, `title`, `summary`.
     *
     * @param {Element} row - The row element; only its `children` are used.
     * @returns {object[]} One record per child after the first; `[]` for a row without children.
     */
    getItems({ children }) {
        const [time, ...agendas] = [...children];
        // A row without cells (e.g. a spacer row) carries no agenda.
        if (!time)
            return [];
        const [startTime, endTime] = time.textContent.trim().split(/[^\d::]+/);
        return agendas.map(agendaItem => {
            const [name, position, title, summary] = agendaItem.textContent
                .split('\n')
                // Indentation and blank lines are layout, not content: strip them before
                // ranking, otherwise they distort the order or end up as field values.
                .map(line => line.trim())
                .filter(Boolean)
                .sort(compareByByteLength);
            const avatar = findAvatarSource(agendaItem);
            return {
                mentor: { name, position, avatar },
                title,
                summary,
                startTime,
                endTime
            };
        });
    }
}
exports.CommonAgendaCrawler = CommonAgendaCrawler;