@hardbulls/wbsc-crawler
Version:
Tool to crawl events, leagues and statistics from WBSC based websites.
144 lines (143 loc) • 7.07 kB
JavaScript
"use strict";
// Compiler-emitted helper that drives a generator function as an asynchronous
// routine. An already-defined `this.__awaiter` is reused when present;
// otherwise the implementation below is installed.
//
// thisArg    - `this` value the generator is invoked with.
// _arguments - argument list forwarded to the generator (defaults to []).
// P          - promise constructor to use; falls back to the global Promise.
// generator  - generator function whose yielded values are awaited.
// Returns a P instance that resolves with the generator's return value and
// rejects when the generator throws.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in P unless it is already an instance of P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the resolved value; a synchronous throw rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw the rejection reason into the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finish when the generator is done, otherwise wait for the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and run it up to its first yield.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.GameCrawler = void 0;
const jsdom_1 = require("jsdom");
const date_fns_1 = require("date-fns");
const GameStatus_1 = require("./Model/GameStatus");
const Selector_1 = require("./Parser/Selector");
const NodeNotFoundError_1 = require("./Parser/NodeNotFoundError");
const date_fns_tz_1 = require("date-fns-tz");
const fetch_1 = require("./fetch");
exports.GameCrawler = {
crawl: (url, timezone) => __awaiter(void 0, void 0, void 0, function* () {
const html = yield (yield (0, fetch_1.fetchUrl)(url, { method: "GET" })).text();
const dom = new jsdom_1.JSDOM(html);
const appElement = dom.window.document.querySelector("#app");
if (appElement && appElement.hasAttribute("data-page")) {
return crawlAppJson(dom, timezone);
}
return crawlHtml(dom, timezone);
}),
};
/**
 * Return the trimmed text content of a DOM node, or `undefined` when the node
 * has no text content at all.
 */
function trimmedText(node) {
    const text = node.textContent;
    return text === null || text === undefined ? undefined : text.trim();
}
/**
 * Parse the date text of a schedule row.
 *
 * The full "dd/MM/yyyy, HH:mm" format is tried first. Rows without a start
 * time use "dd/MM/yyyy --:--"; for those the time is set to 14:00. When a
 * timezone is given, the result is passed through `fromZonedTime`.
 * If neither format matches, the resulting invalid Date is passed on as is.
 */
function parseScheduleDate(dateText, timezone) {
    let parsed = (0, date_fns_1.parse)(dateText, "dd/MM/yyyy, HH:mm", new Date());
    if (!(0, date_fns_1.isValid)(parsed)) {
        parsed = (0, date_fns_1.parse)(dateText, "dd/MM/yyyy --:--", new Date());
        if ((0, date_fns_1.isValid)(parsed)) {
            parsed.setHours(14, 0);
        }
    }
    return timezone ? (0, date_fns_tz_1.fromZonedTime)(parsed, timezone) : parsed;
}
/**
 * Split an "away:home" score text into a `[away, home]` pair of integers.
 * Missing or unparsable parts count as 0.
 */
function parseScore(scoreText) {
    if (!scoreText) {
        return [0, 0];
    }
    // Explicit radix so every token is read as decimal (a "0x" prefix is never treated as hex).
    const parts = scoreText
        .split(":")
        .map((part) => Number.parseInt(part.trim(), 10));
    return [parts[0] || 0, parts[1] || 0];
}
/**
 * Map the lower-cased status label of a schedule row to a GameStatus value.
 * Unknown or empty labels are treated as SCHEDULED.
 */
function resolveHtmlGameStatus(statusText) {
    const { GameStatus } = GameStatus_1;
    if (!statusText) {
        return GameStatus.SCHEDULED;
    }
    if (["w.o.", "forfeit"].includes(statusText)) {
        return GameStatus.FORFEIT;
    }
    if (["fortsetzung", "suspended"].includes(statusText)) {
        return GameStatus.SUSPENDED;
    }
    // "final" or "f/<number>", e.g. "f/7".
    if (statusText === "final" || /^f\/\d+$/.test(statusText)) {
        return GameStatus.FINISHED;
    }
    if (statusText === "canceled") {
        return GameStatus.CANCELED;
    }
    return GameStatus.SCHEDULED;
}
/**
 * Extract games from the rendered HTML of a schedule page.
 *
 * Every `.schedule-item` element yields one game object with the keys
 * `venue`, `home`, `away`, `awayScore`, `homeScore`, `status`, `date` and
 * `note`. Missing venue or team names become "Unknown"; missing scores
 * become 0.
 *
 * @param {object} dom - JSDOM-like object exposing `window.document`.
 * @param {string} [timezone] - When set, parsed dates are converted with `fromZonedTime`.
 * @returns {Array<object>} One entry per schedule row, in document order.
 * @throws {NodeNotFoundError} When the date node of a row has no text.
 *   Lookups made through `querySelectorOrThrow` presumably throw as well
 *   when nothing matches — confirm against ./Parser/Selector.
 */
function crawlHtml(dom, timezone) {
    const rows = dom.window.document.querySelectorAll(".schedule-item");
    const games = [];
    for (const row of rows) {
        const gameInfo = (0, Selector_1.querySelectorOrThrow)(row, "a.box-score-link:nth-child(1)");
        const venue = trimmedText((0, Selector_1.querySelectorOrThrow)(gameInfo, "div:nth-child(1) > p:nth-child(2)"));
        const dateNode = (0, Selector_1.querySelectorOrThrow)(gameInfo, "div:nth-child(2) > p:nth-child(2)");
        if (!dateNode || !dateNode.textContent) {
            throw new NodeNotFoundError_1.NodeNotFoundError(2);
        }
        const parsedDate = parseScheduleDate(dateNode.textContent, timezone);
        // The score container uses one of three class names depending on the page.
        const teamInfo = row.querySelector(".score") ||
            row.querySelector(".regular-score") ||
            (0, Selector_1.querySelectorOrThrow)(row, ".baseball-score-bug");
        // NOTE(review): "p:nth-child(4)" appears twice in the team-name selector.
        // It is kept verbatim; confirm whether the repetition is intentional.
        const awayTeamInfo = (0, Selector_1.querySelectorOrThrow)(teamInfo, "div.team-info:nth-child(1)");
        const awayTeamName = trimmedText((0, Selector_1.querySelectorOrThrow)(awayTeamInfo, "p:nth-child(4)p:nth-child(4)"));
        const homeTeamInfo = (0, Selector_1.querySelectorOrThrow)(teamInfo, "div.team-info:nth-child(3)");
        const homeTeamName = trimmedText((0, Selector_1.querySelectorOrThrow)(homeTeamInfo, "p:nth-child(4)p:nth-child(4)"));
        const scoreInfo = row.querySelector("div.score > div:nth-child(2) > p") ||
            row.querySelector("div.regular-score > div:nth-child(2) > p") ||
            (0, Selector_1.querySelectorOrThrow)(row, "div.baseball-score-bug > div:nth-child(2) > p");
        const [awayScore, homeScore] = parseScore(trimmedText(scoreInfo));
        const gameStatusInfo = (0, Selector_1.querySelectorOrThrow)(row, "div.calendar-buttons a > div > p");
        const rawStatus = trimmedText(gameStatusInfo);
        const statusText = rawStatus === undefined ? undefined : rawStatus.toLowerCase();
        games.push({
            venue: venue || "Unknown",
            home: homeTeamName || "Unknown",
            away: awayTeamName || "Unknown",
            awayScore: awayScore,
            homeScore: homeScore,
            status: resolveHtmlGameStatus(statusText),
            date: parsedDate,
            // TODO: find a game where a note is set and extract it from the markup.
            note: null,
        });
    }
    return games;
}
/**
 * Extract games from the JSON payload stored in the `data-page` attribute of
 * the `#app` element.
 *
 * The payload is expected to expose the games at `props.games`. Each entry
 * is mapped to a game object with the keys `venue`, `home`, `away`,
 * `awayScore`, `homeScore`, `status`, `date` and `note`.
 *
 * @param {object} dom - JSDOM-like object exposing `window.document`; the
 *   caller is expected to have verified that `#app[data-page]` exists.
 * @param {string} [timezone] - When set, `start` is converted with `fromZonedTime`.
 * @returns {Array<object>} One entry per game in the payload, in payload order.
 */
function crawlAppJson(dom, timezone) {
    const appElement = dom.window.document.querySelector("#app");
    const dataPage = appElement.getAttribute("data-page");
    const data = JSON.parse(dataPage);
    const { GameStatus } = GameStatus_1;
    // Numeric status codes used by the payload; any other code counts as SCHEDULED.
    const statusByCode = new Map([
        [3, GameStatus.FINISHED],
        [4, GameStatus.FORFEIT],
        [0, GameStatus.SCHEDULED],
        [-3, GameStatus.CANCELED],
        [-2, GameStatus.SUSPENDED],
    ]);
    const games = [];
    for (const gameData of data.props.games) {
        const gameStatus = statusByCode.has(gameData.gamestatus)
            ? statusByCode.get(gameData.gamestatus)
            : GameStatus.SCHEDULED;
        // NOTE(review): without a timezone the raw `start` value from the payload
        // is passed through unchanged — confirm callers accept that representation.
        const parsedDate = timezone
            ? (0, date_fns_tz_1.fromZonedTime)(gameData.start, timezone)
            : gameData.start;
        // Drop empty parts before joining so a missing stadium or location
        // leaves no dangling ", " and the "Unknown" fallback can actually apply.
        const venue = [gameData.stadium, gameData.location]
            .filter(Boolean)
            .join(", ");
        games.push({
            venue: venue || "Unknown",
            home: gameData.homelabel || "Unknown",
            away: gameData.awaylabel || "Unknown",
            awayScore: gameData.awayruns,
            homeScore: gameData.homeruns,
            status: gameStatus,
            date: parsedDate,
            note: gameData.note || null,
        });
    }
    return games;
}