/*
 * @hardbulls/wbsc-crawler
 * Version: (not captured)
 * Tool to crawl events, leagues and statistics from WBSC-based websites.
 * 183 lines (182 loc) • 9.4 kB
 * JavaScript
 */
;
// Compiler-emitted re-export helper: makes `source[key]` reachable on `target`
// under `alias` (or under `key` when no alias is given). A pre-existing
// `this.__createBinding` wins over the local definition.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        const exportedName = alias === undefined ? key : alias;
        const existing = Object.getOwnPropertyDescriptor(source, key);
        // Decide whether the source descriptor can be reused verbatim or
        // whether a forwarding getter has to be installed instead.
        let mustWrap;
        if (!existing) {
            mustWrap = true;
        }
        else if ("get" in existing) {
            mustWrap = !source.__esModule;
        }
        else {
            mustWrap = existing.writable || existing.configurable;
        }
        const descriptor = mustWrap
            ? { enumerable: true, get: function () { return source[key]; } }
            : existing;
        Object.defineProperty(target, exportedName, descriptor);
    }
    : function (target, source, key, alias) {
        // Fallback without property descriptors: copy the current value.
        target[alias === undefined ? key : alias] = source[key];
    });
// Compiler-emitted helper: attaches `value` to `target` as its "default"
// member. A pre-existing `this.__setModuleDefault` wins over the local one.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, value) {
        // Enumerable; writable/configurable are left at their defineProperty defaults.
        Object.defineProperty(target, "default", { enumerable: true, value });
    }
    : function (target, value) {
        target.default = value;
    });
// Compiler-emitted helper backing `import * as x` / dynamic `import()`:
// objects flagged `__esModule` are returned untouched; anything else is
// wrapped in a fresh namespace object that re-exports every own enumerable
// key except "default" and exposes the original value as `default`.
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    const namespace = {};
    if (mod != null) {
        Object.keys(mod)
            .filter((key) => key !== "default")
            .forEach((key) => {
                __createBinding(namespace, mod, key);
            });
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
// Compiler-emitted async/await shim: drives `generator` (invoked with
// `thisArg` and `_arguments`) to completion, resuming it with the settled
// value of every yielded item, and returns a promise (constructed with `P`,
// or the global Promise when `P` is falsy) for the generator's return value.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Pass instances of P through; wrap everything else in a resolved P.
    const adopt = function (value) {
        if (value instanceof P) {
            return value;
        }
        return new P(function (settle) { settle(value); });
    };
    if (!P) {
        P = Promise;
    }
    return new P(function (resolve, reject) {
        const advance = function (result) {
            if (result.done) {
                resolve(result.value);
                return;
            }
            adopt(result.value).then(onFulfilled, onRejected);
        };
        const onFulfilled = function (value) {
            try {
                advance(generator.next(value));
            }
            catch (error) {
                reject(error);
            }
        };
        const onRejected = function (reason) {
            try {
                advance(generator["throw"](reason));
            }
            catch (error) {
                reject(error);
            }
        };
        // `generator` is rebound from the generator function to its iterator.
        generator = generator.apply(thisArg, _arguments || []);
        advance(generator.next());
    });
};
// Mark the CommonJS exports object as a transpiled ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export up front; the real value is assigned further down.
exports.GameCrawler = void 0;
// Date parsing/validation (parse, isValid).
const date_fns_1 = require("date-fns");
// GameStatus values assigned to crawled games.
const GameStatus_1 = require("./Model/GameStatus");
// querySelectorOrThrow helper used for required DOM nodes.
const Selector_1 = require("./Parser/Selector");
// Error raised when a required node has no usable content.
const NodeNotFoundError_1 = require("./Parser/NodeNotFoundError");
// Time-zone conversion (fromZonedTime).
const date_fns_tz_1 = require("date-fns-tz");
// Project HTTP helper (fetchUrl).
const fetch_1 = require("./fetch");
exports.GameCrawler = {
crawl: (url, options) => __awaiter(void 0, void 0, void 0, function* () {
const { JSDOM } = yield Promise.resolve().then(() => __importStar(require("jsdom")));
const html = yield (yield (0, fetch_1.fetchUrl)(url, { method: "GET" })).text();
const dom = new JSDOM(html);
const appElement = dom.window.document.querySelector("#app");
if (appElement && appElement.hasAttribute("data-page")) {
return crawlAppJson(dom, options);
}
return crawlHtml(dom, options === null || options === void 0 ? void 0 : options.timezone);
}),
};
/**
 * Return the trimmed text of a node, or `undefined` when it has no text content.
 */
function readTrimmedText(node) {
    const text = node.textContent;
    return text == null ? undefined : text.trim();
}
/**
 * Parse a schedule date label.
 *
 * Accepts "dd/MM/yyyy, HH:mm" and the time-less "dd/MM/yyyy --:--" form; the
 * latter is given a start time of 14:00. When `timezone` is set the wall-clock
 * value is interpreted in that zone via `fromZonedTime`. If neither format
 * matches, the resulting Invalid Date is passed through unchanged.
 */
function parseScheduleDate(rawText, timezone) {
    // Surrounding whitespace/newlines from the markup would make the strict
    // format match fail, so strip it first.
    const dateText = rawText.trim();
    let parsed = (0, date_fns_1.parse)(dateText, "dd/MM/yyyy, HH:mm", new Date());
    if (!(0, date_fns_1.isValid)(parsed)) {
        parsed = (0, date_fns_1.parse)(dateText, "dd/MM/yyyy --:--", new Date());
        if ((0, date_fns_1.isValid)(parsed)) {
            parsed.setHours(14, 0);
        }
    }
    return timezone ? (0, date_fns_tz_1.fromZonedTime)(parsed, timezone) : parsed;
}
/**
 * Parse an "away:home" score label into `[awayScore, homeScore]`.
 * Missing or non-numeric parts count as 0.
 */
function parseScoreLine(scoreText) {
    if (!scoreText) {
        return [0, 0];
    }
    const parts = scoreText
        .split(":")
        .map((part) => Number.parseInt(part.trim(), 10));
    return [parts[0] || 0, parts[1] || 0];
}
/**
 * Map the (trimmed, lower-cased) status label of a schedule row to a GameStatus.
 * Unknown or empty labels map to SCHEDULED.
 */
function resolveHtmlGameStatus(statusText) {
    if (!statusText) {
        return GameStatus_1.GameStatus.SCHEDULED;
    }
    if (["w.o.", "forfeit"].includes(statusText)) {
        return GameStatus_1.GameStatus.FORFEIT;
    }
    if (["fortsetzung", "suspended"].includes(statusText)) {
        return GameStatus_1.GameStatus.SUSPENDED;
    }
    // "final" or "f/<innings>", e.g. "f/7".
    if (statusText === "final" || /^f\/\d+$/.test(statusText)) {
        return GameStatus_1.GameStatus.FINISHED;
    }
    if (statusText === "canceled") {
        return GameStatus_1.GameStatus.CANCELED;
    }
    return GameStatus_1.GameStatus.SCHEDULED;
}
/**
 * Scrape games from a schedule page rendered as plain HTML.
 *
 * Every `.schedule-item` element yields one game object with `venue`, `home`,
 * `away`, `awayScore`, `homeScore`, `status`, `date` and `note` (always null
 * here; no markup for notes is known yet).
 *
 * @param dom Object exposing `window.document` (a JSDOM instance when called
 *   from `GameCrawler.crawl`).
 * @param timezone Optional IANA zone name the listed times are expressed in.
 * @returns Array of game objects, empty when the page has no schedule items.
 * @throws NodeNotFoundError when the date node has no text. Required nodes are
 *   looked up with `querySelectorOrThrow`, which presumably throws as well
 *   when a node is missing.
 */
function crawlHtml(dom, timezone) {
    const games = [];
    for (const row of dom.window.document.querySelectorAll(".schedule-item")) {
        const gameInfo = (0, Selector_1.querySelectorOrThrow)(row, "a.box-score-link:nth-child(1)");
        const venue = readTrimmedText((0, Selector_1.querySelectorOrThrow)(gameInfo, "div:nth-child(1) > p:nth-child(2)"));
        const dateNode = (0, Selector_1.querySelectorOrThrow)(gameInfo, "div:nth-child(2) > p:nth-child(2)");
        if (!dateNode.textContent) {
            throw new NodeNotFoundError_1.NodeNotFoundError(2);
        }
        const parsedDate = parseScheduleDate(dateNode.textContent, timezone);
        // The score container's class differs between page variants.
        const teamInfo = row.querySelector(".score") ||
            row.querySelector(".regular-score") ||
            (0, Selector_1.querySelectorOrThrow)(row, ".baseball-score-bug");
        const awayTeamInfo = (0, Selector_1.querySelectorOrThrow)(teamInfo, "div.team-info:nth-child(1)");
        const homeTeamInfo = (0, Selector_1.querySelectorOrThrow)(teamInfo, "div.team-info:nth-child(3)");
        // Previously written as "p:nth-child(4)p:nth-child(4)", which is not a
        // valid compound selector; the single form selects the same element.
        const awayTeamName = readTrimmedText((0, Selector_1.querySelectorOrThrow)(awayTeamInfo, "p:nth-child(4)"));
        const homeTeamName = readTrimmedText((0, Selector_1.querySelectorOrThrow)(homeTeamInfo, "p:nth-child(4)"));
        const scoreNode = row.querySelector("div.score > div:nth-child(2) > p") ||
            row.querySelector("div.regular-score > div:nth-child(2) > p") ||
            (0, Selector_1.querySelectorOrThrow)(row, "div.baseball-score-bug > div:nth-child(2) > p");
        const score = parseScoreLine(readTrimmedText(scoreNode));
        const statusNode = (0, Selector_1.querySelectorOrThrow)(row, "div.calendar-buttons a > div > p");
        const statusLabel = readTrimmedText(statusNode);
        const statusText = statusLabel === undefined ? undefined : statusLabel.toLowerCase();
        games.push({
            venue: venue || "Unknown",
            home: homeTeamName || "Unknown",
            away: awayTeamName || "Unknown",
            awayScore: score[0],
            homeScore: score[1],
            status: resolveHtmlGameStatus(statusText),
            date: parsedDate,
            note: null,
        });
    }
    return games;
}
/**
 * Map the numeric `gamestatus` code of the page payload to a GameStatus.
 * Codes without an explicit mapping (including 0) resolve to SCHEDULED.
 */
function mapAppGameStatus(code) {
    switch (code) {
        case 1:
            return GameStatus_1.GameStatus.ONGOING;
        case 2:
        case 3:
            return GameStatus_1.GameStatus.FINISHED;
        case 4:
            return GameStatus_1.GameStatus.FORFEIT;
        case -3:
            return GameStatus_1.GameStatus.CANCELED;
        case -2:
            return GameStatus_1.GameStatus.SUSPENDED;
        default:
            return GameStatus_1.GameStatus.SCHEDULED;
    }
}
/**
 * Join the available venue parts with ", "; "Unknown" when none is present.
 */
function formatAppVenue(stadium, location) {
    return [stadium, location].filter(Boolean).join(", ") || "Unknown";
}
/**
 * Read games from the JSON payload embedded in the `data-page` attribute of
 * the `#app` element.
 *
 * @param dom Object exposing `window.document`; the `#app[data-page]` element
 *   must exist (the caller checks this before delegating here).
 * @param options Optional settings:
 *   - `timezone`: zone name the `start` values are expressed in; when set,
 *     `date` is converted with `fromZonedTime`.
 *   - `tickerUrlPattern`: URL template whose `{tournamentkey}` and `{id}`
 *     placeholders are filled per game.
 * @returns Array of game objects; empty when the payload carries no games list.
 */
function crawlAppJson(dom, options) {
    const appElement = dom.window.document.querySelector("#app");
    const data = JSON.parse(appElement.getAttribute("data-page"));
    const props = data.props != null ? data.props : {};
    const tournament = props.tournament != null ? props.tournament : {};
    const tournamentkey = tournament.tournamentkey != null ? tournament.tournamentkey : null;
    const tournamentid = tournament.id != null ? tournament.id : null;
    // `props` is already treated as optional above; a payload without a games
    // list yields no games instead of a TypeError.
    const gameList = Array.isArray(props.games) ? props.games : [];
    const timezone = options == null ? undefined : options.timezone;
    const tickerUrlPattern = options == null ? undefined : options.tickerUrlPattern;
    const games = [];
    for (const gameData of gameList) {
        // NOTE(review): without a timezone `date` stays the raw `start` value
        // from the payload, while the timezone branch produces the result of
        // `fromZonedTime` — confirm callers expect both shapes.
        let parsedDate = gameData.start;
        if (timezone) {
            parsedDate = (0, date_fns_tz_1.fromZonedTime)(parsedDate, timezone);
        }
        let tickerUrl = null;
        if (tournamentkey && tickerUrlPattern && gameData.id) {
            // Replacer functions insert the values verbatim; a plain string
            // replacement would interpret "$&", "$1", ... inside them.
            tickerUrl = tickerUrlPattern
                .replace("{tournamentkey}", () => String(tournamentkey))
                .replace("{id}", () => String(gameData.id));
        }
        games.push({
            venue: formatAppVenue(gameData.stadium, gameData.location),
            home: gameData.homelabel || "Unknown",
            away: gameData.awaylabel || "Unknown",
            awayScore: gameData.awayruns,
            homeScore: gameData.homeruns,
            status: mapAppGameStatus(gameData.gamestatus),
            date: parsedDate,
            note: gameData.note || null,
            tickerUrl: tickerUrl,
            // Template literals: missing ids come out as the strings "null" /
            // "undefined" (kept as-is for backward compatibility).
            externalTournamentId: `${tournamentid}`,
            externalTournamentKey: `${tournamentkey}`,
            externalGameId: `${gameData.id}`,
        });
    }
    return games;
}