jeopardy-json
Version:
A tool that scrapes and transforms Jeopardy! games from the J! Archive into structured JSON for trivia platforms and developers.
160 lines (135 loc) • 5.03 kB
JavaScript
/**
* Scrapes Jeopardy! game data by season from j-archive.com and saves it to a JSON file.
*
* - Fetches all season links from the main season list.
* - For each season, retrieves game details (game_id, show_number, air_date, season).
* - Merges new data with existing JSON, deduplicates by game_id, and sorts by show_number.
* - Skips non-numeric (special) seasons and seasons older than the latest one already stored; the latest stored season is always re-scraped.
*
* Dependencies: axios, cheerio, fs
*
* @fileoverview Scrapes and stores Jeopardy! game metadata by season.
*/
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
// File the scraped game list is read from and written to. The path is
// relative, so it resolves against the process's current working directory.
const OUTPUT_FILE = 'games.json';
// Site root that every page path requested by this module is appended to.
const BASE_URL = 'http://www.j-archive.com';
/**
 * Downloads the season index page and returns the links to individual seasons.
 *
 * Every anchor inside a table cell is inspected; only hrefs that begin with
 * "showseason" are kept, in the order they appear on the page.
 *
 * @async
 * @returns {Promise<string[]>} Season page hrefs as found in the page markup.
 */
async function getSeasonLinks() {
  const { data: html } = await axios.get(`${BASE_URL}/listseasons.php`);
  const $ = cheerio.load(html);

  // cheerio's map() omits anchors without an href, so every entry is a string.
  const hrefs = $('td a')
    .map((_index, anchor) => $(anchor).attr('href'))
    .get();

  const isSeasonLink = (href) => href.startsWith('showseason');
  return hrefs.filter(isSeasonLink);
}
/**
 * Fetches one season page and extracts the metadata of every game listed on it.
 *
 * Uses axios to retrieve the HTML and cheerio to parse it. Each anchor inside
 * a table cell whose href starts with "showgame.php" is treated as a game.
 * Anchors lacking a `game_id` in the href, a "#<show number>" in the text, or
 * an "aired YYYY-MM-DD" date in the text are skipped.
 *
 * @async
 * @function getGamesBySeason
 * @param {string} url - Season page path relative to the site root
 *   (e.g., "showseason.php?season=1").
 * @returns {Promise<Array<{game_id: number, show_number: number, air_date: string, season: (number|string)}>>}
 *   Resolves to the games in page order. `season` is a number when the season
 *   id consists only of digits, otherwise the raw id string from the URL.
 * @throws {TypeError} If `url` contains no `season=` value.
 */
async function getGamesBySeason(url) {
  // Resolve the season id before any network traffic so a malformed URL fails
  // fast with a descriptive error instead of a null dereference afterwards.
  const seasonMatch = /season=([^&#]+)/.exec(url);
  if (!seasonMatch) {
    throw new TypeError(`Season URL has no "season" parameter: ${url}`);
  }
  const rawSeason = seasonMatch[1];
  // Make it a number if possible; non-numeric season ids stay as strings.
  const season = /^\d+$/.test(rawSeason) ? Number(rawSeason) : rawSeason;

  const res = await axios.get(`${BASE_URL}/${url}`);
  const $ = cheerio.load(res.data);
  const games = [];

  $('td a[href^="showgame.php"]').each((i, el) => {
    const text = $(el).text();
    const href = $(el).attr('href');

    // Game id comes from the href query string.
    const gameIdMatch = href.match(/game_id=(\d+)/);
    // Show number is the run of digits following '#'.
    const showMatch = text.match(/#(\d+)/);
    // Air date is the YYYY-MM-DD value following the word "aired".
    const dateMatch = text.match(/aired\s*(\d{4}-\d{2}-\d{2})/);

    if (gameIdMatch && showMatch && dateMatch) {
      games.push({
        game_id: Number.parseInt(gameIdMatch[1], 10),
        show_number: Number.parseInt(showMatch[1], 10),
        air_date: dateMatch[1],
        season,
      });
    }
  });

  return games;
}
/**
 * Refreshes the stored game list with freshly scraped season data.
 *
 * Loads the existing output file (if any), scrapes every numeric season from
 * the latest stored season onward, merges old and new entries, removes
 * duplicates by `game_id` (scraped entries win), sorts by `show_number`, and
 * writes the result back to the output file.
 *
 * @async
 * @param {boolean} [log=false] - When true, prints one progress line per
 *   scraped season and a final summary to the console.
 * @returns {Promise<Array<Object>>} The merged, de-duplicated, sorted game
 *   list that was written to disk.
 */
async function update(log = false) {
  let existing = [];
  if (fs.existsSync(OUTPUT_FILE)) {
    try {
      const parsed = JSON.parse(fs.readFileSync(OUTPUT_FILE, 'utf8'));
      if (Array.isArray(parsed)) {
        existing = parsed;
      } else {
        // Valid JSON of the wrong shape would otherwise crash the merge below
        // only after the whole scrape has already run.
        console.warn('Existing JSON is not an array of games. Starting fresh.');
      }
    } catch {
      console.warn('Failed to parse existing JSON. Starting fresh.');
    }
  }

  // Latest season already stored. Taking the maximum avoids depending on the
  // file being sorted or on its last entry carrying a usable season value.
  let lastSeason = 1;
  for (const game of existing) {
    if (typeof game.season === 'number' && game.season > lastSeason) {
      lastSeason = game.season;
    }
  }

  // Keep only links whose season id is all digits (special seasons are
  // dropped) and that are not older than the latest stored season. The latest
  // stored season itself is always scraped again.
  const numericSeason = /[?&]season=(\d+)(?:[&#]|$)/;
  const seasonLinks = (await getSeasonLinks()).filter((link) => {
    const match = numericSeason.exec(link);
    return match !== null && Number(match[1]) >= lastSeason;
  });

  // Seasons are requested one at a time rather than in parallel so the site
  // is never hit with a burst of simultaneous requests.
  const allGames = [];
  for (const link of seasonLinks) {
    const games = await getGamesBySeason(link);
    allGames.push(...games);
    if (log) {
      console.log(`Scraped ${link} (${games.length} games)`);
    }
  }

  // Keying a Map by game_id lets later (freshly scraped) entries overwrite
  // earlier ones; the values are then re-sorted by show number.
  const merged = [...existing, ...allGames];
  const deduped = Array.from(new Map(merged.map((item) => [item.game_id, item])).values());
  deduped.sort((a, b) => a.show_number - b.show_number);

  fs.writeFileSync(OUTPUT_FILE, JSON.stringify(deduped, null, 2));
  if (log) {
    console.log(`Done! Saved ${deduped.length} games to ${OUTPUT_FILE}`);
  }
  return deduped;
}
/**
 * Retrieves the stored games list from the output file, if that file exists.
 *
 * Reads the file synchronously and parses it as JSON. No scraping is done.
 *
 * @async
 * @function getGamesList
 * @returns {Promise<*|undefined>} The parsed file contents (an array of game
 *   objects when the file was written by `update`), or undefined when the
 *   file does not exist.
 * @throws {Error} If the file exists but cannot be read or parsed. The
 *   underlying error is attached as `cause`.
 */
async function getGamesList() {
  if (!fs.existsSync(OUTPUT_FILE)) {
    return undefined;
  }
  try {
    return JSON.parse(fs.readFileSync(OUTPUT_FILE, 'utf8'));
  } catch (err) {
    // Keep the original failure reachable: the message alone cannot tell a
    // read error from a syntax error.
    throw new Error('Failed to parse existing JSON.', { cause: err });
  }
}
// Public API of this module: only the refresh entry point and the reader are
// exported; the scraping helpers above are not part of the export object.
module.exports = {
update,
getGamesList
};