UNPKG

jeopardy-json

Version:

A tool that scrapes and transforms Jeopardy! games from the J! Archive into structured JSON for trivia platforms and developers.

303 lines (258 loc) 9.65 kB
const http = require('http'); // Loads the http module const axios = require('axios'); // for making HTTP requests, promises based const cheerio = require('cheerio'); // implments a subset of jQuery for easy HTML parsing const scraper = require('./scraper'); // for scraping the j-archive /** * Parses the HTML of a Jeopardy! game page and extracts game data into a structured object. * * @param {string} html - The HTML content of the Jeopardy! game page. * @param {number} [game_id=1] - The ID of the current game being parsed. * @returns {Object} An object representing the parsed game, including title, rounds, categories, clues, responses, and game navigation. * * @example * const html = '<html>...</html>'; * const gameData = parseGame(html, 1234); * * gameData = { * title: "Game Title", * jeopardy_round: { * "Category 1": [ * { clue: "Clue text", response: "Correct answer", value: "Value of the clue", dd: true/false, row: 0 }, * ... * ], * ... * }, * double_jeopardy_round: { * "Category 1": [ ... ], * ... * }, * final_jeopardy_round: { * "Category Name": { clue: "Final clue text", response: "Final correct answer" } * }, * current_game: 1234, * next_game: 1235, * prev_game: 1233 * } */ function parseGame(html, game_id = 1) { $ = cheerio.load(html); let game = { "title": "", "jeopardy_round": { }, "double_jeopardy_round": { }, "final_jeopardy_round": {}, "current_game": game_id, "next_game": null, "prev_game": null }; // the values for each row in the jeopardy and double jeopardy rounds // these are hardcoded because they are the same for every game const rowValues = { "jeopardy_round": ['$200', '$400', '$600', '$800', '$1000'], "double_jeopardy_round": ['$400', '$800', '$1200', '$1600', '$2000'] }; game.title = $("#game_title h1").text(); // get the next game id from the title next_game = $("#contestants_table td a:contains([next game >>])").attr("href"); if (next_game) { game.next_game = Number(next_game.slice("showgame.php?game_id=".length)); } // get the prev game id from the title prev_game = $('#contestants_table td a:contains("\\[<< previous game\\]")').attr('href'); if (prev_game) { game.prev_game = Number(prev_game.slice("showgame.php?game_id=".length)); } // loop through the jeopardy clues and parse them // also record the values of each row for (round of ["jeopardy_round", "double_jeopardy_round"]) { const categories = []; let $rows = $("#" + round + " table.round > tbody").children(); $rows.each((row, v) => { let $row = $(v); // add the categories to the game object $row.find("td.category_name").each((_, v) => { let $data = $(v); let category = $data.text(); categories.push(category); }); // we found all the categories, now we can parse the clues // and answers for this round $row.find("td.clue").each((column, v) => { let $data = $(v); let clue = ""; let response = $data.find("em.correct_response").text() || ""; let val = $data.find("td.clue_value").text(); let dd = false; // daily double const cat = categories[column]; let clueProps = parseClue($data.find("td.clue_text").html()); const jeodparyClue = { clue: clueProps.text, response: null, value: val, dd: dd, image: clueProps.image, video: clueProps.video, column: column, row: row - 1 // row is 1-indexed in the HTML, so we need to subtract 1 } // if there is a clue but no value, then it is a daily double if (val == "" && clue != "") { dd = true; } // hardcode the rowValues for the clues jeodparyClue.value = rowValues[round][row - 1]; // if clue contains onmouseover, then it has a correct answer // in the onmouseover attribute let attr = $data.find("div").attr("onmouseover"); if (attr) { // if the clue has a correct answer, then we need to parse it response = $data.find("em.correct_response").text() || "" jeodparyClue.response = response; } // initialize the clue list for this round and category // if it doesn't already exist if (!game[round][cat]) { game[round][cat] = []; } game[round][cat].push(jeodparyClue); }); }); } // the final jeopardy round is a bit different // so we need to parse it separately const $fj = $("#final_jeopardy_round"); const attr = $fj.find(".category > div").attr("onmouseover"); const cat = $fj.find(".category_name").text() || ""; if (attr && $fj) { clueProps = parseClue($fj.find("td.clue_text").html()) game.final_jeopardy_round[cat] = { clue: clueProps.text, response: $fj.find("em.correct_response").text() || "", image: clueProps.image, video: clueProps.video } } return game; } /** * Removes HTML entities and * organizes image and video URLs */ function parseClue(encodedText) { const clueProps = { text: "", image: "", video: "" }; // missing clue if (encodedText == null) return clueProps; // helper function to leftover html entities const decodeHtmlEntity = function (str) { try { return str.replace(/&#(\d+);/g, function (match, dec) { return String.fromCharCode(dec); }); } catch { console.error("Could not decode HTML in", str); return str; } }; let decodedText = decodeHtmlEntity(encodedText); // there is html embedded in the text (probably a video or image) let startOfHidden = decodedText.indexOf("(<a"); if (startOfHidden != -1) { const videoMatch = decodedText.match(/href="([^"]+\.mp4)"/); if (videoMatch) { clueProps.video = videoMatch[1]; } const imgMatch = decodedText.match(/href="([^"]+\.jpg)"/); if (imgMatch) { clueProps.image = imgMatch[1]; } let endOfHidden = decodedText.indexOf(">)"); if (endOfHidden != -1) { decodedText = decodedText.slice(endOfHidden + 2); } } $chr = cheerio.load(decodedText); decodedText = $chr.text(); clueProps.text = decodedText; return clueProps; } /* * Get the game from the j-archive * @param {Number} game_id - The game id to get (dictated by the archive) * @returns {Promise} - A promise that resolves to the game object * in the format: * { * title: "Game Title", * jeopardy_round: { * "Category 1": [ * { clue: "Clue text", response: "Correct answer", value: "Value of the clue", dd: true/false, row: 0 }, * ... * ], * ... * }, * double_jeopardy_round: { * "Category 1": [ ... ], * ... * }, * final_jeopardy_round: { * "Category Name": { clue: "Final clue text", response: "Final correct answer" } * }, * current_game: 1234, * next_game: 1235, * prev_game: 1233 * } */ function getGame(game_id = 1) { let game_url = 'http://www.j-archive.com/showgame.php?game_id=' + game_id; // make a erquest to the j-archive game page // and parse the html, then check for .wmv files // TODO: handle the .wmv files return axios.get(game_url) .then(resp => parseGame(resp.data, game_id)) .then(game => { return game; }); } /** * Get the game from the j-archive and return it as a string * @param {Number} game_id * @returns a promise that resolves to a stringified version of the game object */ function getString(game_id) { return new Promise((resolve, reject) => { getGame(game_id) .then(game => { resolve(JSON.stringify(game)); }) .catch(err => { reject(err); }); }); } /** * Retrieves a specific show from the list of games. * * @async * @function * @param {number} [showNum=1] - The number of the show to retrieve (1-based index). * @returns {Promise} - A promise that resolves to the game object * @throws {Error} If the show number is less than 1 or exceeds the number of available games. */ async function getGameByShow(showNum=1) { const games = await scraper.getGamesList(); if (showNum < 1 || showNum > games.length) throw new Error("ERROR: invalid show number"); return await getGame(games[showNum - 1].game_id); } module.exports = { getGame, getString, getGameByShow, ...scraper };