UNPKG

worm-scraper

Version:

Scrapes the web serial Worm and its sequel Ward into an ebook format

github.com/domenic/worm-scraper

domenic/worm-scraper

144 lines (119 loc) • 5.17 kB

JavaScript

"use strict"; const path = require("path"); const fs = require("fs").promises; const { JSDOM } = require("jsdom"); const progressUtils = require("./progress-utils.js"); const FILENAME_PREFIX = "chapter"; module.exports = async (cachePath, manifestPath, bookData) => { await fs.mkdir(cachePath, { recursive: true }); let manifest; try { const manifestContents = await fs.readFile(manifestPath, { encoding: "utf-8" }); // Include this in the try/catch because it's possible to get interrupted while writing manifest.json, which would // result in a corrupt file. In that case we need to treat it as if the file doesn't exist. manifest = JSON.parse(manifestContents); } catch (e) { if (e.code === "ENOENT" || e.name === "SyntaxError") { return downloadAllChapters(null, bookData, cachePath, manifestPath); } throw e; } return downloadAllChapters(manifest, bookData, cachePath, manifestPath); }; async function downloadAllChapters(manifest, bookData, cachePath, manifestPath) { const toDownloadManifestIndicesByURL = new Map(); if (manifest === null) { // If we've never run the program before, create a skeleton manifest from the book data which consists of just URLs. manifest = []; for (const arc of bookData.arcs) { for (const chapter of arc.chapters) { const manifestEntry = { url: chapter.url }; toDownloadManifestIndicesByURL.set(chapter.url, manifest.length); manifest.push(manifestEntry); } } } else { // Otherwise, note all the chapters which need to be downloaded, and their manifest entry indices which will be // updated as we perform those downloads. for (let i = 0; i < manifest.length; ++i) { const manifestEntry = manifest[i]; if (!("filename" in manifestEntry)) { toDownloadManifestIndicesByURL.set(manifestEntry.url, i); } } } console.log( `Cache contains ${manifest.length - toDownloadManifestIndicesByURL.size}/${manifest.length} HTML pages` ); if (toDownloadManifestIndicesByURL.size === 0) { return; } console.log(`Downloading ${toDownloadManifestIndicesByURL.size} remaining HTML pages`); const progress = progressUtils.start(toDownloadManifestIndicesByURL.size); const promises = []; let manifestUpdatePromise = null; for (const [chapterURL, manifestIndex] of toDownloadManifestIndicesByURL) { // eslint-disable-next-line no-loop-func promises.push((async () => { const filename = `${FILENAME_PREFIX}${manifestIndex.toString().padStart(3, "0")}.html`; const { contents, dom } = await downloadChapter(chapterURL); const title = getChapterTitle(dom.window.document); const datePublished = getChapterDatePublished(dom.window.document); const dateModified = getChapterDateModified(dom.window.document); dom.window.close(); manifest[manifestIndex].title = title; manifest[manifestIndex].datePublished = datePublished; manifest[manifestIndex].dateModified = dateModified; manifest[manifestIndex].filename = filename; await fs.writeFile(path.resolve(cachePath, filename), contents); // Incrementally update the manifest after every successful download, instead of waiting until the end. // But don't start the update until the previous one is done. if (manifestUpdatePromise === null) { const newManifestContents = JSON.stringify(manifest, undefined, 2); manifestUpdatePromise = fs.writeFile(manifestPath, newManifestContents).then(() => { manifestUpdatePromise = null; }); } progressUtils.increment(progress); })()); } await Promise.all(promises); console.log(`All HTML pages downloaded in ${progressUtils.getTotalSeconds(progress)} seconds`); } function getChapterTitle(rawChapterDoc) { // Remove " – " because it's present in Ward but not in Worm, which is inconsistent. (And leaving it in causes slight // issues down the line where we remove spaces around em dashes during conversion.) // // TODO: now that we have an "original" option, figure out how to just leave this as-is (without messing up the em // dash processing). return rawChapterDoc.querySelector("h1.entry-title").textContent.replace(/ – /u, " "); } function getChapterDatePublished(rawChapterDoc) { return rawChapterDoc.querySelector(`meta[property="article:published_time"]`).content; } function getChapterDateModified(rawChapterDoc) { return rawChapterDoc.querySelector(`meta[property="article:modified_time"]`).content; } async function downloadChapter(url) { const response = await downloadWithRetry(url); const contents = await response.text(); const dom = new JSDOM(contents, { url }); return { url, contents, dom }; } function downloadWithRetry(url) { return retry(3, async () => { const response = await fetch(url); if (response.status !== 200) { throw new Error(`Response status for ${url} was ${response.status}`); } return response; }); } function retry(times, fn) { if (times === 0) { return fn(); } return fn().catch(() => { return retry(times - 1, fn); }); }