UNPKG

epub

Version:

Parse EPUB electronic book files with Node.js

507 lines (506 loc) 17.7 kB
import { readFile } from "node:fs/promises";
import { XMLParser, XMLValidator } from "fast-xml-parser";
import JSZip from "jszip";

const xmlParser = new XMLParser({
  ignoreAttributes: false,
  attributeNamePrefix: "@_",
  ignoreDeclaration: true,
  // Keep element text as strings. With the parser's default value coercion,
  // numeric-looking text is converted to a number (an identifier such as
  // "0306406152" would lose its leading zero) and "true"/"false" become
  // booleans. Every consumer in this file wants the literal text.
  parseTagValue: false,
});

/**
 * Parse an XML string, validate it, strip the root element, and return the
 * children.
 *
 * @param {string} xml - XML document text.
 * @returns {*} The value of the single root element, or the whole parse
 *   result when it does not have exactly one top-level key.
 * @throws {Error} When the validator rejects the document; the message
 *   carries the validator's message, line and column.
 */
function parseXml(xml) {
  const validation = XMLValidator.validate(xml);
  if (validation !== true) {
    const e = validation.err;
    throw new Error(`${e.msg}\nLine: ${e.line}\nColumn: ${e.col}\nChar: `);
  }
  const raw = xmlParser.parse(xml);
  // Strip root element (equivalent to xml2js explicitRoot: false)
  const keys = Object.keys(raw);
  if (keys.length === 1) {
    return raw[keys[0]];
  }
  return raw;
}

/**
 * Extract text content from a parsed XML value.
 *
 * @param {*} val - A string, number, boolean, or an object carrying `#text`.
 * @returns {string} The trimmed text, or "" when there is none (including
 *   for null/undefined, arrays, and objects without `#text`).
 */
function textOf(val) {
  if (val == null) {
    return "";
  }
  if (typeof val === "string") {
    return val.trim();
  }
  // Defensive: scalar values that a parser may already have coerced.
  if (typeof val === "number" || typeof val === "boolean") {
    return String(val);
  }
  if (typeof val === "object" && "#text" in val) {
    return String(val["#text"] ?? "").trim();
  }
  return "";
}

/**
 * Extract all `@_`-prefixed attributes from a parsed element into a clean
 * object whose keys have the prefix removed and whose values are strings.
 *
 * @param {object|string} obj - Parsed element.
 * @returns {Object<string, string>} Attribute name → value.
 */
function attrsOf(obj) {
  const result = {};
  for (const key of Object.keys(obj)) {
    if (key.startsWith("@_")) {
      result[key.slice(2)] = String(obj[key]);
    }
  }
  return result;
}

/**
 * Ensure a value is an array.
 *
 * @param {*} val - Any value.
 * @returns {Array} `[]` for null/undefined, `val` itself when it already is
 *   an array, otherwise `[val]`.
 */
function asArray(val) {
  if (val == null) {
    return [];
  }
  return Array.isArray(val) ? val : [val];
}

/**
 * Record one identifier element on `out`.
 *
 * An element with an `opf:scheme` attribute is stored under that scheme name.
 * Otherwise, when its `id` attribute contains "uuid" (case-insensitive), it
 * is stored as `out.UUID`, upper-cased and with a "urn:uuid:" prefix removed.
 * Non-object values (elements without attributes) are ignored.
 *
 * @param {*} val - Parsed identifier element.
 * @param {object} out - Object to write to (mutated).
 */
function extractIdentifiers(val, out) {
  if (typeof val !== "object" || val == null) {
    return;
  }
  const scheme = val["@_opf:scheme"];
  const id = val["@_id"];
  const contents = textOf(val);
  if (scheme) {
    out[scheme] = contents;
  } else if (id && id.match(/uuid/i)) {
    out.UUID = contents.replace("urn:uuid:", "").toUpperCase().trim();
  }
}

/**
 * Directory segments of an archive path (everything before the last "/").
 *
 * @param {string} filePath - Path inside the archive.
 * @returns {string[]} Segments of the containing directory; `[]` when the
 *   path has no directory part.
 */
function directorySegments(filePath) {
  const segments = filePath.split("/");
  segments.pop();
  return segments;
}

/**
 * Prefix `href` with the given directory unless it already starts with it.
 *
 * @param {string[]} dirSegments - Directory segments to anchor to.
 * @param {string} href - Href as written in the package document.
 * @returns {string} The anchored href.
 */
function anchorHref(dirSegments, href) {
  const prefix = dirSegments.join("/");
  if (href.substring(0, prefix.length) !== prefix) {
    return dirSegments.concat([href]).join("/");
  }
  return href;
}

/**
 * Reader for EPUB archives.
 *
 * Construct with a file path or a buffer, call `parse()`, then read
 * `metadata`, `manifest`, `spine`, `flow`, `guide` and `toc`, or fetch
 * content with `getChapter`, `getImage`, `getFile` and `readFile`.
 */
export class EPub {
  /** File path (string) or archive data handed to JSZip. */
  input;
  /** Prefix used for rewritten `src` attributes; always ends with "/". */
  imageroot;
  /** Prefix used for rewritten `href` attributes; always ends with "/". */
  linkroot;
  /** Values collected from the package's metadata section. */
  metadata = {};
  /** Manifest items keyed by their `id` attribute. */
  manifest = {};
  /** Attribute objects of the guide's `reference` elements. */
  guide = [];
  /** `toc`: manifest item named by the spine's `toc` attribute, or false. */
  spine = {
    toc: false,
    contents: [],
  };
  /** Same array as `spine.contents`. */
  flow = [];
  /** Flattened table of contents produced by `walkNavMap`. */
  toc = [];
  /** Package `version` attribute; "2.0" when absent. */
  version = "2.0";
  /** JSZip instance, set by `_open()`. */
  zip;
  /** Archive entry name of the container file, or false until found. */
  containerFile = false;
  /** Archive entry name of the mimetype file, or false until found. */
  mimeFile = false;
  /** Archive path of the package document, or false until found. */
  rootFile = false;

  /**
   * @param {string|*} input - Path of the EPUB file, or its contents in a
   *   form accepted by `JSZip.loadAsync`.
   * @param {string} [imageroot="/images/"] - Prefix for rewritten image URLs.
   * @param {string} [linkroot="/links/"] - Prefix for rewritten link URLs.
   */
  constructor(input, imageroot, linkroot) {
    this.input = input;
    this.imageroot = (imageroot || "/images/").trim();
    this.linkroot = (linkroot || "/links/").trim();
    if (!this.imageroot.endsWith("/")) {
      this.imageroot += "/";
    }
    if (!this.linkroot.endsWith("/")) {
      this.linkroot += "/";
    }
  }

  /**
   * Reset all parsed state, then open the archive and read the mimetype,
   * container, package document and (when the spine names one) the TOC.
   *
   * @returns {Promise<void>}
   * @throws {Error} When any step fails.
   */
  async parse() {
    this.containerFile = false;
    this.mimeFile = false;
    this.rootFile = false;
    this.metadata = {};
    this.manifest = {};
    this.guide = [];
    this.spine = { toc: false, contents: [] };
    this.flow = [];
    this.toc = [];

    await this._open();
    await this._checkMimeType();
    await this._getRootFiles();
    const rootfileData = await this._handleRootFile();
    this._parseRootFile(rootfileData);
    if (this.spine.toc) {
      await this._parseTOC();
    }
  }

  /**
   * Read one archive entry.
   *
   * @param {string} name - Entry name inside the archive.
   * @returns {Promise<Buffer>} Entry contents.
   * @throws {Error} "Entry not found: <name>" when the entry does not exist.
   */
  async _readFile(name) {
    const file = this.zip.file(name);
    if (!file) {
      throw new Error(`Entry not found: ${name}`);
    }
    return file.async("nodebuffer");
  }

  /**
   * Load the archive into `this.zip`, reading it from disk first when
   * `input` is a string.
   *
   * @returns {Promise<void>}
   * @throws {Error} "Invalid/missing file" when reading or unzipping fails
   *   (the underlying error is attached as `cause`), or "No files in
   *   archive" when the archive is empty.
   */
  async _open() {
    try {
      const buf = typeof this.input === "string" ? await readFile(this.input) : this.input;
      this.zip = await JSZip.loadAsync(buf);
    } catch (err) {
      // Same message as before; the cause is kept so callers can diagnose.
      throw new Error("Invalid/missing file", { cause: err });
    }
    if (!Object.keys(this.zip.files).length) {
      throw new Error("No files in archive");
    }
  }

  /**
   * Locate the `mimetype` entry (name compared case-insensitively) and check
   * that its content is "application/epub+zip".
   *
   * @returns {Promise<void>}
   * @throws {Error} When the entry is missing or has a different content.
   */
  async _checkMimeType() {
    for (const name of Object.keys(this.zip.files)) {
      if (name.toLowerCase() === "mimetype") {
        this.mimeFile = name;
        break;
      }
    }
    if (!this.mimeFile) {
      throw new Error("No mimetype file in archive");
    }
    const data = await this._readFile(this.mimeFile);
    const txt = data.toString("utf-8").toLowerCase().trim();
    if (txt !== "application/epub+zip") {
      throw new Error("Unsupported mime type");
    }
  }

  /**
   * Locate `META-INF/container.xml` (name compared case-insensitively) and
   * store in `this.rootFile` the `full-path` of the first rootfile whose
   * media type is "application/oebps-package+xml".
   *
   * @returns {Promise<void>}
   * @throws {Error} When the container, its rootfiles, or a matching
   *   rootfile cannot be found, or the XML is invalid.
   */
  async _getRootFiles() {
    for (const name of Object.keys(this.zip.files)) {
      if (name.toLowerCase() === "meta-inf/container.xml") {
        this.containerFile = name;
        break;
      }
    }
    if (!this.containerFile) {
      throw new Error("No container file in archive");
    }
    const data = await this._readFile(this.containerFile);
    const xml = data.toString("utf-8").trim();
    const result = parseXml(xml);
    const rootfiles = result.rootfiles;
    if (!rootfiles || !rootfiles.rootfile) {
      throw new Error("No rootfiles found");
    }
    for (const rf of asArray(rootfiles.rootfile)) {
      const isPackage =
        String(rf["@_media-type"]).toLowerCase() === "application/oebps-package+xml";
      if (isPackage && rf["@_full-path"]) {
        this.rootFile = String(rf["@_full-path"]);
        break;
      }
    }
    if (!this.rootFile) {
      throw new Error("Rootfile not found from archive");
    }
  }

  /**
   * Read and parse the package document named by `this.rootFile`.
   *
   * @returns {Promise<*>} Parsed package element.
   */
  async _handleRootFile() {
    const data = await this._readFile(this.rootFile);
    const xml = data.toString("utf-8");
    return parseXml(xml);
  }

  /**
   * Dispatch the package document's sections to their parsers.
   *
   * Element names are matched on their local part (namespace prefix
   * dropped), case-insensitively. Sections are collected first and then
   * processed in a fixed order, because `_parseSpine` resolves its idrefs
   * against `this.manifest` and would otherwise produce an empty spine when
   * the spine element precedes the manifest in the document.
   *
   * @param {object} rootfile - Parsed package element.
   */
  _parseRootFile(rootfile) {
    this.version = String(rootfile["@_version"] || "2.0");

    const sections = new Map([
      ["metadata", []],
      ["manifest", []],
      ["spine", []],
      ["guide", []],
    ]);
    for (const fullKey of Object.keys(rootfile)) {
      if (fullKey.startsWith("@_")) {
        continue;
      }
      const key = (fullKey.split(":").pop() || "").toLowerCase().trim();
      sections.get(key)?.push(rootfile[fullKey]);
    }

    for (const section of sections.get("metadata")) {
      this._parseMetadata(section);
    }
    for (const section of sections.get("manifest")) {
      this._parseManifest(section);
    }
    // Must run after the manifest has been populated.
    for (const section of sections.get("spine")) {
      this._parseSpine(section);
    }
    for (const section of sections.get("guide")) {
      this._parseGuide(section);
    }
  }

  /**
   * Fill `this.metadata` from the metadata section.
   *
   * Child element names are matched on their local part, case-insensitively.
   * For publisher, title, description, date, language, creator and source
   * only the first occurrence is used; all subjects are kept in `subjects`
   * with the first one also in `subject`. `meta` elements contribute
   * `name` → `content` pairs and `property` → text pairs.
   *
   * @param {object} metadata - Parsed metadata element.
   */
  _parseMetadata(metadata) {
    for (const fullKey of Object.keys(metadata)) {
      if (fullKey.startsWith("@_")) {
        continue;
      }
      const metadataValue = metadata[fullKey];
      const first = Array.isArray(metadataValue) ? metadataValue[0] : metadataValue;
      const key = (fullKey.split(":").pop() || "").toLowerCase().trim();

      switch (key) {
        case "publisher":
        case "title":
        case "description":
        case "date":
          this.metadata[key] = textOf(first);
          break;
        case "language":
          this.metadata.language = textOf(first).toLowerCase();
          break;
        case "subject": {
          const subjects = asArray(metadataValue);
          if (subjects.length === 0) {
            this.metadata.subject = "";
          } else {
            this.metadata.subjects = subjects.map((v) => textOf(v));
            this.metadata.subject = this.metadata.subjects[0] ?? "";
          }
          break;
        }
        case "creator": {
          this.metadata.creator = textOf(first);
          const fileAs = typeof first === "object" && first != null && first["@_opf:file-as"];
          // Fall back to the display name when no file-as form is given.
          this.metadata.creatorFileAs = String(fileAs || this.metadata.creator).trim();
          break;
        }
        case "identifier":
          for (const v of asArray(metadataValue)) {
            extractIdentifiers(v, this.metadata);
          }
          break;
        case "source": {
          const sources = asArray(metadataValue);
          this.metadata.source = sources.length > 0 ? textOf(sources[0]) : "";
          break;
        }
      }
    }

    for (const meta of asArray(metadata.meta)) {
      const name = meta["@_name"];
      const content = meta["@_content"];
      const property = meta["@_property"];
      if (name) {
        this.metadata[name] = content;
      }
      if (meta["#text"] && property) {
        this.metadata[property] = meta["#text"];
      }
    }
  }

  /**
   * Fill `this.manifest` with the attributes of every manifest item that has
   * an `id`, anchoring each `href` to the package document's directory.
   *
   * @param {object} manifest - Parsed manifest element.
   */
  _parseManifest(manifest) {
    const packageDir = directorySegments(this.rootFile);
    for (const item of asArray(manifest.item)) {
      const element = attrsOf(item);
      if (element.href) {
        element.href = anchorHref(packageDir, element.href);
      }
      if (element.id) {
        this.manifest[element.id] = element;
      }
    }
  }

  /**
   * Append the attributes of every guide reference to `this.guide`,
   * anchoring each `href` to the package document's directory.
   *
   * @param {object} guide - Parsed guide element.
   */
  _parseGuide(guide) {
    const packageDir = directorySegments(this.rootFile);
    for (const ref of asArray(guide.reference)) {
      const element = attrsOf(ref);
      if (element.href) {
        element.href = anchorHref(packageDir, element.href);
      }
      this.guide.push(element);
    }
  }

  /**
   * Resolve the spine against `this.manifest`: sets `spine.toc` to the
   * manifest item named by the `toc` attribute (or false), pushes the
   * manifest item of every resolvable `itemref` onto `spine.contents`, and
   * points `flow` at that same array.
   *
   * @param {object} spine - Parsed spine element.
   */
  _parseSpine(spine) {
    const toc = spine["@_toc"];
    if (toc) {
      this.spine.toc = this.manifest[toc] || false;
    }
    for (const itemref of asArray(spine.itemref)) {
      const idref = itemref["@_idref"];
      if (!idref) {
        continue;
      }
      const element = this.manifest[idref];
      if (element) {
        this.spine.contents.push(element);
      }
    }
    this.flow = this.spine.contents;
  }

  /**
   * Read the document referenced by `spine.toc` and build `this.toc` from
   * its `navMap`.
   *
   * @returns {Promise<void>}
   * @throws {Error} When the entry is missing or its XML is invalid; XML
   *   errors are wrapped and carry the original error as `cause`.
   */
  async _parseTOC() {
    const tocHref = this.spine.toc.href;
    const path = directorySegments(tocHref);

    // href → manifest id. Prototype-less so an href such as "constructor"
    // cannot resolve to an inherited Object.prototype member.
    const idList = Object.create(null);
    for (const key of Object.keys(this.manifest)) {
      idList[this.manifest[key].href] = key;
    }

    const data = await this._readFile(tocHref);
    const xml = data.toString("utf-8");
    let result;
    try {
      result = parseXml(xml);
    } catch (err) {
      throw new Error(
        "Parsing container XML failed in TOC: " + (err instanceof Error ? err.message : String(err)),
        { cause: err },
      );
    }
    const navMap = result.navMap;
    if (navMap?.navPoint) {
      this.toc = this.walkNavMap(navMap.navPoint, path, idList);
    }
  }

  /**
   * Flatten a navPoint tree into a list of TOC entries, depth-first.
   *
   * A navPoint contributes an entry only when it has a `navLabel` and a
   * non-empty `content` `src`. When the resolved href matches a manifest
   * item, that manifest object itself is returned with `title`, `order` and
   * `level` set on it; otherwise a new
   * `{ level, order, title, id, href }` object is returned.
   *
   * @param {object|object[]} branch - One navPoint or an array of them.
   * @param {string[]} path - Directory segments prepended to each `src`.
   * @param {Object<string, string>} idList - href → manifest id.
   * @param {number} [level=0] - Nesting depth of `branch`.
   * @returns {object[]} Entries in document order; `[]` beyond depth 7.
   */
  walkNavMap(branch, path, idList, level = 0) {
    // Depth guard against pathologically nested documents.
    if (level > 7) {
      return [];
    }

    const output = [];
    for (const item of asArray(branch)) {
      const navLabel = item.navLabel;
      if (navLabel) {
        // textOf also covers labels that arrive as a number or as an object
        // with attributes, which a plain string check would drop to "".
        const title = textOf(navLabel.text);

        let order = Number(item["@_playOrder"] || 0);
        if (Number.isNaN(order)) {
          order = 0;
        }

        const src = item.content?.["@_src"];
        const rawHref = typeof src === "string" ? src.trim() : "";
        if (rawHref) {
          const href = path.concat([rawHref]).join("/");
          let element;
          if (idList[href]) {
            element = this.manifest[idList[href]];
            element.title = title;
            element.order = order;
            element.level = level;
          } else {
            element = {
              level,
              order,
              title,
              id: String(item["@_id"] || "").trim(),
              href,
            };
          }
          output.push(element);
        }
      }
      if (item.navPoint) {
        output.push(...this.walkNavMap(item.navPoint, path, idList, level + 1));
      }
    }
    return output;
  }

  /**
   * Return a chapter's body markup, cleaned up for embedding.
   *
   * Keeps only the contents of `<body>`, removes `<script>` and `<style>`
   * blocks, renames `on*` attributes to `skip-on*`, rewrites `src`
   * attributes that match a manifest item to
   * `imageroot + id + "/" + path` (removing the attribute when nothing
   * matches), and rewrites `href` attributes that match a manifest item to
   * `linkroot + id + "/" + path` (leaving others untouched).
   *
   * @param {string} id - Manifest id of the chapter.
   * @returns {Promise<string>} Processed markup with LF line endings.
   * @throws {Error} See `getChapterRaw`.
   */
  async getChapter(id) {
    const str = await this.getChapterRaw(id);
    const path = directorySegments(this.rootFile);

    // Build the lookups once instead of scanning the manifest for every
    // attribute. The first manifest entry wins, as with a linear scan, and
    // entries without an href are skipped rather than dereferenced.
    const byHref = new Map();
    const byDocument = new Map();
    for (const key of Object.keys(this.manifest)) {
      const entry = this.manifest[key];
      if (typeof entry.href !== "string") {
        continue;
      }
      if (!byHref.has(entry.href)) {
        byHref.set(entry.href, entry);
      }
      const documentPath = entry.href.split("#")[0];
      if (!byDocument.has(documentPath)) {
        byDocument.set(documentPath, entry);
      }
    }

    // Normalise line endings. Newlines stay in the text (the regexes below
    // use the dotAll flag where they must span lines) so that `\s` still
    // recognises a line break in front of an attribute.
    let s = str.replace(/\r\n/g, "\n");

    // keep only <body> contents
    const body = /<body[^>]*?>(.*)<\/body[^>]*?>/is.exec(s);
    if (body) {
      s = body[1].trim();
    }

    // remove <script> blocks
    s = s.replace(/<script[^>]*?>(.*?)<\/script[^>]*?>/gis, "");

    // remove <style> blocks
    s = s.replace(/<style[^>]*?>(.*?)<\/style[^>]*?>/gis, "");

    // neutralise onEvent handlers
    s = s.replace(
      /(\s)(on\w+)(\s*=\s*["']?[^"'\s>]*?["'\s>])/g,
      (_o, a, b, c) => `${a}skip-${b}${c}`,
    );

    // replace images
    s = s.replace(/(\ssrc\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (_o, a, b, c) => {
      const img = path.concat([b]).join("/").trim();
      const element = byHref.get(img);
      if (element) {
        return a + this.imageroot + element.id + "/" + img + c;
      }
      return "";
    });

    // replace links
    s = s.replace(/(\shref\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (_o, a, b, c) => {
      const linkparts = b ? b.split("#") : [];
      let link = linkparts.length
        ? path
            .concat([linkparts.shift() || ""])
            .join("/")
            .trim()
        : "";
      const element = byDocument.get(link);
      // Re-attach the fragment after the lookup.
      if (linkparts.length) {
        link += "#" + linkparts.join("#");
      }
      if (element) {
        return a + this.linkroot + element.id + "/" + link + c;
      }
      return a + b + c;
    });

    return s.trim();
  }

  /**
   * Return a chapter's unprocessed text.
   *
   * @param {string} id - Manifest id of the chapter.
   * @returns {Promise<string>} Entry contents decoded as UTF-8.
   * @throws {Error} "File not found" for an unknown id, "Invalid mime type
   *   for chapter" unless the item's media type is application/xhtml+xml or
   *   image/svg+xml, or an entry-not-found error from the archive.
   */
  async getChapterRaw(id) {
    const item = this.manifest[id];
    if (!item) {
      throw new Error("File not found");
    }
    const mediaType = item["media-type"];
    if (mediaType !== "application/xhtml+xml" && mediaType !== "image/svg+xml") {
      throw new Error("Invalid mime type for chapter");
    }
    const data = await this._readFile(item.href);
    return data ? data.toString("utf-8") : "";
  }

  /**
   * Return an image from the manifest.
   *
   * @param {string} id - Manifest id of the image.
   * @returns {Promise<{data: Buffer, mimeType: string}>}
   * @throws {Error} "File not found" for an unknown id, or "Invalid mime
   *   type for image" when the media type does not start with "image/".
   */
  async getImage(id) {
    if (!this.manifest[id]) {
      throw new Error("File not found");
    }
    const mediaType = (this.manifest[id]["media-type"] || "").toLowerCase().trim();
    if (!mediaType.startsWith("image/")) {
      throw new Error("Invalid mime type for image");
    }
    return this.getFile(id);
  }

  /**
   * Return any manifest item's contents together with its media type.
   *
   * @param {string} id - Manifest id.
   * @returns {Promise<{data: Buffer, mimeType: string}>}
   * @throws {Error} "File not found" for an unknown id.
   */
  async getFile(id) {
    const item = this.manifest[id];
    if (!item) {
      throw new Error("File not found");
    }
    const data = await this._readFile(item.href);
    return { data, mimeType: item["media-type"] };
  }

  /**
   * Read an archive entry by name.
   *
   * @param {string} filename - Entry name inside the archive.
   * @param {string} [encoding] - When given, the contents are decoded with
   *   it and returned as a string.
   * @returns {Promise<Buffer|string>}
   * @throws {Error} When the entry does not exist.
   */
  async readFile(filename, encoding) {
    const data = await this._readFile(filename);
    if (encoding) {
      return data.toString(encoding);
    }
    return data;
  }

  /**
   * Report whether the archive contains `META-INF/encryption.xml`.
   * Requires the archive to have been opened (see `parse`).
   *
   * @returns {boolean}
   */
  hasDRM() {
    return this.zip.file("META-INF/encryption.xml") !== null;
  }
}

export default EPub;