UNPKG

epub

Version:

Parse EPUB electronic book files with Node.JS

618 lines (541 loc) 16.8 kB
import { readFile } from "node:fs/promises";
import { XMLParser, XMLValidator } from "fast-xml-parser";
import JSZip from "jszip";

/**
 * Shared XML parser. Attributes are kept and exposed with an "@_" prefix; the
 * XML declaration is dropped.
 */
const xmlParser = new XMLParser({
  ignoreAttributes: false,
  attributeNamePrefix: "@_",
  ignoreDeclaration: true,
  // Keep element text verbatim. With the library default (true), text such as
  // "007", "0x10" or "true" is coerced to a number/boolean, which corrupts
  // titles, identifiers and TOC labels.
  parseTagValue: false,
});

/**
 * Parse XML string, validate it, strip the root element, and return the children.
 *
 * @param xml - Raw XML document text.
 * @returns The parsed content of the root element, or the whole parsed
 *   document when no single root element can be identified.
 * @throws Error when the validator rejects the document; the message carries
 *   the validator's message, line and column.
 */
function parseXml(xml: string): Record<string, unknown> {
  const validation = XMLValidator.validate(xml);
  if (validation !== true) {
    const e = validation.err;
    throw new Error(`${e.msg}\nLine: ${e.line}\nColumn: ${e.col}`);
  }

  const raw = xmlParser.parse(xml) as Record<string, unknown>;

  // Strip root element (equivalent to xml2js explicitRoot: false).
  // Processing instructions show up as "?name" keys next to the root element;
  // they must not prevent the root from being recognised.
  const keys = Object.keys(raw).filter((k) => !k.startsWith("?"));
  if (keys.length === 1) {
    return raw[keys[0]] as Record<string, unknown>;
  }
  return raw;
}

/**
 * Extract text content from a parsed XML value (string, or object with #text).
 *
 * @param val - A parsed element value.
 * @returns The trimmed text, or "" when the value carries no text.
 */
function textOf(val: unknown): string {
  if (val == null) {
    return "";
  }
  if (typeof val === "string") {
    return val.trim();
  }
  if (typeof val === "number") {
    return String(val);
  }
  if (typeof val === "object" && "#text" in (val as object)) {
    return String((val as Record<string, unknown>)["#text"] ?? "").trim();
  }
  return "";
}

/**
 * Extract all @_ prefixed attributes from a parsed element into a clean object.
 *
 * @param obj - A parsed element.
 * @returns Attribute name (without the "@_" prefix) to stringified value.
 */
function attrsOf(obj: Record<string, unknown>): Record<string, string> {
  const result: Record<string, string> = {};
  for (const key of Object.keys(obj)) {
    if (key.startsWith("@_")) {
      result[key.slice(2)] = String(obj[key]);
    }
  }
  return result;
}

/** Ensure value is an array; null/undefined become an empty array. */
function asArray<T>(val: T | T[] | undefined): T[] {
  if (val == null) {
    return [];
  }
  return Array.isArray(val) ? val : [val];
}

/** One `<item>` of the package manifest, keyed by its attributes. */
export interface ManifestItem {
  id: string;
  href: string;
  "media-type": string;
  [key: string]: unknown;
}

/** One entry of the table of contents built from the NCX navMap. */
export interface TocElement {
  level: number;
  order: number;
  title: string;
  id: string;
  href: string;
  "media-type"?: string;
  [key: string]: unknown;
}

/** Book metadata collected from the package `<metadata>` section. */
export interface Metadata {
  creator: string;
  creatorFileAs: string;
  title: string;
  language: string;
  subject: string;
  subjects?: string[];
  date: string;
  description: string;
  publisher?: string;
  source?: string;
  UUID?: string;
  [key: string]: unknown;
}

/**
 * Record one parsed `identifier` element on the metadata object.
 *
 * An element with an `opf:scheme` attribute is stored under that scheme name.
 * Otherwise, when its `id` attribute contains "uuid" (case-insensitive), the
 * text is stored as `UUID` with a "urn:uuid:" prefix removed and upper-cased.
 * Values that are not objects (i.e. carry no attributes) are ignored.
 *
 * @param val - A parsed identifier element.
 * @param out - Metadata object to update in place.
 */
function extractIdentifiers(val: unknown, out: Metadata): void {
  if (typeof val !== "object" || val == null) {
    return;
  }
  const obj = val as Record<string, unknown>;
  const scheme = obj["@_opf:scheme"] as string | undefined;
  const id = obj["@_id"] as string | undefined;
  const contents = textOf(obj);

  if (scheme) {
    (out as Record<string, unknown>)[scheme] = contents;
  } else if (id && id.match(/uuid/i)) {
    out.UUID = contents.replace("urn:uuid:", "").toUpperCase().trim();
  }
}

/**
 * Reader for EPUB archives.
 *
 * Construct with a file path or in-memory archive, call {@link EPub.parse},
 * then read `metadata`, `manifest`, `spine`, `flow`, `toc` or fetch content
 * with `getChapter` / `getImage` / `getFile`.
 */
export class EPub {
  input: string | Buffer | ArrayBuffer;
  imageroot: string;
  linkroot: string;

  metadata: Metadata = {} as Metadata;
  manifest: Record<string, ManifestItem> = {};
  guide: Record<string, string>[] = [];
  spine: { toc: ManifestItem | false; contents: ManifestItem[] } = {
    toc: false,
    contents: [],
  };
  flow: ManifestItem[] = [];
  toc: TocElement[] = [];
  version: string = "2.0";

  zip!: JSZip;
  containerFile: string | false = false;
  mimeFile: string | false = false;
  rootFile: string | false = false;

  /**
   * @param input - Path to an EPUB file, or the archive bytes.
   * @param imageroot - URL prefix used when rewriting image `src` attributes
   *   (default "/images/"); a trailing "/" is appended when missing.
   * @param linkroot - URL prefix used when rewriting `href` attributes
   *   (default "/links/"); a trailing "/" is appended when missing.
   */
  constructor(input: string | Buffer | ArrayBuffer, imageroot?: string, linkroot?: string) {
    this.input = input;
    this.imageroot = (imageroot || "/images/").trim();
    this.linkroot = (linkroot || "/links/").trim();

    if (!this.imageroot.endsWith("/")) {
      this.imageroot += "/";
    }
    if (!this.linkroot.endsWith("/")) {
      this.linkroot += "/";
    }
  }

  /**
   * Open the archive and populate all parsed state. Any state left over from
   * a previous call is reset first.
   *
   * @throws Error when the archive cannot be opened, lacks a valid mimetype,
   *   container or root file, or contains malformed XML.
   */
  async parse(): Promise<void> {
    this.containerFile = false;
    this.mimeFile = false;
    this.rootFile = false;
    this.metadata = {} as Metadata;
    this.manifest = {};
    this.guide = [];
    this.spine = { toc: false, contents: [] };
    this.flow = [];
    this.toc = [];

    await this._open();
    await this._checkMimeType();
    await this._getRootFiles();
    const rootfileData = await this._handleRootFile();
    this._parseRootFile(rootfileData);
    if (this.spine.toc) {
      await this._parseTOC();
    }
  }

  /**
   * Read one archive entry as a Buffer.
   *
   * @throws Error when the entry does not exist.
   */
  private async _readFile(name: string): Promise<Buffer> {
    const file = this.zip.file(name);
    if (!file) {
      throw new Error(`Entry not found: ${name}`);
    }
    return file.async("nodebuffer");
  }

  /**
   * Load the archive from `input` (reading it from disk when it is a path).
   *
   * @throws Error "Invalid/missing file" when reading or unzipping fails, or
   *   "No files in archive" when the archive is empty.
   */
  private async _open(): Promise<void> {
    try {
      const buf = typeof this.input === "string" ? await readFile(this.input) : this.input;
      this.zip = await JSZip.loadAsync(buf);
    } catch {
      throw new Error("Invalid/missing file");
    }

    if (!Object.keys(this.zip.files).length) {
      throw new Error("No files in archive");
    }
  }

  /**
   * Locate the `mimetype` entry (name matched case-insensitively) and verify
   * that its content is "application/epub+zip".
   */
  private async _checkMimeType(): Promise<void> {
    for (const name of Object.keys(this.zip.files)) {
      if (name.toLowerCase() === "mimetype") {
        this.mimeFile = name;
        break;
      }
    }
    if (!this.mimeFile) {
      throw new Error("No mimetype file in archive");
    }

    const data = await this._readFile(this.mimeFile);
    const txt = data.toString("utf-8").toLowerCase().trim();
    if (txt !== "application/epub+zip") {
      throw new Error("Unsupported mime type");
    }
  }

  /**
   * Locate `META-INF/container.xml` (name matched case-insensitively) and
   * take the first rootfile whose media type is
   * "application/oebps-package+xml" as the package document.
   */
  private async _getRootFiles(): Promise<void> {
    for (const name of Object.keys(this.zip.files)) {
      if (name.toLowerCase() === "meta-inf/container.xml") {
        this.containerFile = name;
        break;
      }
    }
    if (!this.containerFile) {
      throw new Error("No container file in archive");
    }

    const data = await this._readFile(this.containerFile);
    const xml = data.toString("utf-8").trim();
    const result = parseXml(xml);

    const rootfiles = result.rootfiles as Record<string, unknown> | undefined;
    if (!rootfiles || !rootfiles.rootfile) {
      throw new Error("No rootfiles found");
    }

    for (const rf of asArray(rootfiles.rootfile) as Record<string, unknown>[]) {
      if (
        String(rf["@_media-type"]).toLowerCase() === "application/oebps-package+xml" &&
        rf["@_full-path"]
      ) {
        this.rootFile = String(rf["@_full-path"]);
        break;
      }
    }
    if (!this.rootFile) {
      throw new Error("Rootfile not found from archive");
    }
  }

  /** Read and parse the package document found by `_getRootFiles`. */
  private async _handleRootFile(): Promise<Record<string, unknown>> {
    const data = await this._readFile(this.rootFile as string);
    const xml = data.toString("utf-8");
    return parseXml(xml);
  }

  /**
   * Dispatch the children of the package element to their section parsers.
   * Element names are matched by local name (namespace prefix removed),
   * case-insensitively.
   *
   * NOTE: sections are handled in document order, so the spine is resolved
   * against whatever manifest entries have been parsed before it.
   */
  private _parseRootFile(rootfile: Record<string, unknown>): void {
    this.version = String(rootfile["@_version"] || "2.0");

    for (const fullKey of Object.keys(rootfile)) {
      if (fullKey.startsWith("@_")) {
        continue;
      }
      const key = (fullKey.split(":").pop() || "").toLowerCase().trim();
      const section = rootfile[fullKey] as Record<string, unknown>;
      switch (key) {
        case "metadata":
          this._parseMetadata(section);
          break;
        case "manifest":
          this._parseManifest(section);
          break;
        case "spine":
          this._parseSpine(section);
          break;
        case "guide":
          this._parseGuide(section);
          break;
      }
    }
  }

  /**
   * Fill `this.metadata` from the `<metadata>` section: the known Dublin Core
   * style fields first, then every `<meta>` element (by `name`/`content`, or
   * by `property`/text).
   */
  private _parseMetadata(metadata: Record<string, unknown>): void {
    const out = this.metadata as Record<string, unknown>;

    for (const fullKey of Object.keys(metadata)) {
      if (fullKey.startsWith("@_")) {
        continue;
      }
      const metadataValue = metadata[fullKey];
      // When an element is repeated, only its first occurrence is used for
      // the single-valued fields below.
      const first: unknown = Array.isArray(metadataValue) ? metadataValue[0] : metadataValue;
      const key = (fullKey.split(":").pop() || "").toLowerCase().trim();

      switch (key) {
        case "publisher":
        case "title":
        case "description":
        case "date": {
          out[key] = textOf(first);
          break;
        }
        case "language": {
          this.metadata.language = textOf(first).toLowerCase();
          break;
        }
        case "subject": {
          const subjects = asArray(metadataValue);
          if (subjects.length === 0) {
            this.metadata.subject = "";
          } else {
            this.metadata.subjects = subjects.map((v) => textOf(v));
            this.metadata.subject = this.metadata.subjects[0] ?? "";
          }
          break;
        }
        case "creator": {
          this.metadata.creator = textOf(first);
          const fileAs =
            typeof first === "object" &&
            first !== null &&
            (first as Record<string, unknown>)["@_opf:file-as"];
          // Fall back to the display name when no file-as attribute exists.
          this.metadata.creatorFileAs = String(fileAs || this.metadata.creator).trim();
          break;
        }
        case "identifier": {
          for (const v of asArray(metadataValue)) {
            extractIdentifiers(v, this.metadata);
          }
          break;
        }
        case "source": {
          const sources = asArray(metadataValue);
          this.metadata.source = sources.length > 0 ? textOf(sources[0]) : "";
          break;
        }
      }
    }

    for (const meta of asArray(metadata.meta) as Record<string, unknown>[]) {
      const name = meta["@_name"] as string | undefined;
      const content = meta["@_content"] as string | undefined;
      const property = meta["@_property"] as string | undefined;

      if (name) {
        out[name] = content;
      }
      if (meta["#text"] && property) {
        out[property] = meta["#text"];
      }
    }
  }

  /**
   * Index manifest items by id. An href that does not already start with the
   * package document's directory is prefixed with it, so hrefs become
   * archive-relative paths.
   */
  private _parseManifest(manifest: Record<string, unknown>): void {
    const path = (this.rootFile as string).split("/");
    path.pop();
    const pathStr = path.join("/");

    for (const item of asArray(manifest.item) as Record<string, unknown>[]) {
      const element = attrsOf(item) as unknown as ManifestItem;

      if (element.href && element.href.substring(0, pathStr.length) !== pathStr) {
        element.href = path.concat([element.href]).join("/");
      }
      if (element.id) {
        this.manifest[element.id] = element;
      }
    }
  }

  /** Collect guide references, resolving hrefs the same way as the manifest. */
  private _parseGuide(guide: Record<string, unknown>): void {
    const path = (this.rootFile as string).split("/");
    path.pop();
    const pathStr = path.join("/");

    for (const ref of asArray(guide.reference) as Record<string, unknown>[]) {
      const element = attrsOf(ref);

      if (element.href && element.href.substring(0, pathStr.length) !== pathStr) {
        element.href = path.concat([element.href]).join("/");
      }
      this.guide.push(element);
    }
  }

  /**
   * Resolve the spine: the `toc` attribute and each `itemref` are looked up
   * in the manifest; unknown ids are skipped. `flow` aliases the resulting
   * content list.
   */
  private _parseSpine(spine: Record<string, unknown>): void {
    const toc = spine["@_toc"] as string | undefined;
    if (toc) {
      this.spine.toc = this.manifest[toc] || false;
    }

    for (const itemref of asArray(spine.itemref) as Record<string, unknown>[]) {
      const idref = itemref["@_idref"] as string | undefined;
      if (idref) {
        const element = this.manifest[idref];
        if (element) {
          this.spine.contents.push(element);
        }
      }
    }
    this.flow = this.spine.contents;
  }

  /**
   * Read the TOC document referenced by the spine and build `this.toc` from
   * its navMap.
   *
   * @throws Error when the TOC document is missing or is not valid XML.
   */
  private async _parseTOC(): Promise<void> {
    const tocHref = (this.spine.toc as ManifestItem).href;
    const path = tocHref.split("/");
    path.pop();

    // Reverse index: archive path -> manifest id.
    const idList: Record<string, string> = {};
    for (const key of Object.keys(this.manifest)) {
      idList[this.manifest[key].href as string] = key;
    }

    const data = await this._readFile(tocHref);
    const xml = data.toString("utf-8");

    let result: Record<string, unknown>;
    try {
      result = parseXml(xml);
    } catch (err) {
      throw new Error(
        "Parsing container XML failed in TOC: " +
          (err instanceof Error ? err.message : String(err)),
      );
    }

    const navMap = result.navMap as Record<string, unknown> | undefined;
    if (navMap?.navPoint) {
      this.toc = this.walkNavMap(navMap.navPoint as Record<string, unknown>[], path, idList);
    }
  }

  /**
   * Flatten a navPoint tree into a list of TOC entries (depth-first,
   * parent before children).
   *
   * @param branch - One navPoint or an array of navPoints.
   * @param path - Directory segments of the TOC document; navPoint sources
   *   are resolved against it.
   * @param idList - Map from archive path to manifest id.
   * @param level - Nesting depth of `branch`; recursion stops below level 7.
   * @returns TOC entries for `branch` and its descendants.
   */
  walkNavMap(
    branch: Record<string, unknown> | Record<string, unknown>[],
    path: string[],
    idList: Record<string, string>,
    level: number = 0,
  ): TocElement[] {
    if (level > 7) {
      return [];
    }

    const output: TocElement[] = [];
    const items = Array.isArray(branch) ? branch : [branch];

    for (const item of items) {
      const navLabel = item.navLabel as Record<string, unknown> | undefined;
      if (navLabel) {
        // textOf also handles labels that are not plain strings (a <text>
        // element carrying attributes parses to an object with "#text").
        const title = textOf(navLabel.text);

        let order = Number(item["@_playOrder"] || 0);
        if (isNaN(order)) {
          order = 0;
        }

        let href = "";
        const content = item.content as Record<string, unknown> | undefined;
        const src = content?.["@_src"];
        if (typeof src === "string") {
          href = src.trim();
        }

        let element: TocElement = { level, order, title, id: "", href: "" };

        // Entries without a target are not listed.
        if (href) {
          href = path.concat([href]).join("/");
          element.href = href;

          if (idList[element.href]) {
            // NOTE: the manifest item itself is reused and annotated, so the
            // same object is shared between `manifest`, `flow` and `toc`.
            element = this.manifest[idList[element.href]] as ManifestItem & TocElement;
            element.title = title;
            element.order = order;
            element.level = level;
          } else {
            element.href = href;
            element.id = String(item["@_id"] || "").trim();
          }
          output.push(element);
        }
      }

      if (item.navPoint) {
        output.push(
          ...this.walkNavMap(item.navPoint as Record<string, unknown>[], path, idList, level + 1),
        );
      }
    }
    return output;
  }

  /**
   * Return a chapter's `<body>` content prepared for embedding: script and
   * style blocks removed, `on*` attributes renamed to `skip-on*`, and
   * `src`/`href` values that point at manifest entries rewritten to
   * `imageroot`/`linkroot` URLs.
   *
   * @param id - Manifest id of the chapter.
   * @throws Error when the id is unknown or is not an XHTML/SVG document.
   */
  async getChapter(id: string): Promise<string> {
    const str = await this.getChapterRaw(id);

    const path = (this.rootFile as string).split("/");
    path.pop();
    const items = Object.keys(this.manifest).map((k) => this.manifest[k]);

    // remove linebreaks (no multi line matches in JS regex!)
    let s = str.replace(/\r?\n/g, "\u0000");

    // keep only <body> contents
    const body = /<body[^>]*?>(.*)<\/body[^>]*?>/i.exec(s);
    if (body) {
      s = (body[1] ?? "").trim();
    }

    // remove <script> blocks
    s = s.replace(/<script[^>]*?>(.*?)<\/script[^>]*?>/gi, () => "");

    // remove <style> blocks
    s = s.replace(/<style[^>]*?>(.*?)<\/style[^>]*?>/gi, () => "");

    // remove onEvent handlers
    // NOTE(review): only lower-case "on..." attribute names are matched.
    s = s.replace(
      /(\s)(on\w+)(\s*=\s*["']?[^"'\s>]*?["'\s>])/g,
      (_o: string, a: string, b: string, c: string) => a + "skip-" + b + c,
    );

    // replace images
    s = s.replace(
      /(\ssrc\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g,
      (_o: string, a: string, b: string, c: string) => {
        const img = path.concat([b]).join("/").trim();
        const element = items.find((item) => item.href === img);

        // include only images from manifest; anything else loses its src
        if (element) {
          return a + this.imageroot + element.id + "/" + img + c;
        }
        return "";
      },
    );

    // replace links
    s = s.replace(
      /(\shref\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g,
      (_o: string, a: string, b: string, c: string) => {
        const linkparts = b ? b.split("#") : [];
        let link = linkparts.length
          ? path
              .concat([linkparts.shift() || ""])
              .join("/")
              .trim()
          : "";

        // Match on the document part only; the fragment is re-attached below.
        const element = items.find((item) => (item.href as string).split("#")[0] === link);

        if (linkparts.length) {
          link += "#" + linkparts.join("#");
        }

        if (element) {
          return a + this.linkroot + element.id + "/" + link + c;
        }
        // Links that do not resolve to a manifest entry are left untouched.
        return a + b + c;
      },
    );

    // bring back linebreaks
    // eslint-disable-next-line no-control-regex
    s = s.replace(/\u0000/g, "\n").trim();

    return s;
  }

  /**
   * Return a chapter's unmodified text.
   *
   * @param id - Manifest id of the chapter.
   * @throws Error "File not found" for an unknown id, or "Invalid mime type
   *   for chapter" when the item is neither XHTML nor SVG.
   */
  async getChapterRaw(id: string): Promise<string> {
    if (!this.manifest[id]) {
      throw new Error("File not found");
    }

    // Normalised the same way as in getImage so that case or stray
    // whitespace in the manifest does not reject a valid chapter.
    const mediaType = (this.manifest[id]["media-type"] || "").toLowerCase().trim();
    if (mediaType !== "application/xhtml+xml" && mediaType !== "image/svg+xml") {
      throw new Error("Invalid mime type for chapter");
    }

    const data = await this._readFile(this.manifest[id].href);
    return data ? data.toString("utf-8") : "";
  }

  /**
   * Return an image's bytes and media type.
   *
   * @param id - Manifest id of the image.
   * @throws Error "File not found" for an unknown id, or "Invalid mime type
   *   for image" when the item's media type does not start with "image/".
   */
  async getImage(id: string): Promise<{ data: Buffer; mimeType: string }> {
    if (!this.manifest[id]) {
      throw new Error("File not found");
    }

    const mediaType = (this.manifest[id]["media-type"] || "").toLowerCase().trim();
    if (!mediaType.startsWith("image/")) {
      throw new Error("Invalid mime type for image");
    }

    return this.getFile(id);
  }

  /**
   * Return any manifest item's bytes and media type.
   *
   * @param id - Manifest id of the item.
   * @throws Error "File not found" for an unknown id.
   */
  async getFile(id: string): Promise<{ data: Buffer; mimeType: string }> {
    if (!this.manifest[id]) {
      throw new Error("File not found");
    }

    const data = await this._readFile(this.manifest[id].href);
    return { data, mimeType: this.manifest[id]["media-type"] };
  }

  /**
   * Read an archive entry by its path inside the archive.
   *
   * @param filename - Entry path inside the archive.
   * @param encoding - When given, the content is decoded to a string.
   * @returns A string when `encoding` is given, otherwise a Buffer.
   */
  async readFile(filename: string, encoding?: BufferEncoding): Promise<Buffer | string> {
    const data = await this._readFile(filename);
    if (encoding) {
      return data.toString(encoding);
    }
    return data;
  }

  /**
   * Report whether the archive contains `META-INF/encryption.xml`. The name
   * is matched case-insensitively, like the mimetype and container lookups.
   */
  hasDRM(): boolean {
    return Object.keys(this.zip.files).some(
      (name) => name.toLowerCase() === "meta-inf/encryption.xml",
    );
  }
}

export default EPub;