/*
 * epub
 * Version:
 * Parse EPUB electronic book files with Node.js
 * 507 lines (506 loc) • 17.7 kB
 * JavaScript
 */
import { readFile } from "node:fs/promises";
import { XMLParser, XMLValidator } from "fast-xml-parser";
import JSZip from "jszip";
/**
 * Shared fast-xml-parser instance used for every XML document read from the
 * archive.
 *
 * - Attributes are kept and exposed under "@_"-prefixed keys, which is the
 *   shape the rest of this file reads (e.g. "@_id", "@_full-path").
 * - The `<?xml ...?>` declaration is dropped so that it does not appear as an
 *   extra top-level key next to the root element.
 * - Tag values are kept as strings. The parser's default number/boolean
 *   conversion corrupts textual data: an identifier such as "0306406152"
 *   loses its leading zero and a title such as "1984" or "true" stops being
 *   a string.
 */
const xmlParser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    ignoreDeclaration: true,
    parseTagValue: false,
});
/**
 * Validate and parse an XML string, returning the contents of its root element.
 *
 * @param {string} xml - XML document text.
 * @returns {*} The parsed value of the root element (equivalent to xml2js
 *   `explicitRoot: false`). When the parse result does not contain exactly one
 *   root element, the raw parse result is returned unchanged.
 * @throws {Error} When the validator rejects the document; the message carries
 *   the validator's message, line and column.
 */
function parseXml(xml) {
    const validation = XMLValidator.validate(xml);
    if (validation !== true) {
        const e = validation.err;
        throw new Error(`${e.msg}\nLine: ${e.line}\nColumn: ${e.col}\nChar: `);
    }
    const raw = xmlParser.parse(xml);
    // Processing instructions (e.g. <?xml-stylesheet ...?>) come back from the
    // parser as sibling keys whose name starts with "?". They must not stop
    // the single real root element from being unwrapped.
    const rootKeys = Object.keys(raw).filter((key) => !key.startsWith("?"));
    if (rootKeys.length === 1) {
        return raw[rootKeys[0]];
    }
    return raw;
}
/**
 * Extract the text content of a parsed XML value.
 *
 * @param {*} val - A parsed value: a primitive (element with text only) or an
 *   object carrying the text under "#text" (element with attributes).
 * @returns {string} Trimmed text for strings and "#text" objects, the string
 *   form of numbers and booleans, and "" for anything else (null, undefined,
 *   objects without "#text").
 */
function textOf(val) {
    if (val == null) {
        return "";
    }
    switch (typeof val) {
        case "string":
            return val.trim();
        case "number":
        // A bare boolean is stringified exactly like a boolean found under
        // "#text", instead of being silently turned into "".
        case "boolean":
            return String(val);
        case "object":
            return "#text" in val ? String(val["#text"] ?? "").trim() : "";
        default:
            return "";
    }
}
/**
 * Collect the attributes of a parsed element.
 *
 * @param {object} obj - Parsed element whose attributes live under keys that
 *   start with "@_".
 * @returns {Object<string, string>} A new object holding every attribute with
 *   the "@_" prefix removed and the value converted to a string; keys without
 *   the prefix are ignored.
 */
function attrsOf(obj) {
    const PREFIX = "@_";
    return Object.entries(obj).reduce((attrs, [name, value]) => {
        if (name.startsWith(PREFIX)) {
            attrs[name.slice(PREFIX.length)] = String(value);
        }
        return attrs;
    }, {});
}
/**
 * Normalise a value to an array.
 *
 * @param {*} val - Any value.
 * @returns {Array} `val` itself when it already is an array, an empty array
 *   for null/undefined, otherwise a one-element array wrapping `val`.
 */
function asArray(val) {
    if (Array.isArray(val)) {
        return val;
    }
    return val === null || val === undefined ? [] : [val];
}
/**
 * Record one parsed identifier element on the metadata object.
 *
 * An element with an "opf:scheme" attribute is stored under that scheme name.
 * Otherwise, an element whose "id" attribute contains "uuid" (any case) is
 * stored as `out.UUID`, with a "urn:uuid:" prefix removed and upper-cased.
 * Non-object values (plain-text identifiers without attributes) are ignored.
 *
 * @param {*} val - Parsed identifier value.
 * @param {object} out - Object that receives the identifier; mutated in place.
 * @returns {void}
 */
function extractIdentifiers(val, out) {
    const isElement = val !== null && val !== undefined && typeof val === "object";
    if (!isElement) {
        return;
    }
    const { "@_opf:scheme": scheme, "@_id": id } = val;
    const text = textOf(val);
    if (scheme) {
        out[scheme] = text;
        return;
    }
    if (id && id.match(/uuid/i)) {
        out.UUID = text.replace("urn:uuid:", "").toUpperCase().trim();
    }
}
/**
 * Reader for EPUB archives.
 *
 * Opens the zip container, checks the mimetype entry, locates the package
 * (OPF) file through META-INF/container.xml and fills in `metadata`,
 * `manifest`, `spine`, `guide` and `toc`. After `parse()` has resolved, the
 * getters (`getChapter`, `getImage`, `getFile`, ...) read entries from the
 * archive.
 *
 * Usage: `const book = new EPub(pathOrData); await book.parse();`
 */
export class EPub {
    /** File path (string) or archive data passed straight to JSZip. */
    input;
    /** URL prefix written in front of rewritten image sources; ends with "/". */
    imageroot;
    /** URL prefix written in front of rewritten chapter links; ends with "/". */
    linkroot;
    /** Metadata values collected from the package file. */
    metadata = {};
    /** Manifest items keyed by their id attribute. */
    manifest = {};
    /** Guide references, in document order. */
    guide = [];
    /** Spine: the manifest item of the NCX file (or false) and the reading order. */
    spine = {
        toc: false,
        contents: [],
    };
    /** Same array as `spine.contents`. */
    flow = [];
    /** Flattened table of contents built from the NCX navMap. */
    toc = [];
    /** Package version attribute; "2.0" when absent. */
    version = "2.0";
    /** JSZip instance; set by `_open()`. */
    zip;
    /** Archive entry name of META-INF/container.xml, or false until found. */
    containerFile = false;
    /** Archive entry name of the mimetype file, or false until found. */
    mimeFile = false;
    /** Archive path of the package (OPF) file, or false until found. */
    rootFile = false;

    /**
     * @param {string|*} input - Path of the EPUB file, or archive data that
     *   JSZip can load directly.
     * @param {string} [imageroot="/images/"] - Prefix for rewritten image URLs.
     * @param {string} [linkroot="/links/"] - Prefix for rewritten chapter links.
     */
    constructor(input, imageroot, linkroot) {
        this.input = input;
        this.imageroot = (imageroot || "/images/").trim();
        this.linkroot = (linkroot || "/links/").trim();
        if (!this.imageroot.endsWith("/")) {
            this.imageroot += "/";
        }
        if (!this.linkroot.endsWith("/")) {
            this.linkroot += "/";
        }
    }

    /**
     * Open the archive and populate every parsed field. Previously parsed
     * state is discarded first, so the method can be called again.
     *
     * @returns {Promise<void>}
     * @throws {Error} When the archive cannot be opened, is not an EPUB, or its
     *   container/package/TOC files are missing or malformed.
     */
    async parse() {
        this.containerFile = false;
        this.mimeFile = false;
        this.rootFile = false;
        this.metadata = {};
        this.manifest = {};
        this.guide = [];
        this.spine = { toc: false, contents: [] };
        this.flow = [];
        this.toc = [];
        await this._open();
        await this._checkMimeType();
        await this._getRootFiles();
        const rootfileData = await this._handleRootFile();
        this._parseRootFile(rootfileData);
        if (this.spine.toc) {
            await this._parseTOC();
        }
    }

    /**
     * Read one archive entry as a Node buffer.
     *
     * @param {string} name - Exact entry name inside the archive.
     * @returns {Promise<Buffer>}
     * @throws {Error} "Entry not found: <name>" when the entry does not exist.
     */
    async _readFile(name) {
        const file = this.zip.file(name);
        if (!file) {
            throw new Error(`Entry not found: ${name}`);
        }
        return file.async("nodebuffer");
    }

    /** Find the archive entry whose lower-cased name equals `lowerName`. */
    _findEntryName(lowerName) {
        return Object.keys(this.zip.files).find((name) => name.toLowerCase() === lowerName);
    }

    /** Path segments of the directory that contains the package file. */
    _rootDirSegments() {
        return this.rootFile.split("/").slice(0, -1);
    }

    /**
     * Load the archive into `this.zip`.
     *
     * @throws {Error} "Invalid/missing file" (with the underlying failure as
     *   `cause`) or "No files in archive".
     */
    async _open() {
        try {
            const buf = typeof this.input === "string" ? await readFile(this.input) : this.input;
            this.zip = await JSZip.loadAsync(buf);
        }
        catch (err) {
            // Keep the stable message callers know, but do not discard the reason.
            throw new Error("Invalid/missing file", { cause: err });
        }
        if (!Object.keys(this.zip.files).length) {
            throw new Error("No files in archive");
        }
    }

    /**
     * Locate the mimetype entry (name compared case-insensitively) and verify
     * that its content is "application/epub+zip".
     *
     * @throws {Error} "No mimetype file in archive" or "Unsupported mime type".
     */
    async _checkMimeType() {
        const entry = this._findEntryName("mimetype");
        if (entry !== undefined) {
            this.mimeFile = entry;
        }
        if (!this.mimeFile) {
            throw new Error("No mimetype file in archive");
        }
        const data = await this._readFile(this.mimeFile);
        const txt = data.toString("utf-8").toLowerCase().trim();
        if (txt !== "application/epub+zip") {
            throw new Error("Unsupported mime type");
        }
    }

    /**
     * Read META-INF/container.xml (name compared case-insensitively) and store
     * the path of the first rootfile of type application/oebps-package+xml in
     * `this.rootFile`.
     *
     * @throws {Error} When the container file, its rootfiles, or a matching
     *   rootfile is missing.
     */
    async _getRootFiles() {
        const entry = this._findEntryName("meta-inf/container.xml");
        if (entry !== undefined) {
            this.containerFile = entry;
        }
        if (!this.containerFile) {
            throw new Error("No container file in archive");
        }
        const data = await this._readFile(this.containerFile);
        const xml = data.toString("utf-8").trim();
        const result = parseXml(xml);
        const rootfiles = result.rootfiles;
        if (!rootfiles || !rootfiles.rootfile) {
            throw new Error("No rootfiles found");
        }
        for (const rf of asArray(rootfiles.rootfile)) {
            const mediaType = String(rf["@_media-type"]).toLowerCase();
            if (mediaType === "application/oebps-package+xml" && rf["@_full-path"]) {
                this.rootFile = String(rf["@_full-path"]);
                break;
            }
        }
        if (!this.rootFile) {
            throw new Error("Rootfile not found from archive");
        }
    }

    /** Read and parse the package (OPF) file named by `this.rootFile`. */
    async _handleRootFile() {
        const data = await this._readFile(this.rootFile);
        const xml = data.toString("utf-8");
        return parseXml(xml);
    }

    /**
     * Dispatch the sections of the parsed package element to their parsers.
     *
     * Section names are matched on their local name (namespace prefix removed,
     * lower-cased). Sections are processed in a fixed order rather than in
     * document order, because the spine is resolved against the manifest and
     * would come out empty if it happened to precede the manifest in the file.
     *
     * @param {object} rootfile - Parsed package element.
     */
    _parseRootFile(rootfile) {
        this.version = String(rootfile["@_version"] || "2.0");
        const sections = new Map([
            ["metadata", []],
            ["manifest", []],
            ["spine", []],
            ["guide", []],
        ]);
        for (const fullKey of Object.keys(rootfile)) {
            if (fullKey.startsWith("@_")) {
                continue;
            }
            const key = (fullKey.split(":").pop() || "").toLowerCase().trim();
            sections.get(key)?.push(rootfile[fullKey]);
        }
        for (const section of sections.get("metadata")) {
            this._parseMetadata(section);
        }
        for (const section of sections.get("manifest")) {
            this._parseManifest(section);
        }
        for (const section of sections.get("spine")) {
            this._parseSpine(section);
        }
        for (const section of sections.get("guide")) {
            this._parseGuide(section);
        }
    }

    /**
     * Fill `this.metadata` from the parsed metadata element.
     *
     * Child elements are matched on their local name. For repeated elements
     * the first occurrence is used, except subjects (all kept in `subjects`)
     * and identifiers (each handed to `extractIdentifiers`). `meta` children
     * contribute `name`/`content` pairs and `property`/text pairs.
     *
     * @param {object} metadata - Parsed metadata element.
     */
    _parseMetadata(metadata) {
        for (const fullKey of Object.keys(metadata)) {
            if (fullKey.startsWith("@_")) {
                continue;
            }
            const metadataValue = metadata[fullKey];
            const first = Array.isArray(metadataValue) ? metadataValue[0] : metadataValue;
            const key = (fullKey.split(":").pop() || "").toLowerCase().trim();
            switch (key) {
                case "publisher":
                case "title":
                case "description":
                case "date": {
                    this.metadata[key] = textOf(first);
                    break;
                }
                case "language": {
                    this.metadata.language = textOf(first).toLowerCase();
                    break;
                }
                case "subject": {
                    const subjects = asArray(metadataValue);
                    if (subjects.length === 0) {
                        this.metadata.subject = "";
                    }
                    else {
                        this.metadata.subjects = subjects.map((v) => textOf(v));
                        this.metadata.subject = this.metadata.subjects[0] ?? "";
                    }
                    break;
                }
                case "creator": {
                    const creator = textOf(first);
                    const fileAs = typeof first === "object" && first != null && first["@_opf:file-as"];
                    this.metadata.creator = creator;
                    // Fall back to the display name when no sort name is given.
                    this.metadata.creatorFileAs = String(fileAs || creator).trim();
                    break;
                }
                case "identifier": {
                    for (const v of asArray(metadataValue)) {
                        extractIdentifiers(v, this.metadata);
                    }
                    break;
                }
                case "source": {
                    const sources = asArray(metadataValue);
                    this.metadata.source = sources.length > 0 ? textOf(sources[0]) : "";
                    break;
                }
            }
        }
        for (const meta of asArray(metadata.meta)) {
            const name = meta["@_name"];
            const content = meta["@_content"];
            const property = meta["@_property"];
            if (name) {
                this.metadata[name] = content;
            }
            if (meta["#text"] && property) {
                this.metadata[property] = meta["#text"];
            }
        }
    }

    /**
     * Fill `this.manifest` with the attributes of every manifest item that has
     * an id. An href that does not already start with the package file's
     * directory is prefixed with it, making it an archive path.
     *
     * @param {object} manifest - Parsed manifest element.
     */
    _parseManifest(manifest) {
        const dirSegments = this._rootDirSegments();
        const dir = dirSegments.join("/");
        for (const item of asArray(manifest.item)) {
            const element = attrsOf(item);
            if (element.href && !element.href.startsWith(dir)) {
                element.href = dirSegments.concat([element.href]).join("/");
            }
            if (element.id) {
                this.manifest[element.id] = element;
            }
        }
    }

    /**
     * Append the attributes of every guide reference to `this.guide`, applying
     * the same href prefixing as the manifest.
     *
     * @param {object} guide - Parsed guide element.
     */
    _parseGuide(guide) {
        const dirSegments = this._rootDirSegments();
        const dir = dirSegments.join("/");
        for (const ref of asArray(guide.reference)) {
            const element = attrsOf(ref);
            if (element.href && !element.href.startsWith(dir)) {
                element.href = dirSegments.concat([element.href]).join("/");
            }
            this.guide.push(element);
        }
    }

    /**
     * Resolve the spine against the manifest: `spine.toc` becomes the manifest
     * item named by the toc attribute (or false), and every itemref with a
     * known idref is appended to `spine.contents`. `flow` is set to the same
     * array.
     *
     * @param {object} spine - Parsed spine element.
     */
    _parseSpine(spine) {
        const toc = spine["@_toc"];
        if (toc) {
            this.spine.toc = this.manifest[toc] || false;
        }
        for (const itemref of asArray(spine.itemref)) {
            const idref = itemref["@_idref"];
            const element = idref ? this.manifest[idref] : undefined;
            if (element) {
                this.spine.contents.push(element);
            }
        }
        this.flow = this.spine.contents;
    }

    /**
     * Read the NCX file referenced by `spine.toc` and build `this.toc` from its
     * navMap. Does nothing when the toc item has no href.
     *
     * @throws {Error} When the NCX entry is missing from the archive or its XML
     *   is invalid (the parse failure is attached as `cause`).
     */
    async _parseTOC() {
        const tocHref = this.spine.toc?.href;
        if (!tocHref) {
            // A toc item without an href cannot be located in the archive.
            return;
        }
        const path = tocHref.split("/");
        path.pop();
        // Reverse lookup: archive path -> manifest id.
        const idList = {};
        for (const key of Object.keys(this.manifest)) {
            idList[this.manifest[key].href] = key;
        }
        const data = await this._readFile(tocHref);
        const xml = data.toString("utf-8");
        let result;
        try {
            result = parseXml(xml);
        }
        catch (err) {
            throw new Error("Parsing container XML failed in TOC: " +
                (err instanceof Error ? err.message : String(err)), { cause: err });
        }
        const navMap = result.navMap;
        if (navMap?.navPoint) {
            this.toc = this.walkNavMap(navMap.navPoint, path, idList);
        }
    }

    /**
     * Flatten a navMap branch into a list of TOC entries (depth-first, parents
     * before children).
     *
     * A navPoint produces an entry only when it has a label and a content src.
     * When the resolved href belongs to a manifest item, that manifest object
     * itself is annotated with `title`, `order` and `level` and reused as the
     * entry; otherwise a stand-alone entry is created.
     *
     * @param {object|object[]} branch - One navPoint or an array of navPoints.
     * @param {string[]} path - Directory segments of the NCX file.
     * @param {Object<string, string>} idList - Archive path to manifest id.
     * @param {number} [level=0] - Nesting depth; branches deeper than 7 are dropped.
     * @returns {object[]} TOC entries.
     */
    walkNavMap(branch, path, idList, level = 0) {
        if (level > 7) {
            return [];
        }
        const output = [];
        const items = Array.isArray(branch) ? branch : [branch];
        for (const item of items) {
            // A navPoint may carry several labels; use the first one.
            const navLabel = Array.isArray(item.navLabel) ? item.navLabel[0] : item.navLabel;
            if (navLabel) {
                // The label text is not always a JS string: the parser may hand
                // back a number ("1", "1984") or an object with "#text".
                const title = textOf(navLabel.text);
                let order = Number(item["@_playOrder"] || 0);
                if (Number.isNaN(order)) {
                    order = 0;
                }
                const src = item.content?.["@_src"];
                const relativeHref = typeof src === "string" ? src.trim() : "";
                if (relativeHref) {
                    const href = path.concat([relativeHref]).join("/");
                    const manifestId = idList[href];
                    const known = manifestId ? this.manifest[manifestId] : undefined;
                    if (known) {
                        known.title = title;
                        known.order = order;
                        known.level = level;
                        output.push(known);
                    }
                    else {
                        output.push({
                            level,
                            order,
                            title,
                            id: String(item["@_id"] || "").trim(),
                            href,
                        });
                    }
                }
            }
            if (item.navPoint) {
                output.push(...this.walkNavMap(item.navPoint, path, idList, level + 1));
            }
        }
        return output;
    }

    /**
     * Return the body of a chapter as cleaned-up markup.
     *
     * Only the contents of <body> are kept, <script> and <style> blocks are
     * removed, on* event attributes are renamed to "skip-on*", `src` values are
     * rewritten to `imageroot + id + "/" + path` (a `src` that is not in the
     * manifest is removed) and `href` values pointing at manifest items are
     * rewritten to `linkroot + id + "/" + path`.
     *
     * @param {string} id - Manifest id of the chapter.
     * @returns {Promise<string>}
     * @throws {Error} See `getChapterRaw`.
     */
    async getChapter(id) {
        const str = await this.getChapterRaw(id);
        const path = this._rootDirSegments();
        // Index the manifest once instead of scanning it for every attribute.
        // The first item wins on duplicates; items without an href are skipped
        // because they can never match.
        const byHref = new Map();
        const byHrefWithoutFragment = new Map();
        for (const entry of Object.values(this.manifest)) {
            const entryHref = entry?.href;
            if (typeof entryHref !== "string") {
                continue;
            }
            if (!byHref.has(entryHref)) {
                byHref.set(entryHref, entry);
            }
            const bare = entryHref.split("#")[0];
            if (!byHrefWithoutFragment.has(bare)) {
                byHrefWithoutFragment.set(bare, entry);
            }
        }
        // "." does not match line breaks, so hide them behind NUL while the
        // regexes below run and restore them at the end.
        let s = str.replace(/\r?\n/g, "\u0000");
        // keep only <body> contents
        const body = /<body[^>]*?>(.*)<\/body[^>]*?>/i.exec(s);
        if (body) {
            s = body[1].trim();
        }
        // remove <script> blocks
        s = s.replace(/<script[^>]*?>(.*?)<\/script[^>]*?>/gi, () => "");
        // remove <style> blocks
        s = s.replace(/<style[^>]*?>(.*?)<\/style[^>]*?>/gi, () => "");
        // neutralise onEvent handlers; case-insensitive so ONCLICK= etc. cannot slip through
        s = s.replace(/(\s)(on\w+)(\s*=\s*["']?[^"'\s>]*?["'\s>])/gi, (_o, a, b, c) => {
            return a + "skip-" + b + c;
        });
        // replace images
        s = s.replace(/(\ssrc\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (_o, a, b, c) => {
            const img = path.concat([b]).join("/").trim();
            const element = byHref.get(img);
            // include only sources that are listed in the manifest
            return element ? a + this.imageroot + element.id + "/" + img + c : "";
        });
        // replace links
        s = s.replace(/(\shref\s*=\s*["']?)([^"'\s>]*?)(["'\s>])/g, (_o, a, b, c) => {
            const linkparts = b ? b.split("#") : [];
            let link = linkparts.length
                ? path
                    .concat([linkparts.shift() || ""])
                    .join("/")
                    .trim()
                : "";
            const element = byHrefWithoutFragment.get(link);
            if (linkparts.length) {
                link += "#" + linkparts.join("#");
            }
            if (element) {
                return a + this.linkroot + element.id + "/" + link + c;
            }
            return a + b + c;
        });
        // bring back linebreaks
        // eslint-disable-next-line no-control-regex
        s = s.replace(/\u0000/g, "\n").trim();
        return s;
    }

    /**
     * Return the unmodified text of a chapter.
     *
     * @param {string} id - Manifest id.
     * @returns {Promise<string>} UTF-8 decoded file contents.
     * @throws {Error} "File not found" for an unknown id, "Invalid mime type
     *   for chapter" unless the item is XHTML or SVG.
     */
    async getChapterRaw(id) {
        if (!this.manifest[id]) {
            throw new Error("File not found");
        }
        // Normalised the same way getImage() does, so case or padding in the
        // manifest's media-type attribute does not reject a valid chapter.
        const mediaType = (this.manifest[id]["media-type"] || "").toLowerCase().trim();
        if (mediaType !== "application/xhtml+xml" && mediaType !== "image/svg+xml") {
            throw new Error("Invalid mime type for chapter");
        }
        const data = await this._readFile(this.manifest[id].href);
        return data ? data.toString("utf-8") : "";
    }

    /**
     * Return an image from the manifest.
     *
     * @param {string} id - Manifest id.
     * @returns {Promise<{data: Buffer, mimeType: string}>}
     * @throws {Error} "File not found" or "Invalid mime type for image".
     */
    async getImage(id) {
        if (!this.manifest[id]) {
            throw new Error("File not found");
        }
        const mediaType = (this.manifest[id]["media-type"] || "").toLowerCase().trim();
        if (!mediaType.startsWith("image/")) {
            throw new Error("Invalid mime type for image");
        }
        return this.getFile(id);
    }

    /**
     * Return any manifest item together with its declared media type.
     *
     * @param {string} id - Manifest id.
     * @returns {Promise<{data: Buffer, mimeType: string}>}
     * @throws {Error} "File not found" for an unknown id.
     */
    async getFile(id) {
        if (!this.manifest[id]) {
            throw new Error("File not found");
        }
        const data = await this._readFile(this.manifest[id].href);
        return { data, mimeType: this.manifest[id]["media-type"] };
    }

    /**
     * Read an archive entry by its exact name.
     *
     * @param {string} filename - Entry name inside the archive.
     * @param {string} [encoding] - When given, the data is decoded to a string.
     * @returns {Promise<Buffer|string>}
     */
    async readFile(filename, encoding) {
        const data = await this._readFile(filename);
        if (encoding) {
            return data.toString(encoding);
        }
        return data;
    }

    /**
     * Report whether the archive contains a META-INF/encryption.xml entry.
     *
     * @returns {boolean}
     */
    hasDRM() {
        return this.zip.file("META-INF/encryption.xml") !== null;
    }
}
export default EPub;