@lesjoursfr/html-to-epub
Version:
A library to generate EPUB from HTML. Inspired by epub-gen.
762 lines (761 loc) • 26 kB
JavaScript
import archiver from "archiver";
import axios from "axios";
import { remove as diacritics } from "diacritics";
import { renderFile } from "ejs";
import { encodeXML } from "entities";
import { createReadStream, createWriteStream, existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync, } from "fs";
import fsExtra from "fs-extra";
import { imageSizeFromFile } from "image-size/fromFile";
import mime from "mime";
import { basename, dirname, resolve } from "path";
import rehypeParse from "rehype-parse";
import rehypeStringify from "rehype-stringify";
import { unified } from "unified";
import { visit } from "unist-util-visit";
import { fileURLToPath } from "url";
import uslug from "uslug";
// Allowed HTML attributes & tags
export const defaultAllowedAttributes = [
"content",
"alt",
"id",
"title",
"src",
"href",
"about",
"accesskey",
"aria-activedescendant",
"aria-atomic",
"aria-autocomplete",
"aria-busy",
"aria-checked",
"aria-controls",
"aria-describedat",
"aria-describedby",
"aria-disabled",
"aria-dropeffect",
"aria-expanded",
"aria-flowto",
"aria-grabbed",
"aria-haspopup",
"aria-hidden",
"aria-invalid",
"aria-label",
"aria-labelledby",
"aria-level",
"aria-live",
"aria-multiline",
"aria-multiselectable",
"aria-orientation",
"aria-owns",
"aria-posinset",
"aria-pressed",
"aria-readonly",
"aria-relevant",
"aria-required",
"aria-selected",
"aria-setsize",
"aria-sort",
"aria-valuemax",
"aria-valuemin",
"aria-valuenow",
"aria-valuetext",
"className",
"content",
"contenteditable",
"contextmenu",
"controls",
"datatype",
"dir",
"draggable",
"dropzone",
"hidden",
"hreflang",
"id",
"inlist",
"itemid",
"itemref",
"itemscope",
"itemtype",
"lang",
"media",
"ns1:type",
"ns2:alphabet",
"ns2:ph",
"onabort",
"onblur",
"oncanplay",
"oncanplaythrough",
"onchange",
"onclick",
"oncontextmenu",
"ondblclick",
"ondrag",
"ondragend",
"ondragenter",
"ondragleave",
"ondragover",
"ondragstart",
"ondrop",
"ondurationchange",
"onemptied",
"onended",
"onerror",
"onfocus",
"oninput",
"oninvalid",
"onkeydown",
"onkeypress",
"onkeyup",
"onload",
"onloadeddata",
"onloadedmetadata",
"onloadstart",
"onmousedown",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onmousewheel",
"onpause",
"onplay",
"onplaying",
"onprogress",
"onratechange",
"onreadystatechange",
"onreset",
"onscroll",
"onseeked",
"onseeking",
"onselect",
"onshow",
"onstalled",
"onsubmit",
"onsuspend",
"ontimeupdate",
"onvolumechange",
"onwaiting",
"prefix",
"property",
"rel",
"resource",
"rev",
"role",
"spellcheck",
"style",
"tabindex",
"target",
"title",
"type",
"typeof",
"vocab",
"xml:base",
"xml:lang",
"xml:space",
"colspan",
"rowspan",
"epub:type",
"epub:prefix",
];
export const defaultAllowedXhtml11Tags = [
"div",
"p",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"ul",
"ol",
"li",
"dl",
"dt",
"dd",
"address",
"hr",
"pre",
"blockquote",
"center",
"ins",
"del",
"a",
"span",
"bdo",
"br",
"em",
"strong",
"dfn",
"code",
"samp",
"kbd",
"bar",
"cite",
"abbr",
"acronym",
"q",
"sub",
"sup",
"tt",
"i",
"b",
"big",
"small",
"u",
"s",
"strike",
"basefont",
"font",
"object",
"param",
"img",
"table",
"caption",
"colgroup",
"col",
"thead",
"tfoot",
"tbody",
"tr",
"th",
"td",
"embed",
"applet",
"iframe",
"img",
"map",
"noscript",
"ns:svg",
"object",
"script",
"table",
"tt",
"var",
];
// UUID generation
function uuid() {
return "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx".replace(/[xy]/g, (c) => {
const r = (Math.random() * 16) | 0;
return (c === "x" ? r : (r & 0x3) | 0x8).toString(16);
});
}
// Current directory
const __dirname = fileURLToPath(new URL(".", import.meta.url));
export class EPub {
uuid;
title;
description;
cover;
collections;
useFirstImageAsCover;
downloadAudioVideoFiles;
coverMediaType;
coverExtension;
coverDimensions = {
width: 0,
height: 0,
};
publisher;
author;
tocTitle;
appendChapterTitles;
showToC;
date;
lang;
css;
fonts;
content;
images;
audioVideo;
customOpfTemplatePath;
customNcxTocTemplatePath;
customHtmlCoverTemplatePath;
customHtmlTocTemplatePath;
version;
userAgent;
verbose;
tempDir;
tempEpubDir;
output;
allowedAttributes;
allowedXhtml11Tags;
coverMetaContent;
startOfContentHref;
constructor(options, output) {
// File ID
this.uuid = uuid();
// Required options
this.title = options.title;
this.description = options.description;
this.output = output;
// Options with defaults
this.cover = options.cover ?? null;
this.useFirstImageAsCover = options.useFirstImageAsCover ?? false;
this.publisher = options.publisher ?? "anonymous";
this.author = options.author
? typeof options.author === "string"
? [options.author]
: options.author
: ["anonymous"];
if (this.author.length === 0) {
this.author = ["anonymous"];
}
this.collections = options.collection
? Array.isArray(options.collection)
? options.collection
: [options.collection]
: [];
for (const collection of this.collections) {
if (collection.type && !["series", "set"].includes(collection.type)) {
throw new Error(`Invalid collections: ${this.collections.map((c) => `${c.name}: ${c.type}`).join(", ")}. Allowed types are "series" and "set".`);
}
}
this.tocTitle = options.tocTitle ?? "Table Of Contents";
this.appendChapterTitles = options.appendChapterTitles ?? true;
this.showToC = options.hideToC !== true;
this.date = options.date ?? new Date().toISOString();
this.lang = options.lang ?? "en";
this.css = options.css ?? null;
this.fonts = options.fonts ?? [];
this.customOpfTemplatePath = options.customOpfTemplatePath ?? null;
this.customNcxTocTemplatePath = options.customNcxTocTemplatePath ?? null;
this.customHtmlTocTemplatePath = options.customHtmlTocTemplatePath ?? null;
this.customHtmlCoverTemplatePath = options.customHtmlCoverTemplatePath ?? null;
this.version = options.version ?? 3;
this.downloadAudioVideoFiles = this.version !== 2 ? (options.downloadAudioVideoFiles ?? false) : false; // Disable audio/video downloads for EPUB 2
this.userAgent =
options.userAgent ??
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36";
this.verbose = options.verbose ?? false;
this.allowedAttributes = options.allowedAttributes ?? defaultAllowedAttributes;
this.allowedXhtml11Tags = options.allowedXhtml11Tags ?? defaultAllowedXhtml11Tags;
// Temporary folder for work
this.tempDir = options.tempDir ?? resolve(__dirname, "../tempDir/");
this.tempEpubDir = resolve(this.tempDir, this.uuid);
// Check the cover image
if (this.cover !== null) {
this.coverMediaType = mime.getType(this.cover);
if (this.coverMediaType === null) {
throw new Error(`The cover image can't be processed : ${this.cover}`);
}
this.coverExtension = mime.getExtension(this.coverMediaType);
if (this.coverExtension === null) {
throw new Error(`The cover image can't be processed : ${this.cover}`);
}
}
else {
this.coverMediaType = null;
this.coverExtension = null;
}
const loadHtml = (content, plugins) => unified()
.use(rehypeParse, { fragment: true })
.use(plugins)
// Voids: [] is required for epub generation, and causes little/no harm for non-epub usage
.use(rehypeStringify, { allowDangerousHtml: true, voids: [], collapseBooleanAttributes: false })
.processSync(content)
.toString();
this.images = [];
this.audioVideo = [];
this.content = [];
// Insert cover in content
if (this.cover) {
const templatePath = this.customHtmlCoverTemplatePath || resolve(__dirname, `../templates/epub${this.version}/cover.xhtml.ejs`);
if (!existsSync(templatePath)) {
throw new Error("Could not resolve path to cover template HTML.");
}
this.content.push({
id: `item_${this.content.length}`,
href: "cover.xhtml",
title: "cover",
data: "",
url: null,
author: [],
filePath: resolve(this.tempEpubDir, `./OEBPS/cover.xhtml`),
templatePath,
excludeFromToc: true,
beforeToc: true,
isCover: true,
});
}
// Parse contents & save media
const contentTemplatePath = resolve(__dirname, "../templates/content.xhtml.ejs");
const contentOffset = this.content.length;
this.content.push(...options.content.map((content, i) => {
const index = contentOffset + i;
// Get the content URL & path
let href, filePath;
if (content.filename === undefined) {
const titleSlug = uslug(diacritics(content.title || "no title"));
href = `${index}_${titleSlug}.xhtml`;
filePath = resolve(this.tempEpubDir, `./OEBPS/${index}_${titleSlug}.xhtml`);
}
else {
href = content.filename.match(/\.xhtml$/) ? content.filename : `${content.filename}.xhtml`;
if (content.filename.match(/\.xhtml$/)) {
filePath = resolve(this.tempEpubDir, `./OEBPS/${content.filename}`);
}
else {
filePath = resolve(this.tempEpubDir, `./OEBPS/${content.filename}.xhtml`);
}
}
// Content ID & directory
const id = `item_${index}`;
const dir = dirname(filePath);
// Parse the content
const html = loadHtml(content.data, [
() => (tree) => {
visit(tree, "element", (node) => {
this.validateElement(index, node);
});
},
() => (tree) => {
visit(tree, "element", (node) => {
if (["img", "input"].includes(node.tagName)) {
this.processMediaTag(index, dir, node, this.images, "images");
}
else if (this.downloadAudioVideoFiles && ["audio", "video"].includes(node.tagName)) {
this.processMediaTag(index, dir, node, this.audioVideo, "audiovideo");
}
});
},
]);
// Return the EpubContent
return {
id,
href,
title: content.title,
data: html,
url: content.url ?? null,
author: content.author ? (typeof content.author === "string" ? [content.author] : content.author) : [],
filePath,
templatePath: contentTemplatePath,
excludeFromToc: content.excludeFromToc === true, // Default to false
beforeToc: content.beforeToc === true, // Default to false
isCover: false,
};
}));
// Prepare the cover meta
if (this.cover) {
this.coverMetaContent = "image_cover";
}
else {
// Check if we can use the first image as a cover
if (this.useFirstImageAsCover && this.images.length > 0) {
this.coverMetaContent = "image_0";
this.images[0].isCoverImage = true; // Flag the image as a cover
}
else {
this.coverMetaContent = null;
}
}
// Get the link to the start of content
const firstContentInToc = this.content.find((el) => !el.excludeFromToc);
if (firstContentInToc === undefined) {
throw new Error("At least one element have to be included in the ToC!");
}
this.startOfContentHref = firstContentInToc.href;
}
validateElement(contentIndex, node) {
const attrs = node.properties;
if (["img", "br", "hr"].includes(node.tagName)) {
if (node.tagName === "img") {
node.properties.alt = node.properties?.alt || "image-placeholder";
}
}
for (const k of Object.keys(attrs)) {
if (this.allowedAttributes.includes(k)) {
if (k === "type") {
if (attrs[k] !== "script") {
delete node.properties[k];
}
}
else if (k === "controls") {
if (attrs[k] === true) {
node.properties[k] = "Controls";
}
}
}
else {
delete node.properties[k];
}
}
if (this.version === 2) {
if (!this.allowedXhtml11Tags.includes(node.tagName)) {
if (this.verbose) {
console.log(`[Warning] (content[${contentIndex}]) ${node.tagName} tag isn't allowed on EPUB 2/XHTML 1.1 DTD.`);
}
node.tagName = "div";
}
}
}
processMediaTag(contentIndex, dir, node, mediaArray, subfolder) {
const url = node.properties.src;
if (url === undefined || url === null) {
return;
}
let extension, id;
const media = mediaArray.find((element) => element.url === url);
if (media) {
id = media.id;
extension = media.extension;
}
else {
id = uuid();
const mediaType = mime.getType(url.replace(/\?.*/, ""));
if (mediaType === null) {
if (this.verbose) {
console.error(`[Media Error] (content[${contentIndex}]) (subfolder=${subfolder}) The media can't be processed : ${url}`);
}
return;
}
extension = mime.getExtension(mediaType);
if (extension === null) {
if (this.verbose) {
console.error(`[Media Error] (content[${contentIndex}]) (subfolder=${subfolder}) The media can't be processed : ${url}`);
}
return;
}
mediaArray.push({ id, url, dir, mediaType, extension });
}
node.properties.src = `${subfolder}/${id}.${extension}`;
}
async render() {
// Create directories
if (!existsSync(this.tempDir)) {
mkdirSync(this.tempDir);
}
mkdirSync(this.tempEpubDir);
mkdirSync(resolve(this.tempEpubDir, "./OEBPS"));
if (this.verbose) {
console.log("Downloading Media...");
}
await this.downloadAllMedia();
if (this.verbose) {
console.log("Making Cover...");
}
await this.makeCover();
if (this.verbose) {
console.log("Generating Template Files.....");
}
await this.generateTemplateFiles();
if (this.verbose) {
console.log("Generating Epub Files...");
}
await this.generate();
if (this.verbose) {
console.log("Done.");
}
return { result: "ok" };
}
async generateTemplateFiles() {
// Create the document's Header
const docHeader = this.version === 2
? `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="${this.lang}">
`
: `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="${this.lang}">
`;
// Copy the CSS style
if (!this.css) {
this.css = readFileSync(resolve(__dirname, "../templates/template.css"), { encoding: "utf8" });
}
writeFileSync(resolve(this.tempEpubDir, "./OEBPS/style.css"), this.css);
// Copy fonts
if (this.fonts.length) {
mkdirSync(resolve(this.tempEpubDir, "./OEBPS/fonts"));
this.fonts = this.fonts.map((font) => {
if (!existsSync(font)) {
throw new Error(`Custom font not found at ${font}.`);
}
const filename = basename(font);
fsExtra.copySync(font, resolve(this.tempEpubDir, `./OEBPS/fonts/${filename}`));
return filename;
});
}
// Write content files
for (const content of this.content) {
const result = await renderFile(content.templatePath, {
...this,
...content,
bookTitle: this.title,
encodeXML,
docHeader,
}, {
escape: (markup) => markup,
});
writeFileSync(content.filePath, result);
}
// write meta-inf/container.xml
mkdirSync(`${this.tempEpubDir}/META-INF`);
writeFileSync(`${this.tempEpubDir}/META-INF/container.xml`, '<?xml version="1.0" encoding="UTF-8" ?><container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container"><rootfiles><rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/></rootfiles></container>');
if (this.version === 2) {
// write meta-inf/com.apple.ibooks.display-options.xml [from pedrosanta:xhtml#6]
writeFileSync(`${this.tempEpubDir}/META-INF/com.apple.ibooks.display-options.xml`, `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<display_options>
<platform name="*">
<option name="specified-fonts">true</option>
</platform>
</display_options>
`);
}
const opfPath = this.customOpfTemplatePath || resolve(__dirname, `../templates/epub${this.version}/content.opf.ejs`);
if (!existsSync(opfPath)) {
throw new Error("Custom file to OPF template not found.");
}
writeFileSync(resolve(this.tempEpubDir, "./OEBPS/content.opf"), await renderFile(opfPath, this));
const ncxTocPath = this.customNcxTocTemplatePath || resolve(__dirname, "../templates/toc.ncx.ejs");
if (!existsSync(ncxTocPath)) {
throw new Error("Custom file the NCX toc template not found.");
}
writeFileSync(resolve(this.tempEpubDir, "./OEBPS/toc.ncx"), await renderFile(ncxTocPath, this));
const htmlTocPath = this.customHtmlTocTemplatePath || resolve(__dirname, `../templates/epub${this.version}/toc.xhtml.ejs`);
if (!existsSync(htmlTocPath)) {
throw new Error("Custom file to HTML toc template not found.");
}
writeFileSync(resolve(this.tempEpubDir, "./OEBPS/toc.xhtml"), await renderFile(htmlTocPath, this));
}
async makeCover() {
if (this.cover === null) {
return;
}
const destPath = resolve(this.tempEpubDir, `./OEBPS/cover.${this.coverExtension}`);
let writeStream;
if (this.cover.slice(0, 4) === "http" || this.cover.slice(0, 2) === "//") {
try {
const httpRequest = await axios.get(this.cover, {
responseType: "stream",
headers: { "User-Agent": this.userAgent },
});
writeStream = httpRequest.data;
writeStream.pipe(createWriteStream(destPath));
}
catch (err) {
if (this.verbose) {
console.error(`The cover image can't be processed : ${this.cover}, ${err}`);
}
return;
}
}
else {
writeStream = createReadStream(this.cover);
writeStream.pipe(createWriteStream(destPath));
}
const promiseStream = new Promise((resolve, reject) => {
writeStream.on("end", () => resolve());
writeStream.on("error", (err) => {
console.error("Error", err);
unlinkSync(destPath);
reject(err);
});
});
await promiseStream;
if (this.verbose) {
console.log("[Success] cover image downloaded successfully!");
}
// Retrieve image dimensions
const result = await imageSizeFromFile(destPath);
if (!result || !result.width || !result.height) {
throw new Error(`Failed to retrieve cover image dimensions for "${destPath}"`);
}
this.coverDimensions.width = result.width;
this.coverDimensions.height = result.height;
if (this.verbose) {
console.log(`cover image dimensions: ${this.coverDimensions.width} x ${this.coverDimensions.height}`);
}
}
async downloadMedia(media, subfolder) {
const filename = resolve(this.tempEpubDir, `./OEBPS/${subfolder}/${media.id}.${media.extension}`);
if (media.url.indexOf("file://") === 0) {
const auxpath = media.url.substr(7);
fsExtra.copySync(auxpath, filename);
return;
}
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let requestAction;
if (media.url.indexOf("http") === 0 || media.url.indexOf("//") === 0) {
try {
const httpRequest = await axios.get(media.url, {
responseType: "stream",
headers: { "User-Agent": this.userAgent },
});
requestAction = httpRequest.data;
requestAction.pipe(createWriteStream(filename));
}
catch (err) {
if (this.verbose) {
console.error(`The media can't be processed : ${media.url}, ${err}`);
}
return;
}
}
else {
requestAction = createReadStream(resolve(media.dir, media.url));
requestAction.pipe(createWriteStream(filename));
}
return new Promise((resolve, reject) => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
requestAction.on("error", (err) => {
if (this.verbose) {
console.error("[Download Error]", "Error while downloading", media.url, err);
}
unlinkSync(filename);
reject(err);
});
requestAction.on("end", () => {
if (this.verbose) {
console.log("[Download Success]", media.url);
}
resolve();
});
});
}
async downloadAllMedia() {
if (this.images.length > 0) {
mkdirSync(resolve(this.tempEpubDir, "./OEBPS/images"));
for (let index = 0; index < this.images.length; index++) {
await this.downloadMedia(this.images[index], "images");
}
}
if (this.audioVideo.length > 0) {
mkdirSync(resolve(this.tempEpubDir, "./OEBPS/audiovideo"));
for (let index = 0; index < this.audioVideo.length; index++) {
await this.downloadMedia(this.audioVideo[index], "audiovideo");
}
}
}
generate() {
// Thanks to Paul Bradley
// http://www.bradleymedia.org/gzip-markdown-epub/ (404 as of 28.07.2016)
// Web Archive URL:
// http://web.archive.org/web/20150521053611/http://www.bradleymedia.org/gzip-markdown-epub
// or Gist:
// https://gist.github.com/cyrilis/8d48eef37fbc108869ac32eb3ef97bca
const cwd = this.tempEpubDir;
return new Promise((resolve, reject) => {
const archive = archiver("zip", { zlib: { level: 9 } });
const output = createWriteStream(this.output);
if (this.verbose) {
console.log("Zipping temp dir to", this.output);
}
archive.append("application/epub+zip", { store: true, name: "mimetype" });
archive.directory(`${cwd}/META-INF`, "META-INF");
archive.directory(`${cwd}/OEBPS`, "OEBPS");
archive.pipe(output);
archive.on("end", () => {
output.end(() => {
if (this.verbose) {
console.log("Done zipping, clearing temp dir...");
}
fsExtra.removeSync(cwd);
resolve();
});
});
// eslint-disable-next-line @typescript-eslint/no-explicit-any
archive.on("error", (err) => reject(err));
archive.finalize();
});
}
}