UNPKG

@lingo-reader/fb2-parser

Version:

An fb2 parser which can extract chapter contents from an fb2 file

442 lines (435 loc) 13.1 kB
'use strict';

var shared = require('@lingo-reader/shared');

/** Scheme prefix of the internal hrefs produced by `buildFb2Href`. */
const HREF_PREFIX = "fb2:";
/** Prefix shared by every id generated by this parser. */
const ID_PREFIX = "lingo_fb2_";
/** Key under which the stylesheet URL is stored in `Fb2File#resourceCache`. */
const STYLESHEET_ID = `${ID_PREFIX}style`;

const HTML_ESCAPES = new Map([
  ["&", "&amp;"],
  ["<", "&lt;"],
  [">", "&gt;"],
  ['"', "&quot;"]
]);

/**
 * Escapes the characters that are significant in HTML text and in
 * double-quoted attribute values.
 *
 * NOTE(review): assumes the XML parser hands back entity-decoded text (as
 * xml2js does); confirm against `shared.parsexml`.
 *
 * @param {unknown} text Value to escape; coerced with `String()`.
 * @returns {string} The escaped text.
 */
function escapeHtml(text) {
  return String(text).replace(/[&<>"]/g, (ch) => HTML_ESCAPES.get(ch));
}

/**
 * Normalizes the user supplied input into raw data plus a file name.
 *
 * @param {Uint8Array | { text(): Promise<string>, name: string }} file
 *   Either the raw bytes or an object exposing `text()` and `name`.
 * @returns {Promise<{ data: Uint8Array | string, fileName: string }>}
 * @throws {TypeError} When `file` is a string, which this build rejects.
 */
async function extractFileData(file) {
  if (file instanceof Uint8Array) {
    return { data: file, fileName: "" };
  }
  if (typeof file === "string") {
    throw new TypeError("The `fb2` param cannot be a `string` in browser env.");
  }
  return { data: await file.text(), fileName: file.name };
}

/**
 * Returns the direct text (`_`) of the first node in a parsed node list.
 *
 * @param {Array<{ _?: string }> | undefined} xmlNode Node list for one tag.
 * @returns {string} The text, or `""` when the list, its first entry or the
 *   text itself is missing.
 */
function getFirstXmlNodeText(xmlNode) {
  // `?.` on the element too: the original threw when `[0]` was undefined.
  return xmlNode?.[0]?._ ?? "";
}

/**
 * Concatenates the text of a node and all of its descendants.
 *
 * @param {object | undefined} node Parsed node with an optional `children` list.
 * @returns {string} Trimmed text fragments joined by a single space.
 */
function collectNodeText(node) {
  if (!node || typeof node !== "object") {
    return "";
  }
  if (node["#name"] === "__text__") {
    return (node._ ?? "").trim();
  }
  return (node.children ?? [])
    .map(collectNodeText)
    .filter(Boolean)
    .join(" ");
}

/**
 * Copies own properties of `source` onto `target` without overwriting
 * properties `target` already owns.
 *
 * @param {object} target Object that receives the properties (mutated).
 * @param {object} source Object to copy from.
 * @param {string[]} [ignoreKeys] Keys of `source` that must not be copied.
 * @returns {object} `target`.
 */
function extend(target, source, ignoreKeys = []) {
  const hasOwn = Object.prototype.hasOwnProperty;
  for (const key of Object.keys(source)) {
    // Own-property check (not `in`) so inherited names such as `constructor`
    // on a plain target do not block a legitimate key.
    if (hasOwn.call(target, key) || ignoreKeys.includes(key)) {
      continue;
    }
    target[key] = source[key];
  }
  return target;
}

/**
 * Decodes a base64 string into bytes.
 *
 * @param {string | undefined} base64String Base64 payload; surrounding
 *   whitespace is trimmed and a missing payload is treated as empty.
 * @returns {Uint8Array} The decoded bytes.
 */
function base64ToUint8Array(base64String) {
  const binaryString = atob((base64String ?? "").trim());
  return Uint8Array.from(binaryString, (ch) => ch.charCodeAt(0));
}

/**
 * Turns an embedded resource into an object URL.
 *
 * @param {{ id: string, base64Data: string, contentType: string }} resource
 * @param {string} resourceSaveDir Unused in this build; kept for signature
 *   compatibility.
 * @returns {string} Object URL of a Blob holding the decoded resource. The
 *   caller is responsible for revoking it.
 */
function saveResource(resource, resourceSaveDir) {
  const { base64Data, contentType } = resource;
  const blob = new Blob([base64ToUint8Array(base64Data)], { type: contentType });
  return URL.createObjectURL(blob);
}

/**
 * Turns the book stylesheet into an object URL.
 *
 * @param {string} style CSS text.
 * @param {string} resourceSaveDir Unused in this build; kept for signature
 *   compatibility.
 * @returns {string} Object URL of a `text/css` Blob.
 */
function saveStylesheet(style, resourceSaveDir) {
  return URL.createObjectURL(new Blob([style], { type: "text/css" }));
}

/**
 * Builds an internal href of the form `fb2:<chapterId>[#<fb2GlobalId>]`.
 *
 * @param {string} chapterId Generated chapter id.
 * @param {string} [fb2GlobalId] Id of an element inside the chapter.
 * @returns {string}
 */
function buildFb2Href(chapterId, fb2GlobalId) {
  const fragment = fb2GlobalId ? `#${fb2GlobalId}` : "";
  return `${HREF_PREFIX}${chapterId}${fragment}`;
}

/**
 * Records, for every descendant of `sectionNode` that carries an `id`
 * attribute, that the id belongs to chapter `sectionId`.
 *
 * @param {string} sectionId Generated chapter id.
 * @param {object} sectionNode Parsed node whose descendants are scanned.
 * @param {Map<string, string>} idToChapterMap Map to fill (mutated).
 */
function buildIdToSectionMap(sectionId, sectionNode, idToChapterMap) {
  // An element without content has no `children` list.
  for (const node of sectionNode.children ?? []) {
    if (node["#name"] === "__text__") {
      continue;
    }
    const nodeId = node.$?.id;
    if (nodeId) {
      idToChapterMap.set(nodeId, sectionId);
    }
    if (node.children) {
      buildIdToSectionMap(sectionId, node, idToChapterMap);
    }
  }
}

// A Map (not an object literal) so tag names like `constructor` cannot hit
// inherited Object.prototype members.
const fb2TagToHtmlTagMap = /* @__PURE__ */ new Map([
  ["section", "div"],
  ["title", "h2"],
  ["subtitle", "h3"],
  ["poem", "blockquote"],
  ["stanza", "p"],
  ["v", "p"],
  ["text-author", "cite"],
  ["epigraph", "blockquote"],
  ["empty-line", "br"],
  ["image", "img"],
  ["emphasis", "em"]
]);

const selfClosingHtmlTag = /* @__PURE__ */ new Set([
  "br",
  "img"
]);

/**
 * Maps an FB2 tag name to the HTML tag used in the output.
 *
 * @param {string} tag FB2 tag name.
 * @returns {{ tag: string, isSelfClosing: boolean }} The HTML tag (the input
 *   itself when no mapping exists) and whether it is emitted self-closing.
 */
function transformTagName(tag) {
  const htmlTag = fb2TagToHtmlTagMap.get(tag) ?? tag;
  return {
    tag: htmlTag,
    isSelfClosing: selfClosingHtmlTag.has(htmlTag)
  };
}

/**
 * Indexes the `<binary>` elements of the book by id.
 *
 * @param {Array<object> | undefined} binaryAST Parsed `<binary>` nodes; a book
 *   without embedded resources yields `undefined`.
 * @returns {Map<string, { id: string, contentType: string, base64Data: string }>}
 * @throws {Error} When a `<binary>` lacks `id` or `content-type`.
 */
function parseBinary(binaryAST) {
  const resourceMap = /* @__PURE__ */ new Map();
  for (const binary of binaryAST ?? []) {
    const attrs = binary.$ ?? {};
    const id = attrs.id;
    const contentType = attrs["content-type"];
    if (!id || !contentType) {
      throw new Error("The <binary> element must have `id` and `content-type` attributes.");
    }
    resourceMap.set(id, { id, contentType, base64Data: binary._ });
  }
  return resourceMap;
}

/**
 * Extracts the author fields from a parsed `<author>` node.
 *
 * @param {object | undefined} authorAST Parsed `<author>` node.
 * @returns {{ firstName: string, middleName: string, lastName: string,
 *   name: string, nickname: string, homePage: string, email: string }}
 *   Missing fields are `""`; `name` joins the non-empty name parts with spaces.
 */
function parseAuthor(authorAST) {
  const author = authorAST ?? {};
  const firstName = getFirstXmlNodeText(author["first-name"]);
  const middleName = getFirstXmlNodeText(author["middle-name"]);
  const lastName = getFirstXmlNodeText(author["last-name"]);
  return {
    firstName,
    middleName,
    lastName,
    name: [firstName, middleName, lastName].filter(Boolean).join(" "),
    // Was read from `first-name`, which duplicated firstName.
    nickname: getFirstXmlNodeText(author.nickname),
    homePage: getFirstXmlNodeText(author["home-page"]),
    email: getFirstXmlNodeText(author.email)
  };
}

/**
 * Reads the image reference of a parsed `<coverpage>` node.
 *
 * @param {object | undefined} coverpageAST Parsed `<coverpage>` node.
 * @returns {string | undefined} Value of `l:href` (or `xlink:href`) of the
 *   first `<image>`, or `undefined` when there is none.
 */
function parseCoverpage(coverpageAST) {
  const attrs = coverpageAST?.image?.[0]?.$;
  return attrs?.["l:href"] ?? attrs?.["xlink:href"];
}

// `<title-info>` children whose text is copied verbatim under another name.
const titleInfoDirectFields = /* @__PURE__ */ new Map([
  ["genre", "type"],
  ["lang", "language"],
  ["annotation", "description"],
  ["book-title", "title"],
  ["src-lang", "srcLang"]
]);

/**
 * Extracts metadata from a parsed `<title-info>` node. Only the first node of
 * each child tag is considered.
 *
 * @param {object} titleInfoAST Parsed `<title-info>` node.
 * @returns {object} Object with the mapped text fields plus `author` and
 *   `coverImageId` when the corresponding elements are present.
 */
function parseTitleInfo(titleInfoAST) {
  const titleInfo = {};
  for (const key of Object.keys(titleInfoAST)) {
    const node = titleInfoAST[key]?.[0];
    const mappedKey = titleInfoDirectFields.get(key);
    if (mappedKey !== undefined && node && typeof node === "object" && "_" in node) {
      titleInfo[mappedKey] = node._;
      continue;
    }
    if (key === "author") {
      titleInfo.author = parseAuthor(node);
    } else if (key === "coverpage") {
      const href = parseCoverpage(node);
      if (href) {
        // Drop the first character (the `#` of a local reference).
        titleInfo.coverImageId = href.slice(1);
      }
    }
  }
  return titleInfo;
}

// Keys the XML parser adds to every node that are not child elements.
const parserBookkeepingKeys = /* @__PURE__ */ new Set(["children", "$", "#name"]);

/**
 * Extracts metadata from a parsed `<document-info>` node.
 *
 * @param {object} documentInfoAST Parsed `<document-info>` node.
 * @returns {object} `author` (parsed), `history` (raw node) and the text of
 *   every other child keyed by its camel-cased tag name.
 */
function parseDocumentInfo(documentInfoAST) {
  const documentInfo = {};
  for (const key of Object.keys(documentInfoAST)) {
    if (parserBookkeepingKeys.has(key)) {
      continue;
    }
    if (key === "author") {
      documentInfo.author = parseAuthor(documentInfoAST.author[0]);
    } else if (key === "history") {
      documentInfo.history = documentInfoAST.history[0];
    } else {
      documentInfo[shared.camelCase(key)] = getFirstXmlNodeText(documentInfoAST[key]);
    }
  }
  return documentInfo;
}

/**
 * Extracts metadata from a parsed `<publish-info>` node.
 *
 * @param {object} publishInfoAST Parsed `<publish-info>` node.
 * @returns {object} Text of every child keyed by its camel-cased tag name.
 */
function parsePublishInfo(publishInfoAST) {
  const publishInfo = {};
  for (const key of Object.keys(publishInfoAST)) {
    if (parserBookkeepingKeys.has(key)) {
      continue;
    }
    publishInfo[shared.camelCase(key)] = getFirstXmlNodeText(publishInfoAST[key]);
  }
  return publishInfo;
}

/**
 * Collects the `<custom-info>` elements keyed by their `info-type` attribute.
 *
 * @param {Array<object>} customInfoAST Parsed `<custom-info>` nodes.
 * @returns {object} Map of `info-type` to element text. Elements without an
 *   `info-type` attribute are skipped.
 */
function parseCustomInfo(customInfoAST) {
  const res = {};
  for (const customInfo of customInfoAST) {
    const infoType = customInfo.$?.["info-type"];
    if (!infoType) {
      continue;
    }
    res[infoType] = customInfo._;
  }
  return res;
}

/**
 * Merges the sections of a parsed `<description>` node into one metadata
 * object. Earlier sections win: title-info, document-info, publish-info,
 * custom-info.
 *
 * @param {object} descriptionAST Parsed `<description>` node.
 * @returns {{ metadata: object, coverImageId: string, history: object | undefined }}
 *   `coverImageId` is `""` when no cover is declared; `history` is the raw
 *   `<history>` node, if any.
 */
function parseDescription(descriptionAST) {
  const metadata = {};
  let coverImageId = "";
  let history;
  if (descriptionAST["title-info"]) {
    const titleInfo = parseTitleInfo(descriptionAST["title-info"][0]);
    coverImageId = titleInfo.coverImageId ?? "";
    extend(metadata, titleInfo, ["coverImageId"]);
  }
  if (descriptionAST["document-info"]) {
    const documentInfo = parseDocumentInfo(descriptionAST["document-info"][0]);
    extend(metadata, documentInfo, ["history"]);
    history = documentInfo.history;
  }
  if (descriptionAST["publish-info"]) {
    extend(metadata, parsePublishInfo(descriptionAST["publish-info"][0]));
  }
  if (descriptionAST["custom-info"]) {
    extend(metadata, parseCustomInfo(descriptionAST["custom-info"]));
  }
  return { metadata, coverImageId, history };
}

/**
 * Creates an `Fb2File` and loads the book.
 *
 * @param {Uint8Array | { text(): Promise<string>, name: string }} fb2 Book input.
 * @param {string} [resourceSaveDir] Passed through to the resource helpers.
 * @returns {Promise<Fb2File>} The loaded instance.
 */
async function initFb2File(fb2, resourceSaveDir = "./images") {
  const fb2Instance = new Fb2File(fb2, resourceSaveDir);
  await fb2Instance.loadFb2();
  return fb2Instance;
}

/**
 * In-memory representation of one FB2 book. Call `loadFb2()` before using the
 * accessors and `destroy()` to release the object URLs it created.
 */
class Fb2File {
  /**
   * @param {Uint8Array | { text(): Promise<string>, name: string }} fb2 Book input.
   * @param {string} [resourceSaveDir] Passed through to the resource helpers.
   */
  constructor(fb2, resourceSaveDir = "./images") {
    this.fb2 = fb2;
    // resource
    this.resourceSaveDir = resourceSaveDir;
    // global id to Resource
    this.resourceStore = undefined;
    // id to url
    this.resourceCache = /* @__PURE__ */ new Map();
    // chapter id to processed chapter
    this.chapterCache = /* @__PURE__ */ new Map();
    // stylesheet Url
    this.stylesheetUrl = "";
    // chapters
    this.chapterStore = /* @__PURE__ */ new Map();
    this.idToChapterIdMap = /* @__PURE__ */ new Map();
    // Toc
    this.tableOfContent = [];
    // spine
    this.spine = [];
    this.metadata = undefined;
    this.fileName = undefined;
    this.coverImageId = undefined;
  }

  /** @returns {Array<{ label: string, href: string }>} The table of contents. */
  getToc() {
    return this.tableOfContent;
  }

  /** @returns {Array<{ id: string }>} Chapter ids in reading order. */
  getSpine() {
    return this.spine;
  }

  /** @returns {object | undefined} Metadata collected by `loadFb2()`. */
  getMetadata() {
    return this.metadata;
  }

  /** @returns {{ fileName: string | undefined }} */
  getFileInfo() {
    return { fileName: this.fileName };
  }

  /**
   * Returns the object URL of an embedded resource, creating and caching it on
   * first use so that every resource maps to exactly one URL that `destroy()`
   * can revoke.
   *
   * @param {string} id Resource id (the `id` of a `<binary>` element).
   * @returns {string | undefined} The URL, or `undefined` for an unknown id.
   */
  getResourceUrl(id) {
    if (!this.resourceStore?.has(id)) {
      return undefined;
    }
    const cached = this.resourceCache.get(id);
    if (cached !== undefined) {
      return cached;
    }
    const url = saveResource(this.resourceStore.get(id), this.resourceSaveDir);
    this.resourceCache.set(id, url);
    return url;
  }

  /**
   * @returns {string} Object URL of the cover image, or `""` when the book
   *   declares no cover, the cover resource is missing, or the book has not
   *   been loaded yet.
   */
  getCoverImage() {
    if (!this.coverImageId) {
      return "";
    }
    return this.getResourceUrl(this.coverImageId) ?? "";
  }

  /**
   * Parses the input and fills the resource store, metadata, stylesheet,
   * chapter store, spine, table of contents and id lookup map.
   *
   * @returns {Promise<void>}
   */
  async loadFb2() {
    const { data, fileName } = await extractFileData(this.fb2);
    this.fileName = fileName;
    const res = await shared.parsexml(data, {
      charsAsChildren: true,
      preserveChildrenOrder: true,
      explicitChildren: true,
      childkey: "children",
      trim: true
    });
    const fictionBook = res.FictionBook;

    this.resourceStore = parseBinary(fictionBook.binary);

    const { metadata, coverImageId, history } = parseDescription(fictionBook.description[0]);
    this.metadata = metadata;
    this.coverImageId = coverImageId;
    if (history) {
      this.metadata.history = this.serializeNode(history);
    }

    if (fictionBook.stylesheet) {
      this.stylesheetUrl = saveStylesheet(fictionBook.stylesheet[0]._, this.resourceSaveDir);
      this.resourceCache.set(STYLESHEET_ID, this.stylesheetUrl);
    }

    // Every top-level <section> of every <body> becomes one chapter.
    let sectionId = 0;
    for (const body of fictionBook.body ?? []) {
      const bodyName = body.$?.name;
      const isUnnamedBody = !bodyName;
      // A body without sections has no `section` key.
      for (const sectionNode of body.section ?? []) {
        const id = ID_PREFIX + sectionId;
        const name = isUnnamedBody ? this.getSectionTitle(sectionNode) : bodyName;
        this.chapterStore.set(id, {
          id,
          sectionNode,
          ...(isUnnamedBody ? {} : { name })
        });
        this.spine.push({ id });
        this.tableOfContent.push({ label: name, href: buildFb2Href(id) });
        // The section's own id must resolve too (buildIdToSectionMap only
        // visits descendants); otherwise links targeting it are dropped.
        const ownId = sectionNode.$?.id;
        if (ownId) {
          this.idToChapterIdMap.set(ownId, id);
        }
        buildIdToSectionMap(id, sectionNode, this.idToChapterIdMap);
        sectionId++;
      }
    }
  }

  /**
   * Returns the label of a section: the direct text of its first `<title>`,
   * falling back to the text of the title's descendants when the title has no
   * direct text (e.g. the text sits inside nested elements).
   *
   * @param {object} sectionNode Parsed `<section>` node.
   * @returns {string} The label, possibly `""`.
   */
  getSectionTitle(sectionNode) {
    const directText = getFirstXmlNodeText(sectionNode.title);
    return directText.length > 0 ? directText : collectNodeText(sectionNode.title?.[0]);
  }

  /**
   * Serializes an attribute map into an HTML attribute string.
   *
   * `l:href` / `xlink:href` are rewritten: on `a` they become an internal
   * `fb2:` href (known ids) or are kept when they start with `http`; on `img`
   * they become a `src` pointing at the resource URL. Any other reference is
   * dropped. All other attributes are copied with their values escaped.
   *
   * @param {Record<string, string> | undefined} attrs Parsed attributes.
   * @param {string} tagName HTML tag the attributes belong to.
   * @returns {string} Space separated attributes without leading space, or `""`.
   */
  serializeAttr(attrs, tagName) {
    if (!attrs) {
      return "";
    }
    const res = [];
    for (const [key, value] of Object.entries(attrs)) {
      if (key !== "l:href" && key !== "xlink:href") {
        res.push(`${key}="${escapeHtml(value)}"`);
        continue;
      }
      // Local references look like `#id`; drop the first character.
      const id = value.slice(1);
      if (tagName === "a" && this.idToChapterIdMap.has(id)) {
        const href = buildFb2Href(this.idToChapterIdMap.get(id), id);
        res.push(`href="${escapeHtml(href)}"`);
      } else if (tagName === "a" && value.startsWith("http")) {
        res.push(`href="${escapeHtml(value)}"`);
      } else if (tagName === "img") {
        // Cached lookup: the same image is never turned into a second
        // (and never revoked) object URL.
        const resourceUrl = this.getResourceUrl(id);
        if (resourceUrl !== undefined) {
          res.push(`src="${resourceUrl}"`);
        }
      }
    }
    return res.join(" ");
  }

  /**
   * Serializes the children of a node to HTML.
   *
   * @param {object} sectionNode Parsed node.
   * @returns {string} Concatenated HTML of all children; `""` when there are none.
   */
  serializeChildren(sectionNode) {
    const res = [];
    for (const node of sectionNode.children ?? []) {
      if (node["#name"] === "__text__") {
        res.push(escapeHtml(node._.trim()));
        continue;
      }
      const { tag, isSelfClosing } = transformTagName(node["#name"]);
      const attrStr = this.serializeAttr(node.$, tag);
      const targetAttrStr = attrStr.length > 0 ? ` ${attrStr}` : "";
      if (isSelfClosing) {
        res.push(`<${tag}${targetAttrStr}/>`);
      } else {
        const childrenStr = node.children ? this.serializeChildren(node) : "";
        res.push(`<${tag}${targetAttrStr}>${childrenStr}</${tag}>`);
      }
    }
    return res.join("");
  }

  /**
   * Serializes a node as a `<div>` wrapping its serialized children.
   *
   * @param {object} sectionNode Parsed node.
   * @returns {string} HTML string.
   */
  serializeNode(sectionNode) {
    const attrStr = this.serializeAttr(sectionNode.$, "div");
    // The separating space was missing, producing e.g. `<divid="x">`.
    const openTag = attrStr.length > 0 ? `<div ${attrStr}>` : "<div>";
    return `${openTag}${this.serializeChildren(sectionNode)}</div>`;
  }

  /**
   * Returns the processed chapter, serializing it on first access.
   *
   * @param {string} id Chapter id from the spine.
   * @returns {{ html: string, css: Array<{ id: string, href: string }> } | undefined}
   *   `undefined` for an unknown id.
   */
  loadChapter(id) {
    if (!this.chapterStore.has(id)) {
      return void 0;
    }
    if (this.chapterCache.has(id)) {
      return this.chapterCache.get(id);
    }
    const chapter = this.chapterStore.get(id);
    const css = this.stylesheetUrl.length > 0
      ? [{ id: `${ID_PREFIX}css`, href: this.stylesheetUrl }]
      : [];
    const transformedSection = {
      html: this.serializeNode(chapter.sectionNode),
      css
    };
    this.chapterCache.set(id, transformedSection);
    return transformedSection;
  }

  /**
   * Resolves an internal `fb2:` href.
   *
   * @param {string} fb2Href Href produced by `buildFb2Href`.
   * @returns {{ id: string, selector: string } | undefined} Chapter id plus an
   *   attribute selector for the fragment (`""` without fragment), or
   *   `undefined` when the href is not internal or the chapter is unknown.
   */
  resolveHref(fb2Href) {
    if (!fb2Href.startsWith(HREF_PREFIX)) {
      return void 0;
    }
    const target = fb2Href.slice(HREF_PREFIX.length).trim();
    const [chapterId, globalId] = target.split("#");
    const id = this.chapterStore.get(chapterId)?.id;
    if (!id) {
      return void 0;
    }
    const selector = globalId ? `[id="${globalId}"]` : "";
    return { id, selector };
  }

  /** Revokes every object URL created for this book and clears the caches. */
  destroy() {
    this.resourceCache.forEach((path) => {
      URL.revokeObjectURL(path);
    });
    this.resourceCache.clear();
    this.chapterCache.clear();
    this.resourceStore?.clear();
    this.chapterStore.clear();
  }
}

exports.HREF_PREFIX = HREF_PREFIX;
exports.ID_PREFIX = ID_PREFIX;
exports.STYLESHEET_ID = STYLESHEET_ID;
exports.initFb2File = initFb2File;