UNPKG

foliate-js

Version: (not shown by the viewer)
File stats: 1,217 lines (1,140 lines of code), 46.8 kB
// Decode HTML entities by letting the browser parse them in a detached
// <textarea> (requires a DOM; `str` may be null/undefined, yielding '').
const unescapeHTML = str => {
    if (!str) return ''
    const textarea = document.createElement('textarea')
    textarea.innerHTML = str
    return textarea.value
}

const MIME = {
    XML: 'application/xml',
    XHTML: 'application/xhtml+xml',
    HTML: 'text/html',
    CSS: 'text/css',
    SVG: 'image/svg+xml',
}

// Binary struct layouts used with `getStruct()`:
// field name -> [byte offset, byte length, 'string' | 'uint'].
// All integers are read big-endian (see `getUint`).
const PDB_HEADER = {
    name: [0, 32, 'string'],
    type: [60, 4, 'string'],
    creator: [64, 4, 'string'],
    numRecords: [76, 2, 'uint'],
}

const PALMDOC_HEADER = {
    compression: [0, 2, 'uint'],
    numTextRecords: [8, 2, 'uint'],
    recordSize: [10, 2, 'uint'],
    encryption: [12, 2, 'uint'],
}

const MOBI_HEADER = {
    magic: [16, 4, 'string'],
    length: [20, 4, 'uint'],
    type: [24, 4, 'uint'],
    encoding: [28, 4, 'uint'],
    uid: [32, 4, 'uint'],
    version: [36, 4, 'uint'],
    titleOffset: [84, 4, 'uint'],
    titleLength: [88, 4, 'uint'],
    localeRegion: [94, 1, 'uint'],
    localeLanguage: [95, 1, 'uint'],
    resourceStart: [108, 4, 'uint'],
    huffcdic: [112, 4, 'uint'],
    numHuffcdic: [116, 4, 'uint'],
    exthFlag: [128, 4, 'uint'],
    trailingFlags: [240, 4, 'uint'],
    indx: [244, 4, 'uint'],
}

const KF8_HEADER = {
    resourceStart: [108, 4, 'uint'],
    fdst: [192, 4, 'uint'],
    numFdst: [196, 4, 'uint'],
    frag: [248, 4, 'uint'],
    skel: [252, 4, 'uint'],
    guide: [260, 4, 'uint'],
}

const EXTH_HEADER = {
    magic: [0, 4, 'string'],
    length: [4, 4, 'uint'],
    count: [8, 4, 'uint'],
}

const INDX_HEADER = {
    magic: [0, 4, 'string'],
    length: [4, 4, 'uint'],
    type: [8, 4, 'uint'],
    idxt: [20, 4, 'uint'],
    numRecords: [24, 4, 'uint'],
    encoding: [28, 4, 'uint'],
    language: [32, 4, 'uint'],
    total: [36, 4, 'uint'],
    ordt: [40, 4, 'uint'],
    ligt: [44, 4, 'uint'],
    numLigt: [48, 4, 'uint'],
    numCncx: [52, 4, 'uint'],
}

const TAGX_HEADER = {
    magic: [0, 4, 'string'],
    length: [4, 4, 'uint'],
    numControlBytes: [8, 4, 'uint'],
}

const HUFF_HEADER = {
    magic: [0, 4, 'string'],
    offset1: [8, 4, 'uint'],
    offset2: [12, 4, 'uint'],
}

const CDIC_HEADER = {
    magic: [0, 4, 'string'],
    length: [4, 4, 'uint'],
    numEntries: [8, 4, 'uint'],
    codeLength: [12, 4, 'uint'],
}

const FDST_HEADER = {
    magic: [0, 4, 'string'],
    numEntries: [8, 4, 'uint'],
}

const FONT_HEADER = {
    flags: [8, 4, 'uint'],
    dataStart: [12, 4, 'uint'],
    keyLength: [16, 4, 'uint'],
    keyStart: [20, 4, 'uint'],
}

// MOBI text-encoding code -> `TextDecoder` label.
const MOBI_ENCODING = {
    1252: 'windows-1252',
    65001: 'utf-8',
}

// EXTH record type -> [metadata name, value type (string if omitted), many?].
// `many` means the record may repeat; values are collected into an array.
const EXTH_RECORD_TYPE = {
    100: ['creator', 'string', true],
    101: ['publisher'],
    103: ['description'],
    104: ['isbn'],
    105: ['subject', 'string', true],
    106: ['date'],
    108: ['contributor', 'string', true],
    109: ['rights'],
    110: ['subjectCode', 'string', true],
    112: ['source', 'string', true],
    113: ['asin'],
    121: ['boundary', 'uint'],
    122: ['fixedLayout'],
    125: ['numResources', 'uint'],
    126: ['originalResolution'],
    127: ['zeroGutter'],
    128: ['zeroMargin'],
    129: ['coverURI'],
    132: ['regionMagnification'],
    201: ['coverOffset', 'uint'],
    202: ['thumbnailOffset', 'uint'],
    503: ['title'],
    524: ['language', 'string', true],
    527: ['pageProgressionDirection'],
}

// MOBI locale language code -> list of BCP 47 tags, indexed by region code.
// `null` marks region slots with no assigned tag.
const MOBI_LANG = {
    1: ['ar', 'ar-SA', 'ar-IQ', 'ar-EG', 'ar-LY', 'ar-DZ', 'ar-MA', 'ar-TN',
        'ar-OM', 'ar-YE', 'ar-SY', 'ar-JO', 'ar-LB', 'ar-KW', 'ar-AE', 'ar-BH',
        'ar-QA'],
    2: ['bg'],
    3: ['ca'],
    4: ['zh', 'zh-TW', 'zh-CN', 'zh-HK', 'zh-SG'],
    5: ['cs'],
    6: ['da'],
    7: ['de', 'de-DE', 'de-CH', 'de-AT', 'de-LU', 'de-LI'],
    8: ['el'],
    9: ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-NZ', 'en-IE', 'en-ZA',
        'en-JM', null, 'en-BZ', 'en-TT', 'en-ZW', 'en-PH'],
    10: ['es', 'es-ES', 'es-MX', null, 'es-GT', 'es-CR', 'es-PA', 'es-DO',
        'es-VE', 'es-CO', 'es-PE', 'es-AR', 'es-EC', 'es-CL', 'es-UY', 'es-PY',
        'es-BO', 'es-SV', 'es-HN', 'es-NI', 'es-PR'],
    11: ['fi'],
    12: ['fr', 'fr-FR', 'fr-BE', 'fr-CA', 'fr-CH', 'fr-LU', 'fr-MC'],
    13: ['he'],
    14: ['hu'],
    15: ['is'],
    16: ['it', 'it-IT', 'it-CH'],
    17: ['ja'],
    18: ['ko'],
    19: ['nl', 'nl-NL', 'nl-BE'],
    20: ['no', 'nb', 'nn'],
    21: ['pl'],
    22: ['pt', 'pt-BR', 'pt-PT'],
    23: ['rm'],
    24: ['ro'],
    25: ['ru'],
    26: ['hr', null, 'sr'],
    27: ['sk'],
    28: ['sq'],
    29: ['sv', 'sv-SE', 'sv-FI'],
    30: ['th'],
    31: ['tr'],
    32: ['ur'],
    33: ['id'],
    34: ['uk'],
    35: ['be'],
    36: ['sl'],
    37: ['et'],
    38: ['lv'],
    39: ['lt'],
    41: ['fa'],
    42: ['vi'],
    43: ['hy'],
    44: ['az'],
    45: ['eu'],
    46: ['hsb'],
    47: ['mk'],
    48: ['st'],
    49: ['ts'],
    50: ['tn'],
    52: ['xh'],
    53: ['zu'],
    54: ['af'],
    55: ['ka'],
    56: ['fo'],
    57: ['hi'],
    58: ['mt'],
    59: ['se'],
    62: ['ms'],
    63: ['kk'],
    65: ['sw'],
    67: ['uz', null, 'uz-UZ'],
    68: ['tt'],
    69: ['bn'],
    70: ['pa'],
    71: ['gu'],
    72: ['or'],
    73: ['ta'],
    74: ['te'],
    75: ['kn'],
    76: ['ml'],
    77: ['as'],
    78: ['mr'],
    79: ['sa'],
    82: ['cy', 'cy-GB'],
    83: ['gl', 'gl-ES'],
    87: ['kok'],
    97: ['ne'],
    98: ['fy'],
}
// Concatenate two typed arrays of the same constructor into a new one.
const concatTypedArray = (a, b) => {
    const result = new a.constructor(a.length + b.length)
    result.set(a)
    result.set(b, a.length)
    return result
}

// Concatenate three typed arrays of the same constructor into a new one.
const concatTypedArray3 = (a, b, c) => {
    const result = new a.constructor(a.length + b.length + c.length)
    result.set(a)
    result.set(b, a.length)
    result.set(c, a.length + b.length)
    return result
}

const decoder = new TextDecoder()
const getString = buffer => decoder.decode(buffer)

// Read a big-endian unsigned integer whose width is the buffer's length
// (4, 2, or 1 bytes); returns undefined for a missing buffer.
const getUint = buffer => {
    if (!buffer) return
    const l = buffer.byteLength
    const func = l === 4 ? 'getUint32' : l === 2 ? 'getUint16' : 'getUint8'
    return new DataView(buffer)[func](0)
}

// Parse a fixed-layout binary struct given a definition of
// name -> [offset, length, type] (see the *_HEADER tables above).
const getStruct = (def, buffer) => Object.fromEntries(
    Array.from(Object.entries(def))
        .map(([key, [start, len, type]]) => [key,
            (type === 'string' ? getString : getUint)(
                buffer.slice(start, start + len))]))

const getDecoder = x => new TextDecoder(MOBI_ENCODING[x])

// Read a variable-length quantity (7 bits per byte, big-endian, at most
// 4 bytes; the high bit marks the final byte). Returns the decoded value
// and the number of bytes consumed.
const getVarLen = (byteArray, i = 0) => {
    let value = 0, length = 0
    for (const byte of byteArray.subarray(i, i + 4)) {
        value = (value << 7) | (byte & 0b111_1111) >>> 0
        length++
        if (byte & 0b1000_0000) break
    }
    return { value, length }
}

// variable-length quantity, but read from the end of data
const getVarLenFromEnd = byteArray => {
    let value = 0
    for (const byte of byteArray.subarray(-4)) {
        // `byte & 0b1000_0000` indicates the start of value
        if (byte & 0b1000_0000) value = 0
        value = (value << 7) | (byte & 0b111_1111)
    }
    return value
}

// Population count of the set bits in a non-negative integer.
const countBitsSet = x => {
    let count = 0
    for (; x > 0; x = x >> 1) if ((x & 1) === 1) count++
    return count
}

// Number of trailing zero bits (assumes at least one bit is set).
const countUnsetEnd = x => {
    let count = 0
    while ((x & 1) === 0) x = x >> 1, count++
    return count
}

// Decompress PalmDOC (LZ77-style) compressed data.
const decompressPalmDOC = array => {
    let output = []
    for (let i = 0; i < array.length; i++) {
        const byte = array[i]
        if (byte === 0) output.push(0) // uncompressed literal, just copy it
        else if (byte <= 8) // copy next 1-8 bytes
            for (const x of array.subarray(i + 1, (i += byte) + 1))
                output.push(x)
        else if (byte <= 0b0111_1111) output.push(byte) // uncompressed literal
        else if (byte <= 0b1011_1111) {
            // 1st and 2nd bits are 10, meaning this is a length-distance pair
            // read next byte and combine it with current byte
            const bytes = (byte << 8) | array[i++ + 1]
            // the 3rd to 13th bits encode distance
            const distance = (bytes & 0b0011_1111_1111_1111) >>> 3
            // the last 3 bits, plus 3, is the length to copy
            const length = (bytes & 0b111) + 3
            for (let j = 0; j < length; j++)
                output.push(output[output.length - distance])
        }
        // compressed from space plus char
        else output.push(32, byte ^ 0b1000_0000)
    }
    return Uint8Array.from(output)
}

// Read 32 bits starting at bit offset `from`, as a BigInt; bits past the
// end of the array read as zero.
const read32Bits = (byteArray, from) => {
    const startByte = from >> 3
    const end = from + 32
    const endByte = end >> 3
    let bits = 0n
    for (let i = startByte; i <= endByte; i++)
        bits = bits << 8n | BigInt(byteArray[i] ?? 0)
    return (bits >> (8n - BigInt(end & 7))) & 0xffffffffn
}
// Build a HUFF/CDIC decompressor from a MOBI's HUFF record and its CDIC
// dictionary records. Returns an async-built synchronous `decompress`
// function that takes a Uint8Array and returns the decompressed bytes.
const huffcdic = async (mobi, loadRecord) => {
    const huffRecord = await loadRecord(mobi.huffcdic)
    const { magic, offset1, offset2 } = getStruct(HUFF_HEADER, huffRecord)
    if (magic !== 'HUFF') throw new Error('Invalid HUFF record')
    // table1 is indexed by byte value
    const table1 = Array.from({ length: 256 }, (_, i) => offset1 + i * 4)
        .map(offset => getUint(huffRecord.slice(offset, offset + 4)))
        .map(x => [x & 0b1000_0000, x & 0b1_1111, x >>> 8])
    // table2 is indexed by code length
    const table2 = [null].concat(
        Array.from({ length: 32 }, (_, i) => offset2 + i * 8)
            .map(offset => [
                getUint(huffRecord.slice(offset, offset + 4)),
                getUint(huffRecord.slice(offset + 4, offset + 8))]))
    const dictionary = []
    for (let i = 1; i < mobi.numHuffcdic; i++) {
        const record = await loadRecord(mobi.huffcdic + i)
        const cdic = getStruct(CDIC_HEADER, record)
        if (cdic.magic !== 'CDIC') throw new Error('Invalid CDIC record')
        // `numEntries` is the total number of dictionary data across CDIC records
        // so `n` here is the number of entries in *this* record
        const n = Math.min(1 << cdic.codeLength,
            cdic.numEntries - dictionary.length)
        const buffer = record.slice(cdic.length)
        for (let i = 0; i < n; i++) {
            const offset = getUint(buffer.slice(i * 2, i * 2 + 2))
            const x = getUint(buffer.slice(offset, offset + 2))
            const length = x & 0x7fff
            const decompressed = x & 0x8000
            const value = new Uint8Array(
                buffer.slice(offset + 2, offset + 2 + length))
            dictionary.push([value, decompressed])
        }
    }
    const decompress = byteArray => {
        let output = new Uint8Array()
        const bitLength = byteArray.byteLength * 8
        for (let i = 0; i < bitLength;) {
            const bits = Number(read32Bits(byteArray, i))
            let [found, codeLength, value] = table1[bits >>> 24]
            if (!found) {
                // code is longer than 8 bits; search table2 by code length
                while (bits >>> (32 - codeLength) < table2[codeLength][0])
                    codeLength += 1
                value = table2[codeLength][1]
            }
            if ((i += codeLength) > bitLength) break
            const code = value - (bits >>> (32 - codeLength))
            let [result, decompressed] = dictionary[code]
            if (!decompressed) {
                // the result is itself compressed
                result = decompress(result)
                // cache the result for next time
                dictionary[code] = [result, true]
            }
            output = concatTypedArray(output, result)
        }
        return output
    }
    return decompress
}

// Parse a MOBI INDX index (with its TAGX tag table and CNCX string
// records) starting at record `indxIndex`. Returns `table` (one
// { name, tagMap } per index entry) and `cncx` (offset -> string).
const getIndexData = async (indxIndex, loadRecord) => {
    const indxRecord = await loadRecord(indxIndex)
    const indx = getStruct(INDX_HEADER, indxRecord)
    if (indx.magic !== 'INDX') throw new Error('Invalid INDX record')
    const decoder = getDecoder(indx.encoding)

    // TAGX section follows the INDX header; each tag entry is 4 bytes:
    // [tag, numValues, mask, end-of-control-bytes flag]
    const tagxBuffer = indxRecord.slice(indx.length)
    const tagx = getStruct(TAGX_HEADER, tagxBuffer)
    if (tagx.magic !== 'TAGX') throw new Error('Invalid TAGX section')
    const numTags = (tagx.length - 12) / 4
    const tagTable = Array.from({ length: numTags }, (_, i) =>
        new Uint8Array(tagxBuffer.slice(12 + i * 4, 12 + i * 4 + 4)))

    // CNCX records hold length-prefixed strings; keys are offset within
    // the record plus 0x10000 per record.
    const cncx = {}
    let cncxRecordOffset = 0
    for (let i = 0; i < indx.numCncx; i++) {
        const record = await loadRecord(indxIndex + indx.numRecords + i + 1)
        const array = new Uint8Array(record)
        for (let pos = 0; pos < array.byteLength;) {
            const index = pos
            const { value, length } = getVarLen(array, pos)
            pos += length
            const result = record.slice(pos, pos + value)
            pos += value
            cncx[cncxRecordOffset + index] = decoder.decode(result)
        }
        cncxRecordOffset += 0x10000
    }

    const table = []
    for (let i = 0; i < indx.numRecords; i++) {
        const record = await loadRecord(indxIndex + 1 + i)
        const array = new Uint8Array(record)
        const indx = getStruct(INDX_HEADER, record)
        if (indx.magic !== 'INDX') throw new Error('Invalid INDX record')

        for (let j = 0; j < indx.numRecords; j++) {
            // IDXT section lists the offset of each entry
            const offsetOffset = indx.idxt + 4 + 2 * j
            const offset = getUint(record.slice(offsetOffset, offsetOffset + 2))
            const length = getUint(record.slice(offset, offset + 1))
            const name = getString(record.slice(offset + 1, offset + 1 + length))

            // first pass: read control bytes to find which tags are
            // present and how many values each has
            const tags = []
            const startPos = offset + 1 + length
            let controlByteIndex = 0
            let pos = startPos + tagx.numControlBytes
            for (const [tag, numValues, mask, end] of tagTable) {
                if (end & 1) {
                    controlByteIndex++
                    continue
                }
                const offset = startPos + controlByteIndex
                const value = getUint(record.slice(offset, offset + 1)) & mask
                if (value === mask) {
                    if (countBitsSet(mask) > 1) {
                        // the value count doesn't fit in the control byte;
                        // a varint byte count follows instead
                        const { value, length } = getVarLen(array, pos)
                        tags.push([tag, null, value, numValues])
                        pos += length
                    } else tags.push([tag, 1, null, numValues])
                } else tags.push([tag, value >> countUnsetEnd(mask), null, numValues])
            }

            // second pass: read the varint values for each present tag
            const tagMap = {}
            for (const [tag, valueCount, valueBytes, numValues] of tags) {
                const values = []
                if (valueCount != null) {
                    for (let i = 0; i < valueCount * numValues; i++) {
                        const { value, length } = getVarLen(array, pos)
                        values.push(value)
                        pos += length
                    }
                } else {
                    let count = 0
                    while (count < valueBytes) {
                        const { value, length } = getVarLen(array, pos)
                        values.push(value)
                        pos += length
                        count += length
                    }
                }
                tagMap[tag] = values
            }
            table.push({ name, tagMap })
        }
    }
    return { table, cncx }
}
for (const [tag, numValues, mask, end] of tagTable) { if (end & 1) { controlByteIndex++ continue } const offset = startPos + controlByteIndex const value = getUint(record.slice(offset, offset + 1)) & mask if (value === mask) { if (countBitsSet(mask) > 1) { const { value, length } = getVarLen(array, pos) tags.push([tag, null, value, numValues]) pos += length } else tags.push([tag, 1, null, numValues]) } else tags.push([tag, value >> countUnsetEnd(mask), null, numValues]) } const tagMap = {} for (const [tag, valueCount, valueBytes, numValues] of tags) { const values = [] if (valueCount != null) { for (let i = 0; i < valueCount * numValues; i++) { const { value, length } = getVarLen(array, pos) values.push(value) pos += length } } else { let count = 0 while (count < valueBytes) { const { value, length } = getVarLen(array, pos) values.push(value) pos += length count += length } } tagMap[tag] = values } table.push({ name, tagMap }) } } return { table, cncx } } const getNCX = async (indxIndex, loadRecord) => { const { table, cncx } = await getIndexData(indxIndex, loadRecord) const items = table.map(({ tagMap }, index) => ({ index, offset: tagMap[1]?.[0], size: tagMap[2]?.[0], label: cncx[tagMap[3]] ?? 
'', headingLevel: tagMap[4]?.[0], pos: tagMap[6], parent: tagMap[21]?.[0], firstChild: tagMap[22]?.[0], lastChild: tagMap[23]?.[0], })) const getChildren = item => { if (item.firstChild == null) return item item.children = items.filter(x => x.parent === item.index).map(getChildren) return item } return items.filter(item => item.headingLevel === 0).map(getChildren) } const getEXTH = (buf, encoding) => { const { magic, count } = getStruct(EXTH_HEADER, buf) if (magic !== 'EXTH') throw new Error('Invalid EXTH header') const decoder = getDecoder(encoding) const results = {} let offset = 12 for (let i = 0; i < count; i++) { const type = getUint(buf.slice(offset, offset + 4)) const length = getUint(buf.slice(offset + 4, offset + 8)) if (type in EXTH_RECORD_TYPE) { const [name, typ, many] = EXTH_RECORD_TYPE[type] const data = buf.slice(offset + 8, offset + length) const value = typ === 'uint' ? getUint(data) : decoder.decode(data) if (many) { results[name] ??= [] results[name].push(value) } else results[name] = value } offset += length } return results } const getFont = async (buf, unzlib) => { const { flags, dataStart, keyLength, keyStart } = getStruct(FONT_HEADER, buf) const array = new Uint8Array(buf.slice(dataStart)) // deobfuscate font if (flags & 0b10) { const bytes = keyLength === 16 ? 
// NOTE(review): the newlines of the following lines have been stripped; the
// module is written in semicolon-free style, so fused statements such as
// `1040 const key` are not valid JavaScript as-is, and inline `//` comments
// now swallow the rest of each physical line. Bytes are kept identical here
// and only annotated; the code needs re-wrapping at its original statement
// boundaries to parse again.
// Below: tail of getFont() (XOR deobfuscation loop, optional `unzlib`
// decompression); isMOBI() checks for the 'BOOKMOBI' magic at file offset
// 60; class PDB parses the Palm Database header, precomputes per-record
// [start, end] offsets, and exposes loadRecord()/loadMagic() slices.
// Then class MOBI (extends PDB) begins: open() reads record 0's headers and,
// for "combo" MOBI/KF8 files, retries at the EXTH `boundary` record.
1024 : 1040 const key = new Uint8Array(buf.slice(keyStart, keyStart + keyLength)) const length = Math.min(bytes, array.length) for (var i = 0; i < length; i++) array[i] = array[i] ^ key[i % key.length] } // decompress font if (flags & 1) try { return await unzlib(array) } catch (e) { console.warn(e) console.warn('Failed to decompress font') } return array } export const isMOBI = async file => { const magic = getString(await file.slice(60, 68).arrayBuffer()) return magic === 'BOOKMOBI'// || magic === 'TEXtREAd' } class PDB { #file #offsets pdb async open(file) { this.#file = file const pdb = getStruct(PDB_HEADER, await file.slice(0, 78).arrayBuffer()) this.pdb = pdb const buffer = await file.slice(78, 78 + pdb.numRecords * 8).arrayBuffer() // get start and end offsets for each record this.#offsets = Array.from({ length: pdb.numRecords }, (_, i) => getUint(buffer.slice(i * 8, i * 8 + 4))) .map((x, i, a) => [x, a[i + 1]]) } loadRecord(index) { const offsets = this.#offsets[index] if (!offsets) throw new RangeError('Record index out of bounds') return this.#file.slice(...offsets).arrayBuffer() } async loadMagic(index) { const start = this.#offsets[index][0] return getString(await this.#file.slice(start, start + 4).arrayBuffer()) } } export class MOBI extends PDB { #start = 0 #resourceStart #decoder #encoder #decompress #removeTrailingEntries constructor({ unzlib }) { super() this.unzlib = unzlib } async open(file) { await super.open(file) // TODO: if (this.pdb.type === 'TEXt') this.headers = this.#getHeaders(await super.loadRecord(0)) this.#resourceStart = this.headers.mobi.resourceStart let isKF8 = this.headers.mobi.version >= 8 if (!isKF8) { const boundary = this.headers.exth?.boundary if (boundary < 0xffffffff) try { // it's a "combo" MOBI/KF8 file; try to open the KF8 part this.headers = this.#getHeaders(await super.loadRecord(boundary)) this.#start = boundary isKF8 = true } catch (e) { console.warn(e) console.warn('Failed to open KF8; falling back to MOBI') } } 
// MOBI.open() continued: runs #setup(), then returns a MOBI6 or KF8 book
// object. #getHeaders() parses the PALMDOC and MOBI headers, the title
// slice, the language from MOBI_LANG, and the optional EXTH and KF8
// headers. #setup() begins: picks a decompressor by `palmdoc.compression`
// (1 = none, 2 = PalmDOC, 17480 = HUFF/CDIC — ternary continues on the
// next line).
await this.#setup() return isKF8 ? new KF8(this).init() : new MOBI6(this).init() } #getHeaders(buf) { const palmdoc = getStruct(PALMDOC_HEADER, buf) const mobi = getStruct(MOBI_HEADER, buf) if (mobi.magic !== 'MOBI') throw new Error('Missing MOBI header') const { titleOffset, titleLength, localeLanguage, localeRegion } = mobi mobi.title = buf.slice(titleOffset, titleOffset + titleLength) const lang = MOBI_LANG[localeLanguage] mobi.language = lang?.[localeRegion >> 2] ?? lang?.[0] const exth = mobi.exthFlag & 0b100_0000 ? getEXTH(buf.slice(mobi.length + 16), mobi.encoding) : null const kf8 = mobi.version >= 8 ? getStruct(KF8_HEADER, buf) : null return { palmdoc, mobi, exth, kf8 } } async #setup() { const { palmdoc, mobi } = this.headers this.#decoder = getDecoder(mobi.encoding) // `TextEncoder` only supports UTF-8 // we are only encoding ASCII anyway, so I think it's fine this.#encoder = new TextEncoder() // set up decompressor const { compression } = palmdoc this.#decompress = compression === 1 ? f => f : compression === 2 ? decompressPalmDOC : compression === 17480 ? 
// Remaining MOBI methods: #removeTrailingEntries strips the trailing
// entries flagged in `mobi.trailingFlags` from each text record (plus the
// multibyte-overlap bytes when bit 0 is set); loadText() chains record
// load -> trailing-entry removal -> decompression; loadResource() special-
// cases FONT (via getFont) and VIDE/AUDI records; getNCX()/getMetadata()/
// getCover() expose book-level data (getCover's ternary continues on the
// next line).
await huffcdic(mobi, this.loadRecord.bind(this)) : null if (!this.#decompress) throw new Error('Unknown compression type') // set up function for removing trailing bytes const { trailingFlags } = mobi const multibyte = trailingFlags & 1 const numTrailingEntries = countBitsSet(trailingFlags >>> 1) this.#removeTrailingEntries = array => { for (let i = 0; i < numTrailingEntries; i++) { const length = getVarLenFromEnd(array) array = array.subarray(0, -length) } if (multibyte) { const length = (array[array.length - 1] & 0b11) + 1 array = array.subarray(0, -length) } return array } } decode(...args) { return this.#decoder.decode(...args) } encode(...args) { return this.#encoder.encode(...args) } loadRecord(index) { return super.loadRecord(this.#start + index) } loadMagic(index) { return super.loadMagic(this.#start + index) } loadText(index) { return this.loadRecord(index + 1) .then(buf => new Uint8Array(buf)) .then(this.#removeTrailingEntries) .then(this.#decompress) } async loadResource(index) { const buf = await super.loadRecord(this.#resourceStart + index) const magic = getString(buf.slice(0, 4)) if (magic === 'FONT') return getFont(buf, this.unzlib) if (magic === 'VIDE' || magic === 'AUDI') return buf.slice(12) return buf } getNCX() { const index = this.headers.mobi.indx if (index < 0xffffffff) return getNCX(index, this.loadRecord.bind(this)) } getMetadata() { const { mobi, exth } = this.headers return { identifier: mobi.uid.toString(), title: unescapeHTML(exth?.title || this.decode(mobi.title)), author: exth?.creator?.map(unescapeHTML), publisher: unescapeHTML(exth?.publisher), language: exth?.language ?? mobi.language, published: exth?.date, description: unescapeHTML(exth?.description), subject: exth?.subject?.map(unescapeHTML), rights: unescapeHTML(exth?.rights), contributor: exth?.contributor, } } async getCover() { const { exth } = this.headers const offset = exth?.coverOffset < 0xffffffff ? exth?.coverOffset : exth?.thumbnailOffset < 0xffffffff ? 
// NOTE(review): newlines are stripped on the following lines as well (see
// the note above class PDB); bytes kept identical, annotations only.
// Below: tail of MOBI.getCover() (returns a Blob of the cover or thumbnail
// resource); regexes for <mbp:pagebreak> tags and `filepos` attributes;
// getIndent() sums a nesting heuristic (1.5 per <p>, 2 per <blockquote>
// ancestor) used for inferring TOC levels. Then class MOBI6 begins:
// init() concatenates all text records, converts bytes 1:1 to a string so
// `filepos` byte offsets survive regex work, and splits the text into
// sections at each page break (the reduce assigning start/end offsets
// continues on the next line).
exth?.thumbnailOffset : null if (offset != null) { const buf = await this.loadResource(offset) return new Blob([buf]) } } } const mbpPagebreakRegex = /<\s*(?:mbp:)?pagebreak[^>]*>/gi const fileposRegex = /<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>/gi const getIndent = el => { let x = 0 while (el) { const parent = el.parentElement if (parent) { const tag = parent.tagName.toLowerCase() if (tag === 'p') x += 1.5 else if (tag === 'blockquote') x += 2 } el = parent } return x } class MOBI6 { parser = new DOMParser() serializer = new XMLSerializer() #resourceCache = new Map() #textCache = new Map() #cache = new Map() #sections #fileposList = [] #type = MIME.HTML constructor(mobi) { this.mobi = mobi } async init() { // load all text records in an array let array = new Uint8Array() for (let i = 0; i < this.mobi.headers.palmdoc.numTextRecords; i++) array = concatTypedArray(array, await this.mobi.loadText(i)) // convert to string so we can use regex // note that `filepos` are byte offsets // so it needs to preserve each byte as a separate character // (see https://stackoverflow.com/q/50198017) const str = Array.from(new Uint8Array(array), c => String.fromCharCode(c)).join('') // split content into sections at each `<mbp:pagebreak>` this.#sections = [0] .concat(Array.from(str.matchAll(mbpPagebreakRegex), m => m.index)) .map((x, i, a) => str.slice(x, a[i + 1])) // recover the original raw bytes .map(str => Uint8Array.from(str, x => x.charCodeAt(0))) .map(raw => ({ book: this, raw })) // get start and end filepos for each section .reduce((arr, x) => { const last = arr[arr.length - 1] x.start = last?.end ?? 
// MOBI6.init() continued: builds the public `sections` array; finds the
// guide's "toc" reference and, if present, reconstructs a nested TOC from
// `a[filepos]` anchors using getIndent() levels; collects the sorted,
// deduplicated list of all `filepos` targets (anchors are injected later
// in loadText so they can be addressed in the DOM); then getGuide() begins
// (its returned object literal continues on the next line).
0 x.end = x.start + x.raw.byteLength return arr.concat(x) }, []) this.sections = this.#sections.map((section, index) => ({ id: index, load: () => this.loadSection(section), createDocument: () => this.createDocument(section), size: section.end - section.start, })) try { this.landmarks = await this.getGuide() const tocHref = this.landmarks .find(({ type }) => type?.includes('toc'))?.href if (tocHref) { const { index } = this.resolveHref(tocHref) const doc = await this.sections[index].createDocument() let lastItem let lastLevel = 0 let lastIndent = 0 const lastLevelOfIndent = new Map() const lastParentOfLevel = new Map() this.toc = Array.from(doc.querySelectorAll('a[filepos]')) .reduce((arr, a) => { const indent = getIndent(a) const item = { label: a.innerText?.trim() ?? '', href: `filepos:${a.getAttribute('filepos')}`, } const level = indent > lastIndent ? lastLevel + 1 : indent === lastIndent ? lastLevel : lastLevelOfIndent.get(indent) ?? Math.max(0, lastLevel - 1) if (level > lastLevel) { if (lastItem) { lastItem.subitems ??= [] lastItem.subitems.push(item) lastParentOfLevel.set(level, lastItem) } else arr.push(item) } else { const parent = lastParentOfLevel.get(level) if (parent) parent.subitems.push(item) else arr.push(item) } lastItem = item lastLevel = level lastIndent = indent lastLevelOfIndent.set(indent, level) return arr }, []) } } catch(e) { console.warn(e) } // get list of all `filepos` references in the book, // which will be used to insert anchor elements // because only then can they be referenced in the DOM this.#fileposList = [...new Set( Array.from(str.matchAll(fileposRegex), m => m[1]))] .map(filepos => ({ filepos, number: Number(filepos) })) .sort((a, b) => a.number - b.number) this.metadata = this.mobi.getMetadata() this.getCover = this.mobi.getCover.bind(this.mobi) return this } async getGuide() { const doc = await this.createDocument(this.#sections[0]) return Array.from(doc.getElementsByTagName('reference'), ref => ({ label: 
// getGuide() tail, then resource handling: loadResource() caches blob URLs
// per resource index; loadRecindex() maps 1-based `recindex` attributes to
// resources; replaceResources() rewrites img/media `recindex` references to
// blob URLs and `filepos` attributes to `filepos:` hrefs; loadText()
// splices `<a id="filepos...">` anchor elements into the section's raw
// bytes at each filepos target, then decodes and strips page-break tags
// (createDocument's `await` expression continues on the next line).
ref.getAttribute('title'), type: ref.getAttribute('type')?.split(/\s/), href: `filepos:${ref.getAttribute('filepos')}`, })) } async loadResource(index) { if (this.#resourceCache.has(index)) return this.#resourceCache.get(index) const raw = await this.mobi.loadResource(index) const url = URL.createObjectURL(new Blob([raw])) this.#resourceCache.set(index, url) return url } async loadRecindex(recindex) { return this.loadResource(Number(recindex) - 1) } async replaceResources(doc) { for (const img of doc.querySelectorAll('img[recindex]')) { const recindex = img.getAttribute('recindex') try { img.src = await this.loadRecindex(recindex) } catch { console.warn(`Failed to load image ${recindex}`) } } for (const media of doc.querySelectorAll('[mediarecindex]')) { const mediarecindex = media.getAttribute('mediarecindex') const recindex = media.getAttribute('recindex') try { media.src = await this.loadRecindex(mediarecindex) if (recindex) media.poster = await this.loadRecindex(recindex) } catch { console.warn(`Failed to load media ${mediarecindex}`) } } for (const a of doc.querySelectorAll('[filepos]')) { const filepos = a.getAttribute('filepos') a.href = `filepos:${filepos}` } } async loadText(section) { if (this.#textCache.has(section)) return this.#textCache.get(section) const { raw } = section // insert anchor elements for each `filepos` const fileposList = this.#fileposList .filter(({ number }) => number >= section.start && number < section.end) .map(obj => ({ ...obj, offset: obj.number - section.start })) let arr = raw if (fileposList.length) { arr = raw.subarray(0, fileposList[0].offset) fileposList.forEach(({ filepos, offset }, i) => { const next = fileposList[i + 1] const a = this.mobi.encode(`<a id="filepos${filepos}"></a>`) arr = concatTypedArray3(arr, a, raw.subarray(offset, next?.offset)) }) } const str = this.mobi.decode(arr).replaceAll(mbpPagebreakRegex, '') this.#textCache.set(section, str) return str } async createDocument(section) { const str = await 
// Remaining MOBI6 methods: loadSection() parses the section, injects a
// default blockquote-margin stylesheet, resolves resources, and caches a
// blob URL; resolveHref()/splitTOCHref() map `filepos:` hrefs to a section
// index + anchor; isExternal() treats anything but blob:/filepos: schemes
// as external; destroy() revokes all cached object URLs. After the class,
// the `kindle:` URI helpers begin — NOTE(review): the mid-line
// `// handlers for \`kindle:\` uris` comment comments out the rest of this
// physical line (the regexes and parseResourceURI/parsePosURI), another
// artifact of the stripped newlines.
this.loadText(section) return this.parser.parseFromString(str, this.#type) } async loadSection(section) { if (this.#cache.has(section)) return this.#cache.get(section) const doc = await this.createDocument(section) // inject default stylesheet const style = doc.createElement('style') doc.head.append(style) // blockquotes in MOBI seem to have only a small left margin by default // many books seem to rely on this, as it's the only way to set margin // (since there's no CSS) style.append(doc.createTextNode(`blockquote { margin-block-start: 0; margin-block-end: 0; margin-inline-start: 1em; margin-inline-end: 0; }`)) await this.replaceResources(doc) const result = this.serializer.serializeToString(doc) const url = URL.createObjectURL(new Blob([result], { type: this.#type })) this.#cache.set(section, url) return url } resolveHref(href) { const filepos = href.match(/filepos:(.*)/)[1] const number = Number(filepos) const index = this.#sections.findIndex(section => section.end > number) const anchor = doc => doc.getElementById(`filepos${filepos}`) return { index, anchor } } splitTOCHref(href) { const filepos = href.match(/filepos:(.*)/)[1] const number = Number(filepos) const index = this.#sections.findIndex(section => section.end > number) return [index, `filepos${filepos}`] } getTOCFragment(doc, id) { return doc.getElementById(id) } isExternal(uri) { return /^(?!blob|filepos)\w+:/i.test(uri) } destroy() { for (const url of this.#resourceCache.values()) URL.revokeObjectURL(url) for (const url of this.#cache.values()) URL.revokeObjectURL(url) } } // handlers for `kindle:` uris const kindleResourceRegex = /kindle:(flow|embed):(\w+)(?:\?mime=(\w+\/[-+.\w]+))?/ const kindlePosRegex = /kindle:pos:fid:(\w+):off:(\w+)/ const parseResourceURI = str => { const [resourceType, id, type] = str.match(kindleResourceRegex).slice(1) return { resourceType, id: parseInt(id, 32), type } } const parsePosURI = str => { const [fid, off] = str.match(kindlePosRegex).slice(1) return { fid: 
// NOTE(review): newlines stripped here too (see notes above); bytes kept
// identical, annotations only. Below: tail of parsePosURI (fid/off are
// base-32); makePosURI() formats a `kindle:pos:fid:...:off:...` URI;
// getFragmentSelector() finds an id/name/aid attribute to anchor on;
// replaceSeries() performs sequential async regex replacement;
// getPageSpread() maps spine itemref properties to left/right/center.
// Then class KF8 begins — NOTE(review): this class continues past the end
// of the visible excerpt (it is cut off mid-statement on the last line
// below). init() starts by reading the FDST flow-boundary table (the
// fdstTable map continues on the next line).
parseInt(fid, 32), off: parseInt(off, 32) } } const makePosURI = (fid = 0, off = 0) => `kindle:pos:fid:${fid.toString(32).toUpperCase().padStart(4, '0') }:off:${off.toString(32).toUpperCase().padStart(10, '0')}` // `kindle:pos:` links are originally links that contain fragments identifiers // so there should exist an element with `id` or `name` // otherwise try to find one with an `aid` attribute const getFragmentSelector = str => { const match = str.match(/\s(id|name|aid)\s*=\s*['"]([^'"]*)['"]/i) if (!match) return const [, attr, value] = match return `[${attr}="${CSS.escape(value)}"]` } // replace asynchronously and sequentially const replaceSeries = async (str, regex, f) => { const matches = [] str.replace(regex, (...args) => (matches.push(args), null)) const results = [] for (const args of matches) results.push(await f(...args)) return str.replace(regex, () => results.shift()) } const getPageSpread = properties => { for (const p of properties) { if (p === 'page-spread-left' || p === 'rendition:page-spread-left') return 'left' if (p === 'page-spread-right' || p === 'rendition:page-spread-right') return 'right' if (p === 'rendition:page-spread-center') return 'center' } } class KF8 { parser = new DOMParser() serializer = new XMLSerializer() #cache = new Map() #fragmentOffsets = new Map() #fragmentSelectors = new Map() #tables = {} #sections #fullRawLength #rawHead = new Uint8Array() #rawTail = new Uint8Array() #lastLoadedHead = -1 #lastLoadedTail = -1 #type = MIME.XHTML #inlineMap = new Map() constructor(mobi) { this.mobi = mobi } async init() { const loadRecord = this.mobi.loadRecord.bind(this.mobi) const { kf8 } = this.mobi.headers try { const fdstBuffer = await loadRecord(kf8.fdst) const fdst = getStruct(FDST_HEADER, fdstBuffer) if (fdst.magic !== 'FDST') throw new Error('Missing FDST record') const fdstTable = Array.from({ length: fdst.numEntries }, (_, i) => 12 + i * 8) .map(offset => [ getUint(fdstBuffer.slice(offset, offset + 4)), 
// KF8.init() continued: loads the SKEL and FRAG INDX tables, pairs each
// skeleton with its fragments into #sections (tracking cumulative
// lengths), and reads the RESC record (if any) — wrapping its contents in
// a synthetic <package> element to read per-itemref page-spread
// properties (the sections map's ternary continues on the next line).
getUint(fdstBuffer.slice(offset + 4, offset + 8))]) this.#tables.fdstTable = fdstTable this.#fullRawLength = fdstTable[fdstTable.length - 1][1] } catch {} const skelTable = (await getIndexData(kf8.skel, loadRecord)).table .map(({ name, tagMap }, index) => ({ index, name, numFrag: tagMap[1][0], offset: tagMap[6][0], length: tagMap[6][1], })) const fragData = await getIndexData(kf8.frag, loadRecord) const fragTable = fragData.table.map(({ name, tagMap }) => ({ insertOffset: parseInt(name), selector: fragData.cncx[tagMap[2][0]], index: tagMap[4][0], offset: tagMap[6][0], length: tagMap[6][1], })) this.#tables.skelTable = skelTable this.#tables.fragTable = fragTable this.#sections = skelTable.reduce((arr, skel) => { const last = arr[arr.length - 1] const fragStart = last?.fragEnd ?? 0, fragEnd = fragStart + skel.numFrag const frags = fragTable.slice(fragStart, fragEnd) const length = skel.length + frags.map(f => f.length).reduce((a, b) => a + b) const totalLength = (last?.totalLength ?? 0) + length return arr.concat({ skel, frags, fragEnd, length, totalLength }) }, []) const resources = await this.getResourcesByMagic(['RESC', 'PAGE']) const pageSpreads = new Map() if (resources.RESC) { const buf = await this.mobi.loadRecord(resources.RESC) const str = this.mobi.decode(buf.slice(16)).replace(/\0/g, '') // the RESC record lacks the root `<package>` element // but seem to be otherwise valid XML const index = str.search(/\?>/) const xmlStr = `<package>${str.slice(index)}</package>` const opf = this.parser.parseFromString(xmlStr, MIME.XML) for (const $itemref of opf.querySelectorAll('spine > itemref')) { const i = parseInt($itemref.getAttribute('skelid')) pageSpreads.set(i, getPageSpread( $itemref.getAttribute('properties')?.split(' ') ?? [])) } } this.sections = this.#sections.map((section, index) => section.frags.length ? 
// init() tail: builds the public `sections` (fragment-less sections are
// marked linear: 'no'); converts the NCX into a TOC of kindle:pos hrefs,
// recording fragment offsets for later anchor resolution; reads direction,
// fixed-layout rendition and viewport from EXTH. getResourcesByMagic()
// scans records from `resourceStart` for the given 4-byte magics;
// getGuide() maps the guide INDX to landmarks; loadResourceBlob() begins
// (its ternary continues on the next line).
({ id: index, load: () => this.loadSection(section), createDocument: () => this.createDocument(section), size: section.length, pageSpread: pageSpreads.get(index), }) : ({ linear: 'no' })) try { const ncx = await this.mobi.getNCX() const map = ({ label, pos, children }) => { const [fid, off] = pos const href = makePosURI(fid, off) const arr = this.#fragmentOffsets.get(fid) if (arr) arr.push(off) else this.#fragmentOffsets.set(fid, [off]) return { label: unescapeHTML(label), href, subitems: children?.map(map) } } this.toc = ncx?.map(map) this.landmarks = await this.getGuide() } catch(e) { console.warn(e) } const { exth } = this.mobi.headers this.dir = exth.pageProgressionDirection this.rendition = { layout: exth.fixedLayout === 'true' ? 'pre-paginated' : 'reflowable', viewport: Object.fromEntries(exth.originalResolution ?.split('x')?.slice(0, 2) ?.map((x, i) => [i ? 'height' : 'width', x]) ?? []), } this.metadata = this.mobi.getMetadata() this.getCover = this.mobi.getCover.bind(this.mobi) return this } // is this really the only way of getting to RESC, PAGE, etc.? async getResourcesByMagic(keys) { const results = {} const start = this.mobi.headers.kf8.resourceStart const end = this.mobi.pdb.numRecords for (let i = start; i < end; i++) { try { const magic = await this.mobi.loadMagic(i) const match = keys.find(key => key === magic) if (match) results[match] = i } catch {} } return results } async getGuide() { const index = this.mobi.headers.kf8.guide if (index < 0xffffffff) { const loadRecord = this.mobi.loadRecord.bind(this.mobi) const { table, cncx } = await getIndexData(index, loadRecord) return table.map(({ name, tagMap }) => ({ label: cncx[tagMap[1][0]] ?? '', type: name?.split(/\s/), href: makePosURI(tagMap[6]?.[0] ?? tagMap[3]?.[0]), })) } } async loadResourceBlob(str) { const { resourceType, id, type } = parseResourceURI(str) const raw = resourceType === 'flow' ? 
// loadResourceBlob() tail: loads a flow or embed resource, resolves nested
// kindle: references in textual types, and returns the blob plus the SVG
// root element when the SVG references images (so it can be inlined);
// loadResource() caches URLs; replaceResources() rewrites all kindle:
// resource URIs via replaceSeries(); loadRaw() begins: text can only be
// loaded sequentially, from the head or from the tail of the record list
// (the distance ternary continues on the next line).
await this.loadFlow(id) : await this.mobi.loadResource(id - 1) const result = [MIME.XHTML, MIME.HTML, MIME.CSS, MIME.SVG].includes(type) ? await this.replaceResources(this.mobi.decode(raw)) : raw const doc = type === MIME.SVG ? this.parser.parseFromString(result, type) : null return [new Blob([result], { type }), // SVG wrappers need to be inlined // as browsers don't allow external resources when loading SVG as an image doc?.getElementsByTagNameNS('http://www.w3.org/2000/svg', 'image')?.length ? doc.documentElement : null] } async loadResource(str) { if (this.#cache.has(str)) return this.#cache.get(str) const [blob, inline] = await this.loadResourceBlob(str) const url = inline ? str : URL.createObjectURL(blob) if (inline) this.#inlineMap.set(url, inline) this.#cache.set(str, url) return url } replaceResources(str) { const regex = new RegExp(kindleResourceRegex, 'g') return replaceSeries(str, regex, this.loadResource.bind(this)) } // NOTE: there doesn't seem to be a way to access text randomly? // how to know the decompressed size of the records without decompressing? // 4096 is just the maximum size async loadRaw(start, end) { // here we load either from the front or back until we have reached the // required offsets; at worst you'd have to load half the book at once const distanceHead = end - this.#rawHead.length const distanceEnd = this.#fullRawLength == null ? 
// loadRaw() tail: grows #rawHead forward or #rawTail backward, whichever
// is nearer, then slices the requested [start, end) span; loadFlow() maps
// an FDST index to a raw span; loadText() splices each fragment into the
// skeleton at its insert offset and records fragment selectors for
// pending anchors (via #setFragmentSelector, defined beyond this
// excerpt); createDocument() parses a section; loadSection() begins and
// is CUT OFF here (`let doc =` continues beyond the visible excerpt).
Infinity : (this.#fullRawLength - this.#rawTail.length) - start // load from the start if (distanceHead < 0 || distanceHead < distanceEnd) { while (this.#rawHead.length < end) { const index = ++this.#lastLoadedHead const data = await this.mobi.loadText(index) this.#rawHead = concatTypedArray(this.#rawHead, data) } return this.#rawHead.slice(start, end) } // load from the end while (this.#fullRawLength - this.#rawTail.length > start) { const index = this.mobi.headers.palmdoc.numTextRecords - 1 - (++this.#lastLoadedTail) const data = await this.mobi.loadText(index) this.#rawTail = concatTypedArray(data, this.#rawTail) } const rawTailStart = this.#fullRawLength - this.#rawTail.length return this.#rawTail.slice(start - rawTailStart, end - rawTailStart) } loadFlow(index) { if (index < 0xffffffff) return this.loadRaw(...this.#tables.fdstTable[index]) } async loadText(section) { const { skel, frags, length } = section const raw = await this.loadRaw(skel.offset, skel.offset + length) let skeleton = raw.slice(0, skel.length) for (const frag of frags) { const insertOffset = frag.insertOffset - skel.offset const offset = skel.length + frag.offset const fragRaw = raw.slice(offset, offset + frag.length) skeleton = concatTypedArray3( skeleton.slice(0, insertOffset), fragRaw, skeleton.slice(insertOffset)) const offsets = this.#fragmentOffsets.get(frag.index) if (offsets) for (const offset of offsets) { const str = this.mobi.decode(fragRaw).slice(offset) const selector = getFragmentSelector(str) this.#setFragmentSelector(frag.index, offset, selector) } } return this.mobi.decode(skeleton) } async createDocument(section) { const str = await this.loadText(section) return this.parser.parseFromString(str, this.#type) } async loadSection(section) { if (this.#cache.has(section)) return this.#cache.get(section) const str = await this.loadText(section) const replaced = await this.replaceResources(str) // by default, type is XHTML; change to HTML if it's not valid XHTML let doc = 
this.parser.parseFromString(replaced, this.#type) if (doc.querySelector('parsererror') || !doc.documentElement?.namespaceURI) { this.#type = MIME.HTML doc = this.parser.parseFromString(replaced, this.#type) } for (const [url, node] of this.#inlineMap) { for (const el of doc.querySelectorAll(`img[src="${url}"]`)) el.replaceWith(node) } const url = URL.createObjectURL( new Blob([this.serializer.serializeToString(doc)], { type: this.#type })) this.#cache.set(section, url) return url } getIndexByFID(fid) { return this.#sections.findIndex(section => section.frags.some(frag => frag.index === fid)) } #setFragmentSelector(id, offset, selector) { const map = this.#fragmentSelectors.get(id) if (map) map.set(offset, selector) else { const map = new Map() this.#fragmentSelectors.set(id, map) map.set(offset, selector) } } async resolveHref(href) { const { fid, off } = parsePosURI(href) const index = this.getIndexByFID(fid) if (index < 0) return const saved = this.#fragmentSelectors.get(fid)?.get(off) if (saved) return { index, anchor: doc => doc.querySelector(saved) } const { skel, frags } = this.#sections[index] const frag = frags.find(frag => frag.index === fid) const offset = skel.offset + skel.length + frag.offset const fragRaw = await this.loadRaw(offset, offset + frag.length) const str = this.mobi.decode(fragRaw).slice(off) const selector = getFragmentSelector(str) this.#setFragmentSelector(fid, off, selector) const anchor = doc => doc.querySelector(selector) return { index, anchor } } splitTOCHref(href) { const pos = parsePosURI(href) const index = this.getIndexByFID(pos.fid) return [index, pos] } getTOCFragment(doc, { fid, off }) { const selector = this.#fragmentSelectors.get(fid)?.get(off) return doc.querySelector(selector) } isExternal(uri) { return /^(?!blob|kindle)\w+:/i.test(uri) } destroy() { for (const url of this.#cache.values()) URL.revokeObjectURL(url) } }