UNPKG

@cantoo/pdf-lib

Version:

Create and modify PDF files with JavaScript

cantoo-scribe/pdf-lib

307 lines • 13.3 kB

JavaScript

import { __awaiter } from "tslib";
import PDFCrossRefSection from '../document/PDFCrossRefSection.js';
import PDFHeader from '../document/PDFHeader.js';
import PDFTrailer from '../document/PDFTrailer.js';
import {
  MissingKeywordError,
  MissingPDFHeaderError,
  PDFInvalidObjectParsingError,
  ReparseError,
  StalledParserError,
} from '../errors.js';
import PDFDict from '../objects/PDFDict.js';
import PDFInvalidObject from '../objects/PDFInvalidObject.js';
import PDFName from '../objects/PDFName.js';
import PDFRawStream from '../objects/PDFRawStream.js';
import PDFRef from '../objects/PDFRef.js';
import ByteStream from './ByteStream.js';
import PDFObjectParser from './PDFObjectParser.js';
import PDFObjectStreamParser from './PDFObjectStreamParser.js';
import PDFXRefStreamParser from './PDFXRefStreamParser.js';
import PDFContext from '../PDFContext.js';
import CharCodes from '../syntax/CharCodes.js';
import { Keywords } from '../syntax/Keywords.js';
import { IsDigit } from '../syntax/Numeric.js';
import { waitForTick } from '../../utils/index.js';

/**
 * Parses the bytes of a whole PDF file into a `PDFContext`.
 *
 * The parser walks the byte stream section by section (indirect objects,
 * an optional cross reference table, an optional trailer dictionary and an
 * optional `startxref` trailer) and is deliberately lenient: objects that
 * cannot be parsed are stored as `PDFInvalidObject`s, and unrecognised bytes
 * between sections are skipped.
 *
 * An instance is single-use; calling `parseDocument` twice throws.
 */
class PDFParser extends PDFObjectParser {
  /**
   * @param pdfBytes Raw bytes of the PDF file (wrapped with `ByteStream.of`).
   * @param objectsPerTick `shouldWaitForTick` reports `true` once every
   *   `objectsPerTick` parsed objects. With the default of `Infinity` it
   *   never does.
   * @param throwOnInvalidObject When `true`, an unparseable indirect object
   *   raises an `Error` instead of being recovered as a `PDFInvalidObject`.
   * @param warnOnInvalidObjects When `true`, recovery of an unparseable
   *   indirect object is reported through `console.warn`.
   * @param capNumbers Forwarded unchanged to the `PDFObjectParser`
   *   constructor.
   * @param cryptoFactory Forwarded unchanged to the `PDFObjectParser`
   *   constructor. The context is flagged as decrypted when it carries a
   *   truthy `encryptionKey`.
   */
  constructor(
    pdfBytes,
    objectsPerTick = Infinity,
    throwOnInvalidObject = false,
    warnOnInvalidObjects = false,
    capNumbers = false,
    cryptoFactory,
  ) {
    super(ByteStream.of(pdfBytes), PDFContext.create(), capNumbers, cryptoFactory);
    this.alreadyParsed = false;
    this.parsedObjects = 0;
    // Counts one parsed object per call and reports whether the parser is due
    // to yield. Kept as an arrow function because it is also handed to
    // PDFObjectStreamParser as a bare callback.
    this.shouldWaitForTick = () => {
      this.parsedObjects += 1;
      return this.parsedObjects % this.objectsPerTick === 0;
    };
    this.objectsPerTick = objectsPerTick;
    this.throwOnInvalidObject = throwOnInvalidObject;
    this.warnOnInvalidObjects = warnOnInvalidObjects;
    this.context.isDecrypted = Boolean(cryptoFactory && cryptoFactory.encryptionKey);
  }

  /**
   * Parses the entire byte stream and resolves with the populated context.
   *
   * @returns A promise for `this.context`.
   * @throws ReparseError if this parser instance has already been used.
   * @throws StalledParserError if a section pass ends at the same offset as
   *   the previous one (the parser is no longer making progress).
   * @throws MissingPDFHeaderError if no PDF header can be found.
   */
  parseDocument() {
    return __awaiter(this, void 0, void 0, function* () {
      if (this.alreadyParsed) {
        throw new ReparseError('PDFParser', 'parseDocument');
      }
      this.alreadyParsed = true;

      this.context.header = this.parseHeader();

      let prevOffset;
      while (!this.bytes.done()) {
        yield this.parseDocumentSection();
        const offset = this.bytes.offset();
        // Guard against an infinite loop on input that no section can consume.
        if (offset === prevOffset) {
          throw new StalledParserError(this.bytes.position());
        }
        prevOffset = offset;
      }

      this.maybeRecoverRoot();

      // An object registered under "0 0 R" is dropped from the context.
      if (this.context.lookup(PDFRef.of(0))) {
        console.warn('Removing parsed object: 0 0 R');
        this.context.delete(PDFRef.of(0));
      }

      return this.context;
    });
  }

  /**
   * If the trailer's `Root` does not resolve to a catalog dictionary, scans
   * all indirect objects for one and points `trailerInfo.Root` at it. When
   * several catalogs exist, the last one enumerated wins.
   */
  maybeRecoverRoot() {
    const isValidCatalog = (obj) =>
      obj instanceof PDFDict &&
      obj.lookup(PDFName.of('Type')) === PDFName.of('Catalog');

    const catalog = this.context.lookup(this.context.trailerInfo.Root);
    if (isValidCatalog(catalog)) return;

    const indirectObjects = this.context.enumerateIndirectObjects();
    for (let idx = 0, len = indirectObjects.length; idx < len; idx++) {
      const [ref, object] = indirectObjects[idx];
      if (isValidCatalog(object)) {
        this.context.trailerInfo.Root = ref;
      }
    }
  }

  /**
   * Scans forward byte by byte until the header keyword is matched, then
   * reads the `major.minor` version that follows it.
   *
   * @returns The `PDFHeader` for the parsed version.
   * @throws MissingPDFHeaderError if the end of input is reached first.
   */
  parseHeader() {
    while (!this.bytes.done()) {
      if (this.matchKeyword(Keywords.header)) {
        const major = this.parseRawInt();
        this.bytes.assertNext(CharCodes.Period);
        const minor = this.parseRawInt();
        const header = PDFHeader.forVersion(major, minor);
        this.skipBinaryHeaderComment();
        return header;
      }
      this.bytes.next();
    }

    throw new MissingPDFHeaderError(this.bytes.position());
  }

  /**
   * Parses `<object number> <generation number> obj`.
   *
   * @returns The `PDFRef` identified by the header.
   * @throws MissingKeywordError if the `obj` keyword does not follow the two
   *   integers.
   */
  parseIndirectObjectHeader() {
    this.skipWhitespaceAndComments();
    const objectNumber = this.parseRawInt();

    this.skipWhitespaceAndComments();
    const generationNumber = this.parseRawInt();

    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.obj)) {
      throw new MissingKeywordError(this.bytes.position(), Keywords.obj);
    }

    return PDFRef.of(objectNumber, generationNumber);
  }

  /**
   * Tests whether an indirect object header starts at the current offset.
   * On success the header has been consumed; on failure the offset is
   * restored.
   *
   * @returns `true` if a header was parsed, `false` otherwise.
   */
  matchIndirectObjectHeader() {
    const initialOffset = this.bytes.offset();
    try {
      this.parseIndirectObjectHeader();
      return true;
    } catch (e) {
      // Any parse failure simply means "no header here".
      this.bytes.moveTo(initialOffset);
      return false;
    }
  }

  /**
   * Parses one indirect object and records it.
   *
   * Raw streams whose `Type` is `ObjStm` or `XRef` are not assigned to their
   * own ref; they are handed to the matching stream parser's
   * `parseIntoContext` instead. Every other object is assigned to its ref in
   * the context.
   *
   * @returns A promise for the object's `PDFRef`.
   */
  parseIndirectObject() {
    return __awaiter(this, void 0, void 0, function* () {
      const ref = this.parseIndirectObjectHeader();

      this.skipWhitespaceAndComments();
      const object = this.parseObject(ref);

      this.skipWhitespaceAndComments();
      // A missing `endobj` is tolerated on purpose rather than raising a
      // MissingKeywordError.
      // TODO: Log a warning if this fails...
      this.matchKeyword(Keywords.endobj);

      const streamType =
        object instanceof PDFRawStream
          ? object.dict.lookup(PDFName.of('Type'))
          : undefined;

      if (streamType === PDFName.of('ObjStm')) {
        yield PDFObjectStreamParser.forStream(object, this.shouldWaitForTick).parseIntoContext();
      } else if (streamType === PDFName.of('XRef')) {
        PDFXRefStreamParser.forStream(object).parseIntoContext();
      } else {
        this.context.assign(ref, object);
      }

      return ref;
    });
  }

  /**
   * Fallback for an indirect object whose body could not be parsed: the
   * bytes between the object header and the next `endobj` keyword are stored
   * verbatim as a `PDFInvalidObject`.
   *
   * @returns The `PDFRef` the invalid object was assigned to.
   * @throws Error if `throwOnInvalidObject` is set.
   * @throws PDFInvalidObjectParsingError if no `endobj` keyword is found
   *   before the end of input.
   */
  // TODO: Improve and clean this up
  tryToParseInvalidIndirectObject() {
    const startPos = this.bytes.position();

    const msg = `Trying to parse invalid object: ${JSON.stringify(startPos)}`;
    if (this.throwOnInvalidObject) throw new Error(msg);
    if (this.warnOnInvalidObjects) console.warn(msg);

    const ref = this.parseIndirectObjectHeader();
    if (this.warnOnInvalidObjects) console.warn(`Invalid object ref: ${ref}`);

    this.skipWhitespaceAndComments();
    const start = this.bytes.offset();

    // Scan forward one byte at a time for the closing `endobj`.
    let foundEndobj = false;
    while (!this.bytes.done()) {
      if (this.matchKeyword(Keywords.endobj)) {
        foundEndobj = true;
        break;
      }
      this.bytes.next();
    }
    if (!foundEndobj) throw new PDFInvalidObjectParsingError(startPos);

    // The keyword itself has been consumed, so exclude it from the slice.
    const end = this.bytes.offset() - Keywords.endobj.length;
    const object = PDFInvalidObject.of(this.bytes.slice(start, end));

    this.context.assign(ref, object);
    return ref;
  }

  /**
   * Parses consecutive indirect objects for as long as the next byte is a
   * digit. An object that fails to parse is retried from its start via
   * `tryToParseInvalidIndirectObject`.
   *
   * @returns A promise that resolves once the run of objects is consumed.
   */
  parseIndirectObjects() {
    return __awaiter(this, void 0, void 0, function* () {
      this.skipWhitespaceAndComments();

      while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {
        const initialOffset = this.bytes.offset();

        try {
          yield this.parseIndirectObject();
        } catch (e) {
          // TODO: Add tracing/logging mechanism to track when this happens!
          this.bytes.moveTo(initialOffset);
          this.tryToParseInvalidIndirectObject();
        }
        this.skipWhitespaceAndComments();

        // TODO: Can this be done only when needed, to avoid harming performance?
        this.skipJibberish();

        if (this.shouldWaitForTick()) yield waitForTick();
      }
    });
  }

  /**
   * Parses a classic cross reference table if the `xref` keyword is next.
   *
   * Each row is a pair of integers. A pair followed by `n` or `f` is an
   * entry (in-use or deleted respectively) for the current object number;
   * any other pair is a subsection header whose first integer resets the
   * current object number.
   *
   * @returns `undefined` if there is no `xref` keyword, an empty section if
   *   a row's second field is not an integer, otherwise the parsed
   *   `PDFCrossRefSection`.
   */
  maybeParseCrossRefSection() {
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.xref)) return;
    this.skipWhitespaceAndComments();

    let objectNumber = -1;
    const xref = PDFCrossRefSection.createEmpty();

    while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {
      const firstInt = this.parseRawInt();
      this.skipWhitespaceAndComments();

      // Malformed row: give up on the table and report it as empty.
      if (!IsDigit[this.bytes.peek()]) {
        return PDFCrossRefSection.createEmpty();
      }

      const secondInt = this.parseRawInt();
      this.skipWhitespaceAndComments();

      const byte = this.bytes.peek();
      if (byte === CharCodes.n || byte === CharCodes.f) {
        const ref = PDFRef.of(objectNumber, secondInt);
        // `next()` consumes the `n`/`f` marker and returns it.
        if (this.bytes.next() === CharCodes.n) {
          xref.addEntry(ref, firstInt);
        } else {
          // Deleted entries are only recorded in the section; the object is
          // deliberately not removed from the context.
          xref.addDeletedEntry(ref, firstInt);
        }
        objectNumber += 1;
      } else {
        objectNumber = firstInt;
      }
      this.skipWhitespaceAndComments();
    }

    return xref;
  }

  /**
   * Parses a trailer dictionary if the `trailer` keyword is next and merges
   * its `Root`, `Encrypt`, `Info` and `ID` entries into
   * `context.trailerInfo`. Entries missing from this trailer keep their
   * previously recorded value.
   */
  maybeParseTrailerDict() {
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.trailer)) return;
    this.skipWhitespaceAndComments();

    const dict = this.parseDict();

    const { context } = this;
    context.trailerInfo = {
      Root: dict.get(PDFName.of('Root')) || context.trailerInfo.Root,
      Encrypt: dict.get(PDFName.of('Encrypt')) || context.trailerInfo.Encrypt,
      Info: dict.get(PDFName.of('Info')) || context.trailerInfo.Info,
      ID: dict.get(PDFName.of('ID')) || context.trailerInfo.ID,
    };
  }

  /**
   * Parses `startxref <offset>` and up to two optional end-of-file markers
   * if the `startxref` keyword is next.
   *
   * @returns `undefined` if there is no `startxref` keyword, otherwise the
   *   `PDFTrailer` for the parsed offset.
   */
  maybeParseTrailer() {
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.startxref)) return;
    this.skipWhitespaceAndComments();

    const offset = this.parseRawInt();

    // Only plain whitespace is skipped before the first marker attempt;
    // the second attempt runs after whitespace and comments are skipped.
    this.skipWhitespace();
    this.matchKeyword(Keywords.eof);
    this.skipWhitespaceAndComments();
    this.matchKeyword(Keywords.eof);
    this.skipWhitespaceAndComments();

    return PDFTrailer.forLastCrossRefSectionOffset(offset);
  }

  /**
   * Parses one document section: a run of indirect objects followed by an
   * optional cross reference table, trailer dictionary and trailer.
   *
   * @returns A promise that resolves once the section has been consumed.
   */
  parseDocumentSection() {
    return __awaiter(this, void 0, void 0, function* () {
      yield this.parseIndirectObjects();
      this.maybeParseCrossRefSection();
      this.maybeParseTrailerDict();
      this.maybeParseTrailer();

      // TODO: Can this be done only when needed, to avoid harming performance?
      this.skipJibberish();
    });
  }

  /**
   * This operation is not necessary for valid PDF files. But some invalid PDFs
   * contain jibberish in between indirect objects. This method is designed to
   * skip past that jibberish, should it exist, until it reaches the next
   * indirect object header, an xref table section, or the file trailer.
   */
  skipJibberish() {
    this.skipWhitespaceAndComments();

    while (!this.bytes.done()) {
      const initialOffset = this.bytes.offset();
      const byte = this.bytes.peek();

      // Only bytes in the printable ASCII range (Space..Tilde) can start one
      // of the constructs being searched for.
      const isPrintableAscii = byte >= CharCodes.Space && byte <= CharCodes.Tilde;
      if (isPrintableAscii) {
        if (
          this.matchKeyword(Keywords.xref) ||
          this.matchKeyword(Keywords.trailer) ||
          this.matchKeyword(Keywords.startxref) ||
          this.matchIndirectObjectHeader()
        ) {
          // This was only a probe: rewind so the caller parses the construct.
          this.bytes.moveTo(initialOffset);
          break;
        }
      }

      this.bytes.next();
    }
  }

  /**
   * Skips the binary comment following a PDF header. The specification
   * defines this binary comment (section 7.5.2 File Header) as a sequence of 4
   * or more bytes that are 128 or greater, and which are preceded by a "%".
   *
   * This would imply that to strip out this binary comment, we could check for
   * a sequence of bytes starting with "%", and remove all subsequent bytes that
   * are 128 or greater. This works for many documents that properly comply with
   * the spec. But in the wild, there are PDFs that omit the leading "%", and
   * include bytes that are less than 128 (e.g. 0 or 1). So in order to parse
   * these headers correctly, we just throw out all bytes leading up to the
   * first indirect object header.
   */
  skipBinaryHeaderComment() {
    this.skipWhitespaceAndComments();
    try {
      // Probe for an object header and rewind if one is found.
      const initialOffset = this.bytes.offset();
      this.parseIndirectObjectHeader();
      this.bytes.moveTo(initialOffset);
    } catch (e) {
      this.bytes.next();
      this.skipWhitespaceAndComments();
    }
  }
}

/**
 * Factory equivalent to calling the constructor with the same arguments.
 */
PDFParser.forBytesWithOptions = (
  pdfBytes,
  objectsPerTick,
  throwOnInvalidObject,
  warnOnInvalidObjects,
  capNumbers,
  cryptoFactory,
) =>
  new PDFParser(
    pdfBytes,
    objectsPerTick,
    throwOnInvalidObject,
    warnOnInvalidObjects,
    capNumbers,
    cryptoFactory,
  );

export default PDFParser;
//# sourceMappingURL=PDFParser.js.map