UNPKG

pdf-lib

Version:

Create and modify PDF files with JavaScript

354 lines 15.9 kB
import { __awaiter, __extends, __generator } from "tslib";
import PDFCrossRefSection from "../document/PDFCrossRefSection";
import PDFHeader from "../document/PDFHeader";
import PDFTrailer from "../document/PDFTrailer";
import {
  MissingKeywordError,
  MissingPDFHeaderError,
  PDFInvalidObjectParsingError,
  ReparseError,
  StalledParserError,
} from "../errors";
import PDFDict from "../objects/PDFDict";
import PDFInvalidObject from "../objects/PDFInvalidObject";
import PDFName from "../objects/PDFName";
import PDFRawStream from "../objects/PDFRawStream";
import PDFRef from "../objects/PDFRef";
import ByteStream from "./ByteStream";
import PDFObjectParser from "./PDFObjectParser";
import PDFObjectStreamParser from "./PDFObjectStreamParser";
import PDFXRefStreamParser from "./PDFXRefStreamParser";
import PDFContext from "../PDFContext";
import CharCodes from "../syntax/CharCodes";
import { Keywords } from "../syntax/Keywords";
import { IsDigit } from "../syntax/Numeric";
import { waitForTick } from "../../utils";

/**
 * Parser that walks the bytes of a whole PDF file and registers every
 * indirect object it finds in a freshly created PDFContext.
 *
 * The async methods below are compiler-emitted `__awaiter`/`__generator`
 * state machines. Inside them, `return [4, p]` awaits `p`, `return [3, n]`
 * jumps to `case n`, and `return [2, v]` resolves the method with `v`
 * (see the inline yield/break/return markers).
 */
var PDFParser = /** @class */ (function (_super) {
  __extends(PDFParser, _super);

  /**
   * @param pdfBytes Bytes of the PDF file; wrapped with `ByteStream.of`.
   * @param objectsPerTick Number of parsed objects after which
   *   `shouldWaitForTick` reports true. Defaults to `Infinity`.
   * @param throwOnInvalidObject When true, an unparseable indirect object
   *   raises an `Error` instead of being recovered. Defaults to `false`.
   * @param capNumbers Forwarded unchanged to the PDFObjectParser
   *   constructor. Defaults to `false`.
   */
  function PDFParser(pdfBytes, objectsPerTick, throwOnInvalidObject, capNumbers) {
    if (objectsPerTick === void 0) { objectsPerTick = Infinity; }
    if (throwOnInvalidObject === void 0) { throwOnInvalidObject = false; }
    if (capNumbers === void 0) { capNumbers = false; }
    var _this = _super.call(this, ByteStream.of(pdfBytes), PDFContext.create(), capNumbers) || this;
    _this.alreadyParsed = false;
    _this.parsedObjects = 0;
    // Counts one object per call and reports whether the running total is a
    // multiple of `objectsPerTick`. Defined as a closure over `_this` because
    // it is also handed to PDFObjectStreamParser as a bare callback.
    _this.shouldWaitForTick = function () {
      _this.parsedObjects += 1;
      return _this.parsedObjects % _this.objectsPerTick === 0;
    };
    _this.objectsPerTick = objectsPerTick;
    _this.throwOnInvalidObject = throwOnInvalidObject;
    return _this;
  }

  /**
   * Parses the header and then document sections until the byte stream is
   * exhausted.
   *
   * @returns Promise resolving to `this.context`.
   * @throws ReparseError if this method was already called on this instance.
   * @throws StalledParserError if a section pass ends at the same offset as
   *   the previous pass.
   */
  PDFParser.prototype.parseDocument = function () {
    return __awaiter(this, void 0, void 0, function () {
      var prevOffset, offset;
      return __generator(this, function (_a) {
        switch (_a.label) {
          case 0:
            if (this.alreadyParsed) {
              throw new ReparseError('PDFParser', 'parseDocument');
            }
            this.alreadyParsed = true;
            this.context.header = this.parseHeader();
            _a.label = 1;
          case 1:
            // Loop head: leave the loop once every byte has been consumed.
            if (!!this.bytes.done()) return [3 /*break*/, 3];
            return [4 /*yield*/, this.parseDocumentSection()];
          case 2:
            _a.sent();
            offset = this.bytes.offset();
            // No progress since the previous pass: bail out instead of
            // looping forever.
            if (offset === prevOffset) {
              throw new StalledParserError(this.bytes.position());
            }
            prevOffset = offset;
            return [3 /*break*/, 1];
          case 3:
            this.maybeRecoverRoot();
            // Drop anything that was registered under object number 0.
            if (this.context.lookup(PDFRef.of(0))) {
              console.warn('Removing parsed object: 0 0 R');
              this.context.delete(PDFRef.of(0));
            }
            return [2 /*return*/, this.context];
        }
      });
    });
  };

  /**
   * If the trailer's Root does not resolve to a dictionary whose Type is
   * Catalog, scans all indirect objects and points `trailerInfo.Root` at a
   * matching one. The scan does not stop early, so the last match wins.
   */
  PDFParser.prototype.maybeRecoverRoot = function () {
    var isValidCatalog = function (obj) {
      return obj instanceof PDFDict &&
        obj.lookup(PDFName.of('Type')) === PDFName.of('Catalog');
    };
    var catalog = this.context.lookup(this.context.trailerInfo.Root);
    if (!isValidCatalog(catalog)) {
      var indirectObjects = this.context.enumerateIndirectObjects();
      for (var idx = 0, len = indirectObjects.length; idx < len; idx++) {
        var _a = indirectObjects[idx], ref = _a[0], object = _a[1];
        if (isValidCatalog(object)) {
          this.context.trailerInfo.Root = ref;
        }
      }
    }
  };

  /**
   * Advances byte by byte until the header keyword matches, then reads the
   * `major.minor` version and skips the binary comment that may follow.
   *
   * @returns PDFHeader for the parsed version.
   * @throws MissingPDFHeaderError if the stream ends before a header is found.
   */
  PDFParser.prototype.parseHeader = function () {
    while (!this.bytes.done()) {
      if (this.matchKeyword(Keywords.header)) {
        var major = this.parseRawInt();
        this.bytes.assertNext(CharCodes.Period);
        var minor = this.parseRawInt();
        var header = PDFHeader.forVersion(major, minor);
        this.skipBinaryHeaderComment();
        return header;
      }
      this.bytes.next();
    }
    throw new MissingPDFHeaderError(this.bytes.position());
  };

  /**
   * Parses `<objectNumber> <generationNumber> obj`.
   *
   * @returns PDFRef built from the two integers.
   * @throws MissingKeywordError if the `obj` keyword does not follow.
   */
  PDFParser.prototype.parseIndirectObjectHeader = function () {
    this.skipWhitespaceAndComments();
    var objectNumber = this.parseRawInt();
    this.skipWhitespaceAndComments();
    var generationNumber = this.parseRawInt();
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.obj)) {
      throw new MissingKeywordError(this.bytes.position(), Keywords.obj);
    }
    return PDFRef.of(objectNumber, generationNumber);
  };

  /**
   * Non-throwing probe for an indirect object header.
   *
   * @returns true (header consumed) when one parses at the current offset;
   *   false (offset restored) when parsing it throws.
   */
  PDFParser.prototype.matchIndirectObjectHeader = function () {
    var initialOffset = this.bytes.offset();
    try {
      this.parseIndirectObjectHeader();
      return true;
    }
    catch (e) {
      this.bytes.moveTo(initialOffset);
      return false;
    }
  };

  /**
   * Parses one indirect object. Raw streams typed ObjStm or XRef are handed
   * to their dedicated stream parsers; anything else is assigned to the
   * context under its ref.
   *
   * @returns Promise resolving to the object's PDFRef.
   */
  PDFParser.prototype.parseIndirectObject = function () {
    return __awaiter(this, void 0, void 0, function () {
      var ref, object;
      return __generator(this, function (_a) {
        switch (_a.label) {
          case 0:
            ref = this.parseIndirectObjectHeader();
            this.skipWhitespaceAndComments();
            object = this.parseObject();
            this.skipWhitespaceAndComments();
            // A missing `endobj` is deliberately tolerated: the result of
            // matchKeyword is ignored rather than raising
            // MissingKeywordError.
            // TODO: Log a warning if this fails...
            this.matchKeyword(Keywords.endobj);
            if (!(object instanceof PDFRawStream &&
              object.dict.lookup(PDFName.of('Type')) === PDFName.of('ObjStm'))) return [3 /*break*/, 2];
            return [4 /*yield*/, PDFObjectStreamParser.forStream(object, this.shouldWaitForTick).parseIntoContext()];
          case 1:
            _a.sent();
            return [3 /*break*/, 3];
          case 2:
            if (object instanceof PDFRawStream &&
              object.dict.lookup(PDFName.of('Type')) === PDFName.of('XRef')) {
              PDFXRefStreamParser.forStream(object).parseIntoContext();
            }
            else {
              this.context.assign(ref, object);
            }
            _a.label = 3;
          case 3: return [2 /*return*/, ref];
        }
      });
    });
  };

  // TODO: Improve and clean this up
  /**
   * Fallback for an indirect object that failed to parse: captures the raw
   * bytes between the object header and the next `endobj` keyword and
   * registers them as a PDFInvalidObject.
   *
   * @returns PDFRef of the recovered object.
   * @throws Error if `throwOnInvalidObject` is set.
   * @throws PDFInvalidObjectParsingError if no `endobj` is found before the
   *   end of the stream.
   */
  PDFParser.prototype.tryToParseInvalidIndirectObject = function () {
    var startPos = this.bytes.position();
    // The message text (including its unbalanced closing parenthesis) is
    // kept verbatim, since callers may match on it.
    var msg = "Trying to parse invalid object: " + JSON.stringify(startPos) + ")";
    if (this.throwOnInvalidObject) throw new Error(msg);
    console.warn(msg);

    var ref = this.parseIndirectObjectHeader();
    console.warn("Invalid object ref: " + ref);
    this.skipWhitespaceAndComments();

    var start = this.bytes.offset();
    var foundEndobj = false;
    while (!this.bytes.done()) {
      if (this.matchKeyword(Keywords.endobj)) {
        foundEndobj = true;
        break;
      }
      this.bytes.next();
    }
    if (!foundEndobj) throw new PDFInvalidObjectParsingError(startPos);

    // matchKeyword consumed the keyword, so step back over it to find the
    // end of the object's own bytes.
    var end = this.bytes.offset() - Keywords.endobj.length;
    var object = PDFInvalidObject.of(this.bytes.slice(start, end));
    this.context.assign(ref, object);
    return ref;
  };

  /**
   * Parses consecutive indirect objects for as long as the next byte is a
   * digit. An object that throws while parsing is retried from its starting
   * offset through tryToParseInvalidIndirectObject.
   *
   * @returns Promise resolving once no further object header candidate
   *   follows.
   */
  PDFParser.prototype.parseIndirectObjects = function () {
    return __awaiter(this, void 0, void 0, function () {
      var initialOffset, e_1;
      return __generator(this, function (_a) {
        switch (_a.label) {
          case 0:
            this.skipWhitespaceAndComments();
            _a.label = 1;
          case 1:
            // Loop head.
            if (!(!this.bytes.done() && IsDigit[this.bytes.peek()])) return [3 /*break*/, 8];
            initialOffset = this.bytes.offset();
            _a.label = 2;
          case 2:
            // try { ... } spanning case 2-3, with its catch at case 4 and
            // the code after the try/catch at case 5.
            _a.trys.push([2, 4, , 5]);
            return [4 /*yield*/, this.parseIndirectObject()];
          case 3:
            _a.sent();
            return [3 /*break*/, 5];
          case 4:
            e_1 = _a.sent();
            // TODO: Add tracing/logging mechanism to track when this happens!
            this.bytes.moveTo(initialOffset);
            this.tryToParseInvalidIndirectObject();
            return [3 /*break*/, 5];
          case 5:
            this.skipWhitespaceAndComments();
            // TODO: Can this be done only when needed, to avoid harming performance?
            this.skipJibberish();
            if (!this.shouldWaitForTick()) return [3 /*break*/, 7];
            return [4 /*yield*/, waitForTick()];
          case 6:
            _a.sent();
            _a.label = 7;
          case 7: return [3 /*break*/, 1];
          case 8: return [2 /*return*/];
        }
      });
    });
  };

  /**
   * Parses a classic cross-reference table if the `xref` keyword is next.
   *
   * Each row starts with two integers. When they are followed by `n` or `f`
   * the row is an entry for the current object number; otherwise the first
   * integer starts a new run of object numbers.
   *
   * @returns The populated PDFCrossRefSection, or undefined when no `xref`
   *   keyword is present.
   */
  PDFParser.prototype.maybeParseCrossRefSection = function () {
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.xref)) return;
    this.skipWhitespaceAndComments();

    var objectNumber = -1;
    var xref = PDFCrossRefSection.createEmpty();
    while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {
      var firstInt = this.parseRawInt();
      this.skipWhitespaceAndComments();
      var secondInt = this.parseRawInt();
      this.skipWhitespaceAndComments();

      var byte = this.bytes.peek();
      if (byte === CharCodes.n || byte === CharCodes.f) {
        var ref = PDFRef.of(objectNumber, secondInt);
        if (this.bytes.next() === CharCodes.n) {
          xref.addEntry(ref, firstInt);
        }
        else {
          // The entry is only recorded as deleted in the xref section; the
          // object itself is not removed from the context.
          xref.addDeletedEntry(ref, firstInt);
        }
        objectNumber += 1;
      }
      else {
        objectNumber = firstInt;
      }
      this.skipWhitespaceAndComments();
    }
    return xref;
  };

  /**
   * Parses a trailer dictionary if the `trailer` keyword is next and merges
   * its Root, Encrypt, Info and ID entries into `context.trailerInfo`. An
   * entry missing from this dictionary keeps its previously recorded value.
   */
  PDFParser.prototype.maybeParseTrailerDict = function () {
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.trailer)) return;
    this.skipWhitespaceAndComments();

    var dict = this.parseDict();
    var context = this.context;
    context.trailerInfo = {
      Root: dict.get(PDFName.of('Root')) || context.trailerInfo.Root,
      Encrypt: dict.get(PDFName.of('Encrypt')) || context.trailerInfo.Encrypt,
      Info: dict.get(PDFName.of('Info')) || context.trailerInfo.Info,
      ID: dict.get(PDFName.of('ID')) || context.trailerInfo.ID,
    };
  };

  /**
   * Parses `startxref <offset>` plus an optional end-of-file marker if the
   * `startxref` keyword is next.
   *
   * @returns PDFTrailer for the parsed offset, or undefined when no
   *   `startxref` keyword is present.
   */
  PDFParser.prototype.maybeParseTrailer = function () {
    this.skipWhitespaceAndComments();
    if (!this.matchKeyword(Keywords.startxref)) return;
    this.skipWhitespaceAndComments();

    var offset = this.parseRawInt();

    // The EOF keyword is attempted twice: once after skipping whitespace
    // only, and once more after skipping whitespace and comments. Both
    // attempts are best-effort; their results are ignored.
    this.skipWhitespace();
    this.matchKeyword(Keywords.eof);
    this.skipWhitespaceAndComments();
    this.matchKeyword(Keywords.eof);
    this.skipWhitespaceAndComments();

    return PDFTrailer.forLastCrossRefSectionOffset(offset);
  };

  /**
   * Parses one document section: a run of indirect objects followed by an
   * optional cross-reference table, trailer dictionary and trailer.
   *
   * @returns Promise resolving once the section has been consumed.
   */
  PDFParser.prototype.parseDocumentSection = function () {
    return __awaiter(this, void 0, void 0, function () {
      return __generator(this, function (_a) {
        switch (_a.label) {
          case 0: return [4 /*yield*/, this.parseIndirectObjects()];
          case 1:
            _a.sent();
            this.maybeParseCrossRefSection();
            this.maybeParseTrailerDict();
            this.maybeParseTrailer();
            // TODO: Can this be done only when needed, to avoid harming performance?
            this.skipJibberish();
            return [2 /*return*/];
        }
      });
    });
  };

  /**
   * This operation is not necessary for valid PDF files. But some invalid PDFs
   * contain jibberish in between indirect objects. This method is designed to
   * skip past that jibberish, should it exist, until it reaches the next
   * indirect object header, an xref table section, or the file trailer.
   */
  PDFParser.prototype.skipJibberish = function () {
    this.skipWhitespaceAndComments();
    while (!this.bytes.done()) {
      var initialOffset = this.bytes.offset();
      var byte = this.bytes.peek();
      // Only bytes in the CharCodes.Space..CharCodes.Tilde range can start
      // one of the constructs probed below, so other bytes are skipped
      // without probing.
      var isInSpaceToTildeRange = byte >= CharCodes.Space && byte <= CharCodes.Tilde;
      if (isInSpaceToTildeRange) {
        if (this.matchKeyword(Keywords.xref) ||
          this.matchKeyword(Keywords.trailer) ||
          this.matchKeyword(Keywords.startxref) ||
          this.matchIndirectObjectHeader()) {
          // The probe consumed the construct; rewind so the caller can
          // parse it from its first byte.
          this.bytes.moveTo(initialOffset);
          break;
        }
      }
      this.bytes.next();
    }
  };

  /**
   * Skips the binary comment following a PDF header. The specification
   * defines this binary comment (section 7.5.2 File Header) as a sequence of 4
   * or more bytes that are 128 or greater, and which are preceded by a "%".
   *
   * This would imply that to strip out this binary comment, we could check for
   * a sequence of bytes starting with "%", and remove all subsequent bytes that
   * are 128 or greater. This works for many documents that properly comply with
   * the spec. But in the wild, there are PDFs that omit the leading "%", and
   * include bytes that are less than 128 (e.g. 0 or 1). So in order to parse
   * these headers correctly, we just throw out all bytes leading up to the
   * first indirect object header.
   */
  PDFParser.prototype.skipBinaryHeaderComment = function () {
    this.skipWhitespaceAndComments();
    try {
      // Probe only: if an object header parses here, rewind to its start.
      var initialOffset = this.bytes.offset();
      this.parseIndirectObjectHeader();
      this.bytes.moveTo(initialOffset);
    }
    catch (e) {
      // No header at this position: step one byte forward from wherever the
      // failed probe left the stream, then skip whitespace and comments.
      this.bytes.next();
      this.skipWhitespaceAndComments();
    }
  };

  /**
   * Factory equivalent to `new PDFParser(...)` with the same arguments and
   * defaults.
   */
  PDFParser.forBytesWithOptions = function (pdfBytes, objectsPerTick, throwOnInvalidObject, capNumbers) {
    return new PDFParser(pdfBytes, objectsPerTick, throwOnInvalidObject, capNumbers);
  };

  return PDFParser;
}(PDFObjectParser));

export default PDFParser;
//# sourceMappingURL=PDFParser.js.map