UNPKG

pdf-lib

Version:

Create and modify PDF files with JavaScript

344 lines 16.2 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); var tslib_1 = require("tslib"); var PDFCrossRefSection_1 = tslib_1.__importDefault(require("../document/PDFCrossRefSection")); var PDFHeader_1 = tslib_1.__importDefault(require("../document/PDFHeader")); var PDFTrailer_1 = tslib_1.__importDefault(require("../document/PDFTrailer")); var errors_1 = require("../errors"); var PDFDict_1 = tslib_1.__importDefault(require("../objects/PDFDict")); var PDFInvalidObject_1 = tslib_1.__importDefault(require("../objects/PDFInvalidObject")); var PDFName_1 = tslib_1.__importDefault(require("../objects/PDFName")); var PDFRawStream_1 = tslib_1.__importDefault(require("../objects/PDFRawStream")); var PDFRef_1 = tslib_1.__importDefault(require("../objects/PDFRef")); var ByteStream_1 = tslib_1.__importDefault(require("./ByteStream")); var PDFObjectParser_1 = tslib_1.__importDefault(require("./PDFObjectParser")); var PDFObjectStreamParser_1 = tslib_1.__importDefault(require("./PDFObjectStreamParser")); var PDFXRefStreamParser_1 = tslib_1.__importDefault(require("./PDFXRefStreamParser")); var PDFContext_1 = tslib_1.__importDefault(require("../PDFContext")); var CharCodes_1 = tslib_1.__importDefault(require("../syntax/CharCodes")); var Keywords_1 = require("../syntax/Keywords"); var Numeric_1 = require("../syntax/Numeric"); var utils_1 = require("../../utils"); var PDFParser = /** @class */ (function (_super) { tslib_1.__extends(PDFParser, _super); function PDFParser(pdfBytes, objectsPerTick) { if (objectsPerTick === void 0) { objectsPerTick = Infinity; } var _this = _super.call(this, ByteStream_1.default.of(pdfBytes), PDFContext_1.default.create()) || this; _this.alreadyParsed = false; _this.parsedObjects = 0; _this.shouldWaitForTick = function () { _this.parsedObjects += 1; return _this.parsedObjects % _this.objectsPerTick === 0; }; _this.objectsPerTick = objectsPerTick; return _this; } PDFParser.prototype.parseDocument = function () { return tslib_1.__awaiter(this, void 0, void 0, function () { var prevOffset, offset; return tslib_1.__generator(this, function (_a) { switch (_a.label) { case 0: if (this.alreadyParsed) { throw new errors_1.ReparseError('PDFParser', 'parseDocument'); } this.alreadyParsed = true; this.context.header = this.parseHeader(); _a.label = 1; case 1: if (!!this.bytes.done()) return [3 /*break*/, 3]; return [4 /*yield*/, this.parseDocumentSection()]; case 2: _a.sent(); offset = this.bytes.offset(); if (offset === prevOffset) { throw new errors_1.StalledParserError(this.bytes.position()); } prevOffset = offset; return [3 /*break*/, 1]; case 3: this.maybeRecoverRoot(); return [2 /*return*/, this.context]; } }); }); }; PDFParser.prototype.maybeRecoverRoot = function () { var isValidCatalog = function (obj) { return obj instanceof PDFDict_1.default && obj.lookup(PDFName_1.default.of('Type')) === PDFName_1.default.of('Catalog'); }; var catalog = this.context.lookup(this.context.trailerInfo.Root); if (!isValidCatalog(catalog)) { var indirectObjects = this.context.enumerateIndirectObjects(); for (var idx = 0, len = indirectObjects.length; idx < len; idx++) { var _a = indirectObjects[idx], ref = _a[0], object = _a[1]; if (isValidCatalog(object)) { this.context.trailerInfo.Root = ref; } } } }; PDFParser.prototype.parseHeader = function () { while (!this.bytes.done()) { if (this.matchKeyword(Keywords_1.Keywords.header)) { var major = this.parseRawInt(); this.bytes.assertNext(CharCodes_1.default.Period); var minor = this.parseRawInt(); var header = PDFHeader_1.default.forVersion(major, minor); this.skipBinaryHeaderComment(); return header; } this.bytes.next(); } throw new errors_1.MissingPDFHeaderError(this.bytes.position()); }; PDFParser.prototype.parseIndirectObjectHeader = function () { this.skipWhitespaceAndComments(); var objectNumber = this.parseRawInt(); this.skipWhitespaceAndComments(); var generationNumber = this.parseRawInt(); this.skipWhitespaceAndComments(); if (!this.matchKeyword(Keywords_1.Keywords.obj)) { throw new errors_1.MissingKeywordError(this.bytes.position(), Keywords_1.Keywords.obj); } return PDFRef_1.default.of(objectNumber, generationNumber); }; PDFParser.prototype.matchIndirectObjectHeader = function () { var initialOffset = this.bytes.offset(); try { this.parseIndirectObjectHeader(); return true; } catch (e) { this.bytes.moveTo(initialOffset); return false; } }; PDFParser.prototype.parseIndirectObject = function () { return tslib_1.__awaiter(this, void 0, void 0, function () { var ref, object; return tslib_1.__generator(this, function (_a) { switch (_a.label) { case 0: ref = this.parseIndirectObjectHeader(); this.skipWhitespaceAndComments(); object = this.parseObject(); this.skipWhitespaceAndComments(); // if (!this.matchKeyword(Keywords.endobj)) { // throw new MissingKeywordError(this.bytes.position(), Keywords.endobj); // } // TODO: Log a warning if this fails... this.matchKeyword(Keywords_1.Keywords.endobj); if (!(object instanceof PDFRawStream_1.default && object.dict.lookup(PDFName_1.default.of('Type')) === PDFName_1.default.of('ObjStm'))) return [3 /*break*/, 2]; return [4 /*yield*/, PDFObjectStreamParser_1.default.forStream(object, this.shouldWaitForTick).parseIntoContext()]; case 1: _a.sent(); return [3 /*break*/, 3]; case 2: if (object instanceof PDFRawStream_1.default && object.dict.lookup(PDFName_1.default.of('Type')) === PDFName_1.default.of('XRef')) { PDFXRefStreamParser_1.default.forStream(object).parseIntoContext(); } else { this.context.assign(ref, object); } _a.label = 3; case 3: return [2 /*return*/, ref]; } }); }); }; // TODO: Improve and clean this up PDFParser.prototype.tryToParseInvalidIndirectObject = function () { var startPos = this.bytes.position(); console.warn("Trying to parse invalid object: " + JSON.stringify(startPos) + ")"); var ref = this.parseIndirectObjectHeader(); console.warn("Invalid object ref: " + ref); this.skipWhitespaceAndComments(); var start = this.bytes.offset(); var failed = true; while (!this.bytes.done()) { if (this.matchKeyword(Keywords_1.Keywords.endobj)) { failed = false; } if (!failed) break; this.bytes.next(); } if (failed) throw new errors_1.PDFInvalidObjectParsingError(startPos); var end = this.bytes.offset() - Keywords_1.Keywords.endobj.length; var object = PDFInvalidObject_1.default.of(this.bytes.slice(start, end)); this.context.assign(ref, object); return ref; }; PDFParser.prototype.parseIndirectObjects = function () { return tslib_1.__awaiter(this, void 0, void 0, function () { var initialOffset, e_1; return tslib_1.__generator(this, function (_a) { switch (_a.label) { case 0: this.skipWhitespaceAndComments(); _a.label = 1; case 1: if (!(!this.bytes.done() && Numeric_1.IsDigit[this.bytes.peek()])) return [3 /*break*/, 8]; initialOffset = this.bytes.offset(); _a.label = 2; case 2: _a.trys.push([2, 4, , 5]); return [4 /*yield*/, this.parseIndirectObject()]; case 3: _a.sent(); return [3 /*break*/, 5]; case 4: e_1 = _a.sent(); // TODO: Add tracing/logging mechanism to track when this happens! this.bytes.moveTo(initialOffset); this.tryToParseInvalidIndirectObject(); return [3 /*break*/, 5]; case 5: this.skipWhitespaceAndComments(); // TODO: Can this be done only when needed, to avoid harming performance? this.skipJibberish(); if (!this.shouldWaitForTick()) return [3 /*break*/, 7]; return [4 /*yield*/, utils_1.waitForTick()]; case 6: _a.sent(); _a.label = 7; case 7: return [3 /*break*/, 1]; case 8: return [2 /*return*/]; } }); }); }; PDFParser.prototype.maybeParseCrossRefSection = function () { this.skipWhitespaceAndComments(); if (!this.matchKeyword(Keywords_1.Keywords.xref)) return; this.skipWhitespaceAndComments(); var objectNumber = -1; var xref = PDFCrossRefSection_1.default.createEmpty(); while (!this.bytes.done() && Numeric_1.IsDigit[this.bytes.peek()]) { var firstInt = this.parseRawInt(); this.skipWhitespaceAndComments(); var secondInt = this.parseRawInt(); this.skipWhitespaceAndComments(); var byte = this.bytes.peek(); if (byte === CharCodes_1.default.n || byte === CharCodes_1.default.f) { var ref = PDFRef_1.default.of(objectNumber, secondInt); if (this.bytes.next() === CharCodes_1.default.n) { xref.addEntry(ref, firstInt); } else { // this.context.delete(ref); xref.addDeletedEntry(ref, firstInt); } objectNumber += 1; } else { objectNumber = firstInt; } this.skipWhitespaceAndComments(); } return xref; }; PDFParser.prototype.maybeParseTrailerDict = function () { this.skipWhitespaceAndComments(); if (!this.matchKeyword(Keywords_1.Keywords.trailer)) return; this.skipWhitespaceAndComments(); var dict = this.parseDict(); var context = this.context; context.trailerInfo = { Root: dict.get(PDFName_1.default.of('Root')) || context.trailerInfo.Root, Encrypt: dict.get(PDFName_1.default.of('Encrypt')) || context.trailerInfo.Encrypt, Info: dict.get(PDFName_1.default.of('Info')) || context.trailerInfo.Info, ID: dict.get(PDFName_1.default.of('ID')) || context.trailerInfo.ID, }; }; PDFParser.prototype.maybeParseTrailer = function () { this.skipWhitespaceAndComments(); if (!this.matchKeyword(Keywords_1.Keywords.startxref)) return; this.skipWhitespaceAndComments(); var offset = this.parseRawInt(); this.skipWhitespace(); this.matchKeyword(Keywords_1.Keywords.eof); this.skipWhitespaceAndComments(); this.matchKeyword(Keywords_1.Keywords.eof); this.skipWhitespaceAndComments(); return PDFTrailer_1.default.forLastCrossRefSectionOffset(offset); }; PDFParser.prototype.parseDocumentSection = function () { return tslib_1.__awaiter(this, void 0, void 0, function () { return tslib_1.__generator(this, function (_a) { switch (_a.label) { case 0: return [4 /*yield*/, this.parseIndirectObjects()]; case 1: _a.sent(); this.maybeParseCrossRefSection(); this.maybeParseTrailerDict(); this.maybeParseTrailer(); // TODO: Can this be done only when needed, to avoid harming performance? this.skipJibberish(); return [2 /*return*/]; } }); }); }; /** * This operation is not necessary for valid PDF files. But some invalid PDFs * contain jibberish in between indirect objects. This method is designed to * skip past that jibberish, should it exist, until it reaches the next * indirect object header, an xref table section, or the file trailer. */ PDFParser.prototype.skipJibberish = function () { this.skipWhitespaceAndComments(); while (!this.bytes.done()) { var initialOffset = this.bytes.offset(); var byte = this.bytes.peek(); var isAlphaNumeric = byte >= CharCodes_1.default.Space && byte <= CharCodes_1.default.Tilde; if (isAlphaNumeric) { if (this.matchKeyword(Keywords_1.Keywords.xref) || this.matchKeyword(Keywords_1.Keywords.trailer) || this.matchKeyword(Keywords_1.Keywords.startxref) || this.matchIndirectObjectHeader()) { this.bytes.moveTo(initialOffset); break; } } this.bytes.next(); } }; /** * Skips the binary comment following a PDF header. The specification * defines this binary comment (section 7.5.2 File Header) as a sequence of 4 * or more bytes that are 128 or greater, and which are preceded by a "%". * * This would imply that to strip out this binary comment, we could check for * a sequence of bytes starting with "%", and remove all subsequent bytes that * are 128 or greater. This works for many documents that properly comply with * the spec. But in the wild, there are PDFs that omit the leading "%", and * include bytes that are less than 128 (e.g. 0 or 1). So in order to parse * these headers correctly, we just throw out all bytes leading up to the * first indirect object header. */ PDFParser.prototype.skipBinaryHeaderComment = function () { this.skipWhitespaceAndComments(); try { var initialOffset = this.bytes.offset(); this.parseIndirectObjectHeader(); this.bytes.moveTo(initialOffset); } catch (e) { this.skipLine(); this.skipWhitespaceAndComments(); } }; PDFParser.forBytesWithOptions = function (pdfBytes, objectsPerTick) { return new PDFParser(pdfBytes, objectsPerTick); }; return PDFParser; }(PDFObjectParser_1.default)); exports.default = PDFParser; //# sourceMappingURL=PDFParser.js.map