@cantoo/pdf-lib
Version:
Create and modify PDF files with JavaScript
309 lines • 14.2 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const tslib_1 = require("tslib");
const PDFCrossRefSection_1 = tslib_1.__importDefault(require("../document/PDFCrossRefSection"));
const PDFHeader_1 = tslib_1.__importDefault(require("../document/PDFHeader"));
const PDFTrailer_1 = tslib_1.__importDefault(require("../document/PDFTrailer"));
const errors_1 = require("../errors");
const PDFDict_1 = tslib_1.__importDefault(require("../objects/PDFDict"));
const PDFInvalidObject_1 = tslib_1.__importDefault(require("../objects/PDFInvalidObject"));
const PDFName_1 = tslib_1.__importDefault(require("../objects/PDFName"));
const PDFRawStream_1 = tslib_1.__importDefault(require("../objects/PDFRawStream"));
const PDFRef_1 = tslib_1.__importDefault(require("../objects/PDFRef"));
const ByteStream_1 = tslib_1.__importDefault(require("./ByteStream"));
const PDFObjectParser_1 = tslib_1.__importDefault(require("./PDFObjectParser"));
const PDFObjectStreamParser_1 = tslib_1.__importDefault(require("./PDFObjectStreamParser"));
const PDFXRefStreamParser_1 = tslib_1.__importDefault(require("./PDFXRefStreamParser"));
const PDFContext_1 = tslib_1.__importDefault(require("../PDFContext"));
const CharCodes_1 = tslib_1.__importDefault(require("../syntax/CharCodes"));
const Keywords_1 = require("../syntax/Keywords");
const Numeric_1 = require("../syntax/Numeric");
const utils_1 = require("../../utils");
/**
 * Document-level PDF parser.
 *
 * Builds on `PDFObjectParser` (which supplies the object-level parsing helpers
 * used below, e.g. `parseObject`, `parseDict`, `parseRawInt`, `matchKeyword`)
 * and walks a whole PDF byte buffer: header, indirect objects, cross-reference
 * sections, trailer dictionaries and trailers. Parsed objects are registered in
 * `this.context`, which is what `parseDocument()` ultimately resolves to.
 *
 * The parser is deliberately tolerant of malformed files: missing `endobj`
 * keywords, unparsable objects and junk bytes between sections are skipped or
 * captured as `PDFInvalidObject` rather than aborting (unless
 * `throwOnInvalidObject` is set).
 */
class PDFParser extends PDFObjectParser_1.default {
    /**
     * @param pdfBytes Raw bytes of the PDF; wrapped with `ByteStream.of`.
     * @param objectsPerTick After this many parsed objects the parser yields
     *   via `waitForTick()`. The default `Infinity` never yields.
     * @param throwOnInvalidObject When true, an unparsable indirect object
     *   raises an `Error` instead of being captured as a `PDFInvalidObject`.
     * @param warnOnInvalidObjects When true, unparsable indirect objects are
     *   reported through `console.warn`.
     * @param capNumbers Forwarded unchanged to the `PDFObjectParser` constructor.
     * @param cryptoFactory Optional; forwarded to the `PDFObjectParser`
     *   constructor. Its `encryptionKey` (if truthy) marks the context as
     *   decrypted.
     */
    constructor(pdfBytes, objectsPerTick = Infinity, throwOnInvalidObject = false, warnOnInvalidObjects = false, capNumbers = false, cryptoFactory) {
        super(ByteStream_1.default.of(pdfBytes), PDFContext_1.default.create(), capNumbers, cryptoFactory);
        this.alreadyParsed = false;
        this.parsedObjects = 0;
        // Counts one parsed object per call and reports whether the running
        // count has reached a multiple of `objectsPerTick`.
        this.shouldWaitForTick = () => {
            this.parsedObjects += 1;
            return this.parsedObjects % this.objectsPerTick === 0;
        };
        this.objectsPerTick = objectsPerTick;
        this.throwOnInvalidObject = throwOnInvalidObject;
        this.warnOnInvalidObjects = warnOnInvalidObjects;
        this.context.isDecrypted = Boolean(cryptoFactory && cryptoFactory.encryptionKey);
    }

    /**
     * Parses the whole document and resolves to the populated context.
     *
     * @returns A promise resolving to `this.context`.
     * @throws ReparseError if called more than once on the same parser.
     * @throws StalledParserError if a full section pass consumes no bytes,
     *   which would otherwise loop forever.
     */
    parseDocument() {
        return tslib_1.__awaiter(this, void 0, void 0, function* () {
            if (this.alreadyParsed) {
                throw new errors_1.ReparseError('PDFParser', 'parseDocument');
            }
            this.alreadyParsed = true;
            this.context.header = this.parseHeader();
            let prevOffset;
            while (!this.bytes.done()) {
                yield this.parseDocumentSection();
                const offset = this.bytes.offset();
                if (offset === prevOffset) {
                    throw new errors_1.StalledParserError(this.bytes.position());
                }
                prevOffset = offset;
            }
            this.maybeRecoverRoot();
            // Drop anything that was registered under object number 0.
            if (this.context.lookup(PDFRef_1.default.of(0))) {
                console.warn('Removing parsed object: 0 0 R');
                this.context.delete(PDFRef_1.default.of(0));
            }
            return this.context;
        });
    }

    /**
     * If the trailer's `Root` does not resolve to a dictionary whose `Type` is
     * `Catalog`, scans every indirect object and points `trailerInfo.Root` at
     * a catalog dictionary instead. The scan does not stop at the first match,
     * so when several catalogs exist the last one enumerated wins.
     */
    maybeRecoverRoot() {
        const isValidCatalog = (obj) => obj instanceof PDFDict_1.default &&
            obj.lookup(PDFName_1.default.of('Type')) === PDFName_1.default.of('Catalog');
        const catalog = this.context.lookup(this.context.trailerInfo.Root);
        if (isValidCatalog(catalog)) {
            return;
        }
        const indirectObjects = this.context.enumerateIndirectObjects();
        for (let idx = 0, len = indirectObjects.length; idx < len; idx++) {
            const [ref, object] = indirectObjects[idx];
            if (isValidCatalog(object)) {
                this.context.trailerInfo.Root = ref;
            }
        }
    }

    /**
     * Advances byte by byte until the header keyword matches, then reads the
     * `major.minor` version that follows it.
     *
     * @returns The `PDFHeader` for the parsed version.
     * @throws MissingPDFHeaderError if the input ends before a header is found.
     */
    parseHeader() {
        while (!this.bytes.done()) {
            if (this.matchKeyword(Keywords_1.Keywords.header)) {
                const major = this.parseRawInt();
                this.bytes.assertNext(CharCodes_1.default.Period);
                const minor = this.parseRawInt();
                const header = PDFHeader_1.default.forVersion(major, minor);
                this.skipBinaryHeaderComment();
                return header;
            }
            this.bytes.next();
        }
        throw new errors_1.MissingPDFHeaderError(this.bytes.position());
    }

    /**
     * Parses `<objectNumber> <generationNumber> obj`, allowing whitespace and
     * comments before and between the tokens.
     *
     * @returns The `PDFRef` for the parsed object and generation numbers.
     * @throws MissingKeywordError if the `obj` keyword does not follow.
     */
    parseIndirectObjectHeader() {
        this.skipWhitespaceAndComments();
        const objectNumber = this.parseRawInt();
        this.skipWhitespaceAndComments();
        const generationNumber = this.parseRawInt();
        this.skipWhitespaceAndComments();
        if (!this.matchKeyword(Keywords_1.Keywords.obj)) {
            throw new errors_1.MissingKeywordError(this.bytes.position(), Keywords_1.Keywords.obj);
        }
        return PDFRef_1.default.of(objectNumber, generationNumber);
    }

    /**
     * Non-throwing probe for an indirect object header at the current position.
     *
     * @returns `true` if a header was parsed (the stream is left positioned
     *   after it); `false` if parsing threw, in which case the stream is
     *   rewound to where it started.
     */
    matchIndirectObjectHeader() {
        const initialOffset = this.bytes.offset();
        try {
            this.parseIndirectObjectHeader();
            return true;
        }
        catch (e) {
            // Any parse failure simply means "no header here".
            this.bytes.moveTo(initialOffset);
            return false;
        }
    }

    /**
     * Parses one indirect object (header, value, optional `endobj`).
     *
     * Raw streams typed `ObjStm` or `XRef` are handed to their dedicated
     * stream parsers instead of being assigned to the context under `ref`;
     * every other object is assigned to the context.
     *
     * @returns A promise resolving to the object's `PDFRef`.
     */
    parseIndirectObject() {
        return tslib_1.__awaiter(this, void 0, void 0, function* () {
            const ref = this.parseIndirectObjectHeader();
            this.skipWhitespaceAndComments();
            const object = this.parseObject(ref);
            this.skipWhitespaceAndComments();
            // A missing `endobj` is tolerated on purpose: the keyword is
            // consumed when present and silently ignored otherwise.
            // TODO: Log a warning if this fails...
            this.matchKeyword(Keywords_1.Keywords.endobj);
            const isStreamOfType = (typeName) => object instanceof PDFRawStream_1.default &&
                object.dict.lookup(PDFName_1.default.of('Type')) === PDFName_1.default.of(typeName);
            if (isStreamOfType('ObjStm')) {
                yield PDFObjectStreamParser_1.default.forStream(object, this.shouldWaitForTick).parseIntoContext();
            }
            else if (isStreamOfType('XRef')) {
                PDFXRefStreamParser_1.default.forStream(object).parseIntoContext();
            }
            else {
                this.context.assign(ref, object);
            }
            return ref;
        });
    }

    /**
     * Fallback for an indirect object that `parseIndirectObject` could not
     * parse: re-reads the header, then captures every byte up to the next
     * `endobj` keyword as a `PDFInvalidObject` assigned to the parsed ref.
     *
     * @returns The `PDFRef` of the captured object.
     * @throws Error if `throwOnInvalidObject` is set.
     * @throws PDFInvalidObjectParsingError if no `endobj` is found before the
     *   end of input.
     */
    // TODO: Improve and clean this up
    tryToParseInvalidIndirectObject() {
        const startPos = this.bytes.position();
        const msg = `Trying to parse invalid object: ${JSON.stringify(startPos)}`;
        if (this.throwOnInvalidObject) {
            throw new Error(msg);
        }
        if (this.warnOnInvalidObjects) {
            console.warn(msg);
        }
        const ref = this.parseIndirectObjectHeader();
        if (this.warnOnInvalidObjects) {
            console.warn(`Invalid object ref: ${ref}`);
        }
        this.skipWhitespaceAndComments();
        const start = this.bytes.offset();
        let foundEndobj = false;
        while (!this.bytes.done()) {
            if (this.matchKeyword(Keywords_1.Keywords.endobj)) {
                foundEndobj = true;
                break;
            }
            this.bytes.next();
        }
        if (!foundEndobj) {
            throw new errors_1.PDFInvalidObjectParsingError(startPos);
        }
        // The stream now sits just past `endobj`; exclude the keyword itself
        // from the captured bytes.
        const end = this.bytes.offset() - Keywords_1.Keywords.endobj.length;
        const object = PDFInvalidObject_1.default.of(this.bytes.slice(start, end));
        this.context.assign(ref, object);
        return ref;
    }

    /**
     * Parses consecutive indirect objects for as long as the next byte is a
     * digit. An object that fails to parse is retried from its starting offset
     * through `tryToParseInvalidIndirectObject`.
     *
     * @returns A promise that resolves once the run of objects is consumed.
     */
    parseIndirectObjects() {
        return tslib_1.__awaiter(this, void 0, void 0, function* () {
            this.skipWhitespaceAndComments();
            while (!this.bytes.done() && Numeric_1.IsDigit[this.bytes.peek()]) {
                const initialOffset = this.bytes.offset();
                try {
                    yield this.parseIndirectObject();
                }
                catch (e) {
                    // TODO: Add tracing/logging mechanism to track when this happens!
                    this.bytes.moveTo(initialOffset);
                    this.tryToParseInvalidIndirectObject();
                }
                this.skipWhitespaceAndComments();
                // TODO: Can this be done only when needed, to avoid harming performance?
                this.skipJibberish();
                if (this.shouldWaitForTick()) {
                    yield (0, utils_1.waitForTick)();
                }
            }
        });
    }

    /**
     * Parses a classic cross-reference table if the `xref` keyword is next.
     *
     * Each iteration reads two integers. When they are followed by `n` or `f`
     * they form an entry for the current object number (which then increments);
     * otherwise the first integer becomes the new current object number.
     *
     * @returns `undefined` when no `xref` keyword is present; an empty section
     *   when the first integer of a row is not followed by a digit; otherwise
     *   the populated `PDFCrossRefSection`.
     */
    maybeParseCrossRefSection() {
        this.skipWhitespaceAndComments();
        if (!this.matchKeyword(Keywords_1.Keywords.xref)) {
            return;
        }
        this.skipWhitespaceAndComments();
        let objectNumber = -1;
        const xref = PDFCrossRefSection_1.default.createEmpty();
        while (!this.bytes.done() && Numeric_1.IsDigit[this.bytes.peek()]) {
            const firstInt = this.parseRawInt();
            this.skipWhitespaceAndComments();
            // Bail out with an empty section if the second value is not an integer.
            if (!Numeric_1.IsDigit[this.bytes.peek()]) {
                return PDFCrossRefSection_1.default.createEmpty();
            }
            const secondInt = this.parseRawInt();
            this.skipWhitespaceAndComments();
            const byte = this.bytes.peek();
            if (byte === CharCodes_1.default.n || byte === CharCodes_1.default.f) {
                const ref = PDFRef_1.default.of(objectNumber, secondInt);
                // `next()` consumes the `n`/`f` marker while it is being tested.
                if (this.bytes.next() === CharCodes_1.default.n) {
                    xref.addEntry(ref, firstInt);
                }
                else {
                    xref.addDeletedEntry(ref, firstInt);
                }
                objectNumber += 1;
            }
            else {
                objectNumber = firstInt;
            }
            this.skipWhitespaceAndComments();
        }
        return xref;
    }

    /**
     * Parses a trailer dictionary if the `trailer` keyword is next and merges
     * its `Root`, `Encrypt`, `Info` and `ID` entries into
     * `context.trailerInfo`. An entry missing from this dictionary keeps the
     * value already recorded in the context.
     */
    maybeParseTrailerDict() {
        this.skipWhitespaceAndComments();
        if (!this.matchKeyword(Keywords_1.Keywords.trailer)) {
            return;
        }
        this.skipWhitespaceAndComments();
        const dict = this.parseDict();
        const { context } = this;
        context.trailerInfo = {
            Root: dict.get(PDFName_1.default.of('Root')) || context.trailerInfo.Root,
            Encrypt: dict.get(PDFName_1.default.of('Encrypt')) || context.trailerInfo.Encrypt,
            Info: dict.get(PDFName_1.default.of('Info')) || context.trailerInfo.Info,
            ID: dict.get(PDFName_1.default.of('ID')) || context.trailerInfo.ID,
        };
    }

    /**
     * Parses `startxref <offset>` plus an optional end-of-file marker if the
     * `startxref` keyword is next.
     *
     * @returns `undefined` when no `startxref` keyword is present; otherwise a
     *   `PDFTrailer` for the parsed offset.
     */
    maybeParseTrailer() {
        this.skipWhitespaceAndComments();
        if (!this.matchKeyword(Keywords_1.Keywords.startxref)) {
            return;
        }
        this.skipWhitespaceAndComments();
        const offset = this.parseRawInt();
        // The EOF keyword is attempted twice: once after skipping whitespace
        // only, and once more after skipping whitespace and comments. Both
        // attempts are optional; their results are ignored.
        this.skipWhitespace();
        this.matchKeyword(Keywords_1.Keywords.eof);
        this.skipWhitespaceAndComments();
        this.matchKeyword(Keywords_1.Keywords.eof);
        this.skipWhitespaceAndComments();
        return PDFTrailer_1.default.forLastCrossRefSectionOffset(offset);
    }

    /**
     * Parses one document section: a run of indirect objects followed by an
     * optional cross-reference table, trailer dictionary and trailer, then
     * skips any junk preceding the next section.
     *
     * @returns A promise that resolves once the section is consumed.
     */
    parseDocumentSection() {
        return tslib_1.__awaiter(this, void 0, void 0, function* () {
            yield this.parseIndirectObjects();
            this.maybeParseCrossRefSection();
            this.maybeParseTrailerDict();
            this.maybeParseTrailer();
            // TODO: Can this be done only when needed, to avoid harming performance?
            this.skipJibberish();
        });
    }

    /**
     * This operation is not necessary for valid PDF files. But some invalid PDFs
     * contain jibberish in between indirect objects. This method is designed to
     * skip past that jibberish, should it exist, until it reaches the next
     * indirect object header, an xref table section, or the file trailer.
     */
    skipJibberish() {
        this.skipWhitespaceAndComments();
        while (!this.bytes.done()) {
            const initialOffset = this.bytes.offset();
            const byte = this.bytes.peek();
            // Only bytes in the Space..Tilde range can begin one of the
            // structural tokens probed below.
            const isInSpaceToTildeRange = byte >= CharCodes_1.default.Space && byte <= CharCodes_1.default.Tilde;
            if (isInSpaceToTildeRange) {
                if (this.matchKeyword(Keywords_1.Keywords.xref) ||
                    this.matchKeyword(Keywords_1.Keywords.trailer) ||
                    this.matchKeyword(Keywords_1.Keywords.startxref) ||
                    this.matchIndirectObjectHeader()) {
                    // Rewind so the caller sees the token that was just probed.
                    this.bytes.moveTo(initialOffset);
                    break;
                }
            }
            this.bytes.next();
        }
    }

    /**
     * Skips the binary comment following a PDF header. The specification
     * defines this binary comment (section 7.5.2 File Header) as a sequence of 4
     * or more bytes that are 128 or greater, and which are preceded by a "%".
     *
     * This would imply that to strip out this binary comment, we could check for
     * a sequence of bytes starting with "%", and remove all subsequent bytes that
     * are 128 or greater. This works for many documents that properly comply with
     * the spec. But in the wild, there are PDFs that omit the leading "%", and
     * include bytes that are less than 128 (e.g. 0 or 1). So in order to parse
     * these headers correctly, we just throw out all bytes leading up to the
     * first indirect object header.
     *
     * Concretely: after skipping whitespace and comments, an indirect object
     * header is probed. If it parses, the stream is rewound to the start of
     * that header; if it throws, one more byte plus any following whitespace
     * and comments are skipped.
     */
    skipBinaryHeaderComment() {
        this.skipWhitespaceAndComments();
        try {
            const initialOffset = this.bytes.offset();
            this.parseIndirectObjectHeader();
            this.bytes.moveTo(initialOffset);
        }
        catch (e) {
            this.bytes.next();
            this.skipWhitespaceAndComments();
        }
    }
}
/**
 * Static factory: constructs a `PDFParser` from the same positional arguments
 * the constructor accepts, passing each one through unchanged (so the
 * constructor's own defaults apply to any argument left `undefined`).
 */
PDFParser.forBytesWithOptions = (pdfBytes, objectsPerTick, throwOnInvalidObject, warnOnInvalidObjects, capNumbers, cryptoFactory) => {
    const parser = new PDFParser(pdfBytes, objectsPerTick, throwOnInvalidObject, warnOnInvalidObjects, capNumbers, cryptoFactory);
    return parser;
};
// Expose the parser class as this CommonJS module's default export.
exports.default = PDFParser;
//# sourceMappingURL=PDFParser.js.map