@cantoo/pdf-lib
Version:
Create and modify PDF files with JavaScript
238 lines • 10.1 kB
JavaScript
import { PDFObjectParsingError, PDFStreamParsingError, UnbalancedParenthesisError, } from '../errors.js';
import PDFArray from '../objects/PDFArray.js';
import PDFBool from '../objects/PDFBool.js';
import PDFDict from '../objects/PDFDict.js';
import PDFHexString from '../objects/PDFHexString.js';
import PDFName from '../objects/PDFName.js';
import PDFNull from '../objects/PDFNull.js';
import PDFNumber from '../objects/PDFNumber.js';
import PDFRawStream from '../objects/PDFRawStream.js';
import PDFRef from '../objects/PDFRef.js';
import PDFString from '../objects/PDFString.js';
import BaseParser from './BaseParser.js';
import ByteStream from './ByteStream.js';
import PDFCatalog from '../structures/PDFCatalog.js';
import PDFPageLeaf from '../structures/PDFPageLeaf.js';
import PDFPageTree from '../structures/PDFPageTree.js';
import CharCodes from '../syntax/CharCodes.js';
import { IsDelimiter } from '../syntax/Delimiters.js';
import { Keywords } from '../syntax/Keywords.js';
import { IsDigit, IsNumeric } from '../syntax/Numeric.js';
import { IsWhitespace } from '../syntax/Whitespace.js';
import { charFromCode } from '../../utils/index.js';
// TODO: Throw error if eof is reached before finishing object parse...
class PDFObjectParser extends BaseParser {
constructor(byteStream, context, capNumbers = false, cryptoFactory) {
super(byteStream, capNumbers);
this.context = context;
this.cryptoFactory = cryptoFactory;
}
// TODO: Is it possible to reduce duplicate parsing for ref lookaheads?
parseObject(ref) {
this.skipWhitespaceAndComments();
if (this.matchKeyword(Keywords.true))
return PDFBool.True;
if (this.matchKeyword(Keywords.false))
return PDFBool.False;
if (this.matchKeyword(Keywords.null))
return PDFNull;
const byte = this.bytes.peek();
if (byte === CharCodes.LessThan &&
this.bytes.peekAhead(1) === CharCodes.LessThan) {
return this.parseDictOrStream(ref);
}
if (byte === CharCodes.LessThan)
return this.parseHexString(ref);
if (byte === CharCodes.LeftParen)
return this.parseString(ref);
if (byte === CharCodes.ForwardSlash)
return this.parseName();
if (byte === CharCodes.LeftSquareBracket)
return this.parseArray(ref);
if (IsNumeric[byte])
return this.parseNumberOrRef();
throw new PDFObjectParsingError(this.bytes.position(), byte);
}
parseNumberOrRef() {
const firstNum = this.parseRawNumber();
this.skipWhitespaceAndComments();
const lookaheadStart = this.bytes.offset();
if (IsDigit[this.bytes.peek()]) {
const secondNum = this.parseRawNumber();
this.skipWhitespaceAndComments();
if (this.bytes.peek() === CharCodes.R) {
this.bytes.assertNext(CharCodes.R);
return PDFRef.of(firstNum, secondNum);
}
}
this.bytes.moveTo(lookaheadStart);
return PDFNumber.of(firstNum);
}
// TODO: Maybe update PDFHexString.of() logic to remove whitespace and validate input?
parseHexString(ref) {
let value = '';
this.bytes.assertNext(CharCodes.LessThan);
while (!this.bytes.done() && this.bytes.peek() !== CharCodes.GreaterThan) {
value += charFromCode(this.bytes.next());
}
this.bytes.assertNext(CharCodes.GreaterThan);
if (this.cryptoFactory && ref) {
const transformer = this.cryptoFactory.createCipherTransform(ref.objectNumber, ref.generationNumber);
const arr = transformer.decryptBytes(PDFHexString.of(value).asBytes());
value = arr.reduce((str, byte) => str + byte.toString(16).padStart(2, '0'), '');
}
return PDFHexString.of(value);
}
parseString(ref) {
let nestingLvl = 0;
let isEscaped = false;
let value = '';
while (!this.bytes.done()) {
const byte = this.bytes.next();
value += charFromCode(byte);
// Check for unescaped parenthesis
if (!isEscaped) {
if (byte === CharCodes.LeftParen)
nestingLvl += 1;
if (byte === CharCodes.RightParen)
nestingLvl -= 1;
}
// Track whether current character is being escaped or not
if (byte === CharCodes.BackSlash) {
isEscaped = !isEscaped;
}
else if (isEscaped) {
isEscaped = false;
}
// Once (if) the unescaped parenthesis balance out, return their contents
if (nestingLvl === 0) {
let actualValue = value.substring(1, value.length - 1);
if (this.cryptoFactory && ref) {
const transformer = this.cryptoFactory.createCipherTransform(ref.objectNumber, ref.generationNumber);
actualValue = transformer.decryptString(actualValue);
}
// Remove the outer parens so they aren't part of the contents
return PDFString.of(actualValue);
}
}
throw new UnbalancedParenthesisError(this.bytes.position());
}
// TODO: Compare performance of string concatenation to charFromCode(...bytes)
// TODO: Maybe preallocate small Uint8Array if can use charFromCode?
parseName() {
this.bytes.assertNext(CharCodes.ForwardSlash);
let name = '';
while (!this.bytes.done()) {
const byte = this.bytes.peek();
if (IsWhitespace[byte] || IsDelimiter[byte])
break;
name += charFromCode(byte);
this.bytes.next();
}
return PDFName.of(name);
}
parseArray(ref) {
this.bytes.assertNext(CharCodes.LeftSquareBracket);
this.skipWhitespaceAndComments();
const pdfArray = PDFArray.withContext(this.context);
while (this.bytes.peek() !== CharCodes.RightSquareBracket) {
const element = this.parseObject(ref);
pdfArray.push(element);
this.skipWhitespaceAndComments();
}
this.bytes.assertNext(CharCodes.RightSquareBracket);
return pdfArray;
}
parseDict(ref) {
this.bytes.assertNext(CharCodes.LessThan);
this.bytes.assertNext(CharCodes.LessThan);
this.skipWhitespaceAndComments();
const dict = new Map();
while (!this.bytes.done() &&
this.bytes.peek() !== CharCodes.GreaterThan &&
this.bytes.peekAhead(1) !== CharCodes.GreaterThan) {
const key = this.parseName();
const value = this.parseObject(ref);
dict.set(key, value);
this.skipWhitespaceAndComments();
}
this.skipWhitespaceAndComments();
this.bytes.assertNext(CharCodes.GreaterThan);
this.bytes.assertNext(CharCodes.GreaterThan);
const Type = dict.get(PDFName.of('Type'));
if (Type === PDFName.of('Catalog')) {
return PDFCatalog.fromMapWithContext(dict, this.context);
}
else if (Type === PDFName.of('Pages')) {
return PDFPageTree.fromMapWithContext(dict, this.context);
}
else if (Type === PDFName.of('Page')) {
return PDFPageLeaf.fromMapWithContext(dict, this.context);
}
else {
return PDFDict.fromMapWithContext(dict, this.context);
}
}
parseDictOrStream(ref) {
const startPos = this.bytes.position();
const dict = this.parseDict(ref);
this.skipWhitespaceAndComments();
if (!this.matchKeyword(Keywords.streamEOF1) &&
!this.matchKeyword(Keywords.streamEOF2) &&
!this.matchKeyword(Keywords.streamEOF3) &&
!this.matchKeyword(Keywords.streamEOF4) &&
!this.matchKeyword(Keywords.stream)) {
return dict;
}
const start = this.bytes.offset();
let end;
const Length = dict.get(PDFName.of('Length'));
if (Length instanceof PDFNumber) {
end = start + Length.asNumber();
this.bytes.moveTo(end);
this.skipWhitespaceAndComments();
if (!this.matchKeyword(Keywords.endstream)) {
this.bytes.moveTo(start);
end = this.findEndOfStreamFallback(startPos);
}
}
else {
end = this.findEndOfStreamFallback(startPos);
}
let contents = this.bytes.slice(start, end);
if (this.cryptoFactory && ref) {
const transform = this.cryptoFactory.createCipherTransform(ref.objectNumber, ref.generationNumber);
contents = transform.decryptBytes(contents);
}
return PDFRawStream.of(dict, contents);
}
findEndOfStreamFallback(startPos) {
// Move to end of stream, while handling nested streams
let nestingLvl = 1;
let end = this.bytes.offset();
while (!this.bytes.done()) {
end = this.bytes.offset();
if (this.matchKeyword(Keywords.stream)) {
nestingLvl += 1;
}
else if (this.matchKeyword(Keywords.EOF1endstream) ||
this.matchKeyword(Keywords.EOF2endstream) ||
this.matchKeyword(Keywords.EOF3endstream) ||
this.matchKeyword(Keywords.endstream)) {
nestingLvl -= 1;
}
else {
this.bytes.next();
}
if (nestingLvl === 0)
break;
}
if (nestingLvl !== 0)
throw new PDFStreamParsingError(startPos);
return end;
}
}
PDFObjectParser.forBytes = (bytes, context, capNumbers) => new PDFObjectParser(ByteStream.of(bytes), context, capNumbers);
PDFObjectParser.forByteStream = (byteStream, context, capNumbers = false) => new PDFObjectParser(byteStream, context, capNumbers);
export default PDFObjectParser;
//# sourceMappingURL=PDFObjectParser.js.map