pdf-lib
Version:
Library for creating and modifying PDF files in JavaScript
77 lines (76 loc) • 3.25 kB
JavaScript
/* eslint-disable no-constant-condition */
import { error } from '../../utils';
import parseHeader from './parseHeader';
import parseIndirectObj from './parseIndirectObj';
import parseLinearization from './parseLinearization';
import { parseTrailer, parseTrailerWithoutDict } from './parseTrailer';
import parseXRefTable from './parseXRefTable';
/**
 * Consumes consecutive indirect objects from the front of `input`.
 *
 * `parseIndirectObj` is invoked repeatedly, each time on whatever bytes the
 * previous call left over, until it reports no match (a falsy result). The
 * `index` and `parseHandlers` arguments are forwarded to it unchanged on every
 * call.
 *
 * @param input bytes to parse
 * @param index object index forwarded to `parseIndirectObj`
 * @param parseHandlers parse handlers forwarded to `parseIndirectObj`
 * @returns the bytes remaining after the last successfully parsed indirect
 *          object, or `input` itself when nothing could be parsed
 */
var parseBodySection = function (input, index, parseHandlers) {
  var unparsed = input;
  for (
    var match = parseIndirectObj(unparsed, index, parseHandlers);
    match;
    match = parseIndirectObj(unparsed, index, parseHandlers)
  ) {
    // The second element of a successful match holds the leftover bytes.
    unparsed = match[1];
  }
  return unparsed;
};
/**
 * Parses a footer from the front of `input`: an optional xref table followed
 * by a trailer.
 *
 * The xref table is attempted first via `parseXRefTable`; if it does not
 * match, trailer parsing starts from `input` directly. The trailer is then
 * attempted with `parseTrailer` and, failing that, with
 * `parseTrailerWithoutDict`.
 *
 * @param input bytes to parse
 * @param index object index forwarded to the trailer parsers
 * @param parseHandlers parse handlers forwarded to every parser called here
 * @returns the bytes remaining after the trailer, or `undefined` when neither
 *          trailer parser matches
 */
var parseFooterSection = function (input, index, parseHandlers) {
  // Some PDFs omit the XRef table, so a miss here is not an error.
  var xrefMatch = parseXRefTable(input, parseHandlers);
  var afterXRef = xrefMatch ? xrefMatch[1] : input;

  // Some malformed documents are missing the trailer dictionary, so fall back
  // to the dictionary-less trailer parser when the regular one fails.
  var trailerMatch = parseTrailer(afterXRef, index, parseHandlers);
  if (!trailerMatch) {
    trailerMatch = parseTrailerWithoutDict(afterXRef, index, parseHandlers);
  }

  return trailerMatch ? trailerMatch[1] : undefined;
};
/**
 * Parses a whole PDF document front to back: header first, then an optional
 * linearization section, then alternating body and footer sections until no
 * further footer can be parsed.
 *
 * The XRef tables/streams in the input are not used to locate objects on
 * demand; the input is walked sequentially instead. Results are delivered
 * through the parsers this function calls (which receive `index` and
 * `parseHandlers`), not through a return value.
 *
 * @param input bytes comprising the PDF document
 * @param index object index forwarded to the section parsers
 * @param parseHandlers parse handlers forwarded to every parser called here
 * @returns nothing (`undefined`)
 */
var parseDocument = function (input, index, parseHandlers) {
  // A document without a header is reported through `error` (expected to
  // throw, so `headerMatch` is always a real match below).
  var headerMatch =
    parseHeader(input, parseHandlers) || error('PDF is missing a header');
  var unparsed = headerMatch[1];

  // A linearized document carries a linearization dictionary and first-page
  // XRef table/stream right after the header; skip past them when present.
  var linearization = parseLinearization(unparsed, index, parseHandlers);
  if (linearization) {
    unparsed = linearization[1];
  }

  // One body + footer pair per update section. `parseFooterSection` yields
  // `undefined` once no trailer can be found, which ends the loop.
  while (unparsed) {
    var afterBody = parseBodySection(unparsed, index, parseHandlers);
    unparsed = parseFooterSection(afterBody, index, parseHandlers);
  }
};
export default parseDocument;