UNPKG

pdf-lib

Version:

Library for creating and modifying PDF files in JavaScript

77 lines (76 loc) 3.25 kB
import { error } from '../../utils';
import parseHeader from './parseHeader';
import parseIndirectObj from './parseIndirectObj';
import parseLinearization from './parseLinearization';
import { parseTrailer, parseTrailerWithoutDict } from './parseTrailer';
import parseXRefTable from './parseXRefTable';

/**
 * Accepts an array of bytes as input. Parses indirect objects from the input
 * bytes until one can no longer be parsed (i.e. until an xref table or trailer
 * is reached). The "onParseIndirectObj" parse handler is called with each
 * indirect object that is parsed.
 *
 * @param input Bytes to parse.
 * @param index Passed through unchanged to `parseIndirectObj`.
 * @param parseHandlers Parse handlers, passed through to `parseIndirectObj`.
 * @returns A subarray of the input bytes with the bytes making up the parsed
 *          indirect objects removed. If nothing could be parsed, `input`
 *          itself is returned.
 */
const parseBodySection = (input, index, parseHandlers) => {
  let remainder = input;

  // `parseIndirectObj` yields a falsy value once the bytes no longer start
  // with an indirect object; otherwise element [1] of its result holds the
  // bytes that remain after the parsed object.
  let result = parseIndirectObj(remainder, index, parseHandlers);
  while (result) {
    remainder = result[1];
    result = parseIndirectObj(remainder, index, parseHandlers);
  }

  return remainder;
};

/**
 * Accepts an array of bytes as input. Checks to see if the first characters in
 * the input make up an xref table followed by a trailer, or just a trailer.
 * The "onParseXRefTable" and "onParseTrailer" parse handlers will be called
 * with the parsed objects.
 *
 * @param input Bytes to parse.
 * @param index Passed through unchanged to the trailer parsers.
 * @param parseHandlers Parse handlers, passed through to the xref table and
 *        trailer parsers.
 * @returns A subarray of the input bytes with the bytes making up the parsed
 *          objects removed, or `undefined` if no trailer could be parsed.
 */
const parseFooterSection = (input, index, parseHandlers) => {
  let remainder = input;

  // Try to parse the XRef table (some PDFs omit the XRef table)
  const parsedXRef = parseXRefTable(input, parseHandlers);
  if (parsedXRef) remainder = parsedXRef[1];

  // Try to parse the trailer with and without dictionary, because some
  // malformatted documents are missing the dictionary.
  const parsedTrailer =
    parseTrailer(remainder, index, parseHandlers) ||
    parseTrailerWithoutDict(remainder, index, parseHandlers);
  if (!parsedTrailer) return undefined;

  return parsedTrailer[1];
};

/**
 * Accepts an array of bytes comprising a PDF document as input. Parses all the
 * objects in the file in a sequential fashion, beginning with the header and
 * ending with the last trailer.
 *
 * The XRef tables/streams in the input are not used to locate and parse objects
 * as needed. Rather, the whole document is parsed and stored in memory at once.
 *
 * Nothing is returned; parsed objects are reported through `parseHandlers`.
 *
 * @param input Bytes of the PDF document.
 * @param index Passed through unchanged to the individual parsers.
 * @param parseHandlers Parse handlers invoked by the individual parsers.
 */
const parseDocument = (input, index, parseHandlers) => {
  // Parse the document header. When no header is found, `error` is invoked
  // with the message below (presumably it throws - confirm in '../../utils').
  const parsedHeader =
    parseHeader(input, parseHandlers) || error('PDF is missing a header');
  let remainder = parsedHeader[1];

  // If document is linearized, we'll need to parse the linearization
  // dictionary and First-Page XRef table/stream next...
  const linearizationMatch = parseLinearization(remainder, index, parseHandlers);
  if (linearizationMatch) remainder = linearizationMatch[1];

  // Parse each body of the document and its corresponding footer.
  // (if document does not have update sections, loop will only occur once)
  // The loop ends when `parseFooterSection` finds no trailer and returns
  // `undefined`.
  while (remainder) {
    remainder = parseBodySection(remainder, index, parseHandlers);
    remainder = parseFooterSection(remainder, index, parseHandlers);
  }
};

export default parseDocument;