UNPKG

pdf-lib

Version:

Library for creating and modifying PDF files in JavaScript

112 lines (111 loc) 5.51 kB
import { PDFName, PDFNumber, PDFRawStream, } from '../pdf-objects'; import { arrayIndexOf, arrayIndexOneOf, arrayToString, error, trimArrayAndRemoveComments, } from '../../utils'; import decodeStream from './encoding/decodeStream'; import parseObjectStream from './parseObjectStream'; /** * Accepts an array of bytes and a PDFDictionary as input. Checks to see if the * first characters in the trimmed input make up a PDF Stream. * * If so, the content of the stream is extracted into a subarray. A tuple * containing this content subarray and a subarray of the input with the bytes making * up the entire stream removed is returned. * * If not, null is returned. */ var parseStream = function (input, dict, parseHandlers) { if (parseHandlers === void 0) { parseHandlers = {}; } // Check that the next bytes comprise the beginning of a stream var trimmed = trimArrayAndRemoveComments(input); // The first two cases we check for are valid according to the PDF spec // ('stream\n' and 'stream\r\n') but the third ('stream\r') is not: // > The keyword stream that follows the stream dictionary shall be followed // > by an end-of-line marker consisting of either a CARRIAGE RETURN and a // > LINE FEED or just a LINE FEED, **and not by a CARRIAGE RETURN alone.** // However, some PDFs in the wild only use carriage returns, so we have to // check for them here in the third case. var startstreamIdx; if (arrayToString(trimmed, 0, 7) === 'stream\n') startstreamIdx = 7; else if (arrayToString(trimmed, 0, 8) === 'stream\r\n') startstreamIdx = 8; else if (arrayToString(trimmed, 0, 7) === 'stream\r') startstreamIdx = 7; if (!startstreamIdx) return undefined; /* ===================== Try to find the stream's end ===================== */ var Length = dict.getMaybe('Length'); var endstreamMatchTuple; var endstreamKeywords = ['\nendstream', '\rendstream', 'endstream']; // TODO: Enhance parser to support indirect Length references. Right now this // only works if the Length entry is a direct number. // Try to use the Length entry to find the end of the stream if (Length && Length instanceof PDFNumber) { var startAt = Length.number + startstreamIdx; var maybeTuple = arrayIndexOneOf(trimmed, endstreamKeywords, startAt); if (maybeTuple && maybeTuple[0] === startAt + 1) { endstreamMatchTuple = maybeTuple; } } // If the Length entry isn't present, is an indirect reference, or is // an invalid value, then we'll try to find the end of the stream by brute // force. We'll scan each byte from the start of the stream until we find // the `endstream` keyword. if (!endstreamMatchTuple) { var maybeTuple = arrayIndexOneOf(trimmed, endstreamKeywords, 0); if (maybeTuple) endstreamMatchTuple = maybeTuple; } /* ======================================================================== */ if (!endstreamMatchTuple) throw new Error('Invalid Stream!'); var endstreamIdx = endstreamMatchTuple[0], endstreamMatch = endstreamMatchTuple[1]; /* TODO: See if it makes sense to .slice() the stream contents, even though this would require more memory space. */ // Extract the stream content bytes var contents = trimmed.subarray(startstreamIdx, endstreamIdx); // Verify that the next characters denote the end of the stream var endobjIdx = arrayIndexOf(trimmed, 'endobj', endstreamIdx); if (arrayToString(trimmed, endstreamIdx, endobjIdx).trim() !== 'endstream') { error('Invalid Stream!'); } return [contents, trimmed.subarray(endstreamIdx + endstreamMatch.length)]; }; /** * Accepts an array of bytes and a PDFDictionary as input. Checks to see if the * first characters in the trimmed input make up a PDF Stream. * * If so, returns a tuple containing (1) a PDFObjectStream if it is an * Object Stream, otherwise a PDFStream and (2) a subarray of the input wih the * characters making up the parsed stream removed. The "onParseObjectStream" will * be called with the PDFObjectStream if it is an Object Stream. Otherwise * the "onParseStream" parse hander will be called. * * If not, null is returned. */ export default (function (input, dict, index, parseHandlers) { if (parseHandlers === void 0) { parseHandlers = {}; } // Parse the input bytes into the stream dictionary and content bytes var res = parseStream(input, dict, parseHandlers); if (!res) return undefined; var contents = res[0], remaining = res[1]; // If it's an Object Stream, parse it and return the indirect objects it contains if (dict.getMaybe('Type') === PDFName.from('ObjStm')) { if (dict.getMaybe('Filter') !== PDFName.from('FlateDecode')) { error("Cannot decode \"" + dict.get('Filter') + "\" Object Streams"); } var decoded = decodeStream(dict, contents); var objectStream = parseObjectStream(dict, decoded, index, parseHandlers); if (parseHandlers.onParseObjectStream) { parseHandlers.onParseObjectStream(objectStream); } return [objectStream, remaining]; } // Otherwise, return a PDFStream without parsing the content bytes var stream = PDFRawStream.from(dict, contents); if (parseHandlers.onParseStream) parseHandlers.onParseStream(stream); return [stream, remaining]; });