UNPKG

pdf-lib

Version:

Library for creating and modifying PDF files in JavaScript

63 lines (62 loc) 2.82 kB
import { PDFHeader } from '../pdf-structures';
import { arrayToString, trimArray } from '../../utils';

// The single-character patterns below are hoisted to module scope so the scan
// loops do not rebuild a regex for every byte. None of them carries the `g` or
// `y` flag, so `.test()` keeps no state between calls and sharing them is safe.

/** Matches one character that is neither a digit nor a line terminator. */
var NON_DIGIT_CHAR = /^(?![\d])./;

/** Matches one character that can appear in a "%PDF-M.m" header or the whitespace around it. */
var HEADER_CHAR = /^[\0\t\n\f\r %PDF-\d.]/;

/** Matches a complete file header; captures the major and minor version numbers. */
var FILE_HEADER = /^[\0\t\n\f\r ]*%PDF-(\d+)\.(\d+)[\0\t\n\f\r ]*/;

/**
 * Accepts an array of bytes as input. Skips leading bytes until the first digit
 * ([0-9]) is reached and returns a subarray of the input starting at that
 * position. If the scan never stops, an empty subarray is returned.
 *
 * This allows us to remove the binary comment following a PDF header, before
 * proceeding to parse the rest of the document. The specification defines this
 * binary comment (section 7.5.2 File Header) as a sequence of 4 or more bytes
 * that are 128 or greater, and which are preceded by a "%".
 *
 * This would imply that to strip out this binary comment, we could check for a
 * sequence of bytes starting with "%", and remove all subsequent bytes that are
 * 128 or greater. This works for many documents that properly comply with the
 * spec. But in the wild, there are PDFs that omit the leading "%", and include
 * bytes that are less than 128 (e.g. 0 or 1). So in order to parse these
 * headers correctly, we throw out the bytes leading up to the first digit.
 * (we assume the first digit is the object number of the first indirect object)
 *
 * NOTE: the scan also stops at a line feed (10) or carriage return (13) byte,
 * because `.` in the pattern does not match line terminators. When such a byte
 * precedes the first digit, the returned subarray begins at that byte rather
 * than at the digit.
 *
 * @param input Typed array of bytes (it must provide `length` and `subarray`).
 * @returns A subarray of `input` with the skipped leading bytes removed.
 */
var stripBinaryComment = function (input) {
  var idx = 0;
  while (
    idx < input.length &&
    NON_DIGIT_CHAR.test(String.fromCharCode(input[idx]))
  ) {
    idx += 1;
  }
  return input.subarray(idx);
};

/**
 * Accepts an array of bytes as input. Checks to see if the first characters in
 * the trimmed input make up a PDF Header.
 *
 * If so, returns a tuple containing (1) an object representing the parsed PDF
 * Header and (2) a subarray of the input with the characters making up the
 * parsed header (and the binary comment after it) removed. The "onParseHeader"
 * parse handler, when provided, is also called with the PDFHeader object.
 *
 * If not, `undefined` is returned.
 *
 * @param input Typed array of bytes to parse.
 * @param _a Optional parse handlers object; only `onParseHeader` is read. An
 *           omitted or `null` value is treated as "no handlers".
 * @returns `[pdfHeader, remainingBytes]`, or `undefined` when the input does
 *          not start with a header.
 */
var parseHeader = function (input, _a) {
  // `== null` covers both `undefined` and `null`, so an explicit `null`
  // handlers argument no longer throws on the property access.
  var onParseHeader = _a == null ? undefined : _a.onParseHeader;

  // NOTE(review): trimArray presumably strips surrounding whitespace bytes;
  // confirm against '../../utils'.
  var trimmed = trimArray(input);

  // Search for the first character that isn't part of a header, so that only
  // this short prefix is converted to a string instead of the whole document.
  var idx = 0;
  while (
    idx < trimmed.length &&
    HEADER_CHAR.test(String.fromCharCode(trimmed[idx]))
  ) {
    idx += 1;
  }

  // Try to match the regex up to that character to see if we've got a header.
  var result = arrayToString(trimmed, 0, idx).match(FILE_HEADER);
  if (!result) return undefined;

  var fullMatch = result[0];
  var major = result[1];
  var minor = result[2];

  // The matched string's length is used as a byte offset; this assumes
  // arrayToString yields one character per byte -- TODO confirm.
  var withoutVersion = trimArray(trimmed.subarray(fullMatch.length));
  var returnArray = stripBinaryComment(withoutVersion);

  var pdfHeader = PDFHeader.forVersion(Number(major), Number(minor));
  if (onParseHeader) onParseHeader(pdfHeader);
  return [pdfHeader, returnArray];
};

export default parseHeader;