UNPKG

@adobe/pdftools-extract-node-sdk

Version:

The Document Services PDF Tools Extract Node.js SDK provides APIs for extracting elements and renditions from PDF documents.

257 lines (224 loc) 7.68 kB
/*
 * Copyright 2019 Adobe
 * All Rights Reserved.
 *
 * NOTICE: Adobe permits you to use, modify, and distribute this file in
 * accordance with the terms of the Adobe license agreement accompanying
 * it. If you have received this file from a source other than Adobe,
 * then your use, modification, or distribution of it requires the prior
 * written permission of Adobe.
 */

const ExtractPDFService = require('../internal/api/extract-pdf-service');
const OperationMessage = require('../internal/cpf/operation-message');
const ExtensionMediaTypeMapping = require('../internal/extension-mediatype-mapping');
const { getRandomFileNameWithExtension } = require('../internal/util/path-util');
const logger = require('./../internal/logger');
const DefaultConfig = require('../internal/config/dc-services-default-config.js');

/**
 * Supported source file formats for {@link ExtractPdfOperation} is .pdf.
 * @enum
 * @readonly
 * @memberOf ExtractPdfOperation
 */
const SupportedSourceFormat = Object.freeze({
  /**
   * Represents "application/pdf" media type
   * @type {string}
   */
  pdf: ExtensionMediaTypeMapping.pdf.mediaType
});

/**
 * Fixed configuration of the operation: the extension used for the generated
 * target file name and the list of accepted source media types.
 */
const allowedConfiguration = Object.freeze({
  targetFileExtension: ExtensionMediaTypeMapping.zip.extension,
  getSupportedMediaTypes() {
    return Object.values(SupportedSourceFormat);
  }
});

/**
 * An Operation that extracts pdf elements such as text, images, tables in a structured format from a PDF.
 *
 * Sample Usage:
 * <pre class="prettyprint">
 * <code>
 *  try {
 *    const credentials = PDFToolsSdk.Credentials
 *        .serviceAccountCredentialsBuilder()
 *        .fromFile("pdftools-api-credentials.json")
 *        .build();
 *
 *    const clientContext = PDFToolsSdk.ExecutionContext
 *        .create(credentials),
 *      extractPDFOperation = PDFToolsSdk.ExtractPDF.Operation
 *        .createNew(),
 *
 *      input = PDFToolsSdk.FileRef.createFromLocalFile(
 *        'test/resources/extractPdfInput.pdf',
 *        PDFToolsSdk.ExtractPDF.SupportedSourceFormat.pdf
 *      );
 *
 *    extractPDFOperation.setInput(input);
 *
 *    extractPDFOperation.addElementToExtract(PDFToolsSdk.PDFElementType.TEXT);
 *    extractPDFOperation.addElementToExtract(PDFToolsSdk.PDFElementType.TABLES);
 *
 *    extractPDFOperation.addElementToExtractRenditions(PDFToolsSdk.PDFElementType.FIGURES);
 *    extractPDFOperation.addElementToExtractRenditions(PDFToolsSdk.PDFElementType.TABLES);
 *
 *    extractPDFOperation.execute(clientContext)
 *      .then(result => result.saveAsFile('output/extractPdf.zip'))
 *      .catch(err => console.log(err));
 *  } catch (err) {
 *    throw err;
 *  }
 * </code>
 * </pre>
 *
 */
class ExtractPdfOperation {
  /**
   * @hideconstructor
   */
  constructor() {
    this.sourceFileRef = null;
    this.isInvoked = false;

    /**
     * List of pdf element types to be extracted in a structured format from input file
     */
    this.elementsToExtract = [];

    /**
     * List of pdf element types whose renditions needs to be extracted from input file
     */
    this.elementsToExtractRenditions = [];

    /**
     * export table in specified format - currently csv supported
     */
    this.tableOutFormat = null;

    /**
     * Boolean specifying whether to add character level bounding boxes to output json
     */
    this.getCharInfo = false;

    // No further properties may be added to an instance after construction.
    Object.preventExtensions(this);
  }

  /**
   * Constructs a {@link ExtractPdfOperation} instance.
   * @returns {ExtractPdfOperation} A new ExtractPdfOperation instance.
   */
  static createNew() {
    return new ExtractPdfOperation();
  }

  static get SupportedSourceFormat() {
    return SupportedSourceFormat;
  }

  /**
   * Sets an input file.
   * @param {!FileRef} sourceFileRef - An input file.
   */
  setInput(sourceFileRef) {
    this.sourceFileRef = sourceFileRef;
  }

  /**
   * @summary Adds a pdf element type for extracting structured information.
   * @param {PDFElementType} element - PDFElementType to be extracted
   * @returns {ExtractPdfOperation} current ExtractPdfOperation instance, to allow chaining
   */
  addElementToExtract(element) {
    this.elementsToExtract.push(element);
    return this;
  }

  /**
   * @summary Adds a pdf element type for extracting rendition.
   * @param {PDFElementType} element - PDFElementType whose renditions have to be extracted
   * @returns {ExtractPdfOperation} current ExtractPdfOperation instance, to allow chaining
   */
  addElementToExtractRenditions(element) {
    this.elementsToExtractRenditions.push(element);
    return this;
  }

  /**
   * @summary Sets the table structure format (currently csv only) for extracting structured information.
   * @param {TableStructureType} element - TableStructureType to be extracted
   * @returns {ExtractPdfOperation} current ExtractPdfOperation instance, to allow chaining
   */
  addTableStructureFormat(element) {
    this.tableOutFormat = element;
    return this;
  }

  /**
   * @summary Boolean specifying whether to add character level bounding boxes to output json
   * @param {Boolean} element - Set True to extract character level bounding boxes information
   * @returns {ExtractPdfOperation} current ExtractPdfOperation instance, to allow chaining
   */
  addCharInfo(element) {
    this.getCharInfo = element;
    return this;
  }

  /**
   * @summary Sets the pdf element types for extracting structured information.
   *
   * Note: the supplied element types REPLACE any element types added earlier
   * (including those added through {@link ExtractPdfOperation#addElementToExtract}).
   * @param {...PDFElementType} elements - PDFElementType values to be extracted
   * @returns {ExtractPdfOperation} current ExtractPdfOperation instance, to allow chaining
   */
  addElementsToExtract(...elements) {
    this.elementsToExtract = elements;
    return this;
  }

  /**
   * @summary Sets the pdf element types for extracting renditions.
   *
   * Note: the supplied element types REPLACE any element types added earlier
   * (including those added through {@link ExtractPdfOperation#addElementToExtractRenditions}).
   * @param {...PDFElementType} elements - PDFElementType values whose renditions have to be extracted
   * @returns {ExtractPdfOperation} current ExtractPdfOperation instance, to allow chaining
   */
  addElementsToExtractRenditions(...elements) {
    this.elementsToExtractRenditions = elements;
    return this;
  }

  /**
   * Executes this operation using the supplied context and returns a Promise which resolves to the operation result.
   *
   * The resulting file may be stored in the system temporary directory (per the os.tempdir(), symlinks are resolved
   * to the actual path).
   * See {@link FileRef} for how temporary resources are cleaned up.
   *
   * Validation failures (operation already invoked, no input set) are reported
   * as a rejected Promise rather than a synchronous throw.
   *
   * @param {!ExecutionContext} context - The context in which the operation will be executed.
   * @returns {Promise<T>} A promise which resolves to the operation result.
   * @throws {ServiceApiError} if an API call results in an error response.
   * @throws {ServiceUsageError} if service usage limits have been reached or credentials quota has been exhausted.
   */
  execute(context) {
    try {
      this.validate();
    } catch (err) {
      return Promise.reject(err);
    }
    logger.info('All validations successfully done. Beginning Extract Pdf operation execution');

    const targetFileName = getRandomFileNameWithExtension(allowedConfiguration.targetFileExtension);
    const operationMessage = new OperationMessage(
      this.sourceFileRef,
      targetFileName,
      DefaultConfig.operationName.extractPdf
    );
    const extractPDFService = new ExtractPDFService();

    // From here on the instance counts as used; validate() rejects a second run.
    this.isInvoked = true;

    // 'renditionsToExtract' is only sent when at least one rendition type was
    // requested. Keys are assigned in the same order as before.
    const options = { elementsToExtract: this.elementsToExtract };
    if (this.elementsToExtractRenditions.length > 0) {
      options.renditionsToExtract = this.elementsToExtractRenditions;
    }
    options.getCharBounds = this.getCharInfo;
    options.tableOutputFormat = this.tableOutFormat;
    operationMessage.setOptions(options);

    // The service promise is returned as-is; re-wrapping its result/error in
    // Promise.resolve/Promise.reject adds nothing.
    return extractPDFService.perform(context, operationMessage);
  }

  /**
   * Checks that this operation is in a state in which it can be executed.
   * @throws {Error} if the operation was already invoked, or if no input was set.
   */
  validate() {
    if (this.isInvoked) {
      throw new Error('Operation instance must only be invoked once');
    }
    if (!this.sourceFileRef) {
      throw new Error('No input was set for operation');
    }
  }
}

Object.freeze(ExtractPdfOperation);
module.exports = ExtractPdfOperation;