@adobe/pdftools-extract-node-sdk
Version:
The Document Services PDF Tools Extract Node.js SDK provides APIs for extracting elements and renditions from PDF
257 lines (224 loc) • 7.68 kB
JavaScript
/*
* Copyright 2019 Adobe
* All Rights Reserved.
*
* NOTICE: Adobe permits you to use, modify, and distribute this file in
* accordance with the terms of the Adobe license agreement accompanying
* it. If you have received this file from a source other than Adobe,
* then your use, modification, or distribution of it requires the prior
* written permission of Adobe.
*/
const ExtractPDFService = require('../internal/api/extract-pdf-service'),
OperationMessage = require('../internal/cpf/operation-message'),
ExtensionMediaTypeMapping = require('../internal/extension-mediatype-mapping'),
{ getRandomFileNameWithExtension } = require('../internal/util/path-util'),
logger = require('./../internal/logger'),
DefaultConfig = require('../internal/config/dc-services-default-config.js');
/**
 *
 * Supported source file formats for {@link ExtractPdfOperation}. Only .pdf is supported.
 * @enum
 * @readonly
 * @memberOf ExtractPdfOperation
 *
 */
const SupportedSourceFormat = Object.freeze({
    /**
     * Represents "application/pdf" media type
     * @type {string}
     */
    pdf: ExtensionMediaTypeMapping.pdf.mediaType
});

/**
 * Fixed, read-only settings for the extract operation: the extension used for the
 * generated target file name and an accessor for the accepted input media types.
 */
const allowedConfiguration = Object.freeze({
    targetFileExtension: ExtensionMediaTypeMapping.zip.extension,

    /**
     * @returns {string[]} The media type values listed in SupportedSourceFormat.
     */
    getSupportedMediaTypes() {
        const mediaTypes = Object.values(SupportedSourceFormat);
        return mediaTypes;
    }
});
/**
 * An Operation that extracts pdf elements such as text, images, tables in a structured format from a PDF.
 *
 * The configuration methods (<code>addElementToExtract</code>, <code>addElementToExtractRenditions</code>,
 * <code>addTableStructureFormat</code>, <code>addCharInfo</code>, <code>addElementsToExtract</code>,
 * <code>addElementsToExtractRenditions</code>) return the operation instance, so calls may be chained.
 *
 * Sample Usage:
 * <pre class="prettyprint">
 * <code>
 *  try {
 *
 *    const credentials = PDFToolsSdk.Credentials
 *        .serviceAccountCredentialsBuilder()
 *        .fromFile("pdftools-api-credentials.json")
 *        .build();
 *
 *    const clientContext = PDFToolsSdk.ExecutionContext
 *        .create(credentials),
 *      extractPDFOperation = PDFToolsSdk.ExtractPDF.Operation
 *        .createNew(),
 *
 *      input = PDFToolsSdk.FileRef.createFromLocalFile(
 *        'test/resources/extractPdfInput.pdf',
 *        PDFToolsSdk.ExtractPDF.SupportedSourceFormat.pdf
 *      );
 *
 *    extractPDFOperation.setInput(input);
 *
 *    extractPDFOperation.addElementToExtract(PDFToolsSdk.PDFElementType.TEXT);
 *    extractPDFOperation.addElementToExtract(PDFToolsSdk.PDFElementType.TABLES);
 *
 *    extractPDFOperation.addElementToExtractRenditions(PDFToolsSdk.PDFElementType.FIGURES);
 *    extractPDFOperation.addElementToExtractRenditions(PDFToolsSdk.PDFElementType.TABLES);
 *
 *    extractPDFOperation.execute(clientContext)
 *        .then(result => result.saveAsFile('output/extractPdf.zip'))
 *        .catch(err => console.log(err));
 *
 *  }
 *
 *  catch (err) {
 *
 *    throw err;
 *
 *  }
 * </code>
 * </pre>
 *
 */
class ExtractPdfOperation {
    /**
     * @hideconstructor
     */
    constructor() {
        this.sourceFileRef = null;
        this.isInvoked = false;

        /**
         * List of pdf element types to be extracted in a structured format from input file
         */
        this.elementsToExtract = [];

        /**
         * List of pdf element types whose renditions need to be extracted from input file
         */
        this.elementsToExtractRenditions = [];

        /**
         * export table in specified format - currently csv supported
         */
        this.tableOutFormat = null;

        /**
         * Boolean specifying whether to add character level bounding boxes to output json
         */
        this.getCharInfo = false;

        // Guard against typos such as `op.elementToExtract = ...` silently creating new fields.
        Object.preventExtensions(this);
    }

    /**
     * Constructs a {@link ExtractPdfOperation} instance.
     * @returns {ExtractPdfOperation} A new ExtractPdfOperation instance.
     *
     */
    static createNew() {
        return new ExtractPdfOperation();
    }

    /**
     * The frozen enum of supported input media types.
     * @returns {Object} The SupportedSourceFormat enum.
     */
    static get SupportedSourceFormat() {
        return SupportedSourceFormat;
    }

    /**
     * Sets an input file.
     * @param {!FileRef} sourceFileRef - An input file.
     */
    setInput(sourceFileRef) {
        this.sourceFileRef = sourceFileRef;
    }

    /**
     * @summary Adds a pdf element type for extracting structured information.
     * @param {PDFElementType} element - PDFElementType to be extracted
     * @returns {ExtractPdfOperation} The current ExtractPdfOperation instance, to allow chaining.
     */
    addElementToExtract(element) {
        this.elementsToExtract.push(element);
        return this;
    }

    /**
     * @summary Adds a pdf element type for extracting rendition.
     * @param {PDFElementType} element - PDFElementType whose renditions have to be extracted
     * @returns {ExtractPdfOperation} The current ExtractPdfOperation instance, to allow chaining.
     */
    addElementToExtractRenditions(element) {
        this.elementsToExtractRenditions.push(element);
        return this;
    }

    /**
     * @summary Sets the table structure format (currently csv only) for extracting structured information.
     * @param {TableStructureType} element - TableStructureType to be extracted
     * @returns {ExtractPdfOperation} The current ExtractPdfOperation instance, to allow chaining.
     */
    addTableStructureFormat(element) {
        this.tableOutFormat = element;
        return this;
    }

    /**
     * @summary Boolean specifying whether to add character level bounding boxes to output json
     * @param {Boolean} element - Set true to extract character level bounding boxes information
     * @returns {ExtractPdfOperation} The current ExtractPdfOperation instance, to allow chaining.
     */
    addCharInfo(element) {
        this.getCharInfo = element;
        return this;
    }

    /**
     * @summary Sets the pdf element types for extracting structured information.
     * Note: the supplied elements replace any element types that were added earlier;
     * they are not appended to the existing list.
     * @param {...PDFElementType} elements - PDFElementType values to be extracted
     * @returns {ExtractPdfOperation} The current ExtractPdfOperation instance, to allow chaining.
     */
    addElementsToExtract(...elements) {
        this.elementsToExtract = elements;
        return this;
    }

    /**
     * @summary Sets the pdf element types for extracting renditions.
     * Note: the supplied elements replace any rendition element types that were added earlier;
     * they are not appended to the existing list.
     * @param {...PDFElementType} elements - PDFElementType values whose renditions have to be extracted
     * @returns {ExtractPdfOperation} The current ExtractPdfOperation instance, to allow chaining.
     */
    addElementsToExtractRenditions(...elements) {
        this.elementsToExtractRenditions = elements;
        return this;
    }

    /**
     * Executes this operation using the supplied context and returns a Promise which resolves to the operation result.
     *
     * The resulting file may be stored in the system temporary directory (per the os.tmpdir(), symlinks are resolved
     * to the actual path).
     * See {@link FileRef} for how temporary resources are cleaned up.
     *
     * An operation instance can be executed only once; a second call returns a rejected Promise.
     *
     * @param {!ExecutionContext} context - The context in which the operation will be executed.
     * @returns {Promise<T>} A promise which resolves to the operation result.
     * @throws {ServiceApiError} if an API call results in an error response.
     * @throws {ServiceUsageError} if service usage limits have been reached or credentials quota has been exhausted.
     */
    execute(context) {
        try {
            this.validate();
        } catch (err) {
            // Report validation failures through the returned Promise rather than a synchronous throw.
            return Promise.reject(err);
        }
        logger.info('All validations successfully done. Beginning Extract Pdf operation execution');

        const targetFileName = getRandomFileNameWithExtension(allowedConfiguration.targetFileExtension);
        const operationMessage = new OperationMessage(
            this.sourceFileRef,
            targetFileName,
            DefaultConfig.operationName.extractPdf
        );
        const extractPDFService = new ExtractPDFService();
        this.isInvoked = true;

        // Build the options once. 'renditionsToExtract' is only sent when at least one
        // rendition element type was requested; key order matches the original payload.
        const options = { 'elementsToExtract': this.elementsToExtract };
        if (this.elementsToExtractRenditions.length > 0) {
            options['renditionsToExtract'] = this.elementsToExtractRenditions;
        }
        options['getCharBounds'] = this.getCharInfo;
        options['tableOutputFormat'] = this.tableOutFormat;
        operationMessage.setOptions(options);

        return extractPDFService.perform(context, operationMessage);
    }

    /**
     * Checks that this operation instance has not been executed before.
     * @throws {Error} if the operation was already invoked.
     */
    validate() {
        if (this.isInvoked) {
            throw new Error('Operation instance must only be invoked once');
        }
    }
}
// Freeze the class (constructor) object itself so its own properties cannot be added,
// removed, or reassigned by consumers. Object.freeze is shallow.
Object.freeze(ExtractPdfOperation);
// The class is the module's sole export.
module.exports = ExtractPdfOperation;