@adobe/pdftools-extract-node-sdk
Version:
The Document Services PDF Tools Extract Node.js SDK provides APIs for extracting elements and renditions from PDF
347 lines (305 loc) • 12.4 kB
JavaScript
/*
* Copyright 2019 Adobe
* All Rights Reserved.
*
* NOTICE: Adobe permits you to use, modify, and distribute this file in
* accordance with the terms of the Adobe license agreement accompanying
* it. If you have received this file from a source other than Adobe,
* then your use, modification, or distribution of it requires the prior
* written permission of Adobe.
*/
const CPFRequestKeys = require('../cpf/cpf-service-request-key'),
logger = require('../logger'),
{ensureDirectoryExistence} = require('../util/file-util'),
fs = require('fs'),
FileRef = require('../../io/file-ref'),
DefaultConfig = require('./../config/dc-services-default-config'),
DefaultHeaders = require('../http/default-dc-request-options'),
temp = require('temp-dir'),
path = require('path'),
ServiceApiError = require('../../error/service-api-error'),
formidable = require('formidable'),
AdmZip = require('adm-zip'),
StringUtil = require('../util/string-util');
// Upper bound on how long status polling may keep going before it is abandoned: 10 minutes, in ms.
const timeLimit = 10 * 60 * 1000;
// Pause between two consecutive status polls: 1 second, in ms.
const pollingInterval = 1000;
// Settings read from the SDK default configuration.
const specialHttpErrorCodes = DefaultConfig.specialHttpErrorCodes;
const tempFolderName = DefaultConfig.tempFolderName;
// Name of the header that carries the `operation` value on create/predict calls.
const operationHeaderName = "x-dcsdk-ops-info";
/**
 * Makes a CPF API call and returns its response.
 *
 * The base request for `requestKey` is obtained from the context, the multipart payload and the
 * operation header (`operationHeaderName`) are attached to it, and the request is then called.
 *
 * @param {Object} context - exposes getBaseRequestFromRequestContext(requestKey), returning a promise of a request
 * @param {Object} multipartData - multipart payload handed to withMultipartData()
 * @param {*} requestKey - key identifying which base request to build
 * @param {string} operation - value sent in the operation header
 * @returns {Promise<*>} settles with whatever the request's call() settles with
 */
const callApiUtil = (context, multipartData, requestKey, operation) => {
    const sendRequest = (baseRequest) => {
        const preparedRequest = baseRequest
            .withMultipartData(multipartData)
            .withHeader(operationHeaderName, operation);
        logger.debug(`Uploading file with options ${JSON.stringify(preparedRequest.requestOptions)}`);
        return preparedRequest.call();
    };
    // Failures propagate to the caller unchanged through the returned promise.
    return context.getBaseRequestFromRequestContext(requestKey).then(sendRequest);
};
/**
 * Polls the CPF Status API until it answers with a status code other than 202.
 *
 * Every poll obtains a fresh CPF_STATUS base request from the context, sets its URI template to
 * `location` and calls it. While the answer is 202, the next poll is scheduled `pollingInterval`
 * ms later. Once a 202 arrives more than `timeLimit` ms after `startTime`, polling stops for good
 * and the returned promise rejects.
 *
 * @param {Object} context - exposes getBaseRequestFromRequestContext(requestKey), returning a promise of a request
 * @param {string} location - value assigned to the status request's uriTemplate
 * @param {number} startTime - epoch milliseconds at which polling started
 * @returns {Promise<Object>} resolves with the first response whose statusCode is not 202;
 *          rejects with a ServiceApiError on time out, or with any error raised by the request
 */
const cpfStatusPolling = (context, location, startTime) => context.getBaseRequestFromRequestContext(CPFRequestKeys.CPF_STATUS)
    .then(statusRequest => {
        statusRequest.options.requestConfig.uriTemplate = location;
        logger.debug(`Download file with options ${JSON.stringify(statusRequest.requestOptions)}`);
        return statusRequest.call();
    })
    .then(result => {
        if (result.statusCode !== 202) {
            return result;
        }
        logger.debug('polling for status');
        if (Date.now() - startTime > timeLimit) {
            logger.debug('Aborting conversion that is taking too long.');
            // Throwing ends the chain here, so no further poll is scheduled after a time out.
            throw new ServiceApiError(
                'Operation execution has timed out! Please find the last successful polling response',
                result.headers[DefaultHeaders.DC_REQUEST_ID_HEADER_KEY]
            );
        }
        // Wait one polling interval, then poll again; the recursive promise is simply chained.
        return new Promise(resolve => setTimeout(resolve, pollingInterval))
            .then(() => cpfStatusPolling(context, location, startTime));
    });
/**
 * Reads the stream of an input file completely into memory.
 *
 * @param {Object} file - object whose `input.asStream` yields an event-emitting stream
 * @returns {Promise<{data: Buffer}>} resolves on 'end' with all received chunks concatenated;
 *          rejects with the error passed to the stream's 'error' event
 */
const getFileData = (file) => new Promise((resolve, reject) => {
    const receivedChunks = [];
    const inputStream = file.input.asStream;
    inputStream.on('data', (chunk) => {
        receivedChunks.push(chunk);
    });
    inputStream.on('end', () => {
        resolve({data: Buffer.concat(receivedChunks)});
    });
    inputStream.on('error', (error) => {
        reject(error);
    });
});
/**
 * Reads every input file concurrently via getFileData.
 *
 * @param {Array<Object>} files - input files, each accepted by getFileData
 * @returns {Promise<Array<Object>>} resolves with one getFileData result per input file, in the
 *          same order as `files`; rejects with the first read error
 */
const processInputFiles = (files) => {
    const pendingReads = [];
    files.forEach((file) => {
        pendingReads.push(getFileData(file));
    });
    // Promise.all keeps the results in input order, whichever read finishes first.
    return Promise.all(pendingReads);
};
/**
 * Builds the formidable parser used for handling a multipart response.
 *
 * @param {string} fileName - file name used for a 'fileBegin' event until a file part has been seen
 * @param {string} filePath - prefix prepended to the file name to form each file's path
 * @returns {Object} the configured formidable instance
 */
const getMultipartResponseParser = (fileName, filePath) => {
    const parser = formidable();
    // Name applied by the 'fileBegin' handler; replaced whenever a file part comes through onPart.
    let currentFileName = fileName;

    parser.onPart = (part) => {
        // Giving a file part an explicit filename makes formidable treat it as a file, not a field.
        if (part.name.includes('file')) {
            currentFileName = part.name.slice(4);
            part.filename = currentFileName;
        }
        if (part.name.includes('json')) {
            part.name = 'structuredData';
        }
        parser.handlePart(part);
    };

    parser.on('fileBegin', (partName, file) => {
        // Redirect the file part so that it is written under filePath.
        file.path = filePath + currentFileName;
    });

    return parser;
};
/**
 * Collects the rendition file names of figure and table elements and rewrites them in place so
 * that they point into a "figures" or "tables" folder.
 *
 * An element is classified by the last '/'-separated segment of its lower-cased `Path`. Elements
 * without `filePaths`, or whose last segment starts with neither "figure" nor "table", are left
 * untouched.
 *
 * @param {Array<Object>} elements - elements carrying a `Path` and optionally `filePaths`; mutated
 * @param {Array<string>} figures - receives the original file names of figure renditions
 * @param {Array<string>} tables - receives the original file names of table renditions
 */
const frameExtractRenditionsOutput = (elements, figures, tables) => {
    const renditionKinds = [
        {prefix: 'figure', folder: "figures", collected: figures},
        {prefix: 'table', folder: "tables", collected: tables}
    ];
    for (const element of elements) {
        const identifier = element.Path.toLowerCase().split('/').pop();
        for (const {prefix, folder, collected} of renditionKinds) {
            if (!identifier.startsWith(prefix) || element.filePaths === undefined) {
                continue;
            }
            for (const [position, renditionName] of element.filePaths.entries()) {
                collected.push(renditionName);
                element.filePaths[position] = folder + path.sep + renditionName;
            }
        }
    }
};
/**
 * Assembles the extract output archive and writes it to `tempDir + fileName`.
 *
 * The structured data JSON is taken from `fields`, has its rendition paths rewritten when the
 * analyzer response lists any element renditions, and is written to
 * `tempDir + "structuredData.json"`. Each received rendition file whose name is a key of the
 * figure or table map is added to the archive under "figures" or "tables"; the structured data
 * file is added last.
 *
 * @param {Object} files - parsed file parts keyed by `cid:fileoutpart<N>`, each with `path` and `name`
 * @param {Object} fields - parsed fields; reads the JSON strings `contentAnalyzerResponse` and `structuredData`
 * @param {string} tempDir - directory prefix (used as-is, by concatenation) for the generated files
 * @param {string} fileName - name of the zip archive to write
 */
const frameExtractSpecificOutput = (files, fields, tempDir, fileName) => {
    const figureFileNames = [];
    const tableFileNames = [];
    const archive = new AdmZip();
    const analyzerResponse = JSON.parse(fields['contentAnalyzerResponse']);
    const renditions = analyzerResponse['cpf:outputs']['elementsRenditions'];
    const structuredData = JSON.parse(fields['structuredData']);
    if (renditions.length > 0) {
        frameExtractRenditionsOutput(structuredData.elements, figureFileNames, tableFileNames);
    }

    // Map keys are the file names cut at their first '.'.
    // NOTE(review): presumably buildMap pairs each key with the full file name - confirm in StringUtil.
    const withoutExtension = (name) => name.substr(0, name.indexOf('.'));
    const figureMap = StringUtil.buildMap(figureFileNames.map(name => withoutExtension(name)), figureFileNames);
    const tableMap = StringUtil.buildMap(tableFileNames.map(name => withoutExtension(name)), tableFileNames);

    const structuredDataPath = tempDir + "structuredData.json";
    fs.writeFileSync(structuredDataPath, JSON.stringify(structuredData));

    const figureKeys = Array.from(figureMap.keys());
    const tableKeys = Array.from(tableMap.keys());
    for (let renditionIndex = 0; renditionIndex < renditions.length; renditionIndex++) {
        const renditionPart = files[`cid:fileoutpart` + renditionIndex];
        const localPath = renditionPart.path;
        const partName = renditionPart.name;
        if (figureKeys.includes(partName)) {
            archive.addLocalFile(localPath, "figures", figureMap.get(partName));
        } else if (tableKeys.includes(partName)) {
            archive.addLocalFile(localPath, "tables", tableMap.get(partName));
        }
    }

    archive.addLocalFile(structuredDataPath);
    archive.writeZip(tempDir + fileName);
};
/**
 * Parses the multipart success response of the status API and stores its content in `fileName`
 * inside the SDK temp folder (`<temp>/<tempFolderName>/`).
 *
 * @param {Object} result - status API response; its `headers` are read for the request id and the
 *        object itself is handed to the multipart parser
 * @param {string} fileName - name of the output file created in the temp folder
 * @returns {Promise<Object>} resolves with the FileRef created for `tempDir + fileName`; rejects
 *          with a ServiceApiError when parsing fails or the response carries an error status, and
 *          with the underlying error when building the output fails
 */
const parseSuccessResult = (result, fileName) => {
    return new Promise((resolve, reject) => {
        const tempDir = `${temp + path.sep}${tempFolderName}${path.sep}`,
            multipartResponseParser = getMultipartResponseParser(fileName, tempDir);
        ensureDirectoryExistence(tempDir);
        const requestId = result.headers[DefaultHeaders.DC_REQUEST_ID_HEADER_KEY] || result.headers[DefaultHeaders.SESSION_TOKEN_REQUEST_ID_HEADER_KEY];
        // Parse multipart response data
        multipartResponseParser.parse(result, (err, fields, files) => {
            // Check for error while parsing; fall back to the header request id if the body has none
            if (err) {
                return reject(new ServiceApiError(
                    "Response Parser Failure: " + err.message,
                    (fields && fields.request_id) || requestId
                ));
            }
            // The callback runs outside the promise executor, so anything thrown below has to be
            // converted into a rejection explicitly; otherwise the promise would never settle.
            try {
                // Check for contentAnalyzerResponse errors
                if (fields.statuses) {
                    const contentAnalyzerResponse = fields.statuses[0].invocations[0];
                    if (contentAnalyzerResponse.status >= 400) {
                        return reject(new ServiceApiError(
                            contentAnalyzerResponse.message,
                            fields.request_id || requestId,
                            parseInt(contentAnalyzerResponse.status, 10)
                        ));
                    }
                }
                /* For PAPI specific errors, as the response json is different, e.g.
                 "cpf:status": {
                 "completed": true,
                 "type": "",
                 "title": "Deleting all pages in an input document is not allowed. The page range specified include all the pages.",
                 "status": 400,
                 "report": ""
                 }
                 */
                if (fields['cpf:status']) {
                    const cpfContentAnalyzerResponse = fields['cpf:status'];
                    // Only status codes of 400 or greater are treated as errors here.
                    if (cpfContentAnalyzerResponse.status >= 400) {
                        return reject(new ServiceApiError(
                            cpfContentAnalyzerResponse.title,
                            requestId,
                            parseInt(cpfContentAnalyzerResponse.status, 10)
                        ));
                    }
                }
                frameExtractSpecificOutput(files, fields, tempDir, fileName);
                return resolve(FileRef.createFromLocalFile(tempDir + fileName));
            } catch (error) {
                return reject(error);
            }
        });
    });
};
/**
 * Builds the multipart form payload for a CPF request.
 *
 * @param {Array<{data: *}>} fileDataList - file contents in upload order
 * @param {*} contentAnalyzerRequest - stored under the `contentAnalyzerRequests` key
 * @returns {Object} payload holding `contentAnalyzerRequests` plus `file0`, `file1`, ... (one per
 *          entry of `fileDataList`, holding that entry's `data`)
 */
const getMultipartFormData = (fileDataList, contentAnalyzerRequest) => {
    const formData = {contentAnalyzerRequests: contentAnalyzerRequest};
    let partNumber = 0;
    fileDataList.forEach(({data}) => {
        formData[`file${partNumber}`] = data;
        partNumber += 1;
    });
    return formData;
};
/**
 * Shared implementation of the create and predict calls: reads the input files, builds the
 * multipart payload, issues the request and interprets the answer.
 *
 * @param {Object} context - request context passed through to callApiUtil
 * @param {Array<Object>} files - input files passed to processInputFiles
 * @param {*} contentAnalyzerRequest - request description placed in the multipart payload
 * @param {string} operation - operation name passed through to callApiUtil
 * @param {*} requestKey - CPF request key identifying the API to call
 * @param {function(Object): *} describeFailure - picks the error message out of the response content
 * @returns {Promise<string>} resolves with the `location` header of a 202 response; rejects with
 *          a ServiceApiError for any other status
 */
const submitCpfRequest = (context, files, contentAnalyzerRequest, operation, requestKey, describeFailure) =>
    // Running processInputFiles inside the executor turns a synchronous throw into a rejection.
    new Promise(resolve => resolve(processInputFiles(files)))
        .then(fileDataList => callApiUtil(
            context,
            getMultipartFormData(fileDataList, contentAnalyzerRequest),
            requestKey,
            operation
        ))
        .then(result => {
            if (result.status === 202) {
                return result.headers.location;
            }
            // Handling of the response after retry, e.g. cases like 401 from sensei-core.
            // A missing content object must not hide the service error behind a TypeError.
            throw new ServiceApiError(
                describeFailure(result.content || {}),
                result.headers[DefaultHeaders.DC_REQUEST_ID_HEADER_KEY],
                result.status
            );
        });

const CPFApi = {
    /**
     * Submits a CPF create request.
     *
     * @returns {Promise<string>} resolves with the `location` header of the 202 response; rejects
     *          with a ServiceApiError whose message is the first available of `content.message`,
     *          `content.error.message` and `content['cpf:status'].title`
     */
    cpfCreateApi(context, files, contentAnalyzerRequest, operation) {
        return submitCpfRequest(context, files, contentAnalyzerRequest, operation, CPFRequestKeys.CPF_CREATE,
            content => content.message
                || (content.error && content.error.message)
                || (content['cpf:status'] && content['cpf:status'].title));
    },

    /**
     * Submits a CPF predict request.
     *
     * @returns {Promise<string>} resolves with the `location` header of the 202 response; rejects
     *          with a ServiceApiError whose message is `content.message` or `content.title`
     */
    cpfPredictApi(context, files, contentAnalyzerRequest, operation) {
        return submitCpfRequest(context, files, contentAnalyzerRequest, operation, CPFRequestKeys.CPF_PREDICT,
            content => content.message || content.title);
    },

    /**
     * Polls the status API for `location` and, on a 200 response, stores the result in `fileName`.
     *
     * @returns {Promise<Object>} resolves with the outcome of parseSuccessResult; rejects with a
     *          ServiceApiError for any other final status code, using the configured special
     *          message for that code when there is one
     */
    cpfStatusApi(context, location, fileName) {
        const startTime = new Date().getTime();
        const statusPollingPromise = cpfStatusPolling(context, location, startTime);
        logger.info("Transaction ID : " + StringUtil.getRequestIdFromLocation(location));
        return statusPollingPromise.then(result => {
            if (result.statusCode === 200) {
                return parseSuccessResult(result, fileName);
            }
            // NOTE(review): the polled response is read via `statusCode` and may carry no
            // `content` at all - confirm; the guard keeps such a response from raising a TypeError.
            const errorMessage = specialHttpErrorCodes[result.statusCode]
                || (result.content && result.content.message);
            throw new ServiceApiError(
                errorMessage,
                result.headers[DefaultHeaders.DC_REQUEST_ID_HEADER_KEY],
                result.statusCode
            );
        });
    }
};
module.exports = CPFApi;