UNPKG

@adobe/pdftools-extract-node-sdk

Version:

The Document Services PDF Tools Extract Node.js SDK provides APIs for extracting elements and renditions from PDF documents.

www.adobe.com/go/pdfextractapi_doc

347 lines (305 loc) • 12.4 kB

JavaScript

/*
 * Copyright 2019 Adobe
 * All Rights Reserved.
 *
 * NOTICE: Adobe permits you to use, modify, and distribute this file in
 * accordance with the terms of the Adobe license agreement accompanying
 * it. If you have received this file from a source other than Adobe,
 * then your use, modification, or distribution of it requires the prior
 * written permission of Adobe.
 */

const CPFRequestKeys = require('../cpf/cpf-service-request-key'),
	logger = require('../logger'),
	{ensureDirectoryExistence} = require('../util/file-util'),
	fs = require('fs'),
	FileRef = require('../../io/file-ref'),
	DefaultConfig = require('./../config/dc-services-default-config'),
	DefaultHeaders = require('../http/default-dc-request-options'),
	temp = require('temp-dir'),
	path = require('path'),
	ServiceApiError = require('../../error/service-api-error'),
	formidable = require('formidable'),
	AdmZip = require('adm-zip'),
	StringUtil = require('../util/string-util');

const timeLimit = 1000 * 60 * 10; // 10 minutes in ms
const pollingInterval = 1000; // 1 second in ms
const specialHttpErrorCodes = DefaultConfig.specialHttpErrorCodes,
	tempFolderName = DefaultConfig.tempFolderName,
	operationHeaderName = "x-dcsdk-ops-info";

/**
 * Makes a CPF API call and returns the promise of its response.
 *
 * @param context request context exposing getBaseRequestFromRequestContext(requestKey)
 * @param multipartData multipart form data attached to the request
 * @param requestKey key used to look up the base request
 * @param operation value sent in the "x-dcsdk-ops-info" header
 * @returns {Promise} settles with whatever httpRequest.call() settles with
 */
const callApiUtil = (context, multipartData, requestKey, operation) =>
	context.getBaseRequestFromRequestContext(requestKey)
		.then(baseRequest => {
			const httpRequest = baseRequest
				.withMultipartData(multipartData)
				.withHeader(operationHeaderName, operation);
			logger.debug(`Uploading file with options ${JSON.stringify(httpRequest.requestOptions)}`);
			return httpRequest.call();
		});

/**
 * Keeps polling the status API until a status code other than 202 is received.
 * Once more than 10 minutes (timeLimit) have elapsed since startTime and the
 * response is still 202, the returned promise rejects with a ServiceApiError
 * and polling stops.
 *
 * @param context request context exposing getBaseRequestFromRequestContext(requestKey)
 * @param location status URI, written into the request's uriTemplate
 * @param startTime epoch milliseconds at which polling began
 * @returns {Promise} resolves with the first non-202 response
 */
const cpfStatusPolling = (context, location, startTime) =>
	context.getBaseRequestFromRequestContext(CPFRequestKeys.CPF_STATUS)
		.then(statusRequest => {
			statusRequest.options.requestConfig.uriTemplate = location;
			logger.debug(`Download file with options ${JSON.stringify(statusRequest.requestOptions)}`);
			return statusRequest.call();
		})
		.then(result => {
			if (result.statusCode !== 202) {
				return result;
			}

			logger.debug('polling for status');
			if (Date.now() - startTime > timeLimit) {
				logger.debug('Aborting conversion that is taking too long.');
				// Throwing here (instead of rejecting and carrying on) guarantees that
				// no further poll is scheduled once the operation has timed out.
				throw new ServiceApiError(
					'Operation execution has timed out! Please find the last successful polling response',
					result.headers[DefaultHeaders.DC_REQUEST_ID_HEADER_KEY]
				);
			}

			// Wait one polling interval, then poll again and adopt that outcome.
			return new Promise((resolve, reject) => {
				setTimeout(() => {
					cpfStatusPolling(context, location, startTime).then(resolve, reject);
				}, pollingInterval);
			});
		});

/**
 * Reads the whole input stream of a file into memory.
 *
 * @param file object whose `input.asStream` yields a readable stream
 * @returns {Promise<{data: Buffer}>} resolves on stream 'end' with the concatenated chunks;
 * rejects on stream 'error'
 */
const getFileData = (file) => {
	return new Promise((resolve, reject) => {
		const fileStream = file.input.asStream;
		const bufferChunks = [];
		fileStream.on('data', chunk => {
			bufferChunks.push(chunk);
		});
		fileStream.on('end', () => {
			resolve({data: Buffer.concat(bufferChunks)});
		});
		fileStream.on('error', error => reject(error));
	});
};

/**
 * Reads every input file asynchronously.
 *
 * @param files collection of input files (iterated with forEach)
 * @returns {Promise<Array<{data: Buffer}>>} file data in the same order as `files`
 */
const processInputFiles = (files) => {
	const promises = [];
	files.forEach(file => {
		promises.push(getFileData(file));
	});
	// Promise.all keeps the results in input order, so no explicit index bookkeeping is needed.
	return Promise.all(promises);
};

/**
 * Returns the formidable object required for handling the multipart response.
 *
 * @param fileName initial name used for file parts; replaced by a name derived from
 * each part whose name contains 'file'
 * @param filePath directory prefix under which file parts are written
 */
const getMultipartResponseParser = (fileName, filePath) => {
	const multipartResponseParser = formidable();
	let currentFileName = fileName;

	multipartResponseParser.onPart = (part) => {
		// For the file part, explicitly add the fileName to let the formidable lib
		// treat it as file instead of field.
		if (part.name.includes('file')) {
			currentFileName = part.name.slice(4);
			part.filename = currentFileName;
		}
		if (part.name.includes('json')) {
			part.name = 'structuredData';
		}
		multipartResponseParser.handlePart(part);
	};

	multipartResponseParser.on('fileBegin', (filename, file) => {
		// Override the file part's file path with filePath
		file.path = filePath + currentFileName;
	});

	return multipartResponseParser;
};

/**
 * Records every rendition file name of `element` in `collected` and rewrites the
 * entry in place so that it is prefixed with `folderName` and the path separator.
 */
const prefixRenditionPaths = (element, folderName, collected) => {
	const fileNames = element.filePaths;
	for (let i = 0; i < fileNames.length; i++) {
		const originalName = fileNames[i];
		collected.push(originalName);
		fileNames[i] = folderName + path.sep + originalName;
	}
};

/**
 * Walks the structured-data elements and, for each element whose last Path segment
 * (lower-cased) starts with 'figure' or 'table' and that has filePaths, collects the
 * rendition file names into `figures` / `tables` and prefixes the stored paths with
 * "figures" / "tables".
 *
 * @param elements elements of the structured data JSON (mutated in place)
 * @param figures output array receiving figure rendition file names
 * @param tables output array receiving table rendition file names
 */
const frameExtractRenditionsOutput = (elements, figures, tables) => {
	for (let elementsIndex = 0; elementsIndex < elements.length; elementsIndex++) {
		const element = elements[elementsIndex];
		const identifier = element.Path.toLowerCase().split('/').pop();
		if (element.filePaths === undefined) {
			continue;
		}
		if (identifier.startsWith('figure')) {
			prefixRenditionPaths(element, "figures", figures);
		} else if (identifier.startsWith('table')) {
			prefixRenditionPaths(element, "tables", tables);
		}
	}
};

/**
 * Builds the extract output archive: writes structuredData.json into tempDir, adds each
 * rendition file under "figures" or "tables", adds structuredData.json, and writes the
 * zip to tempDir + fileName.
 *
 * @param files parsed multipart files, keyed by `cid:fileoutpart<index>`
 * @param fields parsed multipart fields; 'contentAnalyzerResponse' and 'structuredData'
 * are read as JSON strings
 * @param tempDir directory (with trailing separator) used for intermediate and output files
 * @param fileName name of the zip file to write
 */
const frameExtractSpecificOutput = (files, fields, tempDir, fileName) => {
	const figures = [];
	const tables = [];
	const zip = new AdmZip();

	const contentAnalyserResponse = JSON.parse(fields['contentAnalyzerResponse']);
	const elementRenditionsList = contentAnalyserResponse['cpf:outputs']['elementsRenditions'];
	const structuredDataJson = JSON.parse(fields['structuredData']);
	if (elementRenditionsList.length > 0) {
		frameExtractRenditionsOutput(structuredDataJson.elements, figures, tables);
	}

	// Map "name without extension" -> "full rendition file name".
	const stripExtension = x => x.substr(0, x.indexOf('.'));
	const figureMap = StringUtil.buildMap(figures.map(stripExtension), figures);
	const tableMap = StringUtil.buildMap(tables.map(stripExtension), tables);
	// Materialise the key sets once instead of rebuilding an array per rendition.
	const figureNames = new Set(Array.from(figureMap.keys()));
	const tableNames = new Set(Array.from(tableMap.keys()));

	const elementInfoPath = tempDir + "structuredData.json";
	fs.writeFileSync(elementInfoPath, JSON.stringify(structuredDataJson));

	for (let elementRenditionIndex = 0; elementRenditionIndex < elementRenditionsList.length; elementRenditionIndex++) {
		const part = files[`cid:fileoutpart` + elementRenditionIndex];
		const pathOfFile = part.path;
		const name = part.name;
		if (figureNames.has(name)) {
			zip.addLocalFile(pathOfFile, "figures", figureMap.get(name));
		} else if (tableNames.has(name)) {
			zip.addLocalFile(pathOfFile, "tables", tableMap.get(name));
		}
	}

	zip.addLocalFile(elementInfoPath);
	zip.writeZip(tempDir + fileName);
};

/**
 * Parses the success result of cpfStatusApi and puts the content into the specified
 * file (fileName) inside the SDK temp folder.
 *
 * @param result multipart HTTP response to parse
 * @param fileName name of the output file
 * @returns {Promise} resolves with FileRef.createFromLocalFile(tempDir + fileName);
 * rejects with a ServiceApiError for parser failures and for error statuses (>= 400)
 * reported in the response fields, or with the thrown error if building the output fails
 */
const parseSuccessResult = (result, fileName) => {
	return new Promise((resolve, reject) => {
		const tempDir = `${temp + path.sep}${tempFolderName}${path.sep}`,
			multipartResponseParser = getMultipartResponseParser(fileName, tempDir);
		ensureDirectoryExistence(tempDir);
		const requestId = result.headers[DefaultHeaders.DC_REQUEST_ID_HEADER_KEY]
			|| result.headers[DefaultHeaders.SESSION_TOKEN_REQUEST_ID_HEADER_KEY];

		// Parse multipart response data
		multipartResponseParser.parse(result, (err, fields, files) => {
			// Check for error while parsing
			if (err) {
				// `fields` may not be populated when parsing fails, so fall back to the
				// request id taken from the response headers.
				return reject(new ServiceApiError(
					"Response Parser Failure: " + err.message,
					(fields && fields.request_id) || requestId
				));
			}

			// Check for contentAnalyzerResponse errors
			if (fields.statuses) {
				const contentAnalyzerResponse = fields.statuses[0].invocations[0];
				if (contentAnalyzerResponse.status >= 400) {
					return reject(new ServiceApiError(
						contentAnalyzerResponse.message,
						fields.request_id,
						parseInt(contentAnalyzerResponse.status, 10)
					));
				}
			}

			/* For PAPI specific errors, as the response json is different. For e.g.
				"cpf:status": {
					"completed": true,
					"type": "",
					"title": "Deleting all pages in an input document is not allowed. The page range specified include all the pages.",
					"status": 400,
					"report": ""
				}
			*/
			if (fields['cpf:status']) {
				const cpfContentAnalyzerResponse = fields['cpf:status'];
				// As from engine we are only sending 400 or greater error codes.
				if (cpfContentAnalyzerResponse.status >= 400) {
					return reject(new ServiceApiError(
						cpfContentAnalyzerResponse.title,
						requestId,
						parseInt(cpfContentAnalyzerResponse.status, 10)
					));
				}
			}

			// Building the archive parses JSON and touches the file system synchronously;
			// convert any throw into a rejection so the promise always settles.
			try {
				frameExtractSpecificOutput(files, fields, tempDir, fileName);
				resolve(FileRef.createFromLocalFile(tempDir + fileName));
			} catch (error) {
				reject(error);
			}
		});
	});
};

/**
 * Builds the multipart form data: the content analyzer request plus one
 * `file<index>` entry per input file buffer.
 */
const getMultipartFormData = (fileDataList, contentAnalyzerRequest) => {
	const multipartData = {};
	multipartData['contentAnalyzerRequests'] = contentAnalyzerRequest;
	fileDataList.forEach((fileData, index) => {
		multipartData['file' + index] = fileData.data;
	});
	return multipartData;
};

/**
 * Shared implementation of the create/predict calls: reads the input files, posts them
 * with the content analyzer request, and resolves with the `location` header when the
 * service answers 202. Any other status rejects with a ServiceApiError whose message is
 * produced by `getErrorMessage(result.content)`.
 */
const submitOperation = (context, files, contentAnalyzerRequest, operation, requestKey, getErrorMessage) =>
	// The Promise constructor turns a synchronous throw from processInputFiles into a rejection.
	new Promise(resolve => resolve(processInputFiles(files)))
		.then(fileDataList => {
			const multipartData = getMultipartFormData(fileDataList, contentAnalyzerRequest);
			return callApiUtil(context, multipartData, requestKey, operation);
		})
		.then(result => {
			if (result.status === 202) {
				return result.headers.location;
			}
			// Handling of the response after retry, e.g. cases like 401 from sensei-core.
			throw new ServiceApiError(
				getErrorMessage(result.content || {}),
				result.headers[DefaultHeaders.DC_REQUEST_ID_HEADER_KEY],
				result.status
			);
		});

const CPFApi = {
	/**
	 * Submits a CPF create request.
	 * @returns {Promise<string>} the status location on a 202 response; rejects with
	 * a ServiceApiError otherwise
	 */
	cpfCreateApi(context, files, contentAnalyzerRequest, operation) {
		return submitOperation(context, files, contentAnalyzerRequest, operation, CPFRequestKeys.CPF_CREATE,
			// Error handling for authorization of the cpf create call: each candidate is
			// guarded so a missing intermediate object does not mask the next fallback.
			content => content.message
				|| (content.error && content.error.message)
				|| (content['cpf:status'] && content['cpf:status'].title));
	},

	/**
	 * Submits a CPF predict request.
	 * @returns {Promise<string>} the status location on a 202 response; rejects with
	 * a ServiceApiError otherwise
	 */
	cpfPredictApi(context, files, contentAnalyzerRequest, operation) {
		return submitOperation(context, files, contentAnalyzerRequest, operation, CPFRequestKeys.CPF_PREDICT,
			content => content.message || content.title);
	},

	/**
	 * Polls the status location until the operation finishes, then stores the result.
	 * @returns {Promise} resolves with the parsed success result on a 200 response;
	 * rejects with a ServiceApiError for any other final status
	 */
	cpfStatusApi(context, location, fileName) {
		const startTime = Date.now();
		const statusPollingPromise = cpfStatusPolling(context, location, startTime);
		logger.info("Transaction ID : " + StringUtil.getRequestIdFromLocation(location));
		return statusPollingPromise.then(result => {
			if (result.statusCode === 200) {
				return parseSuccessResult(result, fileName);
			}
			const errorMessage = (specialHttpErrorCodes[result.statusCode])
				? specialHttpErrorCodes[result.statusCode]
				: (result.content && result.content.message);
			throw new ServiceApiError(
				errorMessage,
				result.headers[DefaultHeaders.DC_REQUEST_ID_HEADER_KEY],
				result.statusCode
			);
		});
	}
};

module.exports = CPFApi;