@briancullen/aws-textract-parser

Version:

Library for converting AWS Textract responses into a more usable structure.

github.com/briancullen/aws-textract-parser

98 lines (97 loc) • 4.54 kB

TypeScript

import { AWSError, Textract } from 'aws-sdk';
import { DocumentFactory } from './factory';
import { Document } from './types';
/**
 * Callback signature to be implemented by code wishing to use callbacks to process
 * the data returned by Textract.
 *
 * The callback receives either an error or the parsed {@link Document}; both
 * arguments are nullable.
 */
export declare type ParsedDetectTextCallback = (err: AWSError | null, data: Document | null) => void;
/**
 * Callback signature used by AWS to return data from Textract.
 *
 * This is the shape of the function returned by
 * {@link TextractParser.parseDetectTextCallback}.
 */
export declare type TextractDetectTextCallback = (err: AWSError | null, data: Textract.Types.DetectDocumentTextResponse | null) => void;
/**
 * Unified type to accept responses from either the synchronous or asynchronous detect text operations.
 *
 * Declared as the intersection of the `DetectDocumentTextResponse` and
 * `GetDocumentTextDetectionResponse` types from the AWS SDK.
 */
export declare type TextractDetectTextResponse = Textract.DetectDocumentTextResponse & Textract.GetDocumentTextDetectionResponse;
/**
 * This class provides methods to process the information returned by Textract into
 * a tree structure that is easier to work with.
 *
 * A single instance of this class is automatically created for you and provided as
 * the default export from this library.
 */
export declare class TextractParser {
    private readonly factory;
    /** @hidden */
    constructor(factory: DocumentFactory);
    /**
     * Method that parses the Textract response synchronously.
     *
     * For example, it can be used as part of processing the result of a promise
     * as shown below.
     *
     * ```typescript
     * textract.detectDocumentText(request).promise()
     *   .then(data => textractParser.parseDetectTextResponse(data))
     *   .then(parsedData => console.log(parsedData))
     *   .catch(err => console.log(err))
     * ```
     *
     * **NOTE**: If used to process a GetDocumentTextDetectionResponse then all data should be
     * contained within a single response. If a NextToken is detected on the response then null will
     * be returned. See [[parseGetTextDetection]] for a helper method which will aggregate the
     * responses from the GetDocumentTextDetection operation.
     *
     * @param response the response object returned from Textract
     * @returns Document that acts as the root node for the processed tree, or null if the
     * response is incomplete
     */
    parseDetectTextResponse(response: TextractDetectTextResponse): Document | null;
    /**
     * Method that acts as a proxy for the standard Textract callback.
     *
     * This proxy will process the data returned by Textract and call the provided callback
     * with the processed information. It can be invoked as shown, where myCallback is written
     * by the user of the library.
     *
     * ```typescript
     * textract.detectDocumentText(request, textractParser.parseDetectTextCallback(myCallback))
     * ```
     *
     * @param callback the callback to be invoked with the processed data or error
     * @returns callback function that can be used with the AWS Textract invocation
     */
    parseDetectTextCallback(callback: ParsedDetectTextCallback): TextractDetectTextCallback;
    /**
     * Method that retrieves the result of an asynchronous document text detection operation
     * (which may require multiple requests to AWS) and produces a tree of the results.
     *
     * An example of how to use this method is shown below.
     *
     * ```typescript
     * const jobId = 'your-job-id'
     * const client = new AWS.Textract()
     *
     * textractParser.parseGetTextDetection(client, jobId)
     *   .then(parsedData => console.log(parsedData))
     *   .catch(err => console.log(err))
     * ```
     *
     * If the specified Textract job is not marked as SUCCEEDED or the AWS operations fail
     * to return the results then the Promise will be rejected.
     *
     * **NOTE**: This method will try to retrieve all the results for the Textract job and
     * process them in memory. For extremely large documents, memory may become an issue.
     *
     * @param textract the AWS client to use for retrieving the Textract results
     * @param jobId the id of the Textract job for which we want to parse the results
     * @returns Promise for a document that acts as the root node for the processed tree
     */
    parseGetTextDetection(textract: Textract, jobId: string): Promise<Document>;
    /** @hidden */
    private getGetTextDetectionResponse;
}
export { Document, PageBlock, LineBlock, WordBlock, Block, Blocks, BlockType, DocumentMetadata } from './types';
export { Geometry, BoundaryBox, Polygon, Point } from './model/Geometry';
declare const _default: TextractParser;
export default _default;