@briancullen/aws-textract-parser
Version:
Library for converting AWS Textract responses into a more usable structure.
141 lines (140 loc) • 4.94 kB
TypeScript
import { Geometry } from './model/Geometry';
/**
* Interface that describes the properties implemented
* by all blocks with the exception of the document block type.
*
* It is extended by the Word, Line and Page block interfaces.
*/
export interface BaseBlockProperties {
/** Unique id used to identify a block */
readonly id: string;
/** Location information identifying where the block can be found */
readonly geometry: Geometry;
}
/**
* Interface that is implemented by blocks that include
* recognised text.
*
* Of the block interfaces declared here, only the Word and Line
* blocks extend it.
*/
export interface TextProvider {
/** The text recognised for this block */
readonly text: string;
/** The confidence (as a percentage) that the text has
* been identified correctly
*/
readonly confidence: number;
}
/**
* Interface implemented by all nodes that make up the parsed tree.
* It exposes navigation in both directions: up to the parent node
* and down to the child nodes.
* @typeparam P type of the parent of this node
* @typeparam C type of the children of this node
*/
export interface TreeNode<P extends Block, C extends Block> {
/**
* Gets the parent node of this node
* @returns the parent of this node or undefined if this is the root node
*/
parent(): P | undefined;
/**
* Gets the children of this node
* @returns an array of children or an empty array if this is a leaf node
*/
children(): C[];
/**
* Determines whether or not the specified node is a child of this node.
* @param item the item to look for
* @returns true if the item is a child of this node, false otherwise
*/
hasChild(item: C): boolean;
}
/**
* Interface describing the information made available about the document
* that has been processed. Currently only one piece of information,
* the page count, is provided.
*
* It is exposed through the `metadata` property of the Document block.
*/
export interface DocumentMetadata {
/** Number of pages in the processed document */
readonly pages: number;
}
/**
* Interface that provides access to information about Word blocks.
* If present, Word blocks are always the leaves of the tree, that is
* they never have any child nodes.
*
* In the current implementation the parent of a Word block is
* always a Line block.
*
* Note that the child type parameter is declared as the general
* Block union even though Word blocks are documented as leaves.
*/
export interface WordBlock extends BaseBlockProperties, TextProvider, TreeNode<LineBlock, Block> {
/**
* Constant indicating the type of Block. Can be used for type discrimination,
* see [[BlockType]] for more details.
*/
readonly blockType: BlockType.Word;
}
/**
* Interface that provides access to information about Line blocks.
* Line blocks are always children of the Page block and represent a
* single line of text in the processed document.
*
* The line is further split into words, which are made into Word
* blocks and added as children of the Line block.
*/
export interface LineBlock extends BaseBlockProperties, TextProvider, TreeNode<PageBlock, WordBlock> {
/**
* Constant indicating the type of Block. Can be used for type discrimination,
* see [[BlockType]] for more details.
*/
readonly blockType: BlockType.Line;
}
/**
* Interface that provides access to information about Page blocks.
* Page blocks act as the root of all information processed from
* a single page in the document. The parent of the Page block is
* always a Document block.
*
* Pages are broken down into a number of lines. The Line blocks
* that are created to represent these lines form the children
* of a Page block.
*
* Unlike Line and Word blocks, a Page block does not extend
* TextProvider, so it declares no text or confidence of its own.
*/
export interface PageBlock extends BaseBlockProperties, TreeNode<Document, LineBlock> {
/**
* Constant indicating the type of Block. Can be used for type discrimination,
* see [[BlockType]] for more details.
*/
readonly blockType: BlockType.Page;
}
/**
* Interface that provides access to information about Document blocks.
* The Document block always forms the root of the tree and there is
* only one Document block per Textract response.
*
* The Document block will have one Page block child for each page
* in the processed document.
*
* Unlike the other block types it does not extend BaseBlockProperties,
* so it declares no id or geometry. Its parent type parameter is the
* general Block union, although as the root of the tree it is
* documented as having no parent.
*/
export interface Document extends TreeNode<Block, PageBlock> {
/**
* Constant indicating the type of Block. Can be used for type discrimination,
* see [[BlockType]] for more details.
*/
readonly blockType: BlockType.Document;
/** Provides metadata regarding the processed document */
readonly metadata: DocumentMetadata;
}
/**
* Union type for all the different possible block types.
*
* Each member declares a `blockType` property with a distinct
* [[BlockType]] value, so that property can be used to narrow
* a Block to its specific interface.
*/
export declare type Block = Document | PageBlock | LineBlock | WordBlock;
/**
* Type alias used as shorthand for an array of blocks of a particular type.
* @typeparam T the type of block to be stored in the array
*/
export declare type Blocks<T extends Block> = T[];
/**
* Enum used to identify the different types of block that can
* be in the returned tree.
*
* Each block interface declares its `blockType` property as exactly
* one of these members, which is what allows the property to be
* used for type discrimination.
*/
export declare enum BlockType {
/** Indicates a Word Block (string value "WORD") */
Word = "WORD",
/** Indicates a Line Block (string value "LINE") */
Line = "LINE",
/** Indicates a Page Block (string value "PAGE") */
Page = "PAGE",
/** Indicates a Document Block (string value "DOCUMENT") */
Document = "DOCUMENT"
}