UNPKG

@cityssm/bill-data-extract

Version:

Extracts data from scanned bill documents into usable details.

165 lines (136 loc) 4.67 kB
import Debug from 'debug'
import type Tesseract from 'tesseract.js'
import { createWorker } from 'tesseract.js'

import type { DataExtractOptions } from './extracts/types.js'
import { deleteTempFiles } from './utilities/fileUtilities.js'
import {
  imageFilePathsToImageFiles,
  pdfOrImageFilePathsToImageFilePaths
} from './utilities/imageUtilities.js'
import { percentageToCoordinate } from './utilities/mathUtilities.js'

const debug = Debug('bill-data-extract:index')

/**
 * Builds the key used to cache OCR results for one region of one image.
 * Two data fields that target the same image, page, and pixel bounds
 * produce the same key, so the region is only recognized once.
 * @param {object} options - The image path, page number, and pixel bounds of the region.
 * @returns {string} - The cache key, with each part separated by "::".
 */
function getOCRCacheKey(options: {
  imagePath: string
  pageNumber: number
  xTop: number
  yTop: number
  xBottom: number
  yBottom: number
}): string {
  return [
    options.imagePath,
    options.pageNumber,
    options.xTop,
    options.yTop,
    options.xBottom,
    options.yBottom
  ].join('::')
}

// eslint-disable-next-line no-secrets/no-secrets
/**
 * Extracts data from a series of files.
 *
 * Each entry of `extractOptions` describes one data field: the page it is on
 * (defaults to page 1), the region of the page to read (defaults to the whole
 * page), and an optional processing function applied to the OCR result.
 * Without a processing function, the trimmed recognized text is used.
 *
 * Temporary files created while converting the inputs to images are deleted
 * before this function settles, whether it resolves or rejects.
 * @param {string[]} pdfOrImageFilePaths - A list of paths to PDFs or images that represent a single bill.
 * @param {DataExtractOptions} extractOptions - Options describing where the extract should occur.
 * @returns {Promise<object>} - Extracted data.
 * @throws {RangeError} - When a data field references a page that does not exist.
 */
export async function extractData<T extends Record<string, unknown>>(
  pdfOrImageFilePaths: string[],
  extractOptions: DataExtractOptions<T>
): Promise<T> {
  const { imageFilePaths, tempFilePaths } =
    await pdfOrImageFilePathsToImageFilePaths(pdfOrImageFilePaths)

  const result: Record<string, unknown> = {}

  try {
    /*
     * Populate image dimensions
     */

    const imageFiles = imageFilePathsToImageFiles(imageFilePaths)

    /*
     * Extract data
     */

    // Keyed by image + region, so fields sharing a region reuse one OCR pass.
    const ocrCache = new Map<string, Tesseract.RecognizeResult>()

    const worker = await createWorker()

    try {
      for (const [dataFieldName, dataFieldOptions] of Object.entries(
        extractOptions
      )) {
        const pageNumber = dataFieldOptions.pageNumber ?? 1

        // Page numbers are 1-based; the image list is 0-based.
        const image = imageFiles[pageNumber - 1]

        if (image === undefined) {
          throw new RangeError(
            `Page ${pageNumber} requested for "${dataFieldName}" is not available (${imageFiles.length} page(s) found).`
          )
        }

        // Missing coordinates default to the full page (0% to 100%).
        const xTop = percentageToCoordinate(
          dataFieldOptions.topLeftCoordinate?.xPercentage ?? 0,
          image.width as number
        )

        const yTop = percentageToCoordinate(
          dataFieldOptions.topLeftCoordinate?.yPercentage ?? 0,
          image.height as number
        )

        const xBottom = percentageToCoordinate(
          dataFieldOptions.bottomRightCoordinate?.xPercentage ?? 100,
          image.width as number
        )

        const yBottom = percentageToCoordinate(
          dataFieldOptions.bottomRightCoordinate?.yPercentage ?? 100,
          image.height as number
        )

        // NOTE(review): the width is reduced by one pixel while the height is
        // not - confirm whether this asymmetry is intentional.
        const rectangle: Tesseract.Rectangle = {
          top: yTop,
          left: xTop,
          width: xBottom - xTop - 1,
          height: yBottom - yTop
        }

        debug(`Finding "${dataFieldName}"...`)

        const ocrCacheKey = getOCRCacheKey({
          imagePath: image.path,
          pageNumber,
          xTop,
          yTop,
          xBottom,
          yBottom
        })

        let recognizeResult = ocrCache.get(ocrCacheKey)

        if (recognizeResult === undefined) {
          recognizeResult = await worker.recognize(image.path, { rectangle })
          ocrCache.set(ocrCacheKey, recognizeResult)
        } else {
          debug(`OCR Cache Hit: ${ocrCacheKey}`)
        }

        debug(`Raw Text: ${recognizeResult.data.text}`)

        // The processing function receives the full recognize result,
        // not just the text.
        const fieldValue =
          dataFieldOptions.processingFunction === undefined
            ? recognizeResult.data.text.trim()
            : dataFieldOptions.processingFunction(recognizeResult)

        // eslint-disable-next-line security/detect-object-injection
        result[dataFieldName] = fieldValue

        debug(`${dataFieldName}: ${fieldValue}`)
      }
    } finally {
      await worker.terminate()
    }
  } finally {
    /*
     * Clean up temp files, even when extraction fails
     */

    await deleteTempFiles(tempFilePaths)
  }

  return result as T
}

/**
 * Extracts the full text for a given page.
 * @param {string} billPath - The bill path.
 * @param {number} pageNumber - The page number, defaults to 1.
 * @returns {Promise<string>} - The full text of the given page.
 */
export async function extractFullPageText(
  billPath: string,
  pageNumber: number = 1
): Promise<string> {
  // With no coordinates given, the single "text" field covers the whole page.
  const rawData = await extractData([billPath], {
    text: {
      pageNumber
    }
  })

  return rawData.text as string
}

export { getSuggestedExtractType } from './utilities/extractUtilities.js'

export type {
  DataExtractOptions,
  DataField,
  DataFieldCoordinate,
  BillData,
  GasBillData,
  ElectricityBillData,
  WaterBillData
} from './extracts/types.js'