wdio-ocr-service

import { execSync } from 'child_process'
import { createWorker, OEM, PSM } from 'tesseract.js'
// @ts-ignore
import { recognize } from 'node-tesseract-ocr'
import { parseString } from 'xml2js'
import { GetOcrData, Line, Words } from '../typings/types'
import { parseAttributeString } from './index'

/**
 * Checks whether a Tesseract binary can be executed on this machine by
 * running `<binary> --version` through the shell.
 *
 * @param tesseractName Name or path of the binary; falls back to `tesseract`
 * when empty.
 * @returns `true` when the command exits without throwing, `false` otherwise.
 */
export function isTesseractAvailable(tesseractName: string = ''): boolean {
    const binary = tesseractName || 'tesseract'
    const command = [binary, '--version'].join(' ')

    try {
        execSync(command)
    } catch (ign) {
        // Any failure (missing binary, non-zero exit code) means "not available".
        return false
    }

    return true
}

/** Input shared by both OCR back ends. */
interface GetOcrDataOptions {
    /** Passed unchanged to the OCR engine as the image to recognize. */
    filePath: string;
    /** Language identifier handed to the OCR engine. */
    language: string;
}

/** Positional attributes read from the `$` object of an ALTO node. */
interface AltoBoxAttributes {
    HPOS?: string;
    VPOS?: string;
    WIDTH?: string;
    HEIGHT?: string;
}

/**
 * Converts ALTO position/size attributes into a left/top/right/bottom box.
 */
function altoBoundingBox({ HPOS, VPOS, WIDTH, HEIGHT }: AltoBoxAttributes) {
    const left = Number(HPOS)
    const top = Number(VPOS)

    return {
        left,
        top,
        right: left + Number(WIDTH),
        bottom: top + Number(HEIGHT),
    }
}

/**
 * Runs OCR with the `tesseract.js` worker and converts the returned hOCR
 * markup into word and line entries.
 *
 * @param options Image path and language to recognize.
 * @returns The recognized lines, words and the plain text reported by the worker.
 * @throws Error when recognition or parsing fails, or when no text blocks
 * were found. Every failure is wrapped in a message that names this function.
 */
export async function getNodeOcrData(options: GetOcrDataOptions): Promise<GetOcrData|Error> {
    try {
        const { filePath, language } = options
        const jsonSingleWords: Words[] = []
        const jsonWordStrings: Line[] = []
        let composedBlocks: any = []

        const worker = createWorker()

        try {
            await worker.load()
            await worker.loadLanguage(language)
            await worker.initialize(language)
            await worker.setParameters({
                tessedit_ocr_engine_mode: OEM.TESSERACT_LSTM_COMBINED,
                tessedit_pageseg_mode: PSM.AUTO,
                tessjs_create_tsv: '0',
                tessjs_create_box: '0',
                tessjs_create_unlv: '0',
                tessjs_create_osd: '0',
            })

            const { data: { text, hocr } } = await worker.recognize(filePath)

            // @ts-ignore
            parseString(hocr, (error: Error, data: any) => {
                if (error) {
                    throw Error(`An error happened when parsing the getNodeOcrData, see: ${error}`)
                }

                composedBlocks = data.div.div
            })

            if (!composedBlocks || composedBlocks.length === 0){
                throw Error('No text was found for the OCR, please verify the stored image.')
            }

            // Walk the parsed tree once and collect both the single words and
            // the lines they belong to.
            for (const { p: paragraphs } of composedBlocks) {
                for (const { span: textLines } of paragraphs) {
                    for (const { $: { title: lineTitle }, span: wordNodes } of textLines) {
                        // The leading '; ' keeps the split output in the shape
                        // that was always handed to parseAttributeString.
                        const { bbox: lineBbox } = parseAttributeString(`; ${lineTitle}`.split('; '))
                        let lineText = ''

                        for (const { _: wordText, $: { title: wordTitle } } of wordNodes) {
                            lineText = `${lineText} ${wordText || ''}`.trim()

                            // Nodes without text are not reported as words.
                            if (!wordText) {
                                continue
                            }

                            const { bbox, wc } = parseAttributeString(`; ${wordTitle}`.split('; '))

                            jsonSingleWords.push({
                                text: wordText,
                                bbox,
                                wc,
                            })
                        }

                        // Lines that end up without any text are dropped.
                        if (lineText !== '') {
                            jsonWordStrings.push({
                                text: lineText,
                                bbox: lineBbox,
                            })
                        }
                    }
                }
            }

            return {
                lines: jsonWordStrings,
                words: jsonSingleWords,
                text,
            }
        } finally {
            // Always shut the worker down, also when recognition or parsing
            // failed; otherwise the worker would be left running.
            await worker.terminate()
        }
    } catch (error) {
        throw Error(`An error happened when parsing the getNodeOcrData, see: ${error}`)
    }
}

/**
 * Runs OCR through `node-tesseract-ocr` and converts the returned ALTO XML
 * into word and line entries.
 *
 * @param options Image path and language to recognize.
 * @returns The recognized lines, words and the text found on the ALTO
 * `Layout` node (an empty string when that node carries no text).
 * @throws Error when recognition or parsing fails, or when no composed blocks
 * were found. Every failure is wrapped in a message that names this function.
 */
export async function getSystemOcrData(options: GetOcrDataOptions): Promise<GetOcrData|Error> {
    try {
        const { filePath, language } = options
        const jsonSingleWords: Words[] = []
        const jsonWordStrings: Line[] = []
        let composedBlocks: any = []
        let text: string = ''

        const result = await recognize(filePath, {
            lang: language,
            oem: 1,
            // https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc
            psm: 3,
            presets: ['txt', 'alto'],
        })

        parseString(result, (error: Error, data) => {
            if (error) {
                throw Error(`An error happened when parsing the getSystemOcrData, see: ${error}`)
            }

            text = data.alto.Layout[0]._ || text
            composedBlocks = data.alto.Layout[0].Page[0].PrintSpace[0].ComposedBlock
        })

        if (!composedBlocks || composedBlocks.length === 0){
            throw Error('No text was found for the OCR, please verify the stored image.')
        }

        // Walk the parsed tree once and collect both the single words and the
        // lines they belong to.
        for (const { TextBlock: textBlocks } of composedBlocks) {
            for (const { TextLine: textLines } of textBlocks) {
                for (const { $: lineAttributes, String: wordNodes } of textLines) {
                    const lineBbox = altoBoundingBox(lineAttributes)
                    let lineText = ''

                    for (const { $: wordAttributes } of wordNodes) {
                        const { CONTENT, WC } = wordAttributes

                        jsonSingleWords.push({
                            text: CONTENT || '',
                            bbox: altoBoundingBox(wordAttributes),
                            wc: Number(WC),
                        })

                        lineText = `${lineText} ${CONTENT || ''}`.trim()
                    }

                    // Lines that end up without any text are dropped.
                    if (lineText !== '') {
                        jsonWordStrings.push({
                            text: lineText,
                            bbox: lineBbox,
                        })
                    }
                }
            }
        }

        return {
            lines: jsonWordStrings,
            words: jsonSingleWords,
            text,
        }
    } catch (error) {
        throw Error(`An error happened when parsing the getSystemOcrData, see: ${error}`)
    }
}