UNPKG

tesseract.js

Version:

Pure Javascript Multilingual OCR

github.com/naptha/tesseract.js

naptha/tesseract.js

238 lines (224 loc) • 8.52 kB

JavaScript

/** * * Dump data to a big JSON tree * * @fileoverview dump data to JSON tree * @author Kevin Kwok <antimatter15@gmail.com> * @author Guillermo Webster <gui@mit.edu> * @author Jerome Wu <jeromewus@gmail.com> */ const arrayBufferToBase64 = require('./arrayBufferToBase64'); const imageType = require('../../constants/imageType'); /** * deindent * * The generated HOCR is excessively indented, so * we get rid of that indentation * * @name deindent * @function deindent string * @access public */ const deindent = (html) => { const lines = html.split('\n'); if (lines[0].substring(0, 2) === ' ') { for (let i = 0; i < lines.length; i += 1) { if (lines[i].substring(0, 2) === ' ') { lines[i] = lines[i].slice(2); } } } return lines.join('\n'); }; /** * dump * * @name dump * @function dump recognition result to a JSON object * @access public */ module.exports = (TessModule, api, output, options) => { const ri = api.GetIterator(); const { RIL_BLOCK, RIL_PARA, RIL_TEXTLINE, RIL_WORD, RIL_SYMBOL, } = TessModule; const blocks = []; let block; let para; let textline; let word; let symbol; const enumToString = (value, prefix) => ( Object.keys(TessModule) .filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value)) .map((e) => e.slice(prefix.length + 1))[0] ); const getImage = (type) => { api.WriteImage(type, '/image.png'); const pngBuffer = TessModule.FS.readFile('/image.png'); const pngStr = `data:image/png;base64,${arrayBufferToBase64(pngBuffer.buffer)}`; TessModule.FS.unlink('/image.png'); return pngStr; }; const getPDFInternal = (title, textonly) => { const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly); pdfRenderer.BeginDocument(title); pdfRenderer.AddImage(api); pdfRenderer.EndDocument(); TessModule._free(pdfRenderer); return TessModule.FS.readFile('/tesseract-ocr.pdf'); }; // If output.layoutBlocks is true and options.skipRecognition is true, // the user wants layout data but text recognition has not been run. // In this case, fields that require text recognition are skipped. if (output.blocks || output.layoutBlocks) { ri.Begin(); do { if (ri.IsAtBeginningOf(RIL_BLOCK)) { const poly = ri.BlockPolygon(); let polygon = null; // BlockPolygon() returns null when automatic page segmentation is off if (TessModule.getPointer(poly) > 0) { const n = poly.get_n(); const px = poly.get_x(); const py = poly.get_y(); polygon = []; for (let i = 0; i < n; i += 1) { polygon.push([px.getValue(i), py.getValue(i)]); } /* * TODO: find out why _ptaDestroy doesn't work */ // TessModule._ptaDestroy(TessModule.getPointer(poly)); } block = { paragraphs: [], text: !options.skipRecognition ? ri.GetUTF8Text(RIL_BLOCK) : null, confidence: !options.skipRecognition ? ri.Confidence(RIL_BLOCK) : null, baseline: ri.getBaseline(RIL_BLOCK), bbox: ri.getBoundingBox(RIL_BLOCK), blocktype: enumToString(ri.BlockType(), 'PT'), polygon, }; blocks.push(block); } if (ri.IsAtBeginningOf(RIL_PARA)) { para = { lines: [], text: !options.skipRecognition ? ri.GetUTF8Text(RIL_PARA) : null, confidence: !options.skipRecognition ? ri.Confidence(RIL_PARA) : null, baseline: ri.getBaseline(RIL_PARA), bbox: ri.getBoundingBox(RIL_PARA), is_ltr: !!ri.ParagraphIsLtr(), }; block.paragraphs.push(para); } if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { // getRowAttributes was added in a recent minor version of Tesseract.js-core, // so we need to check if it exists before calling it. // This can be removed in the next major version (v6). let rowAttributes; if (ri.getRowAttributes) { rowAttributes = ri.getRowAttributes(); // Descenders is reported as a negative within Tesseract internally so we need to flip it. // The positive version is intuitive, and matches what is reported in the hOCR output. rowAttributes.descenders *= -1; } textline = { words: [], text: !options.skipRecognition ? ri.GetUTF8Text(RIL_TEXTLINE) : null, confidence: !options.skipRecognition ? ri.Confidence(RIL_TEXTLINE) : null, baseline: ri.getBaseline(RIL_TEXTLINE), rowAttributes, bbox: ri.getBoundingBox(RIL_TEXTLINE), }; para.lines.push(textline); } if (ri.IsAtBeginningOf(RIL_WORD)) { const fontInfo = ri.getWordFontAttributes(); const wordDir = ri.WordDirection(); word = { symbols: [], choices: [], text: !options.skipRecognition ? ri.GetUTF8Text(RIL_WORD) : null, confidence: !options.skipRecognition ? ri.Confidence(RIL_WORD) : null, baseline: ri.getBaseline(RIL_WORD), bbox: ri.getBoundingBox(RIL_WORD), is_numeric: !!ri.WordIsNumeric(), in_dictionary: !!ri.WordIsFromDictionary(), direction: enumToString(wordDir, 'DIR'), language: ri.WordRecognitionLanguage(), is_bold: fontInfo.is_bold, is_italic: fontInfo.is_italic, is_underlined: fontInfo.is_underlined, is_monospace: fontInfo.is_monospace, is_serif: fontInfo.is_serif, is_smallcaps: fontInfo.is_smallcaps, font_size: fontInfo.pointsize, font_id: fontInfo.font_id, font_name: fontInfo.font_name, }; const wc = new TessModule.WordChoiceIterator(ri); do { word.choices.push({ text: !options.skipRecognition ? wc.GetUTF8Text() : null, confidence: !options.skipRecognition ? wc.Confidence() : null, }); } while (wc.Next()); TessModule.destroy(wc); textline.words.push(word); } // let image = null; // var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL) // var image = pix2array(pix); // // for some reason it seems that things stop working if you destroy pics // TessModule._pixDestroy(TessModule.getPointer(pix)); if (ri.IsAtBeginningOf(RIL_SYMBOL)) { symbol = { choices: [], image: null, text: !options.skipRecognition ? ri.GetUTF8Text(RIL_SYMBOL) : null, confidence: !options.skipRecognition ? ri.Confidence(RIL_SYMBOL) : null, baseline: ri.getBaseline(RIL_SYMBOL), bbox: ri.getBoundingBox(RIL_SYMBOL), is_superscript: !!ri.SymbolIsSuperscript(), is_subscript: !!ri.SymbolIsSubscript(), is_dropcap: !!ri.SymbolIsDropcap(), }; word.symbols.push(symbol); const ci = new TessModule.ChoiceIterator(ri); do { symbol.choices.push({ text: !options.skipRecognition ? ci.GetUTF8Text() : null, confidence: !options.skipRecognition ? ci.Confidence() : null, }); } while (ci.Next()); // TessModule.destroy(i); } } while (ri.Next(RIL_SYMBOL)); TessModule.destroy(ri); } return { text: output.text ? api.GetUTF8Text() : null, hocr: output.hocr ? deindent(api.GetHOCRText()) : null, tsv: output.tsv ? api.GetTSVText() : null, box: output.box ? api.GetBoxText() : null, unlv: output.unlv ? api.GetUNLVText() : null, osd: output.osd ? api.GetOsdText() : null, pdf: output.pdf ? getPDFInternal(options.pdfTitle ?? 'Tesseract OCR Result', options.pdfTextOnly ?? false) : null, imageColor: output.imageColor ? getImage(imageType.COLOR) : null, imageGrey: output.imageGrey ? getImage(imageType.GREY) : null, imageBinary: output.imageBinary ? getImage(imageType.BINARY) : null, confidence: !options.skipRecognition ? api.MeanTextConf() : null, blocks: output.blocks && !options.skipRecognition ? blocks : null, layoutBlocks: output.layoutBlocks && options.skipRecognition ? blocks : null, psm: enumToString(api.GetPageSegMode(), 'PSM'), oem: enumToString(api.oem(), 'OEM'), version: api.Version(), debug: output.debug ? TessModule.FS.readFile('/debugInternal.txt', { encoding: 'utf8', flags: 'a+' }) : null, }; };