UNPKG

nfv

Version:

[Updated to use the ndv package instead of dv] FormVision is a node.js library for extracting data from scanned forms

161 lines (151 loc) 5.71 kB
// Originally generated by CoffeeScript 1.12.7; reformatted and documented by hand.
var dv = require('ndv');
var math = require('./math');

var length = math.length;
var distanceVector = math.distanceVector;
var boxDistanceVector = math.boxDistanceVector;
var intersectBox = math.intersectBox;
var boundingBox = math.boundingBox;

// Minimum segment length passed to detectLineMask() by findText().
var MIN_LINE_LENGTH = 45;
// Extra width added to every box when it is cropped/drawn in cloneUsingRegion().
var EXTRA_CROP_WIDTH = 25;
// Clones lower than this are recognised with page segmentation mode
// 'single_line'; all others use 'single_block'.
var SINGLE_LINE_MAX_HEIGHT = 60;
// Recognised words must be strictly larger than this in both width and height.
var MIN_WORD_SIZE = 5;

/**
 * Builds a mask image containing every line segment of `image` whose length
 * is at least `minLineLength`.
 *
 * @param {dv.Image} image - Source image; converted with `toGray()` before
 *   `lineSegments(0, 0, false)` is called on it.
 * @param {number} minLineLength - Segments shorter than this are ignored.
 * @returns {dv.Image} A new image of the same width/height, constructed with
 *   `8` as third argument (presumably the bit depth - confirm against ndv).
 */
var detectLineMask = function(image, minLineLength) {
  var lineMask = new dv.Image(image.width, image.height, 8);
  var longLines = image.toGray().lineSegments(0, 0, false).filter(function(line) {
    return length(distanceVector(line.p1, line.p2)) >= minLineLength;
  });
  longLines.forEach(function(line) {
    // NOTE(review): `7` is presumably the stroke width and 'set' the pixel
    // operation - confirm against the ndv drawLine documentation.
    lineMask.drawLine(line.p1, line.p2, 7, 'set');
  });
  return lineMask;
};

/**
 * Groups items into connected regions: two items end up in the same region
 * when they are linked, directly or transitively, by `predicate`.
 *
 * Implemented as a union-find in which the smaller index always becomes the
 * root, so every region is labelled with the smallest index it contains.
 * The predicate is only evaluated for pairs (i, j) with i < j that are not
 * already known to share a region.
 *
 * @param {Array} items - Items to group.
 * @param {function(*, *): boolean} predicate - Returns truthy when the two
 *   given items belong together.
 * @returns {number[]} Array parallel to `items`; entry `i` is the region label
 *   (smallest member index) of `items[i]`. Empty input yields `[]`.
 */
var mergeRegions = function(items, predicate) {
  var count = items.length;
  var regions = [];
  var i;
  var j;
  var rootA;
  var rootB;

  for (i = 0; i < count; i++) {
    regions.push(i);
  }

  // Returns the root label of `index`, compressing the visited path.
  var findRoot = function(index) {
    var root = index;
    var next;
    while (regions[root] !== root) {
      root = regions[root];
    }
    while (regions[index] !== root) {
      next = regions[index];
      regions[index] = root;
      index = next;
    }
    return root;
  };

  for (i = 0; i < count; i++) {
    for (j = i + 1; j < count; j++) {
      rootA = findRoot(i);
      rootB = findRoot(j);
      if (rootA !== rootB && predicate(items[i], items[j])) {
        // Keep the smaller index as root so labels equal the minimum member.
        if (rootA < rootB) {
          regions[rootB] = rootA;
        } else {
          regions[rootA] = rootB;
        }
      }
    }
  }

  // Flatten so that every entry points directly at its region label.
  for (i = 0; i < count; i++) {
    regions[i] = findRoot(i);
  }
  return regions;
};

/**
 * Creates the predicate used to decide whether two boxes belong to the same
 * text block.
 *
 * Two boxes match when their bottom edges differ by less than half the font
 * height and the x component of `boxDistanceVector` is below three font
 * widths, or otherwise when `intersectBox` reports an intersection.
 *
 * @param {number} fontWidth - Expected glyph width.
 * @param {number} fontHeight - Expected glyph height.
 * @returns {function(Object, Object): *} Predicate over boxes with
 *   `x`/`y`/`width`/`height`.
 */
var isSameBlock = function(fontWidth, fontHeight) {
  return function(boxA, boxB) {
    var bottomA = boxA.y + boxA.height;
    var bottomB = boxB.y + boxB.height;
    var delta = boxDistanceVector(boxA, boxB);
    var sameLine = Math.abs(bottomA - bottomB) < fontHeight / 2 && delta.x < fontWidth * 3;
    return sameLine || intersectBox(boxA, boxB);
  };
};

/**
 * Finds groups of boxes that are candidates for containing text.
 *
 * The image is dilated, its connected components are filtered by size, and
 * the remaining boxes are grouped with mergeRegions()/isSameBlock().
 *
 * @param {dv.Image} binarizedImage - Image on which `dilate` and
 *   `connectedComponents` are called.
 * @param {number} [fontWidth=20] - Expected glyph width.
 * @param {number} [fontHeight=30] - Expected glyph height.
 * @returns {Array<Array<Object>>} One array of boxes per region, ordered by
 *   ascending region label.
 */
var detectCandidates = function(binarizedImage, fontWidth, fontHeight) {
  if (fontWidth == null) {
    fontWidth = 20;
  }
  if (fontHeight == null) {
    fontHeight = 30;
  }

  // Wider than half a glyph, and between half and six glyph heights tall.
  var hasLetterSize = function(box) {
    return fontWidth / 2 < box.width &&
      fontHeight / 2 < box.height && box.height < fontHeight * 6;
  };

  var smearWidth = (1 * fontWidth) + fontWidth % 2;
  var smearHeight = (0.25 * fontHeight) + fontHeight % 2;
  var boxes = binarizedImage
    .dilate(smearWidth, smearHeight)
    .connectedComponents(8)
    .filter(hasLetterSize);
  var regions = mergeRegions(boxes, isSameBlock(fontWidth, fontHeight));

  var boxesByRegion = {};
  regions.forEach(function(region, boxIndex) {
    if (boxesByRegion[region] == null) {
      boxesByRegion[region] = [];
    }
    boxesByRegion[region].push(boxes[boxIndex]);
  });

  // Integer-like keys are enumerated in ascending numeric order.
  return Object.keys(boxesByRegion).map(function(region) {
    return boxesByRegion[region];
  });
};

/**
 * Copies the given boxes of `image` into a fresh image that is exactly as
 * large as their bounding box.
 *
 * @param {dv.Image} image - Source image.
 * @param {Array<Object>} boxes - Boxes (`x`, `y`, `width`, `height`) to copy.
 * @returns {Array} Pair `[cloneImage, cloneBox]`, where `cloneBox` is
 *   `boundingBox(boxes)`.
 */
var cloneUsingRegion = function(image, boxes) {
  var cloneBox = boundingBox(boxes);
  var cloneImage = new dv.Image(cloneBox.width, cloneBox.height, image.depth);
  cloneImage.clearBox({
    x: 0,
    y: 0,
    width: cloneBox.width,
    height: cloneBox.height
  });
  boxes.forEach(function(box) {
    // Each box is copied EXTRA_CROP_WIDTH pixels wider than reported.
    // NOTE(review): this can reach past the right edge of the clone/source;
    // how ndv treats out-of-bounds crops/draws is not visible here.
    var paddedWidth = box.width + EXTRA_CROP_WIDTH;
    cloneImage.drawImage(image.crop(box.x, box.y, paddedWidth, box.height), {
      x: box.x - cloneBox.x,
      y: box.y - cloneBox.y,
      width: paddedWidth,
      height: box.height
    });
  });
  return [cloneImage, cloneBox];
};

/**
 * Runs word recognition on every candidate region.
 *
 * For each candidate, `tesseract.image` and `tesseract.pageSegMode` are
 * overwritten before `tesseract.findWords()` is called; they are not restored
 * afterwards. Word boxes are translated back into the coordinates of `image`,
 * and each word gets a `candidate` property holding a copy of its box list.
 *
 * @param {Array<Array<Object>>} candidates - Box groups from detectCandidates().
 * @param {dv.Image} image - Image the candidate boxes refer to.
 * @param {Object} tesseract - Object exposing `image`, `pageSegMode` and
 *   `findWords()`.
 * @returns {Array<Object>} Words whose box is larger than MIN_WORD_SIZE in
 *   both dimensions.
 */
var findWords = function(candidates, image, tesseract) {
  var words = [];
  candidates.forEach(function(candidateBoxes) {
    var clone = cloneUsingRegion(image, candidateBoxes);
    var cloneImage = clone[0];
    var cloneBox = clone[1];

    tesseract.image = cloneImage;
    tesseract.pageSegMode = cloneBox.height < SINGLE_LINE_MAX_HEIGHT ? 'single_line' : 'single_block';

    var localWords = tesseract.findWords();
    localWords.forEach(function(word) {
      word.box.x += cloneBox.x;
      word.box.y += cloneBox.y;
      word.candidate = candidateBoxes.slice(0);
    });
    words = words.concat(localWords);
  });

  return words.filter(function(word) {
    return word.box.width > MIN_WORD_SIZE && word.box.height > MIN_WORD_SIZE;
  });
};

/**
 * Finds text in `image` and returns the recognised words together with a copy
 * of the image in which accepted words have been cleared.
 *
 * A word's box is cleared when its text has at least 6 characters, or at
 * least 3 characters and a confidence of at least 30.
 *
 * Side effect: `tesseract.image` (and, via findWords(), `tesseract.pageSegMode`)
 * are overwritten.
 *
 * @param {dv.Image} image - Input image.
 * @param {Object} tesseract - Object exposing `image`, `pageSegMode` and
 *   `findWords()`.
 * @returns {Array} Pair `[words, clearedImage]`.
 */
module.exports.findText = function(image, tesseract) {
  var clearedImage = new dv.Image(image);
  var lineMask = detectLineMask(image, MIN_LINE_LENGTH);
  // Adding the line mask to the gray image is presumably meant to blank out
  // long lines before text detection - confirm against ndv's `add` semantics.
  var textImage = image.toGray().add(lineMask.toGray());
  tesseract.image = textImage;

  var binarized = textImage.otsuAdaptiveThreshold(128, 128, 0, 0, 0).image;
  var candidates = detectCandidates(binarized);
  var words = findWords(candidates, image, tesseract);

  words.forEach(function(word) {
    var textLength = word.text.length;
    if (textLength >= 6 || (textLength >= 3 && word.confidence >= 30)) {
      clearedImage.clearBox(word.box);
    }
  });
  return [words, clearedImage];
};