nfv

Version:

[Updated to use ndv package instead dv] FormVision is a node.js library for extracting data from scanned forms

github.com/brmk/node-fv

brmk/node-fv

109 lines (101 loc) • 4.1 kB

text/coffeescript

dv = require 'ndv' {length, distanceVector, boxDistanceVector, intersectBox, boundingBox} = require './math' # Compiles a mask with lines that have a certain length. detectLineMask = (image, minLineLength) -> lineMask = new dv.Image(image.width, image.height, 8) longLines = image.toGray().lineSegments(0, 0, false).filter (line) -> return length(distanceVector(line.p1, line.p2)) >= minLineLength for line in longLines lineMask.drawLine line.p1, line.p2, 7, 'set' return lineMask mergeRegions = (items, predicate) -> # Initialize regions with unique indices. regions = [0...items.length] # Merge regions until predicate can no longer be applied. done = false while not done done = true # Merge regions (non-transitive). for item, i in items for otherItem, j in items[i + 1..] jj = j + i + 1 if regions[jj] isnt regions[i] and predicate(item, otherItem) region = Math.min(regions[jj], regions[i]) regions[i] = regions[jj] = region done = false # Propagate merges (transitive). for i in [0..regions.length] while regions[regions[i]] isnt regions[i] regions[i] = regions[regions[i]] return regions isSameBlock = (fontWidth, fontHeight) -> return (boxA, boxB) -> bottomA = boxA.y + boxA.height bottomB = boxB.y + boxB.height delta = boxDistanceVector boxA, boxB sameLine = Math.abs(bottomA - bottomB) < fontHeight / 2 and delta.x < fontWidth * 3 return sameLine or intersectBox(boxA, boxB) detectCandidates = (binarizedImage, fontWidth = 20, fontHeight = 30) -> hasLetterSize = (box) -> return fontWidth / 2 < box.width and fontHeight / 2 < box.height < fontHeight * 6 # Smear text a bit to extract letter boxes. smearWidth = (1 * fontWidth) + fontWidth % 2 smearHeight = (0.25 * fontHeight) + fontHeight % 2 boxes = binarizedImage.dilate(smearWidth, smearHeight).connectedComponents(8).filter(hasLetterSize) # Merge letter boxes to text regions. regions = mergeRegions boxes, isSameBlock(fontWidth, fontHeight) boxesByRegion = {} for region, boxIndex in regions boxesByRegion[region] ?= [] boxesByRegion[region].push(boxes[boxIndex]) candidates = (boxes for _, boxes of boxesByRegion) return candidates # Clone area of an image from boxes cloneUsingRegion = (image, boxes) -> cloneBox = boundingBox boxes cloneImage = new dv.Image cloneBox.width, cloneBox.height, image.depth cloneImage.clearBox x: 0 y: 0 width: cloneBox.width height: cloneBox.height for box in boxes cloneImage.drawImage image.crop(box.x, box.y, box.width + 25, box.height), x: box.x - cloneBox.x y: box.y - cloneBox.y width: box.width + 25 height: box.height return [cloneImage, cloneBox] findWords = (candidates, image, tesseract) -> words = [] for candidateBoxes in candidates # Crop and recognize. [cloneImage, cloneBox] = cloneUsingRegion image, candidateBoxes tesseract.image = cloneImage tesseract.pageSegMode = if cloneBox.height < 60 then 'single_line' else 'single_block' localWords = tesseract.findWords() for word in localWords # Transform back. word.box.x += cloneBox.x word.box.y += cloneBox.y # Store candidate. word.candidate = candidateBoxes[..] words = words.concat(localWords) # Filter words with tiny boxes. words = words.filter (word) -> word.box.width > 5 and word.box.height > 5 return words # Use given *Tesseract* instance to find all text grouped as words along with # confidence and boxes. module.exports.findText = (image, tesseract) -> clearedImage = new dv.Image image # Remove long lines. lineMask = detectLineMask image, 45 textImage = image.toGray().add lineMask.toGray() # Find words using a simple Otsu thresholding. tesseract.image = textImage candidates = detectCandidates textImage.otsuAdaptiveThreshold(128, 128, 0, 0, 0).image words = findWords candidates, image, tesseract # Remove words from image, but safeguard against removing 'noise' that may be a checkmark. for word in words when word.text.length >= 6 or (word.text.length >= 3 and word.confidence >= 30) clearedImage.clearBox word.box return [words, clearedImage]