nfv
Version:
[Updated to use the ndv package instead of dv] FormVision is a node.js library for extracting data from scanned forms.
109 lines (101 loc) • 4.1 kB
text/coffeescript
dv = require 'ndv'
{length, distanceVector, boxDistanceVector, intersectBox, boundingBox} = require './math'
# Builds a depth-8 mask image, sized like `image`, onto which every detected
# line segment of at least `minLineLength` is drawn.
#
# image         - source image; converted with `toGray()` before its line
#                 segments are extracted.
# minLineLength - minimum `length` of the `distanceVector` between a segment's
#                 end points for the segment to be drawn onto the mask.
#
# Returns the mask image.
detectLineMask = (image, minLineLength) ->
  mask = new dv.Image(image.width, image.height, 8)
  # Keeps only segments whose end points are far enough apart.
  isLongEnough = (segment) ->
    span = distanceVector(segment.p1, segment.p2)
    length(span) >= minLineLength
  segments = image.toGray().lineSegments(0, 0, false)
  # NOTE(review): the `7` is presumably the stroke width of the drawn line -
  # confirm against the ndv `drawLine` documentation.
  for segment in segments.filter(isLongEnough)
    mask.drawLine segment.p1, segment.p2, 7, 'set'
  return mask
# Groups items into regions: two items share a region when they are linked,
# directly or through a chain of other items, by `predicate`.
#
# items     - array of arbitrary values.
# predicate - function (a, b) returning truthy when the two items belong
#             together. It is only ever called with `a` located before `b`
#             in `items`.
#
# Returns an array parallel to `items`; entry i is the region label of item i.
# Labels start out as the item's own index and are only ever replaced by a
# smaller label, so every region ends up labelled with an index of one of its
# own members.
mergeRegions = (items, predicate) ->
  # Initialize regions with unique indices.
  regions = [0...items.length]
  # Merge regions until predicate can no longer be applied.
  done = false
  while not done
    done = true
    # Merge regions (non-transitive). Index-based inner loop: no per-item
    # slice of `items` has to be allocated.
    for item, i in items
      for jj in [i + 1...items.length] by 1
        if regions[jj] isnt regions[i] and predicate(item, items[jj])
          region = Math.min(regions[jj], regions[i])
          regions[i] = regions[jj] = region
          done = false
    # Propagate merges (transitive). The range is exclusive: `regions` holds
    # exactly `regions.length` entries, so index `regions.length` must not be
    # visited.
    for i in [0...regions.length] by 1
      while regions[regions[i]] isnt regions[i]
        regions[i] = regions[regions[i]]
  return regions
# Creates a predicate that decides whether two boxes belong to the same block.
#
# fontWidth, fontHeight - font dimensions that scale the tolerances below.
#
# Returns a function (boxA, boxB). It yields true when the bottom edges of the
# boxes differ by less than half a `fontHeight` and the x component of
# `boxDistanceVector(boxA, boxB)` is below three `fontWidth`s; otherwise it
# yields whatever `intersectBox(boxA, boxB)` returns.
isSameBlock = (fontWidth, fontHeight) ->
  maxBottomOffset = fontHeight / 2
  maxHorizontalGap = fontWidth * 3
  (boxA, boxB) ->
    bottomOffset = Math.abs((boxA.y + boxA.height) - (boxB.y + boxB.height))
    gap = boxDistanceVector boxA, boxB
    # Boxes sitting on the same line and close enough need no overlap test.
    return true if bottomOffset < maxBottomOffset and gap.x < maxHorizontalGap
    intersectBox(boxA, boxB)
# Finds groups of letter-sized boxes in a binarized image.
#
# binarizedImage - image exposing `dilate(width, height)`; the dilated result
#                  must expose `connectedComponents(n)` returning an array of
#                  boxes with `x`, `y`, `width` and `height`.
# fontWidth      - font width used for the size filter and smearing (default 20).
# fontHeight     - font height used for the size filter and smearing (default 30).
#
# Returns an array of box arrays, one per region computed by `mergeRegions`
# with the `isSameBlock` predicate.
detectCandidates = (binarizedImage, fontWidth = 20, fontHeight = 30) ->
  # A box qualifies when it is wider than half a font width and its height
  # lies strictly between half a font height and six font heights.
  hasLetterSize = (box) ->
    return false unless fontWidth / 2 < box.width
    fontHeight / 2 < box.height and box.height < fontHeight * 6
  # Smear text a bit to extract letter boxes.
  smearWidth = 1 * fontWidth + fontWidth % 2
  smearHeight = 0.25 * fontHeight + fontHeight % 2
  smeared = binarizedImage.dilate(smearWidth, smearHeight)
  # NOTE(review): the `8` is presumably the pixel connectivity - confirm
  # against the ndv `connectedComponents` documentation.
  letterBoxes = smeared.connectedComponents(8).filter(hasLetterSize)
  # Merge letter boxes to text regions; `labels[i]` is the region of box i.
  labels = mergeRegions letterBoxes, isSameBlock(fontWidth, fontHeight)
  grouped = {}
  for label, index in labels
    (grouped[label] ?= []).push letterBoxes[index]
  return (group for label, group of grouped)
# Copies the given boxes of `image` into a fresh image that is as large as the
# bounding box of all boxes.
#
# image - source image; must expose `depth` and `crop(x, y, width, height)`.
# boxes - array of boxes (`x`, `y`, `width`, `height`) to copy.
#
# Returns a pair [clone, bounds]: the new image and the bounding box (as
# computed by `boundingBox`) that positions it inside `image`.
cloneUsingRegion = (image, boxes) ->
  bounds = boundingBox boxes
  clone = new dv.Image bounds.width, bounds.height, image.depth
  wholeClone =
    x: 0
    y: 0
    width: bounds.width
    height: bounds.height
  clone.clearBox wholeClone
  # NOTE(review): every box is copied 25 units wider than its own width; the
  # reason for this margin is not visible here - confirm before changing it.
  extraWidth = 25
  for box in boxes
    copyWidth = box.width + extraWidth
    patch = image.crop(box.x, box.y, copyWidth, box.height)
    # Place the patch relative to the top-left corner of the bounding box.
    target =
      x: box.x - bounds.x
      y: box.y - bounds.y
      width: copyWidth
      height: box.height
    clone.drawImage patch, target
  return [clone, bounds]
# Runs the given Tesseract instance over every candidate region and collects
# the recognized words.
#
# candidates - array of box arrays, one array per region.
# image      - image the regions are copied from (via `cloneUsingRegion`).
# tesseract  - recognizer; its `image` and `pageSegMode` properties are
#              overwritten for every region before `findWords()` is called.
#
# Returns the words of all regions whose box is more than 5 wide and more than
# 5 high. Each word's box is expressed in coordinates of `image`, and each word
# carries a `candidate` property holding a copy of its region's box array.
findWords = (candidates, image, tesseract) ->
  collected = []
  for region in candidates
    # Crop and recognize.
    [snippet, offset] = cloneUsingRegion image, region
    tesseract.image = snippet
    if offset.height < 60
      tesseract.pageSegMode = 'single_line'
    else
      tesseract.pageSegMode = 'single_block'
    recognized = tesseract.findWords()
    for found in recognized
      # Shift the box from snippet coordinates back into image coordinates.
      found.box.x += offset.x
      found.box.y += offset.y
      # Remember which region produced the word.
      found.candidate = region.slice(0)
    collected = collected.concat(recognized)
  # Drop words with tiny boxes.
  isLargeEnough = (found) -> found.box.width > 5 and found.box.height > 5
  return collected.filter(isLargeEnough)
# Finds all text in `image`, grouped as words with confidence and boxes, using
# the given *Tesseract* instance.
#
# image     - image to analyze.
# tesseract - recognizer instance; its `image` (and, through `findWords`, its
#             `pageSegMode`) property is overwritten.
#
# Returns a pair [words, clearedImage]: the recognized words, and an image
# constructed from `image` in which the boxes of sufficiently reliable words
# have been cleared.
module.exports.findText = (image, tesseract) ->
  # NOTE(review): `new dv.Image image` presumably yields an independent copy -
  # confirm against the ndv documentation.
  clearedImage = new dv.Image image
  # Remove long lines.
  grayImage = image.toGray()
  lineMask = detectLineMask image, 45
  textImage = grayImage.add lineMask.toGray()
  # Find words using a simple Otsu thresholding.
  tesseract.image = textImage
  thresholded = textImage.otsuAdaptiveThreshold(128, 128, 0, 0, 0)
  candidates = detectCandidates thresholded.image
  words = findWords candidates, image, tesseract
  # Only erase words that are long (6+ characters), or at least 3 characters
  # with confidence 30 or more: anything else is 'noise' that may be a
  # checkmark and has to stay in the image.
  isSafeToErase = (word) ->
    return true if word.text.length >= 6
    word.text.length >= 3 and word.confidence >= 30
  for word in words
    continue unless isSafeToErase(word)
    clearedImage.clearBox word.box
  return [words, clearedImage]