nfv
Version:
[Updated to use the ndv package instead of dv] FormVision is a Node.js library for extracting data from scanned forms
333 lines (301 loc) • 11.2 kB
text/coffeescript
{unpack, validate} = require './schema'
{distance, length, boundingBox} = require './math'
# Find anchor words (unique matches).
#
# A text field claims a word when that word is the only one on the page, among
# words longer than three characters, that `validate` accepts for the field.
# A word claimed by more than one field is ambiguous and yields no anchor.
#
# textFields   - fields to test; `box` and `path` are read from each.
# words        - recognized words; `text`, `box.x` and `box.y` are read from each.
# schemaToPage - function that maps a field box to a page box.
#
# Returns an array of anchors `{acceptedBy, offset: {x, y}, word}` where `offset`
# is the field's page position minus the word's position.
findAnchors = (textFields, words, schemaToPage) ->
  anchors = []
  # Every word that produced an anchor, including words whose anchor was revoked
  # later on; keeping revoked words here makes a third claimant fail as well.
  claimedWords = []
  for textField in textFields
    matches = (candidate for candidate in words when candidate.text.length > 3 and validate(textField, candidate.text, false))
    continue unless matches.length is 1
    word = matches[0]
    if word in claimedWords
      # Uniqueness check failed: This is at least the second field subscribing to `word`.
      # Pull the existing anchor (if it is still there) and skip this field completely.
      for anchor, index in anchors when anchor.word is word
        anchors.splice index, 1
        break
      continue
    # Safeguard: Disregard matches that are too far off (Manhattan distance above 400).
    pageBox = schemaToPage textField.box
    offsetX = pageBox.x - word.box.x
    offsetY = pageBox.y - word.box.y
    continue if Math.abs(offsetX) + Math.abs(offsetY) > 400
    anchors.push
      acceptedBy: textField.path
      offset:
        x: offsetX
        y: offsetY
      word: word
    claimedWords.push word
  return anchors
# Find the (at most two) words whose top-left corner lies nearest to the
# top-left corner of `pageBox`. Distance is measured as Manhattan distance and
# only words closer than 200 are considered. The result is ordered nearest first.
findTwoClosestWords = (pageBox, words) ->
  nearby = []
  for word in words
    manhattan = Math.abs(word.box.x - pageBox.x) + Math.abs(word.box.y - pageBox.y)
    nearby.push({word, manhattan}) if manhattan < 200
  nearby.sort (left, right) -> left.manhattan - right.manhattan
  return (entry.word for entry in nearby.slice(0, 2))
# Select the words that belong to `box`.
#
# A word is a candidate when its horizontal center lies strictly inside the box,
# it overlaps the box vertically, and its text is none of the typical
# 'character garbage' tokens. From the candidates, the word whose top edge is
# nearest to the top of the box defines the first line; the result keeps the
# candidates whose top edge lies in [firstLine - 10, firstLine + box.height - 5).
selectWords = (words, box) ->
  garbage = ['I', '|', '_', '—']
  rightEdge = box.x + box.width
  bottomEdge = box.y + box.height
  candidates = words.filter (word) ->
    centerX = word.box.x + word.box.width / 2
    horizontallyInside = box.x < centerX < rightEdge
    verticallyTouching = word.box.y < bottomEdge and word.box.y + word.box.height > box.y
    horizontallyInside and verticallyTouching and (word.text not in garbage)
  # The first candidate with the smallest vertical distance wins ties.
  firstLine = undefined
  smallestDiff = Infinity
  for candidate in candidates
    diff = Math.abs(box.y - candidate.box.y)
    continue unless diff < smallestDiff
    firstLine = candidate.box.y
    smallestDiff = diff
  lowerBound = firstLine - 10
  upperBound = firstLine + box.height - 5
  return candidates.filter (candidate) -> lowerBound <= candidate.box.y < upperBound
# Estimate the width of a single symbol of `word`: the box width divided by the
# number of characters, plus a correction of 20% of that width spread over the
# character count to compensate for the padding of the outermost characters.
estimateSymbolWidth = (word) ->
  symbolCount = word.text.length
  plainWidth = word.box.width / symbolCount
  paddingShare = 0.2 * plainWidth / symbolCount
  return plainWidth + paddingShare
# Convert words in random order to a single block of text.
#
# Words are grouped into lines by the position of their bottom edge: a jump of
# more than 15 between consecutive bottoms starts a new line. Within a line the
# words are ordered by x and joined with spaces; lines are joined with '\n'.
#
# words                - words with `text` and `box` ({x, y, width, height}).
#                        NOTE: the array is sorted in place by bottom edge.
# extendedGapDetection - when true, the gap between two words is compared to the
#                        estimated character width: fragments are only merged
#                        across gaps below 1.5 characters, and gaps above two
#                        characters are rendered as up to three spaces.
#
# Returns the text; '' for an empty word list.
wordsToText = (words, extendedGapDetection = false) ->
  bottomOf = (word) -> word.box.y + word.box.height
  # Extract lines from Y difference peaks. The previous word is tracked only
  # after sorting, so the first sorted word always opens the first line and no
  # empty leading line can appear.
  words.sort (a, b) -> bottomOf(a) - bottomOf(b)
  lines = []
  previous = null
  for word in words
    if not previous? or Math.abs(bottomOf(previous) - bottomOf(word)) > 15
      lines.push []
    lines[lines.length - 1].push word
    previous = word
  # Make an attempt to repair words split into characters, e.g. 'J 1 2/ 34 5'.
  fragment = /^(|\w|\d\d)\/?$/
  # Put lines in reading order and join them.
  renderedLines = []
  for line in lines
    line.sort (a, b) -> a.box.x - b.box.x
    lineText = ''
    previous = null
    for word in line
      if not previous?
        lineText += word.text
      else
        charWidth = (estimateSymbolWidth(previous) + estimateSymbolWidth(word)) / 2
        gap = word.box.x - (previous.box.x + previous.box.width)
        minimumGap = if extendedGapDetection then charWidth * 1.5 else 50
        if fragment.test(word.text) and fragment.test(previous.text) and gap < minimumGap
          # Two adjacent fragments: glue them together without a space.
          lineText += word.text
        else if extendedGapDetection and gap > charWidth * 2
          # Insert up to three spaces depending on gap.
          spaces = Math.min 3, Math.max(1, Math.floor(gap / charWidth))
          lineText += new Array(spaces + 1).join(' ') + word.text
        else
          lineText += ' ' + word.text
      previous = word
    renderedLines.push lineText
  return renderedLines.join '\n'
# Find the anchor whose word is nearest to `pageBox`, comparing top-left corners
# by Manhattan distance. The first anchor wins ties; returns null when there are
# no anchors.
findClosestAnchor = (anchors, pageBox) ->
  best = {anchor: null, distance: Infinity}
  for candidate in anchors
    wordBox = candidate.word.box
    distance = Math.abs(wordBox.x - pageBox.x) + Math.abs(wordBox.y - pageBox.y)
    best = {anchor: candidate, distance} if distance < best.distance
  return best.anchor
# Compute the confidence of a group of words as the rounded arithmetic mean of
# their individual `confidence` values.
wordsToConfidence = (words) ->
  total = 0
  for word in words
    total = total + word.confidence
  return Math.round(total / words.length)
# Compute the confidence that the area of `box` is empty, from the pixels of
# `image` inside that area.
#
# The box is first clamped to the image. A clamped box without area yields 50.
# Otherwise the crop is thresholded at 248 and its connected components
# (connectivity 8) wider than 4 and taller than 6 are treated as blobs. Without
# blobs the result is 100; with blobs it is derived from the share of the crop
# covered by the blobs' bounding box and falls to 0 at a share of 0.33 or more.
pixelsToConfidence = (box, image) ->
  clamp = (value, low, high) -> Math.max(low, Math.min(high, value))
  # Sanitize box to image.
  originX = clamp box.x, 0, image.width - 1
  originY = clamp box.y, 0, image.height - 1
  cropBox =
    x: originX
    y: originY
    width: clamp box.width, 0, image.width - originX
    height: clamp box.height, 0, image.height - originY
  return 50 if cropBox.width is 0 or cropBox.height is 0
  # Search for pixel blobs and compute confidence.
  blobImage = image.crop(cropBox).threshold(248)
  isBlob = (component) -> component.width > 4 and component.height > 6
  blobs = blobImage.connectedComponents(8).filter (component) -> isBlob component
  return 100 if blobs.length is 0
  extent = boundingBox blobs
  blobRatio = (extent.width * extent.height) / (blobImage.width * blobImage.height)
  return Math.max(0, Math.round((0.33 - blobRatio) * 100))
# Find text variants for `field` and order them by priority:
# valid content > valid epsilon > positional content > positional epsilon
#
# The field's page box is shifted by the offset of the closest anchor (if any).
# Words are then collected from several search boxes: the field's own box
# (priority 0) and, for each of the two closest words, a box of the field's size
# placed at that word (priority 2) plus a 90% and a 110% sized one (priority 3).
# Each distinct text accepted by `validate` becomes a variant. An empty
# ('epsilon') variant with priority 1 is added when the pixel confidence of the
# field box exceeds 50 or when no other variant was found.
#
# Returns the variants sorted by validity first and search priority second.
findVariants = (field, anchors, words, schemaToPage, image) ->
  pageBox = schemaToPage field.box
  anchor = findClosestAnchor anchors, pageBox
  if anchor?
    pageBox.x -= anchor.offset.x
    pageBox.y -= anchor.offset.y
  nearestWords = findTwoClosestWords pageBox, words

  # Generate search boxes.
  makeSearchBox = (origin, width, height, priority) ->
    {x: origin.x, y: origin.y, width, height, priority}
  searchBoxes = [makeSearchBox(pageBox, pageBox.width, pageBox.height, 0)]
  for nearWord in nearestWords
    origin = nearWord.box
    searchBoxes.push makeSearchBox(origin, pageBox.width, pageBox.height, 2)
    searchBoxes.push makeSearchBox(origin, pageBox.width * 0.9, pageBox.height * 0.9, 3)
    searchBoxes.push makeSearchBox(origin, pageBox.width * 1.1, pageBox.height * 1.1, 3)

  # Map words to variants using search boxes.
  variants = []
  for searchBox in searchBoxes
    candidateWords = selectWords words, searchBox
    continue if candidateWords.length is 0
    candidateText = wordsToText candidateWords, field.extendedGapDetection
    # Different search boxes frequently produce the same text; keep the first.
    continue if variants.some((known) -> known.text is candidateText)
    isValid = validate field, candidateText, true
    continue unless isValid
    variants.push
      path: field.path
      confidence: wordsToConfidence candidateWords
      box: boundingBox(candidateWords.map (candidate) -> candidate.box)
      text: candidateText
      words: candidateWords
      used: false
      priority: [isValid, searchBox.priority]

  # Insert epsilon variant when confident or nothing else was found.
  epsilonConfidence = pixelsToConfidence pageBox, image
  if epsilonConfidence > 50 or variants.length is 0
    epsilonValid = validate field, '', false
    variants.push
      path: field.path
      confidence: epsilonConfidence
      box: pageBox
      text: ''
      words: []
      used: false
      priority: [epsilonValid, 1]

  # Valid variants first; among equals the lower search priority number wins.
  variants.sort (a, b) ->
    deltaValid = b.priority[0] - a.priority[0]
    if deltaValid is 0 then a.priority[1] - b.priority[1] else deltaValid
  return variants
# Match text to form schema.
#
# This process is content- and location-sensitive.
#
# formData     - handed to `unpack` together with each field path; the object
#                returned by `unpack` receives `value`, `confidence`, `box` and
#                `conflicts` (paths of other fields that selected the same words).
# formSchema   - schema; only its `fields` with type 'text' are processed.
# words        - recognized words of the page.
# schemaToPage - function that maps a schema box to a page box.
# rawImage     - page image; `toGray()` is called on it.
#
# Returns `{anchors}`, or `{}` when the schema contains no text fields.
# Throws an Error when a field's `fieldSelector` returns a choice that is not
# an index of the offered values.
module.exports.matchText = (formData, formSchema, words, schemaToPage, rawImage) ->
  textFields = formSchema.fields.filter((field) -> field.type is 'text')
  return {} if textFields.length is 0
  # Process fields in reading order; fields less than 20 apart in y count as one row.
  textFields.sort (a, b) ->
    deltaY = a.box.y - b.box.y
    deltaX = a.box.x - b.box.x
    if Math.abs(deltaY) < 20
      return deltaX
    else
      return deltaY
  image = rawImage.toGray()

  # Find anchors to compensate for *very* inaccurate printing.
  anchors = findAnchors textFields, words, schemaToPage

  # Map words to variants.
  variantsByPath = {}
  variantsByWord = {}
  for field in textFields
    variants = findVariants field, anchors, words, schemaToPage, image
    variantsByPath[field.path] = variants
    for variant in variants
      for word in variant.words
        wordIndex = words.indexOf word
        variantsByWord[wordIndex] ?= []
        variantsByWord[wordIndex].push variant

  # Solve simple conflicts automatically (before passing to fieldSelector).
  # For the case that a field has only a single one-word variant available, remove that option from
  # everything else (except if this would in turn bring those fields' variants to zero).
  for field in textFields
    variants = variantsByPath[field.path]
    continue unless variants.length is 1 and variants[0].words.length is 1
    wordIndex = words.indexOf variants[0].words[0]
    for variant in variantsByWord[wordIndex]
      continue if variant.path is field.path
      conflictingVariants = variantsByPath[variant.path]
      continue unless conflictingVariants.length > 1
      # `variantsByWord` still lists variants that an earlier field already
      # removed. For those `indexOf` yields -1, and `splice(-1, 1)` would drop
      # the last, unrelated variant, so only remove what is actually present.
      position = conflictingVariants.indexOf variant
      conflictingVariants.splice(position, 1) if position >= 0

  # Reduce to fields.
  selectedVariants = []
  wordUsage = []
  for field in textFields
    variants = variantsByPath[field.path].filter (variant) -> not variant.used
    # All variants are marked as used... TODO: Assign epsilon instead?
    if variants.length is 0
      variants = variantsByPath[field.path]
    # Choose variant: ask the field's selector when there is a real choice,
    # otherwise take the highest ranked one.
    if variants.length > 1 and field.fieldSelector?
      values = variants.map (variant) -> variant.text
      choice = field.fieldSelector values
      if choice not of values
        throw new Error('Returned choice index out of bounds')
    else
      choice = 0
    selectedVariant = variants[choice]
    selectedVariants.push selectedVariant
    for word in selectedVariant.words
      wordIndex = words.indexOf word
      # Mark conflicting variants as used.
      for wordVariant in variantsByWord[wordIndex]
        wordVariant.used = true
      # Mark word as used.
      wordUsage[wordIndex] ?= []
      wordUsage[wordIndex].push field.path

  for field, fieldIndex in textFields
    selectedVariant = selectedVariants[fieldIndex]
    # Compute conflicts. The field's own path is seeded first so that it is
    # never collected, and is dropped again afterwards.
    conflicts = [field.path]
    for word in selectedVariant.words
      wordIndex = words.indexOf word
      for path in wordUsage[wordIndex] when path not in conflicts
        conflicts.push path
    conflicts.splice(0, 1)
    # Assign variant to field.
    fieldData = unpack formData, field.path
    fieldData.value = selectedVariant.text
    fieldData.confidence = selectedVariant.confidence
    fieldData.box = selectedVariant.box
    fieldData.conflicts = conflicts
  return {anchors}