nfv
Version:
[Updated to use the ndv package instead of dv] FormVision is a node.js library for extracting data from scanned forms
106 lines (100 loc) • 3.81 kB
text/coffeescript
{unpack, validate} = require './schema'
{estimateTransform} = require './estimate_transform'
{distance} = require './math'
# Locate the short word (text of fewer than three characters) nearest to `box`.
#
# words       - list of word objects; each is read for `text` and `box`.
# box         - reference box handed to `distance` together with each word box.
# maxDistance - exclusive upper bound; a word qualifies only if it is strictly closer.
#
# Returns the index into `words` of the nearest qualifying word, or -1 when no
# short word is closer than `maxDistance`. On equal distances the earliest word wins.
findClosestShortWord = (words, box, maxDistance) ->
  bestIndex = -1
  bestGap = maxDistance
  for candidate, position in words
    # Longer words are never considered.
    continue unless candidate.text.length < 3
    gap = distance candidate.box, box
    # Strict comparison keeps the first of several equally distant words.
    continue unless gap < bestGap
    [bestIndex, bestGap] = [position, gap]
  bestIndex
# Match checkbox fields against short words found near their expected location.
#
# formData     - form data object; the entry for each matched field is obtained
#                via `unpack formData, field.path` and filled in place.
# fields       - candidate fields; each is read for `box` and `path`.
# words        - word objects; each is read for `text`, `confidence` and `box`.
# schemaToPage - maps a schema box to page coordinates (fallback search location).
# schemaToData - maps a schema box to data coordinates (primary search location).
#
# Returns the list of fields that received a word, in the order of `fields`.
matchByWords = (formData, fields, words, schemaToPage, schemaToData) ->
  # One record per accepted field: the field plus the index of its word.
  accepted = []
  # claims[i] collects the paths of every accepted field that picked words[i].
  claims = []

  for candidate in fields
    # Search around the data-transformed box first; fall back to the
    # page-transformed box only when that finds nothing. Both searches are
    # limited to the width of the data-transformed box.
    expected = schemaToData candidate.box
    reach = expected.width
    wordIndex = findClosestShortWord words, expected, reach
    if wordIndex is -1
      wordIndex = findClosestShortWord words, schemaToPage(candidate.box), reach
    continue if wordIndex is -1

    # Only words that validate against the field are accepted.
    continue unless validate candidate, words[wordIndex].text, false
    (claims[wordIndex] ?= []).push candidate.path
    accepted.push {field: candidate, wordIndex: wordIndex}

  # Second pass, once all claims are known, so conflicts are complete.
  for {field, wordIndex} in accepted
    word = words[wordIndex]
    entry = unpack formData, field.path
    entry.value = word.text
    entry.confidence = word.confidence
    entry.box = word.box
    # Every other field path that settled on the same word.
    entry.conflicts = (path for path in claims[wordIndex] when path isnt field.path)

  (record.field for record in accepted)
# Locate the mark nearest to `box`.
#
# marks       - list of mark objects; each is read for `box`.
# box         - reference box handed to `distance` together with each mark box.
# maxDistance - exclusive upper bound; a mark qualifies only if it is strictly closer.
#
# Returns the index into `marks` of the nearest qualifying mark, or -1 when no
# mark is closer than `maxDistance`. On equal distances the earliest mark wins.
findClosestMark = (marks, box, maxDistance) ->
  bestIndex = -1
  bestGap = maxDistance
  for candidate, position in marks
    gap = distance candidate.box, box
    # Strict comparison keeps the first of several equally distant marks.
    continue unless gap < bestGap
    [bestIndex, bestGap] = [position, gap]
  bestIndex
# Match checkbox fields against detected marks by position alone.
#
# formData     - form data object; the entry for every field is obtained via
#                `unpack formData, field.path` and filled in place.
# fields       - fields to resolve; each is read for `box` and `path`.
# marks        - mark objects; each is read for `box`, `checked` and `confidence`.
# schemaToPage - maps a schema box to page coordinates.
#
# For each field the closest mark within 3.00 box widths is looked up:
#   * none found                    -> value false, confidence 100
#   * found, beyond 0.95 box widths -> value false, confidence proportional to
#                                      the distance (distance / far limit, in percent)
#   * found within 0.95 box widths  -> the mark's own checked state, confidence and box
#
# Returns nothing.
matchByMark = (formData, fields, marks, schemaToPage) ->
  matches = {}
  # markUsage[i] collects the paths of every field that adopted marks[i].
  markUsage = []

  for field in fields
    # Find close marks using estimated locations (only page transform).
    pageBox = schemaToPage field.box
    nearDistance = pageBox.width * 0.95
    farDistance = pageBox.width * 3.00
    closeIndex = findClosestMark marks, pageBox, farDistance

    if closeIndex is -1
      # No marks with less than far distance found, thus false.
      matches[field.path] =
        index: -1
        value: false
        confidence: 100
        box: pageBox
      continue

    # Measure the winning mark once instead of on every use below.
    mark = marks[closeIndex]
    markDistance = distance mark.box, pageBox

    if markDistance > nearDistance
      # Mark between near and far distance found, thus false with reduced
      # confidence: the farther away the mark, the higher the confidence.
      matches[field.path] =
        index: -1
        value: false
        confidence: Math.round((markDistance / farDistance) * 100)
        box: pageBox
    else
      # Near mark found, thus use it.
      matches[field.path] =
        index: closeIndex
        value: mark.checked
        confidence: mark.confidence
        box: mark.box
      markUsage[closeIndex] ?= []
      markUsage[closeIndex].push field.path

  # Assign matching fields; done in a second pass so conflicts are complete.
  for field in fields
    match = matches[field.path]
    fieldData = unpack formData, field.path
    fieldData.value = match.value
    fieldData.confidence = match.confidence
    fieldData.box = match.box
    if match.index is -1
      fieldData.conflicts = []
    else
      # Every other field path that adopted the same mark.
      fieldData.conflicts = markUsage[match.index].filter (path) -> path isnt field.path
  return
# Match the checkbox fields of a form schema to page content.
#
# Matching depends on both content and location. Checkboxes are first matched
# against short words; only those left unassigned are then matched against marks.
# XXX: false negatives are to be expected when words are invalidated, which
# hurts valid vs. positional matching.
#
# formData     - form data object that receives the results (filled in place).
# formSchema   - schema whose `fields` of type 'checkbox' are processed.
# marks        - detected marks, passed on to the mark matcher.
# words        - detected words, passed on to the word matcher.
# schemaToPage - maps a schema box to page coordinates.
# schemaToData - maps a schema box to data coordinates.
#
# Returns nothing.
module.exports.matchCheckboxes = (formData, formSchema, marks, words, schemaToPage, schemaToData) ->
  isCheckbox = (entry) -> entry.type is 'checkbox'
  checkboxes = formSchema.fields.filter isCheckbox
  settledByWord = matchByWords formData, checkboxes, words, schemaToPage, schemaToData
  # Whatever the word pass did not settle goes to the mark pass.
  leftover = checkboxes.filter (entry) -> settledByWord.indexOf(entry) is -1
  matchByMark formData, leftover, marks, schemaToPage
  return