UNPKG

fv

Version:

FormVision is a Node.js library for extracting data from scanned forms.

167 lines (157 loc) 5.67 kB
// Originally generated by CoffeeScript 2.3.1.
const { unpack, validate } = require('./schema');
const { estimateTransform } = require('./estimate_transform');
const { distance } = require('./math');

/**
 * Shared scan used by both closest-item finders.
 *
 * Walks `items` in order and returns the index of the accepted item whose
 * `box` has the smallest `distance` to `box`. Only items strictly closer than
 * `maxDistance` qualify; on ties the earliest item wins.
 *
 * @param {Array} items - candidates, each exposing a `box` property
 * @param {*} box - reference box handed to `distance`
 * @param {number} maxDistance - exclusive upper bound for a qualifying distance
 * @param {function(*): boolean} accept - filter deciding whether an item is considered
 * @returns {number} index of the closest qualifying item, or -1 if there is none
 */
function findClosest(items, box, maxDistance, accept) {
  let bestIndex = -1;
  let bestDistance = maxDistance;
  for (let position = 0, count = items.length; position < count; position += 1) {
    const candidate = items[position];
    if (accept(candidate)) {
      const gap = distance(candidate.box, box);
      if (gap < bestDistance) {
        bestDistance = gap;
        bestIndex = position;
      }
    }
  }
  return bestIndex;
}

/**
 * Finds the closest word to `box` among words whose text is shorter than
 * three characters.
 *
 * @param {Array} words - recognized words, each with `text` and `box`
 * @param {*} box - reference box
 * @param {number} maxDistance - exclusive distance limit
 * @returns {number} index into `words`, or -1 when nothing qualifies
 */
function findClosestShortWord(words, box, maxDistance) {
  return findClosest(words, box, maxDistance, (word) => word.text.length < 3);
}

/**
 * Finds the closest mark to `box`; every mark is a candidate.
 *
 * @param {Array} marks - detected marks, each with a `box`
 * @param {*} box - reference box
 * @param {number} maxDistance - exclusive distance limit
 * @returns {number} index into `marks`, or -1 when nothing qualifies
 */
function findClosestMark(marks, box, maxDistance) {
  return findClosest(marks, box, maxDistance, () => true);
}

/**
 * Assigns checkbox fields from short words located near their expected position.
 *
 * For every field the data-transformed box is searched first and the
 * page-transformed box second; both searches use the data box width as the
 * distance limit. A word is only taken when `validate` accepts its text.
 * Assigned field data receives `value`, `confidence`, `box` and `conflicts`
 * (paths of the other fields that were matched to the same word).
 *
 * @param {*} formData - container from which `unpack` resolves per-field data
 * @param {Array} fields - schema fields to match
 * @param {Array} words - recognized words (`text`, `confidence`, `box`)
 * @param {function} schemaToPage - maps a schema box to a page box
 * @param {function} schemaToData - maps a schema box to a data box
 * @returns {Array} the fields that were assigned, in input order
 */
function matchByWords(formData, fields, words, schemaToPage, schemaToData) {
  const assignments = [];
  const pathsByWord = new Map();

  for (const field of fields) {
    // Find short words close to estimated locations (data and page transform).
    const dataBox = schemaToData(field.box);
    let wordIndex = findClosestShortWord(words, dataBox, dataBox.width);
    if (wordIndex === -1) {
      wordIndex = findClosestShortWord(words, schemaToPage(field.box), dataBox.width);
    }

    // Validate short words.
    if (wordIndex !== -1 && validate(field, words[wordIndex].text, false)) {
      if (!pathsByWord.has(wordIndex)) {
        pathsByWord.set(wordIndex, []);
      }
      pathsByWord.get(wordIndex).push(field.path);
      assignments.push({ field, wordIndex });
    }
  }

  // Assign matching fields. This runs after the search so that `conflicts`
  // sees every field that claimed the same word.
  for (const { field, wordIndex } of assignments) {
    const fieldData = unpack(formData, field.path);
    const word = words[wordIndex];
    fieldData.value = word.text;
    fieldData.confidence = word.confidence;
    fieldData.box = word.box;
    fieldData.conflicts = pathsByWord
      .get(wordIndex)
      .filter((path) => path !== field.path);
  }

  return assignments.map(({ field }) => field);
}

/**
 * Assigns checkbox fields from detected marks near their expected page position.
 *
 * Every field in `fields` receives `value`, `confidence`, `box` and
 * `conflicts`:
 *  - no mark within 3.00 box widths: value false, confidence 100;
 *  - closest mark farther than 0.95 box widths: value false, confidence
 *    proportional to the mark distance relative to the far limit;
 *  - otherwise the mark's own `checked`, `confidence` and `box` are used.
 *
 * @param {*} formData - container from which `unpack` resolves per-field data
 * @param {Array} fields - schema fields to match
 * @param {Array} marks - detected marks (`box`, `checked`, `confidence`)
 * @param {function} schemaToPage - maps a schema box to a page box
 */
function matchByMark(formData, fields, marks, schemaToPage) {
  const matches = {};
  const pathsByMark = new Map();

  for (const field of fields) {
    // Find close marks using estimated locations (only page transform).
    const pageBox = schemaToPage(field.box);
    const nearDistance = pageBox.width * 0.95;
    const farDistance = pageBox.width * 3.00;
    const markIndex = findClosestMark(marks, pageBox, farDistance);

    if (markIndex === -1) {
      // No marks with less than far distance found, thus false.
      matches[field.path] = {
        index: -1,
        value: false,
        confidence: 100,
        box: pageBox,
      };
      continue;
    }

    const mark = marks[markIndex];
    if (distance(mark.box, pageBox) > nearDistance) {
      // Mark between near and far distance found, thus false with reduced confidence.
      matches[field.path] = {
        index: -1,
        value: false,
        confidence: Math.round((distance(mark.box, pageBox) / farDistance) * 100),
        box: pageBox,
      };
      continue;
    }

    // Near mark found, thus use it.
    matches[field.path] = {
      index: markIndex,
      value: mark.checked,
      confidence: mark.confidence,
      box: mark.box,
    };
    if (!pathsByMark.has(markIndex)) {
      pathsByMark.set(markIndex, []);
    }
    pathsByMark.get(markIndex).push(field.path);
  }

  // Assign matching fields.
  for (const field of fields) {
    const match = matches[field.path];
    const fieldData = unpack(formData, field.path);
    fieldData.value = match.value;
    fieldData.confidence = match.confidence;
    fieldData.box = match.box;
    fieldData.conflicts = match.index === -1
      ? []
      : pathsByMark.get(match.index).filter((path) => path !== field.path);
  }
}

// Match checkboxes to form schema.
// This process is content- and location-sensitive. Short words are preferred over marks.
// XXX: false negatives are to be expected when words are invalidated, this hurts valid vs. positional matching.
/**
 * @param {*} formData - container from which `unpack` resolves per-field data
 * @param {{fields: Array}} formSchema - schema; only fields with type 'checkbox' are used
 * @param {Array} marks - detected marks
 * @param {Array} words - recognized words
 * @param {function} schemaToPage - maps a schema box to a page box
 * @param {function} schemaToData - maps a schema box to a data box
 */
module.exports.matchCheckboxes = function(formData, formSchema, marks, words, schemaToPage, schemaToData) {
  const checkboxFields = formSchema.fields.filter((candidate) => candidate.type === 'checkbox');
  const assignedFields = matchByWords(formData, checkboxFields, words, schemaToPage, schemaToData);
  // Whatever the word pass could not assign falls back to mark matching.
  const remainingFields = checkboxFields.filter(
    (candidate) => assignedFields.indexOf(candidate) === -1
  );
  matchByMark(formData, remainingFields, marks, schemaToPage);
};