// fv
// Version:
// FormVision is a node.js library for extracting data from scanned forms.
// 167 lines (157 loc) • 5.67 kB
// JavaScript
// Generated by CoffeeScript 2.3.1
var distance, estimateTransform, findClosestMark, findClosestShortWord, matchByMark, matchByWords, unpack, validate,
indexOf = [].indexOf;
({unpack, validate} = require('./schema'));
({estimateTransform} = require('./estimate_transform'));
({distance} = require('./math'));
// Locate the short word nearest to a box.
//   words       - list of objects exposing `text` and `box`.
//   box         - reference box handed to `distance` as the second argument.
//   maxDistance - exclusive upper bound; a word qualifies only when its
//                 distance is strictly smaller.
// Only words whose text has fewer than three characters are considered.
// Returns the index of the winning word in `words`, or -1 when none qualifies.
// On equal distances the earliest word wins.
findClosestShortWord = function(words, box, maxDistance) {
  let bestIndex = -1;
  let bestDistance = maxDistance;
  for (const [position, candidate] of words.entries()) {
    if (candidate.text.length < 3) {
      const gap = distance(candidate.box, box);
      if (gap < bestDistance) {
        bestDistance = gap;
        bestIndex = position;
      }
    }
  }
  return bestIndex;
};
// Fill checkbox fields from short words found near their expected location.
//   formData     - container resolved through `unpack(formData, field.path)`.
//   fields       - fields to match; each exposes `box` and `path`.
//   words        - list of objects exposing `text`, `confidence` and `box`.
//   schemaToPage - maps a schema box to a box; used as fallback location.
//   schemaToData - maps a schema box to a box; tried first.
// A field is matched when a short word lies within one data-box width of
// either location and `validate(field, text, false)` accepts the word's text.
// Matched fields receive `value`, `confidence`, `box` and `conflicts` (paths
// of the other fields that claimed the same word).
// Returns the list of matched fields in their original order.
matchByWords = function(formData, fields, words, schemaToPage, schemaToData) {
  const claimsByWord = [];
  const accepted = [];

  for (const candidate of fields) {
    // Try the data transform first and fall back to the page transform.
    const dataBox = schemaToData(candidate.box);
    let wordIndex = findClosestShortWord(words, dataBox, dataBox.width);
    if (wordIndex === -1) {
      wordIndex = findClosestShortWord(words, schemaToPage(candidate.box), dataBox.width);
    }
    if (wordIndex === -1) {
      continue;
    }
    // Words rejected by validation neither match nor count as a claim.
    if (!validate(candidate, words[wordIndex].text, false)) {
      continue;
    }
    if (claimsByWord[wordIndex] == null) {
      claimsByWord[wordIndex] = [];
    }
    claimsByWord[wordIndex].push(candidate.path);
    accepted.push({ field: candidate, wordIndex });
  }

  // Written in a second pass so that `conflicts` sees every claim on a word.
  for (const { field, wordIndex } of accepted) {
    const word = words[wordIndex];
    Object.assign(unpack(formData, field.path), {
      value: word.text,
      confidence: word.confidence,
      box: word.box,
      conflicts: claimsByWord[wordIndex].filter((path) => path !== field.path),
    });
  }

  return accepted.map((entry) => entry.field);
};
// Locate the mark nearest to a box.
//   marks       - list of objects exposing `box`.
//   box         - reference box handed to `distance` as the second argument.
//   maxDistance - exclusive upper bound; a mark qualifies only when its
//                 distance is strictly smaller.
// Returns the index of the winning mark in `marks`, or -1 when none qualifies.
// On equal distances the earliest mark wins.
findClosestMark = function(marks, box, maxDistance) {
  let nearestIndex = -1;
  let nearestDistance = maxDistance;
  for (const [position, candidate] of marks.entries()) {
    const gap = distance(candidate.box, box);
    if (gap < nearestDistance) {
      nearestIndex = position;
      nearestDistance = gap;
    }
  }
  return nearestIndex;
};
// Fill checkbox fields from detected marks near their expected page location.
//   formData     - container resolved through `unpack(formData, field.path)`.
//   fields       - fields to fill; each exposes `box` and `path`.
//   marks        - list of objects exposing `box`, `checked` and `confidence`.
//   schemaToPage - maps a schema box to a box exposing `width`.
// Every field in `fields` is written (value, confidence, box, conflicts):
//   * no mark closer than 3.00 box widths  -> false, confidence 100;
//   * closest mark beyond 0.95 box widths  -> false, confidence scaled by
//     distance / far distance;
//   * otherwise                            -> the mark's own checked state,
//     confidence and box, with `conflicts` listing the other fields that
//     used the same mark.
// Returns nothing.
matchByMark = function(formData, fields, marks, schemaToPage) {
  const markUsage = [];
  // Kept index-aligned with `fields`, so results never depend on how a path
  // would stringify as an object key.
  const matches = [];

  for (const field of fields) {
    // Find close marks using estimated locations (only page transform).
    const pageBox = schemaToPage(field.box);
    const nearDistance = pageBox.width * 0.95;
    const farDistance = pageBox.width * 3.00;
    const closeIndex = findClosestMark(marks, pageBox, farDistance);

    if (closeIndex === -1) {
      // No marks with less than far distance found, thus false.
      matches.push({ index: -1, value: false, confidence: 100, box: pageBox });
      continue;
    }

    const mark = marks[closeIndex];
    // Measured once and reused for both the threshold test and the confidence.
    const markDistance = distance(mark.box, pageBox);

    if (markDistance > nearDistance) {
      // Mark between near and far distance found, thus false with reduced confidence.
      matches.push({
        index: -1,
        value: false,
        confidence: Math.round((markDistance / farDistance) * 100),
        box: pageBox,
      });
      continue;
    }

    // Near mark found, thus use it.
    matches.push({
      index: closeIndex,
      value: mark.checked,
      confidence: mark.confidence,
      box: mark.box,
    });
    if (markUsage[closeIndex] == null) {
      markUsage[closeIndex] = [];
    }
    markUsage[closeIndex].push(field.path);
  }

  // Assign in a second pass so that `conflicts` sees every use of a mark.
  fields.forEach((field, position) => {
    const match = matches[position];
    const fieldData = unpack(formData, field.path);
    fieldData.value = match.value;
    fieldData.confidence = match.confidence;
    fieldData.box = match.box;
    fieldData.conflicts = match.index === -1
      ? []
      : markUsage[match.index].filter((path) => path !== field.path);
  });
};
// Match checkboxes to form schema.
// Matching is sensitive to both content and location. Short words are tried
// first; only the checkbox fields they leave unassigned are matched by marks.
// XXX: False negatives are to be expected when words are invalidated; this
// hurts valid vs. positional matching.
//   formData     - container that receives the per-field results.
//   formSchema   - schema whose `fields` entries of type 'checkbox' are matched.
//   marks        - detected marks, forwarded to `matchByMark`.
//   words        - recognized words, forwarded to `matchByWords`.
//   schemaToPage - box transform forwarded to both matchers.
//   schemaToData - box transform forwarded to `matchByWords`.
// Returns nothing; results are written into `formData` by the matchers.
module.exports.matchCheckboxes = function(formData, formSchema, marks, words, schemaToPage, schemaToData) {
  const checkboxes = formSchema.fields.filter((candidate) => candidate.type === 'checkbox');
  const matchedByWord = matchByWords(formData, checkboxes, words, schemaToPage, schemaToData);
  const unmatched = checkboxes.filter((candidate) => !matchedByWord.includes(candidate));
  matchByMark(formData, unmatched, marks, schemaToPage);
};