nfv
Version:
[Updated to use the ndv package instead of dv] FormVision is a node.js library for extracting data from scanned forms
445 lines (432 loc) • 14.4 kB
JavaScript
// Generated by CoffeeScript 1.12.7
var boundingBox, distance, estimateSymbolWidth, findAnchors, findClosestAnchor, findTwoClosestWords, findVariants, length, pixelsToConfidence, ref, ref1, selectWords, unpack, validate, wordsToConfidence, wordsToText,
indexOf = [].indexOf || function(item) { for (var i = 0, l = this.length; i < l; i++) { if (i in this && this[i] === item) return i; } return -1; };
ref = require('./schema'), unpack = ref.unpack, validate = ref.validate;
ref1 = require('./math'), distance = ref1.distance, length = ref1.length, boundingBox = ref1.boundingBox;
/**
 * Collects "anchor" words that tie a text field to a position on the page.
 *
 * For every text field, the words longer than three characters that
 * `validate(textField, word.text, false)` accepts are gathered. Only a field
 * with exactly one such word can produce an anchor.
 *
 * @param {Array<Object>} textFields - fields; `box` and `path` are read here.
 * @param {Array<Object>} words - recognised words; `text` and `box.x`/`box.y` are read.
 * @param {Function} schemaToPage - maps a field box to a box with `x`/`y` in page coordinates.
 * @returns {Array<Object>} anchors of the form `{acceptedBy, offset: {x, y}, word}`,
 *   where `offset` is the expected field position minus the word position.
 */
findAnchors = function(textFields, words, schemaToPage) {
  var MAX_ANCHOR_DISTANCE = 400;
  var anchors = [];
  var anchorWords = [];

  textFields.forEach(function(textField) {
    var anchorIndex, expectedBox, matches, word;

    matches = words.filter(function(candidate) {
      return candidate.text.length > 3 && validate(textField, candidate.text, false);
    });
    if (matches.length !== 1) {
      return;
    }
    word = matches[0];

    // A word that already anchored an earlier field is ambiguous: drop the
    // earlier anchor (if it is still present) and do not create a new one.
    if (indexOf.call(anchorWords, word) >= 0) {
      for (anchorIndex = 0; anchorIndex < anchors.length; anchorIndex++) {
        if (anchors[anchorIndex].word === word) {
          anchors.splice(anchorIndex, 1);
          break;
        }
      }
      return;
    }

    // Ignore words whose Manhattan distance from the expected position is too large.
    expectedBox = schemaToPage(textField.box);
    if (Math.abs(word.box.x - expectedBox.x) + Math.abs(word.box.y - expectedBox.y) > MAX_ANCHOR_DISTANCE) {
      return;
    }

    anchors.push({
      acceptedBy: textField.path,
      offset: {
        x: expectedBox.x - word.box.x,
        y: expectedBox.y - word.box.y
      },
      word: word
    });
    anchorWords.push(word);
  });

  return anchors;
};
/**
 * Returns up to two words whose top-left corner is nearest to the top-left
 * corner of `pageBox`, nearest first.
 *
 * Distance is the Manhattan distance between the corners; words at a distance
 * of 200 or more are not considered.
 *
 * @param {Object} pageBox - box with `x` and `y`.
 * @param {Array<Object>} words - words with `box.x` and `box.y`.
 * @returns {Array<Object>} zero, one or two of the given word objects.
 */
findTwoClosestWords = function(pageBox, words) {
  var nearby = [];

  words.forEach(function(word) {
    var manhattan = Math.abs(word.box.x - pageBox.x) + Math.abs(word.box.y - pageBox.y);
    if (manhattan < 200) {
      nearby.push({
        distance: manhattan,
        word: word
      });
    }
  });

  nearby.sort(function(first, second) {
    return first.distance - second.distance;
  });

  return nearby.slice(0, 2).map(function(entry) {
    return entry.word;
  });
};
/**
 * Selects the words that belong to the first text line inside `box`.
 *
 * A word is a candidate when its horizontal centre lies strictly between the
 * left and right edges of the box, it overlaps the box vertically, and its
 * text is not one of the single characters 'I', '|', '_' or '—'.
 * Among the candidates, the word whose top is nearest to the top of the box
 * defines the first line; only candidates whose top lies in
 * [firstLine - 10, firstLine + box.height - 5) are returned.
 *
 * @param {Array<Object>} words - words with `text` and `box` ({x, y, width, height}).
 * @param {Object} box - search box ({x, y, width, height}).
 * @returns {Array<Object>} the selected word objects, in input order.
 */
selectWords = function(words, box) {
  var ignoredTexts = ['I', '|', '_', '—'];
  var rightEdge = box.x + box.width;
  var bottomEdge = box.y + box.height;
  var firstLine = void 0;
  var smallestDiff = Infinity;
  var candidates;

  candidates = words.filter(function(word) {
    var centerX = word.box.x + word.box.width / 2;
    return centerX < rightEdge &&
      centerX > box.x &&
      word.box.y < bottomEdge &&
      word.box.y + word.box.height > box.y &&
      ignoredTexts.indexOf(word.text) === -1;
  });

  // The candidate closest to the top edge of the box marks the first line.
  candidates.forEach(function(word) {
    var diff = Math.abs(box.y - word.box.y);
    if (diff < smallestDiff) {
      firstLine = word.box.y;
      smallestDiff = diff;
    }
  });

  return candidates.filter(function(word) {
    var top = word.box.y;
    return firstLine - 10 <= top && top < firstLine + box.height - 5;
  });
};
/**
 * Estimates the width of a single symbol of a word.
 *
 * The estimate is the average width per character plus a correction of
 * 20% of that average divided by the number of characters.
 *
 * @param {Object} word - word with `text` and `box.width`.
 * @returns {number} estimated symbol width.
 */
estimateSymbolWidth = function(word) {
  var symbolCount = word.text.length;
  var average = word.box.width / symbolCount;
  return average + 0.2 * average / symbolCount;
};
/**
 * Joins recognised words into text, one output line per detected text line.
 *
 * Words are ordered by the bottom edge of their boxes; a new line starts
 * whenever the bottom edge moves by more than 15 relative to the previous
 * word. Within a line, words are ordered left to right and joined as follows:
 *   - the first word is emitted as is;
 *   - two neighbouring "fragments" (texts matching the fragment pattern below)
 *     separated by less than the minimum gap are concatenated without a space;
 *   - with `extendedGapDetection`, a gap wider than two estimated symbol
 *     widths is rendered as one space per symbol width that fits in the gap;
 *   - otherwise a single space is inserted.
 *
 * The input array is not modified.
 *
 * @param {Array<Object>} words - words with `text` and `box` ({x, y, width, height}).
 * @param {boolean} [extendedGapDetection=false] - scale the fragment gap
 *   threshold with symbol width and render wide gaps as several spaces.
 * @returns {string} the assembled text; lines are separated by '\n'.
 */
wordsToText = function(words, extendedGapDetection) {
  var fragment, lines, previous, sorted, text;

  if (extendedGapDetection == null) {
    extendedGapDetection = false;
  }

  // Sort a copy so the caller's array keeps its order.
  sorted = words.slice().sort(function(a, b) {
    return a.box.y + a.box.height - b.box.y - b.box.height;
  });

  // The reference word must be taken AFTER sorting. Taking it before the sort
  // compared the topmost word with an arbitrary one and could open an empty
  // first line, which surfaced as a leading '\n' in the result.
  lines = [[]];
  previous = sorted[0];
  sorted.forEach(function(word) {
    if (Math.abs(previous.box.y + previous.box.height - word.box.y - word.box.height) > 15) {
      lines.push([]);
    }
    lines[lines.length - 1].push(word);
    previous = word;
  });

  // Empty text, a single word character or two digits, optionally followed by '/'.
  fragment = /^(|\w|\d\d)\/?$/;
  text = '';

  lines.forEach(function(line, lineIndex) {
    var charWidth, gap, last, minimumGap;

    if (lineIndex !== 0) {
      text += '\n';
    }
    line.sort(function(a, b) {
      return a.box.x - b.box.x;
    });

    gap = 0;
    minimumGap = 50;
    last = null;

    line.forEach(function(word, position) {
      var spaces;
      var isFragment = fragment.test(word.text);

      if (last != null) {
        charWidth = (estimateSymbolWidth(last) + estimateSymbolWidth(word)) / 2;
        gap = word.box.x - (last.box.x + last.box.width);
        if (extendedGapDetection) {
          minimumGap = charWidth * 1.5;
        }
      }

      if (position === 0 || (isFragment && fragment.test(last.text) && gap < minimumGap)) {
        text += word.text;
      } else if (extendedGapDetection && gap > charWidth * 2) {
        spaces = Math.max(1, Math.floor(gap / charWidth));
        // A zero symbol width makes the quotient infinite; fall back to one space.
        if (!isFinite(spaces)) {
          spaces = 1;
        }
        // Build the run of spaces explicitly: slicing a one-character string
        // can never produce more than a single space.
        text += new Array(spaces + 1).join(' ') + word.text;
      } else {
        text += ' ' + word.text;
      }
      last = word;
    });
  });

  return text;
};
/**
 * Finds the anchor whose word lies nearest to `pageBox`.
 *
 * Distance is the Manhattan distance between the top-left corner of the
 * anchor's word box and the top-left corner of `pageBox`. On a tie the
 * earlier anchor wins.
 *
 * @param {Array<Object>} anchors - anchors with `word.box.x` and `word.box.y`.
 * @param {Object} pageBox - box with `x` and `y`.
 * @returns {Object|null} the nearest anchor, or null when none qualifies.
 */
findClosestAnchor = function(anchors, pageBox) {
  var best = null;
  var bestDistance = Infinity;

  anchors.forEach(function(candidate) {
    var origin = candidate.word.box;
    var manhattan = Math.abs(origin.x - pageBox.x) + Math.abs(origin.y - pageBox.y);
    if (manhattan < bestDistance) {
      bestDistance = manhattan;
      best = candidate;
    }
  });

  return best;
};
/**
 * Computes the rounded mean of the `confidence` values of the given words.
 *
 * @param {Array<Object>} words - words with a numeric `confidence`.
 * @returns {number} the mean confidence rounded with Math.round; NaN for an
 *   empty array (0 divided by 0).
 */
wordsToConfidence = function(words) {
  var total = 0;
  var position;

  for (position = 0; position < words.length; position++) {
    total = total + words[position].confidence;
  }
  return Math.round(total / words.length);
};
/**
 * Rates how likely the area under `box` is empty, from the image pixels.
 *
 * The box is clamped to the image, cropped and thresholded at 248; connected
 * components wider than 4 and taller than 6 are treated as content.
 * NOTE(review): `crop`, `threshold` and `connectedComponents` come from the
 * image object passed in; their exact semantics (e.g. what the argument 8
 * means) are not visible here - confirm against the image library.
 *
 * @param {Object} box - area to inspect ({x, y, width, height}).
 * @param {Object} image - image with `width`, `height` and `crop()`.
 * @returns {number} 50 when the clamped area is empty, 100 when no sizeable
 *   component is found, otherwise max(0, round((0.33 - ratio) * 100)) where
 *   `ratio` is the components' bounding-box area over the cropped area.
 */
pixelsToConfidence = function(box, image) {
  var left = Math.max(0, Math.min(image.width - 1, box.x));
  var top = Math.max(0, Math.min(image.height - 1, box.y));
  var region = {
    x: left,
    y: top,
    width: Math.max(0, Math.min(image.width - left, box.width)),
    height: Math.max(0, Math.min(image.height - top, box.height))
  };
  var binarized, blobs, coverage, extent;

  if (region.width === 0 || region.height === 0) {
    return 50;
  }

  binarized = image.crop(region).threshold(248);
  blobs = binarized.connectedComponents(8).filter(function(component) {
    return component.width > 4 && component.height > 6;
  });
  if (blobs.length === 0) {
    return 100;
  }

  extent = boundingBox(blobs);
  coverage = (extent.width * extent.height) / (binarized.width * binarized.height);
  return Math.max(0, Math.round((0.33 - coverage) * 100));
};
/**
 * Produces the candidate values ("variants") for one text field.
 *
 * The field box is mapped to page coordinates and shifted by the offset of
 * the closest anchor, if any. Words are then collected from several search
 * boxes: the field box itself (priority 0) and, for each of the two closest
 * words, boxes anchored at that word with the field's size (priority 2) and
 * with 0.9x / 1.1x of that size (priority 3). Every distinct text accepted by
 * `validate(field, text, true)` becomes a variant. An empty-value variant
 * (priority 1) is added when the pixel-based emptiness confidence exceeds 50
 * or when no other variant was found.
 *
 * @param {Object} field - schema field; `box`, `path`, `extendedGapDetection` are read.
 * @param {Array<Object>} anchors - anchors as produced by findAnchors.
 * @param {Array<Object>} words - all recognised words of the page.
 * @param {Function} schemaToPage - maps a field box to a page box.
 * @param {Object} image - image passed through to pixelsToConfidence.
 * @returns {Array<Object>} variants `{path, confidence, box, text, words, used, priority}`
 *   sorted by validity first (valid before invalid), then by ascending
 *   search-box priority.
 */
findVariants = function(field, anchors, words, schemaToPage, image) {
  var emptyConfidence, emptyIsValid, nearestAnchor, pageBox, searchBoxes, variants;

  pageBox = schemaToPage(field.box);
  nearestAnchor = findClosestAnchor(anchors, pageBox);
  if (nearestAnchor != null) {
    pageBox.x -= nearestAnchor.offset.x;
    pageBox.y -= nearestAnchor.offset.y;
  }

  searchBoxes = [
    {
      x: pageBox.x,
      y: pageBox.y,
      width: pageBox.width,
      height: pageBox.height,
      priority: 0
    }
  ];
  findTwoClosestWords(pageBox, words).forEach(function(nearWord) {
    var origin = nearWord.box;
    searchBoxes.push(
      {
        x: origin.x,
        y: origin.y,
        width: pageBox.width,
        height: pageBox.height,
        priority: 2
      },
      {
        x: origin.x,
        y: origin.y,
        width: pageBox.width * 0.9,
        height: pageBox.height * 0.9,
        priority: 3
      },
      {
        x: origin.x,
        y: origin.y,
        width: pageBox.width * 1.1,
        height: pageBox.height * 1.1,
        priority: 3
      }
    );
  });

  variants = [];
  searchBoxes.forEach(function(searchBox) {
    var alreadySeen, candidateText, candidateWords, isValid;

    candidateWords = selectWords(words, searchBox);
    if (candidateWords.length === 0) {
      return;
    }

    candidateText = wordsToText(candidateWords, field.extendedGapDetection);
    alreadySeen = variants.some(function(known) {
      return known.text === candidateText;
    });
    if (alreadySeen) {
      return;
    }

    isValid = validate(field, candidateText, true);
    if (!isValid) {
      return;
    }

    variants.push({
      path: field.path,
      confidence: wordsToConfidence(candidateWords),
      box: boundingBox(candidateWords.map(function(candidate) {
        return candidate.box;
      })),
      text: candidateText,
      words: candidateWords,
      used: false,
      priority: [isValid, searchBox.priority]
    });
  });

  // Offer "the field is empty" as a variant when the pixels suggest so, or as
  // a last resort when nothing else matched.
  emptyConfidence = pixelsToConfidence(pageBox, image);
  if (emptyConfidence > 50 || variants.length === 0) {
    emptyIsValid = validate(field, '', false);
    variants.push({
      path: field.path,
      confidence: emptyConfidence,
      box: pageBox,
      text: '',
      words: [],
      used: false,
      priority: [emptyIsValid, 1]
    });
  }

  variants.sort(function(a, b) {
    var validityDelta = b.priority[0] - a.priority[0];
    return validityDelta !== 0 ? validityDelta : a.priority[1] - b.priority[1];
  });

  return variants;
};
/**
 * Matches recognised words to the text fields of a form schema and writes the
 * chosen value of every text field into `formData`.
 *
 * Steps:
 *   1. Take the schema fields of type 'text' and order them top to bottom,
 *      left to right (fields less than 20 apart vertically count as one row).
 *   2. Find anchors and collect the candidate variants of every field.
 *   3. When a field has exactly one variant consisting of a single word, drop
 *      the competing variants of other fields that use the same word (a field
 *      always keeps at least one variant).
 *   4. Pick one variant per field, preferring variants whose words have not
 *      been used by an earlier field; `field.fieldSelector`, when present and
 *      more than one variant is available, chooses by index.
 *   5. Store value, confidence, box and the paths of fields sharing words
 *      (`conflicts`) on the object returned by `unpack(formData, field.path)`.
 *
 * @param {Object} formData - target structure, updated in place through `unpack`.
 * @param {Object} formSchema - schema with a `fields` array.
 * @param {Array<Object>} words - recognised words of the page.
 * @param {Function} schemaToPage - maps a field box to page coordinates.
 * @param {Object} rawImage - page image; `toGray()` is called on it.
 * @returns {Object} `{anchors}`, or `{}` when the schema has no text fields.
 * @throws {Error} when `fieldSelector` returns a value that is not an index
 *   of the offered values.
 */
module.exports.matchText = function(formData, formSchema, words, schemaToPage, rawImage) {
  var anchors, image, selectedVariants, textFields, variantsByPath, variantsByWord, wordUsage;

  textFields = formSchema.fields.filter(function(field) {
    return field.type === 'text';
  });
  if (textFields.length === 0) {
    return {};
  }

  // Reading order: rows first, then columns within a row.
  textFields.sort(function(a, b) {
    var deltaY = a.box.y - b.box.y;
    return Math.abs(deltaY) < 20 ? a.box.x - b.box.x : deltaY;
  });

  image = rawImage.toGray();
  anchors = findAnchors(textFields, words, schemaToPage);

  // Index the variants by field path and by the index of every word they use.
  variantsByPath = {};
  variantsByWord = {};
  textFields.forEach(function(field) {
    var variants = findVariants(field, anchors, words, schemaToPage, image);
    variantsByPath[field.path] = variants;
    variants.forEach(function(variant) {
      variant.words.forEach(function(word) {
        var wordIndex = words.indexOf(word);
        if (variantsByWord[wordIndex] == null) {
          variantsByWord[wordIndex] = [];
        }
        variantsByWord[wordIndex].push(variant);
      });
    });
  });

  // A field whose only variant is a single word owns that word: remove the
  // variants of other fields that rely on it.
  textFields.forEach(function(field) {
    var variants = variantsByPath[field.path];
    var wordIndex;

    if (variants.length !== 1 || variants[0].words.length !== 1) {
      return;
    }
    wordIndex = words.indexOf(variants[0].words[0]);
    variantsByWord[wordIndex].forEach(function(variant) {
      var position, rivals;

      if (variant.path === field.path) {
        return;
      }
      rivals = variantsByPath[variant.path];
      position = rivals.indexOf(variant);
      // The variant may already have been removed through another word it
      // uses. Without the position check, splice(-1, 1) would delete the
      // last, unrelated variant of that field.
      if (rivals.length > 1 && position >= 0) {
        rivals.splice(position, 1);
      }
    });
  });

  // Choose one variant per field and record which fields use which words.
  selectedVariants = [];
  wordUsage = [];
  textFields.forEach(function(field) {
    var choice, selectedVariant, values, variants;

    variants = variantsByPath[field.path].filter(function(variant) {
      return !variant.used;
    });
    if (variants.length === 0) {
      // Everything is taken already; fall back to the full list.
      variants = variantsByPath[field.path];
    }

    if (variants.length > 1 && (field.fieldSelector != null)) {
      values = variants.map(function(variant) {
        return variant.text;
      });
      choice = field.fieldSelector(values);
      if (!(choice in values)) {
        throw new Error('Returned choice index out of bounds');
      }
    } else {
      choice = 0;
    }

    selectedVariant = variants[choice];
    selectedVariants.push(selectedVariant);

    selectedVariant.words.forEach(function(word) {
      var wordIndex = words.indexOf(word);
      variantsByWord[wordIndex].forEach(function(wordVariant) {
        wordVariant.used = true;
      });
      if (wordUsage[wordIndex] == null) {
        wordUsage[wordIndex] = [];
      }
      wordUsage[wordIndex].push(field.path);
    });
  });

  // Write the results; `conflicts` lists the other fields that selected a
  // variant sharing at least one word with this field's variant.
  textFields.forEach(function(field, fieldIndex) {
    var selectedVariant = selectedVariants[fieldIndex];
    var conflicts = [];
    var fieldData;

    selectedVariant.words.forEach(function(word) {
      wordUsage[words.indexOf(word)].forEach(function(path) {
        if (path !== field.path && indexOf.call(conflicts, path) < 0) {
          conflicts.push(path);
        }
      });
    });

    fieldData = unpack(formData, field.path);
    fieldData.value = selectedVariant.text;
    fieldData.confidence = selectedVariant.confidence;
    fieldData.box = selectedVariant.box;
    fieldData.conflicts = conflicts;
  });

  return {
    anchors: anchors
  };
};