UNPKG

fv

Version:

FormVision is a node.js library for extracting data from scanned forms

493 lines (474 loc) 16.6 kB
// Generated by CoffeeScript 2.3.1
var boundingBox, distance, estimateSymbolWidth, findAnchors, findClosestAnchor, findTwoClosestWords, findVariants, length, pixelsToConfidence, selectWords, unpack, validate, wordsToConfidence, wordsToText,
  indexOf = [].indexOf;

({unpack, validate} = require('./schema'));

({distance, length, boundingBox} = require('./math'));

/**
 * Find anchor words (unique matches).
 *
 * A text field contributes an anchor when exactly one word longer than three
 * characters passes `validate(textField, word.text, false)` and that word has
 * not been claimed by an earlier field.
 *
 * @param {Array<Object>} textFields - Schema fields; `box` and `path` are read.
 * @param {Array<Object>} words - Recognized words; `text` and `box.x` / `box.y` are read.
 * @param {Function} schemaToPage - Maps a field's `box` to an object with page `x` / `y`.
 * @returns {Array<Object>} Anchors of the form
 *   `{acceptedBy: <field path>, offset: {x, y}, word}` where `offset` is the
 *   mapped field position minus the word position.
 */
findAnchors = function(textFields, words, schemaToPage) {
  const anchors = [];
  // Every word that was ever accepted as an anchor. Entries are never removed,
  // so a word dropped for being claimed twice is also refused for any later field.
  const anchorWords = [];

  for (const textField of textFields) {
    // NOTE(review): the meaning of validate's third argument is defined in ./schema.
    const matches = words.filter((candidate) => {
      return candidate.text.length > 3 && validate(textField, candidate.text, false);
    });
    if (matches.length !== 1) {
      continue;
    }
    const word = matches[0];

    if (anchorWords.indexOf(word) !== -1) {
      // Uniqueness check failed: This is at least the second field subscribing to `word`.
      // Pull the existing one from anchors and skip this one completely.
      const existing = anchors.findIndex((anchor) => anchor.word === word);
      if (existing !== -1) {
        anchors.splice(existing, 1);
      }
      continue;
    }

    // Safeguard: Disregard matches that are too far off
    // (more than 400 units of summed |dx| + |dy| from the mapped field position).
    const pageBox = schemaToPage(textField.box);
    const offAxis = Math.abs(word.box.x - pageBox.x) + Math.abs(word.box.y - pageBox.y);
    if (offAxis > 400) {
      continue;
    }

    anchors.push({
      acceptedBy: textField.path,
      offset: {
        x: pageBox.x - word.box.x,
        y: pageBox.y - word.box.y
      },
      word: word
    });
    anchorWords.push(word);
  }
  return anchors;
};

// Find close words in a box.
/**
 * Return at most two words whose box origin lies closest to the origin of
 * `pageBox`, nearest first. Distance is |dx| + |dy|; words at a distance of
 * 200 or more are ignored.
 *
 * @param {Object} pageBox - Only `x` and `y` are read.
 * @param {Array<Object>} words - Words with `box.x` / `box.y`.
 * @returns {Array<Object>} Zero, one or two of the given word objects.
 */
findTwoClosestWords = function(pageBox, words) {
  const manhattan = (word) => {
    return Math.abs(word.box.x - pageBox.x) + Math.abs(word.box.y - pageBox.y);
  };
  return words
    .map((word) => ({distance: manhattan(word), word}))
    .filter((entry) => entry.distance < 200)
    .sort((first, second) => first.distance - second.distance)
    .slice(0, 2)
    .map((entry) => entry.word);
};

/**
 * Select words in a box.
 *
 * A word qualifies when its horizontal centre lies strictly inside the box,
 * it overlaps the box vertically, and its text is none of the typical
 * 'character garbage' strings. Of the qualifying words only the first line is
 * returned: the line starts at the `y` of the word nearest to `box.y`, and a
 * word belongs to it when its `y` is in [firstLine - 10, firstLine + box.height - 5).
 *
 * @param {Array<Object>} words - Words with `text` and `box` ({x, y, width, height}).
 * @param {Object} box - Search box ({x, y, width, height}).
 * @returns {Array<Object>} Matching words in their original relative order.
 */
selectWords = function(words, box) {
  const rightEdge = box.x + box.width;
  const bottomEdge = box.y + box.height;
  const garbage = ['I', '|', '_', '—'];

  const inside = words.filter((word) => {
    const centerX = word.box.x + word.box.width / 2;
    const horizontallyInside = centerX < rightEdge && centerX > box.x;
    const touchesVertically = word.box.y < bottomEdge && word.box.y + word.box.height > box.y;
    return horizontallyInside && touchesVertically && garbage.indexOf(word.text) === -1;
  });

  // Now decide which words in y direction to take:
  // the first line is the one which is nearest to the specified y.
  let firstLine;
  let smallestDiff = Infinity;
  for (const word of inside) {
    const diff = Math.abs(box.y - word.box.y);
    if (diff < smallestDiff) {
      firstLine = word.box.y;
      smallestDiff = diff;
    }
  }

  const lowerBound = firstLine - 10;
  const upperBound = firstLine + box.height - 5;
  return inside.filter((word) => lowerBound <= word.box.y && word.box.y < upperBound);
};

// Estimate width of symbols from a word.
/**
 * Estimate width of symbols from a word.
 *
 * @param {Object} word - Word with `text` and `box.width`.
 * @returns {number} Box width divided by the character count, plus a small
 *   correction that shrinks as the word gets longer.
 */
estimateSymbolWidth = function(word) {
  const symbolCount = word.text.length;
  const charWidth = word.box.width / symbolCount;
  // Compensate for padding of outermost characters
  return charWidth + 0.2 * charWidth / symbolCount;
};

/**
 * Convert words in random order to a single block of text.
 *
 * Words are grouped into lines by their bottom edge (a jump of more than 15
 * starts a new line), each line is ordered left to right, and lines are joined
 * with '\n'. Adjacent fragments (empty, one word character or two digits,
 * optionally followed by '/') closer than the minimum gap are glued together
 * without a space to repair words split into characters, e.g. 'J 1 2/ 34 5'.
 *
 * Side effect: `words` is sorted in place by bottom edge.
 *
 * @param {Array<Object>} words - Words with `text` and `box` ({x, y, width, height}).
 * @param {boolean} [extendedGapDetection=false] - When true the minimum gap is
 *   1.5 estimated character widths instead of 50, and gaps wider than two
 *   character widths are rendered as two or three spaces.
 * @returns {string} The reconstructed text; '' for an empty word list.
 */
wordsToText = function(words, extendedGapDetection = false) {
  // Extract lines from Y difference peaks.
  words.sort((a, b) => a.box.y + a.box.height - b.box.y - b.box.height);
  const lines = [[]];
  // Take the reference word AFTER sorting: using the first word of the unsorted
  // input opened a spurious empty first line whenever that word was not on the
  // top line, which made the result start with '\n'.
  let previous = words[0];
  for (const word of words) {
    const jump = Math.abs(previous.box.y + previous.box.height - word.box.y - word.box.height);
    if (jump > 15) {
      lines.push([]);
    }
    lines[lines.length - 1].push(word);
    previous = word;
  }

  // Make an attempt to repair words split into characters, e.g. 'J 1 2/ 34 5'.
  const fragment = /^(|\w|\d\d)\/?$/;

  const renderLine = (line) => {
    line.sort((a, b) => a.box.x - b.box.x);
    let lineText = '';
    let lastWord = null;
    for (const word of line) {
      if (lastWord === null) {
        lineText += word.text;
      } else {
        const charWidth = (estimateSymbolWidth(lastWord) + estimateSymbolWidth(word)) / 2;
        const gap = word.box.x - (lastWord.box.x + lastWord.box.width);
        const minimumGap = extendedGapDetection ? charWidth * 1.5 : 50;
        if (fragment.test(word.text) && fragment.test(lastWord.text) && gap < minimumGap) {
          lineText += word.text;
        } else if (extendedGapDetection && gap > charWidth * 2) {
          // Insert up to three spaces depending on gap. The previous
          // `' '.slice(0, spaces)` could never produce more than one.
          const spaces = Math.min(3, Math.max(1, Math.floor(gap / charWidth)));
          lineText += ' '.repeat(spaces) + word.text;
        } else {
          lineText += ' ' + word.text;
        }
      }
      lastWord = word;
    }
    return lineText;
  };

  // Put lines in reading order and join them.
  return lines.map(renderLine).join('\n');
};

// Find closest anchor.
/**
 * Find closest anchor.
 *
 * @param {Array<Object>} anchors - Anchors with `word.box.x` / `word.box.y`.
 * @param {Object} pageBox - Only `x` and `y` are read.
 * @returns {?Object} The anchor with the smallest |dx| + |dy| to `pageBox`
 *   (the first one on ties), or null when `anchors` is empty.
 */
findClosestAnchor = function(anchors, pageBox) {
  let closest = null;
  let minDistance = Infinity;
  for (const anchor of anchors) {
    const dist = Math.abs(anchor.word.box.x - pageBox.x) + Math.abs(anchor.word.box.y - pageBox.y);
    if (dist < minDistance) {
      minDistance = dist;
      closest = anchor;
    }
  }
  return closest;
};

/**
 * Compute confidence from words.
 *
 * @param {Array<Object>} words - Words with a numeric `confidence`.
 * @returns {number} Rounded mean of the word confidences; 0 for an empty list
 *   (previously NaN because of a division by zero).
 */
wordsToConfidence = function(words) {
  if (words.length === 0) {
    return 0;
  }
  const total = words.reduce((sum, word) => sum + word.confidence, 0);
  return Math.round(total / words.length);
};

/**
 * Compute confidence from pixels inside box, i.e. how likely the box is empty.
 *
 * @param {Object} box - Region to inspect ({x, y, width, height}).
 * @param {Object} image - Must provide `width`, `height` and `crop(box)`; the
 *   cropped image must provide `threshold(level)`, and that result `width`,
 *   `height` and `connectedComponents(n)` returning boxes with `width` / `height`.
 * @returns {number} 50 when the sanitized box has no area, 100 when no blob
 *   larger than 4 x 6 is found, otherwise `round((0.33 - ratio) * 100)` clamped
 *   at 0, where `ratio` is the blobs' bounding-box area over the crop area.
 */
pixelsToConfidence = function(box, image) {
  // Sanitize box to image.
  // NOTE(review): the origin is clamped but width/height are not reduced by the
  // clamped amount, so a box starting left of / above the image is shifted
  // rather than clipped — confirm this is intended.
  const x = Math.max(0, Math.min(image.width - 1, box.x));
  const y = Math.max(0, Math.min(image.height - 1, box.y));
  const cropBox = {
    x: x,
    y: y,
    width: Math.max(0, Math.min(image.width - x, box.width)),
    height: Math.max(0, Math.min(image.height - y, box.height))
  };
  if (cropBox.width === 0 || cropBox.height === 0) {
    return 50;
  }

  // Search for pixel blobs and compute confidence.
  const blobImage = image.crop(cropBox).threshold(248);
  const blobs = blobImage.connectedComponents(8).filter((blob) => blob.width > 4 && blob.height > 6);
  if (blobs.length === 0) {
    return 100;
  }
  const blobBox = boundingBox(blobs);
  const blobRatio = (blobBox.width * blobBox.height) / (blobImage.width * blobImage.height);
  return Math.max(0, Math.round((0.33 - blobRatio) * 100));
};

/**
 * Find text variants and assign a priority:
 * valid content > valid epsilon > positional content > positional epsilon
 *
 * The field's page box is shifted by the offset of the closest anchor (if any).
 * Search boxes are then generated at that position (priority 0) and at the two
 * closest words (priority 2 at original size, priority 3 at 90% and 110% size).
 * Each distinct, valid text found in a search box becomes a variant. An empty
 * ('epsilon') variant with priority 1 is appended when the pixel confidence
 * exceeds 50 or nothing else was found.
 *
 * @param {Object} field - Schema field; `box`, `path` and `extendedGapDetection` are read.
 * @param {Array<Object>} anchors - Anchors as produced by findAnchors.
 * @param {Array<Object>} words - Recognized words.
 * @param {Function} schemaToPage - Maps `field.box` to a page box; the returned
 *   object is mutated by the anchor correction.
 * @param {Object} image - Image handed to pixelsToConfidence.
 * @returns {Array<Object>} Variants
 *   `{path, confidence, box, text, words, used, priority: [isValid, rank]}`
 *   sorted by validity (valid first), then ascending rank.
 */
findVariants = function(field, anchors, words, schemaToPage, image) {
  const pageBox = schemaToPage(field.box);
  const closestAnchor = findClosestAnchor(anchors, pageBox);
  if (closestAnchor != null) {
    pageBox.x -= closestAnchor.offset.x;
    pageBox.y -= closestAnchor.offset.y;
  }

  // Generate search boxes.
  const searchBoxes = [
    {
      x: pageBox.x,
      y: pageBox.y,
      width: pageBox.width,
      height: pageBox.height,
      priority: 0
    }
  ];
  const scaledBoxes = [
    {scale: 1, priority: 2},
    {scale: 0.9, priority: 3},
    {scale: 1.1, priority: 3}
  ];
  for (const closeWord of findTwoClosestWords(pageBox, words)) {
    for (const {scale, priority} of scaledBoxes) {
      searchBoxes.push({
        x: closeWord.box.x,
        y: closeWord.box.y,
        // The unscaled box reuses the page dimensions untouched.
        width: scale === 1 ? pageBox.width : pageBox.width * scale,
        height: scale === 1 ? pageBox.height : pageBox.height * scale,
        priority: priority
      });
    }
  }

  // Map words to variants using search boxes.
  const variants = [];
  for (const searchBox of searchBoxes) {
    const candidateWords = selectWords(words, searchBox);
    if (candidateWords.length === 0) {
      continue;
    }
    const candidateText = wordsToText(candidateWords, field.extendedGapDetection);
    if (variants.some((variant) => variant.text === candidateText)) {
      continue;
    }
    // NOTE(review): the meaning of validate's third argument is defined in ./schema.
    const isValid = validate(field, candidateText, true);
    if (!isValid) {
      continue;
    }
    variants.push({
      path: field.path,
      confidence: wordsToConfidence(candidateWords),
      box: boundingBox(candidateWords.map((candidate) => candidate.box)),
      text: candidateText,
      words: candidateWords,
      used: false,
      priority: [isValid, searchBox.priority]
    });
  }

  // Insert epsilon variant when confident or nothing else was found.
  const epsilonConfidence = pixelsToConfidence(pageBox, image);
  if (epsilonConfidence > 50 || variants.length === 0) {
    variants.push({
      path: field.path,
      confidence: epsilonConfidence,
      box: pageBox,
      text: '',
      words: [],
      used: false,
      priority: [validate(field, '', false), 1]
    });
  }

  variants.sort((a, b) => {
    const deltaValid = b.priority[0] - a.priority[0];
    return deltaValid === 0 ? a.priority[1] - b.priority[1] : deltaValid;
  });
  return variants;
};

// Match text to form schema.
// This process is content- and location-sensitive.
/**
 * Match text to form schema.
 * This process is content- and location-sensitive.
 *
 * For every field of type 'text' a list of text variants is collected, simple
 * conflicts are resolved, one variant is chosen per field (via the field's
 * `fieldSelector` when present and more than one variant is available), and
 * the result is written into `formData`.
 *
 * @param {Object} formData - Target; for each text field the object returned by
 *   `unpack(formData, field.path)` receives `value`, `confidence`, `box` and
 *   `conflicts` (paths of other fields whose chosen variant shares a word).
 * @param {Object} formSchema - `fields` is read; text fields need `box` and `path`.
 * @param {Array<Object>} words - Recognized words.
 * @param {Function} schemaToPage - Maps a schema box to a page box.
 * @param {Object} rawImage - Must provide `toGray()`.
 * @returns {Object} `{anchors}`, or `{}` when the schema has no text fields.
 * @throws {Error} When a `fieldSelector` returns a value that is not an index
 *   of the offered values.
 */
module.exports.matchText = function(formData, formSchema, words, schemaToPage, rawImage) {
  const textFields = formSchema.fields.filter((field) => field.type === 'text');
  if (textFields.length === 0) {
    return {};
  }
  // Reading order: fields whose y differs by less than 20 count as one row.
  textFields.sort((a, b) => {
    const deltaY = a.box.y - b.box.y;
    return Math.abs(deltaY) < 20 ? a.box.x - b.box.x : deltaY;
  });
  const image = rawImage.toGray();

  // Find anchors to compensate for *very* inaccurate printing.
  const anchors = findAnchors(textFields, words, schemaToPage);

  // Map words to variants.
  const variantsByPath = {};
  const variantsByWord = {};
  for (const field of textFields) {
    const variants = findVariants(field, anchors, words, schemaToPage, image);
    variantsByPath[field.path] = variants;
    for (const variant of variants) {
      for (const word of variant.words) {
        const wordIndex = words.indexOf(word);
        if (variantsByWord[wordIndex] == null) {
          variantsByWord[wordIndex] = [];
        }
        variantsByWord[wordIndex].push(variant);
      }
    }
  }

  // Solve simple conflicts automatically (before passing to fieldSelector).
  // For the case that a field has only a single one-word variant available, remove that option from
  // everything else (except if this would in turn bring those fields' variants to zero).
  for (const field of textFields) {
    const ownVariants = variantsByPath[field.path];
    if (ownVariants.length !== 1 || ownVariants[0].words.length !== 1) {
      continue;
    }
    const wordIndex = words.indexOf(ownVariants[0].words[0]);
    for (const variant of variantsByWord[wordIndex]) {
      if (variant.path === field.path) {
        continue;
      }
      const conflictingVariants = variantsByPath[variant.path];
      // variantsByWord is not updated on removal, so a multi-word variant can be
      // seen again after an earlier field already removed it. Without this
      // check splice(-1, 1) would delete the last, unrelated variant.
      const position = conflictingVariants.indexOf(variant);
      if (position !== -1 && conflictingVariants.length > 1) {
        conflictingVariants.splice(position, 1);
      }
    }
  }

  // Reduce to fields.
  const selectedVariants = [];
  const wordUsage = [];
  for (const field of textFields) {
    let variants = variantsByPath[field.path].filter((variant) => !variant.used);
    // All variants are marked as used... TODO: Assign epsilon instead?
    if (variants.length === 0) {
      variants = variantsByPath[field.path];
    }

    // Choose variant.
    let choice = 0;
    if (variants.length > 1 && field.fieldSelector != null) {
      const values = variants.map((variant) => variant.text);
      choice = field.fieldSelector(values);
      if (!(choice in values)) {
        throw new Error('Returned choice index out of bounds');
      }
    }
    const selectedVariant = variants[choice];
    selectedVariants.push(selectedVariant);

    for (const word of selectedVariant.words) {
      const wordIndex = words.indexOf(word);
      // Mark conflicting variants as used.
      for (const wordVariant of variantsByWord[wordIndex]) {
        wordVariant.used = true;
      }
      // Mark word as used.
      if (wordUsage[wordIndex] == null) {
        wordUsage[wordIndex] = [];
      }
      wordUsage[wordIndex].push(field.path);
    }
  }

  textFields.forEach((field, fieldIndex) => {
    const selectedVariant = selectedVariants[fieldIndex];

    // Compute conflicts: every other field that uses one of this variant's words.
    const conflicts = [];
    for (const word of selectedVariant.words) {
      for (const path of wordUsage[words.indexOf(word)]) {
        if (path !== field.path && conflicts.indexOf(path) === -1) {
          conflicts.push(path);
        }
      }
    }

    // Assign variant to field.
    const fieldData = unpack(formData, field.path);
    fieldData.value = selectedVariant.text;
    fieldData.confidence = selectedVariant.confidence;
    fieldData.box = selectedVariant.box;
    fieldData.conflicts = conflicts;
  });

  return {anchors};
};