nfv
Version:
[Updated to use the ndv package instead of dv] FormVision is a node.js library for extracting data from scanned forms
161 lines (151 loc) • 5.71 kB
JavaScript
// Generated by CoffeeScript 1.12.7
var boundingBox, boxDistanceVector, cloneUsingRegion, detectCandidates, detectLineMask, distanceVector, dv, findWords, intersectBox, isSameBlock, length, mergeRegions, ref;
dv = require('ndv');
ref = require('./math'), length = ref.length, distanceVector = ref.distanceVector, boxDistanceVector = ref.boxDistanceVector, intersectBox = ref.intersectBox, boundingBox = ref.boundingBox;
/**
 * Builds a mask image on which the long line segments of `image` are drawn.
 *
 * Segments come from `image.toGray().lineSegments(0, 0, false)`; only those
 * whose `length(distanceVector(p1, p2))` is at least `minLineLength` are kept.
 *
 * @param {Object} image - Image exposing `width`, `height` and `toGray()`.
 * @param {number} minLineLength - Minimum segment length required to be drawn.
 * @returns {dv.Image} A new 8-bit image of the same size with the kept
 *   segments drawn on it.
 */
detectLineMask = function(image, minLineLength) {
  var mask = new dv.Image(image.width, image.height, 8);
  var isLongEnough = function(segment) {
    return length(distanceVector(segment.p1, segment.p2)) >= minLineLength;
  };
  var segments = image.toGray().lineSegments(0, 0, false).filter(isLongEnough);

  segments.forEach(function(segment) {
    // NOTE(review): 7 and 'set' are presumably stroke width and draw mode;
    // confirm against the ndv drawLine documentation.
    mask.drawLine(segment.p1, segment.p2, 7, 'set');
  });
  return mask;
};
/**
 * Groups items into regions. Two items end up in the same region when they
 * are linked by `predicate`, either directly or through a chain of other
 * items.
 *
 * Implemented as a single-pass union-find: every pair is examined at most
 * once, and `predicate` is skipped for pairs already known to share a region.
 *
 * @param {Array} items - Items to group.
 * @param {function(*, *): boolean} predicate - Called as `predicate(a, b)`
 *   where `a` comes before `b` in `items`; a truthy result links the two.
 *   It should be free of side effects, since each pair is evaluated at most
 *   once.
 * @returns {number[]} One region label per item, in the order of `items`.
 *   A region's label is the smallest index among its members.
 */
mergeRegions = function(items, predicate) {
  var count = items.length;
  var regions = [];
  var i, j, rootI, rootJ;

  // Walks up to the representative of `index` and flattens the visited path.
  var findRoot = function(index) {
    var root = index;
    var next;
    while (regions[root] !== root) {
      root = regions[root];
    }
    while (regions[index] !== root) {
      next = regions[index];
      regions[index] = root;
      index = next;
    }
    return root;
  };

  // Every item starts out as its own region.
  for (i = 0; i < count; i++) {
    regions.push(i);
  }

  for (i = 0; i < count; i++) {
    for (j = i + 1; j < count; j++) {
      rootI = findRoot(i);
      rootJ = findRoot(j);
      if (rootI !== rootJ && predicate(items[i], items[j])) {
        // The smaller index always becomes the representative, so the final
        // label of a region is its smallest member index.
        if (rootI < rootJ) {
          regions[rootJ] = rootI;
        } else {
          regions[rootI] = rootJ;
        }
      }
    }
  }

  // Resolve every entry to its final representative (stays within bounds).
  for (i = 0; i < count; i++) {
    regions[i] = findRoot(i);
  }
  return regions;
};
/**
 * Creates a predicate that tells whether two boxes belong to the same block.
 *
 * Two boxes match when their bottom edges differ by less than half of
 * `fontHeight` and the `x` component of `boxDistanceVector(boxA, boxB)` is
 * below three times `fontWidth`. Otherwise the answer is whatever
 * `intersectBox(boxA, boxB)` returns.
 *
 * @param {number} fontWidth - Font width used for the horizontal threshold.
 * @param {number} fontHeight - Font height used for the vertical threshold.
 * @returns {function(Object, Object): *} Predicate over boxes that expose
 *   `y` and `height`.
 */
isSameBlock = function(fontWidth, fontHeight) {
  return function(boxA, boxB) {
    var bottomDifference = Math.abs((boxA.y + boxA.height) - (boxB.y + boxB.height));
    var gap = boxDistanceVector(boxA, boxB);

    if (bottomDifference < fontHeight / 2 && gap.x < fontWidth * 3) {
      return true;
    }
    // Only consulted when the boxes are not on the same line.
    return intersectBox(boxA, boxB);
  };
};
/**
 * Finds groups of boxes that are candidates for containing text.
 *
 * Steps: dilate the image, take its connected components, keep the boxes that
 * pass the letter-size check, label them with `mergeRegions` using the
 * `isSameBlock` predicate, and collect the boxes that share a label.
 *
 * @param {Object} binarizedImage - Image exposing `dilate(width, height)`.
 * @param {number} [fontWidth=20] - Font width; used when null/undefined.
 * @param {number} [fontHeight=30] - Font height; used when null/undefined.
 * @returns {Array<Array<Object>>} One array of boxes per region, in the key
 *   order of a plain object indexed by region label.
 */
detectCandidates = function(binarizedImage, fontWidth, fontHeight) {
  var charWidth = fontWidth == null ? 20 : fontWidth;
  var charHeight = fontHeight == null ? 30 : fontHeight;

  // A box qualifies when it is wider than half a character and its height
  // lies strictly between half a character and six characters.
  var isLetterSized = function(box) {
    var boxHeight = box.height;
    if (!(charWidth / 2 < box.width)) {
      return false;
    }
    return charHeight / 2 < boxHeight && boxHeight < charHeight * 6;
  };

  // The remainder term adds 1 to the smear size when the font size is odd.
  var smearWidth = (1 * charWidth) + charWidth % 2;
  var smearHeight = (0.25 * charHeight) + charHeight % 2;

  var letterBoxes = binarizedImage
    .dilate(smearWidth, smearHeight)
    .connectedComponents(8)
    .filter(isLetterSized);
  var labels = mergeRegions(letterBoxes, isSameBlock(charWidth, charHeight));

  var grouped = {};
  labels.forEach(function(label, boxIndex) {
    if (grouped[label] == null) {
      grouped[label] = [];
    }
    grouped[label].push(letterBoxes[boxIndex]);
  });

  return Object.keys(grouped).map(function(label) {
    return grouped[label];
  });
};
/**
 * Copies the given boxes of `image` into a new image sized to their bounding
 * box.
 *
 * The new image has the width and height of `boundingBox(boxes)` and the
 * depth of `image`. `clearBox` is called over its full area, then each box is
 * cropped from `image` (25 units wider than the box) and drawn at its
 * position relative to the bounding box.
 *
 * @param {Object} image - Source image exposing `depth` and `crop()`.
 * @param {Array<Object>} boxes - Boxes with `x`, `y`, `width` and `height`.
 * @returns {Array} A pair `[cloneImage, cloneBox]`: the new image and the
 *   bounding box it was built from.
 */
cloneUsingRegion = function(image, boxes) {
  // Extra width added to every crop and to its destination rectangle.
  var EXTRA_WIDTH = 25;
  var bounds = boundingBox(boxes);
  var canvas = new dv.Image(bounds.width, bounds.height, image.depth);

  canvas.clearBox({ x: 0, y: 0, width: bounds.width, height: bounds.height });

  boxes.forEach(function(box) {
    var widenedWidth = box.width + EXTRA_WIDTH;
    var patch = image.crop(box.x, box.y, widenedWidth, box.height);
    var target = {
      x: box.x - bounds.x,
      y: box.y - bounds.y,
      width: widenedWidth,
      height: box.height
    };
    canvas.drawImage(patch, target);
  });

  return [canvas, bounds];
};
/**
 * Runs `tesseract.findWords()` on every candidate region and gathers the
 * results.
 *
 * For each candidate, the region is cloned out of `image` with
 * `cloneUsingRegion`, assigned to `tesseract.image`, and
 * `tesseract.pageSegMode` is set to 'single_line' when the region is less
 * than 60 high, otherwise 'single_block'. Each returned word has its box
 * shifted by the region's offset and gets a `candidate` property holding a
 * copy of the candidate's box list. Words whose box is not more than 5 wide
 * and more than 5 high are dropped.
 *
 * Side effects: overwrites `tesseract.image` and `tesseract.pageSegMode`, and
 * mutates the word objects returned by tesseract.
 *
 * @param {Array<Array<Object>>} candidates - Groups of boxes, one per region.
 * @param {Object} image - Image the regions are cloned from.
 * @param {Object} tesseract - Object exposing `image`, `pageSegMode` and
 *   `findWords()`.
 * @returns {Array<Object>} The remaining words, in candidate order.
 */
findWords = function(candidates, image, tesseract) {
  var isBigEnough = function(word) {
    return word.box.width > 5 && word.box.height > 5;
  };

  var allWords = candidates.reduce(function(collected, candidateBoxes) {
    var cloned = cloneUsingRegion(image, candidateBoxes);
    var regionImage = cloned[0];
    var regionBox = cloned[1];
    var regionWords;

    tesseract.image = regionImage;
    if (regionBox.height < 60) {
      tesseract.pageSegMode = 'single_line';
    } else {
      tesseract.pageSegMode = 'single_block';
    }

    regionWords = tesseract.findWords();
    regionWords.forEach(function(word) {
      // Shift the box by the region's offset within the source image.
      word.box.x += regionBox.x;
      word.box.y += regionBox.y;
      word.candidate = candidateBoxes.slice(0);
    });
    return collected.concat(regionWords);
  }, []);

  return allWords.filter(isBigEnough);
};
/**
 * Finds the words in `image` and returns them together with a copy of the
 * image on which `clearBox` was called for the trusted words.
 *
 * Steps: build a line mask with `detectLineMask(image, 45)`, add it to the
 * gray image, threshold the result, detect candidate regions, then run
 * `findWords` on those candidates against the original `image`.
 *
 * A word's box is cleared on the copy when its text has at least 6
 * characters, or at least 3 characters with a confidence of 30 or more.
 *
 * Side effects: `tesseract.image` is overwritten here and again by
 * `findWords`.
 *
 * @param {Object} image - Source image.
 * @param {Object} tesseract - Tesseract instance passed to `findWords`.
 * @returns {Array} A pair `[words, clearedImage]`.
 */
module.exports.findText = function(image, tesseract) {
  var MIN_LINE_LENGTH = 45;
  var clearedImage = new dv.Image(image);
  var lineMask = detectLineMask(image, MIN_LINE_LENGTH);
  var textImage = image.toGray().add(lineMask.toGray());
  var binarized, words;

  tesseract.image = textImage;
  binarized = textImage.otsuAdaptiveThreshold(128, 128, 0, 0, 0).image;
  words = findWords(detectCandidates(binarized), image, tesseract);

  words.forEach(function(word) {
    var textLength = word.text.length;
    var isTrusted = textLength >= 6 || (textLength >= 3 && word.confidence >= 30);
    if (isTrusted) {
      clearedImage.clearBox(word.box);
    }
  });

  return [words, clearedImage];
};