// fv
// Version:
// FormVision is a Node.js library for extracting data from scanned forms.
// 172 lines (162 loc) • 6.21 kB
// JavaScript
// Generated by CoffeeScript 2.3.1
var boundingBox, boxDistanceVector, cloneUsingRegion, detectCandidates, detectLineMask, distanceVector, dv, findWords, intersectBox, isSameBlock, length, mergeRegions;
dv = require('dv');
({length, distanceVector, boxDistanceVector, intersectBox, boundingBox} = require('./math'));
// Compiles a mask with lines that have a certain length.
detectLineMask = function(image, minLineLength) {
var k, len, line, lineMask, longLines;
lineMask = new dv.Image(image.width, image.height, 8);
longLines = image.toGray().lineSegments(0, 0, false).filter(function(line) {
return length(distanceVector(line.p1, line.p2)) >= minLineLength;
});
for (k = 0, len = longLines.length; k < len; k++) {
line = longLines[k];
lineMask.drawLine(line.p1, line.p2, 7, 'set');
}
return lineMask;
};
mergeRegions = function(items, predicate) {
var done, i, item, j, jj, k, l, len, len1, m, otherItem, ref, ref1, ref2, region, regions;
// Initialize regions with unique indices.
regions = (function() {
var results = [];
for (var k = 0, ref = items.length; 0 <= ref ? k < ref : k > ref; 0 <= ref ? k++ : k--){ results.push(k); }
return results;
}).apply(this);
// Merge regions until predicate can no longer be applied.
done = false;
while (!done) {
done = true;
// Merge regions (non-transitive).
for (i = k = 0, len = items.length; k < len; i = ++k) {
item = items[i];
ref1 = items.slice(i + 1);
for (j = l = 0, len1 = ref1.length; l < len1; j = ++l) {
otherItem = ref1[j];
jj = j + i + 1;
if (regions[jj] !== regions[i] && predicate(item, otherItem)) {
region = Math.min(regions[jj], regions[i]);
regions[i] = regions[jj] = region;
done = false;
}
}
}
// Propagate merges (transitive).
for (i = m = 0, ref2 = regions.length; (0 <= ref2 ? m <= ref2 : m >= ref2); i = 0 <= ref2 ? ++m : --m) {
while (regions[regions[i]] !== regions[i]) {
regions[i] = regions[regions[i]];
}
}
}
return regions;
};
isSameBlock = function(fontWidth, fontHeight) {
return function(boxA, boxB) {
var bottomA, bottomB, delta, sameLine;
bottomA = boxA.y + boxA.height;
bottomB = boxB.y + boxB.height;
delta = boxDistanceVector(boxA, boxB);
sameLine = Math.abs(bottomA - bottomB) < fontHeight / 2 && delta.x < fontWidth * 3;
return sameLine || intersectBox(boxA, boxB);
};
};
detectCandidates = function(binarizedImage, fontWidth = 20, fontHeight = 30) {
var _, boxIndex, boxes, boxesByRegion, candidates, hasLetterSize, k, len, region, regions, smearHeight, smearWidth;
hasLetterSize = function(box) {
var ref;
return fontWidth / 2 < box.width && (fontHeight / 2 < (ref = box.height) && ref < fontHeight * 6);
};
// Smear text a bit to extract letter boxes.
smearWidth = (1 * fontWidth) + fontWidth % 2;
smearHeight = (0.25 * fontHeight) + fontHeight % 2;
boxes = binarizedImage.dilate(smearWidth, smearHeight).connectedComponents(8).filter(hasLetterSize);
// Merge letter boxes to text regions.
regions = mergeRegions(boxes, isSameBlock(fontWidth, fontHeight));
boxesByRegion = {};
for (boxIndex = k = 0, len = regions.length; k < len; boxIndex = ++k) {
region = regions[boxIndex];
if (boxesByRegion[region] == null) {
boxesByRegion[region] = [];
}
boxesByRegion[region].push(boxes[boxIndex]);
}
candidates = (function() {
var results;
results = [];
for (_ in boxesByRegion) {
boxes = boxesByRegion[_];
results.push(boxes);
}
return results;
})();
return candidates;
};
// Clone area of an image from boxes
cloneUsingRegion = function(image, boxes) {
var box, cloneBox, cloneImage, k, len;
cloneBox = boundingBox(boxes);
cloneImage = new dv.Image(cloneBox.width, cloneBox.height, image.depth);
cloneImage.clearBox({
x: 0,
y: 0,
width: cloneBox.width,
height: cloneBox.height
});
for (k = 0, len = boxes.length; k < len; k++) {
box = boxes[k];
cloneImage.drawImage(image.crop(box.x, box.y, box.width + 25, box.height), {
x: box.x - cloneBox.x,
y: box.y - cloneBox.y,
width: box.width + 25,
height: box.height
});
}
return [cloneImage, cloneBox];
};
findWords = function(candidates, image, tesseract) {
var candidateBoxes, cloneBox, cloneImage, k, l, len, len1, localWords, word, words;
words = [];
for (k = 0, len = candidates.length; k < len; k++) {
candidateBoxes = candidates[k];
// Crop and recognize.
[cloneImage, cloneBox] = cloneUsingRegion(image, candidateBoxes);
tesseract.image = cloneImage;
tesseract.pageSegMode = cloneBox.height < 60 ? 'single_line' : 'single_block';
localWords = tesseract.findWords();
for (l = 0, len1 = localWords.length; l < len1; l++) {
word = localWords[l];
// Transform back.
word.box.x += cloneBox.x;
word.box.y += cloneBox.y;
// Store candidate.
word.candidate = candidateBoxes.slice(0);
}
words = words.concat(localWords);
}
// Filter words with tiny boxes.
words = words.filter(function(word) {
return word.box.width > 5 && word.box.height > 5;
});
return words;
};
// Use given *Tesseract* instance to find all text grouped as words along with
// confidence and boxes.
module.exports.findText = function(image, tesseract) {
var candidates, clearedImage, k, len, lineMask, textImage, word, words;
clearedImage = new dv.Image(image);
// Remove long lines.
lineMask = detectLineMask(image, 45);
textImage = image.toGray().add(lineMask.toGray());
// Find words using a simple Otsu thresholding.
tesseract.image = textImage;
candidates = detectCandidates(textImage.otsuAdaptiveThreshold(128, 128, 0, 0, 0).image);
words = findWords(candidates, image, tesseract);
// Remove words from image, but safeguard against removing 'noise' that may be a checkmark.
for (k = 0, len = words.length; k < len; k++) {
word = words[k];
if (word.text.length >= 6 || (word.text.length >= 3 && word.confidence >= 30)) {
clearedImage.clearBox(word.box);
}
}
return [words, clearedImage];
};