smart-docs-parser
Version:
Document Details Parsing using OCR
331 lines • 16.2 kB
JavaScript
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
var lodash_1 = __importDefault(require("lodash"));
var moment = __importStar(require("moment"));
var constants_1 = __importDefault(require("../constants"));
// TODO update regex rules
var AADHAAR_REGEX = {
title: /Unique|UNIQUE|Identification|IDENTIFICATION|Enrollment|ENROLLMENT/,
govt: /Government|GOVERNMENT|India|INDIA/,
dob_heading: /Dob|DOB|Year|YEAR|Birth|BIRTH|(\d\d\/\d\d\/\d+)/,
relative_name_heading: /Father|FATHER|Mother|MOTHER|Husband|HUSBAND|Wife|WIFE/,
date_format: /(\d{2}\/\d{2}\/\d{4})/,
gender: /Male|MALE|Female|FEMALE/,
document: /Aadhaar|AADHAAR/,
number_format: /[\d\s]{12,}/,
name_format: /^[a-zA-Z\s\.]+$/,
address_start: /([Ss]\/[Oo])|([Ww]\/[Oo])|([Dd]\/[Oo])|([Cc]\/[Oo])|(Address[:\s]*)|(ADDRESS[:\s]*)/,
address_head: /Address[:\s]*|ADDRESS[:\s]*/,
address_start_split: /,/,
noise: /(^[\s]+$)|(^[A-Z]{0,2}[.,]+$)|(^[A-Z0-9]{2,}[a-z]+)|(^[A-Z0-9]+[.,]+[A-Z0-9]+)|(^[0-9]+[a-zA-Z]{3,})|(^www\.\w+)|(\.gov\.in*$)/,
address_end: /([A-Z\s]+[a-z]*[,;:._\s-]+[0-9]{6}\.?$)|(^[0-9]{6}\.?$)/,
fathers_name_split: /([Ss]\/[Oo])[\s:]+|([Dd]\/[Oo])[\s:]+|([Cc]\/[Oo])[\s:]+|([Ww]\/[Oo])[\s:]+/,
english: /(^[\w,.:;&*\/|)('"#+^`-]*$)/,
local_language_reverse_prefix: /[^\w\s,.:;&*\/|)('"#+^`-]+.*/,
unwanted_prefix_suffix: /(^[\s.,-]+)|([\s.,-]+$)/g
};
var LINE_MIN_SIZE = 4;
var AadhaarParser = {};
// ******************************************************* //
// Logic for internal functions starts here //
// ******************************************************* //
var filterNoiseFromLine = function (lineText) {
var spaceSplit = lineText.split(/\s/);
var filteredSpacedList = lodash_1.default.filter(spaceSplit, function (word) {
if (AADHAAR_REGEX["gender"].exec(word) ||
AADHAAR_REGEX["date_format"].exec(word)) {
return true;
}
if (AADHAAR_REGEX["noise"].exec(word)) {
return false;
}
if (!AADHAAR_REGEX["english"].exec(word)) {
return false;
}
return true;
});
return lodash_1.default.join(filteredSpacedList, " ");
};
var removeNoiseFromText = function (lines) {
var filteredLines = [];
lodash_1.default.forEach(lines, function (line) {
var reverseLine = line
.split("")
.reverse()
.join("");
var reverseLineWithoutLocalLanguage = lodash_1.default.replace(reverseLine, AADHAAR_REGEX["local_language_reverse_prefix"], "");
var lineWithoutLocalLanguage = reverseLineWithoutLocalLanguage
.split("")
.reverse()
.join("")
.trim();
if (lodash_1.default.size(lineWithoutLocalLanguage) >= LINE_MIN_SIZE) {
var filteredText = filterNoiseFromLine(lineWithoutLocalLanguage);
filteredLines.push(filteredText);
}
});
return lodash_1.default.filter(filteredLines, function (line) {
return !lodash_1.default.isEmpty(line);
});
};
var parseAadhaarHeadingLineNumbers = function (lines) {
var aadhaarHeadingLineNumbers = {
aadhar_title_text_line: undefined,
aadhar_document_text_line: undefined,
aadhar_govt_text_line: undefined,
aadhar_dob_text_line: undefined,
aadhar_gender_text_line: undefined,
aadhar_number_text_line: undefined,
aadhar_relative_name_text_line: undefined,
aadhar_address_start_line: undefined,
aadhar_address_end_line: undefined
};
lodash_1.default.forEach(lines, function (line, index) {
if (AADHAAR_REGEX["title"].exec(line)) {
aadhaarHeadingLineNumbers["aadhar_title_text_line"] = index;
}
else if (AADHAAR_REGEX["document"].exec(line)) {
aadhaarHeadingLineNumbers["aadhar_document_text_line"] = index;
}
else if (AADHAAR_REGEX["govt"].exec(line)) {
aadhaarHeadingLineNumbers["aadhar_govt_text_line"] = index;
}
else if (aadhaarHeadingLineNumbers["aadhar_dob_text_line"] === undefined &&
AADHAAR_REGEX["dob_heading"].exec(line)) {
aadhaarHeadingLineNumbers["aadhar_dob_text_line"] = index;
}
else if (aadhaarHeadingLineNumbers["aadhar_gender_text_line"] === undefined &&
AADHAAR_REGEX["gender"].exec(line)) {
aadhaarHeadingLineNumbers["aadhar_gender_text_line"] = index;
}
else if (aadhaarHeadingLineNumbers["aadhar_number_text_line"] === undefined &&
AADHAAR_REGEX["number_format"].exec(line)) {
aadhaarHeadingLineNumbers["aadhar_number_text_line"] = index;
}
else if (aadhaarHeadingLineNumbers["aadhar_relative_name_text_line"] === undefined &&
AADHAAR_REGEX["relative_name_heading"].exec(line)) {
aadhaarHeadingLineNumbers["aadhar_relative_name_text_line"] = index;
}
else if (aadhaarHeadingLineNumbers["aadhar_address_start_line"] === undefined &&
AADHAAR_REGEX["address_start"].exec(line)) {
aadhaarHeadingLineNumbers["aadhar_address_start_line"] = index;
}
else if (aadhaarHeadingLineNumbers["aadhar_address_start_line"] !== undefined &&
aadhaarHeadingLineNumbers["aadhar_address_end_line"] === undefined &&
AADHAAR_REGEX["address_end"].exec(line)) {
aadhaarHeadingLineNumbers["aadhar_address_end_line"] = index;
}
});
return aadhaarHeadingLineNumbers;
};
var removeDispositionedText = function (noiseFreeText, aadhaarHeadingLineNumbers) {
var aadhaarGenderTextLine = aadhaarHeadingLineNumbers["aadhar_gender_text_line"];
if (aadhaarGenderTextLine) {
var aadhaarNumberLine = aadhaarGenderTextLine + 1;
var aadhaarNumber = noiseFreeText[aadhaarNumberLine];
if (!AADHAAR_REGEX["number_format"].exec(aadhaarNumber)) {
noiseFreeText[aadhaarNumberLine] = undefined;
}
}
return lodash_1.default.filter(noiseFreeText, function (line) {
return !lodash_1.default.isEmpty(line);
});
};
var processAadhaarGender = function (text) {
return lodash_1.default.get(constants_1.default, "GENDER." + text);
};
var processAadhaarName = function (textLines, aadhaarHeadingLineNumbers) {
var aadhaarRelativeNamePrevLine = lodash_1.default.isNumber(aadhaarHeadingLineNumbers["aadhar_relative_name_text_line"]) &&
aadhaarHeadingLineNumbers["aadhar_relative_name_text_line"] - 1;
if (aadhaarRelativeNamePrevLine &&
AADHAAR_REGEX["name_format"].exec(textLines[aadhaarRelativeNamePrevLine])) {
return textLines[aadhaarRelativeNamePrevLine];
}
var aadhaarGovtTextNextLine = lodash_1.default.isNumber(aadhaarHeadingLineNumbers["aadhar_govt_text_line"]) &&
aadhaarHeadingLineNumbers["aadhar_govt_text_line"] + 1;
var aadhaarDOBTextPrevLine = lodash_1.default.isNumber(aadhaarHeadingLineNumbers["aadhar_dob_text_line"]) &&
aadhaarHeadingLineNumbers["aadhar_dob_text_line"] - 1;
if (aadhaarGovtTextNextLine <= aadhaarDOBTextPrevLine &&
AADHAAR_REGEX["name_format"].exec(textLines[aadhaarGovtTextNextLine])) {
return textLines[aadhaarGovtTextNextLine];
}
if (!aadhaarHeadingLineNumbers["aadhar_address_start_line"] &&
aadhaarDOBTextPrevLine &&
AADHAAR_REGEX["name_format"].exec(textLines[aadhaarDOBTextPrevLine])) {
return textLines[aadhaarDOBTextPrevLine];
}
return undefined;
};
var processAadhaarNumber = function (text) {
if (lodash_1.default.isEmpty(text)) {
return undefined;
}
var numbers = text.replace(/ +/g, "");
var parsedNumbers = numbers
.replace("O", "0")
.replace("D", "0")
.replace("B", "8")
.replace("S", "5")
.replace("I", "1")
.replace("!", "1")
.replace("l", "1");
if (AADHAAR_REGEX["number_format"].exec(parsedNumbers)) {
return parsedNumbers;
}
return undefined;
};
var parseAadhaarNumber = function (textLines, aadhaarHeadingLineNumbers) {
var aadhaarGenderLine = aadhaarHeadingLineNumbers["aadhar_gender_text_line"];
if (lodash_1.default.isNumber(aadhaarGenderLine)) {
return processAadhaarNumber(textLines[aadhaarGenderLine + 1]);
}
var aadhaarDOBLine = aadhaarHeadingLineNumbers["aadhar_dob_text_line"];
if (lodash_1.default.isNumber(aadhaarDOBLine)) {
return processAadhaarNumber(textLines[aadhaarDOBLine + 2]);
}
return undefined;
};
var getAddressStartLineTokens = function (rawAddressStartText) {
var addressRelevantTokens = lodash_1.default.split(rawAddressStartText, AADHAAR_REGEX["address_head"]);
var addressRelevantString = lodash_1.default.join(addressRelevantTokens, "");
var addressSplit = lodash_1.default.split(addressRelevantString, AADHAAR_REGEX["address_start_split"]);
return addressSplit;
};
var getAddressEndLineText = function (rawAddressEndText) {
var addressRelevantTokens = lodash_1.default.split(rawAddressEndText, AADHAAR_REGEX["address_end"]);
var filteredText = lodash_1.default.filter(addressRelevantTokens, function (token) {
return !lodash_1.default.isEmpty(token);
});
return lodash_1.default.last(filteredText);
};
var processAadhaaarFathersName = function (textLines, aadhaarHeadingLineNumbers) {
var addressStartLine = aadhaarHeadingLineNumbers["aadhar_address_start_line"];
if (!lodash_1.default.isNumber(addressStartLine)) {
return undefined;
}
var addressSplit = getAddressStartLineTokens(textLines[addressStartLine]);
if (lodash_1.default.size(addressSplit) < 2) {
return undefined;
}
var fathersNameTag = lodash_1.default.get(addressSplit, "0", "");
var fathersNameSplit = lodash_1.default.split(fathersNameTag, AADHAAR_REGEX["fathers_name_split"]);
var fathersName = lodash_1.default.last(fathersNameSplit);
if (AADHAAR_REGEX["name_format"].exec(fathersName)) {
return fathersName;
}
return undefined;
};
var processAadhaarDOB = function (textLines, aadhaarHeadingLineNumbers) {
var addressStartLine = aadhaarHeadingLineNumbers["aadhar_address_start_line"];
var addressEndLine = aadhaarHeadingLineNumbers["aadhar_address_end_line"];
if (lodash_1.default.isNumber(addressStartLine) || lodash_1.default.isNumber(addressEndLine)) {
return undefined;
}
var aadhaarDOBLine = aadhaarHeadingLineNumbers["aadhar_dob_text_line"];
var dobMatch = AADHAAR_REGEX["date_format"].exec(textLines[aadhaarDOBLine]);
var dateText = lodash_1.default.get(dobMatch, "0");
if (lodash_1.default.isEmpty(dateText)) {
return undefined;
}
return moment.utc(dateText, "DD/MM/YYYY").toISOString();
};
var processAadhaarAddress = function (textLines, aadhaarHeadingLineNumbers) {
var addressStartLine = aadhaarHeadingLineNumbers["aadhar_address_start_line"];
var addressEndLine = aadhaarHeadingLineNumbers["aadhar_address_end_line"];
if (!lodash_1.default.isNumber(addressStartLine) || !lodash_1.default.isNumber(addressEndLine)) {
return undefined;
}
var addressLines = [];
var addressStartSplit = getAddressStartLineTokens(textLines[addressStartLine]);
lodash_1.default.forEach(addressStartSplit, function (token) {
if (!lodash_1.default.isEmpty(token)) {
addressLines.push(token);
}
});
lodash_1.default.forEach(lodash_1.default.range(addressStartLine + 1, addressEndLine), function (lineNumber) {
addressLines.push(textLines[lineNumber]);
});
addressLines.push(textLines[addressEndLine]);
var address = lodash_1.default.join(addressLines, " ");
return lodash_1.default.replace(address, AADHAAR_REGEX["unwanted_prefix_suffix"], "");
};
var parseAadhaarText = function (textLines, aadhaarHeadingLineNumbers) {
var parsedResult = {
document_type: constants_1.default.DOCUMENT_TYPES.AADHAAR_CARD
};
var aadhaarGenderLine = aadhaarHeadingLineNumbers["aadhar_gender_text_line"];
var genderMatch = AADHAAR_REGEX["gender"].exec(textLines[aadhaarGenderLine]);
parsedResult.gender = processAadhaarGender(lodash_1.default.get(genderMatch, "0"));
var name = processAadhaarName(textLines, aadhaarHeadingLineNumbers);
var aadhaarNumber = parseAadhaarNumber(textLines, aadhaarHeadingLineNumbers);
var address = processAadhaarAddress(textLines, aadhaarHeadingLineNumbers);
var fathersName = processAadhaaarFathersName(textLines, aadhaarHeadingLineNumbers);
var dateOfBirth = processAadhaarDOB(textLines, aadhaarHeadingLineNumbers);
parsedResult.identification_number = aadhaarNumber;
parsedResult.name = name;
parsedResult.date_of_birth = dateOfBirth;
parsedResult.address = address;
parsedResult.fathers_name = fathersName;
return parsedResult;
};
var filterRelevantAadhaarText = function (rawTextLines) {
var noiseFreeText = removeNoiseFromText(rawTextLines);
var aadhaarHeadingLineNumbers = parseAadhaarHeadingLineNumbers(noiseFreeText);
var filteredText = removeDispositionedText(noiseFreeText, aadhaarHeadingLineNumbers);
return filteredText;
};
var validateAadhaarText = function (aadhaarHeadingLineNumbers) {
var aadharNumberTextLine = aadhaarHeadingLineNumbers.aadhar_number_text_line, aadharTitleTextLine = aadhaarHeadingLineNumbers.aadhar_title_text_line, aadharDocumentTextLine = aadhaarHeadingLineNumbers.aadhar_document_text_line, aadharAddressStartLine = aadhaarHeadingLineNumbers.aadhar_address_start_line, aadharAddressEndLine = aadhaarHeadingLineNumbers.aadhar_address_end_line;
return (lodash_1.default.isNumber(aadharNumberTextLine) ||
lodash_1.default.isNumber(aadharTitleTextLine) ||
lodash_1.default.isNumber(aadharDocumentTextLine) ||
lodash_1.default.isNumber(aadharAddressStartLine) ||
lodash_1.default.isNumber(aadharAddressEndLine));
};
// ******************************************************* //
// Logic for internal functions ends here //
// ******************************************************* //
// ******************************************************* //
// Logic for API handlers starts here //
// ******************************************************* //
AadhaarParser.parseDocumentDetails = function (params) {
var rawTextLines = params.raw_text;
var textLines = filterRelevantAadhaarText(rawTextLines);
var aadhaarHeadingLineNumbers = parseAadhaarHeadingLineNumbers(textLines);
var isDocumentValid = validateAadhaarText(aadhaarHeadingLineNumbers);
if (!isDocumentValid) {
return constants_1.default.INVALID_DOCUMENT_RESPONSE;
}
var parsedDetails = parseAadhaarText(textLines, aadhaarHeadingLineNumbers);
return {
is_document_valid: true,
document_details: parsedDetails
};
};
// ******************************************************* //
// Logic for API handlers ends here //
// ******************************************************* //
exports.default = AadhaarParser;
//# sourceMappingURL=aadhaar-parser.js.map
;