// symspell-ex: spelling correction & fuzzy search based on the
// symmetric delete spelling correction algorithm (compiled JavaScript).
Object.defineProperty(exports, "__esModule", { value: true });
exports.CoreTokenizer = void 0;
const types_1 = require("../../types");
const constants_1 = require("../../constants");
class CoreTokenizer {
    constructor() {
        this._spaceExpression = /\s+/g;
        // Token expressions, tried in priority order: the first expression
        // that produces any match in a segment is the one used to split it.
        this._expressions = [
            // URL matcher (a widely copied pattern; the leading [...] is a
            // single character class, not a group, so it matches runs of
            // URL-safe characters followed by a dot and a TLD).
            { value: /[(http(s)?)://(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/gi, tag: 'url' },
            // Latin digits: fractions ("1/2"), separator runs, and decimals.
            { value: /\d+\/\d+|\d(?:[.,-\/]?\d)*(?:\.\d+)?/g, tag: 'number', alphabet: constants_1.Alphabets.LATIN },
            // Arabic-Indic digits (U+0660-U+0669).
            { value: /[\u0660-\u0669]+/g, tag: 'number', alphabet: constants_1.Alphabets.ARABIC },
            // Latin words, accented letters included. The original range A-z
            // also matched [ \ ] ^ _ and the backtick; A-Za-z is the intended range.
            { value: /[A-Za-zÀ-ú]+/gi, tag: 'word', alphabet: constants_1.Alphabets.LATIN },
            // Arabic letters (U+0620-U+06EF).
            { value: /[\u0620-\u06EF]+/g, tag: 'word', alphabet: constants_1.Alphabets.ARABIC },
            { value: /[.!?;\-()\[\]{}"]/g, tag: 'punctuation', alphabet: constants_1.Alphabets.LATIN },
            { value: /[،؟]/g, tag: 'punctuation', alphabet: constants_1.Alphabets.ARABIC },
            { value: /\s+/g, tag: 'space' }
        ];
    }
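
    // Illustrative examples of which expression claims which input, derived
    // from the patterns above: "https://example.com" -> url, "3.50" and
    // "1/2" -> Latin number, "٣٤" -> Arabic number, "déjà" -> Latin word,
    // "مرحبا" -> Arabic word, "!" -> Latin punctuation, "؟" -> Arabic
    // punctuation.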
    // Split a segment using the first expression that matches it. Matched
    // substrings become tagged tokens; the text between matches becomes
    // untagged tokens, which _tokenizeInput re-tokenizes recursively.
    _tokenizeSegment(input) {
        let tokens;
        for (let i = 0; i < this._expressions.length; i += 1) {
            const expression = this._expressions[i];
            const matches = input.match(expression.value) || [];
            // Note: split() also returns the text captured by any groups in
            // the expression (possibly undefined), hence the null checks below.
            const parts = input.split(expression.value);
            let mIndex = 0;
            tokens = [];
            // Interleave the split parts with the matches they surround.
            for (let j = 0; j < parts.length; j += 1) {
                const part = parts[j];
                if (part != null && part.trim().length > 0) {
                    tokens.push(new types_1.Token(part));
                }
                if (mIndex < matches.length && matches[mIndex] != null) {
                    const mToken = matches[mIndex].trim();
                    if (mToken.length > 0) {
                        tokens.push(new types_1.Token(mToken, expression.tag, expression.alphabet || null));
                    }
                    // The match covers the whole segment; nothing left to emit.
                    if (mToken.length >= input.length) {
                        break;
                    }
                }
                mIndex++;
            }
            // First expression with any match wins; skip the rest.
            if (matches.length > 0) {
                break;
            }
        }
        if (tokens.length === 1 && tokens[0].tag == null) {
            tokens[0].tag = 'none';
        }
        return tokens;
    }
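
    // Illustrative trace: _tokenizeSegment("price 3.50!") is won by the Latin
    // number expression, with matches = ["3.50"] and parts = ["price ", "!"],
    // yielding [Token("price"), Token("3.50", 'number', LATIN), Token("!")];
    // the untagged "price" and "!" are re-tokenized by _tokenizeInput below.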
    // Recursively tokenize a Token's value: tagged tokens are collected as-is,
    // untagged ones are segmented again until every token carries a tag. The
    // distance field records whether a space followed the token in the parent
    // value (1) or not (0); the last token inherits the parent's distance.
    _tokenizeInput(input, tokens) {
        const tokenValue = input.value.trim();
        if (tokenValue.length === 0) {
            return;
        }
        const bTokens = this._tokenizeSegment(tokenValue);
        for (let i = 0; i < bTokens.length; i += 1) {
            const pSpaceIndex = input.value.indexOf(`${bTokens[i].value.trim()} `);
            const tDistance = pSpaceIndex >= 0 ? 1 : 0;
            bTokens[i].distance = i >= bTokens.length - 1 ?
                input.distance : tDistance;
            if (bTokens[i].tag == null) {
                this._tokenizeInput(bTokens[i], tokens);
            }
            else {
                tokens.push(bTokens[i]);
            }
        }
    }
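
    // Continuing the trace: the untagged Token("price") is re-segmented and
    // the Latin word expression claims it whole, so it returns as a single
    // tagged token; Token("!") likewise becomes a punctuation token. The
    // final stream is: word "price", number "3.50", punctuation "!".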
    // Public entry point: returns a flat array of tagged tokens for the
    // input string, or an empty array for null/empty input.
    tokenize(input) {
        if (input == null || input.length === 0) {
            return [];
        }
        const tokens = [];
        this._tokenizeInput(new types_1.Token(input), tokens);
        return tokens;
    }
}
exports.CoreTokenizer = CoreTokenizer;
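
// Usage sketch (illustrative; the require path and the printed Token fields
// are assumptions based on this file, not a documented API):
//
//   const { CoreTokenizer } = require('./tokenizer/core');
//   const tokenizer = new CoreTokenizer();
//   const tokens = tokenizer.tokenize('visit https://example.com now, price 3.50!');
//   for (const token of tokens) {
//       // Each token carries value, tag ('url' | 'number' | 'word' | ...),
//       // alphabet and distance, as constructed above.
//       console.log(token.value, token.tag);
//   }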