// symspell-ex: spelling correction & fuzzy search based on the
// symmetric delete spelling correction algorithm (compiled JavaScript).
Object.defineProperty(exports, "__esModule", { value: true });
exports.CoreTokenizer = void 0;
const types_1 = require("../../types");
const constants_1 = require("../../constants");
class CoreTokenizer {
    constructor() {
        this._spaceExpression = /\s+/g;
        // Token expressions, tried in priority order: the first expression
        // that produces any match in a segment is the one used to split it.
        this._expressions = [
            // URL matcher (a widely copied pattern; the leading [...] is a
            // single character class, not a group, so it matches runs of
            // URL-safe characters followed by a dot and a TLD).
            { value: /[(http(s)?)://(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/gi, tag: 'url' },
            // Latin digits: fractions ("1/2"), separator runs, and decimals.
            { value: /\d+\/\d+|\d(?:[.,-\/]?\d)*(?:\.\d+)?/g, tag: 'number', alphabet: constants_1.Alphabets.LATIN },
            // Arabic-Indic digits (U+0660-U+0669).
            { value: /[\u0660-\u0669]+/g, tag: 'number', alphabet: constants_1.Alphabets.ARABIC },
            // Latin words, accented letters included. The original range A-z
            // also matched [ \ ] ^ _ and the backtick; A-Za-z is the intended range.
            { value: /[A-Za-zÀ-ú]+/gi, tag: 'word', alphabet: constants_1.Alphabets.LATIN },
            // Arabic letters (U+0620-U+06EF).
            { value: /[\u0620-\u06EF]+/g, tag: 'word', alphabet: constants_1.Alphabets.ARABIC },
            { value: /[.!?;\-()\[\]{}"]/g, tag: 'punctuation', alphabet: constants_1.Alphabets.LATIN },
            { value: /[،؟]/g, tag: 'punctuation', alphabet: constants_1.Alphabets.ARABIC },
            { value: /\s+/g, tag: 'space' }
        ];
    }
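
    // Illustrative examples of which expression claims which input, derived
    // from the patterns above: "https://example.com" -> url, "3.50" and
    // "1/2" -> Latin number, "٣٤" -> Arabic number, "déjà" -> Latin word,
    // "مرحبا" -> Arabic word, "!" -> Latin punctuation, "؟" -> Arabic
    // punctuation.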
    // Split a segment using the first expression that matches it. Matched
    // substrings become tagged tokens; the text between matches becomes
    // untagged tokens, which _tokenizeInput re-tokenizes recursively.
    _tokenizeSegment(input) {
        let tokens;
        for (let i = 0; i < this._expressions.length; i += 1) {
            const expression = this._expressions[i];
            const matches = input.match(expression.value) || [];
            // Note: split() also returns the text captured by any groups in
            // the expression (possibly undefined), hence the null checks below.
            const parts = input.split(expression.value);
            let mIndex = 0;
            tokens = [];
            // Interleave the split parts with the matches they surround.
            for (let j = 0; j < parts.length; j += 1) {
                const part = parts[j];
                if (part != null && part.trim().length > 0) {
                    tokens.push(new types_1.Token(part));
                }
                if (mIndex < matches.length && matches[mIndex] != null) {
                    const mToken = matches[mIndex].trim();
                    if (mToken.length > 0) {
                        tokens.push(new types_1.Token(mToken, expression.tag, expression.alphabet || null));
                    }
                    // The match covers the whole segment; nothing left to emit.
                    if (mToken.length >= input.length) {
                        break;
                    }
                }
                mIndex++;
            }
            // First expression with any match wins; skip the rest.
            if (matches.length > 0) {
                break;
            }
        }
        if (tokens.length === 1 && tokens[0].tag == null) {
            tokens[0].tag = 'none';
        }
        return tokens;
    }
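
    // Illustrative trace: _tokenizeSegment("price 3.50!") is won by the Latin
    // number expression, with matches = ["3.50"] and parts = ["price ", "!"],
    // yielding [Token("price"), Token("3.50", 'number', LATIN), Token("!")];
    // the untagged "price" and "!" are re-tokenized by _tokenizeInput below.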
    // Recursively tokenize a Token's value: tagged tokens are collected as-is,
    // untagged ones are segmented again until every token carries a tag. The
    // distance field records whether a space followed the token in the parent
    // value (1) or not (0); the last token inherits the parent's distance.
    _tokenizeInput(input, tokens) {
        const tokenValue = input.value.trim();
        if (tokenValue.length === 0) {
            return;
        }
        const bTokens = this._tokenizeSegment(tokenValue);
        for (let i = 0; i < bTokens.length; i += 1) {
            const pSpaceIndex = input.value.indexOf(`${bTokens[i].value.trim()} `);
            const tDistance = pSpaceIndex >= 0 ? 1 : 0;
            bTokens[i].distance = i >= bTokens.length - 1 ?
                input.distance : tDistance;
            if (bTokens[i].tag == null) {
                this._tokenizeInput(bTokens[i], tokens);
            }
            else {
                tokens.push(bTokens[i]);
            }
        }
    }
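
    // Continuing the trace: the untagged Token("price") is re-segmented and
    // the Latin word expression claims it whole, so it returns as a single
    // tagged token; Token("!") likewise becomes a punctuation token. The
    // final stream is: word "price", number "3.50", punctuation "!".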
    // Public entry point: returns a flat array of tagged tokens for the
    // input string, or an empty array for null/empty input.
    tokenize(input) {
        if (input == null || input.length === 0) {
            return [];
        }
        const tokens = [];
        this._tokenizeInput(new types_1.Token(input), tokens);
        return tokens;
    }
}
exports.CoreTokenizer = CoreTokenizer;
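
// Usage sketch (illustrative; the require path and the printed Token fields
// are assumptions based on this file, not a documented API):
//
//   const { CoreTokenizer } = require('./tokenizer/core');
//   const tokenizer = new CoreTokenizer();
//   const tokens = tokenizer.tokenize('visit https://example.com now, price 3.50!');
//   for (const token of tokens) {
//       // Each token carries value, tag ('url' | 'number' | 'word' | ...),
//       // alphabet and distance, as constructed above.
//       console.log(token.value, token.tag);
//   }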