@zanreal/search
Version:
A powerful TypeScript fuzzy search library with intelligent scoring, exact match prioritization, and automatic field detection for any object structure
360 lines (359 loc) • 14 kB
JavaScript
/**
* Universal Search Engine
* Provides fuzzy search with intelligent scoring, exact match prioritization,
* and automatic field detection for any object structure.
*/
/**
 * Fallback values used by search() for every option the caller leaves out.
 */
export const DEFAULT_SEARCH_OPTIONS = {
  // Explicit per-field weights keyed by field path; empty means weights are derived automatically
  fieldWeights: {},
  // Minimum similarity (0..1) a word needs to be accepted as a fuzzy match
  fuzzyThreshold: 0.7,
  // Queries shorter than this many characters never attempt fuzzy matching
  minFuzzyLength: 3,
  // Maximum number of results returned by search()
  limit: 100,
  // When false, both text and query are lowercased before comparison
  caseSensitive: false,
};
/**
 * Levenshtein edit distance between two strings.
 *
 * Only two rows of the dynamic-programming table are kept, each sized by the
 * shorter input, so memory is O(min(m, n)) rather than O(m * n).
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} Minimum number of single-character insertions, deletions
 *   and substitutions that turn one string into the other.
 */
function levenshteinDistance(str1, str2) {
  // The shorter string defines the row width; the longer one drives the outer loop.
  const [shorter, longer] = str1.length > str2.length ? [str2, str1] : [str1, str2];
  const width = shorter.length;
  // Row 0: distance from the empty prefix is simply the prefix length.
  let previous = Array.from({ length: width + 1 }, (_, index) => index);
  let current = new Array(width + 1);
  for (let row = 1; row <= longer.length; row++) {
    const longerChar = longer[row - 1];
    current[0] = row;
    for (let col = 1; col <= width; col++) {
      const deletion = previous[col] + 1;
      const insertion = current[col - 1] + 1;
      const substitution = previous[col - 1] + (shorter[col - 1] === longerChar ? 0 : 1);
      current[col] = Math.min(deletion, insertion, substitution);
    }
    // The row just filled becomes the reference row for the next pass.
    [previous, current] = [current, previous];
  }
  return previous[width];
}
/**
 * Resolve a dot-notation path (e.g. "meta.author.name") against an object.
 *
 * @param {*} obj - Root value to read from.
 * @param {string} path - Property names separated by ".".
 * @returns {*} The value found at the path, or "" when any segment is missing,
 *   an intermediate value is not an object, or the final value is null/undefined.
 */
function getNestedValue(obj, path) {
  let cursor = obj;
  for (const segment of path.split(".")) {
    const canDescend = cursor && typeof cursor === "object" && segment in cursor;
    if (!canDescend) {
      // Once a segment cannot be resolved, every remaining segment resolves to "" as well.
      return "";
    }
    cursor = cursor[segment];
  }
  return cursor ?? "";
}
// Upper bound on the number of entries kept in stringProcessingCache before older ones are evicted
const MAX_STRING_CACHE_SIZE = 500;
// Module-wide cache of lowercased strings, shared by every search call
const stringProcessingCache = new Map();
// Module-wide cache of auto-detected field paths, keyed weakly by the inspected object
const fieldDetectionCache = new WeakMap();
// Module-wide cache of per-field statistics, keyed weakly by the data array being searched
const fieldStatsCache = new WeakMap();
/**
 * Collect the dot-notation paths of all non-empty string properties of an object.
 *
 * Nested plain objects are traversed up to `maxDepth` levels; arrays are not
 * traversed and empty strings are ignored. The result of a root-level scan at
 * the default depth is cached per object in `fieldDetectionCache`, and that
 * cached array is returned by reference, so callers must not mutate it.
 *
 * @param {*} obj - Value to inspect; anything that is not a non-null object yields [].
 * @param {string} [prefix=""] - Path prefix of `obj` inside the root (used by the recursion).
 * @param {number} [maxDepth=3] - Remaining levels that may be traversed.
 * @returns {string[]} Field paths in property enumeration order.
 */
function detectStringFields(obj, prefix = "", maxDepth = 3) {
  if (maxDepth <= 0 || !obj || typeof obj !== "object") {
    return [];
  }
  // Cache only root scans at the default depth. A shallower scan would
  // otherwise be stored and later returned for a default-depth request on the
  // same object, and a nested object reached through an empty-string key
  // (whose prefix is also "") would be cached as if it were a root.
  const isCacheable = prefix === "" && maxDepth === 3;
  if (isCacheable) {
    const cached = fieldDetectionCache.get(obj);
    if (cached) {
      return cached;
    }
  }
  const fields = [];
  for (const [key, value] of Object.entries(obj)) {
    const fieldPath = prefix ? `${prefix}.${key}` : key;
    if (typeof value === "string") {
      if (value.length > 0) {
        fields.push(fieldPath);
      }
    } else if (typeof value === "object" && value !== null && !Array.isArray(value)) {
      fields.push(...detectStringFields(value, fieldPath, maxDepth - 1));
    }
  }
  if (isCacheable) {
    fieldDetectionCache.set(obj, fields);
  }
  return fields;
}
/**
 * Return the form of a string used for comparisons.
 *
 * Case-sensitive lookups return the input untouched. Otherwise the lowercased
 * form is returned and memoised in `stringProcessingCache` under the key
 * "lc:" + str. When the cache has reached MAX_STRING_CACHE_SIZE entries, the
 * oldest-inserted half is dropped before the new entry is stored.
 *
 * @param {string} str - Text to normalise.
 * @param {boolean} caseSensitive - Skip normalisation (and the cache) when true.
 * @returns {string} `str` itself or its lowercased form.
 */
function getProcessedString(str, caseSensitive) {
  if (caseSensitive) {
    return str;
  }
  const cacheKey = `lc:${str}`;
  const cached = stringProcessingCache.get(cacheKey);
  if (cached !== undefined) {
    return cached;
  }
  const lowered = str.toLowerCase();
  if (stringProcessingCache.size >= MAX_STRING_CACHE_SIZE) {
    // Map iterates in insertion order, so the leading keys are the oldest entries.
    const evictCount = MAX_STRING_CACHE_SIZE / 2;
    const staleKeys = [...stringProcessingCache.keys()].slice(0, evictCount);
    for (const staleKey of staleKeys) {
      stringProcessingCache.delete(staleKey);
    }
  }
  stringProcessingCache.set(cacheKey, lowered);
  return lowered;
}
/**
 * Score a single text value against a query.
 *
 * Matching is attempted in order of decreasing strength:
 *  1. "exact-start"   - the text starts with the query (score = weight * 20);
 *  2. "exact-contain" - the query occurs elsewhere in the text; earlier positions
 *     and shorter texts score higher, never below the field weight;
 *  3. "fuzzy"         - only when the query has at least `options.minFuzzyLength`
 *     characters: each whitespace-separated word of three or more characters
 *     is compared by Levenshtein similarity and the best word at or above
 *     `options.fuzzyThreshold` wins.
 *
 * @param {*} text - Field value; anything that is not a non-empty string yields null.
 * @param {string} query - Search query; anything that is not a non-empty string yields null.
 * @param {number} fieldWeight - Multiplier applied to the raw score.
 * @param {{caseSensitive: boolean, minFuzzyLength: number, fuzzyThreshold: number}} options
 * @returns {{field: string, value: string, score: number, type: string, position?: number}|null}
 *   Match description (its `field` is left empty for the caller to fill in), or null when nothing matches.
 */
function calculateFieldScore(text, query, fieldWeight, options) {
  // Numbers, arrays and objects have no toLowerCase/startsWith; treat them as
  // "no match" instead of letting the string operations below throw.
  if (typeof text !== "string" || typeof query !== "string" || !text || !query) {
    return null;
  }
  const searchText = getProcessedString(text, options.caseSensitive);
  const searchQuery = getProcessedString(query, options.caseSensitive);
  // Prefix match: strongest signal.
  if (searchText.startsWith(searchQuery)) {
    return {
      field: "",
      value: text,
      score: fieldWeight * 20,
      type: "exact-start",
      position: 0,
    };
  }
  // Substring match anywhere else in the text.
  const position = searchText.indexOf(searchQuery);
  if (position !== -1) {
    const lengthBonus = Math.max(1, 100 / text.length); // shorter texts get a larger bonus
    const positionPenalty = position * 0.1; // later occurrences score lower
    const score = fieldWeight * (10 + lengthBonus - positionPenalty);
    return {
      field: "",
      value: text,
      score: Math.max(score, fieldWeight), // never drop below the base weight
      type: "exact-contain",
      position,
    };
  }
  // Fuzzy matching is skipped for queries that are too short.
  if (searchQuery.length < options.minFuzzyLength) {
    return null;
  }
  const whitespace = /\s/; // created once per call, not once per character
  let bestMatch = null;
  let wordStart = 0;
  // Walk the text once, treating each whitespace character (and the end of
  // the text) as a word boundary, so no intermediate array of words is built.
  for (let i = 0; i <= searchText.length; i++) {
    const atBoundary = i === searchText.length || whitespace.test(searchText[i]);
    if (!atBoundary) {
      continue;
    }
    if (i - wordStart >= 3) {
      const word = searchText.slice(wordStart, i);
      const distance = levenshteinDistance(word, searchQuery);
      const maxLength = Math.max(word.length, searchQuery.length);
      const similarity = (maxLength - distance) / maxLength;
      if (similarity >= options.fuzzyThreshold) {
        const lengthBonus = Math.max(1, 50 / text.length);
        const score = fieldWeight * similarity * (2 + lengthBonus);
        if (!bestMatch || score > bestMatch.score) {
          bestMatch = {
            field: "",
            value: text,
            score,
            type: "fuzzy",
          };
        }
      }
    }
    wordStart = i + 1;
  }
  return bestMatch;
}
// Number of search() calls since module load or the last clearSearchCaches();
// every 100th call trims the shared string cache.
let searchCallCount = 0;
/**
 * Compute the average text length and the automatic weight of one field.
 *
 * Only the first 100 items of `data` are sampled. The weight is the product
 * of a name-based factor (title/name/heading = 5, description/summary/subtitle = 3,
 * anything else = 1) and a length-based factor that favours short fields.
 *
 * @param {Array} data - Items being searched.
 * @param {string} fieldPath - Dot-notation path of the field.
 * @returns {{avgLength: number, weight: number}}
 */
function computeFieldStat(data, fieldPath) {
  const sampleSize = Math.min(data.length, 100);
  let totalLength = 0;
  let count = 0;
  for (let i = 0; i < sampleSize; i++) {
    const text = getNestedValue(data[i], fieldPath);
    if (typeof text === "string" && text.length > 0) {
      totalLength += text.length;
      count++;
    }
  }
  const avgLength = count > 0 ? totalLength / count : 0;
  const fieldName = fieldPath.split(".").pop()?.toLowerCase() ?? "";
  let baseWeight = 1; // also applies to content/body/text and unknown names
  if (["title", "name", "heading"].includes(fieldName)) {
    baseWeight = 5;
  } else if (["description", "summary", "subtitle"].includes(fieldName)) {
    baseWeight = 3;
  }
  let lengthWeight = 1.0; // long fields (content)
  if (avgLength < 50) {
    lengthWeight = 2.0; // very short fields (titles)
  } else if (avgLength < 100) {
    lengthWeight = 1.5; // short fields (subtitles)
  } else if (avgLength < 300) {
    lengthWeight = 1.2; // medium fields (descriptions)
  }
  return {
    avgLength,
    weight: baseWeight * lengthWeight,
  };
}
/**
 * Universal search function that works with any array of objects.
 *
 * Each item is scored per field with calculateFieldScore(); the per-field
 * scores are summed and the items are returned best-first. Ties are broken in
 * favour of the item whose matched texts are shorter in total.
 *
 * Notes:
 *  - An empty, whitespace-only, null or undefined query returns every item
 *    with score 0 and no matches (the limit is not applied in that case).
 *  - When `options.fields` is omitted, fields are auto-detected from the first item.
 *  - Field values that are not strings are skipped.
 *  - Automatic field weights are cached per `data` array (by identity), so
 *    changes to the array's contents after the first search are not
 *    reflected in the weights.
 *  - Scanning stops once `limit * 3` matching items have been collected, so on
 *    very large datasets a later, better match can be missed.
 *
 * @param {Array<object>} data - Items to search.
 * @param {string} query - Search text.
 * @param {object} [options] - fields, fieldWeights, fuzzyThreshold, minFuzzyLength,
 *   limit and caseSensitive; missing values fall back to DEFAULT_SEARCH_OPTIONS.
 * @returns {Array<{item: object, score: number, matches: Array<object>}>}
 */
export function search(data, query, options = {}) {
  // Periodic trimming of the shared string cache every 100 calls.
  searchCallCount++;
  if (searchCallCount % 100 === 0 && stringProcessingCache.size > MAX_STRING_CACHE_SIZE / 2) {
    const keysToDelete = Array.from(stringProcessingCache.keys()).slice(0, MAX_STRING_CACHE_SIZE / 4);
    for (const key of keysToDelete) {
      stringProcessingCache.delete(key);
    }
  }
  // A missing query behaves like an empty one instead of throwing on .trim().
  const searchQuery = query == null ? "" : String(query).trim();
  if (!searchQuery) {
    return data.map((item) => ({ item, score: 0, matches: [] }));
  }
  const {
    fields,
    fieldWeights = DEFAULT_SEARCH_OPTIONS.fieldWeights,
    fuzzyThreshold = DEFAULT_SEARCH_OPTIONS.fuzzyThreshold,
    minFuzzyLength = DEFAULT_SEARCH_OPTIONS.minFuzzyLength,
    limit = DEFAULT_SEARCH_OPTIONS.limit,
    caseSensitive = DEFAULT_SEARCH_OPTIONS.caseSensitive,
  } = options ?? {};
  const searchOptions = { fuzzyThreshold, minFuzzyLength, caseSensitive };
  // Auto-detect fields from the first item when none are given.
  const searchFields = fields || (data.length > 0 ? detectStringFields(data[0]) : []);
  // Field statistics are cached per data array. The cached map is extended on
  // demand so that a later call with a different `fields` list still gets
  // proper automatic weights instead of silently falling back to 1.
  let fieldStats = fieldStatsCache.get(data);
  if (!fieldStats) {
    fieldStats = new Map();
    fieldStatsCache.set(data, fieldStats);
  }
  for (const field of searchFields) {
    if (!fieldStats.has(field)) {
      fieldStats.set(field, computeFieldStat(data, field));
    }
  }
  const results = [];
  // Collect more candidates than requested so sorting has something to choose from.
  const maxResults = limit ? limit * 3 : data.length;
  for (let itemIndex = 0; itemIndex < data.length; itemIndex++) {
    const item = data[itemIndex];
    if (!item) {
      continue; // skip holes and null/undefined entries
    }
    const matches = [];
    let totalScore = 0;
    for (const field of searchFields) {
      const text = getNestedValue(item, field);
      // Only non-empty strings can be matched; numbers, arrays, etc. are skipped.
      if (typeof text !== "string" || text.length === 0) {
        continue;
      }
      // An explicit weight wins over the automatic one; 1 is the last resort.
      const weight = fieldWeights?.[field] ?? fieldStats.get(field)?.weight ?? 1;
      const match = calculateFieldScore(text, searchQuery, weight, searchOptions);
      if (match) {
        match.field = field;
        matches.push(match);
        totalScore += match.score;
      }
    }
    if (matches.length > 0) {
      results.push({
        item,
        score: totalScore,
        matches,
      });
    }
    // Early termination for large datasets.
    if (results.length >= maxResults) {
      break;
    }
  }
  // Highest score first; equal scores prefer the shorter total matched text.
  results.sort((a, b) => {
    if (a.score !== b.score) {
      return b.score - a.score;
    }
    const aTotalLength = a.matches.reduce((sum, match) => sum + match.value.length, 0);
    const bTotalLength = b.matches.reduce((sum, match) => sum + match.value.length, 0);
    return aTotalLength - bTotalLength;
  });
  return limit ? results.slice(0, limit) : results;
}
/**
 * Run search() and return only the matched items, in ranked order, without
 * their scores or match details.
 *
 * @param {Array} data - Items to search.
 * @param {string} query - Search text.
 * @param {object} [options={}] - Options forwarded unchanged to search().
 * @returns {Array} The `item` of every search result.
 */
export function searchItems(data, query, options = {}) {
  const ranked = search(data, query, options);
  return ranked.map(({ item }) => item);
}
/**
 * Build a search function with preset options.
 *
 * @param {object} config - Options applied to every call of the returned function.
 * @returns {Function} `(data, query, overrides = {})` that calls search() with
 *   `config` merged with `overrides`; override values win on conflicts.
 */
export function createSearcher(config) {
  return (data, query, overrides = {}) => {
    const mergedOptions = { ...config, ...overrides };
    return search(data, query, mergedOptions);
  };
}
/**
 * Build a searcher preconfigured with the values of DEFAULT_SEARCH_OPTIONS
 * as they are at the time this function is called.
 *
 * @returns {Function} A search function produced by createSearcher().
 */
export function createDocumentSearcher() {
  const {
    fieldWeights,
    fuzzyThreshold,
    minFuzzyLength,
    limit,
    caseSensitive,
  } = DEFAULT_SEARCH_OPTIONS;
  return createSearcher({ fieldWeights, fuzzyThreshold, minFuzzyLength, limit, caseSensitive });
}
/**
 * Convenience wrapper: search with the default options and return only the
 * matching items.
 *
 * @param {Array} data - Items to search.
 * @param {string} query - Search text.
 * @param {string[]} [fields] - Field paths to search; passed through as-is
 *   (undefined when omitted).
 * @returns {Array} Whatever searchItems() returns for these arguments.
 */
export function quickSearch(data, query, fields) {
  const {
    fieldWeights,
    fuzzyThreshold,
    minFuzzyLength,
    limit,
    caseSensitive,
  } = DEFAULT_SEARCH_OPTIONS;
  const options = { fields, fieldWeights, fuzzyThreshold, minFuzzyLength, limit, caseSensitive };
  return searchItems(data, query, options);
}
/**
 * Release the memory held by the shared string cache and restart the
 * search-call counter that drives periodic cache trimming.
 *
 * The WeakMap-based caches (fieldDetectionCache, fieldStatsCache) are not
 * touched here; their entries go away only when the objects used as keys are
 * garbage-collected.
 */
export function clearSearchCaches() {
  searchCallCount = 0;
  stringProcessingCache.clear();
}
/**
 * Report the current size of the string cache and the number of search calls
 * counted so far, for monitoring purposes.
 *
 * @returns {{stringProcessingCacheSize: number, searchCallCount: number}}
 */
export function getCacheStats() {
  const stringProcessingCacheSize = stringProcessingCache.size;
  return { stringProcessingCacheSize, searchCallCount };
}