UNPKG

devcontext

Version:

DevContext is a cutting-edge Model Context Protocol (MCP) server designed to provide developers with continuous, project-centric context awareness.

754 lines (650 loc) 24.6 kB
/** * SmartSearchServiceLogic.js * * Provides advanced search capabilities for code entities using a combination * of Full-Text Search (FTS) and keyword-based matching. */ import { executeQuery } from "../db.js"; import { tokenize, extractKeywords, stem } from "./TextTokenizerLogic.js"; /** * @typedef {Object} SearchOptions * @property {string[]} [entityTypes] - Types of entities to search (e.g., 'file', 'function', 'class') * @property {string[]} [filePaths] - File paths to limit the search to * @property {Object} [dateRange] - Date range to filter by last modified date * @property {Date} [dateRange.start] - Start date of range * @property {Date} [dateRange.end] - End date of range * @property {string} [sortBy] - Field to sort results by * @property {number} [limit] - Maximum number of results to return * @property {number} [minRelevance] - Minimum relevance score for results * @property {string} [strategy] - Search strategy to use ('fts', 'keywords', 'combined') * @property {string} [booleanOperator] - Boolean operator for keyword combination * @property {boolean} [useExactMatch] - Whether to use exact phrase matching * @property {boolean} [useProximity] - Whether to use proximity search * @property {number} [proximityDistance] - Distance for proximity search */ /** * @typedef {Object} CodeEntity * @property {string} entity_id - Unique identifier for the code entity * @property {string} file_path - Path to the file containing the entity * @property {string} entity_type - Type of code entity (e.g., 'file', 'function', 'class') * @property {string} name - Name of the code entity * @property {string} [parent_entity_id] - ID of the parent entity (if any) * @property {string} [content_hash] - Hash of the entity content * @property {string} [raw_content] - Raw content of the entity * @property {number} [start_line] - Start line of the entity within the file * @property {number} [end_line] - End line of the entity within the file * @property {string} [language] - Programming language of the entity * @property {string} [created_at] - Creation timestamp * @property {string} [last_modified_at] - Last modification timestamp */ /** * @typedef {Object} SearchResult * @property {CodeEntity} entity - The found code entity * @property {number} relevanceScore - Relevance score for the search result */ /** * Searches code entities by keywords using Full-Text Search and/or entity_keywords table * * @param {string[]} keywords - Keywords to search for * @param {SearchOptions} [options={}] - Search options including: * - entityTypes: Types of entities to search * - filePaths: File paths with glob pattern support * - dateRange: Date range to filter by * - sortBy: Field to sort by * - limit: Max results * - minRelevance: Minimum relevance score * - strategy: Search strategy ('fts', 'keywords', 'combined') * - booleanOperator: 'AND' or 'OR' for keyword combination * - useExactMatch: Whether to use exact phrase matching * - useProximity: Whether to use proximity search * - proximityDistance: Distance for proximity search * @returns {Promise<SearchResult[]>} Array of search results */ export async function searchByKeywords(keywords, options = {}) { try { // Validate and normalize input if (!keywords || !Array.isArray(keywords) || keywords.length === 0) { throw new Error("Keywords array is required and cannot be empty"); } // Handle single string input with boolean operators if ( keywords.length === 1 && /\s+(AND|OR|NOT|NEAR\/\d+)\s+/i.test(keywords[0]) ) { // Keep as is - will be processed by searchUsingFTS } else { // Process and clean keywords keywords = keywords.map((kw) => kw.trim()).filter((kw) => kw.length > 0); } // Set default options options = { strategy: "combined", // Default to combined search booleanOperator: "OR", // Default to OR for broader matches limit: 100, // Default result limit ...options, }; // Prepare results array let searchResults = []; // If strategy is 'fts' or 'combined', perform FTS search if (options.strategy === "fts" || options.strategy === "combined") { const ftsResults = await searchUsingFTS(keywords, options); searchResults = [...ftsResults]; } // If strategy is 'keywords' or 'combined', or if FTS returned no results, perform keyword-based search if ( options.strategy === "keywords" || options.strategy === "combined" || (options.strategy === "fts" && searchResults.length === 0) ) { const keywordResults = await searchUsingKeywords(keywords, options); if (options.strategy === "combined" && searchResults.length > 0) { // Merge and deduplicate results searchResults = mergeSearchResults(searchResults, keywordResults); } else { searchResults = keywordResults; } } // Apply minimum relevance filter if specified if (options.minRelevance) { searchResults = searchResults.filter( (result) => result.relevanceScore >= options.minRelevance ); } // Apply result limit if not already applied in search functions if (options.limit && searchResults.length > options.limit) { searchResults = searchResults.slice(0, options.limit); } // Return the search results return searchResults; } catch (error) { console.error("Error in searchByKeywords:", error); throw error; } } /** * Searches code entities using Full-Text Search * * @param {string[]} keywords - Keywords to search for * @param {SearchOptions} options - Search options * @returns {Promise<SearchResult[]>} Search results */ async function searchUsingFTS(keywords, options) { try { // Process keywords for FTS5 query const processedKeywords = keywords.map((keyword) => { // Apply stemming to match the behavior used when indexing content const stemmed = stem(keyword.toLowerCase()); // Sanitize special characters and escape quotes for FTS // Note: SQLite FTS5 has special handling for " and other special characters const sanitized = stemmed.replace( /[\\"\(\)\[\]\{\}\^\$\+\*\?\.]/g, (char) => `\\${char}` ); return sanitized; }); // Determine boolean operator based on options or use default // Default to OR for broader results, use AND for more specific matching const booleanOperator = options.booleanOperator?.toUpperCase() === "AND" ? "AND" : "OR"; // Construct the FTS query let ftsQuery; if (options.useExactMatch) { // For exact phrase matching, wrap the entire phrase in quotes ftsQuery = `"${processedKeywords.join(" ")}"`; } else if (options.useProximity && processedKeywords.length > 1) { // For proximity search, use NEAR operator with optional distance const distance = options.proximityDistance || 10; ftsQuery = `${processedKeywords.join(` NEAR/${distance} `)}`; } else { // Standard boolean search ftsQuery = processedKeywords.join(` ${booleanOperator} `); } // Check if the user provided explicit boolean syntax like "library AND file OR module" // If so, respect their input instead of our processing if ( keywords.length === 1 && /\s+(AND|OR|NOT|NEAR\/\d+)\s+/i.test(keywords[0]) ) { ftsQuery = keywords[0]; } // Start building the SQL query let sql = ` SELECT e.*, fts.rank as relevance_score FROM code_entities_fts fts JOIN code_entities e ON fts.rowid = e.rowid WHERE fts.code_entities_fts MATCH ? `; // Array to hold query parameters const queryParams = [ftsQuery]; // Apply filters from options if (options.entityTypes && options.entityTypes.length > 0) { const placeholders = options.entityTypes.map(() => "?").join(", "); sql += ` AND e.entity_type IN (${placeholders})`; queryParams.push(...options.entityTypes); } // Apply file path filters with proper wildcard handling if (options.filePaths && options.filePaths.length > 0) { sql += " AND ("; const filePathConditions = []; for (const pathPattern of options.filePaths) { // Handle glob patterns by converting to SQL LIKE patterns let sqlPattern = pathPattern .replace(/\*/g, "%") // Convert * to % .replace(/\?/g, "_"); // Convert ? to _ // Handle **/ pattern (recursive directory matching) sqlPattern = sqlPattern.replace(/%\/%/g, "%"); filePathConditions.push("e.file_path LIKE ?"); queryParams.push(sqlPattern); } sql += filePathConditions.join(" OR "); sql += ")"; } // Apply date range filter if (options.dateRange) { if (options.dateRange.start) { sql += " AND e.last_modified_at >= ?"; queryParams.push(options.dateRange.start.toISOString()); } if (options.dateRange.end) { sql += " AND e.last_modified_at <= ?"; queryParams.push(options.dateRange.end.toISOString()); } } // Apply custom ranking if available, otherwise use default FTS rank if (options.customRanking) { sql += ` ORDER BY ${options.customRanking}`; } else { // Enhance default ranking with optional boosts sql += ` ORDER BY relevance_score * CASE WHEN e.entity_type = 'file' THEN 1.2 WHEN e.entity_type = 'class' THEN 1.1 WHEN e.entity_type = 'function' THEN 1.0 ELSE 0.9 END DESC `; } // Apply limit with reasonable default const limit = options.limit && options.limit > 0 ? options.limit : 100; sql += " LIMIT ?"; queryParams.push(limit); // Execute the query const results = await executeQuery(sql, queryParams); // Map results to SearchResult objects return mapToSearchResults(results); } catch (error) { console.error("Error in searchUsingFTS:", error); throw error; } } /** * Searches code entities using the entity_keywords table * * @param {string[]} keywords - Keywords to search for * @param {SearchOptions} options - Search options * @returns {Promise<SearchResult[]>} Search results */ async function searchUsingKeywords(keywords, options) { try { // Handle single string input with boolean operators by splitting into individual terms let processedKeywords; if (keywords.length === 1 && /\s+(AND|OR|NOT)\s+/i.test(keywords[0])) { // Split the complex query string into individual terms, ignoring operators processedKeywords = keywords[0] .split(/\s+(?:AND|OR|NOT)\s+/i) .map((term) => term.trim()) .filter((term) => term.length > 0); } else { processedKeywords = keywords; } // Stem the keywords for more effective matching with the entity_keywords table const stemmedKeywords = processedKeywords.map((keyword) => stem(keyword.toLowerCase()) ); // Use prepared statement with placeholders for security let sql = ` SELECT e.*, SUM(ek.weight * (1.0 + (0.1 * count_matches))) as relevance_score FROM ( SELECT entity_id, COUNT(DISTINCT keyword) as count_matches, MAX(weight) as weight FROM entity_keywords WHERE keyword IN (${stemmedKeywords.map(() => "?").join(",")}) GROUP BY entity_id ) as ek JOIN code_entities e ON ek.entity_id = e.entity_id `; // Array to hold query parameters const queryParams = [...stemmedKeywords]; // Apply filters using our updated filter function sql = applyFilters(sql, options, queryParams); // Apply ranking with type-based boosts similar to searchUsingFTS if (options.sortBy) { sql += ` ORDER BY e.${options.sortBy}`; } else { // Provide entity-type-based boosting along with the keyword match score sql += ` ORDER BY relevance_score * CASE WHEN e.entity_type = 'file' THEN 1.2 WHEN e.entity_type = 'class' THEN 1.1 WHEN e.entity_type = 'function' THEN 1.0 ELSE 0.9 END DESC `; } // Apply limit with reasonable default const limit = options.limit && options.limit > 0 ? options.limit : 100; sql += " LIMIT ?"; queryParams.push(limit); // Execute the query const results = await executeQuery(sql, queryParams); // Map results to SearchResult objects return mapToSearchResults(results); } catch (error) { console.error("Error in searchUsingKeywords:", error); throw error; } } /** * Apply filters from search options to SQL query * Note: This function is mainly used by searchUsingKeywords. * The searchUsingFTS function now applies filters directly for better query construction. * * @param {string} sql - SQL query to enhance * @param {SearchOptions} options - Search options * @param {Array} queryParams - Query parameters array to append to * @returns {string} Enhanced SQL query with filters */ function applyFilters(sql, options, queryParams) { // The provided SQL should already have a WHERE clause, so we'll use AND // Apply entity type filters if (options.entityTypes && options.entityTypes.length > 0) { const placeholders = options.entityTypes.map(() => "?").join(", "); sql += ` AND e.entity_type IN (${placeholders})`; queryParams.push(...options.entityTypes); } // Apply file path filters with proper glob pattern support if (options.filePaths && options.filePaths.length > 0) { sql += " AND ("; const filePathConditions = []; for (const pathPattern of options.filePaths) { // Handle glob patterns by converting to SQL LIKE patterns let sqlPattern = pathPattern .replace(/\*/g, "%") // Convert * to % .replace(/\?/g, "_"); // Convert ? to _ // Handle **/ pattern (recursive directory matching) sqlPattern = sqlPattern.replace(/%\/%/g, "%"); filePathConditions.push("e.file_path LIKE ?"); queryParams.push(sqlPattern); } sql += filePathConditions.join(" OR "); sql += ")"; } // Apply date range filter if (options.dateRange) { if (options.dateRange.start) { sql += " AND e.last_modified_at >= ?"; queryParams.push(options.dateRange.start.toISOString()); } if (options.dateRange.end) { sql += " AND e.last_modified_at <= ?"; queryParams.push(options.dateRange.end.toISOString()); } } return sql; } /** * Map database results to SearchResult objects * * @param {Array} results - Database query results * @returns {Array<SearchResult>} Mapped search results */ function mapToSearchResults(results) { // Check if results has a rows property and it's an array const rows = results && results.rows && Array.isArray(results.rows) ? results.rows : Array.isArray(results) ? results : []; // If no valid results, return empty array if (rows.length === 0) { console.warn("No valid search results found to map"); return []; } return rows.map((row) => ({ entity: { entity_id: row.entity_id, file_path: row.file_path, entity_type: row.entity_type, name: row.name, parent_entity_id: row.parent_entity_id, content_hash: row.content_hash, raw_content: row.raw_content, start_line: row.start_line, end_line: row.end_line, language: row.language, created_at: row.created_at, last_modified_at: row.last_modified_at, }, relevanceScore: row.relevance_score, })); } /** * Merge and deduplicate search results from multiple sources * * @param {Array<SearchResult>} resultsA - First set of search results * @param {Array<SearchResult>} resultsB - Second set of search results * @returns {Array<SearchResult>} Merged and deduplicated results */ function mergeSearchResults(resultsA, resultsB) { // Create a map to deduplicate by entity_id const entityMap = new Map(); // Process the first result set (higher priority) for (const result of resultsA) { entityMap.set(result.entity.entity_id, result); } // Process the second result set, only adding entities not already present // or combining scores if the entity already exists for (const result of resultsB) { const entityId = result.entity.entity_id; if (entityMap.has(entityId)) { // Entity already exists, update relevance score // Using a weighted average here, favoring FTS results const existingResult = entityMap.get(entityId); const combinedScore = existingResult.relevanceScore * 0.7 + result.relevanceScore * 0.3; entityMap.set(entityId, { ...existingResult, relevanceScore: combinedScore, }); } else { // New entity, add to results entityMap.set(entityId, result); } } // Convert map back to array and sort by relevance score return Array.from(entityMap.values()).sort( (a, b) => b.relevanceScore - a.relevanceScore ); } /** * Calculate a custom relevance score for an entity based on non-vector factors * * @param {CodeEntity} entity - The code entity to score * @param {string[]} queryKeywords - Keywords from the search query * @param {string[]} [focusKeywords=[]] - Keywords representing the current focus area * @returns {number} A relevance score between 0 and 1 */ export function nonVectorRelevanceScore( entity, queryKeywords, focusKeywords = [] ) { // Ensure we have valid inputs if (!entity || !queryKeywords || queryKeywords.length === 0) { return 0; } // Initialize base score let score = 0.5; // Prepare keywords by stemming const stemmedQueryKeywords = queryKeywords.map((kw) => stem(kw.toLowerCase()) ); const stemmedFocusKeywords = focusKeywords.map((kw) => stem(kw.toLowerCase()) ); // 1. Keyword Matching Score const keywordMatchScore = calculateKeywordMatchScore( entity, stemmedQueryKeywords ); // 2. Focus Area Boost const focusBoost = calculateFocusAreaBoost(entity, stemmedFocusKeywords); // 3. Recency Factor const recencyFactor = calculateRecencyFactor(entity); // 4. Importance Score Factor const importanceFactor = entity.importance_score !== undefined ? entity.importance_score : 0.5; // 5. Type-Based Weighting const typeWeight = calculateTypeWeight(entity); // 6. Hierarchical Proximity (simplified first pass) const hierarchyBoost = 1.0; // Default value for now, can be enhanced later // Combine all factors with appropriate weights score = (keywordMatchScore * 0.35 + // 35% weight for keyword matching focusBoost * 0.2 + // 20% weight for focus area boost recencyFactor * 0.15 + // 15% weight for recency importanceFactor * 0.2 + // 20% weight for importance typeWeight * 0.1) * // 10% weight for entity type hierarchyBoost; // Apply hierarchy boost as a multiplier // Ensure score is between 0 and 1 return Math.max(0, Math.min(1, score)); } /** * Calculate keyword matching score based on entity content and query keywords * * @param {CodeEntity} entity - The code entity * @param {string[]} stemmedQueryKeywords - Stemmed query keywords * @returns {number} Keyword match score between 0 and 1 */ function calculateKeywordMatchScore(entity, stemmedQueryKeywords) { // Extract meaningful tokens from entity name and content const nameTokens = tokenize(entity.name || "").map((token) => stem(token.toLowerCase()) ); // Use summary if available, otherwise use raw_content const contentText = entity.summary || entity.raw_content || ""; const contentTokens = tokenize(contentText).map((token) => stem(token.toLowerCase()) ); // Combine unique tokens const entityTokens = Array.from(new Set([...nameTokens, ...contentTokens])); if (entityTokens.length === 0) return 0; // Calculate matches let nameMatches = 0; let contentMatches = 0; for (const queryKw of stemmedQueryKeywords) { // Check for matches in name (higher importance) if (nameTokens.includes(queryKw)) { nameMatches++; } // Check for matches in content else if (contentTokens.includes(queryKw)) { contentMatches++; } } // Calculate Jaccard index for overall similarity const matchingTokens = stemmedQueryKeywords.filter((kw) => entityTokens.includes(kw) ).length; const jaccardIndex = matchingTokens / (entityTokens.length + stemmedQueryKeywords.length - matchingTokens); // Calculate final keyword score with boosted name matches const nameMatchScore = (nameMatches / stemmedQueryKeywords.length) * 1.5; // 50% boost for name matches const contentMatchScore = contentMatches / stemmedQueryKeywords.length; const overallMatchScore = jaccardIndex * 0.5; // Base similarity return Math.min(1.0, nameMatchScore + contentMatchScore + overallMatchScore); } /** * Calculate focus area boost based on overlap with focus keywords * * @param {CodeEntity} entity - The code entity * @param {string[]} stemmedFocusKeywords - Stemmed focus area keywords * @returns {number} Focus area boost between 0 and 1 */ function calculateFocusAreaBoost(entity, stemmedFocusKeywords) { if (!stemmedFocusKeywords || stemmedFocusKeywords.length === 0) { return 0; } // Extract tokens from entity const entityText = [ entity.name || "", entity.summary || "", entity.raw_content || "", ].join(" "); const entityTokens = tokenize(entityText).map((token) => stem(token.toLowerCase()) ); // Count matching focus keywords const matchingFocusKeywords = stemmedFocusKeywords.filter((kw) => entityTokens.includes(kw) ).length; // Calculate focus boost based on proportion of matching focus keywords return matchingFocusKeywords / stemmedFocusKeywords.length; } /** * Calculate recency factor based on entity's last modified or accessed date * * @param {CodeEntity} entity - The code entity * @returns {number} Recency factor between 0 and 1 */ function calculateRecencyFactor(entity) { // Use last_modified_at or last_accessed_at, whichever is more recent const lastModified = entity.last_modified_at ? new Date(entity.last_modified_at) : null; const lastAccessed = entity.last_accessed_at ? new Date(entity.last_accessed_at) : null; if (!lastModified && !lastAccessed) { return 0.5; // Default value if no dates available } // Use the most recent date const mostRecentDate = !lastAccessed ? lastModified : !lastModified ? lastAccessed : lastAccessed > lastModified ? lastAccessed : lastModified; const now = new Date(); const ageInDays = (now - mostRecentDate) / (1000 * 60 * 60 * 24); // Exponential decay function: score = e^(-ageInDays/30) // This gives a score of ~1.0 for today, ~0.37 for 30 days ago, ~0.14 for 60 days ago return Math.exp(-ageInDays / 30); } /** * Calculate type-based weight for different entity types * * @param {CodeEntity} entity - The code entity * @returns {number} Type weight between 0 and 1 */ function calculateTypeWeight(entity) { // Define weights for different entity types const typeWeights = { function: 0.9, class: 0.9, method: 0.85, file: 0.8, variable: 0.75, comment: 0.5, default: 0.7, // Default weight for unknown types }; const entityType = (entity.entity_type || "").toLowerCase(); return typeWeights[entityType] || typeWeights.default; } /** * Retrieves code entities by their entity IDs * * @param {string[]} entityIds - Array of entity IDs to retrieve * @returns {Promise<CodeEntity[]>} Array of code entities */ export async function searchByEntityIds(entityIds) { try { // Validate input if (!entityIds || !Array.isArray(entityIds) || entityIds.length === 0) { throw new Error("Entity IDs array is required and cannot be empty"); } // Create placeholders for the IN clause const placeholders = entityIds.map(() => "?").join(", "); // Build and execute the query const sql = ` SELECT * FROM code_entities WHERE entity_id IN (${placeholders}) `; const results = await executeQuery(sql, entityIds); // Return the raw entity objects return results; } catch (error) { console.error("Error in searchByEntityIds:", error); throw error; } }