UNPKG

scai

Version:

> **A local-first AI CLI for understanding, querying, and iterating on large codebases.**
>
> **100% local • No token costs • No cloud • No prompt injection • Private by design**

918 lines (903 loc) 32.6 kB
// indexCmd.ts
import fg from 'fast-glob';
import path from 'path';
import lockfile from 'proper-lockfile';
import { initSchema } from '../db/schema.js';
import { getDbForRepo, getDbPathForRepo } from '../db/client.js';
import { upsertFileTemplate } from '../db/sqlTemplates.js';
import { detectFileType } from '../fileRules/detectFileType.js';
import { classifyFile } from '../fileRules/classifyFile.js';
import { IGNORED_FOLDER_GLOBS } from '../fileRules/ignoredPaths.js';
import { Config } from '../config.js';
import { log } from '../utils/log.js';
import { startDaemon } from '../commands/DaemonCmd.js';
import * as sqlTemplates from '../db/sqlTemplates.js';
import { RELATED_FILES_LIMIT } from '../constants.js';
import { generate } from '../lib/generate.js';
import { logInputOutput } from '../utils/promptLogHelper.js';
import { sanitizeQueryForFts } from '../utils/sanitizeQuery.js';
import { extractTaggedContent } from '../utils/parseTaggedContent.js';
import { extractFileReferences } from '../utils/extractFileReferences.js';

// FTS boolean operators that must never be emitted as plain search terms.
const QUERY_OPERATOR_TOKENS = new Set(["or", "and", "not", "near"]);

// Low-signal terms dropped from model-generated queries whenever stronger
// anchors (explicit filenames / symbols) are present.
const GENERIC_FTS_TERMS = new Set([
  "file",
  "files",
  "code",
  "source",
  "repository",
  "result",
  "results",
  "output",
  "entry",
  "database",
  "configuration",
]);

const MAX_PRIMARY_TERMS = 12;
const MAX_FALLBACK_TERMS = 10;

/* -------------------------------------------------- */
/* DB LOCK                                            */
/* -------------------------------------------------- */

/**
 * Acquires an exclusive file lock on the repo database.
 * @returns {Promise<() => Promise<void>>} release function from proper-lockfile.
 * @throws rethrows the lockfile error after logging it.
 */
async function lockDb() {
  try {
    return await lockfile.lock(getDbPathForRepo());
  } catch (err) {
    log('❌ Failed to acquire DB lock: ' + err);
    throw err;
  }
}

/* -------------------------------------------------- */
/* INDEX COMMAND                                      */
/* -------------------------------------------------- */

/**
 * Scans the configured index directory, enqueues every classifiable file
 * into the DB (summary/lastModified left null for the daemon to fill in),
 * then starts the background daemon.
 */
export async function runIndexCommand() {
  try {
    initSchema();
  } catch (err) {
    console.error('❌ Failed to initialize schema:', err);
    process.exit(1);
  }

  const indexDir = Config.getIndexDir() || process.cwd();
  Config.setIndexDir(indexDir);
  log(`📂 Scanning files in: ${indexDir}`);

  const files = await fg('**/*.*', {
    cwd: indexDir,
    ignore: IGNORED_FOLDER_GLOBS,
    absolute: true,
  });

  const db = getDbForRepo();
  const release = await lockDb();
  const countByExt = {};
  let count = 0;

  try {
    for (const file of files) {
      const classification = classifyFile(file);
      if (classification !== 'valid') {
        log(`⏭️ Skipping (${classification}): ${file}`);
        continue;
      }
      try {
        const type = detectFileType(file);
        // Store paths with forward slashes so lookups are OS-independent.
        const normalizedPath = path.normalize(file).replace(/\\/g, '/');
        const filename = path.basename(normalizedPath);
        db.prepare(upsertFileTemplate).run({
          path: normalizedPath,
          filename,
          summary: null,
          type,
          lastModified: null,
          indexedAt: null,
        });
        const ext = path.extname(file);
        countByExt[ext] = (countByExt[ext] || 0) + 1;
        count++;
      } catch (err) {
        log(`⚠️ Skipped in indexCmd ${file}: ${String(err)}`);
      }
    }
  } finally {
    // Always release the lock, even if the scan loop throws.
    await release();
  }

  log('📊 Discovered files by extension:', JSON.stringify(countByExt, null, 2));
  log(`✅ Done. Enqueued ${count} files for indexing.`);
  startDaemon();
}

/* -------------------------------------------------- */
/* QUERY API                                          */
/* -------------------------------------------------- */

/**
 * Runs a pre-sanitized FTS query against the files index.
 * @param {string} safeQuery - already sanitized FTS MATCH expression.
 * @param {number} [limit=10] - max rows returned.
 */
export function queryFiles(safeQuery, limit = 10) {
  const db = getDbForRepo();
  return db
    .prepare(sqlTemplates.queryFilesTemplate)
    .all(safeQuery, limit);
}

/* -------------------------------------------------- */
/* SEMANTIC SEARCH                                    */
/* -------------------------------------------------- */

/** Maps a DB row to the result shape shared by all search entry points. */
function toResultRow(r, queryExpansionTerms) {
  return {
    id: r.id,
    path: r.path,
    filename: r.filename,
    summary: r.summary,
    type: r.type,
    lastModified: r.lastModified,
    bm25Score: r.bm25Score,
    queryExpansionTerms,
  };
}

/**
 * Three-stage file search:
 *   1) target-first resolution of explicit files/symbols from intent,
 *   2) LLM-generated primary FTS query,
 *   3) diversified LLM fallback queries.
 * @param {string} originalQuery - raw user query.
 * @param {number} [topK=5] - max results.
 * @param {{targetFiles?: string[], targetSymbols?: string[]}} [intent]
 */
export async function semanticSearchFiles(originalQuery, topK = 5, intent = {}) {
  const db = getDbForRepo();
  const summarizeRows = (rows) => rows.map((r) => `${r.filename} | bm25Score=${r.bm25Score}`);

  const targetFiles = Array.isArray(intent.targetFiles) ? dedupeNormalizedStrings(intent.targetFiles) : [];
  const targetSymbols = Array.isArray(intent.targetSymbols) ? dedupeNormalizedStrings(intent.targetSymbols) : [];

  // 1) Target-first path.
  // Example: targetFiles=["semanticAnalysisModule.ts","typescript.ts"] -> return only those
  // resolved rows and skip broad FTS expansion.
  if (targetFiles.length > 0 || targetSymbols.length > 0) {
    // Deterministic resolution order:
    // 1) files by explicit path/name 2) files by exact symbol ownership.
    // Example: targetFiles=["fileIndex.ts"], targetSymbols=["semanticSearchFiles"].
    const { resolvedRows: fileRows, unresolvedTargets } = resolveTargetRows(db, targetFiles);
    const exactSymbolRows = resolveSymbolRows(db, targetSymbols);
    // Pattern matching is a last resort: only when exact symbol lookup found nothing.
    const patternSymbolRows =
      targetSymbols.length > 0 && exactSymbolRows.length === 0
        ? resolveSymbolRowsByPattern(db, targetSymbols)
        : [];
    const symbolRows = mergeRows(exactSymbolRows, patternSymbolRows);
    const resolvedRows = mergeRows(fileRows, symbolRows);

    const queryExpansionTerms = buildQueryExpansionTerms(originalQuery, [
      ...targetFiles.map((target) => `"${path.basename(target)}"`),
      ...targetSymbols.flatMap((symbol) =>
        buildSymbolSearchForms(symbol)
          .map((form) => normalizeFtsTerm(form))
          .filter(Boolean)
          .map((form) => `${form}*`)),
    ]);

    logInputOutput("semanticSearchFiles target resolution", "output", {
      originalQuery,
      intent,
      targetFiles,
      targetSymbols,
      resolvedCount: resolvedRows.length,
      fileResolvedCount: fileRows.length,
      symbolResolvedCount: symbolRows.length,
      symbolResolvedExactCount: exactSymbolRows.length,
      symbolResolvedPatternCount: patternSymbolRows.length,
      unresolvedTargets,
      resolvedFiles: resolvedRows.map((row) => row.path),
    });

    // Target-first short-circuit: if we resolved any explicit targets, keep scope tight.
    if (resolvedRows.length > 0) {
      return resolvedRows.slice(0, topK).map((r) => toResultRow(r, queryExpansionTerms));
    }
  }

  // 2) Broad search path (fallback when target resolution failed).
  // Example query: "add comments in parser module" -> model emits OR-joined FTS terms.
  const primaryFtsQuery = await generatePrimaryFtsQuery(originalQuery, intent);
  const primaryExpansionTerms = buildQueryExpansionTerms(originalQuery, [primaryFtsQuery]);
  logInputOutput("semanticSearchFiles LLM primary query", "output", {
    originalQuery,
    intent,
    ftsQuery: primaryFtsQuery,
  });

  const primaryResults = db
    .prepare(sqlTemplates.searchFilesTemplate)
    .all(primaryFtsQuery, RELATED_FILES_LIMIT);
  logInputOutput("semanticSearchFiles initial results", "output", {
    stage: "primary",
    originalQuery,
    ftsQuery: primaryFtsQuery,
    resultCount: primaryResults.length,
    results: summarizeRows(primaryResults),
  });

  if (primaryResults.length > 0) {
    return rankAndMap(new Map(primaryResults.map(r => [r.id, r])), topK, primaryExpansionTerms);
  }

  // 3) If primary returns nothing, try diversified fallback queries.
  const fallbackQuery = await generateFallbackFtsQueries(originalQuery, primaryFtsQuery, intent);
  logInputOutput("semanticSearchFiles LLM fallback query", "output", {
    originalQuery,
    primaryFtsQuery,
    fallbackQuery,
  });

  if (fallbackQuery && fallbackQuery.length > 0) {
    const fallbackExpansionTerms = buildQueryExpansionTerms(originalQuery, [primaryFtsQuery, ...fallbackQuery]);
    const stmt = db.prepare(sqlTemplates.searchFilesTemplate);
    for (const query of fallbackQuery) {
      const rows = stmt.all(query, RELATED_FILES_LIMIT);
      logInputOutput("semanticSearchFiles initial results", "output", {
        stage: "fallback",
        originalQuery,
        ftsQuery: query,
        resultCount: rows.length,
        results: summarizeRows(rows),
      });
      // First fallback query with hits wins; later queries are not tried.
      if (rows.length > 0) {
        return rankAndMap(new Map(rows.map(r => [r.id, r])), topK, fallbackExpansionTerms);
      }
    }
  }

  return [];
}

/**
 * Resolves explicit file targets to DB rows.
 * Resolution strategy per target: exact path -> path suffix -> exact filename
 * -> fuzzy filename (punctuation-insensitive). Ambiguous targets (0 or >1
 * matches at the final stage) are reported back as unresolved.
 */
function resolveTargetRows(db, targetFiles) {
  const resolvedRows = [];
  const unresolvedTargets = [];
  const seenPaths = new Set();

  // NOTE: the stored paths are forward-slash normalized at index time; the
  // REPLACE here is defensive for rows written by older versions.
  // FIX: was REPLACE(path, '\\\\', '/') which, since SQLite string literals
  // have no backslash escapes, replaced *double* backslashes only.
  const byExactPath = db.prepare(`
    SELECT id, path, filename, summary, type, 0 AS bm25Score, last_modified AS lastModified
    FROM files
    WHERE REPLACE(path, '\\', '/') = ?
    LIMIT 1
  `);
  const byFilename = db.prepare(`
    SELECT id, path, filename, summary, type, 0 AS bm25Score, last_modified AS lastModified
    FROM files
    WHERE LOWER(filename) = LOWER(?)
  `);
  const byFilenameLike = db.prepare(`
    SELECT id, path, filename, summary, type, 0 AS bm25Score, last_modified AS lastModified
    FROM files
    WHERE LOWER(filename) LIKE '%' || LOWER(?) || '%'
  `);

  for (const rawTarget of targetFiles) {
    const target = String(rawTarget ?? "").trim();
    if (!target) continue;
    const normalizedTarget = target.replace(/\\/g, "/");

    let matches = byExactPath.all(normalizedTarget);

    if (matches.length === 0 && normalizedTarget.includes("/")) {
      // Suffix match anchored at a path separator, e.g. "src/fileIndex.ts".
      matches = db.prepare(`
        SELECT id, path, filename, summary, type, 0 AS bm25Score, last_modified AS lastModified
        FROM files
        WHERE LOWER(REPLACE(path, '\\', '/')) LIKE '%' || LOWER(?)
      `).all(`/${normalizedTarget}`);
    }

    if (matches.length === 0) {
      matches = byFilename.all(path.basename(normalizedTarget));
    }

    if (matches.length === 0) {
      // Fuzzy: compare punctuation-stripped filename (or stem) keys.
      const targetName = path.basename(normalizedTarget);
      const targetKey = normalizeFileLookupToken(targetName);
      if (targetKey) {
        const candidates = byFilenameLike.all(targetName);
        matches = candidates.filter((row) => {
          const filenameKey = normalizeFileLookupToken(row.filename);
          const stemKey = normalizeFileLookupToken(path.parse(row.filename).name);
          return filenameKey === targetKey || stemKey === targetKey;
        });
      }
    }

    // If multiple matches exist, mark unresolved to avoid ambiguous selection.
    if (matches.length !== 1) {
      unresolvedTargets.push(target);
      continue;
    }

    const row = matches[0];
    if (seenPaths.has(row.path)) continue;
    seenPaths.add(row.path);
    resolvedRows.push(row);
  }

  return { resolvedRows, unresolvedTargets };
}

/** Lowercases and strips every non-alphanumeric char, e.g. "File-Index.ts" -> "fileindexts". */
function normalizeFileLookupToken(value) {
  return String(value ?? "").toLowerCase().replace(/[^a-z0-9]/g, "").trim();
}

/** Same normalization as normalizeFileLookupToken, kept separate for intent clarity. */
function normalizeSymbolLookupToken(value) {
  return String(value ?? "").toLowerCase().replace(/[^a-z0-9]/g, "").trim();
}

/**
 * Builds the lookup forms tried for a symbol: the raw symbol plus its
 * normalized identifier form when that differs (e.g. "my_func" -> "myfunc").
 */
function buildSymbolSearchForms(rawSymbol) {
  const symbol = String(rawSymbol ?? "").trim();
  if (!symbol) return [];
  const forms = [symbol];
  const normalized = normalizeSymbolLookupToken(symbol);
  if (normalized && normalized !== symbol.toLowerCase()) {
    forms.push(normalized);
  }
  return dedupeNormalizedStrings(forms);
}

/**
 * Resolves symbols to the files that declare them (functions or classes),
 * trying exact names first, then normalized identifier forms, then filename
 * stems (e.g. "semantic analysis module" -> stem "semanticanalysismodule").
 */
function resolveSymbolRows(db, targetSymbols) {
  if (targetSymbols.length === 0) return [];
  const rows = [];
  const seenPaths = new Set();

  const exactSymbolStmt = db.prepare(`
    SELECT DISTINCT f.id, f.path, f.filename, f.summary, f.type, 0 AS bm25Score, f.last_modified AS lastModified
    FROM files f
    LEFT JOIN functions fn ON fn.file_id = f.id
    LEFT JOIN graph_classes gc ON gc.file_id = f.id
    WHERE LOWER(fn.name) = LOWER(?)
       OR LOWER(gc.name) = LOWER(?)
  `);
  const normalizedSymbolStmt = db.prepare(`
    SELECT DISTINCT f.id, f.path, f.filename, f.summary, f.type, 0 AS bm25Score, f.last_modified AS lastModified
    FROM files f
    LEFT JOIN functions fn ON fn.file_id = f.id
    LEFT JOIN graph_classes gc ON gc.file_id = f.id
    WHERE LOWER(REPLACE(REPLACE(REPLACE(fn.name, '_', ''), '-', ''), ' ', '')) = LOWER(?)
       OR LOWER(REPLACE(REPLACE(REPLACE(gc.name, '_', ''), '-', ''), ' ', '')) = LOWER(?)
  `);
  const normalizedFileStemStmt = db.prepare(`
    SELECT DISTINCT f.id, f.path, f.filename, f.summary, f.type, 0 AS bm25Score, f.last_modified AS lastModified
    FROM files f
    WHERE LOWER(
      REPLACE(
        REPLACE(
          REPLACE(
            CASE
              WHEN INSTR(f.filename, '.') > 0 THEN SUBSTR(f.filename, 1, INSTR(f.filename, '.') - 1)
              ELSE f.filename
            END,
            '_', ''
          ),
          '-', ''
        ),
        ' ', ''
      )
    ) = LOWER(?)
  `);

  for (const rawSymbol of targetSymbols) {
    const symbol = String(rawSymbol ?? "").trim();
    if (!symbol) continue;

    const symbolForms = buildSymbolSearchForms(symbol);
    const candidateRows = new Map();

    for (const form of symbolForms) {
      const exactMatches = exactSymbolStmt.all(form, form);
      for (const row of exactMatches) {
        candidateRows.set(row.path, row);
      }
    }

    const normalizedSymbol = normalizeSymbolLookupToken(symbol);
    if (normalizedSymbol) {
      const normalizedMatches = normalizedSymbolStmt.all(normalizedSymbol, normalizedSymbol);
      for (const row of normalizedMatches) {
        candidateRows.set(row.path, row);
      }
      const filenameMatches = normalizedFileStemStmt.all(normalizedSymbol);
      for (const row of filenameMatches) {
        candidateRows.set(row.path, row);
      }
    }

    for (const row of candidateRows.values()) {
      if (seenPaths.has(row.path)) continue;
      seenPaths.add(row.path);
      rows.push(row);
    }
  }

  return rows;
}

/**
 * Broad LIKE-based symbol resolution over names, paths, summaries and FTS
 * content. Used only when exact symbol resolution found nothing; results are
 * capped at RELATED_FILES_LIMIT.
 */
function resolveSymbolRowsByPattern(db, targetSymbols) {
  if (targetSymbols.length === 0) return [];
  const rows = [];
  const seenPaths = new Set();
  const candidatePatterns = new Set();
  const candidateParts = new Set();

  for (const rawSymbol of targetSymbols) {
    for (const pattern of buildSymbolLikePatterns(rawSymbol)) {
      candidatePatterns.add(pattern);
    }
    // Short fragments (< 3 chars) match far too broadly to be useful.
    for (const part of splitSymbolParts(rawSymbol)) {
      if (part.length >= 3) candidateParts.add(part);
    }
  }

  if (candidatePatterns.size === 0 && candidateParts.size === 0) {
    return [];
  }

  const patternStmt = db.prepare(`
    SELECT DISTINCT f.id, f.path, f.filename, f.summary, f.type, 0 AS bm25Score, f.last_modified AS lastModified
    FROM files f
    LEFT JOIN files_fts fts ON f.id = fts.rowid
    LEFT JOIN functions fn ON fn.file_id = f.id
    LEFT JOIN graph_classes gc ON gc.file_id = f.id
    WHERE LOWER(COALESCE(fn.name, '')) LIKE LOWER(?)
       OR LOWER(COALESCE(gc.name, '')) LIKE LOWER(?)
       OR LOWER(COALESCE(f.filename, '')) LIKE LOWER(?)
       OR LOWER(COALESCE(f.path, '')) LIKE LOWER(?)
       OR LOWER(COALESCE(f.summary, '')) LIKE LOWER(?)
       OR LOWER(COALESCE(fts.content_text, '')) LIKE LOWER(?)
    LIMIT ?
  `);
  const partStmt = db.prepare(`
    SELECT DISTINCT f.id, f.path, f.filename, f.summary, f.type, 0 AS bm25Score, f.last_modified AS lastModified
    FROM files f
    LEFT JOIN files_fts fts ON f.id = fts.rowid
    LEFT JOIN functions fn ON fn.file_id = f.id
    LEFT JOIN graph_classes gc ON gc.file_id = f.id
    WHERE LOWER(COALESCE(fn.name, '')) LIKE LOWER(?)
       OR LOWER(COALESCE(gc.name, '')) LIKE LOWER(?)
       OR LOWER(COALESCE(fts.content_text, '')) LIKE LOWER(?)
    LIMIT ?
  `);

  for (const pattern of candidatePatterns) {
    const likePattern = `%${escapeSqlLikeTerm(pattern)}%`;
    const matches = patternStmt.all(likePattern, likePattern, likePattern, likePattern, likePattern, likePattern, RELATED_FILES_LIMIT);
    for (const row of matches) {
      if (seenPaths.has(row.path)) continue;
      seenPaths.add(row.path);
      rows.push(row);
    }
  }

  for (const part of candidateParts) {
    const likePattern = `%${escapeSqlLikeTerm(part)}%`;
    const matches = partStmt.all(likePattern, likePattern, likePattern, RELATED_FILES_LIMIT);
    for (const row of matches) {
      if (seenPaths.has(row.path)) continue;
      seenPaths.add(row.path);
      rows.push(row);
    }
  }

  return rows.slice(0, RELATED_FILES_LIMIT);
}

/**
 * Splits a symbol at camelCase boundaries and non-alphanumerics.
 * Example: "semanticSearchFiles" -> ["semantic", "search", "files"].
 */
function splitSymbolParts(rawSymbol) {
  const symbol = String(rawSymbol ?? "").trim();
  if (!symbol) return [];
  const withCamelBreaks = symbol.replace(/([a-z0-9])([A-Z])/g, "$1 $2");
  const parts = withCamelBreaks
    .split(/[^A-Za-z0-9]+/)
    .map((part) => part.toLowerCase().trim())
    .filter(Boolean);
  return Array.from(new Set(parts));
}

/**
 * Builds LIKE patterns for a symbol: its compact form, the joined parts, and
 * the parts joined with '%' wildcards (matches interleaved separators).
 */
function buildSymbolLikePatterns(rawSymbol) {
  const symbol = String(rawSymbol ?? "").trim();
  if (!symbol) return [];
  const patterns = new Set();
  const compact = normalizeSymbolLookupToken(symbol);
  const parts = splitSymbolParts(symbol);
  if (compact) patterns.add(compact);
  if (parts.length > 0) {
    patterns.add(parts.join(""));
    patterns.add(parts.join("%"));
  }
  return Array.from(patterns);
}

// NOTE(review): this *strips* LIKE wildcards rather than escaping them
// (escaping would require an ESCAPE clause in every LIKE above). Stripping is
// safe but slightly lossy for terms that genuinely contain '%' or '_'.
function escapeSqlLikeTerm(value) {
  return String(value ?? "").replace(/[%_]/g, "");
}

/**
 * Merges row groups, keeping deterministic priority from call order and
 * removing duplicates by path. Example: if a file target and a symbol target
 * resolve to the same file, it is returned once.
 */
function mergeRows(...groups) {
  const out = [];
  const seenPaths = new Set();
  for (const group of groups) {
    for (const row of group) {
      if (seenPaths.has(row.path)) continue;
      seenPaths.add(row.path);
      out.push(row);
    }
  }
  return out;
}

/* -------------------------------------------------- */
/* LLM → FTS QUERY GENERATION (TAG-BASED)             */
/* -------------------------------------------------- */

/**
 * Asks the model for one OR-joined FTS query for the user's request.
 * The model output is always normalized by enforceFtsQueryPolicy; on any
 * generation error we fall back to policy-normalizing the raw user query.
 */
async function generatePrimaryFtsQuery(userQuery, intent = {}) {
  const prompt = `
You are generating a SQLite FTS query for searching a source code repository.

The user query may refer to:
- High-level intent
- Domain terminology
- Specific filenames, file types, or configuration files

Input: "${userQuery}"

Task:
1. Extract high-level intent terms
2. Expand to related domain-specific terminology
3. Expand to likely filenames, config files, or structural artifacts if relevant
4. Combine ALL useful terms into ONE OR-joined FTS query

Rules:
- Output ONLY the OR-joined terms
- Max 12 total terms
- Use OR between terms
- Include filenames when relevant
- No explanations
- No natural language sentences

Wrap the result in <FILE_CONTENT> tags.

<FILE_CONTENT>
term1 OR term2 OR term3
</FILE_CONTENT>
`.trim();

  // Example: "code* OR file*" + target file -> keeps anchored file term first.
  try {
    const response = await generate({ content: prompt, query: "" });
    const rawText = String(response.data ?? "");
    const { content } = extractTaggedContent(rawText, "FILE_CONTENT");
    return enforceFtsQueryPolicy(userQuery, content, MAX_PRIMARY_TERMS, intent);
  } catch (err) {
    // FIX: error was silently swallowed; log it before degrading gracefully.
    log(`⚠️ [semanticSearchFiles] Primary FTS generation failed: ${String(err)}`);
    return enforceFtsQueryPolicy(userQuery, userQuery, MAX_PRIMARY_TERMS, intent);
  }
}

/**
 * Asks the model for up to five diversified fallback FTS queries after the
 * primary query returned zero rows.
 * @returns {Promise<string[]|null>} normalized queries, or null on failure.
 */
async function generateFallbackFtsQueries(userQuery, failedQuery, intent = {}) {
  // FIX: prompt said "Generate 35 independent FTS queries" — contradicted its
  // own "(MAX 5)" cap and the 5-line output format; it is "3-5".
  const prompt = `
You are generating fallback SQLite FTS queries for a source code repository search.

Original user query: "${userQuery}"

Primary FTS query returned ZERO results: "${failedQuery}"

Task: Generate 3-5 independent FTS queries (MAX 5).

For each query:
1. Think at a different abstraction level (intent-level, domain-level, structural-level).
2. Include filenames, file types, modules, config files, or symbols when relevant.
3. Use a single OR-joined expression.
4. Max 10 terms per query.

Rules:
- Avoid natural language sentences
- No explanations
- No commentary
- Each line must be one complete OR expression

Output format (STRICT):
<FILE_CONTENT>
query1
query2
query3
query4
query5
</FILE_CONTENT>
`.trim();

  try {
    const response = await generate({ content: prompt, query: "" });
    const rawText = String(response.data ?? "");
    const { content } = extractTaggedContent(rawText, "FILE_CONTENT");
    // Each candidate fallback line is sanitized/anchored the same way as primary.
    const subQueries = content
      .split(/\r?\n/)
      .map(q => enforceFtsQueryPolicy(userQuery, q.trim(), MAX_FALLBACK_TERMS, intent))
      .filter(Boolean)
      .slice(0, 5);
    if (!subQueries.length) {
      throw new Error("No fallback subqueries generated");
    }
    return subQueries;
  } catch (err) {
    log(`⚠️ [semanticSearchFiles] Fallback FTS generation failed: ${String(err)}`);
    return null;
  }
}

/* -------------------------------------------------- */
/* PLANNER SEARCH                                     */
/* -------------------------------------------------- */

/**
 * Planner-facing search: tries the planner's own sanitized query first, then
 * LLM-generated primary/fallback queries, then the legacy model expansion.
 * @param {string} originalQuery - raw user query (used for LLM expansion).
 * @param {string} query - planner-supplied FTS query candidate.
 * @param {number} [topK=5]
 */
export async function plannerSearchFiles(originalQuery, query, topK = 5) {
  const db = getDbForRepo();
  const seen = new Map();
  const usedQueries = [];

  const safeQuery = sanitizeQueryForFts(query);
  if (safeQuery) usedQueries.push(safeQuery);

  const primaryResults = db
    .prepare(sqlTemplates.searchFilesTemplate)
    .all(safeQuery, RELATED_FILES_LIMIT);
  primaryResults.forEach(r => seen.set(r.id, r));
  logInputOutput("plannerSearchFiles primary FTS", "input", {
    safeQuery,
    count: primaryResults.length,
  });

  if (primaryResults.length === 0) {
    const stmt = db.prepare(sqlTemplates.searchFilesTemplate);
    const llmPrimaryQuery = await generatePrimaryFtsQuery(originalQuery);
    const llmFallbackQueries = await generateFallbackFtsQueries(originalQuery, llmPrimaryQuery || safeQuery);

    const candidateQueries = [];
    // Collect unique, sanitized, non-redundant candidates in priority order.
    const pushCandidate = (q) => {
      if (!q) return;
      const sanitized = sanitizeQueryForFts(q);
      if (!sanitized) return;
      if (sanitized === safeQuery) return;
      if (candidateQueries.includes(sanitized)) return;
      candidateQueries.push(sanitized);
      usedQueries.push(sanitized);
    };

    pushCandidate(llmPrimaryQuery);
    (llmFallbackQueries ?? []).forEach(pushCandidate);

    // Keep legacy planner expansion as final fallback if LLM query generation
    // did not add anything useful.
    if (candidateQueries.length === 0) {
      pushCandidate(await expandQueryWithModel(originalQuery));
    }

    for (const candidateQuery of candidateQueries) {
      const rows = stmt.all(candidateQuery, RELATED_FILES_LIMIT);
      logInputOutput("plannerSearchFiles expanded FTS", "output", {
        stage: "llm-expanded",
        originalQuery,
        ftsQuery: candidateQuery,
        resultCount: rows.length,
      });
      rows.forEach(r => {
        if (!seen.has(r.id)) seen.set(r.id, r);
      });
      if (seen.size > 0) break;
    }
  }

  if (seen.size === 0) return [];
  return rankAndMap(seen, topK, buildQueryExpansionTerms(originalQuery, usedQueries));
}

/* -------------------------------------------------- */
/* HELPERS                                            */
/* -------------------------------------------------- */

/**
 * Ranks the collected rows and maps them to the public result shape.
 * Lower BM25 is better in SQLite FTS, so ascending sort gives best matches first.
 */
function rankAndMap(seen, topK, queryExpansionTerms = []) {
  return Array.from(seen.values())
    .sort((a, b) => (a.bm25Score ?? 0) - (b.bm25Score ?? 0))
    .slice(0, topK)
    .map(r => toResultRow(r, queryExpansionTerms));
}

/**
 * Builds a deduped token list used as retrieval metadata for downstream steps.
 * Example:
 * - original: "update fileIndex.ts logging"
 * - queries: ["\"fileIndex.ts\" OR logging*"]
 * - output contains: ["update", "fileindex", "ts", "logging"]
 */
function buildQueryExpansionTerms(originalQuery, queries) {
  const terms = new Set();
  const addTokens = (text) => {
    for (const token of tokenizeQueryTerms(text)) {
      if (!QUERY_OPERATOR_TOKENS.has(token)) {
        terms.add(token);
      }
    }
  };
  addTokens(originalQuery);
  for (const query of queries) addTokens(query);
  return Array.from(terms);
}

/**
 * Applies strict normalization to an LLM-generated FTS query:
 * 1) sanitize, 2) split into OR terms, 3) add anchors, 4) drop generic noise,
 * 5) dedupe, 6) cap term count.
 * Example:
 * - user: "change semanticSearchFiles in fileIndex.ts"
 * - candidate: "code* OR file* OR output*"
 * - output: "\"fileIndex.ts\" OR semanticsearchfiles*"
 *   (generic terms like output* are dropped because anchors exist)
 */
function enforceFtsQueryPolicy(userQuery, candidateQuery, maxTerms, intent = {}) {
  const safe = sanitizeQueryForFts(candidateQuery);
  const candidateTerms = splitOrTerms(safe);
  const prioritizedAnchors = buildAnchorTerms(userQuery, intent);
  const hasAnchors = prioritizedAnchors.length > 0;

  const filtered = candidateTerms.filter(term => {
    const normalized = normalizeFtsTerm(term);
    if (!normalized || QUERY_OPERATOR_TOKENS.has(normalized)) return false;
    if (hasAnchors && GENERIC_FTS_TERMS.has(normalized)) return false;
    return true;
  });

  const merged = dedupeTerms([...prioritizedAnchors, ...filtered]);
  return merged.slice(0, maxTerms).join(" OR ");
}

/**
 * Splits an OR query into raw terms.
 * Example: "\"fileIndex.ts\" OR semanticsearchfiles* OR output*" ->
 *   ["\"fileIndex.ts\"", "semanticsearchfiles*", "output*"]
 */
function splitOrTerms(query) {
  return query
    .split(/\s+OR\s+/i)
    .map(part => part.trim())
    .filter(Boolean);
}

/**
 * Normalizes an FTS term for stable comparisons by removing quotes/wildcards.
 * Example: "\"fileIndex.ts\"" -> "fileindex.ts", "Module*" -> "module"
 */
function normalizeFtsTerm(term) {
  return term.replace(/["*]/g, "").toLowerCase().trim();
}

/**
 * Dedupes terms using normalized keys while preserving first-seen order.
 * Example: ["code*", "CODE*", "\"code\""] -> ["code*"]
 */
function dedupeTerms(terms) {
  const seen = new Set();
  const out = [];
  for (const term of terms) {
    const key = normalizeFtsTerm(term);
    if (!key || seen.has(key)) continue;
    seen.add(key);
    out.push(term);
  }
  return out;
}

/**
 * Extracts high-signal anchors from user text (explicit filenames + likely symbols).
 * Example:
 * - user: "update semanticSearchFiles in fileIndex.ts"
 * - output includes: ["\"fileIndex.ts\"", "semanticsearchfiles*"]
 */
function buildAnchorTerms(userQuery, intent = {}) {
  const anchorTerms = [];
  const explicitFiles = extractFileReferences(userQuery);
  const targetFiles = Array.isArray(intent.targetFiles) ? dedupeNormalizedStrings(intent.targetFiles) : [];
  const targetSymbols = Array.isArray(intent.targetSymbols) ? dedupeNormalizedStrings(intent.targetSymbols) : [];

  // FIX: both filename branches pushed the literal string "$(unknown)" instead
  // of interpolating the extracted filename, so every file anchor was useless.
  for (const fileRef of targetFiles) {
    for (const matchedFile of extractFileReferences(fileRef)) {
      const filename = path.basename(matchedFile);
      if (filename) anchorTerms.push(`"${filename}"`);
    }
  }
  for (const fileRef of explicitFiles) {
    const filename = path.basename(fileRef);
    if (filename) anchorTerms.push(`"${filename}"`);
  }

  for (const symbol of targetSymbols) {
    for (const form of buildSymbolSearchForms(symbol)) {
      const normalized = normalizeFtsTerm(form);
      if (!normalized) continue;
      if (QUERY_OPERATOR_TOKENS.has(normalized)) continue;
      if (GENERIC_FTS_TERMS.has(normalized)) continue;
      anchorTerms.push(`${normalized}*`);
    }
  }

  // Heuristic symbol detection in free text: camelCase / snake_case tokens,
  // plus common suffix conventions used in this codebase.
  const symbolMatches = userQuery.match(/[A-Za-z_][A-Za-z0-9_]*/g) ?? [];
  for (const token of symbolMatches) {
    const isLikelySymbol =
      /[A-Z]/.test(token) ||
      token.includes("_") ||
      token.endsWith("Step") ||
      token.endsWith("Module");
    if (!isLikelySymbol) continue;
    const normalized = token.toLowerCase();
    if (QUERY_OPERATOR_TOKENS.has(normalized)) continue;
    if (GENERIC_FTS_TERMS.has(normalized)) continue;
    anchorTerms.push(`${normalized}*`);
  }

  return dedupeTerms(anchorTerms);
}

/**
 * Lightweight normalization only (trim + slash normalization + case-insensitive dedupe).
 * Example: [" FileIndex.ts ", "fileindex.ts"] -> ["FileIndex.ts"].
 */
function dedupeNormalizedStrings(tokens) {
  const out = [];
  const seen = new Set();
  for (const token of tokens) {
    if (typeof token !== "string") continue;
    const normalized = token.trim().replace(/\\/g, "/");
    if (!normalized) continue;
    const key = normalized.toLowerCase();
    if (seen.has(key)) continue;
    seen.add(key);
    out.push(normalized);
  }
  return out;
}

/**
 * Tokenizes text into lowercase alphanumeric terms for query expansion metadata.
 * Example: "\"fileIndex.ts\" OR semanticsearchfiles*" -> ["fileindex", "ts", "semanticsearchfiles"]
 */
function tokenizeQueryTerms(text) {
  const matches = text.toLowerCase().match(/[a-z_][a-z0-9_]{1,}/g) ?? [];
  return Array.from(new Set(matches));
}

/**
 * Legacy model-based expansion: asks for concrete code-level search terms and
 * sanitizes the tagged output. Returns null on any failure (best-effort).
 */
async function expandQueryWithModel(query) {
  const prompt = `
Return concrete search terms likely to appear in source code.
Wrap the result in <FILE_CONTENT> tags.
Question: "${query}"
`.trim();
  try {
    const response = await generate({ content: prompt, query: "" });
    const rawText = String(response.data ?? "");
    const { content } = extractTaggedContent(rawText, "FILE_CONTENT");
    return sanitizeQueryForFts(content);
  } catch {
    return null;
  }
}