scai
Version:
> **A local-first AI CLI for understanding, querying, and iterating on large codebases.**
> **100% local • No token costs • No cloud • No prompt injection • Private by design**
918 lines (903 loc) • 32.6 kB
JavaScript
// indexCmd.ts
import fg from 'fast-glob';
import path from 'path';
import lockfile from 'proper-lockfile';
import { initSchema } from '../db/schema.js';
import { getDbForRepo, getDbPathForRepo } from '../db/client.js';
import { upsertFileTemplate } from '../db/sqlTemplates.js';
import { detectFileType } from '../fileRules/detectFileType.js';
import { classifyFile } from '../fileRules/classifyFile.js';
import { IGNORED_FOLDER_GLOBS } from '../fileRules/ignoredPaths.js';
import { Config } from '../config.js';
import { log } from '../utils/log.js';
import { startDaemon } from '../commands/DaemonCmd.js';
import * as sqlTemplates from '../db/sqlTemplates.js';
import { RELATED_FILES_LIMIT } from '../constants.js';
import { generate } from '../lib/generate.js';
import { logInputOutput } from '../utils/promptLogHelper.js';
import { sanitizeQueryForFts } from '../utils/sanitizeQuery.js';
import { extractTaggedContent } from '../utils/parseTaggedContent.js';
import { extractFileReferences } from '../utils/extractFileReferences.js';
// FTS operator words that must never be emitted or counted as search terms.
const QUERY_OPERATOR_TOKENS = new Set(["or", "and", "not", "near"]);
// Low-signal words dropped from model-generated queries whenever stronger
// anchor terms (explicit filenames/symbols) are present.
const GENERIC_FTS_TERMS = new Set([
  "file",
  "files",
  "code",
  "source",
  "repository",
  "result",
  "results",
  "output",
  "entry",
  "database",
  "configuration",
]);
// Hard caps on OR-joined term counts for the primary and fallback FTS queries.
const MAX_PRIMARY_TERMS = 12;
const MAX_FALLBACK_TERMS = 10;
/* -------------------------------------------------- */
/* DB LOCK */
/* -------------------------------------------------- */
/**
 * Acquires an advisory lock on the repo database file so only one process
 * mutates it at a time. On failure the error is logged and rethrown so the
 * caller can abort instead of writing without the lock.
 *
 * @returns {Promise<Function>} release callback from proper-lockfile.
 */
async function lockDb() {
  const dbPath = getDbPathForRepo();
  try {
    return await lockfile.lock(dbPath);
  } catch (err) {
    log(`❌ Failed to acquire DB lock: ${err}`);
    throw err;
  }
}
/* -------------------------------------------------- */
/* INDEX COMMAND */
/* -------------------------------------------------- */
/**
 * Scans the configured index directory, upserts every indexable file into
 * the repo DB, and starts the background daemon to process the queue.
 * Exits the process with code 1 if the schema cannot be initialized.
 */
export async function runIndexCommand() {
  // Schema must exist before any upsert; failure here is unrecoverable.
  try {
    initSchema();
  } catch (err) {
    console.error('❌ Failed to initialize schema:', err);
    process.exit(1);
  }
  // Fall back to the current working directory, then persist the choice so
  // later commands operate on the same root.
  const indexDir = Config.getIndexDir() || process.cwd();
  Config.setIndexDir(indexDir);
  log(`📂 Scanning files in: ${indexDir}`);
  const files = await fg('**/*.*', {
    cwd: indexDir,
    ignore: IGNORED_FOLDER_GLOBS,
    absolute: true,
  });
  const db = getDbForRepo();
  // Hold the DB lock for the whole upsert loop so concurrent commands cannot
  // interleave writes; released in `finally` below.
  const release = await lockDb();
  const countByExt = {}; // extension -> discovered-file count (summary log only)
  let count = 0;
  try {
    for (const file of files) {
      const classification = classifyFile(file);
      if (classification !== 'valid') {
        log(`⏭️ Skipping (${classification}): ${file}`);
        continue;
      }
      try {
        const type = detectFileType(file);
        // Store paths with forward slashes so later lookups are OS-independent.
        const normalizedPath = path.normalize(file).replace(/\\/g, '/');
        const filename = path.basename(normalizedPath);
        // summary/lastModified/indexedAt stay null here; presumably the daemon
        // fills them in during background indexing — confirm against DaemonCmd.
        db.prepare(upsertFileTemplate).run({
          path: normalizedPath,
          filename,
          summary: null,
          type,
          lastModified: null,
          indexedAt: null,
        });
        const ext = path.extname(file);
        countByExt[ext] = (countByExt[ext] || 0) + 1;
        count++;
      } catch (err) {
        // Best-effort: one bad file must not abort the whole scan.
        log(`⚠️ Skipped in indexCmd ${file}: ${String(err)}`);
      }
    }
  } finally {
    await release();
  }
  log('📊 Discovered files by extension:', JSON.stringify(countByExt, null, 2));
  log(`✅ Done. Enqueued ${count} files for indexing.`);
  // NOTE(review): startDaemon() is not awaited — looks like a deliberate
  // detach; confirm it handles its own errors.
  startDaemon();
}
/* -------------------------------------------------- */
/* QUERY API */
/* -------------------------------------------------- */
/**
 * Runs a pre-sanitized FTS query against the repo index.
 *
 * @param {string} safeQuery query already passed through FTS sanitization.
 * @param {number} limit maximum number of rows to return (default 10).
 * @returns matching rows from the query template.
 */
export function queryFiles(safeQuery, limit = 10) {
  const statement = getDbForRepo().prepare(sqlTemplates.queryFilesTemplate);
  return statement.all(safeQuery, limit);
}
/* -------------------------------------------------- */
/* SEMANTIC SEARCH */
/* -------------------------------------------------- */
/**
 * Semantic file search combining three strategies, in priority order:
 *  1) Target-first: resolve explicit file/symbol targets from `intent` and
 *     short-circuit with only those rows.
 *  2) Broad FTS: one LLM-generated OR-joined primary query.
 *  3) Fallback FTS: diversified LLM queries when the primary finds nothing.
 *
 * @param {string} originalQuery raw user query text.
 * @param {number} topK maximum number of rows to return (default 5).
 * @param {object} intent optional { targetFiles?: string[], targetSymbols?: string[] }.
 * @returns rows shaped { id, path, filename, summary, type, lastModified,
 *   bm25Score, queryExpansionTerms }; empty array when nothing matched.
 */
export async function semanticSearchFiles(originalQuery, topK = 5, intent = {}) {
  const db = getDbForRepo();
  // Compact row rendering used only for the debug logs below.
  const summarizeRows = (rows) => rows.map((r) => `${r.filename} | bm25Score=${r.bm25Score}`);
  const targetFiles = Array.isArray(intent.targetFiles)
    ? dedupeNormalizedStrings(intent.targetFiles)
    : [];
  const targetSymbols = Array.isArray(intent.targetSymbols)
    ? dedupeNormalizedStrings(intent.targetSymbols)
    : [];
  // 1) Target-first path.
  // Example: targetFiles=["semanticAnalysisModule.ts","typescript.ts"] -> return only those
  // resolved rows and skip broad FTS expansion.
  if (targetFiles.length > 0 || targetSymbols.length > 0) {
    // Deterministic resolution order:
    // 1) files by explicit path/name 2) files by exact symbol ownership.
    // Example: targetFiles=["fileIndex.ts"], targetSymbols=["semanticSearchFiles"].
    const { resolvedRows: fileRows, unresolvedTargets } = resolveTargetRows(db, targetFiles);
    const exactSymbolRows = resolveSymbolRows(db, targetSymbols);
    // LIKE-pattern matching is a last resort, tried only when exact symbol
    // resolution found nothing at all.
    const patternSymbolRows = targetSymbols.length > 0 && exactSymbolRows.length === 0
      ? resolveSymbolRowsByPattern(db, targetSymbols)
      : [];
    const symbolRows = mergeRows(exactSymbolRows, patternSymbolRows);
    const resolvedRows = mergeRows(fileRows, symbolRows);
    // Expansion terms double as retrieval metadata for downstream steps:
    // quoted basenames for files, prefix-wildcard forms for symbols.
    const queryExpansionTerms = buildQueryExpansionTerms(originalQuery, [
      ...targetFiles.map((target) => `"${path.basename(target)}"`),
      ...targetSymbols.flatMap((symbol) => buildSymbolSearchForms(symbol)
        .map((form) => normalizeFtsTerm(form))
        .filter(Boolean)
        .map((form) => `${form}*`))
    ]);
    logInputOutput("semanticSearchFiles target resolution", "output", {
      originalQuery,
      intent,
      targetFiles,
      targetSymbols,
      resolvedCount: resolvedRows.length,
      fileResolvedCount: fileRows.length,
      symbolResolvedCount: symbolRows.length,
      symbolResolvedExactCount: exactSymbolRows.length,
      symbolResolvedPatternCount: patternSymbolRows.length,
      unresolvedTargets,
      resolvedFiles: resolvedRows.map((row) => row.path),
    });
    // Target-first short-circuit: if we resolved any explicit targets, keep scope tight.
    if (resolvedRows.length > 0) {
      return resolvedRows.slice(0, topK).map((r) => ({
        id: r.id,
        path: r.path,
        filename: r.filename,
        summary: r.summary,
        type: r.type,
        lastModified: r.lastModified,
        bm25Score: r.bm25Score,
        queryExpansionTerms,
      }));
    }
  }
  // 2) Broad search path (fallback when target resolution failed).
  // Example query: "add comments in parser module" -> model emits OR-joined FTS terms.
  const primaryFtsQuery = await generatePrimaryFtsQuery(originalQuery, intent);
  const primaryExpansionTerms = buildQueryExpansionTerms(originalQuery, [primaryFtsQuery]);
  logInputOutput("semanticSearchFiles LLM primary query", "output", {
    originalQuery,
    intent,
    ftsQuery: primaryFtsQuery,
  });
  const primaryResults = db
    .prepare(sqlTemplates.searchFilesTemplate)
    .all(primaryFtsQuery, RELATED_FILES_LIMIT);
  logInputOutput("semanticSearchFiles initial results", "output", {
    stage: "primary",
    originalQuery,
    ftsQuery: primaryFtsQuery,
    resultCount: primaryResults.length,
    results: summarizeRows(primaryResults),
  });
  if (primaryResults.length > 0) {
    return rankAndMap(new Map(primaryResults.map(r => [r.id, r])), topK, primaryExpansionTerms);
  }
  // 3) If primary returns nothing, try diversified fallback queries.
  const fallbackQuery = await generateFallbackFtsQueries(originalQuery, primaryFtsQuery, intent);
  logInputOutput("semanticSearchFiles LLM fallback query", "output", {
    originalQuery,
    primaryFtsQuery,
    fallbackQuery,
  });
  if (fallbackQuery && fallbackQuery.length > 0) {
    const fallbackExpansionTerms = buildQueryExpansionTerms(originalQuery, [primaryFtsQuery, ...fallbackQuery]);
    const stmt = db.prepare(sqlTemplates.searchFilesTemplate);
    // First fallback query that yields any rows wins; later candidates are
    // not merged in.
    for (const query of fallbackQuery) {
      const rows = stmt.all(query, RELATED_FILES_LIMIT);
      logInputOutput("semanticSearchFiles initial results", "output", {
        stage: "fallback",
        originalQuery,
        ftsQuery: query,
        resultCount: rows.length,
        results: summarizeRows(rows),
      });
      if (rows.length > 0) {
        return rankAndMap(new Map(rows.map(r => [r.id, r])), topK, fallbackExpansionTerms);
      }
    }
  }
  return [];
}
/**
 * Resolves explicit file targets to indexed `files` rows.
 * Resolution order per target: exact normalized path -> path suffix (for
 * targets containing '/') -> exact filename -> fuzzy filename key.
 * A target matching zero or multiple rows is reported as unresolved to
 * avoid ambiguous selection.
 *
 * @param db repo DB handle (better-sqlite3 style prepare/all API).
 * @param {string[]} targetFiles candidate file names or paths.
 * @returns {{resolvedRows: object[], unresolvedTargets: string[]}}
 */
function resolveTargetRows(db, targetFiles) {
  const resolvedRows = [];
  const unresolvedTargets = [];
  const seenPaths = new Set();
  // SQLite string literals do not process backslash escapes, so '\\' in this
  // JS template reaches SQL as a single backslash. (The previous '\\\\' form
  // compared against *two* consecutive backslashes and never normalized
  // ordinary Windows separators.)
  const byExactPath = db.prepare(`
    SELECT
      id,
      path,
      filename,
      summary,
      type,
      0 AS bm25Score,
      last_modified AS lastModified
    FROM files
    WHERE REPLACE(path, '\\', '/') = ?
    LIMIT 1
  `);
  // Hoisted out of the loop below: preparing per-iteration re-compiled this
  // statement for every multi-segment target.
  const byPathSuffix = db.prepare(`
    SELECT
      id,
      path,
      filename,
      summary,
      type,
      0 AS bm25Score,
      last_modified AS lastModified
    FROM files
    WHERE LOWER(REPLACE(path, '\\', '/')) LIKE '%' || LOWER(?)
  `);
  const byFilename = db.prepare(`
    SELECT
      id,
      path,
      filename,
      summary,
      type,
      0 AS bm25Score,
      last_modified AS lastModified
    FROM files
    WHERE LOWER(filename) = LOWER(?)
  `);
  const byFilenameLike = db.prepare(`
    SELECT
      id,
      path,
      filename,
      summary,
      type,
      0 AS bm25Score,
      last_modified AS lastModified
    FROM files
    WHERE LOWER(filename) LIKE '%' || LOWER(?) || '%'
  `);
  // Resolution strategy per target:
  // exact path -> path suffix -> exact filename -> fuzzy filename key.
  // If multiple matches exist, mark unresolved to avoid ambiguous selection.
  for (const rawTarget of targetFiles) {
    const target = String(rawTarget ?? "").trim();
    if (!target)
      continue;
    const normalizedTarget = target.replace(/\\/g, "/");
    let matches = byExactPath.all(normalizedTarget);
    if (matches.length === 0 && normalizedTarget.includes("/")) {
      // Anchor with a leading '/' so "db/client.ts" cannot match
      // "otherdb/client.ts".
      matches = byPathSuffix.all(`/${normalizedTarget}`);
    }
    if (matches.length === 0) {
      matches = byFilename.all(path.basename(normalizedTarget));
    }
    if (matches.length === 0) {
      // Fuzzy pass: compare alphanumeric-only keys of filename and stem.
      const targetName = path.basename(normalizedTarget);
      const targetKey = normalizeFileLookupToken(targetName);
      if (targetKey) {
        const candidates = byFilenameLike.all(targetName);
        matches = candidates.filter((row) => {
          const filenameKey = normalizeFileLookupToken(row.filename);
          const stemKey = normalizeFileLookupToken(path.parse(row.filename).name);
          return filenameKey === targetKey || stemKey === targetKey;
        });
      }
    }
    if (matches.length !== 1) {
      // Zero or ambiguous matches — let the caller fall back to FTS.
      unresolvedTargets.push(target);
      continue;
    }
    const row = matches[0];
    if (seenPaths.has(row.path))
      continue;
    seenPaths.add(row.path);
    resolvedRows.push(row);
  }
  return { resolvedRows, unresolvedTargets };
}
/**
 * Collapses a filename-ish value into a lowercase alphanumeric key for
 * fuzzy comparison. Example: " File_Index.ts " -> "fileindexts".
 * The previous trailing .trim() was a no-op (no whitespace can survive the
 * character strip) and has been removed.
 *
 * @param {*} value anything; null/undefined become "".
 * @returns {string} lowercase [a-z0-9]-only key.
 */
function normalizeFileLookupToken(value) {
  return String(value ?? "").toLowerCase().replace(/[^a-z0-9]/g, "");
}
/**
 * Lowercase alphanumeric key for symbol comparison; mirrors the file
 * lookup normalization so symbols and filename stems compare consistently.
 *
 * @param {*} value anything; null/undefined become "".
 * @returns {string} lowercase [a-z0-9]-only key.
 */
function normalizeSymbolLookupToken(value) {
  const text = String(value ?? "");
  return text.toLowerCase().replace(/[^a-z0-9]/g, "").trim();
}
/**
 * Builds the lookup forms tried for one symbol: the raw trimmed symbol,
 * plus its compact normalized form when that differs (case-insensitively)
 * from the raw spelling. Example: "semantic search" -> ["semantic search",
 * "semanticsearch"].
 *
 * @param {*} rawSymbol symbol text from intent extraction.
 * @returns {string[]} deduped search forms (empty for blank input).
 */
function buildSymbolSearchForms(rawSymbol) {
  const symbol = String(rawSymbol ?? "").trim();
  if (!symbol) {
    return [];
  }
  const normalized = normalizeSymbolLookupToken(symbol);
  // Only add the compact form when it differs from the plain lowercase
  // spelling — otherwise it would be a duplicate.
  const forms = normalized && normalized !== symbol.toLowerCase()
    ? [symbol, normalized]
    : [symbol];
  return dedupeNormalizedStrings(forms);
}
/**
 * Resolves symbols to the files that own them, trying exact name matches
 * first, then normalized identifier forms, then normalized filename stems.
 * Returns one row per distinct file path, in first-match order.
 *
 * @param db repo DB handle (prepare/all API).
 * @param {string[]} targetSymbols deduped symbol names from intent.
 * @returns {object[]} file rows (bm25Score fixed at 0 for resolved targets).
 */
function resolveSymbolRows(db, targetSymbols) {
  if (targetSymbols.length === 0)
    return [];
  const rows = [];
  const seenPaths = new Set();
  // Exact (case-insensitive) function/class name ownership.
  // NOTE(review): the LEFT JOINs behave like inner joins here because the
  // WHERE clause requires a joined name to match — confirm that is intended.
  const exactSymbolStmt = db.prepare(`
SELECT DISTINCT
f.id,
f.path,
f.filename,
f.summary,
f.type,
0 AS bm25Score,
f.last_modified AS lastModified
FROM files f
LEFT JOIN functions fn ON fn.file_id = f.id
LEFT JOIN graph_classes gc ON gc.file_id = f.id
WHERE LOWER(fn.name) = LOWER(?)
OR LOWER(gc.name) = LOWER(?)
`);
  // Same ownership match, but with '_', '-' and spaces stripped from stored
  // names so e.g. "semantic analysis module" can hit "semantic_analysis_module".
  const normalizedSymbolStmt = db.prepare(`
SELECT DISTINCT
f.id,
f.path,
f.filename,
f.summary,
f.type,
0 AS bm25Score,
f.last_modified AS lastModified
FROM files f
LEFT JOIN functions fn ON fn.file_id = f.id
LEFT JOIN graph_classes gc ON gc.file_id = f.id
WHERE LOWER(REPLACE(REPLACE(REPLACE(fn.name, '_', ''), '-', ''), ' ', '')) = LOWER(?)
OR LOWER(REPLACE(REPLACE(REPLACE(gc.name, '_', ''), '-', ''), ' ', '')) = LOWER(?)
`);
  // Filename-stem match: strips everything from the first '.' (extension)
  // and separator characters before comparing against the normalized symbol.
  const normalizedFileStemStmt = db.prepare(`
SELECT DISTINCT
f.id,
f.path,
f.filename,
f.summary,
f.type,
0 AS bm25Score,
f.last_modified AS lastModified
FROM files f
WHERE LOWER(
REPLACE(
REPLACE(
REPLACE(
CASE
WHEN INSTR(f.filename, '.') > 0 THEN SUBSTR(f.filename, 1, INSTR(f.filename, '.') - 1)
ELSE f.filename
END,
'_',
''
),
'-',
''
),
' ',
''
)
) = LOWER(?)
`);
  for (const rawSymbol of targetSymbols) {
    // Resolve exact symbol names first, then normalized identifier forms
    // (e.g. "semantic analysis module" -> "semanticanalysismodule").
    const symbol = String(rawSymbol ?? "").trim();
    if (!symbol)
      continue;
    const symbolForms = buildSymbolSearchForms(symbol);
    // Candidates keyed by path so a file matched by several forms appears once.
    const candidateRows = new Map();
    for (const form of symbolForms) {
      const exactMatches = exactSymbolStmt.all(form, form);
      for (const row of exactMatches) {
        candidateRows.set(row.path, row);
      }
    }
    const normalizedSymbol = normalizeSymbolLookupToken(symbol);
    if (normalizedSymbol) {
      const normalizedMatches = normalizedSymbolStmt.all(normalizedSymbol, normalizedSymbol);
      for (const row of normalizedMatches) {
        candidateRows.set(row.path, row);
      }
      const filenameMatches = normalizedFileStemStmt.all(normalizedSymbol);
      for (const row of filenameMatches) {
        candidateRows.set(row.path, row);
      }
    }
    // Merge this symbol's candidates into the overall result, keeping only
    // the first row seen for each path across all symbols.
    for (const row of candidateRows.values()) {
      if (seenPaths.has(row.path))
        continue;
      seenPaths.add(row.path);
      rows.push(row);
    }
  }
  return rows;
}
/**
 * Last-resort symbol resolution using SQL LIKE patterns over symbol names,
 * filenames, paths, summaries, and indexed content text.
 * Two passes: full symbol patterns first, then individual symbol parts
 * (3+ chars). Results are deduped by path and capped at RELATED_FILES_LIMIT.
 *
 * @param db repo DB handle (prepare/all API).
 * @param {string[]} targetSymbols deduped symbol names from intent.
 * @returns {object[]} file rows (bm25Score fixed at 0 for resolved targets).
 */
function resolveSymbolRowsByPattern(db, targetSymbols) {
  if (targetSymbols.length === 0)
    return [];
  const rows = [];
  const seenPaths = new Set();
  const candidatePatterns = new Set();
  const candidateParts = new Set();
  for (const rawSymbol of targetSymbols) {
    for (const pattern of buildSymbolLikePatterns(rawSymbol)) {
      candidatePatterns.add(pattern);
    }
    // Parts shorter than 3 chars are dropped — too noisy for LIKE matching.
    for (const part of splitSymbolParts(rawSymbol)) {
      if (part.length >= 3)
        candidateParts.add(part);
    }
  }
  if (candidatePatterns.size === 0 && candidateParts.size === 0) {
    return [];
  }
  // Broad probe: names, file metadata, and indexed content.
  const patternStmt = db.prepare(`
SELECT DISTINCT
f.id,
f.path,
f.filename,
f.summary,
f.type,
0 AS bm25Score,
f.last_modified AS lastModified
FROM files f
LEFT JOIN files_fts fts ON f.id = fts.rowid
LEFT JOIN functions fn ON fn.file_id = f.id
LEFT JOIN graph_classes gc ON gc.file_id = f.id
WHERE LOWER(COALESCE(fn.name, '')) LIKE LOWER(?)
OR LOWER(COALESCE(gc.name, '')) LIKE LOWER(?)
OR LOWER(COALESCE(f.filename, '')) LIKE LOWER(?)
OR LOWER(COALESCE(f.path, '')) LIKE LOWER(?)
OR LOWER(COALESCE(f.summary, '')) LIKE LOWER(?)
OR LOWER(COALESCE(fts.content_text, '')) LIKE LOWER(?)
LIMIT ?
`);
  // Narrower per-part probe: symbol names and content text only.
  const partStmt = db.prepare(`
SELECT DISTINCT
f.id,
f.path,
f.filename,
f.summary,
f.type,
0 AS bm25Score,
f.last_modified AS lastModified
FROM files f
LEFT JOIN files_fts fts ON f.id = fts.rowid
LEFT JOIN functions fn ON fn.file_id = f.id
LEFT JOIN graph_classes gc ON gc.file_id = f.id
WHERE LOWER(COALESCE(fn.name, '')) LIKE LOWER(?)
OR LOWER(COALESCE(gc.name, '')) LIKE LOWER(?)
OR LOWER(COALESCE(fts.content_text, '')) LIKE LOWER(?)
LIMIT ?
`);
  for (const pattern of candidatePatterns) {
    const likePattern = `%${escapeSqlLikeTerm(pattern)}%`;
    const matches = patternStmt.all(likePattern, likePattern, likePattern, likePattern, likePattern, likePattern, RELATED_FILES_LIMIT);
    for (const row of matches) {
      if (seenPaths.has(row.path))
        continue;
      seenPaths.add(row.path);
      rows.push(row);
    }
  }
  for (const part of candidateParts) {
    const likePattern = `%${escapeSqlLikeTerm(part)}%`;
    const matches = partStmt.all(likePattern, likePattern, likePattern, RELATED_FILES_LIMIT);
    for (const row of matches) {
      if (seenPaths.has(row.path))
        continue;
      seenPaths.add(row.path);
      rows.push(row);
    }
  }
  // LIMIT above applies per statement; this slice caps the combined total.
  return rows.slice(0, RELATED_FILES_LIMIT);
}
/**
 * Splits an identifier into its deduped lowercase word parts.
 * Example: "semanticSearchFiles" -> ["semantic", "search", "files"].
 *
 * @param {*} rawSymbol identifier-like text; blank input yields [].
 * @returns {string[]} lowercase parts in first-seen order.
 */
function splitSymbolParts(rawSymbol) {
  const symbol = String(rawSymbol ?? "").trim();
  if (!symbol) {
    return [];
  }
  // Insert a space at each lower->upper camelCase boundary, then split on
  // every run of non-alphanumeric characters.
  const spaced = symbol.replace(/([a-z0-9])([A-Z])/g, "$1 $2");
  const unique = new Set();
  for (const piece of spaced.split(/[^A-Za-z0-9]+/)) {
    const part = piece.toLowerCase().trim();
    if (part) {
      unique.add(part);
    }
  }
  return [...unique];
}
/**
 * Builds LIKE-pattern cores for a symbol: its compact normalized form, its
 * parts joined directly, and its parts joined with '%' so they may appear
 * in order with anything between them.
 *
 * @param {*} rawSymbol identifier-like text; blank input yields [].
 * @returns {string[]} deduped pattern cores (callers add surrounding '%').
 */
function buildSymbolLikePatterns(rawSymbol) {
  const symbol = String(rawSymbol ?? "").trim();
  if (!symbol) {
    return [];
  }
  const patterns = new Set();
  const compact = normalizeSymbolLookupToken(symbol);
  if (compact) {
    patterns.add(compact);
  }
  const parts = splitSymbolParts(symbol);
  if (parts.length > 0) {
    // Joined form matches compact identifiers; '%'-joined form matches the
    // parts separated by arbitrary characters.
    patterns.add(parts.join(""));
    patterns.add(parts.join("%"));
  }
  return [...patterns];
}
/**
 * Removes SQL LIKE wildcard characters ('%' and '_') from a term.
 * NOTE: this strips the characters rather than escaping them — safe against
 * wildcard injection, at the cost of slightly looser matching for terms that
 * legitimately contain underscores.
 *
 * @param {*} value term text; null/undefined become "".
 * @returns {string} term with all '%' and '_' removed.
 */
function escapeSqlLikeTerm(value) {
  const text = String(value ?? "");
  return text.replaceAll("%", "").replaceAll("_", "");
}
/**
 * Merges row groups in call order, dropping any row whose path was already
 * seen in an earlier group (or earlier in the same group).
 * Example: if a file target and a symbol target resolve to the same file,
 * the file-target row wins and appears once.
 *
 * @param {...object[]} groups row arrays in priority order.
 * @returns {object[]} deduped rows preserving first-seen order.
 */
function mergeRows(...groups) {
  const byPath = new Map();
  for (const group of groups) {
    for (const row of group) {
      if (!byPath.has(row.path)) {
        byPath.set(row.path, row);
      }
    }
  }
  return [...byPath.values()];
}
/* -------------------------------------------------- */
/* LLM → FTS QUERY GENERATION (TAG-BASED) */
/* -------------------------------------------------- */
/**
 * Asks the model for one OR-joined FTS query expanding the user's request,
 * then normalizes it via enforceFtsQueryPolicy (sanitize, anchor, dedupe,
 * cap at MAX_PRIMARY_TERMS). On generation failure, degrades to policy-
 * normalizing the raw user query itself.
 *
 * @param {string} userQuery raw user query text.
 * @param {object} intent optional target hints forwarded to the policy.
 * @returns {Promise<string>} OR-joined FTS query (possibly empty string).
 */
async function generatePrimaryFtsQuery(userQuery, intent = {}) {
  const prompt = `
You are generating a SQLite FTS query for searching a source code repository.
The user query may refer to:
- High-level intent
- Domain terminology
- Specific filenames, file types, or configuration files
Input:
"${userQuery}"
Task:
1. Extract high-level intent terms
2. Expand to related domain-specific terminology
3. Expand to likely filenames, config files, or structural artifacts if relevant
4. Combine ALL useful terms into ONE OR-joined FTS query
Rules:
- Output ONLY the OR-joined terms
- Max 12 total terms
- Use OR between terms
- Include filenames when relevant
- No explanations
- No natural language sentences
Wrap the result in <FILE_CONTENT> tags.
<FILE_CONTENT>
term1 OR term2 OR term3
</FILE_CONTENT>
`.trim();
  // Model-generated query is always normalized by enforceFtsQueryPolicy.
  // Example: "code* OR file*" + target file -> keeps anchored file term first.
  try {
    const response = await generate({ content: prompt, query: "" });
    const rawText = String(response.data ?? "");
    const { content } = extractTaggedContent(rawText, "FILE_CONTENT");
    return enforceFtsQueryPolicy(userQuery, content, MAX_PRIMARY_TERMS, intent);
  } catch (err) {
    // Previously swallowed silently; log the failure (matching the fallback
    // generator's style) before degrading to the raw user query.
    log(`⚠️ [semanticSearchFiles] Primary FTS generation failed: ${String(err)}`);
    return enforceFtsQueryPolicy(userQuery, userQuery, MAX_PRIMARY_TERMS, intent);
  }
}
/**
 * Asks the model for 3–5 diversified fallback FTS queries after the primary
 * query returned zero rows. Each emitted line is normalized through
 * enforceFtsQueryPolicy and capped at MAX_FALLBACK_TERMS terms.
 *
 * @param {string} userQuery raw user query text.
 * @param {string} failedQuery the primary FTS query that found nothing.
 * @param {object} intent optional target hints forwarded to the policy.
 * @returns {Promise<string[]|null>} up to 5 sanitized queries, or null when
 *   generation failed or produced nothing usable.
 */
async function generateFallbackFtsQueries(userQuery, failedQuery, intent = {}) {
  const prompt = `
You are generating fallback SQLite FTS queries for a source code repository search.
Original user query:
"${userQuery}"
Primary FTS query returned ZERO results:
"${failedQuery}"
Task:
Generate 3–5 independent FTS queries (MAX 5).
For each query:
1. Think at a different abstraction level (intent-level, domain-level, structural-level).
2. Include filenames, file types, modules, config files, or symbols when relevant.
3. Use a single OR-joined expression.
4. Max 10 terms per query.
Rules:
- Avoid natural language sentences
- No explanations
- No commentary
- Each line must be one complete OR expression
Output format (STRICT):
<FILE_CONTENT>
query1
query2
query3
query4
query5
</FILE_CONTENT>
`.trim();
  try {
    const response = await generate({ content: prompt, query: "" });
    const rawText = String(response.data ?? "");
    const { content } = extractTaggedContent(rawText, "FILE_CONTENT");
    // Each candidate fallback line is sanitized/anchored the same way as primary.
    const subQueries = content
      .split(/\r?\n/)
      .map(q => enforceFtsQueryPolicy(userQuery, q.trim(), MAX_FALLBACK_TERMS, intent))
      .filter(Boolean)
      .slice(0, 5);
    if (!subQueries.length) {
      // Treated as a failure so the catch below logs and returns null.
      throw new Error("No fallback subqueries generated");
    }
    return subQueries;
  } catch (err) {
    // Logged, not rethrown: callers treat null as "no fallback available".
    log(`⚠️ [semanticSearchFiles] Fallback FTS generation failed: ${String(err)}`);
    return null;
  }
}
/* -------------------------------------------------- */
/* PLANNER SEARCH */
/* -------------------------------------------------- */
/**
 * Planner-facing search: issues the planner-provided FTS query first, and
 * only on zero hits falls back to LLM-generated primary/fallback queries
 * (and finally the legacy expansion helper).
 *
 * @param {string} originalQuery raw user query text (for LLM expansion).
 * @param {string} query planner-provided FTS query.
 * @param {number} topK maximum number of rows to return (default 5).
 * @returns ranked rows via rankAndMap; empty array when nothing matched.
 */
export async function plannerSearchFiles(originalQuery, query, topK = 5) {
  const db = getDbForRepo();
  const seen = new Map(); // id -> row, accumulated across query attempts
  const usedQueries = []; // every query actually issued; becomes expansion metadata
  const safeQuery = sanitizeQueryForFts(query);
  if (safeQuery)
    usedQueries.push(safeQuery);
  // NOTE(review): safeQuery is passed to the FTS statement even when
  // sanitization emptied it — confirm the template tolerates that.
  const primaryResults = db
    .prepare(sqlTemplates.searchFilesTemplate)
    .all(safeQuery, RELATED_FILES_LIMIT);
  primaryResults.forEach(r => seen.set(r.id, r));
  logInputOutput("plannerSearchFiles primary FTS", "input", {
    safeQuery,
    count: primaryResults.length,
  });
  if (primaryResults.length === 0) {
    const stmt = db.prepare(sqlTemplates.searchFilesTemplate);
    const llmPrimaryQuery = await generatePrimaryFtsQuery(originalQuery);
    const llmFallbackQueries = await generateFallbackFtsQueries(originalQuery, llmPrimaryQuery || safeQuery);
    const candidateQueries = [];
    // Sanitizes, drops empty/duplicate/unchanged queries, records usage.
    const pushCandidate = (q) => {
      if (!q)
        return;
      const sanitized = sanitizeQueryForFts(q);
      if (!sanitized)
        return;
      if (sanitized === safeQuery)
        return;
      if (candidateQueries.includes(sanitized))
        return;
      candidateQueries.push(sanitized);
      usedQueries.push(sanitized);
    };
    pushCandidate(llmPrimaryQuery);
    (llmFallbackQueries ?? []).forEach(pushCandidate);
    // Keep legacy planner expansion as final fallback if LLM query generation did not add anything useful.
    if (candidateQueries.length === 0) {
      pushCandidate(await expandQueryWithModel(originalQuery));
    }
    for (const candidateQuery of candidateQueries) {
      const rows = stmt.all(candidateQuery, RELATED_FILES_LIMIT);
      logInputOutput("plannerSearchFiles expanded FTS", "output", {
        stage: "llm-expanded",
        originalQuery,
        ftsQuery: candidateQuery,
        resultCount: rows.length,
      });
      rows.forEach(r => {
        if (!seen.has(r.id))
          seen.set(r.id, r);
      });
      // Stop at the first candidate query that produced any rows.
      if (seen.size > 0)
        break;
    }
  }
  if (seen.size === 0)
    return [];
  return rankAndMap(seen, topK, buildQueryExpansionTerms(originalQuery, usedQueries));
}
/* -------------------------------------------------- */
/* HELPERS */
/* -------------------------------------------------- */
/**
 * Sorts candidate rows by BM25 score (ascending — lower is better in SQLite
 * FTS), truncates to topK, and projects each row to the public result shape.
 *
 * @param {Map} seen id -> row map of candidates.
 * @param {number} topK maximum rows to return.
 * @param {string[]} queryExpansionTerms retrieval metadata attached to each row.
 * @returns {object[]} ranked, projected rows.
 */
function rankAndMap(seen, topK, queryExpansionTerms = []) {
  const ranked = [...seen.values()].sort(
    (a, b) => (a.bm25Score ?? 0) - (b.bm25Score ?? 0)
  );
  return ranked
    .slice(0, topK)
    .map(({ id, path, filename, summary, type, lastModified, bm25Score }) => ({
      id,
      path,
      filename,
      summary,
      type,
      lastModified,
      bm25Score,
      queryExpansionTerms,
    }));
}
/**
 * Builds a deduped token list used as retrieval metadata for downstream
 * steps, combining tokens from the original query and every issued query.
 * Operator words ("or", "and", "not", "near") are excluded.
 * Example:
 *   original: "update fileIndex.ts logging"
 *   queries: ["\"fileIndex.ts\" OR logging*"]
 *   output contains: ["update", "fileindex", "ts", "logging"]
 *
 * @param {string} originalQuery raw user query text.
 * @param {string[]} queries issued FTS query strings.
 * @returns {string[]} deduped lowercase tokens.
 */
function buildQueryExpansionTerms(originalQuery, queries) {
  const collected = new Set();
  for (const text of [originalQuery, ...queries]) {
    for (const token of tokenizeQueryTerms(text)) {
      if (QUERY_OPERATOR_TOKENS.has(token)) {
        continue;
      }
      collected.add(token);
    }
  }
  return [...collected];
}
/**
 * Applies strict normalization to an LLM-generated FTS query:
 * 1) sanitize, 2) split into OR terms, 3) prepend anchor terms,
 * 4) drop operator/generic noise, 5) dedupe, 6) cap term count.
 * Generic terms are dropped only when anchors exist to replace them.
 * Example:
 *   user: "change semanticSearchFiles in fileIndex.ts"
 *   candidate: "code* OR file* OR output*"
 *   output: "\"fileIndex.ts\" OR semanticsearchfiles* OR output*"
 *
 * @param {string} userQuery raw user query (anchor source).
 * @param {string} candidateQuery model-emitted query line.
 * @param {number} maxTerms hard cap on the number of OR terms.
 * @param {object} intent optional target hints for anchoring.
 * @returns {string} OR-joined, policy-compliant FTS query.
 */
function enforceFtsQueryPolicy(userQuery, candidateQuery, maxTerms, intent = {}) {
  const anchors = buildAnchorTerms(userQuery, intent);
  const anchorsPresent = anchors.length > 0;
  const keptCandidateTerms = [];
  for (const term of splitOrTerms(sanitizeQueryForFts(candidateQuery))) {
    const key = normalizeFtsTerm(term);
    if (!key || QUERY_OPERATOR_TOKENS.has(key)) {
      continue;
    }
    // Generic filler ("code", "file", ...) is only dropped when real
    // anchors are available to carry the query.
    if (anchorsPresent && GENERIC_FTS_TERMS.has(key)) {
      continue;
    }
    keptCandidateTerms.push(term);
  }
  return dedupeTerms([...anchors, ...keptCandidateTerms])
    .slice(0, maxTerms)
    .join(" OR ");
}
/**
 * Splits an OR-joined query into raw trimmed terms.
 * Example: "\"fileIndex.ts\" OR semanticsearchfiles* OR output*" ->
 * ["\"fileIndex.ts\"", "semanticsearchfiles*", "output*"]
 *
 * @param {string} query OR-joined FTS query text.
 * @returns {string[]} non-empty terms in original order.
 */
function splitOrTerms(query) {
  const terms = [];
  // Case-insensitive split on whitespace-delimited OR.
  for (const piece of query.split(/\s+OR\s+/i)) {
    const trimmed = piece.trim();
    if (trimmed) {
      terms.push(trimmed);
    }
  }
  return terms;
}
/**
 * Normalizes an FTS term into a stable comparison key by stripping quotes
 * and wildcards, lowercasing, and trimming.
 * Example: "\"fileIndex.ts\"" -> "fileindex.ts", "Module*" -> "module"
 *
 * @param {string} term raw FTS term.
 * @returns {string} normalized key.
 */
function normalizeFtsTerm(term) {
  const stripped = term.replace(/"/g, "").replace(/\*/g, "");
  return stripped.toLowerCase().trim();
}
/**
 * Dedupes terms by their normalized key while preserving first-seen order;
 * terms whose key is empty are dropped.
 * Example: ["code*", "CODE*", "\"code\""] -> ["code*"]
 *
 * @param {string[]} terms raw FTS terms.
 * @returns {string[]} deduped terms (original spellings kept).
 */
function dedupeTerms(terms) {
  const seenKeys = new Set();
  return terms.filter((term) => {
    const key = normalizeFtsTerm(term);
    if (!key || seenKeys.has(key)) {
      return false;
    }
    seenKeys.add(key);
    return true;
  });
}
/**
 * Extracts high-signal anchor terms from user text and intent: quoted
 * filename anchors plus prefix-wildcarded likely-symbol terms.
 * Example:
 *   user: "update semanticSearchFiles in fileIndex.ts"
 *   output includes: ["\"fileIndex.ts\"", "semanticsearchfiles*"]
 *
 * @param {string} userQuery raw user query text.
 * @param {object} intent optional { targetFiles?: string[], targetSymbols?: string[] }.
 * @returns {string[]} deduped anchor terms.
 */
function buildAnchorTerms(userQuery, intent = {}) {
  const anchorTerms = [];
  const explicitFiles = extractFileReferences(userQuery);
  const targetFiles = Array.isArray(intent.targetFiles)
    ? dedupeNormalizedStrings(intent.targetFiles)
    : [];
  const targetSymbols = Array.isArray(intent.targetSymbols)
    ? dedupeNormalizedStrings(intent.targetSymbols)
    : [];
  // BUG FIX: both filename branches previously pushed the literal string
  // "$(unknown)" instead of the quoted filename, so filename anchors could
  // never match anything in FTS. Quote the basename, per the example above.
  for (const fileRef of targetFiles) {
    for (const matchedFile of extractFileReferences(fileRef)) {
      const filename = path.basename(matchedFile);
      if (filename)
        anchorTerms.push(`"${filename}"`);
    }
  }
  for (const fileRef of explicitFiles) {
    const filename = path.basename(fileRef);
    if (filename)
      anchorTerms.push(`"${filename}"`);
  }
  for (const symbol of targetSymbols) {
    for (const form of buildSymbolSearchForms(symbol)) {
      const normalized = normalizeFtsTerm(form);
      if (!normalized)
        continue;
      if (QUERY_OPERATOR_TOKENS.has(normalized))
        continue;
      if (GENERIC_FTS_TERMS.has(normalized))
        continue;
      // Prefix wildcard so both raw and normalized symbol forms match.
      anchorTerms.push(`${normalized}*`);
    }
  }
  // Heuristic symbol detection in free text: camelCase, snake_case, or
  // common "...Step"/"...Module" suffix conventions.
  const symbolMatches = userQuery.match(/[A-Za-z_][A-Za-z0-9_]*/g) ?? [];
  for (const token of symbolMatches) {
    const isLikelySymbol = /[A-Z]/.test(token) ||
      token.includes("_") ||
      token.endsWith("Step") ||
      token.endsWith("Module");
    if (!isLikelySymbol)
      continue;
    const normalized = token.toLowerCase();
    if (QUERY_OPERATOR_TOKENS.has(normalized))
      continue;
    if (GENERIC_FTS_TERMS.has(normalized))
      continue;
    anchorTerms.push(`${normalized}*`);
  }
  return dedupeTerms(anchorTerms);
}
/**
 * Lightweight normalization and dedupe: trims, converts backslashes to
 * forward slashes, drops non-strings/empties, and dedupes case-insensitively
 * while keeping the first-seen spelling.
 * Example: [" FileIndex.ts ", "fileindex.ts"] -> ["FileIndex.ts"].
 *
 * @param {*[]} tokens candidate values (non-strings are skipped).
 * @returns {string[]} normalized, deduped strings in first-seen order.
 */
function dedupeNormalizedStrings(tokens) {
  const seenKeys = new Set();
  const kept = [];
  for (const token of tokens) {
    if (typeof token !== "string") {
      continue;
    }
    const normalized = token.trim().replaceAll("\\", "/");
    if (!normalized) {
      continue;
    }
    const key = normalized.toLowerCase();
    if (!seenKeys.has(key)) {
      seenKeys.add(key);
      kept.push(normalized);
    }
  }
  return kept;
}
/**
 * Extracts deduped lowercase identifier-like tokens (2+ chars) from text
 * for query-expansion metadata.
 * Example: "\"fileIndex.ts\" OR code*" -> ["fileindex", "ts", "or", "code"].
 * Note: operator words such as "or" are NOT filtered here; callers strip
 * them via QUERY_OPERATOR_TOKENS.
 *
 * @param {string} text arbitrary query text.
 * @returns {string[]} deduped tokens in first-match order.
 */
function tokenizeQueryTerms(text) {
  const found = text.toLowerCase().match(/[a-z_][a-z0-9_]{1,}/g);
  return [...new Set(found ?? [])];
}
/**
 * Legacy query expansion: asks the model for concrete search terms and
 * returns the sanitized tag content, or null when generation fails.
 *
 * @param {string} query raw user question.
 * @returns {Promise<string|null>} sanitized FTS query text, or null.
 */
async function expandQueryWithModel(query) {
  const prompt = `
Return concrete search terms likely to appear in source code.
Wrap the result in <FILE_CONTENT> tags.
Question:
"${query}"
`.trim();
  try {
    const response = await generate({ content: prompt, query: "" });
    const { content } = extractTaggedContent(String(response.data ?? ""), "FILE_CONTENT");
    return sanitizeQueryForFts(content);
  } catch {
    // Best-effort helper: callers treat null as "no expansion available".
    return null;
  }
}