UNPKG

autosnippet

Version:

Extract code patterns into a knowledge base for AI coding assistants

277 lines (276 loc) 10.7 kB
/**
 * IntentExtractor — Intake Layer
 *
 * Pure functions: extract intent signals from user query + active file.
 * Builds multi-query set, infers language/module/scenario for search routing.
 *
 * @module service/task/IntentExtractor
 */
import { tokenize } from '#service/search/tokenizer.js';

// ── Language Extension Map ──────────────────────────
/** File extension (lowercase, without the dot) → language identifier. */
const LANG_MAP = {
  m: 'objectivec',
  h: 'objectivec',
  mm: 'objectivec',
  swift: 'swift',
  ts: 'typescript',
  tsx: 'typescript',
  js: 'javascript',
  jsx: 'javascript',
  py: 'python',
  go: 'go',
  rs: 'rust',
  java: 'java',
  kt: 'kotlin',
};

/**
 * Regex alternation of every extension in LANG_MAP, longest first.
 * Derived from LANG_MAP (rather than hand-written) so the file-name pattern
 * below recognises exactly the extensions the language map knows about,
 * including `mm` and `jsx`. The keys are plain alphanumerics, so no regex
 * escaping is required.
 */
const FILE_EXTENSION_ALTERNATION = Object.keys(LANG_MAP)
  .sort((a, b) => b.length - a.length)
  .join('|');

// ── Universal Patterns (language-agnostic) ──────────
/**
 * Patterns applied to every query to pull out technical terms.
 * All are global regexes; a pattern with a capture group yields the group,
 * otherwise the whole match is used.
 */
const UNIVERSAL_PATTERNS = [
  /\b[A-Z][a-z]+(?:[A-Z][a-z]+)+\b/g, // CamelCase
  /`([^`]+)`/g, // backtick code
  new RegExp(`\\b[\\w-]+\\.(?:${FILE_EXTENSION_ALTERNATION})\\b`, 'g'), // file names
  /@[\w-]+/g, // trigger references
];

// ── Cross-Language Synonym Groups ───────────────────
// Each group contains EN morphological variants + CN equivalents.
// Used to expand queries so English terms match Chinese recipe fields (and vice versa).
const SYNONYM_GROUPS = [ // Design patterns & DI ['inject', 'injection', '注入'], ['construct', 'constructor', '构造器', '构造函数'], ['depend', 'dependency', 'dependencies', '依赖'], ['protocol', '协议'], ['interface', '接口'], ['pattern', '模式'], ['factory', '工厂'], ['singleton', '单例'], ['delegate', '代理', '委托'], ['observe', 'observer', '观察者'], ['subscribe', 'subscription', '订阅'], ['repository', 'repo', '仓库'], // Architecture ['module', '模块'], ['architect', 'architecture', '架构'], ['route', 'router', 'routing', '路由'], ['middleware', '中间件'], ['component', '组件'], ['lifecycle', '生命周期'], ['layer', '分层', '层'], // Language features ['generic', 'generics', '泛型'], ['closure', '闭包'], ['callback', '回调'], ['extend', 'extension', '扩展'], ['inherit', 'inheritance', '继承'], ['abstract', 'abstraction', '抽象'], ['encapsulate', 'encapsulation', '封装'], ['polymorph', 'polymorphism', '多态'], ['implement', 'implementation', '实现'], // Concurrency ['async', 'asynchronous', '异步'], ['sync', 'synchronous', '同步'], ['thread', 'threading', '线程'], ['concur', 'concurrency', '并发'], // Memory management ['memory', '内存'], ['leak', 'leakage', '泄漏'], ['weak', '弱引用'], ['retain', '持有', '保留'], ['release', '释放'], ['reference', '引用'], // Common concepts ['network', '网络'], ['cache', 'caching', '缓存'], ['persist', 'persistence', '持久化'], ['serialize', 'serialization', '序列化'], ['validate', 'validation', '校验', '验证'], ['authenticate', 'authentication', '认证'], ['authorize', 'authorization', '授权'], ['config', 'configuration', '配置'], ['navigate', 'navigation', '导航'], ['animate', 'animation', '动画'], ['layout', '布局'], ['render', 'rendering', '渲染'], ['responsive', '响应式'], ['state', '状态'], ['toast', '提示'], ['error', '错误'], ['handle', 'handler', '处理'], ['service', '服务'], ['test', 'testing', '测试'], ]; /** Lookup: lowercased term → synonym expansions (excluding the term itself) */ const SYNONYM_LOOKUP = new Map(); for (const group of SYNONYM_GROUPS) { for (const term of group) { SYNONYM_LOOKUP.set(term.toLowerCase(), group.filter((t) => t !== 
term)); } } // ── Public API ────────────────────────────────────── /** * Extract intent signals from user query and active file. * Pure function — no side effects, no DI. */ export function extract(userQuery, activeFile, language, termOpts) { const queries = buildQueries(userQuery, activeFile, termOpts); const keywordQueries = buildKeywordQueries(userQuery); const inferredLang = language || (activeFile ? inferLanguage(activeFile) : null); const module = activeFile ? inferFileContext(activeFile) : null; const scenario = classifyScenario(userQuery); return { queries, keywordQueries, language: inferredLang, module, scenario, raw: { userQuery, activeFile, language }, }; } /** * Build multi-query set from user query + active file. * Q1: raw query, Q2: extracted tech terms, Q3: file context, Q4: synonym focus. * Q1 is enriched with cross-language synonyms to bridge EN↔CJK matching. * Q4 (long queries only): synonym expansion as a separate focused query * to prevent BM25 dilution in verbose natural language inputs. */ export function buildQueries(userQuery, activeFile, termOpts) { // Enrich raw query with cross-language synonyms const synonyms = expandWithSynonyms(userQuery); const enrichedQuery = synonyms ? `${userQuery} ${synonyms}` : userQuery; const queries = [enrichedQuery]; const terms = extractTechTerms(userQuery, termOpts); if (terms.length > 0) { queries.push(terms.join(' ')); } // Q4: For long queries (> 50 chars), add cross-language synonyms as a // separate focused query. In long sentences, synonym terms appended to Q1 // get diluted by common words ("ViewController", "ViewModel"), causing // BM25 to miss the user's actual intent. A short focused query matches // domain-specific terms (e.g. "singleton 单例 inject 注入") directly. 
if (synonyms && userQuery.length > 50) { queries.push(synonyms); } if (activeFile) { const ctx = inferFileContext(activeFile); if (ctx) { queries.push(ctx); } } return queries; } /** * Build keyword-mode queries for cross-language synonym matching. * Uses keyword mode to preserve raw FWS scores without CoarseRanker semantic normalization. */ export function buildKeywordQueries(userQuery) { const expanded = expandWithSynonyms(userQuery); return expanded ? [expanded] : []; } /** * Extract tech terms from query using universal patterns + dynamic project prefixes. */ export function extractTechTerms(query, opts = {}) { const terms = new Set(); // 1. Universal patterns (always run) for (const pattern of UNIVERSAL_PATTERNS) { for (const match of query.matchAll(new RegExp(pattern.source, pattern.flags))) { const term = match[1] || match[0]; if (term.length >= 3 && term.length <= 50) { terms.add(term); } } } // 2. Project prefix patterns (dynamic) const allPrefixes = [...(opts.projectPrefixes ?? []), ...(opts.platformPrefixes ?? [])]; const prefixPattern = buildPrefixPattern(allPrefixes); if (prefixPattern) { for (const match of query.matchAll(prefixPattern)) { if (match[0].length >= 3 && match[0].length <= 50) { terms.add(match[0]); } } } return [...terms].slice(0, 8); } /** * Infer file context string from file path for search augmentation. * Returns module path + class name, e.g. "Services/Network BDNetworkManager" */ export function inferFileContext(filePath) { const parts = filePath.replace(/\\/g, '/').split('/'); const fileName = parts[parts.length - 1] || ''; // Extract class name (remove extension) const className = fileName.replace(/\.\w+$/, ''); // Extract meaningful module path (skip root dir and file name) const meaningful = parts .slice(1, -1) .filter((p) => !['src', 'lib', 'Sources', 'BiliDili', 'BiliDemo'].includes(p)); const module = meaningful.slice(0, 2).join('/'); const segments = [module, className].filter(Boolean); return segments.length > 0 ? 
segments.join(' ') : null; } /** * Infer language from file extension. */ export function inferLanguage(filePath) { const ext = filePath.split('.').pop()?.toLowerCase(); return ext ? (LANG_MAP[ext] ?? null) : null; } /** * Classify search scenario from user query (lightweight rule-based). */ export function classifyScenario(userQuery) { const q = userQuery.toLowerCase(); if (/帮我[加写做实现创建]|implement|add|create|新[增加建]|添加|修改|删除|实现|开发|编写|创建|初始化/.test(q)) { return 'generate'; } if (/检查|review|lint|合规|违规|guard|规[则范]/.test(q)) { return 'lint'; } if (/什么是|怎么[用做]|原理|explain|学习|理解|为什么/.test(q)) { return 'learning'; } return 'search'; } // ── Internal Helpers ──────────────────────────────── /** * Expand query tokens with cross-language synonyms. * Tokenizes query, looks up each token in the synonym table, * returns a query string of synonym expansions for cross-language matching. * * Strategy: per-token cross-script expansion. Each token's script is checked * individually, and only synonyms in the OPPOSITE script are added. * This correctly handles mixed EN/CJK queries (e.g. "在 module 里用 singleton") * where both EN→CJK and CJK→EN expansions are needed. 
*/ function expandWithSynonyms(query) { const tokens = tokenize(query); const crossScriptTerms = new Set(); const CJK_RE = /[\u4e00-\u9fff\u3400-\u4dbf]/; for (const token of tokens) { const synonyms = SYNONYM_LOOKUP.get(token.toLowerCase()); if (!synonyms) { continue; } // Determine THIS token's script, not the whole query's const tokenIsCJK = CJK_RE.test(token); for (const syn of synonyms) { const synIsCJK = CJK_RE.test(syn); // Cross-script: EN token → add CJK synonyms; CJK token → add EN synonyms if (tokenIsCJK !== synIsCJK) { crossScriptTerms.add(syn); } } } if (crossScriptTerms.size === 0) { return null; } return [...crossScriptTerms].slice(0, 16).join(' '); } function buildPrefixPattern(prefixes) { if (prefixes.length === 0) { return null; } const sorted = [...prefixes].sort((a, b) => b.length - a.length); const escaped = sorted.map((p) => p.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')); return new RegExp(`\\b(?:${escaped.join('|')})\\w{2,}\\b`, 'g'); }