autosnippet
Version:
Extract code patterns into a knowledge base for AI coding assistants
381 lines (380 loc) • 12.2 kB
JavaScript
/**
* ASTChunker — 基于 AST 的语法感知代码分块
*
* 利用 web-tree-sitter 按函数/类/方法边界分块:
* - 保持语义完整性 (不在函数/类中间截断)
* - 超大节点递归拆分
* - 自动携带结构元数据 (nodeType, name, startLine, endLine)
*
* 支持语言: JavaScript, TypeScript, Python, Java, Kotlin, Go, Swift,
* Rust, Dart, ObjC (取决于已加载的 tree-sitter grammar)
*
* @module infrastructure/vector/ASTChunker
*/
import { estimateTokens } from '../../shared/token-utils.js';
// Lazily loaded AST bindings (avoids forcing parser initialization at import time).
// All four are populated by ensureParser(); until then isASTChunkerAvailable()
// returns false and chunkByAST() returns null for non-empty input.
// Set by ensureParser() from the analyzer's isAvailable() result.
let _astReady = false;
// AstAnalyzer.parseToTree — used by chunkByAST() to parse source text.
let _parseToTree = null;
// AstAnalyzer.isAvailable — consulted once by ensureParser() to set _astReady.
let _isAvailable = null;
// AstAnalyzer.supportedLanguages — queried by isASTChunkerAvailable().
let _supportedLanguages = null;
/**
 * AST node types that are treated as chunkable top-level units.
 *
 * Each inner array lists the node types contributed by one language grammar
 * (functions, classes, methods, interfaces, ...). Several grammars share node
 * type names, so some entries repeat across groups; the Set collapses them.
 */
const TOP_LEVEL_TYPES = new Set(
  [
    // JavaScript / TypeScript
    [
      'function_declaration',
      'class_declaration',
      'abstract_class_declaration',
      'interface_declaration',
      'type_alias_declaration',
      'enum_declaration',
      'export_statement',
      'lexical_declaration',
      'variable_declaration',
    ],
    // Python
    ['function_definition', 'class_definition', 'decorated_definition'],
    // Java
    [
      'method_declaration',
      'constructor_declaration',
      'field_declaration',
      'class_body_declaration',
    ],
    // Kotlin
    ['function_declaration', 'object_declaration', 'property_declaration'],
    // Go
    [
      'function_declaration',
      'method_declaration',
      'type_declaration',
      'const_declaration',
      'var_declaration',
    ],
    // Swift
    [
      'function_declaration',
      'class_declaration',
      'struct_declaration',
      'protocol_declaration',
      'extension_declaration',
    ],
    // Rust
    [
      'function_item',
      'struct_item',
      'enum_item',
      'trait_item',
      'impl_item',
      'type_item',
      'mod_item',
      'const_item',
      'static_item',
      'macro_definition',
    ],
    // Dart
    [
      'function_definition',
      'class_definition',
      'mixin_declaration',
      'extension_declaration',
      'top_level_definition',
    ],
    // ObjC
    ['class_implementation', 'category_implementation', 'protocol_declaration'],
  ].flat(),
);
/**
 * Maps a language id to the tree-sitter langId used by the AST plugins.
 *
 * The ids returned by LanguageService.inferLang() may not exactly match the
 * langIds the AST plugins register, so aliases are normalized here. Most ids
 * map to themselves; only the Objective-C spellings are rewritten.
 */
const LANG_ID_MAP = {
  // Ids that are already valid tree-sitter langIds map to themselves.
  ...Object.fromEntries(
    [
      'javascript',
      'typescript',
      'tsx',
      'python',
      'java',
      'kotlin',
      'go',
      'swift',
      'rust',
      'dart',
      'objectivec',
    ].map((id) => [id, id]),
  ),
  // Alternative spellings of Objective-C.
  'objective-c': 'objectivec',
  objc: 'objectivec',
};
/**
 * Initializes the AST parser bindings on first use.
 *
 * Once a call has succeeded, later calls return immediately. Any failure while
 * importing the AST modules or while probing availability is swallowed and
 * reported as `false`, so callers can fall back to another chunking strategy.
 *
 * @returns {Promise<boolean>} Whether the AST parser is ready to use.
 */
async function ensureParser() {
  if (_astReady) {
    return true;
  }
  try {
    // Importing the plugin index triggers its top-level `await loadPlugins()`.
    await import('../../core/ast/index.js');
    const { parseToTree, isAvailable, supportedLanguages } = await import('../../core/AstAnalyzer.js');
    _parseToTree = parseToTree;
    _isAvailable = isAvailable;
    _supportedLanguages = supportedLanguages;
    const available = _isAvailable?.();
    _astReady = available ?? false;
  }
  catch {
    return false;
  }
  return _astReady;
}
/**
 * Reports whether AST chunking can be used for the given language.
 *
 * Returns `false` until ensureParser() has completed successfully.
 *
 * @param {string} language Language id as returned by LanguageService.inferLang().
 * @returns {boolean} True when the resolved tree-sitter langId is in the supported list.
 */
export function isASTChunkerAvailable(language) {
  const parserUsable = _astReady && _supportedLanguages;
  if (!parserUsable) {
    return false;
  }
  // Normalize aliases (e.g. "objc"); unknown ids are passed through unchanged.
  const resolvedId = LANG_ID_MAP[language] || language;
  return _supportedLanguages().includes(resolvedId);
}
/**
 * Splits source code into chunks along AST node boundaries.
 *
 * Strategy:
 *  1. Parse the source into an AST.
 *  2. Walk the root node's direct children and pick out top-level declarations
 *     (any node whose type is in TOP_LEVEL_TYPES).
 *  3. A declaration within `maxChunkTokens` becomes a chunk of its own.
 *  4. A larger declaration is split further by splitLargeNode().
 *  5. Consecutive non-declaration nodes (imports, comments, ...) are merged into
 *     one "preamble" chunk, or an "epilogue" chunk when they trail the last
 *     declaration. These chunks are not size-limited.
 *
 * Every chunk carries `chunkIndex`, `totalChunks` and `chunkStrategy: 'ast'`.
 * Preamble and epilogue chunks report the 1-based line range of the nodes they
 * were built from.
 *
 * @param {string} content Source code.
 * @param {string} language Language id (from LanguageService.inferLang).
 * @param {Object} [metadata] Base metadata copied into every chunk.
 * @param {{maxChunkTokens?: number}} [options] Chunking options (default limit: 512 tokens).
 * @returns {Array<{content: string, metadata: Object}>|null} The chunks; `[]` for
 *   empty/blank content; `null` when AST chunking is unavailable or produced
 *   nothing, in which case the caller should fall back to another strategy.
 */
export function chunkByAST(content, language, metadata = {}, options = {}) {
  // `?? {}` keeps an explicit `null` options argument from throwing here.
  const { maxChunkTokens = 512 } = options ?? {};
  if (!content || content.trim().length === 0) {
    return [];
  }
  const langId = LANG_ID_MAP[language] || language;
  if (!_astReady || !_parseToTree) {
    return null; // Not initialized: the caller should fall back.
  }
  // NOTE(review): `parsed` is never released here; if _parseToTree hands back a
  // tree that needs explicit disposal, confirm who owns its lifetime.
  const parsed = _parseToTree(content, langId);
  if (!parsed?.rootNode) {
    return null;
  }
  const { rootNode } = parsed;
  const chunks = [];
  // Non-declaration nodes seen since the last declaration.
  let pendingNodes = [];

  // Emits the buffered non-declaration nodes as one chunk and clears the buffer.
  const flushPending = (nodeType) => {
    if (pendingNodes.length === 0) {
      return;
    }
    const text = pendingNodes
      .map((pending) => content.slice(pending.startIndex, pending.endIndex))
      .join('\n');
    if (text.trim().length > 0) {
      chunks.push({
        content: text,
        metadata: {
          ...metadata,
          nodeType,
          startLine: pendingNodes[0].startPosition.row + 1,
          endLine: pendingNodes[pendingNodes.length - 1].endPosition.row + 1,
        },
      });
    }
    pendingNodes = [];
  };

  for (let i = 0; i < rootNode.childCount; i++) {
    const child = rootNode.child(i);
    if (!child) {
      continue;
    }
    if (!TOP_LEVEL_TYPES.has(child.type)) {
      pendingNodes.push(child);
      continue;
    }
    // A declaration ends the current run of non-declaration code.
    flushPending('preamble');
    const nodeText = content.slice(child.startIndex, child.endIndex);
    if (estimateTokens(nodeText) <= maxChunkTokens) {
      chunks.push({
        content: nodeText,
        metadata: {
          ...metadata,
          nodeType: child.type,
          name: extractNodeName(child),
          startLine: child.startPosition.row + 1,
          endLine: child.endPosition.row + 1,
        },
      });
    }
    else {
      // Oversized declaration: split along its child-node boundaries.
      chunks.push(...splitLargeNode(child, content, metadata, maxChunkTokens));
    }
  }
  // Whatever trails the last declaration.
  flushPending('epilogue');

  // Nothing chunkable: return null so the caller falls back.
  if (chunks.length === 0) {
    return null;
  }
  chunks.forEach((chunk, index) => {
    chunk.metadata.chunkIndex = index;
    chunk.metadata.totalChunks = chunks.length;
    chunk.metadata.chunkStrategy = 'ast';
  });
  return chunks;
}
/**
 * Recursively splits an oversized AST node into chunks.
 *
 * Consecutive children are grouped until adding the next one would exceed
 * `maxChunkTokens`. A child that is itself over the limit is split recursively;
 * a node without children falls back to line-based splitting.
 *
 * Each group's content is sliced from `source` between its first and last
 * child, so the original whitespace and layout between sibling tokens is kept.
 * Line numbers are 1-based and taken from the group's own first and last
 * child, so `startLine <= endLine` always holds.
 *
 * @param {Object} node tree-sitter AST node to split.
 * @param {string} source Complete source text the node was parsed from.
 * @param {Object} metadata Base metadata copied into every chunk.
 * @param {number} maxChunkTokens Token budget per chunk.
 * @returns {Array<{content: string, metadata: Object}>} Chunks in source order.
 */
function splitLargeNode(node, source, metadata, maxChunkTokens) {
  const parentName = extractNodeName(node);
  // No children to split on: fall back to cutting by lines.
  if (node.childCount === 0) {
    return splitByLines(source.slice(node.startIndex, node.endIndex), metadata, node, parentName, maxChunkTokens);
  }
  const chunks = [];
  // The current group is the contiguous run of children groupFirst..groupLast.
  let groupFirst = null;
  let groupLast = null;
  let groupTokens = 0;

  // Emits the current group (if any) as one chunk and resets the accumulator.
  const flushGroup = (isFinal) => {
    if (!groupFirst) {
      return;
    }
    chunks.push({
      content: source.slice(groupFirst.startIndex, groupLast.endIndex),
      metadata: {
        ...metadata,
        nodeType: node.type,
        name: parentName,
        startLine: groupFirst.startPosition.row + 1,
        endLine: groupLast.endPosition.row + 1,
        // The trailing group counts as a part only if something was emitted before it.
        splitPart: isFinal ? chunks.length > 0 : true,
      },
    });
    groupFirst = null;
    groupLast = null;
    groupTokens = 0;
  };

  for (let i = 0; i < node.childCount; i++) {
    const child = node.child(i);
    if (!child) {
      continue;
    }
    const childTokens = estimateTokens(source.slice(child.startIndex, child.endIndex));
    // An oversized child is split on its own. This includes leaf nodes (for
    // example a very long string or comment), which the recursive call hands
    // to the line-based fallback instead of emitting them over budget.
    if (childTokens > maxChunkTokens) {
      flushGroup(false);
      chunks.push(...splitLargeNode(child, source, metadata, maxChunkTokens));
      continue;
    }
    // Adding this child would exceed the budget: close the current group first.
    if (groupFirst && groupTokens + childTokens > maxChunkTokens) {
      flushGroup(false);
    }
    if (!groupFirst) {
      groupFirst = child;
    }
    groupLast = child;
    groupTokens += childTokens;
  }
  flushGroup(true);
  return chunks;
}
/**
 * Splits text by lines — the last resort when the AST offers no finer boundary.
 *
 * Lines are accumulated until adding the next one would exceed
 * `maxChunkTokens`. A single line that is over the limit on its own is still
 * emitted as one chunk. Each chunk reports the 1-based line range it covers,
 * derived from the node's start row.
 *
 * @param {string} text Source text of `node`.
 * @param {Object} metadata Base metadata copied into every chunk.
 * @param {Object} node tree-sitter node the text belongs to (`type` and `startPosition` are read).
 * @param {string|undefined} parentName Name recorded on every chunk.
 * @param {number} maxChunkTokens Token budget per chunk.
 * @returns {Array<{content: string, metadata: Object}>} Chunks in source order.
 */
function splitByLines(text, metadata, node, parentName, maxChunkTokens) {
  const lines = text.split('\n');
  const firstLine = node.startPosition.row + 1;
  const chunks = [];
  let current = [];
  let currentTokens = 0;
  // Line number (1-based, in the full source) of current[0].
  let currentStartLine = firstLine;

  // Emits the accumulated lines as one chunk.
  const emit = (splitPart) => {
    chunks.push({
      content: current.join('\n'),
      metadata: {
        ...metadata,
        nodeType: node.type,
        name: parentName,
        startLine: currentStartLine,
        endLine: currentStartLine + current.length - 1,
        splitPart,
      },
    });
  };

  lines.forEach((line, offset) => {
    const lineTokens = estimateTokens(line);
    if (current.length > 0 && currentTokens + lineTokens > maxChunkTokens) {
      emit(true);
      current = [];
      currentTokens = 0;
      currentStartLine = firstLine + offset;
    }
    current.push(line);
    currentTokens += lineTokens;
  });
  if (current.length > 0) {
    // The trailing chunk counts as a part only if something was emitted before it.
    emit(chunks.length > 0);
  }
  return chunks;
}
/**
 * Derives a display name for an AST node.
 *
 * Lookup order:
 *  1. the child in the node's `name` field, else the one in its `declarator` field;
 *  2. the first `identifier` / `type_identifier` among the first five children.
 *
 * The returned text is capped at 100 characters.
 *
 * @param {Object} node tree-sitter node.
 * @returns {string|undefined} The name, or `undefined` when none is found.
 */
function extractNodeName(node) {
  const MAX_NAME_LENGTH = 100;
  const MAX_CHILDREN_SCANNED = 5;

  // Preferred source: a child exposed through a named field.
  let fieldNode = node.childForFieldName?.('name');
  if (!fieldNode) {
    fieldNode = node.childForFieldName?.('declarator');
  }
  if (fieldNode) {
    // May be an identifier, an operator, etc.
    return fieldNode.text?.slice(0, MAX_NAME_LENGTH);
  }

  // Fallback: look for an identifier among the leading children.
  const scanLimit = Math.min(node.childCount, MAX_CHILDREN_SCANNED);
  for (let idx = 0; idx < scanLimit; idx += 1) {
    const candidate = node.child(idx);
    const kind = candidate?.type;
    if (kind === 'identifier' || kind === 'type_identifier') {
      return candidate.text?.slice(0, MAX_NAME_LENGTH);
    }
  }
  return undefined;
}
export { ensureParser, TOP_LEVEL_TYPES, LANG_ID_MAP };