@shirokuma-library/mcp-knowledge-base
Shirokuma MCP Server for comprehensive knowledge management, covering issues, plans, documents, and work sessions. All stored data is structured for AI processing rather than for human readability.
JavaScript
import Database from 'better-sqlite3';
import { createHash } from 'crypto';
import { readFileSync, existsSync, statSync, mkdirSync } from 'fs';
import { join, extname, dirname, isAbsolute } from 'path';
import { execSync } from 'child_process';
import { minimatch } from 'minimatch';
import { pipeline } from '@xenova/transformers';
import { config as appConfig } from '../config.js';
const DEFAULT_CONFIG = {
indexPath: join(appConfig.database.path, 'index.db'),
maxFileSize: 10 * 1024 * 1024,
chunkSize: 30,
extensions: [
'.js', '.ts', '.jsx', '.tsx', '.mjs', '.cjs',
'.py', '.java', '.cpp', '.c', '.h', '.go', '.rs', '.rb', '.php',
'.html', '.xml', '.yaml', '.yml', '.json', '.toml',
'.md', '.txt', '.rst',
'.env.example', '.gitignore', 'Dockerfile', 'Makefile'
],
ignorePatterns: [
'**/.git/**',
'**/node_modules/**',
'**/.shirokuma/**',
'*.min.js',
'*.min.css',
'**/*.map',
'**/.env',
'**/*.key',
'**/*.pem',
'**/dist/**',
'**/build/**',
'**/coverage/**'
]
};
export class FileIndexer {
db;
dbPath;
config;
embedder;
initialized = false;
constructor(config) {
this.config = {
...DEFAULT_CONFIG,
...config,
extensions: config.extensions || DEFAULT_CONFIG.extensions,
ignorePatterns: config.ignorePatterns || DEFAULT_CONFIG.ignorePatterns
};
const dbPath = isAbsolute(this.config.indexPath)
? this.config.indexPath
: join(this.config.projectRoot, this.config.indexPath);
const indexDir = dirname(dbPath);
if (!existsSync(indexDir)) {
mkdirSync(indexDir, { recursive: true });
}
this.db = new Database(dbPath);
this.dbPath = dbPath;
this.initSchema();
}
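// SQLite schema: one row per indexed file, plus per-chunk rows that store the chunk text and its JSON-serialized embedding.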
initSchema() {
this.db.exec('PRAGMA foreign_keys = ON');
this.db.exec(`
-- Files table
CREATE TABLE IF NOT EXISTS files (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT UNIQUE NOT NULL,
file_hash TEXT NOT NULL,
total_chunks INTEGER NOT NULL DEFAULT 0,
file_size INTEGER NOT NULL DEFAULT 0,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
-- Chunks table
CREATE TABLE IF NOT EXISTS file_chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL,
chunk_index INTEGER NOT NULL,
start_line INTEGER NOT NULL,
end_line INTEGER NOT NULL,
content TEXT NOT NULL,
embedding TEXT NOT NULL,
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE,
UNIQUE(file_id, chunk_index)
);
-- Indexes for performance
CREATE INDEX IF NOT EXISTS idx_files_path ON files(file_path);
CREATE INDEX IF NOT EXISTS idx_files_hash ON files(file_hash);
CREATE INDEX IF NOT EXISTS idx_chunks_file_id ON file_chunks(file_id);
`);
}
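// Lazily load the embedding model (Xenova/all-MiniLM-L6-v2) the first time it is needed.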
async initialize() {
if (this.initialized) {
return;
}
this.embedder = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
this.initialized = true;
}
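// Enumerate candidate files via `git ls-files`; only git-tracked files are indexed.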
getGitFiles() {
try {
const output = execSync('git ls-files', {
cwd: this.config.projectRoot,
encoding: 'utf-8'
});
return output
.split('\n')
.filter(file => file.length > 0)
.filter(file => this.shouldIndexFile(file));
}
catch (error) {
throw new Error('Not a git repository. File indexing works with git-managed files only.');
}
}
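// Filter by the extension allow-list (extension-less files such as Makefile pass this check), the built-in ignore globs, and any .shirokumaignore patterns (a leading '!' re-includes a path).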
shouldIndexFile(filePath) {
const ext = extname(filePath);
if (ext && !this.config.extensions.includes(ext)) {
const fileName = filePath.split('/').pop() || '';
if (!this.config.extensions.includes(fileName)) {
return false;
}
}
for (const pattern of this.config.ignorePatterns) {
if (minimatch(filePath, pattern)) {
return false;
}
}
const ignorePath = join(this.config.projectRoot, '.shirokumaignore');
if (existsSync(ignorePath)) {
const ignorePatterns = readFileSync(ignorePath, 'utf-8')
.split('\n')
.filter(line => line.trim() && !line.startsWith('#'));
for (const pattern of ignorePatterns) {
if (pattern.startsWith('!')) {
if (minimatch(filePath, pattern.slice(1))) {
return true;
}
}
else {
if (minimatch(filePath, pattern)) {
return false;
}
}
}
}
return true;
}
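// Short SHA-256 prefix (16 hex characters) used to detect unchanged files.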
getFileHash(content) {
return createHash('sha256').update(content).digest('hex').substring(0, 16);
}
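// Split the file into fixed-size line windows of chunkSize lines; near-empty chunks are skipped.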
chunkFile(content, filePath) {
const lines = content.split('\n');
const chunks = [];
for (let i = 0; i < lines.length; i += this.config.chunkSize) {
const chunkLines = lines.slice(i, i + this.config.chunkSize);
const chunkContent = chunkLines.join('\n').trim();
if (chunkContent.length > 10) {
chunks.push({
filePath,
chunkIndex: Math.floor(i / this.config.chunkSize),
startLine: i,
endLine: Math.min(i + this.config.chunkSize, lines.length),
content: chunkContent
});
}
}
return chunks;
}
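// Produce a mean-pooled, normalized embedding vector for a chunk of text.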
async generateEmbedding(text) {
if (!this.embedder) {
throw new Error('Embedder not initialized');
}
const output = await this.embedder(text, {
pooling: 'mean',
normalize: true
});
return Array.from(output.data);
}
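// Index a single file: skip it if the content hash is unchanged, otherwise replace its row and chunks in one transaction.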
async indexFile(filePath) {
const fullPath = join(this.config.projectRoot, filePath);
const stats = existsSync(fullPath) && statSync(fullPath);
if (!stats || stats.size > this.config.maxFileSize) {
return;
}
const content = readFileSync(fullPath, 'utf-8');
const fileHash = this.getFileHash(content);
const existing = this.db.prepare('SELECT id, file_hash FROM files WHERE file_path = ? LIMIT 1').get(filePath);
if (existing && existing.file_hash === fileHash) {
return;
}
const chunks = this.chunkFile(content, filePath);
const chunksWithEmbeddings = await Promise.all(chunks.map(async (chunk) => ({
...chunk,
embedding: await this.generateEmbedding(chunk.content)
})));
this.db.transaction(() => {
let fileId;
if (existing) {
this.db.prepare('UPDATE files SET file_hash = ?, total_chunks = ?, file_size = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?').run(fileHash, chunks.length, stats.size, existing.id);
fileId = existing.id;
this.db.prepare('DELETE FROM file_chunks WHERE file_id = ?').run(fileId);
}
else {
const result = this.db.prepare('INSERT INTO files (file_path, file_hash, total_chunks, file_size) VALUES (?, ?, ?, ?)').run(filePath, fileHash, chunks.length, stats.size);
fileId = result.lastInsertRowid;
}
const insertChunk = this.db.prepare(`
INSERT INTO file_chunks
(file_id, chunk_index, start_line, end_line, content, embedding)
VALUES (?, ?, ?, ?, ?, ?)
`);
for (const chunk of chunksWithEmbeddings) {
insertChunk.run(fileId, chunk.chunkIndex, chunk.startLine, chunk.endLine, chunk.content, JSON.stringify(chunk.embedding));
}
})();
}
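// Re-index all git-tracked files, pruning index entries for files that no longer exist.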
async indexAll(onProgress) {
await this.initialize();
const files = this.getGitFiles();
this.cleanupDeletedFiles(files);
for (let i = 0; i < files.length; i++) {
const file = files[i];
if (onProgress) {
onProgress(file, i + 1, files.length);
}
try {
await this.indexFile(file);
}
catch (error) {
console.error(`Error indexing ${file}:`, error);
}
}
}
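// Delete rows for files that are no longer tracked; their chunks go with them via ON DELETE CASCADE.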
cleanupDeletedFiles(currentFiles) {
const currentFilesSet = new Set(currentFiles);
const indexedFiles = this.db.prepare('SELECT id, file_path FROM files').all();
const deleteStmt = this.db.prepare('DELETE FROM files WHERE id = ?');
let deletedCount = 0;
for (const { id, file_path } of indexedFiles) {
if (!currentFilesSet.has(file_path)) {
deleteStmt.run(id);
deletedCount++;
}
}
if (deletedCount > 0) {
console.log(`Cleaned up ${deletedCount} deleted files from index`);
}
}
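// Brute-force semantic search: embed the query, score every stored chunk, and return the best matches above minScore.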
async search(query, options) {
await this.initialize();
const limit = options?.limit ?? 10;
const minScore = options?.minScore ?? 0.3;
const queryEmbedding = await this.generateEmbedding(query);
let sql = `
SELECT
f.file_path,
c.chunk_index,
c.start_line,
c.end_line,
c.content,
c.embedding
FROM file_chunks c
JOIN files f ON c.file_id = f.id
`;
const params = [];
if (options?.fileTypes && options.fileTypes.length > 0) {
sql += ` WHERE ${options.fileTypes.map(() => 'f.file_path LIKE ?').join(' OR ')}`;
params.push(...options.fileTypes.map(type => `%.${type}`));
}
const chunks = this.db.prepare(sql).all(...params);
const results = chunks
.map(chunk => {
const embedding = JSON.parse(chunk.embedding);
const similarity = this.cosineSimilarity(queryEmbedding, embedding);
return {
filePath: chunk.file_path,
chunkIndex: chunk.chunk_index,
startLine: chunk.start_line,
endLine: chunk.end_line,
content: chunk.content,
similarity
};
})
.filter(result => result.similarity >= minScore)
.sort((a, b) => b.similarity - a.similarity)
.slice(0, limit);
return results;
}
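// Embeddings are generated with normalize: true, so the plain dot product already equals the cosine similarity.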
cosineSimilarity(vec1, vec2) {
let dotProduct = 0;
for (let i = 0; i < vec1.length; i++) {
dotProduct += vec1[i] * vec2[i];
}
return dotProduct;
}
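// Counts of indexed files and chunks, plus the on-disk size of the index database.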
getStats() {
const fileStats = this.db.prepare(`
SELECT COUNT(*) as total_files FROM files
`).get();
const chunkStats = this.db.prepare(`
SELECT COUNT(*) as total_chunks FROM file_chunks
`).get();
return {
totalFiles: fileStats.total_files || 0,
totalChunks: chunkStats.total_chunks || 0,
indexSize: existsSync(this.dbPath)
? statSync(this.dbPath).size
: 0
};
}
close() {
this.db.close();
}
}
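
A minimal usage sketch (illustrative only; the projectRoot path, the progress callback, and the output handling are assumptions, and the surrounding MCP server wiring is omitted):

// Index a git-managed project and run a semantic search over it.
const indexer = new FileIndexer({ projectRoot: '/path/to/project' });
// The first call loads the embedding model, then walks the output of `git ls-files`.
await indexer.indexAll((file, current, total) => {
    console.error(`[${current}/${total}] ${file}`);
});
// Query by meaning rather than exact text; restrict results to TypeScript/JavaScript files.
const results = await indexer.search('where is the database schema defined?', {
    limit: 5,
    fileTypes: ['ts', 'js']
});
for (const r of results) {
    console.log(`${r.filePath}:${r.startLine}-${r.endLine} (score ${r.similarity.toFixed(2)})`);
}
console.log(indexer.getStats());
indexer.close();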