vexify
Portable vector database with in-process ONNX embeddings. Zero-config semantic search via SQLite. No external servers required.

702 lines (587 loc) 23 kB
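The script below is a thin CLI wrapper over the package's store API. For orientation, here is a minimal programmatic sketch. It assumes the published entry point re-exports the same VecStoreFactory and getConfig that this script pulls from ../index; the database path and document are placeholders:

  const { VecStoreFactory, getConfig } = require('vexify');

  async function demo() {
    // getConfig fills in defaults (model, topK, ...) around the overrides we pass.
    const config = getConfig({ dbPath: './demo.db' });
    const vecStore = await VecStoreFactory.create(config);
    await vecStore.addDocument('doc1', 'Hello world');
    const results = await vecStore.query('greeting', 5);
    for (const r of results) {
      console.log(r.id, r.score.toFixed(4), r.content.substring(0, 80));
    }
  }

  demo().catch(err => { console.error(err); process.exit(1); });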
#!/usr/bin/env node
'use strict';

const { VecStoreFactory, FolderSync, getConfig, processors, WebCrawler, Updater } = require('../index');
const { GoogleDriveCrawler } = require('../crawlers/gdrive');
const { CodeCrawler } = require('../crawlers/code');
const { MCPServer } = require('../mcp/server');
const fs = require('fs');
const path = require('path');

const args = process.argv.slice(2);
const command = args[0];

// Returns the value for `--flag value` or `--flag=value`, or null if the flag is absent.
function getArgValue(flagName) {
  const idx = args.findIndex(arg => arg === flagName || arg.startsWith(`${flagName}=`));
  if (idx === -1) return null;
  const arg = args[idx];
  if (arg.includes('=')) {
    return arg.split('=')[1];
  }
  return args[idx + 1];
}

async function init() {
  const config = getConfig({ dbPath: args[1], modelName: args[2] });
  console.error(`Initializing vecstore with database: ${config.dbPath}`);
  console.error(`Using model: ${config.modelName}`);
  const vecStore = await VecStoreFactory.create(config);
  console.error('✓ VecStore initialized successfully');
  process.exit(0);
}

async function add() {
  if (args.length < 4) {
    console.error('Usage: vexify add <db-path> <id> <text> [model]');
    process.exit(1);
  }
  const id = args[2];
  const text = args[3];
  const config = getConfig({ dbPath: args[1], modelName: args[4] });
  const vecStore = await VecStoreFactory.create(config);
  await vecStore.addDocument(id, text);
  console.error(`✓ Added document: ${id}`);
  process.exit(0);
}

async function query() {
  if (args.length < 3) {
    console.error('Usage: vexify query <db-path> <query-text> [topK] [model]');
    process.exit(1);
  }
  const queryText = args[2];
  const config = getConfig({ dbPath: args[1], topK: parseInt(args[3]), modelName: args[4] });
  const vecStore = await VecStoreFactory.create(config);
  const results = await vecStore.query(queryText, config.topK);
  console.error(`\nTop ${config.topK} results:\n`);
  results.forEach((result, i) => {
    console.error(`${i + 1}. [${result.id}] (score: ${result.score.toFixed(4)})`);
    if (result.metadata?.crawlUrl) {
      console.error(`   URL: ${result.metadata.crawlUrl}`);
    }
    if (result.metadata?.pageNumber) {
      console.error(`   Page: ${result.metadata.pageNumber}/${result.metadata.totalPages || '?'}`);
    }
    console.error(`   ${result.content.substring(0, 100)}${result.content.length > 100 ? '...' : ''}\n`);
  });
  process.exit(0);
}
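
// Example invocations (mirroring the Examples section in help() below):
//   npx vexify add ./mydb.db doc1 "Hello world"
//   npx vexify query ./mydb.db "greeting" 5
// query() prints each match with its score, any crawl URL / page-number
// metadata, and a 100-character preview of the content.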

function listProcessors() {
  const extensions = processors.getAllExtensions();
  console.error('\nSupported file formats:\n');
  const groups = {
    'Documents': ['.pdf', '.docx', '.doc', '.txt', '.text'],
    'Web': ['.html', '.htm'],
    'Data': ['.json', '.jsonl', '.csv', '.xlsx', '.xls']
  };
  Object.entries(groups).forEach(([name, exts]) => {
    const available = exts.filter(e => extensions.includes(e));
    if (available.length > 0) {
      console.error(`  ${name}:`, available.join(', '));
    }
  });
  console.error('\nTotal:', extensions.length, 'formats supported');
  console.error('\nAll extensions:', extensions.join(', '));
  process.exit(0);
}

async function syncFolder() {
  if (args.length < 3) {
    console.error('Usage: vexify sync <db-path> <folder-path> [model] [--extensions .pdf,.txt] [--no-recursive]');
    process.exit(1);
  }
  const folderPath = args[2];
  const extensionsArgIndex = args.findIndex(arg => arg.startsWith('--extensions'));
  let extensions = undefined;
  let extensionsValueArg = null;
  if (extensionsArgIndex !== -1) {
    const extensionsArg = args[extensionsArgIndex];
    if (extensionsArg.includes('=')) {
      extensions = extensionsArg.split('=')[1].split(',').map(e => e.trim());
    } else if (extensionsArgIndex + 1 < args.length) {
      extensionsValueArg = args[extensionsArgIndex + 1];
      extensions = extensionsValueArg.split(',').map(e => e.trim());
    }
  }
  // The model name is the first bare argument that is not the command, the
  // db path, the folder path, or the raw value passed to --extensions.
  const modelName = args.find(arg =>
    !arg.startsWith('--') &&
    arg !== args[0] &&
    arg !== args[1] &&
    arg !== folderPath &&
    arg !== extensionsValueArg
  );
  const recursive = args.includes('--no-recursive') ? false : undefined;
  const config = getConfig({ dbPath: args[1], modelName, extensions, recursive });
  if (!fs.existsSync(folderPath)) {
    console.error(`Error: Folder not found: ${folderPath}`);
    process.exit(1);
  }
  const vecStore = await VecStoreFactory.create(config);
  const folderSync = new FolderSync(vecStore, config);
  console.error(`Syncing folder: ${folderPath}`);
  console.error(`Extensions: ${config.extensions ? config.extensions.join(', ') : 'all supported'}`);
  console.error(`Recursive: ${config.recursive}\n`);
  const results = await folderSync.sync(folderPath);
  console.error(`\n✓ Sync completed:`);
  console.error(`  Added: ${results.added} documents`);
  console.error(`  Skipped: ${results.skipped} duplicates`);
  console.error(`  Removed: ${results.removed} files`);
  if (results.errors.length > 0) {
    console.error(`\n⚠ Errors (${results.errors.length}):`);
    results.errors.forEach(err => {
      console.error(`  - ${err.file}: ${err.error}`);
    });
  }
  process.exit(0);
}
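
// Example:
//   npx vexify sync ./mydb.db ./docs --extensions .pdf,.docx
// sync reports added documents, skipped duplicates, and removed files, so
// repeated runs against the same folder behave incrementally.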

async function crawl() {
  if (args.length < 2) {
    console.error('Usage: vexify crawl <url> [output-dir] [--max-pages N] [--max-depth N] [--db-path path] [--model name]');
    process.exit(1);
  }
  const url = args[1];
  // The output directory is the first bare argument after the URL, skipping
  // any value that belongs to a preceding `--flag value` pair.
  const outputDirArg = args.find((arg, i) =>
    i > 1 &&
    !arg.startsWith('--') &&
    !(args[i - 1].startsWith('--') && !args[i - 1].includes('='))
  );
  const outputDir = outputDirArg || `./${new URL(url).hostname.replace(/^www\./, '')}`;
  const maxPages = parseInt(getArgValue('--max-pages')) || 10000;
  const maxDepth = parseInt(getArgValue('--max-depth')) || 3;
  const dbPath = getArgValue('--db-path') || `${outputDir}.db`;
  const modelName = getArgValue('--model') || 'embeddinggemma';
  const concurrency = parseInt(getArgValue('--concurrency')) || 3;
  const stateFile = `${outputDir}/.crawl-state.json`;
  const crawler = new WebCrawler({ maxPages, maxDepth, concurrency, stateFile });
  console.error(`Crawling site: ${url}`);
  console.error(`Output directory: ${outputDir}`);
  console.error(`Database: ${dbPath}`);
  console.error(`Max pages: ${maxPages}, Max depth: ${maxDepth}\n`);
  let vecStore = null;
  let indexed = { added: 0, skipped: 0 };
  const config = getConfig({ dbPath, modelName });
  vecStore = await VecStoreFactory.create(config);
  const { TextDeduplicator } = require('../processors/dedup');
  const dedup = new TextDeduplicator(50, 2);
  const analysisPages = [];
  const analysisBatchSize = 5;
  const urlHashMap = vecStore ? await vecStore.store.getCrawledUrlsWithHash() : new Map();
  let updated = 0;
  const onPageCrawled = async (page) => {
    const processors = require('../processors');
    const ext = require('path').extname(page.path).toLowerCase();
    const ProcessorClass = processors.getProcessor(ext);
    if (!ProcessorClass) {
      console.error(`  ⚠ No processor for ${ext}`);
      return;
    }
    const processor = new ProcessorClass();
    let documents;
    try {
      documents = await processor.process(page.path, { url: page.url });
    } catch (error) {
      console.error(`  ⚠ Failed to process ${page.path}: ${error.message}`);
      return;
    }
    if (ext === '.html' || ext === '.htm') {
      if (analysisPages.length < analysisBatchSize) {
        analysisPages.push(...documents.map(d => d.content));
        if (analysisPages.length >= analysisBatchSize) {
          const commonCount = dedup.analyzeDocuments(analysisPages);
          console.error(`  Analyzed ${analysisPages.length} pages, found ${commonCount} common text patterns\n`);
        }
      }
    }
    for (const doc of documents) {
      if (ext === '.html' || ext === '.htm') {
        if (dedup.analyzed) {
          doc.content = dedup.deduplicate(doc.content);
        }
      }
      doc.metadata.source = 'crawl';
      if (page.url) {
        doc.metadata.crawlUrl = page.url;
      }
      // Re-index pages whose stored content hash no longer matches.
      const oldHash = urlHashMap.get(page.url);
      const newHash = doc.metadata.contentHash;
      if (oldHash && newHash && oldHash !== newHash) {
        const deleted = await vecStore.store.deleteByCrawlUrl(page.url);
        if (deleted > 0) {
          updated++;
        }
      }
      const result = await vecStore.addDocument(doc.id, doc.content, doc.metadata);
      if (result.skipped) {
        indexed.skipped++;
      } else {
        indexed.added++;
      }
    }
  };
  const results = await crawler.crawlSite(url, outputDir, vecStore, onPageCrawled);
  console.error(`\n✓ Site crawl completed:`);
  console.error(`  Pages: ${results.pages.length}`);
  console.error(`  Files: ${results.files.length}`);
  console.error(`  Errors: ${results.errors.length}`);
  console.error(`✓ Indexed: ${indexed.added} documents, Skipped: ${indexed.skipped} duplicates, Updated: ${updated} changed pages`);
  process.exit(0);
}
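
// Notes on crawl():
// - Crawl state lives in <output-dir>/.crawl-state.json, so rerunning the
//   same command resumes an interrupted crawl.
// - The first analysisBatchSize (5) HTML pages seed the TextDeduplicator,
//   which then strips boilerplate shared across pages.
// - A page whose stored contentHash differs from the fresh hash has its old
//   rows deleted via deleteByCrawlUrl and is counted as updated.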

async function update() {
  if (args.length < 2) {
    console.error('Usage: vexify update <db-path> [model]');
    process.exit(1);
  }
  const config = getConfig({ dbPath: args[1], modelName: args[2] });
  const vecStore = await VecStoreFactory.create(config);
  const updater = new Updater(vecStore);
  console.error(`Updating embeddings to version ${vecStore.version}...`);
  const results = await updater.updateAll();
  console.error(`\n✓ Update completed:`);
  console.error(`  Checked: ${results.checked} documents`);
  console.error(`  Reprocessed: ${results.reprocessed} documents`);
  if (results.errors.length > 0) {
    console.error(`\n⚠ Errors (${results.errors.length}):`);
    results.errors.forEach(err => {
      console.error(`  - ${err.id}: ${err.error}`);
    });
  }
  process.exit(0);
}

async function syncGdrive() {
  if (args.length < 3) {
    console.error('Usage: vexify gdrive <db-path> <folder-id> [options]');
    console.error('\nOptions:');
    console.error('  --service-account <path>  Path to service account JSON');
    console.error('  --impersonate <email>     Email to impersonate (with service account)');
    console.error('  --client-secret <path>    Path to OAuth client secret JSON');
    console.error('  --max-files <N>           Maximum files to process (default: 1000)');
    console.error('  --model <name>            Embedding model (default: nomic-embed-text)');
    console.error('  --incremental             Process 1 file at a time, resume on next call');
    process.exit(1);
  }
  const dbPath = args[1];
  const folderId = args[2] || 'root';
  const options = {};
  for (let i = 3; i < args.length; i++) {
    if (args[i] === '--service-account') {
      options.serviceAccountPath = args[++i];
    } else if (args[i] === '--impersonate') {
      options.impersonateEmail = args[++i];
    } else if (args[i] === '--client-secret') {
      options.clientSecretPath = args[++i];
    } else if (args[i] === '--max-files') {
      options.maxFiles = parseInt(args[++i]);
    } else if (args[i] === '--model') {
      options.modelName = args[++i];
    } else if (args[i] === '--incremental') {
      options.incrementalMode = true;
      options.maxFiles = 1;
    }
  }
  if (!options.serviceAccountPath && !options.clientSecretPath) {
    console.error('\nError: Must provide either --service-account or --client-secret');
    process.exit(1);
  }
  const config = getConfig({ dbPath, modelName: options.modelName });
  const vecStore = await VecStoreFactory.create(config);
  const crawler = new GoogleDriveCrawler(options);
  console.error('\n📂 Google Drive Sync');
  console.error(`Database: ${dbPath}`);
  console.error(`Folder ID: ${folderId}`);
  if (options.impersonateEmail) {
    console.error(`Impersonating: ${options.impersonateEmail}`);
  }
  console.error(`Max files: ${options.maxFiles || 1000}\n`);
  const results = await crawler.crawl(folderId, vecStore);
  console.error(`\n✓ Google Drive sync completed:`);
  console.error(`  New: ${results.processed} files`);
  console.error(`  Updated: ${results.updated} files`);
  console.error(`  Skipped (unchanged): ${results.skipped} files`);
  console.error(`  Deleted: ${results.deleted} files`);
  console.error(`  Errors: ${results.errors.length}`);
  if (results.errors.length > 0) {
    console.error(`\n⚠ Errors:`);
    results.errors.slice(0, 5).forEach(err => {
      console.error(`  - ${err.file || err.folderId}: ${err.error}`);
    });
    if (results.errors.length > 5) {
      console.error(`  ... and ${results.errors.length - 5} more`);
    }
  }
  process.exit(0);
}
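
// Example:
//   npx vexify gdrive ./mydb.db root --service-account ./sa.json --impersonate user@domain.com
// With --incremental, maxFiles is forced to 1 so a scheduler can drain a
// large Drive folder one file per invocation, resuming on the next call.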

async function crawlCode() {
  const rootPath = args[1];
  if (!rootPath) {
    console.error('Usage: vexify code <directory-path> [options]');
    console.error('');
    console.error('Options:');
    console.error('  --db-path <path>     Database file (default: ./code.db)');
    console.error('  --model <name>       Embedding model (default: unclemusclez/jina-embeddings-v2-base-code)');
    console.error('  --max-depth <N>      Maximum directory depth (default: 10)');
    console.error('  --max-size <MB>      Maximum file size in MB (default: 1)');
    console.error('  --include-binary     Include binary files');
    console.error('  --ignore <pattern>   Additional ignore pattern (can be used multiple times)');
    process.exit(1);
  }

  // Parse options
  let dbPath = './code.db';
  let modelName = 'unclemusclez/jina-embeddings-v2-base-code';
  let maxDepth = 10;
  let maxSizeMB = 1;
  let includeBinary = false;
  let customIgnorePatterns = [];
  for (let i = 2; i < args.length; i++) {
    switch (args[i]) {
      case '--db-path': dbPath = args[++i]; break;
      case '--model': modelName = args[++i]; break;
      case '--max-depth': maxDepth = parseInt(args[++i]); break;
      case '--max-size': maxSizeMB = parseInt(args[++i]); break;
      case '--include-binary': includeBinary = true; break;
      case '--ignore': customIgnorePatterns.push(args[++i]); break;
      default:
        console.error(`Unknown option: ${args[i]}`);
        process.exit(1);
    }
  }

  // Validate directory exists
  if (!fs.existsSync(rootPath)) {
    console.error(`Directory not found: ${rootPath}`);
    process.exit(1);
  }
  const config = getConfig({ dbPath, modelName });
  const vecStore = await VecStoreFactory.create(config);
  console.error(`🔍 Code repository crawler`);
  console.error(`Directory: ${rootPath}`);
  console.error(`Database: ${dbPath}`);
  console.error(`Model: ${modelName}`);
  console.error(`Max depth: ${maxDepth}`);
  console.error(`Max file size: ${maxSizeMB}MB`);
  if (customIgnorePatterns.length > 0) {
    console.error(`Custom ignore patterns: ${customIgnorePatterns.join(', ')}`);
  }
  console.error('');
  const crawler = new CodeCrawler({
    rootPath,
    maxDepth,
    maxFileSize: maxSizeMB * 1024 * 1024,
    includeBinary,
    customIgnorePatterns
  });
  let indexed = { added: 0, skipped: 0 };
  const onPageCrawled = async (doc) => {
    const result = await vecStore.addDocument(doc.id, doc.content, doc.metadata);
    if (result.skipped) {
      indexed.skipped++;
    } else {
      indexed.added++;
    }
  };
  const results = await crawler.crawl(vecStore, onPageCrawled);
  console.error(`\n✓ Code repository crawl completed:`);
  console.error(`  Files indexed: ${indexed.added}`);
  console.error(`  Duplicates skipped: ${indexed.skipped}`);
  console.error(`  Total files processed: ${results.stats.totalFiles}`);
  console.error(`  Errors: ${results.stats.errors}`);
  process.exit(0);
}
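
// Example:
//   npx vexify code ./my-project --max-size 5 --ignore "*.test.js"
// The default model here is code-tuned (unclemusclez/jina-embeddings-v2-base-code);
// CodeCrawler applies its built-in ignore patterns plus any --ignore globs.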

async function startMcpServer() {
  const options = { silent: true };

  // Parse MCP-specific options (all optional now)
  for (let i = 1; i < args.length; i++) {
    if (args[i] === '--db-path') {
      options.dbPath = args[++i];
    } else if (args[i] === '--directory') {
      options.directory = args[++i];
    } else if (args[i] === '--model') {
      options.modelName = args[++i];
    } else if (args[i] === '--verbose') {
      options.silent = false;
    }
  }

  const server = new MCPServer(options);
  try {
    await server.start();
  } catch (error) {
    if (error.message.includes('better_sqlite3.node') || error.message.includes('bindings file')) {
      console.error('Detected missing better-sqlite3 build, attempting auto-fix...');
      try {
        const { execSync } = require('child_process');
        const installScript = path.join(__dirname, '..', 'install.js');
        if (fs.existsSync(installScript)) {
          execSync(`node "${installScript}"`, { stdio: 'inherit' });
          console.error('Auto-fix completed, retrying MCP server start...');
          delete require.cache[require.resolve('better-sqlite3')];
          await server.start();
          return;
        }
      } catch (fixError) {
        console.error('Auto-fix encountered an error:', fixError.message);
        console.error('\nPlease rebuild better-sqlite3 manually:');
        console.error('  cd node_modules/better-sqlite3 && npm run build-release');
        process.exit(1);
      }
    }
    if (error.message.includes('sqlite-vec') && error.message.includes('not found')) {
      console.error('MCP server error:', error.message);
      console.error('\nMissing platform-specific sqlite-vec extension.');
      console.error('Please install the correct package for your platform:');
      console.error('  Windows:       npm install sqlite-vec-windows-x64');
      console.error('  macOS (Intel): npm install sqlite-vec-darwin-x64');
      console.error('  macOS (ARM):   npm install sqlite-vec-darwin-arm64');
      console.error('  Linux:         npm install sqlite-vec-linux-x64');
      process.exit(1);
    }
    console.error('MCP server error:', error.message);
    process.exit(1);
  }
}
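
// startMcpServer() degrades gracefully on two native-dependency failures:
// a missing better-sqlite3 build triggers a rebuild via install.js and one
// retry, and a missing sqlite-vec platform package is reported with the
// exact npm install command to run.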

function help() {
  console.error(`
vexify - Portable vector database with auto-installing Ollama

Usage: npx vexify <command> [options]

Commands:
  init <db-path> [model]                       Initialize a new vector store
  add <db-path> <id> <text> [model]            Add a document
  query <db-path> <query> [topK] [model]       Query the vector store
  sync <db-path> <folder-path> [model] [opts]  Sync folder with database
  crawl <url> [output-dir] [opts]              Crawl site with automatic indexing
  code <directory-path> [opts]                 Index code repository with smart ignore patterns
  gdrive <db-path> <folder-id> [opts]          Sync Google Drive folder
  update <db-path> [model]                     Re-embed old documents with new version
  mcp [options]                                Start MCP server for agent integration
                                               (syncs before each search, uses current
                                               directory and ./.vexify.db by default)
  processors                                   List supported file formats
  help                                         Show this help message

Sync Options:
  --extensions .pdf,.txt    File extensions to process (default: all supported)
  --no-recursive            Don't scan subfolders

Crawl Options:
  --max-pages N             Maximum pages to crawl (default: 10000)
  --max-depth N             Maximum link depth (default: 3)
  --db-path path            Database path (default: <output-dir>.db)
  --model name              Embedding model (default: embeddinggemma)
  --concurrency N           Parallel browser instances (default: 3)

Code Options:
  --db-path <path>          Database file (default: ./code.db)
  --model <name>            Embedding model (default: unclemusclez/jina-embeddings-v2-base-code)
  --max-depth <N>           Maximum directory depth (default: 10)
  --max-size <MB>           Maximum file size in MB (default: 1)
  --include-binary          Include binary files
  --ignore <pattern>        Additional ignore pattern

Google Drive Options:
  --service-account <path>  Service account JSON (domain-wide delegation)
  --impersonate <email>     Email to impersonate (requires service account)
  --client-secret <path>    OAuth client secret JSON (user login)
  --max-files N             Maximum files to process (default: 1000)
  --model name              Embedding model (default: nomic-embed-text)

MCP Server Options:
  --db-path <path>          Database file (default: ./.vexify.db)
  --directory <path>        Directory to index/search (default: current directory)
  --model <name>            Embedding model (default: unclemusclez/jina-embeddings-v2-base-code)

Crawl Features:
  ✓ Automatic resume on Ctrl+C - state saved to .crawl-state.json
  ✓ Auto-indexes to database by default
  ✓ Cloudflare bypass with stealth mode
  ✓ Text deduplication (removes common boilerplate)
  ✓ Skips already-crawled URLs from database

Supported Formats:
  Documents: .pdf, .docx, .doc, .txt
  Web:       .html, .htm
  Data:      .json, .jsonl, .csv, .xlsx, .xls

Examples:
  npx vexify init ./mydb.db
  npx vexify add ./mydb.db doc1 "Hello world"
  npx vexify query ./mydb.db "greeting" 5
  npx vexify sync ./mydb.db ./docs
  npx vexify sync ./mydb.db ./docs --extensions .pdf,.docx
  npx vexify crawl https://example.com
  npx vexify crawl https://example.com --max-pages=5000
  npx vexify code ./my-project --model unclemusclez/jina-embeddings-v2-base-code
  npx vexify code ./my-project --max-size 5 --ignore "*.test.js" --ignore "coverage/**"
  npx vexify gdrive ./mydb.db root --service-account ./sa.json --impersonate user@domain.com
  npx vexify gdrive ./mydb.db 1ABC_folderID --client-secret ./oauth.json
  npx vexify update ./mydb.db
  npx vexify mcp
  npx vexify mcp --directory ./my-project --db-path ./project.db
  npx vexify mcp --directory ~/docstudio --model nomic-embed-text
  npx vexify processors

Default model: nomic-embed-text (via Ollama) - fast, cross-platform (x86, ARM, Apple Silicon)
Ollama auto-installs to node_modules/.ollama/ if not available

Note: Crawl automatically resumes if interrupted - just rerun the same command
`);
  process.exit(0);
}

async function main() {
  try {
    switch (command) {
      case 'init': await init(); break;
      case 'add': await add(); break;
      case 'query': await query(); break;
      case 'sync': await syncFolder(); break;
      case 'code': await crawlCode(); break;
      case 'crawl': await crawl(); break;
      case 'gdrive': await syncGdrive(); break;
      case 'update': await update(); break;
      case 'mcp': await startMcpServer(); break;
      case 'processors': listProcessors(); break;
      case 'help':
      case '--help':
      case '-h':
      case undefined:
        help();
        break;
      default:
        console.error(`Unknown command: ${command}`);
        help();
    }
  } catch (error) {
    console.error('Error:', error.message);
    process.exit(1);
  }
}

main();
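
The mcp command is meant to be launched by an MCP client; notice the script logs everything to stderr, keeping stdout free for the protocol stream. A typical client registration is sketched below, assuming an mcpServers-style client config (such as Claude Desktop's); the server name "vexify" and the paths are placeholders:

  {
    "mcpServers": {
      "vexify": {
        "command": "npx",
        "args": ["vexify", "mcp", "--directory", "./my-project", "--db-path", "./project.db"]
      }
    }
  }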