vexify
Version:
Portable vector database with in-process ONNX embeddings. Zero-config semantic search via SQLite. No external servers required.
194 lines (175 loc) • 6.83 kB
JavaScript
'use strict';
const path = require('path');
const os = require('os');
/**
 * Known embedding models, keyed by model name.
 *
 * Each entry records:
 *   - `dimension`: length of the vectors the model produces,
 *   - `provider`:  identifier of the embedding backend ('ollama' or 'transformers'),
 *   - `tags`:      optional labels attached to the model.
 *
 * `getModelDimension` and `validateModelDimension` look models up here, so a
 * wrong `dimension` makes valid embeddings fail validation.
 */
const MODEL_REGISTRY = {
  // --- provider: 'ollama' ---
  // nomic-embed-text emits 768-dimensional vectors by default; it was
  // previously registered as 384, which does not match the model's output.
  'nomic-embed-text': { dimension: 768, provider: 'ollama', tags: ['default', 'lightweight'] },
  'embeddinggemma': { dimension: 768, provider: 'ollama', tags: ['default'] },
  'mxbai-embed-large': { dimension: 1024, provider: 'ollama' },
  'all-minilm': { dimension: 384, provider: 'ollama' },
  'snowflake-arctic-embed': { dimension: 1024, provider: 'ollama' },
  'jina-embeddings-v2-base-code': { dimension: 768, provider: 'ollama', tags: ['code'] },

  // --- provider: 'transformers' ---
  'Xenova/all-MiniLM-L6-v2': { dimension: 384, provider: 'transformers', tags: ['lightweight'] },
  'Xenova/bge-small-en-v1.5': { dimension: 384, provider: 'transformers', tags: ['lightweight'] },
  'Xenova/bge-base-en-v1.5': { dimension: 768, provider: 'transformers', tags: ['default', 'code'] },
  'Xenova/bge-large-en-v1.5': { dimension: 1024, provider: 'transformers' },
  'Xenova/multilingual-e5-small': { dimension: 384, provider: 'transformers' },
  'Xenova/multilingual-e5-base': { dimension: 768, provider: 'transformers' },
  'Xenova/multilingual-e5-large': { dimension: 1024, provider: 'transformers' }
};
/**
 * Default settings for the whole package, grouped by subsystem.
 *
 * The object is assembled section by section and evaluated once at module
 * load; in particular `db.dataDir` is resolved against the working directory
 * at that moment.
 */
const CONVENTIONS = (() => {
  // Builds one supportedLanguages entry: the file extensions plus the parser name.
  const language = (parser, ...extensions) => ({ extensions, parser });

  const db = {
    defaultPath: './vecstore.db',
    defaultName: 'vecstore.db',
    dataDir: path.join(process.cwd(), '.vecstore')
  };

  const embedder = {
    defaultModel: 'nomic-embed-text',
    defaultHost: 'http://localhost:11434',
    vllmHost: 'http://localhost:8000',
    embedderType: 'auto'
  };

  const sync = {
    defaultExtensions: null,
    recursive: true,
    watchMode: false,
    ignoreDirs: ['node_modules', '.git', 'dist', 'build'],
    concurrency: 12,
    embedBatchSize: 1,
    embedConcurrency: 5,
    bufferSize: 100,
    flushDelay: 300
  };

  const webCrawler = {
    maxPages: 100,
    maxDepth: 3,
    timeout: 30000,
    retries: 3,
    userAgent: 'Mozilla/5.0 (compatible; vexify/1.0; +https://github.com/yourusername/vexify)'
  };

  const codeCrawler = {
    supportedLanguages: {
      js: language('javascript', '.js', '.mjs', '.cjs'),
      ts: language('typescript', '.ts'),
      py: language('python', '.py'),
      json: language('json', '.json'),
      md: language('markdown', '.md', '.markdown'),
      html: language('html', '.html', '.htm'),
      css: language('css', '.css', '.scss', '.sass', '.less'),
      java: language('java', '.java'),
      go: language('go', '.go'),
      rs: language('rust', '.rs')
    }
  };

  const processing = {
    pdf: { chunkSize: 2000, ocrTimeout: 10000 },
    text: { languageDetection: true, minLength: 50 }
  };

  const ollama = {
    pullTimeout: 600000,
    embedTimeout: 300000,
    healthCheckInterval: 30000
  };

  const mcp = {
    syncInterval: 60000,
    validationInterval: 30000,
    backgroundIndexingDelay: 1000
  };

  const search = {
    defaultTopK: 5,
    minScore: 0.0,
    algorithm: 'cosine'
  };

  const storage = {
    storeContent: true,
    storeVectors: true,
    compression: false
  };

  return {
    db,
    embedder,
    sync,
    crawlers: { web: webCrawler, code: codeCrawler },
    processing,
    ollama,
    mcp,
    search,
    storage
  };
})();
/**
 * Build a flat runtime configuration from the CONVENTIONS defaults plus
 * caller-supplied overrides.
 *
 * Override entries whose value is `undefined` are ignored, so the default is
 * kept for them; any other value (including `null`, `0`, `false`) wins.
 * Override keys that have no default are passed through unchanged.
 *
 * @param {Object} [overrides={}] - Settings that replace or extend the defaults.
 * @returns {Object} The merged configuration object.
 */
function getConfig(overrides = {}) {
  // Keep only the overrides that actually carry a value.
  const supplied = {};
  for (const name in overrides) {
    const value = overrides[name];
    if (value === undefined) {
      continue;
    }
    supplied[name] = value;
  }

  const { db, embedder, sync, search, storage } = CONVENTIONS;

  // [config key, default value] pairs, in the order they appear in the result.
  const defaults = [
    ['dbPath', db.defaultPath],
    ['modelName', embedder.defaultModel],
    ['ollamaHost', embedder.defaultHost],
    ['vllmHost', embedder.vllmHost],
    ['embedderType', embedder.embedderType],
    ['extensions', sync.defaultExtensions],
    ['recursive', sync.recursive],
    ['topK', search.defaultTopK],
    ['storeContent', storage.storeContent],
    ['ignoreDirs', sync.ignoreDirs],
    ['concurrency', sync.concurrency],
    ['embedBatchSize', sync.embedBatchSize],
    ['embedConcurrency', sync.embedConcurrency]
  ];

  const resolved = {};
  for (const [key, fallback] of defaults) {
    const candidate = supplied[key];
    resolved[key] = candidate !== undefined ? candidate : fallback;
  }

  // Spread last so override keys without a default are carried through.
  return { ...resolved, ...supplied };
}
/**
 * Look up the embedding vector dimension registered for a model.
 *
 * @param {?string} [modelName=null] - Model name; any falsy value selects
 *   CONVENTIONS.embedder.defaultModel.
 * @returns {number} The `dimension` recorded in MODEL_REGISTRY for the model.
 * @throws {Error} If the model is not a registered entry of MODEL_REGISTRY.
 */
function getModelDimension(modelName = null) {
  const model = modelName || CONVENTIONS.embedder.defaultModel;
  // Only own entries count: a plain MODEL_REGISTRY[model] lookup would also
  // resolve inherited keys such as 'toString' or 'constructor' and return an
  // undefined dimension instead of reporting the model as unknown.
  const isRegistered = Object.prototype.hasOwnProperty.call(MODEL_REGISTRY, model);
  const spec = isRegistered ? MODEL_REGISTRY[model] : undefined;
  if (!spec) {
    throw new Error(`Unknown model: ${model}. Supported models: ${Object.keys(MODEL_REGISTRY).join(', ')}`);
  }
  return spec.dimension;
}
/**
 * Return the code-crawler language table re-keyed by full language name
 * (e.g. `js` becomes `javascript`, `rs` becomes `rust`).
 *
 * The values are the same entry objects held in
 * CONVENTIONS.crawlers.code.supportedLanguages, not copies.
 *
 * @returns {Object} Map from language name to its language entry.
 */
function getSupportedLanguages() {
  const byShortName = CONVENTIONS.crawlers.code.supportedLanguages;
  return {
    javascript: byShortName.js,
    typescript: byShortName.ts,
    python: byShortName.py,
    json: byShortName.json,
    markdown: byShortName.md,
    html: byShortName.html,
    css: byShortName.css,
    java: byShortName.java,
    go: byShortName.go,
    rust: byShortName.rs
  };
}
/**
 * Choose an embedding model name based on what a directory contains.
 *
 * If any well-known project manifest file exists directly inside `dirPath`,
 * the directory is treated as a code repository and the code-oriented model
 * name is returned; otherwise the configured default model is returned.
 *
 * @param {string} [dirPath=process.cwd()] - Directory to inspect.
 * @returns {string} 'jina-embeddings-v2-base-code' or
 *   CONVENTIONS.embedder.defaultModel.
 */
function detectOptimalModel(dirPath = process.cwd()) {
  // Loaded on demand, only when detection actually runs.
  const { existsSync } = require('fs');
  const { join } = require('path');

  const manifestFiles = [
    'package.json',
    'tsconfig.json',
    'Cargo.toml',
    'requirements.txt',
    'pyproject.toml',
    'pom.xml',
    'build.gradle'
  ];

  // Stop at the first manifest found.
  for (const manifest of manifestFiles) {
    if (existsSync(join(dirPath, manifest))) {
      return 'jina-embeddings-v2-base-code';
    }
  }
  return CONVENTIONS.embedder.defaultModel;
}
/**
 * Check that a vector dimension matches the one registered for a model.
 *
 * @param {string} modelName - Name of the model to look up in MODEL_REGISTRY.
 * @param {number} actualDimension - Dimension to compare with the registered one.
 * @returns {void} Returns nothing when the dimensions match.
 * @throws {Error} If the model is not a registered entry of MODEL_REGISTRY,
 *   or if `actualDimension` differs from the registered dimension.
 */
function validateModelDimension(modelName, actualDimension) {
  // Only own entries count: a plain MODEL_REGISTRY[modelName] lookup would
  // also resolve inherited keys such as 'toString', whose `dimension` is
  // undefined, producing a misleading mismatch error or a silent pass.
  const isRegistered = Object.prototype.hasOwnProperty.call(MODEL_REGISTRY, modelName);
  const spec = isRegistered ? MODEL_REGISTRY[modelName] : undefined;
  if (!spec) {
    throw new Error(`Unknown model: ${modelName}. Supported models: ${Object.keys(MODEL_REGISTRY).join(', ')}`);
  }
  if (actualDimension !== spec.dimension) {
    throw new Error(
      `Vector dimension mismatch for model ${modelName}: expected ${spec.dimension}, got ${actualDimension}. ` +
      `This indicates a corrupted or incompatible vector database. Delete your .db file and resync.`
    );
  }
}
// Public API of this module: the model registry, the default settings, and
// the helper functions that read from them.
module.exports = {
MODEL_REGISTRY,
CONVENTIONS,
getConfig,
getModelDimension,
getSupportedLanguages,
detectOptimalModel,
validateModelDimension
};