UNPKG

@boundless-oss/atlas

Version:

Atlas - MCP Server for comprehensive startup project management

141 lines (112 loc) 3.67 kB
import { promises as fs } from 'fs';
import path from 'path';
import crypto from 'crypto';
import type { EmbeddingModel, EmbeddingVector } from './types.js';

/** Options accepted by {@link LocalEmbeddingModel}; every field is optional. */
export interface LocalEmbeddingConfig {
  /** Model identifier, also mixed into cache keys. Defaults to `'all-MiniLM-L6-v2'`. */
  modelName?: string;
  /** Number of components in each embedding. Defaults to `384`. */
  dimension?: number;
  /** Directory holding cached embeddings. Defaults to `'.atlas/rag/embeddings-cache'`. */
  cachePath?: string;
}

/**
 * Embedding model backed by an on-disk JSON cache.
 *
 * Embeddings are currently a deterministic placeholder derived from a SHA-256
 * hash of the text (see `generateEmbedding`); they are stable for a given
 * input but are not produced by a real language model.
 */
export class LocalEmbeddingModel implements EmbeddingModel {
  readonly modelName: string;
  readonly dimension: number;
  private readonly cachePath: string;
  private initialized = false;

  /**
   * @param config Optional overrides for the model name, dimension and cache
   *   directory.
   */
  constructor(config: LocalEmbeddingConfig = {}) {
    // `||` rather than `??` is deliberate: an empty name/path or a dimension
    // of 0 is not a usable setting, so falsy values fall back to the default.
    this.modelName = config.modelName || 'all-MiniLM-L6-v2';
    this.dimension = config.dimension || 384;
    this.cachePath = config.cachePath || '.atlas/rag/embeddings-cache';
  }

  /**
   * Ensures the cache directory exists. Only the first successful call does
   * any work; later calls return immediately.
   *
   * @throws If the directory is missing and cannot be created.
   */
  async initialize(): Promise<void> {
    if (this.initialized) return;

    // Ensure cache directory exists
    try {
      await fs.access(this.cachePath);
    } catch {
      await fs.mkdir(this.cachePath, { recursive: true });
    }

    this.initialized = true;
  }

  /**
   * Embeds each text in order.
   *
   * @param texts Texts to embed.
   * @returns One embedding per input, in input order; `[]` for empty or
   *   missing input.
   */
  async embed(texts: string[]): Promise<EmbeddingVector[]> {
    if (!texts || texts.length === 0) {
      return [];
    }

    // Processed one at a time so a repeated text is served from the cache
    // entry written by its earlier occurrence.
    const embeddings: EmbeddingVector[] = [];
    for (const text of texts) {
      embeddings.push(await this.embedSingle(text));
    }
    return embeddings;
  }

  /**
   * Returns the embedding for `text`, reading it from the cache when a valid
   * entry exists and generating (then caching) it otherwise.
   *
   * @param text Text to embed.
   * @returns A vector with `dimension` components.
   */
  async embedSingle(text: string): Promise<EmbeddingVector> {
    await this.initialize();

    const cacheFile = path.join(this.cachePath, `${this.getCacheKey(text)}.json`);

    const cached = await this.readCachedEmbedding(cacheFile);
    if (cached !== undefined) {
      return cached;
    }

    const embedding = await this.generateEmbedding(text);

    // The cache is best-effort: failing to persist must not fail the call.
    try {
      await fs.writeFile(cacheFile, JSON.stringify(Array.from(embedding)));
    } catch {
      // Ignore cache write errors
    }

    return embedding;
  }

  /**
   * Cosine similarity of two vectors.
   *
   * @returns `dot(a, b) / (|a| * |b|)`, or `0` when either vector has zero
   *   magnitude.
   * @throws Error if the vectors differ in length.
   */
  cosineSimilarity(a: EmbeddingVector, b: EmbeddingVector): number {
    if (a.length !== b.length) {
      throw new Error('Vectors must have the same dimension');
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }

    normA = Math.sqrt(normA);
    normB = Math.sqrt(normB);

    // A zero vector has no direction; report "no similarity" instead of NaN.
    if (normA === 0 || normB === 0) {
      return 0;
    }

    return dotProduct / (normA * normB);
  }

  /**
   * Loads a cached embedding, treating anything unusable as a cache miss.
   *
   * The cache key does not include `dimension`, and files on disk can be
   * truncated or hand-edited, so the payload is validated before use: it must
   * be a JSON array of exactly `dimension` finite numbers.
   *
   * @param cacheFile Path of the cache entry to read.
   * @returns The cached vector, or `undefined` if the file is unreadable,
   *   is not valid JSON, or fails validation.
   */
  private async readCachedEmbedding(cacheFile: string): Promise<EmbeddingVector | undefined> {
    let parsed: unknown;
    try {
      parsed = JSON.parse(await fs.readFile(cacheFile, 'utf-8'));
    } catch {
      // Missing, unreadable or syntactically invalid entry.
      return undefined;
    }

    if (!Array.isArray(parsed) || parsed.length !== this.dimension) {
      return undefined;
    }

    const values: unknown[] = parsed;
    if (!values.every((v): v is number => typeof v === 'number' && Number.isFinite(v))) {
      return undefined;
    }

    return new Float32Array(values);
  }

  /**
   * Builds a deterministic placeholder embedding from the SHA-256 digest of
   * `text`, scaled to unit length.
   *
   * In a real implementation, this would use a local embedding model.
   */
  private async generateEmbedding(text: string): Promise<EmbeddingVector> {
    const embedding = new Float32Array(this.dimension);

    const hash = crypto.createHash('sha256').update(text).digest();

    for (let i = 0; i < this.dimension; i++) {
      // Map each byte (0..255) to [-1, 1]; the digest is reused cyclically
      // when the dimension exceeds its length.
      const byte = hash[i % hash.length];
      embedding[i] = byte / 127.5 - 1;
    }

    // Normalize the embedding to unit length.
    let norm = 0;
    for (let i = 0; i < embedding.length; i++) {
      norm += embedding[i] * embedding[i];
    }
    norm = Math.sqrt(norm);

    if (norm > 0) {
      for (let i = 0; i < embedding.length; i++) {
        embedding[i] /= norm;
      }
    }

    return embedding;
  }

  /** Cache file stem: hex SHA-256 of `"<modelName>:<text>"`. */
  private getCacheKey(text: string): string {
    return crypto
      .createHash('sha256')
      .update(`${this.modelName}:${text}`)
      .digest('hex');
  }
}