@boundless-oss/atlas
Version:
Atlas - MCP Server for comprehensive startup project management
141 lines (112 loc) • 3.67 kB
text/typescript
import { promises as fs } from 'fs';
import path from 'path';
import crypto from 'crypto';
import type { EmbeddingModel, EmbeddingVector } from './types.js';
/**
 * Optional settings accepted by the `LocalEmbeddingModel` constructor.
 *
 * Every field may be omitted. The constructor substitutes its default for
 * any value that is missing or falsy (empty string, `0`).
 */
export interface LocalEmbeddingConfig {
  /**
   * Directory in which computed embeddings are cached as JSON files.
   * Defaults to `'.atlas/rag/embeddings-cache'`.
   */
  cachePath?: string;

  /** Length of each produced embedding vector. Defaults to `384`. */
  dimension?: number;

  /**
   * Model label; it is mixed into the cache key for each text.
   * Defaults to `'all-MiniLM-L6-v2'`.
   */
  modelName?: string;
}
/**
 * Embedding model that runs locally and caches every result on disk.
 *
 * Embeddings are currently deterministic placeholders derived from a
 * SHA-256 hash of the text (see `generateEmbedding`); no real model is
 * loaded. Each embedding is stored as a JSON array in
 * `<cachePath>/<sha256(modelName:text)>.json` and reused on later calls.
 */
export class LocalEmbeddingModel implements EmbeddingModel {
  readonly modelName: string;
  readonly dimension: number;
  private readonly cachePath: string;
  private initialized = false;

  /**
   * @param config Optional overrides; missing or falsy fields use defaults.
   * @throws RangeError if the resulting dimension is not a positive integer.
   */
  constructor(config: LocalEmbeddingConfig = {}) {
    // `||` rather than `??`: an empty string or 0 is not a usable value for
    // any of these settings, so they fall back to the defaults as well.
    this.modelName = config.modelName || 'all-MiniLM-L6-v2';
    this.dimension = config.dimension || 384;
    this.cachePath = config.cachePath || '.atlas/rag/embeddings-cache';

    // Fail fast: a negative dimension would otherwise only surface as a
    // RangeError on the first embed call, and a fractional one would
    // silently yield vectors of a truncated length.
    if (!Number.isInteger(this.dimension) || this.dimension <= 0) {
      throw new RangeError(
        `dimension must be a positive integer, got ${this.dimension}`
      );
    }
  }

  /**
   * Ensures the cache directory exists. Safe to call repeatedly; after the
   * first successful call it returns immediately.
   *
   * @throws If the cache directory is missing and cannot be created.
   */
  async initialize(): Promise<void> {
    if (this.initialized) return;
    // Ensure cache directory exists
    try {
      await fs.access(this.cachePath);
    } catch {
      await fs.mkdir(this.cachePath, { recursive: true });
    }
    this.initialized = true;
  }

  /**
   * Embeds a batch of texts.
   *
   * @param texts Texts to embed; a missing or empty list yields `[]`.
   * @returns One vector per input text, in input order.
   */
  async embed(texts: string[]): Promise<EmbeddingVector[]> {
    if (!texts || texts.length === 0) {
      return [];
    }
    const embeddings: EmbeddingVector[] = [];
    // Processed one at a time so duplicate texts in a batch never race on
    // the same cache file and large batches do not open many files at once.
    for (const text of texts) {
      embeddings.push(await this.embedSingle(text));
    }
    return embeddings;
  }

  /**
   * Embeds one text, serving it from the on-disk cache when a valid entry
   * exists and writing the freshly generated vector to the cache otherwise.
   *
   * @param text Text to embed.
   * @returns A vector with `dimension` components.
   */
  async embedSingle(text: string): Promise<EmbeddingVector> {
    await this.initialize();

    const cacheFile = path.join(
      this.cachePath,
      `${this.getCacheKey(text)}.json`
    );

    const cached = await this.readCachedEmbedding(cacheFile);
    if (cached !== undefined) {
      return cached;
    }

    const embedding = await this.generateEmbedding(text);
    try {
      await fs.writeFile(cacheFile, JSON.stringify(Array.from(embedding)));
    } catch {
      // The cache is best-effort: a failed write must not fail the embed.
    }
    return embedding;
  }

  /**
   * Computes the cosine similarity of two vectors.
   *
   * @returns A value in [-1, 1], or `0` if either vector has zero length.
   * @throws Error if the vectors differ in dimension.
   */
  cosineSimilarity(a: EmbeddingVector, b: EmbeddingVector): number {
    if (a.length !== b.length) {
      throw new Error('Vectors must have the same dimension');
    }
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    normA = Math.sqrt(normA);
    normB = Math.sqrt(normB);
    // A zero vector has no direction; report "no similarity" instead of NaN.
    if (normA === 0 || normB === 0) {
      return 0;
    }
    return dotProduct / (normA * normB);
  }

  /**
   * Reads and validates a cached embedding.
   *
   * @returns The cached vector, or `undefined` when the file is missing,
   *   unreadable, not valid JSON, or does not hold exactly `dimension`
   *   finite numbers (for example after the configured dimension changed,
   *   since the cache key does not include it).
   */
  private async readCachedEmbedding(
    cacheFile: string
  ): Promise<EmbeddingVector | undefined> {
    let parsed: unknown;
    try {
      parsed = JSON.parse(await fs.readFile(cacheFile, 'utf-8'));
    } catch {
      // Missing or unparsable entry: treat as a cache miss.
      return undefined;
    }

    // Without this check, `new Float32Array(parsed)` would accept `null`,
    // objects, bare numbers or wrong-length arrays and return a vector of
    // the wrong size to the caller.
    const isValid =
      Array.isArray(parsed) &&
      parsed.length === this.dimension &&
      parsed.every(
        (value) => typeof value === 'number' && Number.isFinite(value)
      );
    if (!isValid) {
      return undefined;
    }
    return new Float32Array(parsed as number[]);
  }

  /**
   * Produces a deterministic placeholder embedding for `text`.
   *
   * This is a mock: the SHA-256 digest bytes of the text are mapped to
   * values in [-1, 1] (repeating the digest when `dimension` exceeds its
   * length) and the vector is then scaled to unit length. A real
   * implementation would run a local embedding model here.
   */
  private async generateEmbedding(text: string): Promise<EmbeddingVector> {
    const embedding = new Float32Array(this.dimension);
    const hash = crypto.createHash('sha256').update(text).digest();

    for (let i = 0; i < this.dimension; i++) {
      // Map a byte (0..255) onto the range -1..1.
      const byte = hash[i % hash.length];
      embedding[i] = byte / 127.5 - 1;
    }

    // Normalize to unit length; skipped for an all-zero vector.
    let norm = 0;
    for (let i = 0; i < embedding.length; i++) {
      norm += embedding[i] * embedding[i];
    }
    norm = Math.sqrt(norm);
    if (norm > 0) {
      for (let i = 0; i < embedding.length; i++) {
        embedding[i] /= norm;
      }
    }
    return embedding;
  }

  /** Returns the hex SHA-256 of `modelName:text`, used as the cache file stem. */
  private getCacheKey(text: string): string {
    return crypto
      .createHash('sha256')
      .update(`${this.modelName}:${text}`)
      .digest('hex');
  }
}