@boundless-oss/atlas
Version:
Atlas - MCP Server for comprehensive startup project management
220 lines (187 loc) • 5.98 kB
text/typescript
import crypto from 'crypto';
/**
 * One stored document: the text that was embedded, the vector derived from
 * it, and whatever metadata the caller attached when adding it.
 */
export interface Embedding {
// Text the vector was generated from.
text: string;
// Embedding vector for `text`.
vector: number[];
// Optional caller-supplied data, carried through to search results unchanged.
metadata?: Record<string, any>;
}
/**
 * A single hit returned by a similarity search.
 */
export interface SearchResult {
// Text of the matching document.
text: string;
// Similarity between the query and the document; higher means more similar.
score: number;
// Metadata stored alongside the matching document, if any.
metadata?: Record<string, any>;
}
/**
 * In-memory document store with hash-derived embeddings and cosine-similarity
 * search. Works fully offline: no model or API call is involved.
 *
 * NOTE(review): the embedding is built from SHA-256 digests of the whole
 * normalized text, so it is NOT semantic. Two texts that differ at all after
 * normalization get effectively unrelated vectors; in practice only a query
 * identical to a stored text (ignoring case and surrounding whitespace) scores
 * near 1, everything else scores near 0.
 */
export class LocalEmbeddings {
  /** Stored documents keyed by the caller-supplied id. */
  private embeddings: Map<string, Embedding> = new Map();

  /**
   * Generate a deterministic, unit-length embedding vector from text.
   *
   * The text is lower-cased and trimmed first, so case and surrounding
   * whitespace do not influence the result.
   *
   * @param text - Text to embed.
   * @returns A 128-dimensional vector with Euclidean norm 1.
   */
  generateEmbedding(text: string): number[] {
    const normalized = text.toLowerCase().trim();
    const dimensions = 128;
    const vector: number[] = [];

    // One digest per dimension; the dimension index is mixed into the input so
    // every component is derived from a different hash.
    for (let i = 0; i < dimensions; i++) {
      const hash = crypto
        .createHash('sha256')
        .update(`${normalized}-${i}`)
        .digest();
      // First two digest bytes as a little-endian uint16 (0..65535), mapped
      // linearly onto [-1, 1].
      const value = (hash.readUInt16LE(0) / 65535) * 2 - 1;
      vector.push(value);
    }

    // A component is 0 only if the uint16 equals 32767.5, which cannot happen,
    // so the magnitude is always positive and the division is safe.
    const magnitude = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
    return vector.map(val => val / magnitude);
  }

  /**
   * Embed `text` and store it under `id`, replacing any existing entry with
   * the same id.
   *
   * @param id - Key for the document.
   * @param text - Text to embed and store.
   * @param metadata - Optional data stored by reference with the document.
   */
  addDocument(id: string, text: string, metadata?: Record<string, any>): void {
    const vector = this.generateEmbedding(text);
    this.embeddings.set(id, {
      text,
      vector,
      metadata,
    });
  }

  /**
   * Compute the cosine similarity between two vectors.
   *
   * Both magnitudes are divided out, so the result stays in [-1, 1] even for
   * vectors that are not unit length (e.g. ones loaded through `import`).
   *
   * @returns The cosine of the angle between `a` and `b`, or 0 when the
   *          dimensions differ or either vector has zero magnitude.
   */
  private cosineSimilarity(a: number[], b: number[]): number {
    if (a.length !== b.length) {
      return 0;
    }
    let dotProduct = 0;
    let squaredA = 0;
    let squaredB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      squaredA += a[i] * a[i];
      squaredB += b[i] * b[i];
    }
    const denominator = Math.sqrt(squaredA) * Math.sqrt(squaredB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }

  /**
   * Search for stored documents similar to `query`.
   *
   * @param query - Text to compare against every stored document.
   * @param topK - Maximum number of results; values that are not greater than
   *               zero yield an empty result.
   * @param threshold - Minimum score a document needs to be returned.
   * @returns Matching documents sorted by descending score, at most `topK`.
   */
  search(query: string, topK: number = 5, threshold: number = 0.5): SearchResult[] {
    // Array.prototype.slice treats a negative end as "count from the back",
    // which would return the wrong documents instead of none.
    if (!(topK > 0)) {
      return [];
    }

    const queryVector = this.generateEmbedding(query);
    const results: SearchResult[] = [];
    for (const embedding of this.embeddings.values()) {
      // Vectors of another dimensionality (possible after `import`) are not
      // comparable with the query; skip them instead of scoring a prefix.
      if (embedding.vector.length !== queryVector.length) {
        continue;
      }
      const score = this.cosineSimilarity(queryVector, embedding.vector);
      // A NaN score (corrupt vector) fails this comparison and is dropped.
      if (score >= threshold) {
        results.push({
          text: embedding.text,
          score,
          metadata: embedding.metadata,
        });
      }
    }

    // `results` is local, so sorting it in place is safe.
    return results
      .sort((a, b) => b.score - a.score)
      .slice(0, topK);
  }

  /**
   * Remove every stored document.
   */
  clear(): void {
    this.embeddings.clear();
  }

  /**
   * @returns The number of stored documents.
   */
  size(): number {
    return this.embeddings.size;
  }

  /**
   * Export the store as `[id, embedding]` pairs for persistence.
   *
   * The returned array is new, but the `Embedding` objects in it are the ones
   * held by the store; mutating them mutates the store.
   */
  export(): Array<[string, Embedding]> {
    return Array.from(this.embeddings.entries());
  }

  /**
   * Replace the whole store with previously exported `[id, embedding]` pairs.
   *
   * Existing documents are discarded. The entries are not validated and are
   * held by reference.
   */
  import(data: Array<[string, Embedding]>): void {
    this.embeddings = new Map(data);
  }
}
/**
 * Embeddings store specialised for source files: each file is indexed under
 * its path, with extracted identifiers prepended to the embedded text and the
 * raw file content kept in metadata.
 */
export class CodeEmbeddings extends LocalEmbeddings {
  /**
   * Index a source file under its path.
   *
   * The embedded text is `filePath language features... content`; the
   * unmodified `content` is kept in metadata as `originalContent` so search
   * results can return it.
   *
   * @param filePath - Path of the file; also used as the document id.
   * @param content - Full source text of the file.
   * @param language - Language label stored in metadata and the embedded text.
   */
  addCodeFile(filePath: string, content: string, language: string): void {
    const features = this.extractCodeFeatures(content, language);
    const enrichedText = `${filePath} ${language} ${features.join(' ')} ${content}`;
    this.addDocument(filePath, enrichedText, {
      filePath,
      language,
      features,
      originalContent: content,
    });
  }

  /**
   * Collect capture group 1 of every match of `pattern` in `content`.
   *
   * @param pattern - Global (`g`) regular expression with one capture group.
   */
  private collectCaptures(pattern: RegExp, content: string): string[] {
    const captures: string[] = [];
    let match: RegExpExecArray | null;
    while ((match = pattern.exec(content)) !== null) {
      if (match[1] !== undefined) {
        captures.push(match[1]);
      }
    }
    return captures;
  }

  /**
   * Extract identifier-like features from source text using keyword regexes.
   *
   * The patterns are language-agnostic heuristics; `language` is accepted but
   * currently not used to select them.
   *
   * @returns Tags of the form `function:NAME`, `class:NAME`, `import:PATH`
   *          and `var:NAME`, in that order.
   */
  private extractCodeFeatures(content: string, language: string): string[] {
    const features: string[] = [];

    // `\b` keeps a keyword from matching as the tail of a longer word
    // (e.g. `#undef FOO` must not yield `function:FOO`).
    const functionRegex = /\b(?:function|def|fn|func)\s+(\w+)/g;
    for (const name of this.collectCaptures(functionRegex, content)) {
      features.push(`function:${name}`);
    }

    const classRegex = /\b(?:class|struct|interface)\s+(\w+)/g;
    for (const name of this.collectCaptures(classRegex, content)) {
      features.push(`class:${name}`);
    }

    // Covers `import 'x'`, `import y from 'x'`, `require('x')`, `include "x"`
    // and scoped packages such as `@scope/pkg`.
    const importRegex = /\b(?:import|require|use|include|from)\s*\(?\s*['"]([\w\-./@]+)['"]/g;
    for (const path of this.collectCaptures(importRegex, content)) {
      features.push(`import:${path}`);
    }

    // Variable names are only added for files with few declarations, to keep
    // the feature list from being dominated by them.
    const varRegex = /\b(?:const|let|var|val)\s+(\w+)\s*=/g;
    const varNames = this.collectCaptures(varRegex, content);
    if (varNames.length < 20) {
      for (const name of varNames) {
        features.push(`var:${name}`);
      }
    }

    return features;
  }

  /**
   * Search indexed code with a synonym-expanded query and a lower threshold
   * (0.3) than the inherited default.
   *
   * @param query - Free-text query.
   * @param topK - Maximum number of results.
   * @returns Hits whose `text` is the file's original content when available,
   *          otherwise the stored text.
   */
  searchCode(query: string, topK: number = 5): SearchResult[] {
    const enhancedQuery = this.enhanceCodeQuery(query);
    const results = this.search(enhancedQuery, topK, 0.3);

    return results.map(result => ({
      ...result,
      // `??` rather than `||`: an empty file has originalContent === '' and
      // must not fall back to the enriched text.
      text: result.metadata?.originalContent ?? result.text,
    }));
  }

  /**
   * Append common programming synonyms to the query for every known term it
   * contains (case-insensitive substring match).
   *
   * NOTE(review): the inherited embedding hashes the whole normalized text, so
   * appending words changes the query vector entirely rather than nudging it.
   * Verify that this expansion actually improves recall.
   */
  private enhanceCodeQuery(query: string): string {
    const enhancements: string[] = [query];
    const synonyms: Record<string, string[]> = {
      'function': ['method', 'func', 'fn', 'def'],
      'class': ['struct', 'type', 'interface'],
      'variable': ['var', 'const', 'let', 'val'],
      'import': ['require', 'include', 'use'],
    };

    const lowered = query.toLowerCase();
    for (const [term, syns] of Object.entries(synonyms)) {
      if (lowered.includes(term)) {
        enhancements.push(...syns);
      }
    }
    return enhancements.join(' ');
  }
}