@ooples/token-optimizer-mcp
Intelligent context window optimization for Claude Code - store content externally via caching and compression, freeing up your context window for what matters
140 lines • 5.72 kB
JavaScript
import { createHash } from 'node:crypto';
/**
 * A simple embedding generator using hashing and statistical features.
 * This is an MVP implementation that doesn't require external API calls.
 * It can be extended later to use OpenAI, HuggingFace, or other embedding APIs.
 */
export class FoundationModelEmbeddingGenerator {
  dimensions;
  /**
   * @param dimensions - The dimensionality of the embedding vectors (default: 128)
   */
  constructor(dimensions = 128) {
    this.dimensions = dimensions;
  }
  /**
   * Generate an embedding vector for the given text.
   * Uses a hybrid approach combining:
   * 1. Hashing-based features for content similarity
   * 2. Statistical features (length, character distribution)
   * 3. N-gram features for semantic similarity
   */
  async generateEmbedding(text) {
    const normalized = this.normalizeText(text);
    const embedding = new Array(this.dimensions).fill(0);
    // Part 1: Hash-based features (first 1/6 of dimensions)
    // Use multiple hash functions to create diverse features
    // Reduced from 1/3 to 1/6 to lower sensitivity to exact text changes
    const hashSection = Math.floor(this.dimensions / 6);
    for (let i = 0; i < hashSection; i++) {
      const hash = createHash('sha256')
        .update(normalized + i.toString())
        .digest();
      // Convert hash bytes to normalized values in [-1, 1]
      embedding[i] = hash[i % hash.length] / 127.5 - 1;
    }
    // Part 2: Statistical features (next 1/3 of dimensions)
    // Increased weight for semantic features
    const statsSection = Math.floor(this.dimensions / 3);
    const statsStart = hashSection;
    const stats = this.computeStatistics(normalized);
    for (let i = 0; i < statsSection; i++) {
      embedding[statsStart + i] = stats[i % stats.length];
    }
    // Part 3: N-gram features (remaining ~1/2 of dimensions)
    // Increased weight for n-gram semantic matching
    const ngramSection = this.dimensions - hashSection - statsSection;
    const ngramStart = hashSection + statsSection;
    const ngrams = this.computeNgramFeatures(normalized, ngramSection);
    for (let i = 0; i < ngramSection; i++) {
      embedding[ngramStart + i] = ngrams[i];
    }
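    // At this point all three sections are filled. For the default 128 dimensions the
    // split works out as (illustrative): hashSection = floor(128 / 6) = 21,
    // statsSection = floor(128 / 3) = 42, ngramSection = 128 - 21 - 42 = 65 (~1/2).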
    // Normalize the embedding vector to unit length
    return this.normalizeVector(embedding);
  }
  getDimensions() {
    return this.dimensions;
  }
  /**
   * Normalize text for consistent embedding generation
   */
  normalizeText(text) {
    return text
      .toLowerCase()
      .replace(/\s+/g, ' ') // Normalize whitespace
      .replace(/[^\w\s]/g, '') // Remove punctuation
      .trim();
  }
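  // Illustrative example of the normalization above:
  //   normalizeText('  Hello,   World!  ') === 'hello world'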
  /**
   * Compute statistical features from text
   */
  computeStatistics(text) {
    const stats = [];
    // Length-based features
    stats.push(Math.tanh(text.length / 1000)); // Normalized length
    // Character frequency features
    const charFreq = new Map();
    for (const char of text) {
      charFreq.set(char, (charFreq.get(char) || 0) + 1);
    }
    // Entropy (measure of character diversity)
    let entropy = 0;
    for (const freq of charFreq.values()) {
      const p = freq / text.length;
      entropy -= p * Math.log2(p);
    }
    stats.push(Math.tanh(entropy / 5)); // Normalized entropy
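    // Worked example (illustrative): for the string 'abcdabcd' each of the four distinct
    // characters has probability 0.25, so entropy = -4 * 0.25 * log2(0.25) = 2 bits and
    // the pushed feature is tanh(2 / 5) ≈ 0.38.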
    // Vowel ratio
    const vowels = (text.match(/[aeiou]/g) || []).length;
    stats.push(vowels / Math.max(text.length, 1));
    // Digit ratio
    const digits = (text.match(/\d/g) || []).length;
    stats.push(digits / Math.max(text.length, 1));
    // Average word length
    const words = text.split(' ').filter((w) => w.length > 0);
    const avgWordLen = words.reduce((sum, w) => sum + w.length, 0) / Math.max(words.length, 1);
    stats.push(Math.tanh(avgWordLen / 10));
    // Repeat pattern to fill space if needed
    while (stats.length < 32) {
      stats.push(...stats.slice(0, Math.min(stats.length, 32 - stats.length)));
    }
    return stats.slice(0, 32);
  }
  /**
   * Compute n-gram features for semantic similarity
   */
  computeNgramFeatures(text, numFeatures) {
    const features = new Array(numFeatures).fill(0);
    // Compute 2-gram and 3-gram frequencies
    const ngrams = new Map();
    // 2-grams (bigrams)
    for (let i = 0; i < text.length - 1; i++) {
      const bigram = text.substring(i, i + 2);
      ngrams.set(bigram, (ngrams.get(bigram) || 0) + 1);
    }
    // 3-grams (trigrams)
    for (let i = 0; i < text.length - 2; i++) {
      const trigram = text.substring(i, i + 3);
      ngrams.set(trigram, (ngrams.get(trigram) || 0) + 1);
    }
    // Hash n-grams to feature indices
    for (const [ngram, count] of ngrams.entries()) {
      const hash = createHash('md5').update(ngram).digest();
      const idx = hash[0] % numFeatures;
      features[idx] += count / text.length; // Normalized count
    }
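    // Note: only the first hash byte is used for bucketing, so at most 256 distinct
    // feature indices can ever be addressed. That is plenty for the default dimensions,
    // but very large numFeatures values would leave some buckets unused, and distinct
    // n-grams may collide into the same bucket, which is acceptable for this MVP approach.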
    // Squash values into the [-1, 1] range with tanh
    return features.map((v) => Math.tanh(v * 10));
  }
  /**
   * Normalize a vector to unit length (L2 normalization)
   */
  normalizeVector(vector) {
    const magnitude = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
    if (magnitude === 0) {
      return vector; // Avoid division by zero
    }
    return vector.map((val) => val / magnitude);
  }
}
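// Usage sketch (illustrative only, not part of the module). Because generateEmbedding
// returns unit-length vectors, cosine similarity reduces to a plain dot product:
//
//   const generator = new FoundationModelEmbeddingGenerator(128);
//   const a = await generator.generateEmbedding('cache the file contents');
//   const b = await generator.generateEmbedding('store the file contents in the cache');
//   const similarity = a.reduce((sum, v, i) => sum + v * b[i], 0); // in [-1, 1]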
//# sourceMappingURL=FoundationModelEmbeddingGenerator.js.map