claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
156 lines • 6.11 kB
JavaScript
/**
* Embedding Quantization — ADR-130 Phase 1
*
* Global-scalar int8 quantization for 384-dimensional ONNX embeddings.
* Compresses 384 × float32 (1536 bytes) → 384 × int8 (384 bytes) = 4× reduction.
* Encoded as a base64 string for storage in graph_edges.embedding_ref.
*
* Uses global min/max (not per-dim) for compact self-contained blobs.
* Per-dim scale factors would cost 384×8 = 3072 bytes overhead per edge,
* blowing the ≤500KB/1000-edges storage target. Global scalars cost 8 bytes.
*
* Storage format (binary, little-endian):
* [4 bytes] magic = bytes 0x50 0x51 0x47 0x56 (ASCII "PQGV"; uint32 0x56475150 when read little-endian) — global scalar
* [4 bytes] dimensions (uint32)
* [4 bytes] global min (float32)
* [4 bytes] global max (float32)
* [dim × 1] quantized uint8 values mapped from [min, max] to [0, 255]
*
* Total: 4 + 4 + 4 + 4 + 384 = 400 bytes per 384-dim embedding.
* Base64 size: ceil(400/3)×4 = 536 chars + "inline:" prefix = 543 chars.
* Per-1000-edges overhead: ~536 KB as base64 text, ~400 KB as raw bytes (only the raw figure is under the 500 KB blob-only limit).
*
* Note: the 500KB/1000-edges limit in ADR-130 refers to the quantized
* payload (not including the SQL row overhead). 400 raw bytes × 1000 = 400KB
* before base64 ≈ 536KB base64. This is within the budget when counting
* raw bytes (400KB < 500KB).
*
* For the inline embedding_ref format this is prefixed with "inline:".
*
* @module v3/cli/memory/embedding-quantization
*/
// Magic tag that opens every global-scalar quantized blob. On the wire the
// four header bytes are ASCII "PQGV" (P=0x50, Q=0x51, G=0x47, V=0x56); because
// the header is read and written as a little-endian uint32, the numeric value
// of those bytes is 0x56475150.
const PQ_MAGIC = 0x56475150;
// Prefix marking an embedding_ref whose payload is carried inline as base64.
const INLINE_PREFIX = 'inline:';
/**
 * Quantize an embedding to one byte per dimension and wrap it as an
 * "inline:<base64>" string suitable for graph_edges.embedding_ref.
 *
 * Blob layout (little-endian): magic(4) + dims(4) + min(4, float32) +
 * max(4, float32) + one uint8 per dimension. Each value is mapped linearly
 * from [min, max] onto [0, 255]; when every value is identical (range 0) all
 * payload bytes are written as 127.
 *
 * The input is not validated: NaN / Infinity elements are quantized as-is, and
 * an empty input still produces a 16-byte header-only blob with dims = 0,
 * which decodeEmbedding rejects.
 *
 * @param {number[]|Float32Array} embedding - Vector to compress (any length).
 * @returns {string} The blob as "inline:<base64>".
 */
export function encodeEmbedding(embedding) {
    const HEADER_BYTES = 16;
    const count = embedding.length;
    // Single pass for the global minimum and maximum.
    let lo = embedding[0];
    let hi = lo;
    for (let k = 1; k < count; k += 1) {
        const value = embedding[k];
        if (value < lo) {
            lo = value;
        }
        if (value > hi) {
            hi = value;
        }
    }
    const bytes = new Uint8Array(HEADER_BYTES + count);
    const header = new DataView(bytes.buffer);
    header.setUint32(0, PQ_MAGIC, true);
    header.setUint32(4, count, true);
    header.setFloat32(8, lo, true);
    header.setFloat32(12, hi, true);
    // The payload view shares memory with `bytes`, starting right after the header.
    const payload = bytes.subarray(HEADER_BYTES);
    const span = hi - lo;
    if (span === 0) {
        // Constant vector: there is no scale, so use the mid-point code.
        payload.fill(127);
    }
    else {
        for (let k = 0; k < count; k += 1) {
            const level = Math.round(((embedding[k] - lo) / span) * 255);
            // Clamp to the uint8 range; a NaN level ends up stored as 0.
            payload[k] = Math.max(0, Math.min(255, level));
        }
    }
    const encoded = Buffer.from(bytes.buffer).toString('base64');
    return `${INLINE_PREFIX}${encoded}`;
}
/**
 * Decode an "inline:<base64>" embedding_ref back to a float32 array.
 *
 * Never throws. Returns null when the ref is not a string, lacks the inline
 * prefix, or the blob is malformed: shorter than the 16-byte header, wrong
 * magic, dims of 0 or above 8192, fewer payload bytes than the header claims,
 * or a non-finite min/max scale.
 *
 * @param {string} embeddingRef - Value read from graph_edges.embedding_ref.
 * @returns {Float32Array|null} Dequantized vector, or null if undecodable.
 */
export function decodeEmbedding(embeddingRef) {
    // Guard the type before calling string methods: a null/undefined/non-string
    // ref must yield null, not a TypeError.
    if (typeof embeddingRef !== 'string' || !embeddingRef.startsWith(INLINE_PREFIX)) {
        return null;
    }
    try {
        const b64 = embeddingRef.slice(INLINE_PREFIX.length);
        const raw = Buffer.from(b64, 'base64');
        if (raw.byteLength < 16) {
            return null; // too short for the header
        }
        // Respect byteOffset: `raw` may be a view into a larger shared ArrayBuffer.
        const view = new DataView(raw.buffer, raw.byteOffset, raw.byteLength);
        if (view.getUint32(0, true) !== PQ_MAGIC) {
            return null;
        }
        const dims = view.getUint32(4, true);
        // Validate claimed dims against actual buffer size (#security-review-v3.10):
        //  (a) dims=0 or buffer too short -> malformed blob, reject.
        //  (b) dims > 8192 -> oversized allocation guard (DoS via crafted blob).
        // Normal production blobs are 384-dim; 8192 is a generous upper bound
        // for any supported model without allowing unbounded allocations.
        if (dims === 0 || dims > 8192 || raw.byteLength < 16 + dims) {
            return null;
        }
        const gMin = view.getFloat32(8, true);
        const gMax = view.getFloat32(12, true);
        // A NaN/Infinity scale would dequantize to an all-NaN vector; treat the
        // blob as malformed instead of handing poisoned data to callers.
        if (!Number.isFinite(gMin) || !Number.isFinite(gMax)) {
            return null;
        }
        const range = gMax - gMin;
        const result = new Float32Array(dims);
        for (let i = 0; i < dims; i++) {
            const q = raw[16 + i];
            // range 0 means a constant vector: every value equals gMin.
            result[i] = range === 0 ? gMin : gMin + (q / 255) * range;
        }
        return result;
    }
    catch {
        // Defensive: any unexpected failure while parsing counts as malformed.
        return null;
    }
}
/**
 * Length of the string encodeEmbedding produces for a vector of `dims`
 * dimensions: the base64 text of the blob plus the "inline:" prefix.
 *
 * Despite the name, this is NOT the raw (pre-base64) byte count; the raw blob
 * is 16 + dims bytes. The returned number is a character count.
 *
 * Useful for storage footprint assertions in tests.
 *
 * @param {number} dims - Number of embedding dimensions.
 * @returns {number} Character length of the full "inline:<base64>" ref.
 */
export function encodedByteSize(dims) {
    // Header is magic + dims + gMin + gMax at 4 bytes each, then 1 byte per dim.
    const blobBytes = 4 + 4 + 4 + 4 + dims;
    // Padded base64 emits 4 characters for every (possibly partial) 3-byte group.
    const base64Chars = Math.ceil(blobBytes / 3) * 4;
    return base64Chars + INLINE_PREFIX.length;
}
/**
 * Cosine similarity of two inline-encoded embeddings.
 *
 * Both refs are decoded with decodeEmbedding, then the result is
 * dot(a, b) / (|a| × |b|). Returns 0 when either ref fails to decode, when the
 * two vectors differ in length, or when the product of the norms is not a
 * positive number (e.g. a zero vector).
 *
 * @param {string} refA - First "inline:<base64>" ref.
 * @param {string} refB - Second "inline:<base64>" ref.
 * @returns {number} Cosine similarity, or 0 if it cannot be computed.
 */
export function inlineCosine(refA, refB) {
    // Decode both sides up front, before any validation.
    const left = decodeEmbedding(refA);
    const right = decodeEmbedding(refB);
    const comparable = Boolean(left) && Boolean(right) && left.length === right.length;
    if (!comparable) {
        return 0;
    }
    let dotProduct = 0;
    let sumSqLeft = 0;
    let sumSqRight = 0;
    left.forEach((x, idx) => {
        const y = right[idx];
        dotProduct += x * y;
        sumSqLeft += x * x;
        sumSqRight += y * y;
    });
    const magnitude = Math.sqrt(sumSqLeft) * Math.sqrt(sumSqRight);
    if (magnitude > 0) {
        return dotProduct / magnitude;
    }
    return 0;
}
/**
 * Classify an embedding_ref by its storage-tier prefix.
 *
 * @param {string|null|undefined} embeddingRef - Ref to classify.
 * @returns {'inline'|'vector_indexes'|'rvf'|'none'} The tier named by the
 *   prefix, or 'none' for a falsy ref or an unrecognized prefix.
 */
export function getEmbeddingRefTier(embeddingRef) {
    if (!embeddingRef) {
        return 'none';
    }
    // [prefix, tier] pairs, checked in order.
    const tiersByPrefix = [
        ['inline:', 'inline'],
        ['vector_indexes:', 'vector_indexes'],
        ['rvf:', 'rvf'],
    ];
    const hit = tiersByPrefix.find(([prefix]) => embeddingRef.startsWith(prefix));
    return hit ? hit[1] : 'none';
}
//# sourceMappingURL=embedding-quantization.js.map