aiwg
Version:
Deployment tool and support utility for AI context. Copies agents, skills, commands, rules, and behaviors into the paths each AI platform reads (Claude Code, Codex, Copilot, Cursor, Warp, OpenClaw, and 6 more) so one source of truth works across 10 platforms.
189 lines • 7.28 kB
JavaScript
/**
* Semantic Embedding Index
*
* Optional ANN (approximate nearest neighbor) layer on top of the artifact
* index. Embeds node summaries/titles into dense vectors using a small local
* model and stores them in an HNSW index for fast similarity queries.
*
* Install: npm install @xenova/transformers hnswlib-node
*
* @implements #730
* @source @src/artifacts/types.ts
* @tests @test/unit/artifacts/embedding-index.test.ts
*/
import fs from 'fs';
import path from 'path';
/**
 * Default embedding model (all-MiniLM-L6-v2: ~22MB, 384 dims, ~5ms/embedding on CPU)
 *
 * Used as the default value of the `model` parameter of buildEmbeddingIndex.
 */
export const DEFAULT_EMBEDDING_MODEL = 'Xenova/all-MiniLM-L6-v2';
/**
 * Vector width associated with the default model (see the 384 dims noted above).
 *
 * NOTE(review): not referenced in the visible part of this file —
 * buildEmbeddingIndex measures the dimension from a probe embedding instead.
 */
export const DEFAULT_EMBEDDING_DIMS = 384;
/**
 * Report whether the optional embedding packages can be imported.
 *
 * Each package is imported in turn (transformers first, then hnswlib); any
 * import failure, whatever its cause, is recorded as "missing".
 *
 * @returns `{ available, missing }` where `missing` lists the package names
 *          that failed to import and `available` is true when that list is empty
 */
export async function checkEmbeddingDeps() {
  // Indirect dynamic import — presumably so build tooling does not try to
  // resolve the optional packages statically (TODO confirm).
  const importByName = new Function('m', 'return import(m)');
  const optionalPackages = ['@xenova/transformers', 'hnswlib-node'];
  const missing = [];
  for (const packageName of optionalPackages) {
    try {
      await importByName(packageName);
    } catch {
      missing.push(packageName);
    }
  }
  const available = missing.length === 0;
  return { available, missing };
}
/**
 * Build an embedding index from artifact metadata entries.
 *
 * Embeds each entry's `title` + `summary` into a dense vector and stores the
 * vectors in an HNSW index created with the 'cosine' space. Two files are
 * written under `<outputDir>/embeddings/`:
 *   - `vectors.hnsw`  — the serialized index; each vector's label is the
 *                       entry's position in `manifest.nodeIds`
 *   - `manifest.json` — model, dims, nodeIds, build timestamp and checksums
 *
 * When `entries` is empty, nothing is imported, loaded or written and 0 is
 * returned.
 *
 * @param entries - Map of node ID → MetadataEntry
 * @param outputDir - Directory to write embeddings/ subfolder
 * @param model - Transformer model identifier
 * @returns Number of entries embedded
 * @throws If an optional dependency cannot be imported, or hnswlib-node does
 *         not expose `HierarchicalNSW`
 */
export async function buildEmbeddingIndex(entries, outputDir, model = DEFAULT_EMBEDDING_MODEL) {
  const ids = Object.keys(entries);
  // Nothing to embed: return before paying for the optional imports and the
  // model load, neither of which is needed to answer "0".
  if (ids.length === 0) {
    return 0;
  }

  const dynamicImport = new Function('m', 'return import(m)');
  const { pipeline } = await dynamicImport('@xenova/transformers');
  const hnswlib = await dynamicImport('hnswlib-node');
  // The class may sit on the namespace or on the default export, depending on
  // how the module was loaded.
  const HierarchicalNSW = hnswlib.HierarchicalNSW ?? hnswlib.default?.HierarchicalNSW;
  if (!HierarchicalNSW) {
    throw new Error('hnswlib-node: HierarchicalNSW not found in module exports');
  }

  const embed = await pipeline('feature-extraction', model);

  // Determine dimensions from a test embedding
  const probe = await embed('test', { pooling: 'mean', normalize: true });
  const dims = probe.data.length;

  const index = new HierarchicalNSW('cosine', dims);
  index.initIndex(ids.length);

  const checksums = {};
  for (const [position, id] of ids.entries()) {
    const entry = entries[id];
    const text = `${entry.title} ${entry.summary}`.trim();
    const result = await embed(text, { pooling: 'mean', normalize: true });
    // The numeric label is the position in `ids`; queries map it back through
    // manifest.nodeIds.
    index.addPoint(Array.from(result.data), position);
    checksums[id] = entry.checksum;
  }

  // Write index and manifest
  const embeddingsDir = path.join(outputDir, 'embeddings');
  fs.mkdirSync(embeddingsDir, { recursive: true });
  // writeIndex is Promise-returning in current hnswlib-node releases; await it
  // so the vector file is on disk before the manifest is written and before
  // this function resolves. Awaiting is harmless if the call is synchronous.
  await index.writeIndex(path.join(embeddingsDir, 'vectors.hnsw'));

  const manifest = {
    model,
    dims,
    nodeIds: ids,
    builtAt: new Date().toISOString(),
    checksums,
  };
  fs.writeFileSync(path.join(embeddingsDir, 'manifest.json'), JSON.stringify(manifest, null, 2), 'utf-8');
  return ids.length;
}
/**
 * Read `<indexDir>/embeddings/manifest.json` and return its parsed contents.
 *
 * @param indexDir - Directory containing the embeddings/ subfolder
 * @returns The parsed manifest, or `null` when the file does not exist or
 *          cannot be read or parsed as JSON
 */
export function loadEmbeddingManifest(indexDir) {
  const manifestFile = path.join(indexDir, 'embeddings', 'manifest.json');
  let manifest = null;
  if (fs.existsSync(manifestFile)) {
    try {
      const rawJson = fs.readFileSync(manifestFile, 'utf-8');
      manifest = JSON.parse(rawJson);
    } catch {
      // Unreadable or malformed manifest is reported the same as a missing one.
      manifest = null;
    }
  }
  return manifest;
}
/**
 * Query the embedding index for semantically similar artifacts.
 *
 * Embeds `query` with the model recorded in the manifest, loads
 * `<indexDir>/embeddings/vectors.hnsw`, and runs a k-nearest-neighbor search.
 * Returned labels are mapped back to node IDs through `manifest.nodeIds`.
 *
 * @param query - Natural language query string
 * @param indexDir - Directory containing the embeddings/ subfolder
 * @param topK - Number of results to return (capped at the number of indexed nodes)
 * @returns Ranked list of `{ nodeId, score }`; empty when the index holds no
 *          nodes or `topK` is not positive
 * @throws If no manifest can be loaded from `indexDir`, an optional dependency
 *         cannot be imported, or hnswlib-node does not expose `HierarchicalNSW`
 */
export async function semanticQuery(query, indexDir, topK = 10) {
  const manifest = loadEmbeddingManifest(indexDir);
  if (!manifest) {
    throw new Error(`No embedding index found at ${indexDir}/embeddings/`);
  }

  // Never request more neighbors than the index holds.
  const effectiveK = Math.min(topK, manifest.nodeIds.length);
  // An empty index or a non-positive (or NaN) topK cannot produce results, so
  // skip the dependency imports and model load instead of searching with k = 0.
  if (!(effectiveK > 0)) {
    return [];
  }

  const dynamicImport = new Function('m', 'return import(m)');
  const { pipeline } = await dynamicImport('@xenova/transformers');
  const hnswlib = await dynamicImport('hnswlib-node');
  const HierarchicalNSW = hnswlib.HierarchicalNSW ?? hnswlib.default?.HierarchicalNSW;
  if (!HierarchicalNSW) {
    throw new Error('hnswlib-node: HierarchicalNSW not found in module exports');
  }

  const embed = await pipeline('feature-extraction', manifest.model);
  const result = await embed(query, { pooling: 'mean', normalize: true });

  const index = new HierarchicalNSW('cosine', manifest.dims);
  // readIndex is Promise-returning in current hnswlib-node releases; the index
  // must be fully loaded before searching. Awaiting is harmless if the call is
  // synchronous.
  await index.readIndex(path.join(indexDir, 'embeddings', 'vectors.hnsw'));

  // ef controls recall quality — higher = better but slower.
  // hnswlib-node names the setter `setEf`; `setEfSearch` is kept as a fallback
  // for bindings that use that name.
  const ef = Math.max(topK * 2, 50);
  if (typeof index.setEf === 'function') {
    index.setEf(ef);
  } else if (typeof index.setEfSearch === 'function') {
    index.setEfSearch(ef);
  }

  const { neighbors, distances } = index.searchKnn(Array.from(result.data), effectiveK);
  return neighbors.map((pos, i) => ({
    nodeId: manifest.nodeIds[pos],
    // HNSW cosine distance is 1 - cosine_similarity
    score: 1 - (distances[i] ?? 0),
  }));
}
/**
 * Find the artifacts most similar to an already-known node.
 *
 * The node's own `title` + `summary` text is used as the query. One extra
 * result is requested because the node itself is expected to come back as a
 * match; it is then removed from the list.
 *
 * @param nodeId - Node to find neighbors for
 * @param entries - Metadata entries to get the node's text
 * @param indexDir - Directory containing the embeddings/ subfolder
 * @param topK - Number of results
 * @returns At most `topK` results from semanticQuery, excluding `nodeId`
 * @throws If `nodeId` has no entry in `entries`
 */
export async function semanticNeighbors(nodeId, entries, indexDir, topK = 10) {
  const self = entries[nodeId];
  if (!self) {
    throw new Error(`Node '${nodeId}' not found in metadata`);
  }
  const selfText = `${self.title} ${self.summary}`.trim();
  const candidates = await semanticQuery(selfText, indexDir, topK + 1);
  const others = candidates.filter((candidate) => candidate.nodeId !== nodeId);
  return others.slice(0, topK);
}
/**
 * Compare current metadata entries against an existing embedding manifest.
 *
 * @param entries - Current metadata entries
 * @param manifest - Existing embedding manifest
 * @returns `{ changed, added, removed }` — arrays of node IDs:
 *          `added`   are in `entries` but not in `manifest.nodeIds`;
 *          `changed` are in both, with a checksum different from `manifest.checksums`;
 *          `removed` are in `manifest.nodeIds` but no longer in `entries`
 */
export function detectEmbeddingChanges(entries, manifest) {
  const indexedIds = new Set(manifest.nodeIds);
  const currentIds = Object.keys(entries);
  const currentIdSet = new Set(currentIds);

  const added = currentIds.filter((id) => !indexedIds.has(id));
  const changed = currentIds.filter(
    (id) => indexedIds.has(id) && entries[id].checksum !== manifest.checksums[id],
  );
  // Iterate the Set (not the raw array) so a repeated manifest ID is reported once.
  const removed = [...indexedIds].filter((id) => !currentIdSet.has(id));

  return { changed, added, removed };
}
//# sourceMappingURL=embedding-index.js.map