sf-agent-framework
Version:
AI Agent Orchestration Framework for Salesforce Development - Two-phase architecture with 70% context reduction
658 lines (554 loc) ⢠18.2 kB
JavaScript
/**
* Document Sharding System
*
* Purpose: Achieve 90% token savings by intelligently sharding large documents
* into story-sized, context-aware chunks that can be loaded on-demand
*
* Key Features:
* - Semantic chunking (preserves meaning across boundaries)
* - Priority-based loading (load what matters first)
* - Story-sized shards (fits in agent context windows)
* - Dependency tracking (knows what needs what)
* - Token budget awareness (stays within limits)
*
* @module DocumentSharder
* @version 1.0.0
* @date 2025-11-25
*/
const fs = require('fs-extra');
const path = require('path');
const crypto = require('crypto');
class DocumentSharder {
constructor(rootDir = process.cwd()) {
this.rootDir = rootDir;
this.shardsDir = path.join(rootDir, '.sf-agent', 'shards');
this.indexPath = path.join(this.shardsDir, 'shard-index.json');
// Configuration
this.config = {
// Target sizes (in tokens - approximate using chars/4)
maxShardTokens: 2000, // ~8000 chars
minShardTokens: 500, // ~2000 chars
overlapTokens: 100, // ~400 chars overlap for context
// Chunking strategies
strategies: {
markdown: 'heading-based', // Split on headings
code: 'function-based', // Split on functions/classes
yaml: 'section-based', // Split on top-level sections
json: 'object-based', // Split on objects
text: 'paragraph-based', // Split on paragraphs
},
// Priority levels for loading
priorities: {
critical: 1, // Load first (architecture, requirements)
high: 2, // Load early (implementation details)
medium: 3, // Load as needed (tests, examples)
low: 4, // Load on demand (documentation, comments)
archive: 5, // Don't auto-load (history, backups)
},
};
}
/**
* Initialize sharding system
*/
async initialize() {
await fs.ensureDir(this.shardsDir);
if (!(await fs.pathExists(this.indexPath))) {
await this.createEmptyIndex();
}
console.log('ā Document sharding system initialized');
return true;
}
/**
* Create empty shard index
*/
async createEmptyIndex() {
const index = {
version: '1.0.0',
created: new Date().toISOString(),
lastUpdated: new Date().toISOString(),
shards: {},
documents: {},
stats: {
totalDocuments: 0,
totalShards: 0,
totalTokensSaved: 0,
},
};
await fs.writeJson(this.indexPath, index, { spaces: 2 });
}
/**
* Shard a document into smaller chunks
*/
async shardDocument(documentPath, options = {}) {
try {
console.log(`\nš Sharding document: ${documentPath}`);
// Read document
const content = await fs.readFile(documentPath, 'utf8');
const fileType = path.extname(documentPath).slice(1) || 'text';
const strategy = options.strategy || this.config.strategies[fileType] || 'paragraph-based';
// Calculate original token count (approximate)
const originalTokens = this.estimateTokens(content);
console.log(` Original size: ~${originalTokens} tokens`);
// Choose sharding strategy
const chunks = await this.chunkDocument(content, fileType, strategy);
console.log(` Created ${chunks.length} chunks`);
// Create shards with metadata
const shards = await this.createShards(documentPath, chunks, options);
// Update index
await this.updateIndex(documentPath, shards, originalTokens);
// Calculate savings
const tokensSaved = this.calculateSavings(originalTokens, shards);
console.log(` ā Token savings: ${tokensSaved.percentage}% (${tokensSaved.saved} tokens)`);
return {
documentPath,
shards,
originalTokens,
tokensSaved: tokensSaved.saved,
savingsPercentage: tokensSaved.percentage,
};
} catch (error) {
console.error('ā Sharding failed:', error.message);
throw error;
}
}
/**
* Chunk document based on strategy
*/
async chunkDocument(content, fileType, strategy) {
switch (strategy) {
case 'heading-based':
return this.chunkByHeadings(content);
case 'function-based':
return this.chunkByFunctions(content);
case 'section-based':
return this.chunkBySections(content);
case 'object-based':
return this.chunkByObjects(content);
case 'paragraph-based':
default:
return this.chunkByParagraphs(content);
}
}
/**
* Chunk by markdown headings
*/
chunkByHeadings(content) {
const chunks = [];
const lines = content.split('\n');
let currentChunk = { lines: [], heading: null, level: 0 };
for (const line of lines) {
const headingMatch = line.match(/^(#{1,6})\s+(.+)/);
if (headingMatch) {
// Save previous chunk if it has content
if (currentChunk.lines.length > 0) {
chunks.push(this.finalizeChunk(currentChunk));
}
// Start new chunk
currentChunk = {
lines: [line],
heading: headingMatch[2],
level: headingMatch[1].length,
};
} else {
currentChunk.lines.push(line);
}
// Split if chunk gets too large
if (this.estimateTokens(currentChunk.lines.join('\n')) > this.config.maxShardTokens) {
chunks.push(this.finalizeChunk(currentChunk));
currentChunk = {
lines: [],
heading: `${currentChunk.heading} (continued)`,
level: currentChunk.level,
};
}
}
// Add final chunk
if (currentChunk.lines.length > 0) {
chunks.push(this.finalizeChunk(currentChunk));
}
return chunks;
}
/**
* Chunk by functions/classes
*/
chunkByFunctions(content) {
const chunks = [];
const lines = content.split('\n');
let currentChunk = { lines: [], type: 'code', name: 'preamble' };
let braceDepth = 0;
for (const line of lines) {
// Detect function/class declarations
const funcMatch = line.match(/(?:function|class|const|let|var)\s+(\w+)/);
const asyncMatch = line.match(/async\s+(?:function\s+)?(\w+)/);
if ((funcMatch || asyncMatch) && braceDepth === 0) {
// Save previous chunk
if (currentChunk.lines.length > 0) {
chunks.push(this.finalizeChunk(currentChunk));
}
// Start new chunk
currentChunk = {
lines: [line],
type: 'function',
name: funcMatch?.[1] || asyncMatch?.[1] || 'anonymous',
};
} else {
currentChunk.lines.push(line);
}
// Track brace depth
braceDepth += (line.match(/{/g) || []).length;
braceDepth -= (line.match(/}/g) || []).length;
// Split if chunk gets too large
if (this.estimateTokens(currentChunk.lines.join('\n')) > this.config.maxShardTokens) {
chunks.push(this.finalizeChunk(currentChunk));
currentChunk = { lines: [], type: 'code', name: 'overflow' };
}
}
// Add final chunk
if (currentChunk.lines.length > 0) {
chunks.push(this.finalizeChunk(currentChunk));
}
return chunks;
}
/**
* Chunk by YAML sections
*/
chunkBySections(content) {
const chunks = [];
const lines = content.split('\n');
let currentChunk = { lines: [], section: 'header', level: 0 };
for (const line of lines) {
// Detect top-level keys (no indentation)
if (line.match(/^[a-zA-Z_][\w-]*:/) && !line.startsWith(' ')) {
// Save previous chunk
if (currentChunk.lines.length > 0) {
chunks.push(this.finalizeChunk(currentChunk));
}
// Start new chunk
const sectionName = line.split(':')[0].trim();
currentChunk = {
lines: [line],
section: sectionName,
level: 0,
};
} else {
currentChunk.lines.push(line);
}
// Split if chunk gets too large
if (this.estimateTokens(currentChunk.lines.join('\n')) > this.config.maxShardTokens) {
chunks.push(this.finalizeChunk(currentChunk));
currentChunk = { lines: [], section: `${currentChunk.section}_continued`, level: 0 };
}
}
// Add final chunk
if (currentChunk.lines.length > 0) {
chunks.push(this.finalizeChunk(currentChunk));
}
return chunks;
}
/**
* Chunk by JSON objects
*/
chunkByObjects(content) {
try {
const data = JSON.parse(content);
const chunks = [];
if (Array.isArray(data)) {
// Split array into chunks
for (let i = 0; i < data.length; i++) {
chunks.push({
content: JSON.stringify(data[i], null, 2),
type: 'array-item',
name: `item_${i}`,
});
}
} else if (typeof data === 'object') {
// Split object by keys
for (const [key, value] of Object.entries(data)) {
chunks.push({
content: JSON.stringify({ [key]: value }, null, 2),
type: 'object-key',
name: key,
});
}
}
return chunks;
} catch (error) {
// Fallback to paragraph-based if JSON is invalid
return this.chunkByParagraphs(content);
}
}
/**
* Chunk by paragraphs
*/
chunkByParagraphs(content) {
const chunks = [];
const paragraphs = content.split(/\n\s*\n/);
let currentChunk = { lines: [], type: 'text', name: 'paragraph' };
for (const paragraph of paragraphs) {
const trimmed = paragraph.trim();
if (!trimmed) continue;
// Check if adding this paragraph would exceed max size
const potentialContent = [...currentChunk.lines, trimmed].join('\n\n');
if (
this.estimateTokens(potentialContent) > this.config.maxShardTokens &&
currentChunk.lines.length > 0
) {
// Save current chunk
chunks.push(this.finalizeChunk(currentChunk));
currentChunk = { lines: [trimmed], type: 'text', name: 'paragraph' };
} else {
currentChunk.lines.push(trimmed);
}
}
// Add final chunk
if (currentChunk.lines.length > 0) {
chunks.push(this.finalizeChunk(currentChunk));
}
return chunks;
}
/**
* Finalize chunk with metadata
*/
finalizeChunk(chunk) {
const content = Array.isArray(chunk.lines) ? chunk.lines.join('\n') : chunk.content;
return {
content,
metadata: {
type: chunk.type || 'unknown',
name: chunk.heading || chunk.name || chunk.section || 'untitled',
level: chunk.level || 0,
tokens: this.estimateTokens(content),
},
};
}
/**
* Create shards with full metadata
*/
async createShards(documentPath, chunks, options = {}) {
const shards = [];
const docHash = this.hashPath(documentPath);
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
const shardId = `${docHash}_${i.toString().padStart(4, '0')}`;
const shardPath = path.join(this.shardsDir, `${shardId}.shard`);
// Determine priority
const priority = this.determinePriority(chunk, i, chunks.length, options);
// Create shard metadata
const shard = {
id: shardId,
documentPath,
index: i,
totalShards: chunks.length,
priority,
metadata: chunk.metadata,
tokens: chunk.metadata.tokens,
created: new Date().toISOString(),
path: shardPath,
};
// Write shard file
await fs.writeFile(shardPath, chunk.content, 'utf8');
// Add metadata file
await fs.writeJson(`${shardPath}.meta`, shard, { spaces: 2 });
shards.push(shard);
}
return shards;
}
/**
* Determine shard priority
*/
determinePriority(chunk, index, totalShards, options) {
// User-specified priority
if (options.priority) return options.priority;
// First shard is usually critical (contains overview)
if (index === 0) return this.config.priorities.critical;
// Last few shards often less important (appendix, references)
if (index >= totalShards - 2) return this.config.priorities.low;
// Check content type
const name = chunk.metadata.name.toLowerCase();
// Critical sections
if (
name.includes('architecture') ||
name.includes('requirement') ||
name.includes('overview') ||
name.includes('introduction')
) {
return this.config.priorities.critical;
}
// High priority sections
if (
name.includes('implementation') ||
name.includes('design') ||
name.includes('api') ||
name.includes('interface')
) {
return this.config.priorities.high;
}
// Low priority sections
if (
name.includes('example') ||
name.includes('test') ||
name.includes('reference') ||
name.includes('history')
) {
return this.config.priorities.low;
}
// Default: medium priority
return this.config.priorities.medium;
}
/**
* Update shard index
*/
async updateIndex(documentPath, shards, originalTokens) {
const index = await fs.readJson(this.indexPath);
const docHash = this.hashPath(documentPath);
// Update document entry
index.documents[docHash] = {
path: documentPath,
shards: shards.map((s) => s.id),
originalTokens,
shardedTokens: shards.reduce((sum, s) => sum + s.tokens, 0),
created: new Date().toISOString(),
shardCount: shards.length,
};
// Update shard entries
for (const shard of shards) {
index.shards[shard.id] = {
documentHash: docHash,
index: shard.index,
priority: shard.priority,
tokens: shard.tokens,
path: shard.path,
};
}
// Update stats
index.stats.totalDocuments = Object.keys(index.documents).length;
index.stats.totalShards = Object.keys(index.shards).length;
index.lastUpdated = new Date().toISOString();
await fs.writeJson(this.indexPath, index, { spaces: 2 });
}
/**
* Load shards by priority
*/
async loadShardsByPriority(documentPath, maxPriority = 3, maxTokens = 10000) {
const index = await fs.readJson(this.indexPath);
const docHash = this.hashPath(documentPath);
const docEntry = index.documents[docHash];
if (!docEntry) {
throw new Error(`Document not found in index: ${documentPath}`);
}
// Get all shards for this document
const allShards = docEntry.shards.map((id) => ({
id,
...index.shards[id],
}));
// Sort by priority (lower number = higher priority)
allShards.sort((a, b) => a.priority - b.priority);
// Load shards up to budget
const loadedShards = [];
let tokenCount = 0;
for (const shardMeta of allShards) {
if (shardMeta.priority > maxPriority) break;
if (tokenCount + shardMeta.tokens > maxTokens) break;
const shardContent = await fs.readFile(shardMeta.path, 'utf8');
loadedShards.push({
...shardMeta,
content: shardContent,
});
tokenCount += shardMeta.tokens;
}
return {
shards: loadedShards,
tokensLoaded: tokenCount,
totalAvailable: allShards.length,
loadedCount: loadedShards.length,
};
}
/**
* Load specific shard by ID
*/
async loadShard(shardId) {
const index = await fs.readJson(this.indexPath);
const shardMeta = index.shards[shardId];
if (!shardMeta) {
throw new Error(`Shard not found: ${shardId}`);
}
const content = await fs.readFile(shardMeta.path, 'utf8');
return {
...shardMeta,
content,
};
}
/**
* Get shard statistics
*/
async getStats() {
const index = await fs.readJson(this.indexPath);
// Calculate total savings
let totalOriginal = 0;
let totalSharded = 0;
for (const doc of Object.values(index.documents)) {
totalOriginal += doc.originalTokens;
totalSharded += doc.shardedTokens;
}
const savings =
totalOriginal > 0 ? Math.round(((totalOriginal - totalSharded) / totalOriginal) * 100) : 0;
return {
...index.stats,
totalOriginalTokens: totalOriginal,
totalShardedTokens: totalSharded,
savingsPercentage: savings,
documents: Object.values(index.documents).map((doc) => ({
path: doc.path,
shardCount: doc.shardCount,
originalTokens: doc.originalTokens,
})),
};
}
/**
* Calculate savings
*/
calculateSavings(originalTokens, shards) {
// In typical usage, only critical/high priority shards are loaded
const criticalShards = shards.filter((s) => s.priority <= this.config.priorities.high);
const tokensLoaded = criticalShards.reduce((sum, s) => sum + s.tokens, 0);
const saved = originalTokens - tokensLoaded;
const percentage = Math.round((saved / originalTokens) * 100);
return { saved, percentage, tokensLoaded };
}
/**
* Estimate token count (chars / 4 is rough approximation)
*/
estimateTokens(text) {
return Math.ceil(text.length / 4);
}
/**
* Hash file path for consistent IDs
*/
hashPath(filePath) {
return crypto.createHash('md5').update(filePath).digest('hex').slice(0, 8);
}
/**
* Clean up old shards
*/
async cleanup(documentPath) {
const index = await fs.readJson(this.indexPath);
const docHash = this.hashPath(documentPath);
const docEntry = index.documents[docHash];
if (!docEntry) return;
// Delete shard files
for (const shardId of docEntry.shards) {
const shardMeta = index.shards[shardId];
await fs.remove(shardMeta.path);
await fs.remove(`${shardMeta.path}.meta`);
delete index.shards[shardId];
}
// Remove document entry
delete index.documents[docHash];
// Update stats
index.stats.totalDocuments = Object.keys(index.documents).length;
index.stats.totalShards = Object.keys(index.shards).length;
await fs.writeJson(this.indexPath, index, { spaces: 2 });
console.log(`ā Cleaned up shards for: ${documentPath}`);
}
}
module.exports = DocumentSharder;