packfs-core
Version:
Semantic filesystem operations for LLM agent frameworks with natural language understanding. See LLM_AGENT_GUIDE.md for copy-paste examples.
107 lines • 3.68 kB
JavaScript
;
/**
* Semantic text chunking for large files
*/
// CommonJS interop preamble: flag the export object with `__esModule` and
// declare the `SemanticChunker` export slot (initially undefined) before the
// class is defined further down in this file.
Object.defineProperty(exports, "__esModule", { value: true });
exports.SemanticChunker = void 0;
/**
 * Splits long text into chunks no longer than `maxChunkSize` characters.
 *
 * Text is divided at blank-line paragraph boundaries first; paragraphs that
 * are too long are divided at sentence boundaries, and sentences that are
 * still too long are cut into fixed-size windows. Lengths are measured with
 * `String.prototype.length` (UTF-16 code units).
 */
class SemanticChunker {
    /**
     * @param {object} [options]
     * @param {number} [options.maxChunkSize=4000] Upper bound on the length of each chunk.
     * @param {number} [options.overlapSize=200] Number of trailing characters of the
     *   previous chunk repeated at the start of the next one.
     * @param {boolean} [options.preserveFormatting=true] Stored on `this.options`;
     *   not read by any method of this class.
     * @param {string} [options.encoding='utf-8'] Stored on `this.options`;
     *   not read by any method of this class.
     */
    constructor(options = {}) {
        this.options = {
            maxChunkSize: options.maxChunkSize ?? 4000,
            overlapSize: options.overlapSize ?? 200,
            preserveFormatting: options.preserveFormatting ?? true,
            encoding: options.encoding ?? 'utf-8'
        };
    }
    /**
     * Chunk text content semantically.
     *
     * Content that already fits in `maxChunkSize` is returned unchanged as a
     * single chunk. Otherwise paragraphs are packed greedily into chunks; when
     * a new chunk is started after a regular paragraph chunk, it is prefixed
     * with the tail of the previous chunk (as much of `overlapSize` as fits).
     *
     * @param {string} content Text to split.
     * @returns {{chunks: string[], metadata: {totalSize: number, chunkCount: number, avgChunkSize: number}}}
     *   `avgChunkSize` is `content.length / chunkCount` rounded (0 when no
     *   chunks were produced); it is derived from the input length, not from
     *   the lengths of the emitted chunks.
     */
    chunk(content) {
        const { maxChunkSize } = this.options;
        if (content.length <= maxChunkSize) {
            return {
                chunks: [content],
                metadata: {
                    totalSize: content.length,
                    chunkCount: 1,
                    avgChunkSize: content.length
                }
            };
        }
        const separator = '\n\n';
        const chunks = [];
        let currentChunk = '';
        // Split by paragraphs first, then by sentences if needed
        for (const paragraph of content.split(/\n\s*\n/)) {
            const joiner = currentChunk ? separator : '';
            // The separator counts toward the size limit too.
            if (currentChunk.length + joiner.length + paragraph.length <= maxChunkSize) {
                currentChunk += joiner + paragraph;
                continue;
            }
            if (currentChunk) {
                chunks.push(currentChunk);
            }
            if (paragraph.length > maxChunkSize) {
                // Paragraph alone is too large: split it by sentences, even
                // when a chunk was already in progress.
                for (const piece of this.splitLargeParagraph(paragraph)) {
                    chunks.push(piece);
                }
                currentChunk = '';
                continue;
            }
            // Start a new chunk, carrying only as much overlap as still fits
            // next to the paragraph so the size limit is never exceeded.
            const room = maxChunkSize - paragraph.length - separator.length;
            const overlap = room > 0 ? this.getOverlap(currentChunk).slice(-room) : '';
            currentChunk = overlap ? overlap + separator + paragraph : paragraph;
        }
        if (currentChunk) {
            chunks.push(currentChunk);
        }
        return {
            chunks,
            metadata: {
                totalSize: content.length,
                chunkCount: chunks.length,
                // Guard against division by zero when the input consisted
                // only of paragraph separators.
                avgChunkSize: chunks.length > 0 ? Math.round(content.length / chunks.length) : 0
            }
        };
    }
    /**
     * Return the trailing `overlapSize` characters of a chunk (the whole chunk
     * if it is shorter). The cut is purely positional and may fall mid-word.
     *
     * @param {string} chunk
     * @returns {string} The overlap text, or '' when `overlapSize` is not positive.
     */
    getOverlap(chunk) {
        const { overlapSize } = this.options;
        // `slice(-0)` is `slice(0)` and would return the entire chunk, so a
        // zero (or negative) overlap has to be handled explicitly.
        if (overlapSize <= 0) {
            return '';
        }
        if (chunk.length <= overlapSize) {
            return chunk;
        }
        return chunk.slice(-overlapSize);
    }
    /**
     * Split one paragraph into chunks at sentence boundaries.
     *
     * A boundary is a run of whitespace that follows `.`, `!` or `?`. The
     * terminating punctuation stays attached to its sentence; sentences packed
     * into the same chunk are joined with a single space. No overlap is added
     * between the returned chunks.
     *
     * @param {string} paragraph
     * @returns {string[]} Chunks, each at most `maxChunkSize` long.
     */
    splitLargeParagraph(paragraph) {
        const { maxChunkSize } = this.options;
        // Lookbehind keeps the punctuation instead of consuming it.
        const sentences = paragraph.split(/(?<=[.!?])\s+/);
        const chunks = [];
        let currentChunk = '';
        for (const sentence of sentences) {
            const joiner = currentChunk ? ' ' : '';
            if (currentChunk.length + joiner.length + sentence.length <= maxChunkSize) {
                currentChunk += joiner + sentence;
                continue;
            }
            if (currentChunk) {
                chunks.push(currentChunk);
                currentChunk = '';
            }
            if (sentence.length > maxChunkSize) {
                // Single sentence is too large, fall back to fixed-size windows
                for (const piece of this.splitBySizeLimit(sentence)) {
                    chunks.push(piece);
                }
            }
            else {
                currentChunk = sentence;
            }
        }
        if (currentChunk) {
            chunks.push(currentChunk);
        }
        return chunks;
    }
    /**
     * Cut text into windows of `maxChunkSize` characters, where consecutive
     * windows share `overlapSize` characters. Cuts are positional (UTF-16
     * code units) and ignore word boundaries.
     *
     * @param {string} text
     * @returns {string[]} Windows in order; empty array for empty text.
     */
    splitBySizeLimit(text) {
        const { maxChunkSize, overlapSize } = this.options;
        // Always advance by at least one character so the loop terminates
        // even when the overlap is as large as the window.
        const step = Math.max(1, maxChunkSize - Math.max(0, overlapSize));
        const chunks = [];
        for (let start = 0; start < text.length; start += step) {
            const end = Math.min(start + maxChunkSize, text.length);
            chunks.push(text.slice(start, end));
            // Once a window reaches the end of the text, any further window
            // would only repeat a suffix of it.
            if (end >= text.length) {
                break;
            }
        }
        return chunks;
    }
}
// Publish the class on the CommonJS export object.
exports.SemanticChunker = SemanticChunker;
//# sourceMappingURL=chunker.js.map