@restnfeel/agentc-starter-kit
Version:
한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템
112 lines (94 loc) • 2.94 kB
text/typescript
import { Document, DocumentChunk, ChunkMetadata } from "../types";
/**
 * Options accepted by the RecursiveTextSplitter constructor.
 */
export interface TextSplitterConfig {
/** Target upper bound on chunk length, compared against string `length` (UTF-16 code units). */
chunkSize: number;
/** Number of trailing characters of a finished chunk that are repeated at the start of the next one. */
chunkOverlap: number;
/**
 * Candidate separators, tried in the order given; the first one that occurs in
 * the text is used to split it. When omitted, the splitter's constructor
 * supplies ["\n\n", "\n", " ", ""].
 */
separators?: string[];
/**
 * The splitter's constructor defaults this to false.
 * NOTE(review): no code in this file reads the flag — confirm whether it is
 * consumed elsewhere or is still unimplemented.
 */
keepSeparator?: boolean;
}
/**
 * Splits text into chunks of at most `chunkSize` characters by trying a list
 * of separators from coarsest to finest and, when no separator applies,
 * falling back to fixed-width slicing.
 *
 * Separators between merged pieces are re-inserted inside a chunk; the
 * separator at a chunk boundary is dropped unless overlap carries text across
 * it. `keepSeparator` is stored in the config but is not consulted here.
 */
export class RecursiveTextSplitter {
  /** Separator list used when the caller does not supply one. */
  private static readonly DEFAULT_SEPARATORS: readonly string[] = ["\n\n", "\n", " ", ""];

  private config: TextSplitterConfig;

  /**
   * @param config Splitter options; `separators` and `keepSeparator` receive
   *   defaults when they are missing or explicitly `undefined`.
   * @throws RangeError when `chunkSize` is not a positive number (such a value
   *   can never be satisfied by any non-empty text).
   */
  constructor(config: TextSplitterConfig) {
    if (!(config.chunkSize > 0)) {
      throw new RangeError(
        `chunkSize must be a positive number, received ${config.chunkSize}`
      );
    }
    this.config = {
      ...config,
      // Resolved after the spread so an explicit `undefined` cannot erase the default.
      separators:
        config.separators !== undefined
          ? config.separators
          : [...RecursiveTextSplitter.DEFAULT_SEPARATORS],
      keepSeparator:
        config.keepSeparator !== undefined ? config.keepSeparator : false,
    };
  }

  /**
   * Splits a document's content and wraps every non-blank chunk with metadata.
   *
   * `content` is the trimmed chunk text; `startOffset`/`endOffset` delimit that
   * trimmed text inside `document.content`, and `tokens` is estimated from it.
   * `chunkIndex` (and the id suffix) is the position among all produced pieces,
   * so numbering can have gaps where whitespace-only pieces were dropped.
   *
   * @param document Source document; its `id`, `content` and `source` are read.
   * @returns The chunks in document order.
   */
  async splitDocument(document: Document): Promise<DocumentChunk[]> {
    const { content } = document;
    const overlap = this.config.chunkOverlap > 0 ? this.config.chunkOverlap : 0;
    const result: DocumentChunk[] = [];
    // Earliest position at which the next piece may begin. Searching from here
    // instead of from 0 keeps repeated passages from all resolving to the
    // first occurrence.
    let searchFrom = 0;

    this.splitText(content).forEach((piece, index) => {
      let rawStart = content.indexOf(piece, searchFrom);
      if (rawStart === -1) {
        // Fall back to a whole-document search if the forward search misses.
        rawStart = content.indexOf(piece);
      }
      if (rawStart !== -1) {
        // The next piece starts after this one begins and no further back
        // than `overlap` characters before this one ends.
        searchFrom = Math.max(rawStart + 1, rawStart + piece.length - overlap);
      }

      const trimmed = piece.trim();
      if (trimmed.length === 0) {
        return;
      }

      // `trimmed` begins at the first non-whitespace character of `piece`.
      const leadingWhitespace = piece.indexOf(trimmed);
      const startOffset = rawStart === -1 ? -1 : rawStart + leadingWhitespace;
      const endOffset = rawStart === -1 ? -1 : startOffset + trimmed.length;

      const metadata: ChunkMetadata = {
        documentId: document.id,
        chunkIndex: index,
        startOffset,
        endOffset,
        tokens: this.estimateTokenCount(trimmed),
        source: document.source,
      };
      result.push({
        id: `${document.id}_chunk_${index}`,
        content: trimmed,
        metadata,
      });
    });

    return result;
  }

  /**
   * Recursively splits `text` into pieces no longer than `chunkSize`.
   *
   * @param text Text to split.
   * @param separators Separators still available at this recursion level;
   *   defaults to the configured list.
   * @returns Pieces in source order (untrimmed).
   */
  private splitText(text: string, separators?: readonly string[]): string[] {
    const { chunkSize } = this.config;
    if (text.length <= chunkSize) {
      return [text];
    }

    const configured = this.config.separators;
    const candidates: readonly string[] =
      separators !== undefined
        ? separators
        : configured !== undefined
          ? configured
          : [];

    // First candidate present in the text; "" is contained in every string,
    // so reaching it means "no real separator is left".
    const separatorIndex = candidates.findIndex((candidate) =>
      text.includes(candidate)
    );
    const separator = separatorIndex === -1 ? "" : candidates[separatorIndex];
    if (!separator) {
      // Without this fallback the method would recurse on the same text forever.
      return this.sliceByLength(text);
    }
    // A piece produced by split() cannot contain this separator (or any
    // earlier one), so deeper levels only need the finer candidates.
    const finerSeparators = candidates.slice(separatorIndex + 1);

    const chunks: string[] = [];
    let current = "";

    for (const piece of text.split(separator)) {
      if (
        current &&
        current.length + separator.length + piece.length <= chunkSize
      ) {
        current += separator + piece;
        continue;
      }

      let carry = "";
      if (current) {
        chunks.push(current);
        carry = this.overlapTail(
          current,
          chunkSize - separator.length - piece.length
        );
        current = "";
      }

      if (piece.length > chunkSize) {
        // Too long even on its own: break it down with the finer separators
        // and keep the last fragment open so later pieces can join it.
        const fragments = this.splitText(piece, finerSeparators);
        const last = fragments.pop();
        chunks.push(...fragments);
        current = last !== undefined ? last : "";
      } else {
        current = carry ? carry + separator + piece : piece;
      }
    }

    if (current) {
      chunks.push(current);
    }
    return chunks;
  }

  /**
   * Returns the tail of `previous` to repeat at the start of the next chunk.
   *
   * @param previous The chunk that was just completed.
   * @param room Characters still available in the next chunk after the
   *   separator and the incoming piece are accounted for.
   * @returns At most `chunkOverlap` characters, shortened to `room`, or "" when
   *   overlap is disabled, `previous` is not longer than the overlap, or there
   *   is no room.
   */
  private overlapTail(previous: string, room: number): string {
    const { chunkOverlap } = this.config;
    if (!(chunkOverlap > 0) || previous.length <= chunkOverlap) {
      return "";
    }
    const length = Math.floor(Math.min(chunkOverlap, room));
    return length > 0 ? previous.slice(-length) : "";
  }

  /**
   * Cuts `text` into consecutive windows of `chunkSize` UTF-16 code units,
   * each starting `chunkOverlap` units before the previous window ends.
   * The overlap is capped below the window size so the cursor always advances.
   */
  private sliceByLength(text: string): string[] {
    const { chunkSize, chunkOverlap } = this.config;
    const size = Math.max(1, Math.floor(chunkSize));
    const overlap =
      chunkOverlap > 0 ? Math.min(Math.floor(chunkOverlap), size - 1) : 0;
    const stride = size - overlap;

    const slices: string[] = [];
    for (let start = 0; start < text.length; start += stride) {
      slices.push(text.slice(start, start + size));
      if (start + size >= text.length) {
        break;
      }
    }
    return slices;
  }

  /** Rough token estimate: one token per four characters, rounded up. */
  private estimateTokenCount(text: string): number {
    return Math.ceil(text.length / 4);
  }
}