UNPKG

@restnfeel/agentc-starter-kit

Version:

한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템

112 lines (94 loc) 2.94 kB
import type { Document, DocumentChunk, ChunkMetadata } from "../types";

/** Options controlling how {@link RecursiveTextSplitter} cuts text into chunks. */
export interface TextSplitterConfig {
  /** Maximum length of a chunk, in characters (UTF-16 code units). */
  chunkSize: number;
  /** How many trailing characters of a chunk are repeated at the start of the next one. */
  chunkOverlap: number;
  /**
   * Candidate separators in priority order. The first one that occurs in the
   * text is used. Defaults to `["\n\n", "\n", " ", ""]`; the empty string means
   * "cut at fixed character positions".
   */
  separators?: string[];
  /** Stored with the configuration; the splitter itself does not read it. */
  keepSeparator?: boolean;
}

/** Half-open `[start, end)` character range inside the text being split. */
interface TextSpan {
  start: number;
  end: number;
}

/**
 * Splits text into chunks of at most `chunkSize` characters, preferring to cut
 * at the coarsest configured separator and falling back to finer ones (and
 * finally to fixed-width slices) for pieces that are still too long.
 *
 * Chunks are tracked as spans of the original text, so every chunk is an exact
 * slice of the input and its offsets are exact even when the same text occurs
 * more than once.
 */
export class RecursiveTextSplitter {
  private config: TextSplitterConfig;

  /**
   * @param config Splitter options; `separators` and `keepSeparator` fall back
   *   to defaults when omitted.
   */
  constructor(config: TextSplitterConfig) {
    this.config = {
      separators: ["\n\n", "\n", " ", ""],
      keepSeparator: false,
      ...config,
    };
  }

  /**
   * Splits a document's content into chunks with positional metadata.
   *
   * `startOffset`/`endOffset` and `tokens` describe the untrimmed slice of
   * `document.content`; `content` is that slice with surrounding whitespace
   * removed. Whitespace-only chunks are dropped after indices are assigned, so
   * `chunkIndex` values (and ids) can have gaps.
   *
   * @param document Document whose `content` is split.
   * @returns The non-empty chunks in document order.
   */
  async splitDocument(document: Document): Promise<DocumentChunk[]> {
    const { content } = document;

    return this.splitSpans(content, 0, content.length)
      .map(({ start, end }, index) => {
        const raw = content.slice(start, end);
        const metadata: ChunkMetadata = {
          documentId: document.id,
          chunkIndex: index,
          startOffset: start,
          endOffset: end,
          tokens: this.estimateTokenCount(raw),
          source: document.source,
        };

        return {
          id: `${document.id}_chunk_${index}`,
          content: raw.trim(),
          metadata,
        };
      })
      .filter((chunk) => chunk.content.length > 0);
  }

  /**
   * Splits `text.slice(start, end)` into spans no longer than `chunkSize`.
   *
   * @param text Full text the offsets refer to.
   * @param start Inclusive start of the range to split.
   * @param end Exclusive end of the range to split.
   * @returns Spans in ascending order; consecutive spans may overlap.
   */
  private splitSpans(text: string, start: number, end: number): TextSpan[] {
    const { separators = [], chunkSize, chunkOverlap } = this.config;

    if (end - start <= chunkSize) {
      return [{ start, end }];
    }

    const segment = text.slice(start, end);
    // `includes("")` is always true, so "" acts as the catch-all separator.
    const separator =
      separators.find((candidate) => segment.includes(candidate)) ?? "";
    if (separator === "") {
      // Nothing left to split on: cut at fixed positions instead of recursing
      // on the same text forever.
      return this.sliceByLength(start, end);
    }

    const spans: TextSpan[] = [];
    // The chunk being assembled; it is "empty" while chunkStart === chunkEnd.
    let chunkStart = start;
    let chunkEnd = start;
    let cursor = start;

    for (const piece of segment.split(separator)) {
      const pieceStart = cursor;
      const pieceEnd = pieceStart + piece.length;
      cursor = pieceEnd + separator.length;

      const isEmpty = chunkEnd === chunkStart;
      const chunkLength = chunkEnd - chunkStart;
      // Length the chunk would have if it were extended through this piece.
      const mergedLength = isEmpty ? piece.length : pieceEnd - chunkStart;

      if (mergedLength <= chunkSize) {
        if (isEmpty) {
          chunkStart = pieceStart;
        }
        chunkEnd = pieceEnd;
        continue;
      }

      if (!isEmpty) {
        spans.push({ start: chunkStart, end: chunkEnd });
      }

      if (piece.length > chunkSize) {
        // The piece alone is too long: break it up with the finer separators.
        // Its last fragment stays open so following pieces can join it.
        const nested = this.splitSpans(text, pieceStart, pieceEnd);
        const tail = nested.pop();
        spans.push(...nested);
        chunkStart = tail?.start ?? pieceEnd;
        chunkEnd = tail?.end ?? pieceEnd;
        continue;
      }

      // Start the next chunk at this piece, carrying over the tail of the
      // chunk just emitted, trimmed so the new chunk still fits.
      let carried = 0;
      if (!isEmpty && chunkOverlap > 0 && chunkLength > chunkOverlap) {
        const room = chunkSize - (pieceEnd - chunkEnd);
        carried = Math.max(0, Math.min(chunkOverlap, room));
      }
      chunkStart = carried > 0 ? chunkEnd - carried : pieceStart;
      chunkEnd = pieceEnd;
    }

    if (chunkEnd > chunkStart) {
      spans.push({ start: chunkStart, end: chunkEnd });
    }

    return spans;
  }

  /**
   * Cuts `[start, end)` into fixed-width spans of `chunkSize` characters,
   * stepping back by `chunkOverlap` between spans when that is smaller than
   * the chunk size.
   *
   * If `chunkSize` is not at least 1 the range cannot be cut meaningfully and
   * is returned as a single span.
   */
  private sliceByLength(start: number, end: number): TextSpan[] {
    const { chunkSize, chunkOverlap } = this.config;
    const size = Math.floor(chunkSize);
    if (!(size >= 1)) {
      return [{ start, end }];
    }

    // The step is always >= 1, so the loop is guaranteed to advance.
    const step =
      chunkOverlap > 0 && chunkOverlap < size
        ? size - Math.floor(chunkOverlap)
        : size;

    const spans: TextSpan[] = [];
    for (let from = start; from < end; from += step) {
      const to = Math.min(from + size, end);
      spans.push({ start: from, end: to });
      if (to === end) {
        break;
      }
    }
    return spans;
  }

  /** Estimates the token count of `text` at roughly four characters per token. */
  private estimateTokenCount(text: string): number {
    return Math.ceil(text.length / 4);
  }
}