@restnfeel/agentc-starter-kit
Version:
한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템
95 lines (93 loc) • 3.22 kB
JavaScript
/**
 * Splits text into chunks of at most `chunkSize` characters by trying a list
 * of separators in priority order and merging the resulting pieces back
 * together until the size limit is reached.
 *
 * Recognised config keys (all read from `this.config`):
 *  - `separators`   ordered list of separator strings; the first one that
 *                   occurs in the text is used. Defaults to
 *                   `["\n\n", "\n", " ", ""]`.
 *  - `chunkSize`    maximum chunk length in UTF-16 code units.
 *  - `chunkOverlap` number of trailing characters of a finished chunk that
 *                   are repeated at the start of the next one.
 *  - `keepSeparator` stored with a default of `false`; it is not consulted
 *                   by any method of this class.
 */
class RecursiveTextSplitter {
  /**
   * @param {object} [config] - Overrides merged over the defaults above.
   */
  constructor(config) {
    this.config = {
      separators: ["\n\n", "\n", " ", ""],
      keepSeparator: false,
      ...config,
    };
  }

  /**
   * Splits `document.content` and wraps every non-blank chunk with metadata.
   *
   * `chunkIndex` (and the id suffix) is the position of the chunk in the raw
   * `splitText` output, so indices may have gaps when whitespace-only chunks
   * are dropped. `startOffset`/`endOffset` and `tokens` describe the untrimmed
   * chunk, while `content` is trimmed.
   *
   * @param {{id: string, content: string, source: *}} document
   * @returns {Promise<Array<{id: string, content: string, metadata: object}>>}
   *   Offsets are `-1` when the chunk text cannot be located verbatim in the
   *   document (merging can drop separators around empty pieces).
   */
  async splitDocument(document) {
    const { content } = document;
    const { chunkOverlap } = this.config;
    const overlap = chunkOverlap > 0 ? chunkOverlap : 0;
    const rawChunks = this.splitText(content);
    const results = [];
    // Chunks come back in document order, so each one is searched for from
    // just behind the end of the previous match. Searching from 0 every time
    // would give repeated text the offsets of its first occurrence.
    let searchFrom = 0;

    for (let index = 0; index < rawChunks.length; index++) {
      const chunk = rawChunks[index];
      let startOffset = content.indexOf(chunk, searchFrom);
      let endOffset = -1;
      if (startOffset !== -1) {
        endOffset = startOffset + chunk.length;
        // The next chunk may begin up to `overlap` characters before this
        // one ends, but never at or before this one's start.
        searchFrom = Math.max(startOffset + 1, endOffset - overlap);
      } else {
        // Best effort: fall back to a search over the whole document and
        // leave the cursor untouched.
        startOffset = content.indexOf(chunk);
        if (startOffset !== -1) {
          endOffset = startOffset + chunk.length;
        }
      }

      const trimmed = chunk.trim();
      if (trimmed.length === 0) {
        continue;
      }
      results.push({
        id: `${document.id}_chunk_${index}`,
        content: trimmed,
        metadata: {
          documentId: document.id,
          chunkIndex: index,
          startOffset,
          endOffset,
          tokens: this.estimateTokenCount(chunk),
          source: document.source,
        },
      });
    }
    return results;
  }

  /**
   * Splits `text` into chunks no longer than `chunkSize`.
   *
   * The first configured separator that occurs in `text` is used to cut it
   * into pieces, which are then merged greedily. A piece that is itself too
   * long is split again (the next separator that occurs in it is picked);
   * when no non-empty separator is left the text is cut at fixed positions.
   *
   * If `chunkSize` is not configured at all, the text is simply cut at every
   * occurrence of the chosen separator (pre-existing behaviour, preserved).
   *
   * @param {string} text
   * @returns {string[]}
   * @throws {RangeError} If `chunkSize` is configured but is not >= 1.
   */
  splitText(text) {
    const { separators = [], chunkSize, chunkOverlap } = this.config;
    if (chunkSize !== undefined && !(chunkSize >= 1)) {
      throw new RangeError(
        `chunkSize must be a number >= 1, received ${String(chunkSize)}`
      );
    }
    if (text.length <= chunkSize) {
      return [text];
    }

    // "" matches every string, so it acts as the "no separator left" marker.
    const separator = separators.find((sep) => text.includes(sep)) || "";
    if (separator === "") {
      return this._sliceBySize(text, chunkSize, chunkOverlap);
    }

    const chunks = [];
    let current = "";
    for (const piece of text.split(separator)) {
      if (current.length + piece.length + separator.length <= chunkSize) {
        current = current ? current + separator + piece : piece;
        continue;
      }

      const previous = current;
      if (previous) {
        chunks.push(previous);
      }

      if (piece.length > chunkSize) {
        // Too long on its own: split further, whether or not a chunk was
        // just flushed, and keep the last fragment open for merging.
        const subChunks = this.splitText(piece);
        chunks.push(...subChunks.slice(0, -1));
        current = subChunks[subChunks.length - 1] || "";
        continue;
      }

      let carry =
        chunkOverlap > 0 && previous.length > chunkOverlap
          ? previous.slice(-chunkOverlap)
          : "";
      // Shorten the overlap so that carry + separator + piece still fits.
      // With an unconfigured chunkSize `room` is NaN, the comparison is
      // false, and the overlap is kept unchanged.
      const room = chunkSize - separator.length - piece.length;
      if (room < carry.length) {
        carry = room > 0 ? carry.slice(-room) : "";
      }
      current = carry ? carry + separator + piece : piece;
    }

    if (current) {
      chunks.push(current);
    }
    return chunks;
  }

  /**
   * Internal helper: cuts `text` at fixed positions when no separator can be
   * used. Consecutive slices share `chunkOverlap` characters when the overlap
   * is smaller than the chunk size. Positions are UTF-16 code units, so a
   * surrogate pair may be divided between two slices.
   *
   * @param {string} text
   * @param {number} chunkSize
   * @param {number} chunkOverlap
   * @returns {string[]}
   */
  _sliceBySize(text, chunkSize, chunkOverlap) {
    const size = Math.floor(chunkSize);
    if (!(size >= 1)) {
      // Only reachable with an unconfigured chunkSize: nothing to cut by.
      return text ? [text] : [];
    }
    const overlap =
      chunkOverlap > 0 && chunkOverlap < size ? Math.floor(chunkOverlap) : 0;
    const step = size - overlap;
    const slices = [];
    let start = 0;
    while (start < text.length) {
      slices.push(text.slice(start, start + size));
      if (start + size >= text.length) {
        break;
      }
      start += step;
    }
    return slices;
  }

  /**
   * Rough token estimate using a fixed ratio of 4 characters per token.
   *
   * @param {string} text
   * @returns {number}
   */
  estimateTokenCount(text) {
    return Math.ceil(text.length / 4);
  }
}
export { RecursiveTextSplitter };
//# sourceMappingURL=recursive.js.map