@jackhua/mini-langchain

A lightweight TypeScript implementation of LangChain with cost optimization features

"use strict"; /** * Character text splitter */ Object.defineProperty(exports, "__esModule", { value: true }); exports.TokenTextSplitter = exports.CharacterTextSplitter = void 0; const base_1 = require("./base"); /** * Split text by character count */ class CharacterTextSplitter extends base_1.BaseTextSplitter { constructor(params = { chunkSize: 1000, chunkOverlap: 200, separator: '\n\n' }) { super(params); this.separator = params.separator || '\n\n'; } async splitText(text) { // Split by separator const splits = text.split(this.separator); // Filter out empty strings const nonEmptySplits = splits.filter(s => s.trim().length > 0); // Merge small splits const chunks = this.mergeSplits(nonEmptySplits, this.separator); // Add overlap return this.addOverlap(chunks); } addOverlap(chunks) { if (this.chunkOverlap === 0 || chunks.length <= 1) { return chunks; } const overlappedChunks = []; for (let i = 0; i < chunks.length; i++) { if (i === 0) { overlappedChunks.push(chunks[i]); } else { // Get overlap from previous chunk const prevChunk = chunks[i - 1]; const overlapStart = Math.max(0, prevChunk.length - this.chunkOverlap); const overlap = prevChunk.slice(overlapStart); overlappedChunks.push(overlap + chunks[i]); } } return overlappedChunks; } } exports.CharacterTextSplitter = CharacterTextSplitter; /** * Token text splitter (simplified - counts words as tokens) */ class TokenTextSplitter extends base_1.BaseTextSplitter { constructor(params = { chunkSize: 512, chunkOverlap: 50, encoding: 'cl100k_base' }) { // Override length function to count tokens (simplified to words) super({ ...params, lengthFunction: (text) => this.countTokens(text) }); this.encoding = params.encoding || 'cl100k_base'; } async splitText(text) { const words = text.split(/\s+/); const chunks = []; let currentChunk = []; let currentLength = 0; for (const word of words) { const wordLength = this.countTokens(word); if (currentLength + wordLength > this.chunkSize && currentChunk.length > 0) { chunks.push(currentChunk.join(' ')); // Keep overlap if (this.chunkOverlap > 0) { const overlapWords = []; let overlapLength = 0; for (let i = currentChunk.length - 1; i >= 0; i--) { const overlapWord = currentChunk[i]; const overlapWordLength = this.countTokens(overlapWord); if (overlapLength + overlapWordLength <= this.chunkOverlap) { overlapWords.unshift(overlapWord); overlapLength += overlapWordLength; } else { break; } } currentChunk = overlapWords; currentLength = overlapLength; } else { currentChunk = []; currentLength = 0; } } currentChunk.push(word); currentLength += wordLength; } if (currentChunk.length > 0) { chunks.push(currentChunk.join(' ')); } return chunks; } /** * Simple token counting (word-based approximation) * In production, use tiktoken or similar */ countTokens(text) { // Rough approximation: 1 word ≈ 1.3 tokens const words = text.trim().split(/\s+/).filter(w => w.length > 0); return Math.ceil(words.length * 1.3); } } exports.TokenTextSplitter = TokenTextSplitter; //# sourceMappingURL=character.js.map