@jackhua/mini-langchain
Version:
A lightweight TypeScript implementation of LangChain with cost optimization features
118 lines • 4.15 kB
JavaScript
;
/**
* Character-based and (approximate) token-based text splitters
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.TokenTextSplitter = exports.CharacterTextSplitter = void 0;
const base_1 = require("./base");
/**
 * Splits text on a fixed separator string, re-merges the pieces into chunks,
 * and prefixes every chunk after the first with the tail of its predecessor.
 */
class CharacterTextSplitter extends base_1.BaseTextSplitter {
    /**
     * @param {object} [params] - Options forwarded unchanged to the base class.
     * @param {string} [params.separator] - String to split on; a missing or
     *   empty value falls back to '\n\n'.
     */
    constructor(params = {
        chunkSize: 1000,
        chunkOverlap: 200,
        separator: '\n\n'
    }) {
        super(params);
        const { separator } = params;
        this.separator = separator || '\n\n';
    }
    /**
     * Split `text` into overlapping chunks.
     *
     * @param {string} text - Text to split.
     * @returns {Promise<string[]>} The chunks produced by addOverlap().
     */
    async splitText(text) {
        // Cut on the separator and discard whitespace-only pieces.
        const pieces = text
            .split(this.separator)
            .filter((piece) => piece.trim().length > 0);
        // Recombine small pieces using the inherited merge routine.
        const merged = this.mergeSplits(pieces, this.separator);
        return this.addOverlap(merged);
    }
    /**
     * Prepend to each chunk (except the first) the last `chunkOverlap`
     * characters of the chunk that precedes it in the input array.
     *
     * @param {string[]} chunks - Chunks without overlap.
     * @returns {string[]} The input array itself when no overlap applies,
     *   otherwise a new array of overlapped chunks.
     */
    addOverlap(chunks) {
        const nothingToOverlap = this.chunkOverlap === 0 || chunks.length <= 1;
        if (nothingToOverlap) {
            return chunks;
        }
        const result = [chunks[0]];
        for (let idx = 1; idx < chunks.length; idx += 1) {
            // The tail is taken from the original previous chunk, not from
            // the already-overlapped one.
            const previous = chunks[idx - 1];
            const tailStart = Math.max(0, previous.length - this.chunkOverlap);
            result.push(previous.slice(tailStart) + chunks[idx]);
        }
        return result;
    }
}
exports.CharacterTextSplitter = CharacterTextSplitter;
/**
 * Token text splitter (simplified - approximates tokens from
 * whitespace-separated words instead of running a real tokenizer).
 */
class TokenTextSplitter extends base_1.BaseTextSplitter {
    /**
     * @param {object} [params] - Options forwarded to the base class with
     *   `lengthFunction` replaced by this class's countTokens().
     * @param {string} [params.encoding] - Encoding name kept on the instance;
     *   a missing or empty value falls back to 'cl100k_base'. The word-based
     *   approximation in this class does not consult it.
     */
    constructor(params = {
        chunkSize: 512,
        chunkOverlap: 50,
        encoding: 'cl100k_base'
    }) {
        // The arrow function only touches `this` when it is invoked, so it is
        // safe to create it before super() returns.
        // NOTE(review): assumes the base constructor does not call
        // lengthFunction while constructing - confirm against ./base.
        super({
            ...params,
            lengthFunction: (text) => this.countTokens(text)
        });
        this.encoding = params.encoding || 'cl100k_base';
    }
    /**
     * Split `text` into chunks of whitespace-separated words whose summed
     * per-word token estimate stays within `this.chunkSize`, carrying up to
     * `this.chunkOverlap` estimated tokens of trailing words into the next
     * chunk.
     *
     * Words are re-joined with a single space, so original whitespace is not
     * preserved. Empty or whitespace-only input yields an empty array.
     *
     * @param {string} text - Text to split.
     * @returns {Promise<string[]>} The chunks, in order.
     */
    async splitText(text) {
        // split(/\s+/) yields '' entries for leading/trailing whitespace and
        // for empty input. Drop them so they never become zero-cost "words"
        // that add stray spaces or produce an empty chunk.
        const words = text.split(/\s+/).filter((word) => word.length > 0);
        const chunks = [];
        let currentChunk = [];
        let currentLength = 0;
        for (const word of words) {
            // Each word is measured on its own, so every non-empty word costs
            // Math.ceil(1 * 1.3) = 2 units against the budget.
            const wordLength = this.countTokens(word);
            // A chunk is only closed when it already holds a word, so a
            // single word is always emitted even if it exceeds the budget.
            if (currentLength + wordLength > this.chunkSize && currentChunk.length > 0) {
                const finished = currentChunk;
                chunks.push(finished.join(' '));
                currentChunk = [];
                currentLength = 0;
                if (this.chunkOverlap > 0) {
                    // Seed the next chunk with as many trailing words of the
                    // finished chunk as fit in the overlap budget, walking
                    // backwards and stopping at the first word that does not fit.
                    for (let i = finished.length - 1; i >= 0; i--) {
                        const overlapWord = finished[i];
                        const overlapWordLength = this.countTokens(overlapWord);
                        if (currentLength + overlapWordLength > this.chunkOverlap) {
                            break;
                        }
                        currentChunk.unshift(overlapWord);
                        currentLength += overlapWordLength;
                    }
                }
            }
            currentChunk.push(word);
            currentLength += wordLength;
        }
        if (currentChunk.length > 0) {
            chunks.push(currentChunk.join(' '));
        }
        return chunks;
    }
    /**
     * Simple token counting (word-based approximation).
     * In production, use tiktoken or similar.
     *
     * @param {string} text - Text to measure.
     * @returns {number} Number of whitespace-separated non-empty words times
     *   1.3, rounded up; 0 for empty or whitespace-only text.
     */
    countTokens(text) {
        // Rough approximation: 1 word ≈ 1.3 tokens
        const words = text.trim().split(/\s+/).filter(w => w.length > 0);
        return Math.ceil(words.length * 1.3);
    }
}
exports.TokenTextSplitter = TokenTextSplitter;
//# sourceMappingURL=character.js.map