/*
 * @jackhua/mini-langchain
 * Version: (not specified)
 * A lightweight TypeScript implementation of LangChain with cost optimization features
 * 240 lines • 7.96 kB
 * JavaScript
 */
"use strict";
/**
* Recursive character text splitter
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.RecursiveCharacterTextSplitterForLanguage = exports.RecursiveCharacterTextSplitter = void 0;
const base_1 = require("./base");
/**
 * Recursively split text by trying a list of separators.
 *
 * The separators are tried in order; the first one that occurs in the text is
 * used to cut it. Pieces that are still too large are cut again with the
 * remaining (finer) separators, and finally by raw size when no separators
 * are left. Overlap between neighbouring chunks is added once, after the
 * whole text has been split.
 *
 * Relies on members that are not defined in this class and are presumably
 * provided by BaseTextSplitter: chunkSize, chunkOverlap, keepSeparator,
 * lengthFunction and mergeSplits.
 */
class RecursiveCharacterTextSplitter extends base_1.BaseTextSplitter {
    /**
     * @param {object} [params] - Splitter options, forwarded unchanged to the
     *   base class constructor. The `{ chunkSize: 1000, chunkOverlap: 200 }`
     *   default only applies when no argument is passed at all.
     * @param {string[]} [params.separators] - Separators in priority order,
     *   coarsest first. Falls back to the built-in list below when omitted.
     */
    constructor(params = {
        chunkSize: 1000,
        chunkOverlap: 200
    }) {
        super(params);
        // Default separators optimized for different content types
        this.separators = params.separators || [
            '\n\n', // Double newline (paragraphs)
            '\n', // Single newline
            '. ', // Sentence ending
            '! ', // Exclamation
            '? ', // Question
            '; ', // Semicolon
            ': ', // Colon
            ' - ', // Dash
            ' ', // Space
            '' // Character
        ];
    }
    /**
     * Split `text` using this splitter's configured separators.
     *
     * @param {string} text - Text to split.
     * @returns {Promise<string[]>} The resulting chunks, with overlap applied.
     */
    async splitText(text) {
        return this.splitTextRecursively(text, this.separators);
    }
    /**
     * Split `text` with the given separator list and add overlap to the result.
     *
     * Overlap is applied exactly once here. The recursion itself lives in
     * splitWithoutOverlap so that chunks produced by nested calls are not
     * overlapped again by every outer level (which duplicated text).
     *
     * @param {string} text - Text to split.
     * @param {string[]} separators - Separators in priority order.
     * @returns {string[]} The resulting chunks, with overlap applied.
     */
    splitTextRecursively(text, separators) {
        return this.addOverlap(this.splitWithoutOverlap(text, separators));
    }
    /**
     * Recursive worker: split `text` into chunks without adding any overlap.
     *
     * @param {string} text - Text to split.
     * @param {string[]} separators - Separators still available, in priority order.
     * @returns {string[]} Chunks in document order, without overlap.
     */
    splitWithoutOverlap(text, separators) {
        const finalChunks = [];
        // Use the first separator that occurs in the text. If none occurs, fall
        // back to the last one in the list; '' always matches and ends the search.
        let separator = separators[separators.length - 1];
        let newSeparators = [];
        for (let i = 0; i < separators.length; i++) {
            const s = separators[i];
            if (s === '') {
                separator = s;
                break;
            }
            if (text.includes(s)) {
                separator = s;
                // Only finer separators remain available for oversized pieces.
                newSeparators = separators.slice(i + 1);
                break;
            }
        }
        const splits = this.splitBySeparator(text, separator);
        // With keepSeparator set, splitBySeparator has already appended the
        // separator to each piece, so the pieces are joined with ''. Passing the
        // separator to mergeSplits again would presumably insert it twice
        // (mergeSplits is defined in the base class - confirm).
        const joiner = this.keepSeparator ? '' : separator;
        // Consecutive pieces that are small enough, waiting to be merged.
        const goodSplits = [];
        const flushGoodSplits = () => {
            if (goodSplits.length > 0) {
                finalChunks.push(...this.mergeSplits(goodSplits, joiner));
                goodSplits.length = 0;
            }
        };
        for (const split of splits) {
            if (this.lengthFunction(split) < this.chunkSize) {
                goodSplits.push(split);
                continue;
            }
            // Emit the pending small pieces first so document order is kept.
            flushGoodSplits();
            if (newSeparators.length === 0) {
                // No more separators, have to split by chunk size
                finalChunks.push(...this.splitBySize(split));
            }
            else {
                // Recursively split with the finer separators
                finalChunks.push(...this.splitWithoutOverlap(split, newSeparators));
            }
        }
        flushGoodSplits();
        return finalChunks;
    }
    /**
     * Cut `text` at every occurrence of `separator`.
     *
     * @param {string} text - Text to cut.
     * @param {string} separator - Separator to cut at; '' cuts into single
     *   UTF-16 code units.
     * @returns {string[]} The pieces. When keepSeparator is set, every piece
     *   except the last keeps its trailing separator and an empty last piece
     *   is dropped; otherwise all empty pieces are dropped.
     */
    splitBySeparator(text, separator) {
        if (separator === '') {
            return text.split('');
        }
        const splits = text.split(separator);
        if (this.keepSeparator && separator !== '') {
            const result = [];
            for (let i = 0; i < splits.length; i++) {
                if (i < splits.length - 1) {
                    result.push(splits[i] + separator);
                }
                else if (splits[i]) {
                    result.push(splits[i]);
                }
            }
            return result;
        }
        return splits.filter(s => s);
    }
    /**
     * Cut `text` into consecutive slices of at most chunkSize UTF-16 code units.
     *
     * @param {string} text - Text to cut.
     * @returns {string[]} The slices, in order.
     * @throws {RangeError} If chunkSize is not greater than zero.
     */
    splitBySize(text) {
        // A zero or negative step would never advance `start` past the end of
        // the text, so fail loudly instead of looping forever.
        if (text.length > 0 && !(this.chunkSize > 0)) {
            throw new RangeError(`chunkSize must be greater than 0, got ${this.chunkSize}`);
        }
        const chunks = [];
        let start = 0;
        while (start < text.length) {
            const end = start + this.chunkSize;
            chunks.push(text.slice(start, end));
            start = end;
        }
        return chunks;
    }
    /**
     * Extend every chunk with context from its neighbours: the last
     * chunkOverlap characters of the previous chunk are prepended and the
     * first chunkOverlap characters of the next chunk are appended.
     *
     * @param {string[]} chunks - Chunks in document order, without overlap.
     * @returns {string[]} The input array itself when chunkOverlap is 0 or
     *   there is at most one chunk; otherwise a new array of extended chunks.
     */
    addOverlap(chunks) {
        if (this.chunkOverlap === 0 || chunks.length <= 1) {
            return chunks;
        }
        const overlappedChunks = [];
        for (let i = 0; i < chunks.length; i++) {
            let chunk = chunks[i];
            // Add overlap from previous chunk
            if (i > 0 && this.chunkOverlap > 0) {
                const prevChunk = chunks[i - 1];
                const overlapStart = Math.max(0, prevChunk.length - this.chunkOverlap);
                chunk = prevChunk.slice(overlapStart) + chunk;
            }
            // Add overlap from next chunk
            if (i < chunks.length - 1 && this.chunkOverlap > 0) {
                const nextChunk = chunks[i + 1];
                const overlapEnd = Math.min(nextChunk.length, this.chunkOverlap);
                chunk = chunk + nextChunk.slice(0, overlapEnd);
            }
            overlappedChunks.push(chunk);
        }
        return overlappedChunks;
    }
}
exports.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter;
/**
 * Separator priority lists per language name, coarsest separator first.
 * Lookups hand out copies, so these arrays are never exposed to callers.
 */
const LANGUAGE_SEPARATORS = new Map([
    ['markdown', [
        '\n## ', // H2 headers
        '\n### ', // H3 headers
        '\n#### ', // H4 headers
        '\n##### ', // H5 headers
        '\n###### ', // H6 headers
        '```\n', // Code blocks
        '\n\n', // Paragraphs
        '\n', // Lines
        '. ', // Sentences
        ' ', // Words
        '' // Characters
    ]],
    ['python', ['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', '']],
    ['javascript', [
        '\nfunction ', '\nconst ', '\nlet ', '\nvar ', '\nclass ', '\nif ',
        '\n\n', '\n', ' ', ''
    ]],
    ['typescript', [
        '\nfunction ', '\nconst ', '\nlet ', '\nvar ', '\nclass ', '\nif ',
        '\n\n', '\n', ' ', ''
    ]],
    ['html', [
        '<body', '<div', '<p', '<br', '<li',
        '<h1', '<h2', '<h3', '<h4', '<h5', '<h6',
        '<span', '<table', '<tr', '<td', '<th', '<ul', '<ol',
        '<header', '<footer', '<nav', '<head', '<style', '<script', '<meta', '<title',
        ' ', ''
    ]],
    ['css', ['\n}', '\n.', '\n#', '\n@', '\n:', '\n{', ';', ' ', '']]
]);
/**
 * Factory helpers that build a recursive character text splitter whose
 * separators are tuned to a particular language.
 */
class RecursiveCharacterTextSplitterForLanguage extends RecursiveCharacterTextSplitter {
    /**
     * Build a splitter for `language`.
     *
     * @param {string} language - Language name; see getSeparatorsForLanguage.
     * @param {object} [params] - Extra splitter options. They override the
     *   chunkSize 1000 / chunkOverlap 200 defaults; any `separators` entry is
     *   replaced by the language-specific list.
     * @returns {RecursiveCharacterTextSplitter} A configured splitter.
     */
    static fromLanguage(language, params) {
        const separators = this.getSeparatorsForLanguage(language);
        const options = Object.assign({ chunkSize: 1000, chunkOverlap: 200 }, params, { separators });
        return new RecursiveCharacterTextSplitter(options);
    }
    /**
     * Look up the separator list for `language`.
     *
     * @param {string} language - One of 'markdown', 'python', 'javascript',
     *   'typescript', 'html' or 'css'. Anything else gets a generic list.
     * @returns {string[]} A fresh array of separators, coarsest first.
     */
    static getSeparatorsForLanguage(language) {
        const known = LANGUAGE_SEPARATORS.get(language);
        if (known === undefined) {
            return ['\n\n', '\n', ' ', ''];
        }
        return [...known];
    }
}
exports.RecursiveCharacterTextSplitterForLanguage = RecursiveCharacterTextSplitterForLanguage;
//# sourceMappingURL=recursive.js.map