UNPKG

@jackhua/mini-langchain

Version:

A lightweight TypeScript implementation of LangChain with cost optimization features

240 lines 7.96 kB
"use strict";
/**
 * Recursive character text splitter
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.RecursiveCharacterTextSplitterForLanguage = exports.RecursiveCharacterTextSplitter = void 0;
const base_1 = require("./base");
/**
 * Splits text by trying an ordered list of separators, falling back to the
 * next (finer) separator whenever a piece is still too large.
 *
 * NOTE(review): `chunkSize`, `chunkOverlap`, `keepSeparator`, `lengthFunction`
 * and `mergeSplits` are not defined in this file; they are presumably provided
 * by `BaseTextSplitter` from "./base" - confirm against that module.
 */
class RecursiveCharacterTextSplitter extends base_1.BaseTextSplitter {
    /**
     * @param {object} [params] - Splitter options, forwarded to the base class.
     * @param {string[]} [params.separators] - Separators in priority order;
     *   when omitted (or falsy) the default list below is used.
     */
    constructor(params = { chunkSize: 1000, chunkOverlap: 200 }) {
        super(params);
        // Default separators optimized for different content types
        this.separators = params.separators || [
            '\n\n', // Double newline (paragraphs)
            '\n', // Single newline
            '. ', // Sentence ending
            '! ', // Exclamation
            '? ', // Question
            '; ', // Semicolon
            ': ', // Colon
            ' - ', // Dash
            ' ', // Space
            '' // Character
        ];
    }
    /**
     * Split `text` using this instance's configured separators.
     *
     * @param {string} text - Text to split.
     * @returns {Promise<string[]>} The resulting chunks.
     */
    async splitText(text) {
        return this.splitTextRecursively(text, this.separators);
    }
    /**
     * Split `text` with the given separator list and add overlap between the
     * resulting chunks.
     *
     * Overlap is applied exactly once, to the final flat chunk list. Applying
     * it inside the recursion would re-overlap chunks that nested calls had
     * already overlapped, duplicating text once per recursion level.
     *
     * @param {string} text - Text to split.
     * @param {string[]} separators - Separators in priority order.
     * @returns {string[]} Chunks with overlap added.
     */
    splitTextRecursively(text, separators) {
        return this.addOverlap(this.collectChunks(text, separators));
    }
    /**
     * Recursive worker: produce chunks for `text` without adding any overlap.
     *
     * @private
     * @param {string} text - Text to split.
     * @param {string[]} separators - Separators still available, in priority order.
     * @returns {string[]} Chunks without overlap.
     */
    collectChunks(text, separators) {
        const finalChunks = [];
        // Pick the first separator that occurs in the text. The empty string
        // always "matches" and leaves no finer separators to fall back to.
        // If nothing matches, the last separator in the list is used.
        let separator = separators[separators.length - 1];
        let newSeparators = [];
        for (let i = 0; i < separators.length; i++) {
            const s = separators[i];
            if (s === '') {
                separator = s;
                break;
            }
            if (text.includes(s)) {
                separator = s;
                newSeparators = separators.slice(i + 1);
                break;
            }
        }
        // Split the text
        const splits = this.splitBySeparator(text, separator);
        // Accumulate pieces that are small enough; flush them through
        // mergeSplits whenever an oversized piece interrupts the run.
        const goodSplits = [];
        for (const split of splits) {
            if (this.lengthFunction(split) < this.chunkSize) {
                goodSplits.push(split);
                continue;
            }
            if (goodSplits.length > 0) {
                finalChunks.push(...this.mergeSplits(goodSplits, separator));
                goodSplits.length = 0;
            }
            if (newSeparators.length === 0) {
                // No more separators, have to split by chunk size
                finalChunks.push(...this.splitBySize(split));
            }
            else {
                // Recurse with the finer separators (still without overlap)
                finalChunks.push(...this.collectChunks(split, newSeparators));
            }
        }
        if (goodSplits.length > 0) {
            finalChunks.push(...this.mergeSplits(goodSplits, separator));
        }
        return finalChunks;
    }
    /**
     * Split `text` on `separator`.
     *
     * With an empty separator the text is split into individual UTF-16 code
     * units. When `this.keepSeparator` is truthy, the separator is re-appended
     * to every piece except the last, and an empty last piece is dropped.
     * Otherwise empty pieces are filtered out.
     *
     * @param {string} text - Text to split.
     * @param {string} separator - Separator to split on.
     * @returns {string[]} The pieces.
     */
    splitBySeparator(text, separator) {
        if (separator === '') {
            return text.split('');
        }
        const splits = text.split(separator);
        if (this.keepSeparator && separator !== '') {
            const result = [];
            for (let i = 0; i < splits.length; i++) {
                if (i < splits.length - 1) {
                    result.push(splits[i] + separator);
                }
                else if (splits[i]) {
                    result.push(splits[i]);
                }
            }
            return result;
        }
        return splits.filter(s => s);
    }
    /**
     * Cut `text` into consecutive slices of at most `this.chunkSize` characters.
     *
     * @param {string} text - Text to slice.
     * @returns {string[]} Fixed-size slices; the last one may be shorter.
     */
    splitBySize(text) {
        const chunks = [];
        let start = 0;
        while (start < text.length) {
            const end = start + this.chunkSize;
            chunks.push(text.slice(start, end));
            start = end;
        }
        return chunks;
    }
    /**
     * Add context from neighbouring chunks to each chunk.
     *
     * Each chunk is prefixed with up to `this.chunkOverlap` trailing characters
     * of the previous input chunk and suffixed with up to `this.chunkOverlap`
     * leading characters of the next input chunk. The input array is returned
     * as-is when `chunkOverlap` is 0 or there is at most one chunk. No size
     * check is applied to the enlarged chunks here.
     *
     * @param {string[]} chunks - Chunks without overlap.
     * @returns {string[]} Chunks with overlap added.
     */
    addOverlap(chunks) {
        if (this.chunkOverlap === 0 || chunks.length <= 1) {
            return chunks;
        }
        const overlappedChunks = [];
        for (let i = 0; i < chunks.length; i++) {
            let chunk = chunks[i];
            // Add overlap from previous chunk
            if (i > 0 && this.chunkOverlap > 0) {
                const prevChunk = chunks[i - 1];
                const overlapStart = Math.max(0, prevChunk.length - this.chunkOverlap);
                const overlap = prevChunk.slice(overlapStart);
                chunk = overlap + chunk;
            }
            // Add overlap from next chunk
            if (i < chunks.length - 1 && this.chunkOverlap > 0) {
                const nextChunk = chunks[i + 1];
                const overlapEnd = Math.min(nextChunk.length, this.chunkOverlap);
                const overlap = nextChunk.slice(0, overlapEnd);
                chunk = chunk + overlap;
            }
            overlappedChunks.push(chunk);
        }
        return overlappedChunks;
    }
}
exports.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter;
/**
 * Create a recursive character text splitter optimized for different languages
 */
class RecursiveCharacterTextSplitterForLanguage extends RecursiveCharacterTextSplitter {
    /**
     * Build a splitter whose separators are chosen for `language`.
     *
     * Defaults of chunkSize 1000 and chunkOverlap 200 are used unless `params`
     * overrides them; any `separators` in `params` is replaced by the
     * language-specific list.
     *
     * @param {string} language - Language key (see getSeparatorsForLanguage).
     * @param {object} [params] - Additional splitter options.
     * @returns {RecursiveCharacterTextSplitter} The configured splitter.
     */
    static fromLanguage(language, params) {
        const separators = this.getSeparatorsForLanguage(language);
        return new RecursiveCharacterTextSplitter({
            chunkSize: 1000,
            chunkOverlap: 200,
            ...params,
            separators
        });
    }
    /**
     * Return the separator list, in priority order, for `language`.
     * Unrecognised languages get a generic paragraph/line/word/character list.
     *
     * @param {string} language - One of 'markdown', 'python', 'javascript',
     *   'typescript', 'html', 'css'; anything else uses the default list.
     * @returns {string[]} Separators in priority order.
     */
    static getSeparatorsForLanguage(language) {
        switch (language) {
            case 'markdown':
                return [
                    '\n## ', // H2 headers
                    '\n### ', // H3 headers
                    '\n#### ', // H4 headers
                    '\n##### ', // H5 headers
                    '\n###### ', // H6 headers
                    '```\n', // Code blocks
                    '\n\n', // Paragraphs
                    '\n', // Lines
                    '. ', // Sentences
                    ' ', // Words
                    '' // Characters
                ];
            case 'python':
                return [
                    '\nclass ',
                    '\ndef ',
                    '\n\tdef ',
                    '\n\n',
                    '\n',
                    ' ',
                    ''
                ];
            case 'javascript':
            case 'typescript':
                return [
                    '\nfunction ',
                    '\nconst ',
                    '\nlet ',
                    '\nvar ',
                    '\nclass ',
                    '\nif ',
                    '\n\n',
                    '\n',
                    ' ',
                    ''
                ];
            case 'html':
                return [
                    '<body',
                    '<div',
                    '<p',
                    '<br',
                    '<li',
                    '<h1',
                    '<h2',
                    '<h3',
                    '<h4',
                    '<h5',
                    '<h6',
                    '<span',
                    '<table',
                    '<tr',
                    '<td',
                    '<th',
                    '<ul',
                    '<ol',
                    '<header',
                    '<footer',
                    '<nav',
                    '<head',
                    '<style',
                    '<script',
                    '<meta',
                    '<title',
                    ' ',
                    ''
                ];
            case 'css':
                return [
                    '\n}',
                    '\n.',
                    '\n#',
                    '\n@',
                    '\n:',
                    '\n{',
                    ';',
                    ' ',
                    ''
                ];
            default:
                return ['\n\n', '\n', ' ', ''];
        }
    }
}
exports.RecursiveCharacterTextSplitterForLanguage = RecursiveCharacterTextSplitterForLanguage;
//# sourceMappingURL=recursive.js.map