UNPKG

@jackhua/mini-langchain

Version:

A lightweight TypeScript implementation of LangChain with cost optimization features

101 lines 3.39 kB
"use strict"; /** * Base text splitter interface */ Object.defineProperty(exports, "__esModule", { value: true }); exports.BaseTextSplitter = void 0; /** * Base text splitter */ class BaseTextSplitter { constructor(params) { this.chunkSize = params.chunkSize; this.chunkOverlap = params.chunkOverlap; this.lengthFunction = params.lengthFunction || ((text) => text.length); this.keepSeparator = params.keepSeparator ?? false; this.addStartIndex = params.addStartIndex ?? false; if (this.chunkOverlap >= this.chunkSize) { throw new Error('Chunk overlap must be less than chunk size'); } } /** * Create documents from text chunks */ async createDocuments(texts, metadatas) { const documents = []; for (let i = 0; i < texts.length; i++) { const text = texts[i]; const metadata = metadatas ? metadatas[i] : {}; const chunks = await this.splitText(text); for (let j = 0; j < chunks.length; j++) { const chunk = chunks[j]; const doc = { pageContent: chunk, metadata: { ...metadata, ...(this.addStartIndex && { start_index: this.getStartIndex(text, chunk, j) }) } }; documents.push(doc); } } return documents; } /** * Split documents */ async splitDocuments(documents) { const texts = documents.map(doc => doc.pageContent); const metadatas = documents.map(doc => doc.metadata || {}); return this.createDocuments(texts, metadatas); } /** * Merge splits that fit within chunk size */ mergeSplits(splits, separator) { const mergedSplits = []; const currentSplits = []; let total = 0; for (const split of splits) { const splitLen = this.lengthFunction(split); if (total + splitLen + (currentSplits.length > 0 ? separator.length : 0) > this.chunkSize) { if (currentSplits.length > 0) { const merged = this.joinDocs(currentSplits, separator); if (merged.trim()) { mergedSplits.push(merged); } } currentSplits.length = 0; total = 0; } currentSplits.push(split); total += splitLen + (currentSplits.length > 1 ? separator.length : 0); } if (currentSplits.length > 0) { const merged = this.joinDocs(currentSplits, separator); if (merged.trim()) { mergedSplits.push(merged); } } return mergedSplits; } /** * Join documents with separator */ joinDocs(docs, separator) { const text = docs.join(separator).trim(); return text || ''; } /** * Get start index of chunk in original text */ getStartIndex(text, chunk, chunkIndex) { let index = 0; for (let i = 0; i < chunkIndex; i++) { index = text.indexOf(chunk, index) + chunk.length; } return text.indexOf(chunk, index); } } exports.BaseTextSplitter = BaseTextSplitter; //# sourceMappingURL=base.js.map