/*
 * @jackhua/mini-langchain
 * Version:
 * A lightweight TypeScript implementation of LangChain with cost optimization features.
 * 101 lines • 3.39 kB
 * JavaScript
 */
;
/**
* Base text splitter interface
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.BaseTextSplitter = void 0;
/**
 * Base text splitter.
 *
 * Holds the shared chunking configuration and the helpers that turn raw
 * strings into `{ pageContent, metadata }` documents. `createDocuments`
 * calls `this.splitText(text)`, which is not defined on this class, so a
 * subclass is expected to provide it (returning an array of chunk strings,
 * or a promise of one).
 */
class BaseTextSplitter {
    /**
     * @param {object} params
     * @param {number} params.chunkSize - Maximum chunk length, measured with `lengthFunction`.
     * @param {number} params.chunkOverlap - Must be strictly smaller than `chunkSize`.
     * @param {(text: string) => number} [params.lengthFunction] - Defaults to `text.length`.
     * @param {boolean} [params.keepSeparator=false] - Stored on the instance; not read by this class.
     * @param {boolean} [params.addStartIndex=false] - When true, `createDocuments` adds
     *   `start_index` to each document's metadata.
     * @throws {Error} If `chunkOverlap >= chunkSize`.
     */
    constructor(params) {
        this.chunkSize = params.chunkSize;
        this.chunkOverlap = params.chunkOverlap;
        this.lengthFunction = params.lengthFunction || ((text) => text.length);
        this.keepSeparator = params.keepSeparator ?? false;
        this.addStartIndex = params.addStartIndex ?? false;
        if (this.chunkOverlap >= this.chunkSize) {
            throw new Error('Chunk overlap must be less than chunk size');
        }
    }

    /**
     * Create documents from text chunks.
     *
     * Each text is split with `this.splitText`, and every resulting chunk
     * becomes one document carrying a shallow copy of that text's metadata.
     *
     * @param {string[]} texts - Texts to split.
     * @param {object[]} [metadatas] - Metadata for each text, matched by position.
     * @returns {Promise<Array<{pageContent: string, metadata: object}>>}
     */
    async createDocuments(texts, metadatas) {
        const documents = [];
        // Non-positive or missing overlap is treated as "no overlap" for cursor math.
        const overlap = this.chunkOverlap > 0 ? this.chunkOverlap : 0;
        for (let i = 0; i < texts.length; i++) {
            const text = texts[i];
            const metadata = metadatas ? metadatas[i] : {};
            const chunks = await this.splitText(text);
            // Search cursor: chunks are located left to right, so each lookup
            // starts where the previous chunk could have ended at the earliest.
            let searchFrom = 0;
            for (let j = 0; j < chunks.length; j++) {
                const chunk = chunks[j];
                const chunkMetadata = { ...metadata };
                if (this.addStartIndex) {
                    const startIndex = this.getStartIndex(text, chunk, j, searchFrom);
                    chunkMetadata.start_index = startIndex;
                    if (startIndex >= 0) {
                        // Allow the next chunk to begin up to `overlap` characters
                        // before this one ends, but always move forward by at
                        // least one character so repeated chunks resolve to
                        // successive occurrences.
                        searchFrom = Math.max(startIndex + 1, startIndex + chunk.length - overlap);
                    }
                }
                documents.push({ pageContent: chunk, metadata: chunkMetadata });
            }
        }
        return documents;
    }

    /**
     * Split documents.
     *
     * Re-chunks the `pageContent` of each document, carrying its metadata
     * (or `{}` when absent) over to every produced chunk.
     *
     * @param {Array<{pageContent: string, metadata?: object}>} documents
     * @returns {Promise<Array<{pageContent: string, metadata: object}>>}
     */
    async splitDocuments(documents) {
        const texts = documents.map((doc) => doc.pageContent);
        const metadatas = documents.map((doc) => doc.metadata || {});
        return this.createDocuments(texts, metadatas);
    }

    /**
     * Merge splits that fit within chunk size.
     *
     * Greedily accumulates consecutive splits until adding the next one
     * (plus a separator) would exceed `chunkSize`, then emits the joined
     * group and starts a new one. Groups that are empty after trimming are
     * dropped. A single split longer than `chunkSize` is emitted on its own.
     *
     * NOTE: `chunkOverlap` is not consulted here; consecutive merged chunks
     * share no content. The separator is measured with `separator.length`,
     * not with `lengthFunction`.
     *
     * @param {string[]} splits - Pieces to merge, in order.
     * @param {string} separator - String inserted between merged pieces.
     * @returns {string[]} Merged chunks.
     */
    mergeSplits(splits, separator) {
        const mergedSplits = [];
        const currentSplits = [];
        let total = 0;
        // Emit the pending group, skipping groups that are blank once joined.
        const flush = () => {
            if (currentSplits.length === 0) {
                return;
            }
            const merged = this.joinDocs(currentSplits, separator);
            if (merged.trim()) {
                mergedSplits.push(merged);
            }
        };
        for (const split of splits) {
            const splitLen = this.lengthFunction(split);
            // A separator is only paid for when the split joins an existing group.
            let separatorLen = currentSplits.length > 0 ? separator.length : 0;
            if (total + splitLen + separatorLen > this.chunkSize) {
                flush();
                currentSplits.length = 0;
                total = 0;
                separatorLen = 0;
            }
            currentSplits.push(split);
            total += splitLen + separatorLen;
        }
        flush();
        return mergedSplits;
    }

    /**
     * Join documents with separator.
     *
     * @param {string[]} docs - Pieces to join.
     * @param {string} separator - String placed between pieces.
     * @returns {string} The joined text with leading/trailing whitespace trimmed.
     */
    joinDocs(docs, separator) {
        const text = docs.join(separator).trim();
        return text || '';
    }

    /**
     * Get start index of chunk in original text.
     *
     * Returns the position of the first occurrence of `chunk` in `text` at
     * or after `fromIndex`. Callers that locate chunks in order should pass
     * a `fromIndex` derived from the previous chunk's position so repeated
     * chunk text maps to successive occurrences.
     *
     * @param {string} text - The original, unsplit text.
     * @param {string} chunk - The chunk to locate.
     * @param {number} chunkIndex - Position of the chunk in the chunk list.
     *   Kept for signature compatibility; the lookup itself does not use it.
     * @param {number} [fromIndex=0] - Offset in `text` to start searching from.
     * @returns {number} Zero-based start offset, or -1 if the chunk is not found.
     */
    getStartIndex(text, chunk, chunkIndex, fromIndex = 0) {
        return text.indexOf(chunk, fromIndex);
    }
}
exports.BaseTextSplitter = BaseTextSplitter;
//# sourceMappingURL=base.js.map