@llamaindex/core
LlamaIndex Core Module
Object.defineProperty(exports, '__esModule', { value: true });
var index_cjs$1 = require('../../global/dist/index.cjs');
var index_cjs = require('../../schema/dist/index.cjs');
var index_cjs$2 = require('../../utils/dist/index.cjs');
var env = require('@llamaindex/env');
var zod = require('zod');
class NodeParser extends index_cjs.TransformComponent {
constructor(){
super((nodes)=>{
return this.getNodesFromDocuments(nodes);
});
this.includeMetadata = true;
this.includePrevNextRel = true;
}
postProcessParsedNodes(nodes, parentDocMap) {
nodes.forEach((node, i)=>{
const parentDoc = parentDocMap.get(node.sourceNode?.nodeId || "");
if (parentDoc) {
const startCharIdx = parentDoc.text.indexOf(node.getContent(index_cjs.MetadataMode.NONE));
if (startCharIdx >= 0) {
node.startCharIdx = startCharIdx;
node.endCharIdx = startCharIdx + node.getContent(index_cjs.MetadataMode.NONE).length;
}
if (this.includeMetadata && node.metadata && parentDoc.metadata) {
node.metadata = {
...node.metadata,
...parentDoc.metadata
};
}
}
if (this.includePrevNextRel && node.sourceNode) {
const previousNode = i > 0 ? nodes[i - 1] : null;
const nextNode = i < nodes.length - 1 ? nodes[i + 1] : null;
if (previousNode && previousNode.sourceNode && previousNode.sourceNode.nodeId === node.sourceNode.nodeId) {
node.relationships = {
...node.relationships,
[index_cjs.NodeRelationship.PREVIOUS]: previousNode.asRelatedNodeInfo()
};
}
if (nextNode && nextNode.sourceNode && nextNode.sourceNode.nodeId === node.sourceNode.nodeId) {
node.relationships = {
...node.relationships,
[index_cjs.NodeRelationship.NEXT]: nextNode.asRelatedNodeInfo()
};
}
}
});
return nodes;
}
getNodesFromDocuments(documents) {
const docsId = new Map(documents.map((doc)=>[
doc.id_,
doc
]));
const callbackManager = index_cjs$1.Settings.callbackManager;
callbackManager.dispatchEvent("node-parsing-start", {
documents
});
const parsedNodes = this.parseNodes(documents);
if (index_cjs$2.isPromise(parsedNodes)) {
return parsedNodes.then((parsedNodes)=>{
const nodes = this.postProcessParsedNodes(parsedNodes, docsId);
callbackManager.dispatchEvent("node-parsing-end", {
nodes
});
return nodes;
});
} else {
const nodes = this.postProcessParsedNodes(parsedNodes, docsId);
callbackManager.dispatchEvent("node-parsing-end", {
nodes
});
return nodes;
}
}
}
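// Usage sketch (illustrative, not executed by this module): any concrete
// NodeParser doubles as a transform, so documents can be parsed directly.
// Assumes `Document` from the schema package and a subclass such as the
// SentenceSplitter defined below.
//
//   const parser = new SentenceSplitter();
//   const nodes = parser.getNodesFromDocuments([
//     new Document({ text: "Some long text...", id_: "doc-1" }),
//   ]);
//   // Each node gains startCharIdx/endCharIdx plus PREVIOUS/NEXT
//   // relationships to its siblings from the same source document.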
class TextSplitter extends NodeParser {
splitTexts(texts) {
return texts.flatMap((text)=>this.splitText(text));
}
parseNodes(nodes) {
return nodes.reduce((allNodes, node)=>{
const splits = this.splitText(node.getContent(index_cjs.MetadataMode.ALL));
const splitNodes = index_cjs.buildNodeFromSplits(splits, node);
return allNodes.concat(splitNodes);
}, []);
}
}
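// Example: a minimal TextSplitter subclass only needs splitText; splitTexts
// and parseNodes are inherited. A sketch with a hypothetical subclass:
//
//   class LineSplitter extends TextSplitter {
//     splitText(text) {
//       return text.split("\n").filter((line) => line.trim() !== "");
//     }
//   }
//   new LineSplitter().splitTexts(["a\nb", "c"]); // => ["a", "b", "c"]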
class MetadataAwareTextSplitter extends TextSplitter {
splitTextsMetadataAware(texts, metadata) {
if (texts.length !== metadata.length) {
throw new TypeError("`texts` and `metadata` must have the same length");
}
return texts.flatMap((text, i)=>this.splitTextMetadataAware(text, metadata[i]));
}
getMetadataString(node) {
const embedStr = node.getMetadataStr(index_cjs.MetadataMode.EMBED);
const llmStr = node.getMetadataStr(index_cjs.MetadataMode.LLM);
if (embedStr.length > llmStr.length) {
return embedStr;
} else {
return llmStr;
}
}
parseNodes(nodes) {
return nodes.reduce((allNodes, node)=>{
const metadataStr = this.getMetadataString(node);
const splits = this.splitTextMetadataAware(node.getContent(index_cjs.MetadataMode.NONE), metadataStr);
return allNodes.concat(index_cjs.buildNodeFromSplits(splits, node));
}, []);
}
}
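// Note: getMetadataString reserves space for the longer of the EMBED and LLM
// metadata renderings, so a chunk stays within the token budget in whichever
// mode later consumes it. Sketch of the metadata-aware contract (illustrative):
//
//   const splitter = new SentenceSplitter({ chunkSize: 256 });
//   splitter.splitTextsMetadataAware(
//     ["body text one", "body text two"],
//     ["source: a.txt", "source: b.txt"], // must match texts in length
//   );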
var __getOwnPropNames = Object.getOwnPropertyNames;
// Bundler-generated CommonJS shim: evaluates the wrapped module factory once,
// on first require, and caches its exports.
var __commonJS = (cb, mod)=>function __require() {
if (!mod) {
mod = { exports: {} };
(0, cb[__getOwnPropNames(cb)[0]])(mod.exports, mod);
}
return mod.exports;
};
// lib/natural/tokenizers/tokenizer.js
var require_tokenizer = __commonJS({
"lib/natural/tokenizers/tokenizer.js" (exports, module) {
var Tokenizer = class {
trim(array) {
while(array[array.length - 1] === ""){
array.pop();
}
while(array[0] === ""){
array.shift();
}
return array;
}
};
module.exports = Tokenizer;
}
});
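// Example (Tokenizer is module-internal; shown for illustration): trim only
// strips empty strings from the ends of the token array, not the middle.
//
//   new Tokenizer().trim(["", "a", "", "b", ""]); // => ["a", "", "b"]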
// lib/natural/tokenizers/sentence_tokenizer.js
var require_sentence_tokenizer = __commonJS({
"lib/natural/tokenizers/sentence_tokenizer.js" (exports, module) {
var Tokenizer = require_tokenizer();
var NUM = "NUMBER";
var DELIM = "DELIM";
var URI = "URI";
var ABBREV = "ABBREV";
function generateUniqueCode(base, index) {
return `{{${base}_${index}}}`;
}
function escapeRegExp(string) {
return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
var SentenceTokenizer = class extends Tokenizer {
constructor(abbreviations){
super();
if (abbreviations) {
this.abbreviations = abbreviations;
} else {
this.abbreviations = [];
}
this.replacementMap = null;
this.replacementCounter = 0;
}
replaceUrisWithPlaceholders(text) {
const urlPattern = /(https?:\/\/\S+|www\.\S+|ftp:\/\/\S+|(mailto:)?[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}|file:\/\/\S+)/gi;
const modifiedText = text.replace(urlPattern, (match)=>{
const placeholder = generateUniqueCode(URI, this.replacementCounter++);
this.replacementMap.set(placeholder, match);
return placeholder;
});
return modifiedText;
}
replaceAbbreviations(text) {
if (this.abbreviations.length === 0) {
return text;
}
const pattern = new RegExp(`(${this.abbreviations.map((abbrev)=>escapeRegExp(abbrev)).join("|")})`, "gi");
const replacedText = text.replace(pattern, (match)=>{
const code = generateUniqueCode(ABBREV, this.replacementCounter++);
this.replacementMap.set(code, match);
return code;
});
return replacedText;
}
replaceDelimitersWithPlaceholders(text) {
const delimiterPattern = /([.?!… ]*)([.?!…])(["'”’)}\]]?)/g;
const modifiedText = text.replace(delimiterPattern, (match, p1, p2, p3)=>{
const placeholder = generateUniqueCode(DELIM, this.replacementCounter++);
this.delimiterMap.set(placeholder, p1 + p2 + p3);
return placeholder;
});
return modifiedText;
}
splitOnPlaceholders(text) {
if (this.delimiterMap.size === 0) {
return [
text
];
}
const keys = Array.from(this.delimiterMap.keys());
const pattern = new RegExp(`(${keys.map(escapeRegExp).join("|")})`);
const parts = text.split(pattern);
const sentences = [];
for(let i = 0; i < parts.length; i += 2){
const sentence = parts[i];
const placeholder = parts[i + 1] || "";
sentences.push(sentence + placeholder);
}
return sentences;
}
replaceNumbersWithCode(text) {
const numberPattern = /\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b/g;
const replacedText = text.replace(numberPattern, (match)=>{
const code = generateUniqueCode(NUM, this.replacementCounter++);
this.replacementMap.set(code, match);
return code;
});
return replacedText;
}
revertReplacements(text) {
let originalText = text;
for (const [placeholder, replacement] of this.replacementMap.entries()){
const pattern = new RegExp(escapeRegExp(placeholder), "g");
originalText = originalText.replace(pattern, replacement);
}
return originalText;
}
revertDelimiters(text) {
let originalText = text;
for (const [placeholder, replacement] of this.delimiterMap.entries()){
const pattern = new RegExp(escapeRegExp(placeholder), "g");
originalText = originalText.replace(pattern, replacement);
}
return originalText;
}
tokenize(text) {
this.replacementCounter = 0;
this.replacementMap = /* @__PURE__ */ new Map();
this.delimiterMap = /* @__PURE__ */ new Map();
const result1 = this.replaceAbbreviations(text);
const result2 = this.replaceUrisWithPlaceholders(result1);
const result3 = this.replaceNumbersWithCode(result2);
const result4 = this.replaceDelimitersWithPlaceholders(result3);
const sentences = this.splitOnPlaceholders(result4);
const newSentences = sentences.map((s)=>{
const s1 = this.revertReplacements(s);
return this.revertDelimiters(s1);
});
const trimmedSentences = this.trim(newSentences);
const trimmedSentences2 = trimmedSentences.map((sent)=>sent.trim());
return trimmedSentences2;
}
};
module.exports = SentenceTokenizer;
}
});
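// Example of the placeholder round-trip above (illustrative): abbreviations,
// URIs, and numbers are masked so their periods cannot end a sentence, the
// text is split on delimiter placeholders, then all placeholders are restored.
//
//   const tok = new SentenceTokenizer(["Dr."]);
//   tok.tokenize("Dr. Smith arrived. The meeting started at 9.");
//   // => ["Dr. Smith arrived.", "The meeting started at 9."]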
var SentenceTokenizer = require_sentence_tokenizer();
const truncateText = (text, textSplitter)=>{
const chunks = textSplitter.splitText(text);
return chunks[0] ?? text;
};
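// Example (illustrative): truncateText keeps only the first chunk, i.e. a
// prefix of the text that fits within the splitter's chunk size.
//
//   truncateText(longText, new SentenceSplitter({ chunkSize: 128 })); // `longText` is hypothetical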
const splitTextKeepSeparator = (text, separator)=>{
const parts = text.split(separator);
const result = parts.map((part, index)=>index > 0 ? separator + part : part);
return result.filter((s)=>s);
};
const splitBySep = (sep, keepSep = true)=>{
if (keepSep) {
return (text)=>splitTextKeepSeparator(text, sep);
} else {
return (text)=>text.split(sep);
}
};
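// Example: with keepSep (the default) the separator is reattached to the
// part that follows it, so joining the parts reproduces the input.
//
//   splitBySep("\n\n")("a\n\nb\n\nc");   // => ["a", "\n\nb", "\n\nc"]
//   splitBySep("\n\n", false)("a\n\nb"); // => ["a", "b"]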
const splitByChar = ()=>{
return (text)=>text.split("");
};
let sentenceTokenizer = null;
const splitBySentenceTokenizer = ()=>{
if (!sentenceTokenizer) {
sentenceTokenizer = new SentenceTokenizer([
"i.e.",
"etc.",
"vs.",
"Inc.",
"A.S.A.P."
]);
}
const tokenizer = sentenceTokenizer;
return (text)=>{
try {
return tokenizer.tokenize(text);
} catch {
return [
text
];
}
};
};
const splitByRegex = (regex)=>{
return (text)=>text.match(new RegExp(regex, "g")) || [];
};
const splitByPhraseRegex = ()=>{
const regex = "[^,.;]+[,.;]?";
return splitByRegex(regex);
};
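// Example: the phrase regex takes runs of non-punctuation characters plus an
// optional trailing comma, period, or semicolon.
//
//   splitByPhraseRegex()("red, green; blue."); // => ["red,", " green;", " blue."]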
/**
* Parse text with a preference for complete sentences.
*/ class SentenceSplitter extends MetadataAwareTextSplitter {
#chunkingTokenizerFn;
#splitFns;
#subSentenceSplitFns;
#tokenizer;
constructor(params){
super();
/** The token chunk size for each chunk. */
this.chunkSize = 1024;
/** The token overlap of each chunk when splitting. */
this.chunkOverlap = 200;
/** Default separator for splitting into words */
this.separator = " ";
/** Separator between paragraphs. */
this.paragraphSeparator = "\n\n\n";
/** Backup regex for splitting into sentences. */
this.secondaryChunkingRegex = "[^,.;。?!]+[,.;。?!]?";
this.#chunkingTokenizerFn = splitBySentenceTokenizer();
this.#splitFns = new Set();
this.#subSentenceSplitFns = new Set();
this.tokenSize = (text)=>this.#tokenizer.encode(text).length;
if (params) {
const parsedParams = index_cjs.sentenceSplitterSchema.parse(params);
this.chunkSize = parsedParams.chunkSize;
this.chunkOverlap = parsedParams.chunkOverlap;
this.separator = parsedParams.separator;
this.paragraphSeparator = parsedParams.paragraphSeparator;
this.secondaryChunkingRegex = parsedParams.secondaryChunkingRegex;
}
this.#tokenizer = params?.tokenizer ?? index_cjs$1.Settings.tokenizer;
this.#splitFns.add(splitBySep(this.paragraphSeparator));
this.#splitFns.add(this.#chunkingTokenizerFn);
this.#subSentenceSplitFns.add(splitByRegex(this.secondaryChunkingRegex));
this.#subSentenceSplitFns.add(splitBySep(this.separator));
this.#subSentenceSplitFns.add(splitByChar());
}
splitTextMetadataAware(text, metadata) {
const metadataLength = this.tokenSize(metadata);
const effectiveChunkSize = this.chunkSize - metadataLength;
if (effectiveChunkSize <= 0) {
throw new Error(`Metadata length (${metadataLength}) is longer than chunk size (${this.chunkSize}). Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`);
} else if (effectiveChunkSize < 50) {
console.warn(`Metadata length (${metadataLength}) is close to chunk size (${this.chunkSize}). Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`);
}
return this._splitText(text, effectiveChunkSize);
}
splitText(text) {
return this._splitText(text, this.chunkSize);
}
_splitText(text, chunkSize) {
if (text === "") return [
text
];
const callbackManager = index_cjs$1.Settings.callbackManager;
callbackManager.dispatchEvent("chunking-start", {
text: [
text
]
});
const splits = this.#split(text, chunkSize);
const chunks = this.#merge(splits, chunkSize);
callbackManager.dispatchEvent("chunking-end", {
chunks
});
return chunks;
}
#split(text, chunkSize) {
const tokenSize = this.tokenSize(text);
if (tokenSize <= chunkSize) {
return [
{
text,
isSentence: true,
tokenSize
}
];
}
const [textSplitsByFns, isSentence] = this.#getSplitsByFns(text);
const textSplits = [];
for (const textSplit of textSplitsByFns){
const tokenSize = this.tokenSize(textSplit);
if (tokenSize <= chunkSize) {
textSplits.push({
text: textSplit,
isSentence,
tokenSize
});
} else {
const recursiveTextSplits = this.#split(textSplit, chunkSize);
textSplits.push(...recursiveTextSplits);
}
}
return textSplits;
}
#getSplitsByFns(text) {
for (const splitFn of this.#splitFns){
const splits = splitFn(text);
if (splits.length > 1) {
return [
splits,
true
];
}
}
for (const splitFn of this.#subSentenceSplitFns){
const splits = splitFn(text);
if (splits.length > 1) {
return [
splits,
false
];
}
}
return [
[
text
],
true
];
}
#merge(splits, chunkSize) {
const chunks = [];
let currentChunk = [];
let lastChunk = [];
let currentChunkLength = 0;
let newChunk = true;
const closeChunk = ()=>{
chunks.push(currentChunk.map(([text])=>text).join(""));
lastChunk = currentChunk;
currentChunk = [];
currentChunkLength = 0;
newChunk = true;
let lastIndex = lastChunk.length - 1;
while(lastIndex >= 0 && currentChunkLength + lastChunk[lastIndex][1] <= this.chunkOverlap){
const [text, length] = lastChunk[lastIndex];
currentChunkLength += length;
currentChunk.unshift([
text,
length
]);
lastIndex -= 1;
}
};
while(splits.length > 0){
const curSplit = splits[0];
if (curSplit.tokenSize > chunkSize) {
throw new Error("Single token exceeded chunk size");
}
if (currentChunkLength + curSplit.tokenSize > chunkSize && !newChunk) {
closeChunk();
} else {
if (curSplit.isSentence || currentChunkLength + curSplit.tokenSize <= chunkSize || newChunk) {
currentChunkLength += curSplit.tokenSize;
currentChunk.push([
curSplit.text,
curSplit.tokenSize
]);
splits.shift();
newChunk = false;
} else {
closeChunk();
}
}
}
// Handle the last chunk
if (!newChunk) {
chunks.push(currentChunk.map(([text])=>text).join(""));
}
return this.#postprocessChunks(chunks);
}
/**
* Remove whitespace only chunks and remove leading and trailing whitespace.
*/ #postprocessChunks(chunks) {
const newChunks = [];
for (const chunk of chunks){
const trimmedChunk = chunk.trim();
if (trimmedChunk !== "") {
newChunks.push(trimmedChunk);
}
}
return newChunks;
}
}
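// Usage sketch (illustrative): paragraphs and sentences are the preferred
// split points; the secondary regex, word separator, and single characters
// are fallbacks for oversized splits.
//
//   const splitter = new SentenceSplitter({ chunkSize: 512, chunkOverlap: 64 });
//   const chunks = splitter.splitText(longText); // `longText` is hypothetical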
class MarkdownNodeParser extends NodeParser {
parseNodes(nodes, showProgress) {
return nodes.reduce((allNodes, node)=>{
const markdownNodes = this.getNodesFromNode(node);
return allNodes.concat(markdownNodes);
}, []);
}
getNodesFromNode(node) {
const text = node.getContent(index_cjs.MetadataMode.NONE);
const markdownNodes = [];
const lines = text.split("\n");
let metadata = {};
let codeBlock = false;
let currentSection = "";
for (const line of lines){
if (line.trim().startsWith("```")) {
codeBlock = !codeBlock;
}
const headerMatch = /^(#+)\s(.*)/.exec(line);
if (headerMatch && !codeBlock) {
if (currentSection !== "") {
markdownNodes.push(this.buildNodeFromSplit(currentSection.trim(), node, metadata));
}
metadata = this.updateMetadata(metadata, headerMatch[2], headerMatch[1].trim().length);
currentSection = `${headerMatch[2]}\n`;
} else {
currentSection += line + "\n";
}
}
if (currentSection !== "") {
markdownNodes.push(this.buildNodeFromSplit(currentSection.trim(), node, metadata));
}
return markdownNodes;
}
updateMetadata(headersMetadata, newHeader, newHeaderLevel) {
const updatedHeaders = {};
for(let i = 1; i < newHeaderLevel; i++){
const key = `Header_${i}`;
if (key in headersMetadata) {
updatedHeaders[key] = headersMetadata[key];
}
}
updatedHeaders[`Header_${newHeaderLevel}`] = newHeader;
return updatedHeaders;
}
buildNodeFromSplit(textSplit, node, metadata) {
const newNode = index_cjs.buildNodeFromSplits([
textSplit
], node, undefined)[0];
if (this.includeMetadata) {
newNode.metadata = {
...newNode.metadata,
...metadata
};
}
return newNode;
}
}
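// Usage sketch (illustrative): one node per markdown section, with the active
// header trail recorded as Header_1..Header_N metadata. Assumes `Document`
// from the schema package.
//
//   const parser = new MarkdownNodeParser();
//   const nodes = parser.getNodesFromDocuments([
//     new Document({ text: "# A\nintro\n## B\ndetails" }),
//   ]);
//   // nodes[1].metadata => { Header_1: "A", Header_2: "B" }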
class SentenceWindowNodeParser extends NodeParser {
static{
this.DEFAULT_WINDOW_SIZE = 3;
}
static{
this.DEFAULT_WINDOW_METADATA_KEY = "window";
}
static{
this.DEFAULT_ORIGINAL_TEXT_METADATA_KEY = "originalText";
}
constructor(params){
super();
this.sentenceSplitter = splitBySentenceTokenizer();
this.idGenerator = ()=>env.randomUUID();
if (params) {
const parsedParams = index_cjs.sentenceWindowNodeParserSchema.parse(params);
this.windowSize = parsedParams.windowSize;
this.windowMetadataKey = parsedParams.windowMetadataKey;
this.originalTextMetadataKey = parsedParams.originalTextMetadataKey;
} else {
this.windowSize = SentenceWindowNodeParser.DEFAULT_WINDOW_SIZE;
this.windowMetadataKey = SentenceWindowNodeParser.DEFAULT_WINDOW_METADATA_KEY;
this.originalTextMetadataKey = SentenceWindowNodeParser.DEFAULT_ORIGINAL_TEXT_METADATA_KEY;
}
}
parseNodes(nodes, showProgress) {
return nodes.reduce((allNodes, node)=>{
const windowNodes = this.buildWindowNodesFromDocuments([
node
]);
return allNodes.concat(windowNodes);
}, []);
}
buildWindowNodesFromDocuments(documents) {
const allNodes = [];
for (const doc of documents){
const text = doc.text;
const textSplits = this.sentenceSplitter(text);
const nodes = index_cjs.buildNodeFromSplits(textSplits, doc, undefined, this.idGenerator);
nodes.forEach((node, i)=>{
const windowNodes = nodes.slice(Math.max(0, i - this.windowSize), Math.min(i + this.windowSize + 1, nodes.length));
node.metadata[this.windowMetadataKey] = windowNodes.map((n)=>n.text).join(" ");
node.metadata[this.originalTextMetadataKey] = node.text;
node.excludedEmbedMetadataKeys.push(this.windowMetadataKey, this.originalTextMetadataKey);
node.excludedLlmMetadataKeys.push(this.windowMetadataKey, this.originalTextMetadataKey);
});
allNodes.push(...nodes);
}
return allNodes;
}
}
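// Usage sketch (illustrative): one node per sentence, with the windowSize
// sentences on each side stored under the window metadata key and the bare
// sentence under originalText; both keys are excluded from embedding and
// LLM metadata.
//
//   const parser = new SentenceWindowNodeParser({ windowSize: 3 });
//   const nodes = parser.getNodesFromDocuments(documents); // `documents` is hypothetical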
const DEFAULT_METADATA_FORMAT_LEN = 2;
const tokenTextSplitterSchema = zod.z.object({
chunkSize: zod.z.number().positive().default(index_cjs$1.DEFAULT_CHUNK_SIZE),
chunkOverlap: zod.z.number().nonnegative().default(index_cjs$1.DEFAULT_CHUNK_OVERLAP),
separator: zod.z.string().default(" "),
backupSeparators: zod.z.array(zod.z.string()).default([
"\n"
])
});
class TokenTextSplitter extends MetadataAwareTextSplitter {
#tokenizer;
#splitFns;
constructor(params){
super();
this.chunkSize = index_cjs$1.DEFAULT_CHUNK_SIZE;
this.chunkOverlap = index_cjs$1.DEFAULT_CHUNK_OVERLAP;
this.separator = " ";
this.backupSeparators = ["\n"];
this.#splitFns = [];
if (params) {
const parsedParams = tokenTextSplitterSchema.parse(params);
this.chunkSize = parsedParams.chunkSize;
this.chunkOverlap = parsedParams.chunkOverlap;
this.separator = parsedParams.separator;
this.backupSeparators = parsedParams.backupSeparators;
}
if (this.chunkOverlap > this.chunkSize) {
throw new Error(`Got a larger chunk overlap (${this.chunkOverlap}) than chunk size (${this.chunkSize}), should be smaller.`);
}
this.#tokenizer = params?.tokenizer ?? index_cjs$1.Settings.tokenizer;
const allSeparators = [
this.separator,
...this.backupSeparators
];
this.#splitFns = allSeparators.map((sep)=>splitBySep(sep));
this.#splitFns.push(splitByChar());
}
/**
* Split text into chunks, reserving space required for metadata string.
* @param text The text to split.
* @param metadata The metadata string.
* @returns An array of text chunks.
*/ splitTextMetadataAware(text, metadata) {
const metadataLength = this.tokenSize(metadata) + DEFAULT_METADATA_FORMAT_LEN;
const effectiveChunkSize = this.chunkSize - metadataLength;
if (effectiveChunkSize <= 0) {
throw new Error(`Metadata length (${metadataLength}) is longer than chunk size (${this.chunkSize}). ` + `Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`);
} else if (effectiveChunkSize < 50) {
console.warn(`Metadata length (${metadataLength}) is close to chunk size (${this.chunkSize}). ` + `Resulting chunks are less than 50 tokens. Consider increasing the chunk size or decreasing the size of your metadata to avoid this.`);
}
return this._splitText(text, effectiveChunkSize);
}
/**
* Split text into chunks.
* @param text The text to split.
* @returns An array of text chunks.
*/ splitText(text) {
return this._splitText(text, this.chunkSize);
}
/**
* Internal method to split text into chunks up to a specified size.
* @param text The text to split.
* @param chunkSize The maximum size of each chunk.
* @returns An array of text chunks.
*/ _splitText(text, chunkSize) {
if (text === "") return [
text
];
// Dispatch chunking start event
index_cjs$1.Settings.callbackManager.dispatchEvent("chunking-start", {
text: [
text
]
});
const splits = this._split(text, chunkSize);
const chunks = this._merge(splits, chunkSize);
index_cjs$1.Settings.callbackManager.dispatchEvent("chunking-end", {
chunks
});
return chunks;
}
/**
* Break text into splits that are smaller than the chunk size.
* @param text The text to split.
* @param chunkSize The maximum size of each split.
* @returns An array of text splits.
*/ _split(text, chunkSize) {
if (this.tokenSize(text) <= chunkSize) {
return [
text
];
}
for (const splitFn of this.#splitFns){
const splits = splitFn(text);
if (splits.length > 1) {
const newSplits = [];
for (const split of splits){
const splitLen = this.tokenSize(split);
if (splitLen <= chunkSize) {
newSplits.push(split);
} else {
newSplits.push(...this._split(split, chunkSize));
}
}
return newSplits;
}
}
return [
text
];
}
/**
* Merge splits into chunks with overlap.
* @param splits The array of text splits.
* @param chunkSize The maximum size of each chunk.
* @returns An array of merged text chunks.
*/ _merge(splits, chunkSize) {
const chunks = [];
let currentChunk = [];
let currentLength = 0;
for (const split of splits){
const splitLength = this.tokenSize(split);
if (splitLength > chunkSize) {
console.warn(`Got a split of size ${splitLength}, larger than chunk size ${chunkSize}.`);
}
if (currentLength + splitLength > chunkSize) {
const chunk = currentChunk.join("").trim();
if (chunk) {
chunks.push(chunk);
}
// Seed the next chunk with the trailing splits of the chunk just closed,
// so consecutive chunks overlap by up to `chunkOverlap` tokens.
const previousChunk = currentChunk;
currentChunk = [];
currentLength = 0;
const overlapTokens = this.chunkOverlap;
const overlapSplits = [];
let overlapLength = 0;
while(previousChunk.length > 0 && overlapLength < overlapTokens){
const overlapSplit = previousChunk.pop();
overlapSplits.push(overlapSplit);
overlapLength += this.tokenSize(overlapSplit);
}
for (const overlapSplit of overlapSplits.reverse()){
currentChunk.push(overlapSplit);
currentLength += this.tokenSize(overlapSplit);
if (currentLength >= overlapTokens) break;
}
}
currentChunk.push(split);
currentLength += splitLength;
}
const finalChunk = currentChunk.join("").trim();
if (finalChunk) {
chunks.push(finalChunk);
}
return chunks;
}
/**
* Calculate the number of tokens in the text using the tokenizer.
* @param text The text to tokenize.
* @returns The number of tokens.
*/ tokenSize(text) {
return this.#tokenizer.encode(text).length;
}
}
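// Usage sketch (illustrative): purely token-budget-driven splitting on the
// configured separators, in contrast to the sentence-first SentenceSplitter.
//
//   const splitter = new TokenTextSplitter({ chunkSize: 256, chunkOverlap: 32 });
//   const chunks = splitter.splitText(longText); // `longText` is hypothetical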
/**
* @deprecated Use `SentenceSplitter` instead
*/ const SimpleNodeParser = SentenceSplitter;
exports.MarkdownNodeParser = MarkdownNodeParser;
exports.MetadataAwareTextSplitter = MetadataAwareTextSplitter;
exports.NodeParser = NodeParser;
exports.SentenceSplitter = SentenceSplitter;
exports.SentenceWindowNodeParser = SentenceWindowNodeParser;
exports.SimpleNodeParser = SimpleNodeParser;
exports.TextSplitter = TextSplitter;
exports.TokenTextSplitter = TokenTextSplitter;
exports.splitByChar = splitByChar;
exports.splitByPhraseRegex = splitByPhraseRegex;
exports.splitByRegex = splitByRegex;
exports.splitBySentenceTokenizer = splitBySentenceTokenizer;
exports.splitBySep = splitBySep;
exports.truncateText = truncateText;