/**
 * @llamaindex/core — LlamaIndex Core Module
 * Type declarations for the node-parser submodule.
 */
import { z } from 'zod';
import { TextNode, TransformComponent, sentenceSplitterSchema, sentenceWindowNodeParserSchema, Document } from '../../schema/dist/index.js';
import { Tokenizer } from '@llamaindex/env/tokenizers';
/**
 * Abstract base for all node parsers: a `TransformComponent` that turns input
 * `TextNode`s into parsed child nodes. The `Result` type parameter lets a
 * subclass parse either synchronously (`TextNode[]`) or asynchronously
 * (`Promise<TextNode[]>`).
 */
declare abstract class NodeParser<Result extends TextNode[] | Promise<TextNode[]> = TextNode[] | Promise<TextNode[]>> extends TransformComponent<Result> {
/** Presumably controls whether parsed nodes inherit metadata from their source node — confirm against implementation. */
includeMetadata: boolean;
/** Presumably controls whether previous/next relationships are attached between sibling nodes — confirm against implementation. */
includePrevNextRel: boolean;
constructor();
/**
 * Hook applied after `parseNodes`; receives the parsed nodes and a map from
 * parent document id to its `TextNode` (e.g. for wiring relationships).
 */
protected postProcessParsedNodes(nodes: Awaited<Result>, parentDocMap: Map<string, TextNode>): Awaited<Result>;
/** Core parsing routine each subclass must implement. */
protected abstract parseNodes(documents: TextNode[], showProgress?: boolean): Result;
/** Public entry point: parses the given documents into nodes. */
getNodesFromDocuments(documents: TextNode[]): Result;
}
/**
 * A synchronous `NodeParser` whose parsing strategy is defined by splitting
 * raw text into string chunks.
 */
declare abstract class TextSplitter extends NodeParser<TextNode[]> {
/** Splits a single text into string chunks. */
abstract splitText(text: string): string[];
/** Applies `splitText` across multiple texts — presumably flattening the results; verify against implementation. */
splitTexts(texts: string[]): string[];
protected parseNodes(nodes: TextNode[]): TextNode[];
}
/**
 * A `TextSplitter` that can account for a node's metadata string when sizing
 * chunks (so chunk budgets reserve room for metadata).
 */
declare abstract class MetadataAwareTextSplitter extends TextSplitter {
/** Splits `text` into chunks while considering the serialized `metadata` string. */
abstract splitTextMetadataAware(text: string, metadata: string): string[];
/** Batch form of `splitTextMetadataAware`; `metadata[i]` presumably pairs with `texts[i]` — verify against implementation. */
splitTextsMetadataAware(texts: string[], metadata: string[]): string[];
/** Serializes a node's metadata into the string passed to the metadata-aware splitters. */
protected getMetadataString(node: TextNode): string;
protected parseNodes(nodes: TextNode[]): TextNode[];
}
/** Shared splitter options: an optional tokenizer override (presumably used for token counting — confirm against implementation). */
type SplitterParams = {
tokenizer?: Tokenizer;
};
/**
 * Parse text with a preference for complete sentences.
 */
declare class SentenceSplitter extends MetadataAwareTextSplitter {
#private;
/**
 * The token chunk size for each chunk.
 */
chunkSize: number;
/**
 * The token overlap of each chunk when splitting.
 */
chunkOverlap: number;
/**
 * Default separator for splitting into words
 */
separator: string;
/**
 * Separator between paragraphs.
 */
paragraphSeparator: string;
/**
 * Backup regex for splitting into sentences.
 */
secondaryChunkingRegex: string;
/** Accepts the schema-validated splitter options plus an optional tokenizer override. */
constructor(params?: z.input<typeof sentenceSplitterSchema> & SplitterParams);
/** Splits `text`, reserving token budget for the `metadata` string. */
splitTextMetadataAware(text: string, metadata: string): string[];
/** Splits `text` into sentence-preferring chunks of at most `chunkSize` tokens. */
splitText(text: string): string[];
/** Internal splitting routine with an explicit effective chunk size. */
_splitText(text: string, chunkSize: number): string[];
/** Returns the number of tokens in `text` per the configured tokenizer. */
tokenSize: (text: string) => number;
}
/**
 * Parses Markdown content into nodes — presumably splitting on header
 * structure given the metadata-updating helpers; confirm against implementation.
 */
declare class MarkdownNodeParser extends NodeParser<TextNode[]> {
parseNodes(nodes: TextNode[], showProgress?: boolean): TextNode[];
/** Splits a single node into its Markdown-derived child nodes. */
protected getNodesFromNode(node: TextNode): TextNode[];
private updateMetadata;
private buildNodeFromSplit;
}
/** A function that splits a text into an array of string pieces. */
type TextSplitterFn = (text: string) => string[];
/** Truncates `text` using the given splitter — presumably returning the first chunk; verify against implementation. */
declare const truncateText: (text: string, textSplitter: TextSplitter) => string;
/** Builds a splitter that splits on `sep`; `keepSep` presumably retains the separator in the output pieces. */
declare const splitBySep: (sep: string, keepSep?: boolean) => TextSplitterFn;
/** Builds a splitter that splits text into individual characters. */
declare const splitByChar: () => TextSplitterFn;
/** Builds a splitter that splits text into sentences. */
declare const splitBySentenceTokenizer: () => TextSplitterFn;
/** Builds a splitter that splits on the given regex (passed as a string source). */
declare const splitByRegex: (regex: string) => TextSplitterFn;
/** Builds a splitter that splits on a built-in phrase-level regex. */
declare const splitByPhraseRegex: () => TextSplitterFn;
/**
 * Splits documents into per-sentence nodes, storing a surrounding window of
 * neighboring sentences in each node's metadata (under `windowMetadataKey`).
 */
declare class SentenceWindowNodeParser extends NodeParser<TextNode[]> {
static DEFAULT_WINDOW_SIZE: number;
static DEFAULT_WINDOW_METADATA_KEY: string;
static DEFAULT_ORIGINAL_TEXT_METADATA_KEY: string;
/** Number of neighboring sentences to include on each side of the target sentence — confirm sidedness against implementation. */
windowSize: number;
/** Metadata key under which the sentence window is stored. */
windowMetadataKey: string;
/** Metadata key under which the original sentence text is stored. */
originalTextMetadataKey: string;
/** Function used to split document text into sentences. */
sentenceSplitter: TextSplitterFn;
/** Generates ids for the created nodes. */
idGenerator: () => string;
constructor(params?: z.input<typeof sentenceWindowNodeParserSchema>);
parseNodes(nodes: TextNode[], showProgress?: boolean): TextNode[];
/** Builds windowed sentence nodes from whole documents. */
buildWindowNodesFromDocuments(documents: Document[]): TextNode[];
}
/**
 * Zod schema for `TokenTextSplitter` options. All fields have defaults, so the
 * input type (third type argument) is fully optional while the parsed output
 * (second type argument) is fully required.
 */
declare const tokenTextSplitterSchema: z.ZodObject<{
chunkSize: z.ZodDefault<z.ZodNumber>;
chunkOverlap: z.ZodDefault<z.ZodNumber>;
separator: z.ZodDefault<z.ZodString>;
backupSeparators: z.ZodDefault<z.ZodArray<z.ZodString, "many">>;
}, "strip", z.ZodTypeAny, {
chunkSize: number;
chunkOverlap: number;
separator: string;
backupSeparators: string[];
}, {
chunkSize?: number | undefined;
chunkOverlap?: number | undefined;
separator?: string | undefined;
backupSeparators?: string[] | undefined;
}>;
/**
 * Splits text into chunks by raw token count, using `separator` first and the
 * `backupSeparators` as fallbacks.
 */
declare class TokenTextSplitter extends MetadataAwareTextSplitter {
#private;
/** Maximum number of tokens per chunk. */
chunkSize: number;
/** Number of tokens shared between consecutive chunks. */
chunkOverlap: number;
/** Primary separator used when splitting. */
separator: string;
/** Fallback separators tried when a split still exceeds the chunk size. */
backupSeparators: string[];
/** Accepts an optional tokenizer plus any subset of the schema-defined options. */
constructor(params?: SplitterParams & Partial<z.infer<typeof tokenTextSplitterSchema>>);
/**
 * Split text into chunks, reserving space required for metadata string.
 * @param text The text to split.
 * @param metadata The metadata string.
 * @returns An array of text chunks.
 */
splitTextMetadataAware(text: string, metadata: string): string[];
/**
 * Split text into chunks.
 * @param text The text to split.
 * @returns An array of text chunks.
 */
splitText(text: string): string[];
/**
 * Internal method to split text into chunks up to a specified size.
 * @param text The text to split.
 * @param chunkSize The maximum size of each chunk.
 * @returns An array of text chunks.
 */
private _splitText;
/**
 * Break text into splits that are smaller than the chunk size.
 * @param text The text to split.
 * @param chunkSize The maximum size of each split.
 * @returns An array of text splits.
 */
private _split;
/**
 * Merge splits into chunks with overlap.
 * @param splits The array of text splits.
 * @param chunkSize The maximum size of each chunk.
 * @returns An array of merged text chunks.
 */
private _merge;
/**
 * Calculate the number of tokens in the text using the tokenizer.
 * @param text The text to tokenize.
 * @returns The number of tokens.
 */
private tokenSize;
}
/**
 * Current logic is based on the following implementation:
 * @link https://github.com/run-llama/llama_index/blob/cc0ea90e7e72b8e4f5069aac981d56bb1d568323/llama-index-core/llama_index/core/node_parser
 */
/**
 * Legacy alias kept for backward compatibility.
 * @deprecated Use `SentenceSplitter` instead
 */
declare const SimpleNodeParser: typeof SentenceSplitter;
export { MarkdownNodeParser, MetadataAwareTextSplitter, NodeParser, SentenceSplitter, SentenceWindowNodeParser, SimpleNodeParser, type SplitterParams, TextSplitter, type TextSplitterFn, TokenTextSplitter, splitByChar, splitByPhraseRegex, splitByRegex, splitBySentenceTokenizer, splitBySep, truncateText };