UNPKG

@llamaindex/core

Version:
166 lines (155 loc) 6.36 kB
import { z } from 'zod';
import { TextNode, TransformComponent, sentenceSplitterSchema, sentenceWindowNodeParserSchema, Document } from '../../schema/dist/index.cjs';
import { Tokenizer } from '@llamaindex/env/tokenizers';

/**
 * Abstract base class for all node parsers: a transform that turns input
 * `TextNode`s into parsed `TextNode`s. `Result` allows subclasses to be
 * either synchronous (`TextNode[]`) or asynchronous (`Promise<TextNode[]>`).
 */
declare abstract class NodeParser<Result extends TextNode[] | Promise<TextNode[]> = TextNode[] | Promise<TextNode[]>> extends TransformComponent<Result> {
    /** Whether metadata is carried over onto parsed nodes. */
    includeMetadata: boolean;
    /** Whether previous/next node relationships are attached to parsed nodes. */
    includePrevNextRel: boolean;
    constructor();
    /**
     * Post-processes freshly parsed nodes against their parent documents.
     * NOTE(review): `parentDocMap` presumably maps parent document ids to
     * documents — confirm against the implementation.
     */
    protected postProcessParsedNodes(nodes: Awaited<Result>, parentDocMap: Map<string, TextNode>): Awaited<Result>;
    /** Core parsing hook each subclass must implement. */
    protected abstract parseNodes(documents: TextNode[], showProgress?: boolean): Result;
    /** Public entry point: parses the given documents into nodes. */
    getNodesFromDocuments(documents: TextNode[]): Result;
}

/** A synchronous `NodeParser` that works by splitting raw text into string chunks. */
declare abstract class TextSplitter extends NodeParser<TextNode[]> {
    /** Splits a single text into string chunks. */
    abstract splitText(text: string): string[];
    /** Splits each input text via `splitText`. */
    splitTexts(texts: string[]): string[];
    protected parseNodes(nodes: TextNode[]): TextNode[];
}

/**
 * A `TextSplitter` that additionally accounts for a node's serialized
 * metadata string when splitting (e.g. to leave room within a chunk budget —
 * see `TokenTextSplitter.splitTextMetadataAware` below).
 */
declare abstract class MetadataAwareTextSplitter extends TextSplitter {
    /** Splits text while taking the metadata string into account. */
    abstract splitTextMetadataAware(text: string, metadata: string): string[];
    /**
     * Metadata-aware splitting over multiple texts.
     * NOTE(review): `texts` and `metadata` look like parallel arrays — confirm.
     */
    splitTextsMetadataAware(texts: string[], metadata: string[]): string[];
    /** Serializes a node's metadata into the string used for size accounting. */
    protected getMetadataString(node: TextNode): string;
    protected parseNodes(nodes: TextNode[]): TextNode[];
}

/** Extra constructor options shared by splitters. */
type SplitterParams = {
    /** Optional tokenizer used for token counting. */
    tokenizer?: Tokenizer;
};

/**
 * Parse text with a preference for complete sentences.
 */
declare class SentenceSplitter extends MetadataAwareTextSplitter {
    #private;
    /**
     * The token chunk size for each chunk.
     */
    chunkSize: number;
    /**
     * The token overlap of each chunk when splitting.
     */
    chunkOverlap: number;
    /**
     * Default separator for splitting into words.
     */
    separator: string;
    /**
     * Separator between paragraphs.
     */
    paragraphSeparator: string;
    /**
     * Backup regex for splitting into sentences.
     */
    secondaryChunkingRegex: string;
    /** Options are validated by `sentenceSplitterSchema`; a tokenizer may also be supplied. */
    constructor(params?: z.input<typeof sentenceSplitterSchema> & SplitterParams);
    splitTextMetadataAware(text: string, metadata: string): string[];
    splitText(text: string): string[];
    /** Internal split honoring an explicit chunk size (public for reuse; underscore marks it as internal). */
    _splitText(text: string, chunkSize: number): string[];
    /** Returns the number of tokens in `text`, per the configured tokenizer. */
    tokenSize: (text: string) => number;
}

/**
 * Parses markdown documents into nodes.
 * NOTE(review): presumably splits on markdown headers, tracking them in node
 * metadata (`updateMetadata`) — confirm against the implementation.
 */
declare class MarkdownNodeParser extends NodeParser<TextNode[]> {
    parseNodes(nodes: TextNode[], showProgress?: boolean): TextNode[];
    protected getNodesFromNode(node: TextNode): TextNode[];
    private updateMetadata;
    private buildNodeFromSplit;
}

/** A function that splits a text into string fragments. */
type TextSplitterFn = (text: string) => string[];

/** Truncates `text` using the given splitter. NOTE(review): presumably returns the first chunk — confirm. */
declare const truncateText: (text: string, textSplitter: TextSplitter) => string;
/** Builds a splitter that splits on `sep`; `keepSep` controls whether the separator is retained in the output. */
declare const splitBySep: (sep: string, keepSep?: boolean) => TextSplitterFn;
/** Builds a splitter that splits a text into individual characters. */
declare const splitByChar: () => TextSplitterFn;
/** Builds a splitter backed by a sentence tokenizer. */
declare const splitBySentenceTokenizer: () => TextSplitterFn;
/** Builds a splitter that splits by the given regex (passed as a pattern string). */
declare const splitByRegex: (regex: string) => TextSplitterFn;
/** Builds a splitter that splits on phrase boundaries via a built-in regex. */
declare const splitByPhraseRegex: () => TextSplitterFn;

/**
 * Splits documents into per-sentence nodes, attaching a surrounding window of
 * sentences to each node's metadata (under `windowMetadataKey`) along with the
 * original sentence text (under `originalTextMetadataKey`).
 */
declare class SentenceWindowNodeParser extends NodeParser<TextNode[]> {
    static DEFAULT_WINDOW_SIZE: number;
    static DEFAULT_WINDOW_METADATA_KEY: string;
    static DEFAULT_ORIGINAL_TEXT_METADATA_KEY: string;
    /** Number of sentences on each side to include in the window. */
    windowSize: number;
    /** Metadata key under which the window text is stored. */
    windowMetadataKey: string;
    /** Metadata key under which the original sentence is stored. */
    originalTextMetadataKey: string;
    /** Function used to split document text into sentences. */
    sentenceSplitter: TextSplitterFn;
    /** Generates ids for the produced nodes. */
    idGenerator: () => string;
    /** Options are validated by `sentenceWindowNodeParserSchema`. */
    constructor(params?: z.input<typeof sentenceWindowNodeParserSchema>);
    parseNodes(nodes: TextNode[], showProgress?: boolean): TextNode[];
    buildWindowNodesFromDocuments(documents: Document[]): TextNode[];
}

/** Zod schema for `TokenTextSplitter` options; every field has a default. */
declare const tokenTextSplitterSchema: z.ZodObject<{
    chunkSize: z.ZodDefault<z.ZodNumber>;
    chunkOverlap: z.ZodDefault<z.ZodNumber>;
    separator: z.ZodDefault<z.ZodString>;
    backupSeparators: z.ZodDefault<z.ZodArray<z.ZodString, "many">>;
}, "strip", z.ZodTypeAny, {
    chunkSize: number;
    chunkOverlap: number;
    separator: string;
    backupSeparators: string[];
}, {
    chunkSize?: number | undefined;
    chunkOverlap?: number | undefined;
    separator?: string | undefined;
    backupSeparators?: string[] | undefined;
}>;

/** Splits text into token-budgeted chunks, preferring `separator` and falling back to `backupSeparators`. */
declare class TokenTextSplitter extends MetadataAwareTextSplitter {
    #private;
    chunkSize: number;
    chunkOverlap: number;
    separator: string;
    backupSeparators: string[];
    constructor(params?: SplitterParams & Partial<z.infer<typeof tokenTextSplitterSchema>>);
    /**
     * Split text into chunks, reserving space required for metadata string.
     * @param text The text to split.
     * @param metadata The metadata string.
     * @returns An array of text chunks.
     */
    splitTextMetadataAware(text: string, metadata: string): string[];
    /**
     * Split text into chunks.
     * @param text The text to split.
     * @returns An array of text chunks.
     */
    splitText(text: string): string[];
    /**
     * Internal method to split text into chunks up to a specified size.
     * @param text The text to split.
     * @param chunkSize The maximum size of each chunk.
     * @returns An array of text chunks.
     */
    private _splitText;
    /**
     * Break text into splits that are smaller than the chunk size.
     * @param text The text to split.
     * @param chunkSize The maximum size of each split.
     * @returns An array of text splits.
     */
    private _split;
    /**
     * Merge splits into chunks with overlap.
     * @param splits The array of text splits.
     * @param chunkSize The maximum size of each chunk.
     * @returns An array of merged text chunks.
     */
    private _merge;
    /**
     * Calculate the number of tokens in the text using the tokenizer.
     * @param text The text to tokenize.
     * @returns The number of tokens.
     */
    private tokenSize;
}

/**
 * Current logic is based on the following implementation:
 * @link https://github.com/run-llama/llama_index/blob/cc0ea90e7e72b8e4f5069aac981d56bb1d568323/llama-index-core/llama_index/core/node_parser
 */
/**
 * @deprecated Use `SentenceSplitter` instead
 */
declare const SimpleNodeParser: typeof SentenceSplitter;

export { MarkdownNodeParser, MetadataAwareTextSplitter, NodeParser, SentenceSplitter, SentenceWindowNodeParser, SimpleNodeParser, type SplitterParams, TextSplitter, type TextSplitterFn, TokenTextSplitter, splitByChar, splitByPhraseRegex, splitByRegex, splitBySentenceTokenizer, splitBySep, truncateText };