mongodb-rag-core
Version:
Common elements used by MongoDB Chatbot Framework components.
76 lines (72 loc) • 2.56 kB
TypeScript
import { ChunkTransformer } from "./ChunkTransformer";
import { EmbeddedContent } from "../contentStore";
import { Page } from "../contentStore";
/**
A chunk of page content: an `EmbeddedContent` without its
`embeddings` and `updated` fields.
*/
export type ContentChunk = Omit<EmbeddedContent, "embeddings" | "updated">;
/**
A ChunkFunc is a function that takes a page and returns it in chunks.
@param page - The page to split into chunks.
@param options - Optional chunking settings. Any subset of `ChunkOptions` may be provided.
NOTE(review): presumably omitted options fall back to defaults chosen by the implementation — confirm against the implementation.
@returns A promise that resolves to the chunks of the page.
*/
export type ChunkFunc = (page: Page, options?: Partial<ChunkOptions>) => Promise<ContentChunk[]>;
/**
Options for converting a `Page` into `ContentChunk[]`.
`maxChunkSize`, `chunkOverlap`, and `tokenizer` are required;
`minChunkSize`, `yamlChunkSize`, and `transform` are optional.
All sizes are measured in tokens.
*/
export type ChunkOptions = {
/**
Minimum chunk size before transform function is applied to it.
If a chunk has fewer tokens than this number, it is discarded before ingestion.
You can use this as a vector search optimization to avoid including chunks
with very few tokens and thus very little semantic meaning.
@example
You might set this to `15` to avoid including chunks that are just a few characters or words.
For instance, you likely would not want to include a chunk that is just the closing
of a code block (```), which occurs not infrequently if chunking using the
Langchain RecursiveCharacterTextSplitter.
Chunk 1:
````text
```py
foo = "bar"
# more semantically relevant python code...
````
Chunk 2:
````text
```
````
*/
minChunkSize?: number;
/**
Maximum chunk size before transform function is applied to it.
If a page has more tokens than this number, it is split into smaller chunks.
*/
maxChunkSize: number;
/**
Number of tokens to overlap between chunks.
If this is 0, chunks will not overlap.
If this is greater than 0, chunks will overlap by this number of tokens.
*/
chunkOverlap: number;
/**
Tokenizer to use to count number of tokens in text.
*/
tokenizer: SomeTokenizer;
/**
If provided, this will override the maxChunkSize for openapi-yaml pages.
This is useful because openapi-yaml pages tend to be very large, and
we want to split them into smaller chunks than the default maxChunkSize.
*/
yamlChunkSize?: number;
/**
Transform to be applied to each chunk as it is produced.
Provides the opportunity to prepend metadata, etc.
*/
transform?: ChunkTransformer;
};
/**
Structural type for a tokenizer: any object with a compatible `encode` method.
*/
export type SomeTokenizer = {
/**
Encodes the given text.
@param text - The text to encode.
@returns An object with a `bpe` number array and a `text` string array.
NOTE(review): presumably `bpe` holds the token ids and `text` the corresponding
token strings — confirm against the tokenizer actually passed in.
*/
encode(text: string): {
bpe: number[];
text: string[];
};
};
/**
Returns the chunks of a content page.
*/
export declare const chunkPage: ChunkFunc;
//# sourceMappingURL=chunkPage.d.ts.map