@caleblawson/rag
Version:
The Retrieval-Augmented Generation (RAG) module contains document processing and embedding utilities.
76 lines (70 loc) • 1.87 kB
text/typescript
import type { TiktokenEncoding, TiktokenModel } from 'js-tiktoken';
import type {
TitleExtractorsArgs,
SummaryExtractArgs,
QuestionAnswerExtractArgs,
KeywordExtractArgs,
} from './extractors';
/**
 * Identifiers for the programming and markup languages that
 * `ChunkOptions.language` can be set to.
 *
 * This is a string enum: each member's runtime value is the lowercase string
 * literal on its right-hand side (for example `Language.CPP === 'cpp'`).
 *
 * NOTE(review): presumably used to choose language-aware split points when
 * chunking source code — confirm against the chunking implementation.
 */
export enum Language {
CPP = 'cpp',
GO = 'go',
JAVA = 'java',
KOTLIN = 'kotlin',
JS = 'js',
TS = 'ts',
PHP = 'php',
// presumably Protocol Buffers definitions — TODO confirm
PROTO = 'proto',
PYTHON = 'python',
// presumably reStructuredText — TODO confirm
RST = 'rst',
RUBY = 'ruby',
RUST = 'rust',
SCALA = 'scala',
SWIFT = 'swift',
MARKDOWN = 'markdown',
LATEX = 'latex',
HTML = 'html',
// presumably Solidity — TODO confirm
SOL = 'sol',
CSHARP = 'csharp',
COBOL = 'cobol',
C = 'c',
LUA = 'lua',
PERL = 'perl',
HASKELL = 'haskell',
ELIXIR = 'elixir',
POWERSHELL = 'powershell',
}
/**
 * Per-extractor settings, with one optional key for each extractor kind
 * (title, summary, questions, keywords).
 *
 * Every key accepts either that extractor's argument object (the `*Args`
 * types imported from `./extractors`) or a plain boolean.
 *
 * NOTE(review): presumably `true` runs the extractor with its default
 * arguments and `false` or an omitted key skips it — confirm against the
 * code that consumes this type.
 */
export type ExtractParams = {
// Title extractor: argument object or boolean.
title?: TitleExtractorsArgs | boolean;
// Summary extractor: argument object or boolean.
summary?: SummaryExtractArgs | boolean;
// Question/answer extractor: argument object or boolean.
questions?: QuestionAnswerExtractArgs | boolean;
// Keyword extractor: argument object or boolean.
keywords?: KeywordExtractArgs | boolean;
};
/**
 * Bag of optional tuning knobs for chunking a document.
 *
 * Every property is optional. This type only declares the accepted shapes;
 * defaults and the exact effect of each option are defined by the chunking
 * implementation, which is not part of this file.
 *
 * NOTE(review): not every option is likely to apply to every
 * `ChunkStrategy` — verify which options each strategy actually reads.
 */
export type ChunkOptions = {
// --- Structure-based splitting ---
// List of string pairs. Presumably (header marker, metadata key) pairs for
// header-based splitting — TODO confirm.
headers?: [string, string][];
returnEachLine?: boolean;
// List of string pairs. Presumably the section-level counterpart of
// `headers` — TODO confirm.
sections?: [string, string][];

// --- Separators ---
// A single separator string.
separator?: string;
// A list of separator strings. Presumably tried in order — TODO confirm.
separators?: string[];
// Presumably marks the separator(s) as regular-expression source rather than
// literal text — TODO confirm.
isSeparatorRegex?: boolean;

// --- Sizing ---
// Numeric size settings. NOTE(review): units are presumably whatever
// `lengthFunction` returns (characters or tokens) — confirm.
size?: number;
maxSize?: number;
minSize?: number;
overlap?: number;
// Callback that maps a piece of text to a number.
lengthFunction?: (text: string) => number;

// --- Output shaping ---
// Either a boolean or one of the literals 'start' / 'end'. Presumably the
// literals select which side of a chunk retains the separator — TODO confirm.
keepSeparator?: boolean | 'start' | 'end';
addStartIndex?: boolean;
stripWhitespace?: boolean;

// --- Language / format specific ---
// One of the `Language` enum members.
language?: Language;
ensureAscii?: boolean;
convertLists?: boolean;

// --- Tokenizer settings (types come from 'js-tiktoken') ---
encodingName?: TiktokenEncoding;
modelName?: TiktokenModel;
// Either an explicit set of strings or the literal 'all'.
allowedSpecial?: Set<string> | 'all';
// Either an explicit set of strings or the literal 'all'.
disallowedSpecial?: Set<string> | 'all';

stripHeaders?: boolean;
};
/**
 * Names of the available chunking strategies, as a closed union of string
 * literals. Used as the type of `ChunkParams.strategy`.
 */
export type ChunkStrategy = 'recursive' | 'character' | 'token' | 'markdown' | 'html' | 'json' | 'latex';
/**
 * Full parameter object for a chunking request: every `ChunkOptions` field
 * plus an optional strategy selector and optional extraction settings.
 *
 * NOTE(review): the behaviour when `strategy` is omitted is decided by the
 * consumer of this type — confirm the default there.
 */
export interface ChunkParams extends ChunkOptions {
// Which chunking strategy to use.
strategy?: ChunkStrategy;
// Extractor settings; see `ExtractParams` for the accepted shapes.
extract?: ExtractParams;
}