gpt4all
Native Node.js LLM bindings for all.
/// <reference types="node" />
declare module "gpt4all";
interface LLModelOptions {
/**
* Model architecture. This argument currently has no functionality and is only used as a descriptive identifier for the user.
*/
type?: string;
model_name: string;
model_path: string;
library_path?: string;
}
interface ModelConfig {
systemPrompt: string;
promptTemplate: string;
path: string;
url?: string;
}
/**
* Options for the chat session.
*/
interface ChatSessionOptions extends Partial<LLModelPromptContext> {
/**
* System prompt to ingest on initialization.
*/
systemPrompt?: string;
/**
* Messages to ingest on initialization.
*/
messages?: ChatMessage[];
}
/**
* ChatSession utilizes an InferenceModel for efficient processing of chat conversations.
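* @example
* // A minimal usage sketch; the model file name is illustrative and must be present
* // locally or downloadable from the model list.
* const model = await loadModel("mistral-7b-instruct-v0.1.Q4_0.gguf");
* const chat = await model.createChatSession({
*     systemPrompt: "You are a helpful assistant.",
* });
* const firstTurn = await createCompletion(chat, "What is 1 + 1?");
* const secondTurn = await createCompletion(chat, "And what is two times that?");
* console.log(secondTurn.choices[0].message.content);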
*/
declare class ChatSession implements CompletionProvider {
/**
* Constructs a new ChatSession using the provided InferenceModel and options.
* Does not set the chat session as the active chat session until initialize is called.
* @param {InferenceModel} model An InferenceModel instance.
* @param {ChatSessionOptions} [options] Options for the chat session including default completion options.
*/
constructor(model: InferenceModel, options?: ChatSessionOptions);
/**
* The underlying InferenceModel used for generating completions.
*/
model: InferenceModel;
/**
* The name of the model.
*/
modelName: string;
/**
* The messages that have been exchanged in this chat session.
*/
messages: ChatMessage[];
/**
* The system prompt that has been ingested at the beginning of the chat session.
*/
systemPrompt: string;
/**
* The current prompt context of the chat session.
*/
promptContext: LLModelPromptContext;
/**
* Ingests system prompt and initial messages.
* Sets this chat session as the active chat session of the model.
* @param {CompletionOptions} [options] Completion options for initialization.
* @returns {Promise<number>} The number of tokens ingested during initialization (systemPrompt + messages).
*/
initialize(options?: CompletionOptions): Promise<number>;
/**
* Prompts the model in chat-session context.
* @param {CompletionInput} input Input string or message array.
* @param {CompletionOptions} [options] Set completion options for this generation.
* @returns {Promise<InferenceResult>} The inference result.
* @throws {Error} If the chat session is not the active chat session of the model.
* @throws {Error} If nPast is set to a value higher than what has been ingested in the session.
*/
generate(
input: CompletionInput,
options?: CompletionOptions
): Promise<InferenceResult>;
}
/**
* Shape of InferenceModel generations.
*/
interface InferenceResult extends LLModelInferenceResult {
tokensIngested: number;
tokensGenerated: number;
}
/**
* InferenceModel represents an LLM which can make next-token predictions.
*/
declare class InferenceModel implements CompletionProvider {
constructor(llm: LLModel, config: ModelConfig);
/** The native LLModel */
llm: LLModel;
/** The configuration the instance was constructed with. */
config: ModelConfig;
/** The active chat session of the model. */
activeChatSession?: ChatSession;
/** The name of the model. */
modelName: string;
/**
* Create a chat session with the model and set it as the active chat session of this model.
* A model instance can only have one active chat session at a time.
* @param {ChatSessionOptions} options The options for the chat session.
* @returns {Promise<ChatSession>} The chat session.
*/
createChatSession(options?: ChatSessionOptions): Promise<ChatSession>;
/**
* Prompts the model with a given input and optional parameters.
* @param {CompletionInput} input The prompt input.
* @param {CompletionOptions} options Prompt context and other options.
* @returns {Promise<InferenceResult>} The model's response to the prompt.
* @throws {Error} If nPast is set to a value smaller than 0.
* @throws {Error} If a messages array without a trailing user message is provided.
*/
generate(
input: CompletionInput,
options?: CompletionOptions
): Promise<InferenceResult>;
/**
* Delete and clean up the native model.
*/
dispose(): void;
}
/**
* Options for generating one or more embeddings.
*/
interface EmbeddingOptions {
/**
* The model-specific prefix representing the embedding task, without the trailing colon. For Nomic Embed
* this can be `search_query`, `search_document`, `classification`, or `clustering`.
*/
prefix?: string;
/**
* The embedding dimension, for use with Matryoshka-capable models. Defaults to full-size.
* @default Depends on the model being used.
*/
dimensionality?: number;
/**
* How to handle texts longer than the model can accept. One of `mean` or `truncate`.
* @default "mean"
*/
longTextMode?: "mean" | "truncate";
/**
* Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens
* with long_text_mode="mean" will raise an error. Disabled by default.
* @default false
*/
atlas?: boolean;
}
/**
* The Node.js equivalent of the Python binding's Embed4All().embed().
* @param {EmbeddingModel} model The embedding model instance.
* @param {string} text Text to embed.
* @param {EmbeddingOptions} options Optional parameters for the embedding.
* @returns {EmbeddingResult} The embedding result.
* @throws {Error} If dimensionality is set to a value smaller than 1.
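* @example
* // A minimal usage sketch; the embedding model name is illustrative.
* const embedder = await loadModel("nomic-embed-text-v1.5.f16.gguf", { type: "embedding" });
* const { embeddings, n_prompt_tokens } = createEmbedding(embedder, "The quick brown fox", {
*     prefix: "search_document",
* });
* console.log(embeddings.length, n_prompt_tokens);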
*/
declare function createEmbedding(
model: EmbeddingModel,
text: string,
options?: EmbeddingOptions
): EmbeddingResult<Float32Array>;
/**
* Overload that takes multiple strings to embed.
* @param {EmbeddingModel} model The embedding model instance.
* @param {string[]} texts Texts to embed.
* @param {EmbeddingOptions} options Optional parameters for the embedding.
* @returns {EmbeddingResult<Float32Array[]>} The embedding result.
* @throws {Error} If dimensionality is set to a value smaller than 1.
*/
declare function createEmbedding(
model: EmbeddingModel,
texts: string[],
options?: EmbeddingOptions
): EmbeddingResult<Float32Array[]>;
/**
* The resulting embedding.
*/
interface EmbeddingResult<T> {
/**
* Encoded token count. Includes overlap but specifically excludes tokens used for the prefix/task_type, BOS/CLS token, and EOS/SEP token.
*/
n_prompt_tokens: number;
embeddings: T;
}
/**
* EmbeddingModel represents an LLM which can create embeddings (float arrays).
*/
declare class EmbeddingModel {
constructor(llm: LLModel, config: ModelConfig);
/** The native LLModel */
llm: LLModel;
/** The configuration the instance was constructed with. */
config: ModelConfig;
/**
* Create an embedding from a given input string. See EmbeddingOptions.
* @param {string} text
* @param {string} prefix
* @param {number} dimensionality
* @param {boolean} doMean
* @param {boolean} atlas
* @returns {EmbeddingResult<Float32Array>} The embedding result.
*/
embed(
text: string,
prefix: string,
dimensionality: number,
doMean: boolean,
atlas: boolean
): EmbeddingResult<Float32Array>;
/**
* Create an embedding from a given input text array. See EmbeddingOptions.
* @param {string[]} text
* @param {string} prefix
* @param {number} dimensionality
* @param {boolean} doMean
* @param {boolean} atlas
* @returns {EmbeddingResult<Float32Array[]>} The embedding result.
*/
embed(
text: string[],
prefix: string,
dimensionality: number,
doMean: boolean,
atlas: boolean
): EmbeddingResult<Float32Array[]>;
/**
* Delete and clean up the native model.
*/
dispose(): void;
}
/**
* Shape of LLModel's inference result.
*/
interface LLModelInferenceResult {
text: string;
nPast: number;
}
interface LLModelInferenceOptions extends Partial<LLModelPromptContext> {
/** Callback for response tokens, called for each generated token.
* @param {number} tokenId The token id.
* @param {string} token The token.
* @returns {boolean | undefined} Whether to continue generating tokens.
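* @example
* // A sketch: stop generation once 100 response tokens have been received.
* let count = 0;
* const onResponseToken = (tokenId, token) => ++count < 100;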
* */
onResponseToken?: (tokenId: number, token: string) => boolean | void;
/** Callback for prompt tokens, called for each input token in the prompt.
* @param {number} tokenId The token id.
* @returns {boolean | undefined} Whether to continue ingesting the prompt.
* */
onPromptToken?: (tokenId: number) => boolean | void;
}
/**
* LLModel class representing a language model.
* This is a base class that provides common functionality for different types of language models.
*/
declare class LLModel {
/**
* Initialize a new LLModel.
* @param {LLModelOptions} options The model options, including the model name and the absolute path to the model file.
* @throws {Error} If the model file does not exist.
*/
constructor(options: LLModelOptions);
/** undefined or user supplied */
type(): string | undefined;
/** The name of the model. */
name(): string;
/**
* Get the size of the internal state of the model.
* NOTE: This state data is specific to the type of model you have created.
* @return the size in bytes of the internal state of the model
*/
stateSize(): number;
/**
* Get the number of threads used for model inference.
* The default is the number of physical cores your computer has.
* @returns The number of threads used for model inference.
*/
threadCount(): number;
/**
* Set the number of threads used for model inference.
* @param newNumber The new number of threads.
*/
setThreadCount(newNumber: number): void;
/**
* Prompt the model directly with a given input string and optional parameters.
* Use the higher level createCompletion methods for a more user-friendly interface.
* @param {string} prompt The prompt input.
* @param {LLModelInferenceOptions} options Optional parameters for the generation.
* @returns {LLModelInferenceResult} The response text and final context size.
*/
infer(
prompt: string,
options: LLModelInferenceOptions
): Promise<LLModelInferenceResult>;
/**
* Embed text with the model. See EmbeddingOptions for more information.
* Use the higher level createEmbedding methods for a more user-friendly interface.
* @param {string} text
* @param {string} prefix
* @param {number} dimensionality
* @param {boolean} doMean
* @param {boolean} atlas
* @returns {Float32Array} The embedding of the text.
*/
embed(
text: string,
prefix: string,
dimensionality: number,
doMean: boolean,
atlas: boolean
): Float32Array;
/**
* Embed multiple texts with the model. See EmbeddingOptions for more information.
* Use the higher level createEmbedding methods for a more user-friendly interface.
* @param {string[]} texts
* @param {string} prefix
* @param {number} dimensionality
* @param {boolean} doMean
* @param {boolean} atlas
* @returns {Float32Array[]} The embeddings of the texts.
*/
embed(
texts: string[],
prefix: string,
dimensionality: number,
doMean: boolean,
atlas: boolean
): Float32Array[];
/**
* Whether the model is loaded or not.
*/
isModelLoaded(): boolean;
/**
* Where to search for the pluggable backend libraries
*/
setLibraryPath(s: string): void;
/**
* Where to get the pluggable backend libraries
*/
getLibraryPath(): string;
/**
* Initiate a GPU by a string identifier.
* @param {number} memory_required Must be within the range of size_t or this will throw.
* @param {string} device_name 'amd' | 'nvidia' | 'intel' | 'gpu' | a specific GPU name.
* See LoadModelOptions.device for more information.
*/
initGpuByString(memory_required: number, device_name: string): boolean;
/**
* From the C documentation:
* @returns True if a GPU device is successfully initialized, false otherwise.
*/
hasGpuDevice(): boolean;
/**
* GPUs that are usable for this LLModel.
* @param {number} nCtx Maximum size of the context window.
* @throws {Error} If hasGpuDevice returns false.
* @returns {GpuDevice[]} The usable GPU devices.
*/
listGpu(nCtx: number): GpuDevice[];
/**
* Delete and clean up the native model.
*/
dispose(): void;
}
/**
* An object that contains GPU data for this machine.
*/
interface GpuDevice {
index: number;
/**
* Same as VkPhysicalDeviceType.
*/
type: number;
heapSize: number;
name: string;
vendor: string;
}
/**
* Options that configure a model's behavior.
*/
interface LoadModelOptions {
/**
* Where to look for model files.
*/
modelPath?: string;
/**
* Where to look for the backend libraries.
*/
librariesPath?: string;
/**
* The path to the model configuration file, useful for offline usage or custom model configurations.
*/
modelConfigFile?: string;
/**
* Whether to allow downloading the model if it is not present at the specified path.
*/
allowDownload?: boolean;
/**
* Enable verbose logging.
*/
verbose?: boolean;
/**
* The processing unit on which the model will run. It can be set to
* - "cpu": Model will run on the central processing unit.
* - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
* - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
* - "gpu name": Model will run on the GPU that matches the name if it's available.
* Note: If a GPU device lacks sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All
* instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the
* model.
* @default "cpu"
*/
device?: string;
/**
* The maximum context window size of this model.
* @default 2048
*/
nCtx?: number;
/**
* The number of model layers to offload to the GPU (ngl).
* @default 100
*/
ngl?: number;
}
interface InferenceModelOptions extends LoadModelOptions {
type?: "inference";
}
interface EmbeddingModelOptions extends LoadModelOptions {
type: "embedding";
}
/**
* Loads a machine learning model with the specified name. This is the de facto way to create a model.
* By default this will download the model from the official GPT4All website if it is not present at the given path.
*
* @param {string} modelName - The name of the model to load.
* @param {LoadModelOptions|undefined} [options] - (Optional) Additional options for loading the model.
* @returns {Promise<InferenceModel | EmbeddingModel>} A promise that resolves to the loaded model instance.
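* @example
* // A minimal sketch; the model file name is illustrative. allowDownload lets the binding
* // fetch the model from the official model list if it is not found at modelPath.
* const model = await loadModel("mistral-7b-instruct-v0.1.Q4_0.gguf", {
*     device: "gpu",
*     nCtx: 2048,
*     allowDownload: true,
* });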
*/
declare function loadModel(
modelName: string,
options?: InferenceModelOptions
): Promise<InferenceModel>;
declare function loadModel(
modelName: string,
options?: EmbeddingModelOptions
): Promise<EmbeddingModel>;
declare function loadModel(
modelName: string,
options?: EmbeddingModelOptions | InferenceModelOptions
): Promise<InferenceModel | EmbeddingModel>;
/**
* Interface for createCompletion methods, implemented by InferenceModel and ChatSession.
* Implement your own CompletionProvider or extend ChatSession to generate completions with custom logic.
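* @example
* // A sketch of a custom provider that wraps an InferenceModel and logs token usage.
* class LoggingProvider implements CompletionProvider {
*     constructor(public model: InferenceModel) {}
*     get modelName() { return this.model.modelName; }
*     async generate(input: CompletionInput, options?: CompletionOptions) {
*         const result = await this.model.generate(input, options);
*         console.log(`ingested ${result.tokensIngested}, generated ${result.tokensGenerated}`);
*         return result;
*     }
* }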
*/
interface CompletionProvider {
modelName: string;
generate(
input: CompletionInput,
options?: CompletionOptions
): Promise<InferenceResult>;
}
/**
* Options for creating a completion.
*/
interface CompletionOptions extends LLModelInferenceOptions {
/**
* Indicates if verbose logging is enabled.
* @default false
*/
verbose?: boolean;
}
/**
* The input for creating a completion. May be a string or an array of messages.
*/
type CompletionInput = string | ChatMessage[];
/**
* The Node.js equivalent of the Python binding's chat_completion().
* @param {CompletionProvider} provider - The inference model object or chat session
* @param {CompletionInput} input - The input string or message array
* @param {CompletionOptions} options - The options for creating the completion.
* @returns {Promise<CompletionResult>} The completion result.
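* @example
* // A minimal sketch; assumes `model` is an InferenceModel created via loadModel.
* const response = await createCompletion(model, "Summarize quantum computing in one sentence.", {
*     temperature: 0.7,
* });
* console.log(response.choices[0].message.content, response.usage);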
*/
declare function createCompletion(
provider: CompletionProvider,
input: CompletionInput,
options?: CompletionOptions
): Promise<CompletionResult>;
/**
* Streaming variant of createCompletion, returns a stream of tokens and a promise that resolves to the completion result.
* @param {CompletionProvider} provider - The inference model object or chat session
* @param {CompletionInput} input - The input string or message array
* @param {CompletionOptions} options - The options for creating the completion.
* @returns {CompletionStreamReturn} An object containing the token stream and the completion result promise.
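* @example
* // A sketch: write tokens to stdout as they arrive, then await the aggregated result.
* const stream = createCompletionStream(model, "Tell me a short story.");
* stream.tokens.on("data", (token) => process.stdout.write(token));
* const result = await stream.result;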
*/
declare function createCompletionStream(
provider: CompletionProvider,
input: CompletionInput,
options?: CompletionOptions
): CompletionStreamReturn;
/**
* The result of a streamed completion, containing a stream of tokens and a promise that resolves to the completion result.
*/
interface CompletionStreamReturn {
tokens: NodeJS.ReadableStream;
result: Promise<CompletionResult>;
}
/**
* Async generator variant of createCompletion, yields tokens as they are generated and returns the completion result.
* @param {CompletionProvider} provider - The inference model object or chat session
* @param {CompletionInput} input - The input string or message array
* @param {CompletionOptions} options - The options for creating the completion.
* @returns {AsyncGenerator<string, CompletionResult>} The stream of generated tokens; the generator's final return value is the completion result.
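* @example
* // A sketch: iterate tokens with for await; the final return value (the CompletionResult)
* // can be obtained by iterating manually with .next() instead.
* for await (const token of createCompletionGenerator(model, "List three colors.", {})) {
*     process.stdout.write(token);
* }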
*/
declare function createCompletionGenerator(
provider: CompletionProvider,
input: CompletionInput,
options?: CompletionOptions
): AsyncGenerator<string, CompletionResult>;
/**
* A message in the conversation.
*/
interface ChatMessage {
/** The role of the message. */
role: "system" | "assistant" | "user";
/** The message content. */
content: string;
}
/**
* The result of a completion.
*/
interface CompletionResult {
/** The model used for the completion. */
model: string;
/** Token usage report. */
usage: {
/** The number of tokens ingested during the completion. */
prompt_tokens: number;
/** The number of tokens generated in the completion. */
completion_tokens: number;
/** The total number of tokens used. */
total_tokens: number;
/** Number of tokens used in the conversation. */
n_past_tokens: number;
};
/** The generated completion. */
choices: Array<{
message: ChatMessage;
}>;
}
/**
* Model inference arguments for generating completions.
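* @example
* // A sketch: sampling parameters can be passed as completion options, since
* // CompletionOptions extends Partial<LLModelPromptContext>.
* const res = await createCompletion(model, "Write a haiku about autumn.", {
*     nPredict: 128,
*     temperature: 0.7,
*     topK: 40,
*     topP: 0.9,
*     repeatPenalty: 1.18,
* });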
*/
interface LLModelPromptContext {
/** The size of the raw logits vector. */
logitsSize: number;
/** The size of the raw tokens vector. */
tokensSize: number;
/** The number of tokens in the past conversation.
* This may be used to "roll back" the conversation to a previous state.
* Note that for most use cases the default value should be sufficient and this should not be set.
* @default 0 For completions using InferenceModel, meaning the model will only consider the input prompt.
* @default For completions using ChatSession, the session's current nPast is used, so the context window is
* automatically maintained and possibly resized (see contextErase) to keep the conversation performant.
* */
nPast: number;
/** The maximum number of tokens to predict.
* @default 4096
* */
nPredict: number;
/** Template for user / assistant message pairs.
* %1 is required and will be replaced by the user input.
* %2 is optional and will be replaced by the assistant response. If not present, the assistant response will be appended.
*/
promptTemplate?: string;
/** The context window size.
* @deprecated This has no effect; use loadModel's nCtx option instead.
* @default 2048
*/
nCtx: number;
/** The top-k logits to sample from.
* Top-K sampling selects the next token only from the top K most likely tokens predicted by the model.
* It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit
* the diversity of the output. A higher value for top-K (e.g., 100) will consider more tokens and lead
* to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate
* more conservative text. 30 - 60 is a good range for most tasks.
* @default 40
* */
topK: number;
/** The nucleus sampling probability threshold.
* Top-P limits the selection of the next token to a subset of tokens with a cumulative probability
* above a threshold P. This method, also known as nucleus sampling, finds a balance between diversity
* and quality by considering both token probabilities and the number of tokens available for sampling.
* When using a higher value for top-P (e.g., 0.95), the generated text becomes more diverse.
* On the other hand, a lower value (e.g., 0.1) produces more focused and conservative text.
* @default 0.9
* */
topP: number;
/**
* The minimum probability of a token to be considered.
* @default 0.0
*/
minP: number;
/** The temperature to adjust the model's output distribution.
* Temperature is like a knob that adjusts how creative or focused the output becomes. Higher temperatures
* (e.g., 1.2) increase randomness, resulting in more imaginative and diverse text. Lower temperatures (e.g., 0.5)
* make the output more focused, predictable, and conservative. When the temperature is set to 0, the output
* becomes completely deterministic, always selecting the most probable next token and producing identical results
* each time. Experiment to find which value fits best for your use case and model.
* @default 0.1
* @alias temperature
* */
temp: number;
temperature: number;
/** The number of predictions to generate in parallel.
* By splitting the prompt every N tokens, prompt-batch-size reduces RAM usage during processing. However,
* this can increase the processing time as a trade-off. If the N value is set too low (e.g., 10), long prompts
* with 500+ tokens will be most affected, requiring numerous processing runs to complete the prompt processing.
* To ensure optimal performance, setting the prompt-batch-size to 2048 allows processing of all tokens in a single run.
* @default 8
* */
nBatch: number;
/** The penalty factor for repeated tokens.
* Repeat-penalty can help penalize tokens based on how frequently they occur in the text, including the input prompt.
* A token that has already appeared five times is penalized more heavily than a token that has appeared only one time.
* A value of 1 means that there is no penalty and values larger than 1 discourage repeated tokens.
* @default 1.18
* */
repeatPenalty: number;
/** The number of last tokens to penalize.
* The repeat-penalty-tokens N option controls the number of tokens in the history to consider for penalizing repetition.
* A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only
* consider recent tokens.
* @default 10
* */
repeatLastN: number;
/** The percentage of context to erase if the context window is exceeded.
* Set it to a lower value to keep context for longer at the cost of performance.
* @default 0.75
* */
contextErase: number;
}
/**
* From the Python API:
* Models will be stored in `(homedir)/.cache/gpt4all/`.
*/
declare const DEFAULT_DIRECTORY: string;
/**
* From the Python API:
* The default path where dynamic libraries are stored.
* You may separate paths by a semicolon to search in multiple areas.
* This searches DEFAULT_DIRECTORY/libraries, cwd/libraries, and finally cwd.
*/
declare const DEFAULT_LIBRARIES_DIRECTORY: string;
/**
* Default model configuration.
*/
declare const DEFAULT_MODEL_CONFIG: ModelConfig;
/**
* Default prompt context.
*/
declare const DEFAULT_PROMPT_CONTEXT: LLModelPromptContext;
/**
* Default model list url.
*/
declare const DEFAULT_MODEL_LIST_URL: string;
/**
* Initiates the download of a model file.
* By default this downloads without waiting. Use the returned controller to alter this behavior.
* @param {string} modelName - The model to be downloaded.
* @param {DownloadModelOptions} options - Options to pass into the downloader. Defaults to { modelPath: process.cwd(), verbose: false }.
* @returns {DownloadController} An object that allows controlling the download process.
*
* @throws {Error} If the model already exists in the specified location.
* @throws {Error} If the model cannot be found at the specified url.
*
* @example
* const download = downloadModel('ggml-gpt4all-j-v1.3-groovy.bin')
* download.promise.then(() => console.log('Downloaded!'))
*/
declare function downloadModel(
modelName: string,
options?: DownloadModelOptions
): DownloadController;
/**
* Options for the model download process.
*/
interface DownloadModelOptions {
/**
* Location to download the model to.
* Defaults to process.cwd(), the current working directory.
*/
modelPath?: string;
/**
* Debug mode; reports how long the download took, in seconds.
* @default false
*/
verbose?: boolean;
/**
* Remote download url. Defaults to `https://gpt4all.io/models/gguf/<modelName>`
* @default https://gpt4all.io/models/gguf/<modelName>
*/
url?: string;
/**
* MD5 sum of the model file. If this is provided, the downloaded file will be checked against this sum.
* If the sums do not match, an error will be thrown and the file will be deleted.
*/
md5sum?: string;
}
interface ListModelsOptions {
url?: string;
file?: string;
}
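/**
* Fetches the list of available model configurations, either from a remote url or a local file.
* @param {ListModelsOptions} [options] Where to read the model list from; defaults to the official remote model list.
* @returns {Promise<ModelConfig[]>} The available model configs.
* @example
* // A minimal sketch: read the default remote model list.
* const models = await listModels();
* console.log(models.length);
*/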
declare function listModels(
options?: ListModelsOptions
): Promise<ModelConfig[]>;
interface RetrieveModelOptions {
allowDownload?: boolean;
verbose?: boolean;
modelPath?: string;
modelConfigFile?: string;
}
declare function retrieveModel(
modelName: string,
options?: RetrieveModelOptions
): Promise<ModelConfig>;
/**
* Model download controller.
*/
interface DownloadController {
/** Cancel the request to download if this is called. */
cancel: () => void;
* A promise resolving to the downloaded model's config once the download is done.
promise: Promise<ModelConfig>;
}
export {
LLModel,
LLModelPromptContext,
ModelConfig,
InferenceModel,
InferenceResult,
EmbeddingModel,
EmbeddingResult,
ChatSession,
ChatMessage,
CompletionInput,
CompletionProvider,
CompletionOptions,
CompletionResult,
LoadModelOptions,
DownloadController,
RetrieveModelOptions,
DownloadModelOptions,
GpuDevice,
loadModel,
downloadModel,
retrieveModel,
listModels,
createCompletion,
createCompletionStream,
createCompletionGenerator,
createEmbedding,
DEFAULT_DIRECTORY,
DEFAULT_LIBRARIES_DIRECTORY,
DEFAULT_MODEL_CONFIG,
DEFAULT_PROMPT_CONTEXT,
DEFAULT_MODEL_LIST_URL,
};