inference-server
Libraries and a server for building AI applications. Adapters to various native bindings enable local inference. Integrate it into your application, or run it as a microservice.
import { LlamaChat, LlamaModel, LlamaContext, LlamaCompletion, LlamaContextSequence, LlamaGrammar, ChatHistoryItem, LlamaChatResponse, LlamaEmbeddingContext, GgufFileInfo, LLamaChatContextShiftOptions, LlamaContextOptions, ChatWrapper } from 'node-llama-cpp';
import { ChatCompletionTaskResult, TextCompletionTaskResult, EngineContext, ToolDefinition, EmbeddingTaskResult, FileDownloadProgress, ModelConfig, TextCompletionGrammar, ChatMessage, EngineTaskContext, EngineTextCompletionTaskContext, TextCompletionParamsBase, ChatCompletionTaskArgs, TextCompletionTaskArgs, EmbeddingTaskArgs } from '../../types/index.js';
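/**
 * Runtime state held for a loaded model: the node-llama-cpp model, context and context sequence,
 * plus optional chat, completion and embedding helpers, compiled grammars, pending tool calls,
 * the chat history carried between turns, and the last chat evaluation state.
 */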
export interface NodeLlamaCppInstance {
    model: LlamaModel;
    context: LlamaContext;
    chat?: LlamaChat;
    chatHistory: ChatHistoryItem[];
    grammars: Record<string, LlamaGrammar>;
    pendingFunctionCalls: Record<string, any>;
    lastEvaluation?: LlamaChatResponse['lastEvaluation'];
    embeddingContext?: LlamaEmbeddingContext;
    completion?: LlamaCompletion;
    contextSequence: LlamaContextSequence;
    chatWrapper?: ChatWrapper;
}
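/** GGUF file metadata reported for a loaded model. */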
export interface NodeLlamaCppModelMeta {
    gguf: GgufFileInfo;
}
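/**
 * Engine-specific model configuration. `location` identifies the GGUF model file; the optional
 * fields tune grammars, tool calling, initial chat state, context and batch sizes, LoRA adapters,
 * context shifting and device usage.
 *
 * @example
 * A minimal sketch (values are hypothetical; any required fields inherited from `ModelConfig`
 * are omitted because they are not shown in this file):
 * ```ts
 * const config: Partial<NodeLlamaCppModelConfig> = {
 *     location: '/models/example-7b.Q4_K_M.gguf', // hypothetical local path
 *     contextSize: 4096,
 *     batchSize: 512,
 *     device: {
 *         gpu: 'auto',   // let node-llama-cpp choose a backend
 *         gpuLayers: 33, // offload layers to the GPU
 *     },
 * };
 * ```
 */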
export interface NodeLlamaCppModelConfig extends ModelConfig {
    location: string;
    grammars?: Record<string, TextCompletionGrammar>;
    sha256?: string;
    completionDefaults?: TextCompletionParamsBase;
    initialMessages?: ChatMessage[];
    prefix?: string;
    tools?: {
        definitions?: Record<string, ToolDefinition>;
        documentParams?: boolean;
        maxParallelCalls?: number;
    };
    contextSize?: number;
    batchSize?: number;
    lora?: LlamaContextOptions['lora'];
    contextShiftStrategy?: LLamaChatContextShiftOptions['strategy'];
    chatWrapper?: ChatWrapper;
    device?: {
        gpu?: boolean | 'auto' | (string & {});
        gpuLayers?: number;
        cpuThreads?: number;
        memLock?: boolean;
    };
}
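/** Presumably signals to the host that this engine can select a GPU backend automatically. */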
export declare const autoGpu = true;
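/*
 * Engine entry points: prepareModel resolves the model file (reporting download progress),
 * createInstance loads it and builds the llama.cpp context, the process*Task functions run chat
 * completion, text completion and embedding requests against an instance, and disposeInstance
 * releases its resources.
 */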
export declare function prepareModel({ config, log }: EngineContext<NodeLlamaCppModelConfig>, onProgress?: (progress: FileDownloadProgress) => void, signal?: AbortSignal): Promise<{
    gguf?: any;
} | undefined>;
export declare function createInstance({ config, log }: EngineContext<NodeLlamaCppModelConfig>, signal?: AbortSignal): Promise<NodeLlamaCppInstance>;
export declare function disposeInstance(instance: NodeLlamaCppInstance): Promise<void>;
export declare function processChatCompletionTask(task: ChatCompletionTaskArgs, ctx: EngineTextCompletionTaskContext<NodeLlamaCppInstance, NodeLlamaCppModelConfig, NodeLlamaCppModelMeta>, signal?: AbortSignal): Promise<ChatCompletionTaskResult>;
export declare function processTextCompletionTask(task: TextCompletionTaskArgs, ctx: EngineTextCompletionTaskContext<NodeLlamaCppInstance, NodeLlamaCppModelConfig, NodeLlamaCppModelMeta>, signal?: AbortSignal): Promise<TextCompletionTaskResult>;
export declare function processEmbeddingTask(task: EmbeddingTaskArgs, ctx: EngineTaskContext<NodeLlamaCppInstance, NodeLlamaCppModelConfig, NodeLlamaCppModelMeta>, signal?: AbortSignal): Promise<EmbeddingTaskResult>;
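/*
 * A usage sketch of the lifecycle, assuming a host that supplies the context objects. The exact
 * shapes of EngineContext, ChatCompletionTaskArgs and EngineTextCompletionTaskContext are defined
 * in ../../types and are not shown in this file, so they are left as opaque placeholders here:
 *
 *     declare const ctx: EngineContext<NodeLlamaCppModelConfig>;
 *     declare const chatTask: ChatCompletionTaskArgs;
 *     declare const taskCtx: EngineTextCompletionTaskContext<
 *         NodeLlamaCppInstance, NodeLlamaCppModelConfig, NodeLlamaCppModelMeta>;
 *
 *     await prepareModel(ctx, (progress) => console.log(progress)); // resolve/verify the GGUF file
 *     const instance = await createInstance(ctx);                   // load the model, create the context
 *     const result = await processChatCompletionTask(chatTask, taskCtx);
 *     await disposeInstance(instance);                              // free llama.cpp resources
 */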