inference-server
Version:
Libraries and server to build AI applications. Adapters to various native bindings allowing local inference. Integrate it with your application, or use it as a microservice.
255 lines (254 loc) • 8.66 kB
TypeScript
import type { SomeJSONSchema } from 'ajv/dist/types/json-schema';
import type { ChatWrapper } from 'node-llama-cpp';
import type { BuiltInEngineName } from '../engines/index.js';
import { ChatMessage, ToolDefinition } from '../types/chat.js';
import type { ContextShiftStrategy } from '../engines/node-llama-cpp/types.js';
import type { StableDiffusionWeightType, StableDiffusionSamplingMethod, StableDiffusionSchedule } from '../engines/stable-diffusion-cpp/types.js';
import type { TransformersJsModelClass, TransformersJsTokenizerClass, TransformersJsProcessorClass, TransformersJsDataType } from '../engines/transformers-js/types.js';
import type { InferenceParams, TextCompletionParamsBase } from '../types/engine.js';
import type { TaskKind } from '../types/tasks.js';
export * from '../types/chat.js';
export * from '../types/engine.js';
export * from '../types/tasks.js';
/**
 * Options common to every model entry, regardless of which engine runs it.
 */
export interface ModelOptionsBase {
  /** Engine identifier: a built-in engine name, or any string for custom engines. */
  engine: BuiltInEngineName | (string & {});
  /** Task kind this model serves: a known task, or any string for custom tasks. */
  task: TaskKind | (string & {});
  /** How the model is prepared: eagerly and blocking, eagerly in the background, or lazily on first use. */
  prepare?: 'blocking' | 'async' | 'on-demand';
  /** Minimum number of instances to keep alive. */
  minInstances?: number;
  /** Maximum number of instances that may exist at once. */
  maxInstances?: number;
  /** Local path to the model — presumably on disk; confirm against the engine loaders. */
  location?: string;
}
/**
 * Options base for models run by one of the built-in engines:
 * narrows `engine`/`task` to the known unions and adds a download URL.
 */
export interface BuiltInModelOptionsBase extends ModelOptionsBase {
  /** Narrowed to the built-in engine names only. */
  engine: BuiltInEngineName;
  /** Narrowed to the known task kinds only. */
  task: TaskKind;
  /** URL to download the model from. */
  url?: string;
  // NOTE(review): re-declares `location` with the same type as the base — redundant but harmless.
  location?: string;
}
/**
 * Resolved model configuration base: options after defaults have been
 * applied, so instance bounds are required and an id/cache path exist.
 */
export interface ModelConfigBase extends ModelOptionsBase {
  /** Unique identifier for this model configuration. */
  id: string;
  /** Required here (optional in the options): minimum live instances. */
  minInstances: number;
  /** Required here (optional in the options): maximum live instances. */
  maxInstances: number;
  /** Directory where downloaded models are cached. */
  modelsCachePath: string;
}
/**
 * Full resolved configuration for a single model, covering fields from
 * any engine (the per-engine option interfaces narrow these).
 */
export interface ModelConfig extends ModelConfigBase {
  /** URL to download the model from. */
  url?: string;
  /** Local path to the model. */
  location?: string;
  // Re-widened from the base so custom tasks/engines remain representable.
  task: TaskKind | (string & {});
  engine: BuiltInEngineName | (string & {});
  /** Instance time-to-live — unit not visible in this file; confirm against the pool implementation. */
  ttl?: number;
  /** Text prefix — presumably prepended to completions; verify against engine usage. */
  prefix?: string;
  /** Initial chat messages (e.g. a system prompt) — assumed to seed new sessions; confirm. */
  initialMessages?: ChatMessage[];
  /** Device selection. */
  device?: {
    /** GPU usage: force on/off, 'auto' to detect, or a device identifier string. */
    gpu?: boolean | 'auto' | (string & {});
  };
}
/**
 * Raw decoded image: interleaved pixel data plus dimensions.
 */
export interface Image {
  /** Raw pixel bytes — layout (row-major, channel-interleaved) assumed; confirm with producers. */
  data: Buffer;
  /** Width in pixels. */
  width: number;
  /** Height in pixels. */
  height: number;
  /** Number of channels per pixel (e.g. 1 = grayscale, 3 = RGB, 4 = RGBA). */
  channels: 1 | 2 | 3 | 4;
}
/**
 * Raw decoded audio: PCM samples plus format information.
 */
export interface Audio {
  /** Sample rate in Hz. */
  sampleRate: number;
  /** Mono (1) or stereo (2). */
  channels: 1 | 2;
  /** PCM samples as 32-bit floats — channel interleaving not shown here; confirm with consumers. */
  samples: Float32Array;
}
/**
 * Bookkeeping metadata attached to each request dispatched to a model instance.
 */
export interface ModelRequestMeta {
  /** Monotonic request sequence number — assumed per-instance ordering; confirm with the scheduler. */
  sequence: number;
  /** Controller used to cancel the in-flight request. */
  abortController: AbortController;
}
/** Complete payload handed to a model instance: request metadata plus inference parameters. */
export type ModelInstanceRequest = ModelRequestMeta & InferenceParams;
/** Mixin narrowing a model's task to embedding generation. */
interface EmbeddingModelOptions {
  task: 'embedding';
}
/** Grammar for constrained generation: a raw grammar string (format defined by the engine) or a JSON schema. */
export type TextCompletionGrammar = string | SomeJSONSchema;
/**
 * Mixin with options shared by all text-completion-capable engines.
 */
interface TextCompletionModelOptions {
  task: 'text-completion';
  /** Context window size in tokens — unit assumed; confirm with the engine. */
  contextSize?: number;
  /** Named grammars available for constrained generation. */
  grammars?: Record<string, TextCompletionGrammar>;
  /** Default sampling/completion parameters applied when a request omits them. */
  completionDefaults?: TextCompletionParamsBase;
  /** Initial chat messages (e.g. a system prompt) — assumed to seed new sessions; confirm. */
  initialMessages?: ChatMessage[];
  /** Text prefix — presumably prepended to completions; verify against engine usage. */
  prefix?: string;
  /** How many tokens are processed per batch. */
  batchSize?: number;
}
/**
 * Base configuration options for models run via node-llama-cpp.
 */
interface LlamaCppModelOptionsBase extends BuiltInModelOptionsBase {
  /** Engine discriminator; must be 'node-llama-cpp'. */
  engine: 'node-llama-cpp';
  /** Task this model serves: text completion or embedding. */
  task: 'text-completion' | 'embedding';
  /** SHA-256 hash of the model file, used for verification. */
  sha256?: string;
  /** Batch size: how many tokens are processed simultaneously. */
  batchSize?: number;
  /**
   * Strategy for handling context-window shifts — determines how the
   * model manages context once it exceeds the maximum length.
   */
  contextShiftStrategy?: ContextShiftStrategy;
  /**
   * A ChatWrapper instance to use for templating conversation messages.
   * See https://node-llama-cpp.withcat.ai/guide/chat-wrapper
   */
  chatWrapper?: ChatWrapper;
  /** Configuration for tools the model may call and their execution. */
  tools?: {
    /** Tool definitions, keyed by tool name. */
    definitions: Record<string, ToolDefinition>;
    /** Whether to include parameter documentation in tool definitions. */
    documentParams?: boolean;
    /** Maximum number of parallel tool executions allowed. */
    maxParallelCalls?: number;
  };
  /** Device configuration for model execution. */
  device?: {
    /**
     * GPU usage:
     * - true / false: force GPU on or off
     * - 'auto': automatically detect and use a GPU if available
     * - string: a specific GPU device identifier
     */
    gpu?: boolean | 'auto' | (string & {});
    /** Number of layers to offload to the GPU; only applicable when GPU is enabled. */
    gpuLayers?: number;
    /** Number of CPU threads to use for computation. */
    cpuThreads?: number;
    /**
     * Lock memory to prevent swapping. Can improve performance but
     * requires appropriate system permissions.
     */
    memLock?: boolean;
  };
}
/** node-llama-cpp model configured for embedding generation. */
interface LlamaCppEmbeddingModelOptions extends LlamaCppModelOptionsBase, EmbeddingModelOptions {
  task: 'embedding';
}
/** node-llama-cpp model configured for text completion. */
export interface LlamaCppTextCompletionModelOptions extends LlamaCppModelOptionsBase, TextCompletionModelOptions {
  task: 'text-completion';
}
/**
 * Base configuration options for models run via the gpt4all engine.
 */
interface GPT4AllModelOptions extends BuiltInModelOptionsBase {
  /** Engine discriminator; must be 'gpt4all'. */
  engine: 'gpt4all';
  /** Task this model serves: text completion or embedding. */
  task: 'text-completion' | 'embedding';
  /** MD5 hash of the model file — presumably for verification (gpt4all uses MD5 rather than SHA-256). */
  md5?: string;
  /** Device configuration for model execution. */
  device?: {
    /** GPU usage: force on/off, 'auto' to detect, or a device identifier string. */
    gpu?: boolean | 'auto' | (string & {});
    /** Number of layers to offload to the GPU. */
    gpuLayers?: number;
    /** Number of CPU threads to use for computation. */
    cpuThreads?: number;
  };
}
/** gpt4all model configured for text completion. */
type GPT4AllTextCompletionModelOptions = TextCompletionModelOptions & GPT4AllModelOptions;
/** gpt4all model configured for embedding generation. */
type GPT4AllEmbeddingModelOptions = GPT4AllModelOptions & EmbeddingModelOptions;
/**
 * Per-model overrides for Transformers.js: which classes to instantiate
 * and where auxiliary artifacts live.
 */
export interface TransformersJsModel {
  /** Source of the processor artifact, by URL or file path. */
  processor?: {
    url?: string;
    file?: string;
  };
  /** Processor class to instantiate, or its name as a string. */
  processorClass?: TransformersJsProcessorClass | string;
  /** Tokenizer class to instantiate, or its name as a string. */
  tokenizerClass?: TransformersJsTokenizerClass | string;
  /** Model class to instantiate, or its name as a string. */
  modelClass?: TransformersJsModelClass | string;
  /** Weight data type, either one type for the whole model or per-module. */
  dtype?: Record<string, TransformersJsDataType> | TransformersJsDataType;
}
/**
 * Speaker embeddings keyed by speaker name: either already-loaded vectors,
 * or a URL/file source to load them from.
 */
export type SpeakerEmbeddings = Record<string, {
  url?: string;
  file?: string;
} | Float32Array>;
/**
 * Speech-synthesis extras for Transformers.js models: speaker voices
 * and the vocoder that turns spectrograms into audio.
 */
export interface TransformersJsSpeechModel {
  /** Available speaker voices. */
  speakerEmbeddings?: SpeakerEmbeddings;
  /** Vocoder model class to instantiate. */
  vocoderClass?: TransformersJsModelClass;
  /** Source of the vocoder artifact, by URL or file path. */
  vocoder?: {
    url?: string;
    file?: string;
  };
}
/**
 * Configuration options for models run via the transformers-js engine.
 * Top-level TransformersJsModel fields apply to the primary model; the
 * textModel/visionModel/speechModel fields configure sub-models — exact
 * precedence not visible here; confirm with the engine implementation.
 */
interface TransformersJsModelOptions extends BuiltInModelOptionsBase, TransformersJsModel, TransformersJsSpeechModel {
  /** Engine discriminator; must be 'transformers-js'. */
  engine: 'transformers-js';
  /** Tasks supported by this engine. */
  task: 'image-to-text' | 'speech-to-text' | 'text-to-speech' | 'text-completion' | 'chat-completion' | 'embedding' | 'object-detection' | 'text-classification';
  /** Overrides for the text sub-model. */
  textModel?: TransformersJsModel;
  /** Overrides for the vision sub-model. */
  visionModel?: TransformersJsModel;
  /** Overrides for the speech sub-model, including synthesis extras. */
  speechModel?: TransformersJsModel & TransformersJsSpeechModel;
  /** Device configuration. */
  device?: {
    /** GPU usage: force on/off, 'auto' to detect, or a device identifier string. */
    gpu?: boolean | 'auto' | (string & {});
  };
}
/**
 * Source of a single model artifact: remote URL and/or local file,
 * with an optional SHA-256 hash for verification.
 */
export interface ModelFileSource {
  /** URL to download the file from. */
  url?: string;
  /** Local file path. */
  file?: string;
  /** SHA-256 hash of the file, for verification. */
  sha256?: string;
}
/**
 * Configuration options for models run via the stable-diffusion-cpp engine.
 */
interface StableDiffusionModelOptions extends BuiltInModelOptionsBase {
  /** Engine discriminator; must be 'stable-diffusion-cpp'. */
  engine: 'stable-diffusion-cpp';
  // NOTE(review): 'image-to-text' looks unusual for a diffusion engine — confirm it is intended.
  task: 'image-to-text' | 'text-to-image' | 'image-to-image';
  /** SHA-256 hash of the model file, for verification. */
  sha256?: string;
  /** URL to download the model from. */
  url?: string;
  /** Whether the weights are a standalone diffusion model — exact semantics not visible here; confirm. */
  diffusionModel?: boolean;
  /** VAE weights. */
  vae?: ModelFileSource;
  /** CLIP-L text encoder weights. */
  clipL?: ModelFileSource;
  /** CLIP-G text encoder weights. */
  clipG?: ModelFileSource;
  /** T5-XXL text encoder weights. */
  t5xxl?: ModelFileSource;
  /** TAESD (tiny autoencoder) weights — assumed for fast decoding; confirm. */
  taesd?: ModelFileSource;
  /** ControlNet weights. */
  controlNet?: ModelFileSource;
  /** Sampling method to use. */
  samplingMethod?: StableDiffusionSamplingMethod;
  /** Weight quantization/precision type. */
  weightType?: StableDiffusionWeightType;
  /** Noise schedule. */
  schedule?: StableDiffusionSchedule;
  /** LoRA adapters to load. */
  loras?: ModelFileSource[];
}
/**
 * Options for models run by a user-supplied engine; intentionally adds
 * nothing beyond the shared base, since custom engines define their own fields.
 */
export interface CustomEngineModelOptions extends ModelOptionsBase {
}
/** Union of all option shapes accepted by the built-in engines. */
export type BuiltInModelOptions = LlamaCppTextCompletionModelOptions | LlamaCppEmbeddingModelOptions | GPT4AllTextCompletionModelOptions | GPT4AllEmbeddingModelOptions | TransformersJsModelOptions | StableDiffusionModelOptions;
/** Any accepted model options: a built-in engine's shape, or a custom engine's. */
export type ModelOptions = BuiltInModelOptions | CustomEngineModelOptions;