
inference-server


Libraries and a server for building AI applications. Adapters for various native bindings enable local inference. Integrate it into your application, or run it as a microservice.
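Below is a minimal sketch of wiring the server into an application, assembled from the type declarations that follow. The model configuration mirrors the JSDoc @example in the class documentation; importing the class from the package root and the shape of the arguments passed to processTextCompletionTask (a model id plus a prompt) are assumptions, since TextCompletionTaskArgs is only referenced here by name.

// Sketch: starting the server and running a text completion (illustrative, not a verbatim copy of the package docs).
import { InferenceServer } from 'inference-server' // assumes the package root re-exports the class

const inferenceServer = new InferenceServer({
    log: 'info',
    concurrency: 2,
    models: {
        'smollm-135m': {
            task: 'text-completion',
            url: 'https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/blob/main/smollm-135m-instruct-add-basics-q8_0.gguf',
            engine: 'node-llama-cpp',
            maxInstances: 2,
        },
    },
})

await inferenceServer.start()

// Assumed shape of TextCompletionTaskArgs: a model id plus a prompt.
const result = await inferenceServer.processTextCompletionTask({
    model: 'smollm-135m',
    prompt: 'The sky is',
})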

import { ModelPool } from './pool.js';
import { ModelStore } from './store.js';
import { ModelOptions, InferenceParams, ModelEngine, ChatCompletionTaskArgs, TextCompletionTaskArgs, EmbeddingTaskArgs, ImageToTextTaskArgs, TextToSpeechTaskArgs, SpeechToTextTaskArgs, TextToImageTaskArgs, ImageToImageTaskArgs, ObjectDetectionTaskArgs, ChatCompletionTaskResult, TextCompletionTaskResult, EmbeddingTaskResult, TextToImageTaskResult, TextToSpeechTaskResult, SpeechToTextTaskResult, ImageToTextTaskResult, ImageToImageTaskResult, ObjectDetectionTaskResult, ChatCompletionInferenceTaskArgs, TextCompletionInferenceTaskArgs, EmbeddingInferenceTaskArgs, ImageToTextInferenceTaskArgs, SpeechToTextInferenceTaskArgs, TextToSpeechInferenceTaskArgs, TextToImageInferenceTaskArgs, ImageToImageInferenceTaskArgs, ObjectDetectionInferenceTaskArgs, TextClassificationTaskArgs, TextClassificationTaskResult, TextClassificationInferenceTaskArgs } from './types/index.js';
import { Logger, LogLevel } from './lib/logger.js';
/**
 * Configuration options for initializing an `InferenceServer`.
 * @interface InferenceServerOptions
 */
export interface InferenceServerOptions {
    /**
     * A record of custom engines to be used for processing tasks. Each engine is identified by a unique name.
     * @type {Record<string, ModelEngine>}
     * @optional
     */
    engines?: Record<string, ModelEngine>;
    /**
     * A record of model configurations, where each model is identified by a unique ID, defined by the user.
     * @type {Record<string, ModelOptions>}
     */
    models: Record<string, ModelOptions>;
    /**
     * The maximum number of concurrent tasks allowed in the model pool.
     * @type {number}
     * @optional
     */
    concurrency?: number;
    /**
     * The path to the cache directory where model files and related data will be stored.
     * @type {string}
     * @optional
     */
    cachePath?: string;
    /**
     * A logger instance or log level to control logging for the server. If a log level is provided,
     * a default logger will be created with that level.
     * @type {Logger | LogLevel}
     * @optional
     */
    log?: Logger | LogLevel;
}
/**
 * Represents a server for managing and serving machine learning models, including model initialization,
 * file downloads, request handling, and task processing. The example below starts an inference server
 * using llama.cpp as the engine, serving up to two instances of smollm for text completion.
 *
 * @class InferenceServer
 * @example
 * const inferenceServer = new InferenceServer({
 *   log: 'info',
 *   concurrency: 2,
 *   models: {
 *     'smollm-135m': {
 *       task: 'text-completion',
 *       url: 'https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/blob/main/smollm-135m-instruct-add-basics-q8_0.gguf',
 *       engine: 'node-llama-cpp',
 *       maxInstances: 2,
 *     },
 *   },
 * })
 * inferenceServer.start()
 */
export declare class InferenceServer {
    /** @property {ModelPool} pool - A pool for managing model instances and concurrency. */
    pool: ModelPool;
    /** @property {ModelStore} store - A store for managing model metadata, preparation, and storage. */
    store: ModelStore;
    /** @property {Record<string, ModelEngine>} engines - A record of engines (custom and built-in) used for processing tasks. */
    engines: Record<string, ModelEngine>;
    /** @property {Logger} log - Logger for tracking the server's activities and errors. */
    log: Logger;
    /**
     * Constructs an `InferenceServer` instance with the specified options.
     * @param {InferenceServerOptions} options - Configuration options for the server.
     */
    constructor(options: InferenceServerOptions);
    modelExists(modelId: string): boolean;
    /**
     * Starts the inference server, initializing engines and preparing the model store and pool.
     * @returns {Promise<void>} Resolves when the server is fully started.
     */
    start(): Promise<void>;
    /**
     * Stops the server, disposes all resources, and clears the queue of outstanding tasks.
     */
    stop(): Promise<void>;
    /**
     * Requests an available model instance from the pool for a specific task.
     * Use this for manual control over when to release the instance back to the pool.
     * @param {InferenceParams} args - The inference task arguments.
     * @param {AbortSignal} [signal] - An optional signal to abort the request.
     * @returns {Promise<ModelInstance>} A model instance that can fulfill the task.
     */
    requestInstance(args: InferenceParams, signal?: AbortSignal): Promise<import("./pool.js").ModelInstanceHandle>;
    private prepareInstance;
    processTask(args: TextCompletionInferenceTaskArgs): Promise<TextCompletionTaskResult>;
    processTask(args: ChatCompletionInferenceTaskArgs): Promise<ChatCompletionTaskResult>;
    processTask(args: EmbeddingInferenceTaskArgs): Promise<EmbeddingTaskResult>;
    processTask(args: ImageToTextInferenceTaskArgs): Promise<ImageToTextTaskResult>;
    processTask(args: SpeechToTextInferenceTaskArgs): Promise<SpeechToTextTaskResult>;
    processTask(args: TextToSpeechInferenceTaskArgs): Promise<TextToSpeechTaskResult>;
    processTask(args: TextToImageInferenceTaskArgs): Promise<TextToImageTaskResult>;
    processTask(args: ImageToImageInferenceTaskArgs): Promise<ImageToImageTaskResult>;
    processTask(args: ObjectDetectionInferenceTaskArgs): Promise<ObjectDetectionTaskResult>;
    processTask(args: TextClassificationInferenceTaskArgs): Promise<TextClassificationTaskResult>;
    processChatCompletionTask(args: ChatCompletionTaskArgs): Promise<ChatCompletionTaskResult>;
    processTextCompletionTask(args: TextCompletionTaskArgs): Promise<TextCompletionTaskResult>;
    processEmbeddingTask(args: EmbeddingTaskArgs): Promise<EmbeddingTaskResult>;
    processImageToTextTask(args: ImageToTextTaskArgs): Promise<ImageToTextTaskResult>;
    processSpeechToTextTask(args: SpeechToTextTaskArgs): Promise<SpeechToTextTaskResult>;
    processTextToSpeechTask(args: TextToSpeechTaskArgs): Promise<TextToSpeechTaskResult>;
    processTextToImageTask(args: TextToImageTaskArgs): Promise<TextToImageTaskResult>;
    processImageToImageTask(args: ImageToImageTaskArgs): Promise<ImageToImageTaskResult>;
    processObjectDetectionTask(args: ObjectDetectionTaskArgs): Promise<ObjectDetectionTaskResult>;
    processTextClassificationTask(args: TextClassificationTaskArgs): Promise<TextClassificationTaskResult>;
    /**
     * Retrieves the current status of the model server, including pool and store status.
     *
     * @returns {Object} The status object containing pool and store information.
     */
    getStatus(): {
        pool: {
            processing: number;
            pending: number;
            instances: {
                [k: string]: {
                    model: string;
                    status: "error" | "preparing" | "idle" | "busy" | "loading";
                    engine: (string & {}) | import("./engines/index.js").BuiltInEngineName;
                    device: string;
                    contextState: string | undefined;
                    lastUsed: string;
                };
            };
        };
        store: {
            [k: string]: {
                engine: (string & {}) | import("./engines/index.js").BuiltInEngineName;
                device: {
                    gpu?: boolean | "auto" | (string & {});
                } | undefined;
                minInstances: number;
                maxInstances: number;
                status: "error" | "unloaded" | "preparing" | "ready";
                downloads: any;
            };
        };
    };
}
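The return type of getStatus() fully describes the status object, so it can back a lightweight readiness check. The sketch below only reads fields declared in that type and reuses the inferenceServer from the earlier sketch; shutting down via stop() clears outstanding tasks, as noted in its JSDoc.

// Sketch: a readiness check built on the declared getStatus() shape, followed by shutdown.
const status = inferenceServer.getStatus()
const allModelsReady = Object.values(status.store).every((model) => model.status === 'ready')
console.log('pending tasks:', status.pool.pending, 'processing:', status.pool.processing)

if (allModelsReady) {
    // ... safe to accept requests ...
}

await inferenceServer.stop()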