inference-server
Libraries and server to build AI applications. Adapters to various native bindings enable local inference. Integrate it with your application, or use it as a microservice.
import { ModelPool } from './pool.js';
import { ModelStore } from './store.js';
import {
    ModelOptions, InferenceParams, ModelEngine,
    ChatCompletionTaskArgs, TextCompletionTaskArgs, EmbeddingTaskArgs,
    ImageToTextTaskArgs, TextToSpeechTaskArgs, SpeechToTextTaskArgs,
    TextToImageTaskArgs, ImageToImageTaskArgs, ObjectDetectionTaskArgs,
    TextClassificationTaskArgs,
    ChatCompletionTaskResult, TextCompletionTaskResult, EmbeddingTaskResult,
    ImageToTextTaskResult, TextToSpeechTaskResult, SpeechToTextTaskResult,
    TextToImageTaskResult, ImageToImageTaskResult, ObjectDetectionTaskResult,
    TextClassificationTaskResult,
    ChatCompletionInferenceTaskArgs, TextCompletionInferenceTaskArgs, EmbeddingInferenceTaskArgs,
    ImageToTextInferenceTaskArgs, SpeechToTextInferenceTaskArgs, TextToSpeechInferenceTaskArgs,
    TextToImageInferenceTaskArgs, ImageToImageInferenceTaskArgs, ObjectDetectionInferenceTaskArgs,
    TextClassificationInferenceTaskArgs,
} from './types/index.js';
import { Logger, LogLevel } from './lib/logger.js';
/**
* Configuration options for initializing an `InferenceServer`.
* @interface InferenceServerOptions
*/
export interface InferenceServerOptions {
/**
* A record of custom engines to be used for processing tasks. Each engine is identified by a unique name.
* @type {Record<string, ModelEngine>}
* @optional
*/
engines?: Record<string, ModelEngine>;
/**
* A record of model configurations, where each model is identified by a unique, user-defined ID.
* @type {Record<string, ModelOptions>}
*/
models: Record<string, ModelOptions>;
/**
* The maximum number of concurrent tasks allowed in the model pool.
* @type {number}
* @optional
*/
concurrency?: number;
/**
* The path to the cache directory where model files and related data will be stored.
* @type {string}
* @optional
*/
cachePath?: string;
/**
* A logger instance or log level to control logging for the server. If a log level is provided,
* a default logger will be created with that level.
* @type {Logger | LogLevel}
* @optional
*/
log?: Logger | LogLevel;
}
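/**
 * Example (illustrative sketch): an options object exercising the optional
 * fields declared above. The model configuration mirrors the class example
 * below; the cache path is a placeholder.
 *
 * const options: InferenceServerOptions = {
 *     log: 'info',
 *     concurrency: 2,
 *     cachePath: './.cache/models',
 *     models: {
 *         'smollm-135m': {
 *             task: 'text-completion',
 *             url: 'https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/blob/main/smollm-135m-instruct-add-basics-q8_0.gguf',
 *             engine: 'node-llama-cpp',
 *             maxInstances: 2,
 *         },
 *     },
 * }
 */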
/**
* Represents a server for managing and serving machine learning models, including model initialization,
* file downloads, request handling, and task processing. The example below starts an inference server
* using llama.cpp as the engine, serving the text-completion task with up to two instances of smollm.
*
* @class InferenceServer
* @example
* const inferenceServer = new InferenceServer({
* log: 'info',
* concurrency: 2,
* models: {
* 'smollm-135m': {
* task: 'text-completion',
* url: 'https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/blob/main/smollm-135m-instruct-add-basics-q8_0.gguf',
* engine: 'node-llama-cpp',
* maxInstances: 2,
* },
* })
* inferenceServer.start()
*/
export declare class InferenceServer {
/** @property {ModelPool} pool - A pool for managing model instances and concurrency. */
pool: ModelPool;
/** @property {ModelStore} store - A store for managing model metadata, preparation, and storage. */
store: ModelStore;
/** @property {Record<string, ModelEngine>} engines - A record of engines (custom and built-in) used for processing tasks. */
engines: Record<string, ModelEngine>;
/** @property {Logger} log - Logger for tracking the server's activities and errors. */
log: Logger;
/**
* Constructs an `InferenceServer` instance with the specified options.
* @param {InferenceServerOptions} options - Configuration options for the server.
*/
constructor(options: InferenceServerOptions);
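/**
 * Checks whether a model with the given ID is configured on this server.
 * @param {string} modelId - The user-defined model ID.
 * @returns {boolean} Whether the model exists.
 */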
modelExists(modelId: string): boolean;
/**
* Starts the inference server, initializing engines and preparing the model store and pool.
* @returns {Promise<void>} Resolves when the server is fully started.
*/
start(): Promise<void>;
/**
 * Stops the server, disposes all resources, and clears the task queue.
 */
stop(): Promise<void>;
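/**
 * Example (illustrative sketch): a typical lifecycle, starting the server and
 * shutting it down on SIGINT. `inferenceServer` refers to an instance created
 * as in the class example above.
 *
 * await inferenceServer.start()
 * process.on('SIGINT', async () => {
 *     await inferenceServer.stop()
 *     process.exit(0)
 * })
 */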
/**
* Requests an available model instance from the pool for a specific task.
* Use this for manual control over when to release the instance back to the pool.
* @param {InferenceParams} args - The inference task arguments.
* @param {AbortSignal} [signal] - An optional signal to abort the request.
* @returns {Promise<ModelInstance>} A model instance that can fulfill the task.
*/
requestInstance(args: InferenceParams, signal?: AbortSignal): Promise<import("./pool.js").ModelInstanceHandle>;
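/**
 * Example (illustrative sketch): manual instance management. The argument shape
 * mirrors the class example's model ID, and the `release` member used here is
 * an assumption about `ModelInstanceHandle`; consult `./pool.js` for the
 * actual interface.
 *
 * const handle = await inferenceServer.requestInstance({ model: 'smollm-135m' })
 * try {
 *     // Run one or more tasks against the acquired instance here.
 * } finally {
 *     await handle.release() // assumed member of ModelInstanceHandle
 * }
 */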
private prepareInstance;
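/**
 * Processes a single inference task and resolves with its result. The overload
 * that applies is determined by the task type of the given arguments; unlike
 * `requestInstance`, instance acquisition and release are handled internally.
 */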
processTask(args: TextCompletionInferenceTaskArgs): Promise<TextCompletionTaskResult>;
processTask(args: ChatCompletionInferenceTaskArgs): Promise<ChatCompletionTaskResult>;
processTask(args: EmbeddingInferenceTaskArgs): Promise<EmbeddingTaskResult>;
processTask(args: ImageToTextInferenceTaskArgs): Promise<ImageToTextTaskResult>;
processTask(args: SpeechToTextInferenceTaskArgs): Promise<SpeechToTextTaskResult>;
processTask(args: TextToSpeechInferenceTaskArgs): Promise<TextToSpeechTaskResult>;
processTask(args: TextToImageInferenceTaskArgs): Promise<TextToImageTaskResult>;
processTask(args: ImageToImageInferenceTaskArgs): Promise<ImageToImageTaskResult>;
processTask(args: ObjectDetectionInferenceTaskArgs): Promise<ObjectDetectionTaskResult>;
processTask(args: TextClassificationInferenceTaskArgs): Promise<TextClassificationTaskResult>;
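/**
 * Example (illustrative sketch): dispatching a text completion through the
 * overloaded `processTask`. The exact argument shape is declared by
 * `TextCompletionInferenceTaskArgs`; the `task`, `model`, and `prompt` keys
 * shown here are assumptions based on the type names and the class example.
 *
 * const completion = await inferenceServer.processTask({
 *     task: 'text-completion',
 *     model: 'smollm-135m',
 *     prompt: 'The sky is',
 * })
 */
/**
 * Task-specific variants of `processTask`. Each accepts the corresponding
 * task arguments and resolves with that task's result type.
 */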
processChatCompletionTask(args: ChatCompletionTaskArgs): Promise<ChatCompletionTaskResult>;
processTextCompletionTask(args: TextCompletionTaskArgs): Promise<TextCompletionTaskResult>;
processEmbeddingTask(args: EmbeddingTaskArgs): Promise<EmbeddingTaskResult>;
processImageToTextTask(args: ImageToTextTaskArgs): Promise<ImageToTextTaskResult>;
processSpeechToTextTask(args: SpeechToTextTaskArgs): Promise<SpeechToTextTaskResult>;
processTextToSpeechTask(args: TextToSpeechTaskArgs): Promise<TextToSpeechTaskResult>;
processTextToImageTask(args: TextToImageTaskArgs): Promise<TextToImageTaskResult>;
processImageToImageTask(args: ImageToImageTaskArgs): Promise<ImageToImageTaskResult>;
processObjectDetectionTask(args: ObjectDetectionTaskArgs): Promise<ObjectDetectionTaskResult>;
processTextClassificationTask(args: TextClassificationTaskArgs): Promise<TextClassificationTaskResult>;
/**
* Retrieves the current status of the inference server, including pool and store status.
*
* @returns {Object} The status object containing pool and store information.
*/
getStatus(): {
pool: {
processing: number;
pending: number;
instances: {
[k: string]: {
model: string;
status: "error" | "preparing" | "idle" | "busy" | "loading";
engine: (string & {}) | import("./engines/index.js").BuiltInEngineName;
device: string;
contextState: string | undefined;
lastUsed: string;
};
};
};
store: {
[k: string]: {
engine: (string & {}) | import("./engines/index.js").BuiltInEngineName;
device: {
gpu?: boolean | "auto" | (string & {});
} | undefined;
minInstances: number;
maxInstances: number;
status: "error" | "unloaded" | "preparing" | "ready";
downloads: any;
};
};
};
}
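/**
 * Example (illustrative sketch): inspecting server load via the status object
 * declared above. All fields used here appear in the `getStatus` return type.
 *
 * const status = inferenceServer.getStatus()
 * console.log(`processing=${status.pool.processing} pending=${status.pool.pending}`)
 * for (const [modelId, model] of Object.entries(status.store)) {
 *     console.log(`${modelId}: ${model.status} (${model.minInstances}-${model.maxInstances} instances)`)
 * }
 */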