inference-server
Version:
Libraries and a server for building AI applications. Adapters to various native bindings enable local inference. Integrate it into your application, or run it as a microservice.
329 lines • 12.6 kB
JavaScript
import path from 'node:path';
import { builtInEngineNames } from './engines/index.js';
import { ModelPool } from './pool.js';
import { ModelStore } from './store.js';
import { createSublogger, LogLevels } from './lib/logger.js';
import { resolveModelFileLocation } from './lib/resolveModelFileLocation.js';
import { validateModelOptions } from './lib/validateModelOptions.js';
import { getCacheDirPath } from './lib/getCacheDirPath.js';
/**
* Represents a server for managing and serving machine learning models, including model initialization,
 * file downloads, request handling, and task processing. The example below starts an inference server
 * using llama.cpp as the engine, serving the text-completion model smollm with up to two instances.
*
* @class InferenceServer
* @example
* const inferenceServer = new InferenceServer({
* log: 'info',
* concurrency: 2,
* models: {
* 'smollm-135m': {
* task: 'text-completion',
* url: 'https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/blob/main/smollm-135m-instruct-add-basics-q8_0.gguf',
* engine: 'node-llama-cpp',
* maxInstances: 2,
* },
* })
* inferenceServer.start()
*/
export class InferenceServer {
/** @property {ModelPool} pool - A pool for managing model instances and concurrency. */
pool;
/** @property {ModelStore} store - A store for managing model metadata, preparation, and storage. */
store;
/** @property {Record<string, ModelEngine>} engines - A record of engines (custom and built-in) used for processing tasks. */
engines = {};
/** @property {Logger} log - Logger for tracking the server's activities and errors. */
log;
/**
     * Constructs an `InferenceServer` instance with the specified options.
* @param {InferenceServerOptions} options - Configuration options for the server.
*/
constructor(options) {
this.log = createSublogger(options.log);
let modelsCachePath = getCacheDirPath('models');
if (options.cachePath) {
modelsCachePath = path.join(options.cachePath, 'models');
}
const modelsWithDefaults = {};
const usedEngines = [];
for (const modelId in options.models) {
const modelOptions = options.models[modelId];
const isBuiltIn = builtInEngineNames.includes(modelOptions.engine);
if (isBuiltIn) {
const builtInModelOptions = modelOptions;
// can validate and resolve location of model files if a built-in engine is used
validateModelOptions(modelId, builtInModelOptions);
modelsWithDefaults[modelId] = {
id: modelId,
minInstances: 0,
maxInstances: 1,
modelsCachePath,
location: resolveModelFileLocation({
url: builtInModelOptions.url,
filePath: builtInModelOptions.location,
modelsCachePath,
}),
...builtInModelOptions,
};
}
else {
const customEngineOptions = modelOptions;
modelsWithDefaults[modelId] = {
id: modelId,
minInstances: 0,
maxInstances: 1,
modelsCachePath,
...customEngineOptions,
};
}
usedEngines.push({
model: modelId,
engine: modelOptions.engine,
});
}
const customEngines = Object.keys(options.engines ?? {});
for (const ref of usedEngines) {
const isBuiltIn = builtInEngineNames.includes(ref.engine);
const isCustom = customEngines.includes(ref.engine);
if (!isBuiltIn && !isCustom) {
throw new Error(`Engine "${ref.engine}" used by model "${ref.model}" does not exist`);
}
if (isCustom) {
this.engines[ref.engine] = options.engines[ref.engine];
}
}
this.store = new ModelStore({
log: this.log,
// TODO expose this? or remove it?
// prepareConcurrency: 2,
models: modelsWithDefaults,
modelsCachePath,
});
this.pool = new ModelPool({
log: this.log,
concurrency: options.concurrency ?? 1,
models: modelsWithDefaults,
}, this.prepareInstance.bind(this));
}
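    /**
     * Checks whether a model with the given id is configured on this server.
     * @param {string} modelId - The id of the model to look up.
     * @returns {boolean} True if the model exists in the pool configuration.
     */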
modelExists(modelId) {
return !!this.pool.config.models[modelId];
}
/**
* Starts the inference server, initializing engines and preparing the model store and pool.
* @returns {Promise<void>} Resolves when the server is fully started.
*/
async start() {
this.log(LogLevels.info, 'Starting inference server ...');
const engineStartPromises = [];
        // call start() on any custom engines that provide it
for (const [key, methods] of Object.entries(this.engines)) {
if (methods.start) {
engineStartPromises.push(methods.start(this));
}
}
// import built-in engines
for (const key of builtInEngineNames) {
// skip unused engines
const modelUsingEngine = Object.keys(this.store.models).find((modelId) => this.store.models[modelId].engine === key);
if (!modelUsingEngine) {
continue;
}
            engineStartPromises.push(import(`./engines/${key}/engine.js`).then((engine) => {
                this.engines[key] = engine;
                return { key, engine };
            }));
}
this.log(LogLevels.debug, 'Waiting for engines to start ...');
await Promise.all(engineStartPromises);
this.log(LogLevels.debug, `Engines started: ${Object.keys(this.engines).join(', ')}`);
await Promise.all([this.store.init(this.engines), this.pool.init(this.engines)]);
this.log(LogLevels.info, 'Inference server started');
}
    /**
     * Stops the server, disposes all resources, and clears the queue of pending tasks.
     **/
async stop() {
this.log(LogLevels.info, 'Stopping inference server ...');
this.pool.queue.clear();
this.store.dispose();
try {
await this.pool.dispose(); // might cause abort errors when there are still running tasks
}
catch (err) {
            this.log(LogLevels.error, 'Error while stopping inference server', err);
}
this.log(LogLevels.debug, 'Waiting for running tasks to finish');
// wait until all running tasks are cancelled
await this.pool.queue.onIdle();
        this.log(LogLevels.debug, 'Inference server stopped');
}
/**
* Requests an available model instance from the pool for a specific task.
* Use this for manual control over when to release the instance back to the pool.
* @param {InferenceParams} args - The inference task arguments.
* @param {AbortSignal} [signal] - An optional signal to abort the request.
     * @returns {Promise<Object>} A lock containing the acquired model instance (`instance`) and a `release()` method to return it to the pool.
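     * @example
     * // Sketch of manual instance handling. The model id and task parameters are
     * // illustrative; they assume a configured text-completion model.
     * const lock = await inferenceServer.requestInstance({
     *     model: 'smollm-135m',
     *     task: 'text-completion',
     * })
     * const task = lock.instance.processTextCompletionTask({ prompt: 'Hello' })
     * const result = await task.result
     * await lock.release()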
*/
requestInstance(args, signal) {
return this.pool.requestInstance(args, signal);
}
// gets called by the pool right before a new instance is created
async prepareInstance(instance, signal) {
const model = instance.config;
const modelStoreStatus = this.store.models[model.id].status;
if (modelStoreStatus === 'unloaded') {
await this.store.prepareModel(model.id, signal);
}
if (modelStoreStatus === 'preparing') {
const modelReady = new Promise((resolve, reject) => {
const onCompleted = async (storeModel) => {
if (storeModel.id === model.id) {
this.store.prepareQueue.off('completed', onCompleted);
if (storeModel.status === 'ready') {
resolve();
}
                        else {
                            reject(new Error(`Model "${model.id}" failed to prepare`));
                        }
}
};
this.store.prepareQueue.on('completed', onCompleted);
});
await modelReady;
}
}
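    /**
     * Acquires a model instance for the given task, runs the task, and releases the instance
     * back to the pool once a result is available.
     * @param {InferenceParams} args - The inference task arguments, including the task type.
     * @returns {Promise<*>} The result of the completed task.
     * @example
     * // Illustrative only; the argument shape beyond `task` depends on the task type and engine.
     * const result = await inferenceServer.processTask({
     *     task: 'text-completion',
     *     model: 'smollm-135m',
     *     prompt: 'The capital of France is',
     * })
     */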
async processTask(args) {
const lock = await this.requestInstance(args);
        const waitForResult = async (task) => {
            try {
                return await task.result;
            }
            finally {
                // release the instance even if the task fails
                await lock.release();
            }
        };
if (args.task === 'text-completion') {
const task = lock.instance.processTextCompletionTask(args);
return await waitForResult(task);
}
if (args.task === 'chat-completion') {
const task = lock.instance.processChatCompletionTask(args);
return await waitForResult(task);
}
if (args.task === 'embedding') {
const task = lock.instance.processEmbeddingTask(args);
return await waitForResult(task);
}
if (args.task === 'image-to-text') {
const task = lock.instance.processImageToTextTask(args);
return await waitForResult(task);
}
if (args.task === 'speech-to-text') {
const task = lock.instance.processSpeechToTextTask(args);
return await waitForResult(task);
}
if (args.task === 'text-to-speech') {
const task = lock.instance.processTextToSpeechTask(args);
return await waitForResult(task);
}
if (args.task === 'text-to-image') {
const task = lock.instance.processTextToImageTask(args);
return await waitForResult(task);
}
if (args.task === 'image-to-image') {
const task = lock.instance.processImageToImageTask(args);
return await waitForResult(task);
}
if (args.task === 'object-detection') {
const task = lock.instance.processObjectDetectionTask(args);
return await waitForResult(task);
}
if (args.task === 'text-classification') {
const task = lock.instance.processTextClassificationTask(args);
return await waitForResult(task);
}
// @ts-expect-error
throw new Error(`Unknown task type: ${args.task}`);
}
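    /**
     * Convenience wrapper around processTask for chat-completion requests.
     * @example
     * // Sketch only; the message format is an assumption based on typical chat-completion APIs.
     * const completion = await inferenceServer.processChatCompletionTask({
     *     model: 'smollm-135m',
     *     messages: [{ role: 'user', content: 'Write a haiku about servers.' }],
     * })
     */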
processChatCompletionTask(args) {
return this.processTask({
task: 'chat-completion',
...args,
});
}
processTextCompletionTask(args) {
return this.processTask({
task: 'text-completion',
...args,
});
}
processEmbeddingTask(args) {
return this.processTask({
task: 'embedding',
...args,
});
}
processImageToTextTask(args) {
return this.processTask({
task: 'image-to-text',
...args,
});
}
processSpeechToTextTask(args) {
return this.processTask({
task: 'speech-to-text',
...args,
});
}
processTextToSpeechTask(args) {
return this.processTask({
task: 'text-to-speech',
...args,
});
}
processTextToImageTask(args) {
return this.processTask({
task: 'text-to-image',
...args,
});
}
processImageToImageTask(args) {
return this.processTask({
task: 'image-to-image',
...args,
});
}
processObjectDetectionTask(args) {
return this.processTask({
task: 'object-detection',
...args,
});
}
processTextClassificationTask(args) {
return this.processTask({
task: 'text-classification',
...args,
});
}
/**
* Retrieves the current status of the model server, including pool and store status.
*
* @returns {Object} The status object containing pool and store information.
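     *
     * @example
     * // Inspect pool and store state at runtime.
     * const status = inferenceServer.getStatus()
     * console.log(status.pool, status.store)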
*/
getStatus() {
const poolStatus = this.pool.getStatus();
const storeStatus = this.store.getStatus();
return {
pool: poolStatus,
store: storeStatus,
};
}
}
//# sourceMappingURL=server.js.map