
inference-server

Libraries and a server for building AI applications. Adapters to various native bindings allow local inference. Integrate it with your application, or run it as a microservice.
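
A minimal usage sketch. The root import path and the request fields model and prompt are assumptions; the model configuration mirrors the JSDoc example in the source below:

import { InferenceServer } from 'inference-server'

const inferenceServer = new InferenceServer({
    log: 'info',
    concurrency: 2,
    models: {
        'smollm-135m': {
            task: 'text-completion',
            url: 'https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/blob/main/smollm-135m-instruct-add-basics-q8_0.gguf',
            engine: 'node-llama-cpp',
            maxInstances: 2,
        },
    },
})

await inferenceServer.start()
const result = await inferenceServer.processTextCompletionTask({
    model: 'smollm-135m',
    prompt: 'The opposite of hot is',
})
console.log(result)
await inferenceServer.stop()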

import path from 'node:path';
import { builtInEngineNames } from './engines/index.js';
import { ModelPool } from './pool.js';
import { ModelStore } from './store.js';
import { createSublogger, LogLevels } from './lib/logger.js';
import { resolveModelFileLocation } from './lib/resolveModelFileLocation.js';
import { validateModelOptions } from './lib/validateModelOptions.js';
import { getCacheDirPath } from './lib/getCacheDirPath.js';
/**
 * Represents a server for managing and serving machine learning models, including model initialization,
 * file downloads, request handling, and task processing. The example provided starts an inference server
 * using llama.cpp as the engine, with the task of text-completion and two instances of smollm.
 *
 * @class InferenceServer
 * @example
 * const inferenceServer = new InferenceServer({
 *     log: 'info',
 *     concurrency: 2,
 *     models: {
 *         'smollm-135m': {
 *             task: 'text-completion',
 *             url: 'https://huggingface.co/HuggingFaceTB/smollm-135M-instruct-v0.2-Q8_0-GGUF/blob/main/smollm-135m-instruct-add-basics-q8_0.gguf',
 *             engine: 'node-llama-cpp',
 *             maxInstances: 2,
 *         },
 *     },
 * })
 * inferenceServer.start()
 */
export class InferenceServer {
    /** @property {ModelPool} pool - A pool for managing model instances and concurrency. */
    pool;
    /** @property {ModelStore} store - A store for managing model metadata, preparation, and storage. */
    store;
    /** @property {Record<string, ModelEngine>} engines - A record of engines (custom and built-in) used for processing tasks. */
    engines = {};
    /** @property {Logger} log - Logger for tracking the server's activities and errors. */
    log;
    /**
     * Constructs an `InferenceServer` instance with the specified options.
     * @param {InferenceServerOptions} options - Configuration options for the server.
     */
    constructor(options) {
        this.log = createSublogger(options.log);
        let modelsCachePath = getCacheDirPath('models');
        if (options.cachePath) {
            modelsCachePath = path.join(options.cachePath, 'models');
        }
        const modelsWithDefaults = {};
        const usedEngines = [];
        for (const modelId in options.models) {
            const modelOptions = options.models[modelId];
            const isBuiltIn = builtInEngineNames.includes(modelOptions.engine);
            if (isBuiltIn) {
                const builtInModelOptions = modelOptions;
                // can validate and resolve location of model files if a built-in engine is used
                validateModelOptions(modelId, builtInModelOptions);
                modelsWithDefaults[modelId] = {
                    id: modelId,
                    minInstances: 0,
                    maxInstances: 1,
                    modelsCachePath,
                    location: resolveModelFileLocation({
                        url: builtInModelOptions.url,
                        filePath: builtInModelOptions.location,
                        modelsCachePath,
                    }),
                    ...builtInModelOptions,
                };
            }
            else {
                const customEngineOptions = modelOptions;
                modelsWithDefaults[modelId] = {
                    id: modelId,
                    minInstances: 0,
                    maxInstances: 1,
                    modelsCachePath,
                    ...customEngineOptions,
                };
            }
            usedEngines.push({
                model: modelId,
                engine: modelOptions.engine,
            });
        }
        const customEngines = Object.keys(options.engines ?? {});
        for (const ref of usedEngines) {
            const isBuiltIn = builtInEngineNames.includes(ref.engine);
            const isCustom = customEngines.includes(ref.engine);
            if (!isBuiltIn && !isCustom) {
                throw new Error(`Engine "${ref.engine}" used by model "${ref.model}" does not exist`);
            }
            if (isCustom) {
                this.engines[ref.engine] = options.engines[ref.engine];
            }
        }
        this.store = new ModelStore({
            log: this.log,
            // TODO expose this? or remove it?
            // prepareConcurrency: 2,
            models: modelsWithDefaults,
            modelsCachePath,
        });
        this.pool = new ModelPool({
            log: this.log,
            concurrency: options.concurrency ?? 1,
            models: modelsWithDefaults,
        }, this.prepareInstance.bind(this));
    }
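    /** Returns whether a model with the given id is configured on this server. */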
    modelExists(modelId) {
        return !!this.pool.config.models[modelId];
    }
    /**
     * Starts the inference server, initializing engines and preparing the model store and pool.
     * @returns {Promise<void>} Resolves when the server is fully started.
     */
    async start() {
        this.log(LogLevels.info, 'Starting inference server ...');
        const engineStartPromises = [];
        // call startEngine on custom engines
        for (const [key, methods] of Object.entries(this.engines)) {
            if (methods.start) {
                engineStartPromises.push(methods.start(this));
            }
        }
        // import built-in engines
        for (const key of builtInEngineNames) {
            // skip unused engines
            const modelUsingEngine = Object.keys(this.store.models).find((modelId) => this.store.models[modelId].engine === key);
            if (!modelUsingEngine) {
                continue;
            }
            engineStartPromises.push(new Promise(async (resolve, reject) => {
                try {
                    const engine = await import(`./engines/${key}/engine.js`);
                    this.engines[key] = engine;
                    resolve({
                        key,
                        engine,
                    });
                }
                catch (err) {
                    reject(err);
                }
            }));
        }
        this.log(LogLevels.debug, 'Waiting for engines to start ...');
        await Promise.all(engineStartPromises);
        this.log(LogLevels.debug, `Engines started: ${Object.keys(this.engines).join(', ')}`);
        await Promise.all([this.store.init(this.engines), this.pool.init(this.engines)]);
        this.log(LogLevels.info, 'Inference server started');
    }
    /**
     * Stops the server, disposes all resources, and clears the queue of pending tasks.
     **/
    async stop() {
        this.log(LogLevels.info, 'Stopping inference server ...');
        this.pool.queue.clear();
        this.store.dispose();
        try {
            await this.pool.dispose(); // might cause abort errors when there are still running tasks
        }
        catch (err) {
            this.log(LogLevels.error, 'Error while stopping model server', err);
        }
        this.log(LogLevels.debug, 'Waiting for running tasks to finish');
        // wait until all running tasks are cancelled
        await this.pool.queue.onIdle();
        this.log(LogLevels.debug, 'Model server stopped');
    }
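    // Usage sketch (not part of the original source; the request fields are assumptions based on
    // the class-level JSDoc example). This mirrors what processTask() below does internally:
    //
    //   const lock = await inferenceServer.requestInstance({ task: 'text-completion', model: 'smollm-135m' })
    //   const task = lock.instance.processTextCompletionTask({ prompt: 'The opposite of hot is' })
    //   const result = await task.result
    //   await lock.release()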
    /**
     * Requests an available model instance from the pool for a specific task.
     * Use this for manual control over when to release the instance back to the pool.
     * @param {InferenceParams} args - The inference task arguments.
     * @param {AbortSignal} [signal] - An optional signal to abort the request.
     * @returns {Promise<ModelInstance>} A model instance that can fulfill the task.
     */
    requestInstance(args, signal) {
        return this.pool.requestInstance(args, signal);
    }
    // gets called by the pool right before a new instance is created
    async prepareInstance(instance, signal) {
        const model = instance.config;
        const modelStoreStatus = this.store.models[model.id].status;
        if (modelStoreStatus === 'unloaded') {
            await this.store.prepareModel(model.id, signal);
        }
        if (modelStoreStatus === 'preparing') {
            const modelReady = new Promise((resolve, reject) => {
                const onCompleted = async (storeModel) => {
                    if (storeModel.id === model.id) {
                        this.store.prepareQueue.off('completed', onCompleted);
                        if (storeModel.status === 'ready') {
                            resolve();
                        }
                        else {
                            reject();
                        }
                    }
                };
                this.store.prepareQueue.on('completed', onCompleted);
            });
            await modelReady;
        }
    }
    // requests an instance for the given task, runs it, and releases the instance once the result is available
    async processTask(args) {
        const lock = await this.requestInstance(args);
        const waitForResult = async (task) => {
            const result = await task.result;
            await lock.release();
            return result;
        };
        if (args.task === 'text-completion') {
            const task = lock.instance.processTextCompletionTask(args);
            return await waitForResult(task);
        }
        if (args.task === 'chat-completion') {
            const task = lock.instance.processChatCompletionTask(args);
            return await waitForResult(task);
        }
        if (args.task === 'embedding') {
            const task = lock.instance.processEmbeddingTask(args);
            return await waitForResult(task);
        }
        if (args.task === 'image-to-text') {
            const task = lock.instance.processImageToTextTask(args);
            return await waitForResult(task);
        }
        if (args.task === 'speech-to-text') {
            const task = lock.instance.processSpeechToTextTask(args);
            return await waitForResult(task);
        }
        if (args.task === 'text-to-speech') {
            const task = lock.instance.processTextToSpeechTask(args);
            return await waitForResult(task);
        }
        if (args.task === 'text-to-image') {
            const task = lock.instance.processTextToImageTask(args);
            return await waitForResult(task);
        }
        if (args.task === 'image-to-image') {
            const task = lock.instance.processImageToImageTask(args);
            return await waitForResult(task);
        }
        if (args.task === 'object-detection') {
            const task = lock.instance.processObjectDetectionTask(args);
            return await waitForResult(task);
        }
        if (args.task === 'text-classification') {
            const task = lock.instance.processTextClassificationTask(args);
            return await waitForResult(task);
        }
        // @ts-expect-error
        throw new Error(`Unknown task type: ${args.task}`);
    }
    processChatCompletionTask(args) {
        return this.processTask({
            task: 'chat-completion',
            ...args,
        });
    }
    processTextCompletionTask(args) {
        return this.processTask({
            task: 'text-completion',
            ...args,
        });
    }
    processEmbeddingTask(args) {
        return this.processTask({
            task: 'embedding',
            ...args,
        });
    }
    processImageToTextTask(args) {
        return this.processTask({
            task: 'image-to-text',
            ...args,
        });
    }
    processSpeechToTextTask(args) {
        return this.processTask({
            task: 'speech-to-text',
            ...args,
        });
    }
    processTextToSpeechTask(args) {
        return this.processTask({
            task: 'text-to-speech',
            ...args,
        });
    }
    processTextToImageTask(args) {
        return this.processTask({
            task: 'text-to-image',
            ...args,
        });
    }
    processImageToImageTask(args) {
        return this.processTask({
            task: 'image-to-image',
            ...args,
        });
    }
    processObjectDetectionTask(args) {
        return this.processTask({
            task: 'object-detection',
            ...args,
        });
    }
    processTextClassificationTask(args) {
        return this.processTask({
            task: 'text-classification',
            ...args,
        });
    }
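    // Sketch (assumption, not part of the original source): when running the server as a
    // microservice, getStatus() below can back a simple health endpoint, e.g. with Express:
    //   app.get('/status', (req, res) => res.json(inferenceServer.getStatus()))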
    /**
     * Retrieves the current status of the model server, including pool and store status.
     *
     * @returns {Object} The status object containing pool and store information.
     */
    getStatus() {
        const poolStatus = this.pool.getStatus();
        const storeStatus = this.store.getStatus();
        return {
            pool: poolStatus,
            store: storeStatus,
        };
    }
}
//# sourceMappingURL=server.js.map