
inference-server


Libraries and a server for building AI applications. Adapters to various native bindings allow local inference. Integrate it with your application, or use it as a microservice.

ChatWithVision.js
import { CustomEngine } from '../engines/index.js';
// an experimental engine that replaces images with their descriptions before passing them to a chat model
export class ChatWithVisionEngine extends CustomEngine {
    imageToTextModel;
    chatModel;
    constructor({ imageToTextModel, chatModel }) {
        super();
        this.imageToTextModel = imageToTextModel;
        this.chatModel = chatModel;
    }
    async processChatCompletionTask(task) {
        const imageTextPromises = [];
        const imageToTextModel = await this.pool.requestInstance({
            model: this.imageToTextModel,
        });
        const messagesWithImageDescriptions = [...task.messages];
        for (let m = 0; m < messagesWithImageDescriptions.length; m++) {
            const message = messagesWithImageDescriptions[m];
            if (!Array.isArray(message.content)) {
                continue;
            }
            for (let p = 0; p < message.content.length; p++) {
                const contentPart = message.content[p];
                if (contentPart.type !== 'image') {
                    continue;
                }
                imageTextPromises.push(new Promise(async (resolve, reject) => {
                    // all florence2 task prompts: https://huggingface.co/microsoft/Florence-2-base#tasks
                    // "task_prompts_without_inputs": {
                    //     "<OCR>": "What is the text in the image?",
                    //     "<OCR_WITH_REGION>": "What is the text in the image, with regions?",
                    //     "<CAPTION>": "What does the image describe?",
                    //     "<DETAILED_CAPTION>": "Describe in detail what is shown in the image.",
                    //     "<MORE_DETAILED_CAPTION>": "Describe with a paragraph what is shown in the image.",
                    //     "<OD>": "Locate the objects with category name in the image.",
                    //     "<DENSE_REGION_CAPTION>": "Locate the objects in the image, with their descriptions.",
                    //     "<REGION_PROPOSAL>": "Locate the region proposals in the image."
                    // },
                    // "task_prompts_with_input": {
                    //     "<CAPTION_TO_PHRASE_GROUNDING>": "Locate the phrases in the caption: {input}",
                    //     "<REFERRING_EXPRESSION_SEGMENTATION>": "Locate {input} in the image with mask",
                    //     "<REGION_TO_SEGMENTATION>": "What is the polygon mask of region {input}",
                    //     "<OPEN_VOCABULARY_DETECTION>": "Locate {input} in the image.",
                    //     "<REGION_TO_CATEGORY>": "What is the region {input}?",
                    //     "<REGION_TO_DESCRIPTION>": "What does the region {input} describe?",
                    //     "<REGION_TO_OCR>": "What text is in the region {input}?"
                    // }
                    // const imageData = await fetch(contentPart.image.url).then((res) => res.arrayBuffer())
                    const imageDescriptionTask = imageToTextModel.instance.processImageToTextTask({
                        model: this.imageToTextModel,
                        // url: contentPart.url,
                        image: contentPart.image,
                        prompt: 'What does the image describe?',
                    });
                    const result = await imageDescriptionTask.result;
                    resolve({
                        text: result.text,
                        messageIndex: m,
                        contentPartIndex: p,
                    });
                }));
            }
        }
        const imageTextResults = await Promise.all(imageTextPromises);
        imageToTextModel.release();
        console.debug('Image text results', imageTextResults);
        for (const imageTextResult of imageTextResults) {
            const { text, messageIndex, contentPartIndex } = imageTextResult;
            const message = messagesWithImageDescriptions[messageIndex];
            // if ('type' in message.content[contentPartIndex]) {
            //     message.content[contentPartIndex].type = 'text'
            // @ts-ignore
            message.content[contentPartIndex] = {
                type: 'text',
                text: `User uploaded image: ${text}`,
            };
        }
        const chatRequest = {
            ...task,
            messages: messagesWithImageDescriptions,
            model: this.chatModel,
        };
        const chatModel = await this.pool.requestInstance(chatRequest);
        const chatTask = chatModel.instance.processChatCompletionTask(chatRequest);
        const result = await chatTask.result;
        chatModel.release();
        return result;
    }
}
//# sourceMappingURL=ChatWithVision.js.map
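
For orientation, below is a minimal usage sketch. It is not taken from the package's documentation: the model identifiers, the task shape, and the example URL are assumptions for illustration; only the constructor options ({ imageToTextModel, chatModel }) and the image/text content-part handling come from the file above. processChatCompletionTask relies on this.pool, which the host server is expected to provide once the engine is registered, so the final call is shown commented out.

// Hypothetical usage sketch; model ids, task shape, and URL are illustrative assumptions.
import { ChatWithVisionEngine } from './ChatWithVision.js';

// Two pooled models are referenced by name: one that captions images,
// and one that answers the chat once images are replaced by their captions.
const engine = new ChatWithVisionEngine({
    imageToTextModel: 'florence2-base',   // assumed id of the image-to-text model
    chatModel: 'llama-3-8b-instruct',     // assumed id of the chat model
});

// A chat completion task whose messages mix text and image content parts.
// Each { type: 'image' } part is rewritten to
// { type: 'text', text: 'User uploaded image: <caption>' } before the
// request is forwarded to the chat model.
const exampleTask = {
    messages: [
        {
            role: 'user',
            content: [
                { type: 'text', text: 'What is shown in this picture?' },
                { type: 'image', image: { url: 'https://example.com/cat.png' } },
            ],
        },
    ],
};

// Only works inside a server context, where this.pool has been injected
// after the engine is registered with the instance pool:
// const completion = await engine.processChatCompletionTask(exampleTask);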