import type { TaskDataCustom } from "../index.js";

const taskData: TaskDataCustom = {
	datasets: [
		{
			description: "Instructions composed of image and text.",
			id: "liuhaotian/LLaVA-Instruct-150K",
		},
		{
			description: "Collection of image-text pairs on scientific topics.",
			id: "DAMO-NLP-SG/multimodal_textbook",
		},
		{
			description: "A collection of datasets made for model fine-tuning.",
			id: "HuggingFaceM4/the_cauldron",
		},
		{
			description: "Screenshots of websites paired with their HTML/CSS code.",
			id: "HuggingFaceM4/WebSight",
		},
	],
	demo: {
		inputs: [
			{
				filename: "image-text-to-text-input.png",
				type: "img",
			},
			{
				label: "Text Prompt",
				content: "Describe the position of the bee in detail.",
				type: "text",
			},
		],
		outputs: [
			{
				label: "Answer",
				content:
					"The bee is sitting on a pink flower, surrounded by other flowers. The bee is positioned in the center of the flower, with its head and front legs sticking out.",
				type: "text",
			},
		],
	},
	metrics: [],
	models: [
		{
			description: "Small and efficient yet powerful vision language model.",
			id: "HuggingFaceTB/SmolVLM-Instruct",
		},
		{
			description: "A screenshot understanding model used to control computers.",
			id: "microsoft/OmniParser-v2.0",
		},
		{
			description: "Cutting-edge vision language model.",
			id: "allenai/Molmo-7B-D-0924",
		},
		{
			description: "Small yet powerful model.",
			id: "vikhyatk/moondream2",
		},
		{
			description: "Strong image-text-to-text model.",
			id: "Qwen/Qwen2.5-VL-7B-Instruct",
		},
		{
			description: "Image-text-to-text model with agentic capabilities.",
			id: "microsoft/Magma-8B",
		},
		{
			description: "Strong image-text-to-text model focused on documents.",
			id: "allenai/olmOCR-7B-0225-preview",
		},
		{
			description: "Small yet strong image-text-to-text model.",
			id: "ibm-granite/granite-vision-3.2-2b",
		},
	],
	spaces: [
		{
			description: "Leaderboard to evaluate vision language models.",
			id: "opencompass/open_vlm_leaderboard",
		},
		{
			description: "Vision language model arena, where models are ranked by user votes.",
			id: "WildVision/vision-arena",
		},
		{
			description: "Powerful vision-language model assistant.",
			id: "akhaliq/Molmo-7B-D-0924",
		},
		{
			description: "Powerful vision language assistant that can understand multiple images.",
			id: "HuggingFaceTB/SmolVLM2",
		},
		{
			description: "An application for chatting with an image-text-to-text model.",
			id: "GanymedeNil/Qwen2-VL-7B",
		},
		{
			description: "An application that parses screenshots into actions.",
			id: "showlab/ShowUI",
		},
		{
			description: "An application that detects gaze.",
			id: "moondream/gaze-demo",
		},
	],
	summary:
		"Image-text-to-text models take in an image and a text prompt and output text. These models are also called vision-language models, or VLMs. Unlike image-to-text models, they accept an additional text input, so they are not restricted to use cases like image captioning, and they may also be trained to accept a conversation as input.",
	widgetModels: ["Qwen/Qwen2-VL-7B-Instruct"],
	youtubeId: "IoGaGfU1CIg",
};

export default taskData;
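
// Minimal consumption sketch (assumption: imported from a sibling module inside
// @huggingface/tasks; the relative path below is hypothetical). The exported object
// can be read like any other value, e.g. to pick the recommended widget model or to
// list the suggested checkpoints for this task:
//
//   import imageTextToText from "./image-text-to-text/data.js";
//
//   const widgetModel = imageTextToText.widgetModels?.[0]; // "Qwen/Qwen2-VL-7B-Instruct"
//   for (const model of imageTextToText.models ?? []) {
//     console.log(`${model.id}: ${model.description}`);
//   }
//
// Hedged sketch of what the summary describes, i.e. sending an image plus a text
// prompt to a VLM and getting text back. This assumes the OpenAI-compatible
// chatCompletion helper from @huggingface/inference; the model choice, image URL,
// and token variable are placeholders, not part of this file:
//
//   import { HfInference } from "@huggingface/inference";
//
//   const hf = new HfInference(process.env.HF_TOKEN);
//   const out = await hf.chatCompletion({
//     model: "Qwen/Qwen2-VL-7B-Instruct",
//     messages: [
//       {
//         role: "user",
//         content: [
//           { type: "image_url", image_url: { url: "https://example.com/bee.png" } },
//           { type: "text", text: "Describe the position of the bee in detail." },
//         ],
//       },
//     ],
//     max_tokens: 256,
//   });
//   console.log(out.choices[0].message.content);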