@huggingface/tasks

import type { TaskDataCustom } from "../index.js";

const taskData: TaskDataCustom = {
	datasets: [
		{
			description: "Instructions composed of image and text.",
			id: "liuhaotian/LLaVA-Instruct-150K",
		},
		{
			description: "Collection of image-text pairs on scientific topics.",
			id: "DAMO-NLP-SG/multimodal_textbook",
		},
		{
			description: "A collection of datasets made for model fine-tuning.",
			id: "HuggingFaceM4/the_cauldron",
		},
		{
			description: "Screenshots of websites with their HTML/CSS codes.",
			id: "HuggingFaceM4/WebSight",
		},
	],
	demo: {
		inputs: [
			{
				filename: "image-text-to-text-input.png",
				type: "img",
			},
			{
				label: "Text Prompt",
				content: "Describe the position of the bee in detail.",
				type: "text",
			},
		],
		outputs: [
			{
				label: "Answer",
				content:
					"The bee is sitting on a pink flower, surrounded by other flowers. The bee is positioned in the center of the flower, with its head and front legs sticking out.",
				type: "text",
			},
		],
	},
	metrics: [],
	models: [
		{
			description: "Small and efficient yet powerful vision language model.",
			id: "HuggingFaceTB/SmolVLM-Instruct",
		},
		{
			description: "Cutting-edge reasoning vision language model.",
			id: "zai-org/GLM-4.5V",
		},
		{
			description: "Cutting-edge small vision language model to convert documents to text.",
			id: "rednote-hilab/dots.ocr",
		},
		{
			description: "Small yet powerful model.",
			id: "Qwen/Qwen2.5-VL-3B-Instruct",
		},
		{
			description: "Image-text-to-text model with agentic capabilities.",
			id: "microsoft/Magma-8B",
		},
	],
	spaces: [
		{
			description: "Leaderboard to evaluate vision language models.",
			id: "opencompass/open_vlm_leaderboard",
		},
		{
			description:
				"An application that compares object detection capabilities of different vision language models.",
			id: "sergiopaniego/vlm_object_understanding",
		},
		{
			description: "An application to compare different OCR models.",
			id: "prithivMLmods/Multimodal-OCR",
		},
	],
	summary:
		"Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",
	widgetModels: ["zai-org/GLM-4.5V"],
	youtubeId: "IoGaGfU1CIg",
};

export default taskData;
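
// A minimal consumption sketch (not part of the package): it assumes this module
// is importable as "./data.js" and that the fields shown above are present.
// `listSuggestedModels` is a hypothetical helper name, used only for illustration.
import taskData from "./data.js";

function listSuggestedModels(): string[] {
	// Each `models` entry pairs a Hub model id with a one-line description.
	return (taskData.models ?? []).map((model) => `${model.id}: ${model.description}`);
}

console.log(listSuggestedModels().join("\n"));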