import type { TaskDataCustom } from "../index.js";

const taskData: TaskDataCustom = {
	datasets: [
		{
			description: "Instructions composed of image and text.",
			id: "liuhaotian/LLaVA-Instruct-150K",
		},
		{
			description: "Collection of image-text pairs on scientific topics.",
			id: "DAMO-NLP-SG/multimodal_textbook",
		},
		{
			description: "A collection of datasets made for model fine-tuning.",
			id: "HuggingFaceM4/the_cauldron",
		},
		{
			description: "Screenshots of websites with their HTML/CSS code.",
			id: "HuggingFaceM4/WebSight",
		},
	],
	demo: {
		inputs: [
			{
				filename: "image-text-to-text-input.png",
				type: "img",
			},
			{
				label: "Text Prompt",
				content: "Describe the position of the bee in detail.",
				type: "text",
			},
		],
		outputs: [
			{
				label: "Answer",
				content:
					"The bee is sitting on a pink flower, surrounded by other flowers. The bee is positioned in the center of the flower, with its head and front legs sticking out.",
				type: "text",
			},
		],
	},
	metrics: [],
	models: [
		{
			description: "Small and efficient yet powerful vision language model.",
			id: "HuggingFaceTB/SmolVLM-Instruct",
		},
		{
			description: "A screenshot understanding model used to control computers.",
			id: "microsoft/OmniParser-v2.0",
		},
		{
			description: "Cutting-edge vision language model.",
			id: "allenai/Molmo-7B-D-0924",
		},
		{
			description: "Small yet powerful model.",
			id: "vikhyatk/moondream2",
		},
		{
			description: "Strong image-text-to-text model.",
			id: "Qwen/Qwen2.5-VL-7B-Instruct",
		},
		{
			description: "Image-text-to-text model with agentic capabilities.",
			id: "microsoft/Magma-8B",
		},
		{
			description: "Strong image-text-to-text model focused on documents.",
			id: "allenai/olmOCR-7B-0225-preview",
		},
		{
			description: "Small yet strong image-text-to-text model.",
			id: "ibm-granite/granite-vision-3.2-2b",
		},
	],
	spaces: [
		{
			description: "Leaderboard to evaluate vision language models.",
			id: "opencompass/open_vlm_leaderboard",
		},
		{
			description: "Vision language model arena, where models are ranked by user votes.",
			id: "WildVision/vision-arena",
		},
		{
			description: "Powerful vision-language model assistant.",
			id: "akhaliq/Molmo-7B-D-0924",
		},
		{
			description: "Powerful vision language assistant that can understand multiple images.",
			id: "HuggingFaceTB/SmolVLM2",
		},
		{
			description: "An application for chatting with an image-text-to-text model.",
			id: "GanymedeNil/Qwen2-VL-7B",
		},
		{
			description: "An application that parses screenshots into actions.",
			id: "showlab/ShowUI",
		},
		{
			description: "An application that detects gaze.",
			id: "moondream/gaze-demo",
		},
	],
	summary:
		"Image-text-to-text models take in an image and a text prompt and output text. These models are also called vision-language models, or VLMs. Unlike image-to-text models, they take an additional text input, so they are not restricted to specific use cases such as image captioning, and they may also be trained to accept a conversation as input.",
	widgetModels: ["Qwen/Qwen2-VL-7B-Instruct"],
	youtubeId: "IoGaGfU1CIg",
};

export default taskData;
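
/*
 * Hedged usage sketch (illustrative only, not part of the exported task metadata):
 * one way to send an image plus a text prompt to a model listed above, assuming
 * the `@huggingface/inference` package and its OpenAI-style `chatCompletion`
 * method. The access token variable and image URL below are placeholders.
 *
 * import { HfInference } from "@huggingface/inference";
 *
 * const hf = new HfInference(process.env.HF_TOKEN);
 *
 * const response = await hf.chatCompletion({
 *   model: "Qwen/Qwen2.5-VL-7B-Instruct",
 *   messages: [
 *     {
 *       role: "user",
 *       content: [
 *         // image input, referenced by URL (placeholder)
 *         { type: "image_url", image_url: { url: "https://example.com/bee.png" } },
 *         // text prompt, mirroring the demo input above
 *         { type: "text", text: "Describe the position of the bee in detail." },
 *       ],
 *     },
 *   ],
 *   max_tokens: 256,
 * });
 *
 * console.log(response.choices[0].message.content);
 */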