import type { TaskDataCustom } from "../index.js";
const taskData: TaskDataCustom = {
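// Example datasets for the task, surfaced on the task page.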
datasets: [
{
description: "Instructions composed of image and text.",
id: "liuhaotian/LLaVA-Instruct-150K",
},
{
description: "Collection of image-text pairs on scientific topics.",
id: "DAMO-NLP-SG/multimodal_textbook",
},
{
description: "A collection of datasets made for model fine-tuning.",
id: "HuggingFaceM4/the_cauldron",
},
{
description: "Screenshots of websites with their HTML/CSS codes.",
id: "HuggingFaceM4/WebSight",
},
],
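// Sample input/output pair rendered in the task page demo.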
demo: {
inputs: [
{
filename: "image-text-to-text-input.png",
type: "img",
},
{
label: "Text Prompt",
content: "Describe the position of the bee in detail.",
type: "text",
},
],
outputs: [
{
label: "Answer",
content:
"The bee is sitting on a pink flower, surrounded by other flowers. The bee is positioned in the center of the flower, with its head and front legs sticking out.",
type: "text",
},
],
},
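// No canonical metrics are listed for this task yet.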
metrics: [],
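// Recommended models for this task.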
models: [
{
description: "Small and efficient yet powerful vision language model.",
id: "HuggingFaceTB/SmolVLM-Instruct",
},
{
description: "Cutting-edge reasoning vision language model.",
id: "zai-org/GLM-4.5V",
},
{
description: "Cutting-edge small vision language model to convert documents to text.",
id: "rednote-hilab/dots.ocr",
},
{
description: "Small yet powerful model.",
id: "Qwen/Qwen2.5-VL-3B-Instruct",
},
{
description: "Image-text-to-text model with agentic capabilities.",
id: "microsoft/Magma-8B",
},
],
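// Spaces that demonstrate or compare models for this task.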
spaces: [
{
description: "Leaderboard to evaluate vision language models.",
id: "opencompass/open_vlm_leaderboard",
},
{
description: "An application that compares object detection capabilities of different vision language models.",
id: "sergiopaniego/vlm_object_understanding",
},
{
description: "An application to compare different OCR models.",
id: "prithivMLmods/Multimodal-OCR",
},
],
summary:
"Image-text-to-text models take in an image and a text prompt and output text. They are also called vision-language models, or VLMs. Unlike image-to-text models, they accept an additional text input, so they are not limited to a single use case such as image captioning, and they may also be trained to take a conversation as input.",
widgetModels: ["zai-org/GLM-4.5V"],
youtubeId: "IoGaGfU1CIg",
};
export default taskData;
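// Usage sketch (kept commented out so this module stays data-only): one way to
// query an image-text-to-text model such as the widget model above is through
// the chat-completion API of @huggingface/inference, which accepts OpenAI-style
// messages mixing image URLs and text. The client name, message shape, and URL
// below are assumptions for illustration, not part of this task definition.
//
// import { InferenceClient } from "@huggingface/inference";
//
// const client = new InferenceClient(process.env.HF_TOKEN);
// const response = await client.chatCompletion({
//   model: "zai-org/GLM-4.5V",
//   messages: [
//     {
//       role: "user",
//       content: [
//         // The image is passed by URL; this schema may also accept a base64 data URL.
//         { type: "image_url", image_url: { url: "https://example.com/bee.png" } },
//         { type: "text", text: "Describe the position of the bee in detail." },
//       ],
//     },
//   ],
//   max_tokens: 256,
// });
// console.log(response.choices[0].message.content);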