UNPKG

@huggingface/tasks

Version:
95 lines (92 loc) 2.41 kB
import type { TaskDataCustom } from "../index.js"; const taskData: TaskDataCustom = { datasets: [ { description: "31,175 hours of multilingual audio-text dataset in 108 languages.", id: "mozilla-foundation/common_voice_17_0", }, { description: "Multilingual and diverse audio dataset with 101k hours of audio.", id: "amphion/Emilia-Dataset", }, { description: "A dataset with 44.6k hours of English speaker data and 6k hours of other language speakers.", id: "parler-tts/mls_eng", }, { description: "A multilingual audio dataset with 370K hours of audio.", id: "espnet/yodas", }, ], demo: { inputs: [ { filename: "input.flac", type: "audio", }, ], outputs: [ { /// GOING ALONG SLUSHY COUNTRY ROADS AND SPEAKING TO DAMP AUDIENCES I label: "Transcript", content: "Going along slushy country roads and speaking to damp audiences in...", type: "text", }, ], }, metrics: [ { description: "", id: "wer", }, { description: "", id: "cer", }, ], models: [ { description: "A powerful ASR model by OpenAI.", id: "openai/whisper-large-v3", }, { description: "A good generic speech model by MetaAI for fine-tuning.", id: "facebook/w2v-bert-2.0", }, { description: "An end-to-end model that performs ASR and Speech Translation by MetaAI.", id: "facebook/seamless-m4t-v2-large", }, { description: "A powerful multilingual ASR and Speech Translation model by Nvidia.", id: "nvidia/canary-1b", }, { description: "Powerful speaker diarization model.", id: "pyannote/speaker-diarization-3.1", }, ], spaces: [ { description: "A powerful general-purpose speech recognition application.", id: "hf-audio/whisper-large-v3", }, { description: "Latest ASR model from Useful Sensors.", id: "mrfakename/Moonshinex", }, { description: "A high quality speech and text translation model by Meta.", id: "facebook/seamless_m4t", }, { description: "A powerful multilingual ASR and Speech Translation model by Nvidia", id: "nvidia/canary-1b", }, ], summary: "Automatic Speech Recognition (ASR), also known as Speech to Text (STT), is the task of transcribing a given audio to text. It has many applications, such as voice user interfaces.", widgetModels: ["openai/whisper-large-v3"], youtubeId: "TksaY_FDgnk", }; export default taskData;