AVSpeech (402 files)
metadata.jsonl.zip | 133.32MB
clips/xpi.tar | 3.70GB
clips/xpj.tar | 2.77GB
clips/xph.tar | 3.58GB
clips/xpf.tar | 3.66GB
clips/xpg.tar | 3.45GB
clips/xpe.tar | 3.57GB
clips/xpc.tar | 3.43GB
clips/xpd.tar | 3.63GB
clips/xpa.tar | 3.57GB
clips/xpb.tar | 3.57GB
clips/xoy.tar | 3.60GB
clips/xoz.tar | 3.82GB
clips/xox.tar | 3.69GB
clips/xow.tar | 3.84GB
clips/xou.tar | 3.95GB
clips/xov.tar | 3.93GB
clips/xot.tar | 3.83GB
clips/xos.tar | 3.85GB
clips/xor.tar | 3.76GB
clips/xoq.tar | 3.75GB
clips/xoo.tar | 3.72GB
clips/xop.tar | 3.86GB
clips/xon.tar | 3.87GB
clips/xom.tar | 3.95GB
clips/xol.tar | 3.56GB
clips/xok.tar | 3.91GB
clips/xoj.tar | 4.06GB
clips/xoi.tar | 3.90GB
clips/xoh.tar | 3.88GB
clips/xog.tar | 3.77GB
clips/xof.tar | 3.67GB
clips/xod.tar | 3.92GB
clips/xoe.tar | 3.69GB
clips/xob.tar | 3.66GB
clips/xoc.tar | 3.82GB
clips/xoa.tar | 3.62GB
clips/xnz.tar | 3.94GB
clips/xny.tar | 4.14GB
clips/xnx.tar | 3.93GB
clips/xnv.tar | 4.22GB
clips/xnw.tar | 3.94GB
clips/xnt.tar | 3.58GB
clips/xnu.tar | 3.63GB
clips/xnr.tar | 4.01GB
clips/xns.tar | 3.85GB
clips/xnp.tar | 3.91GB
clips/xnq.tar | 3.84GB
clips/xnn.tar | 3.58GB
|
Type: Dataset
Bibtex:
@article{ephrat2018avspeech,
title= {AVSpeech: Large-scale Audio-Visual Speech Dataset},
journal= {},
author= {Ariel Ephrat and Inbar Mosseri and Oran Lang and Tali Dekel and Kevin Wilson and Avinatan Hassidim and William T. Freeman and Michael Rubinstein},
year= {2018},
url= {https://looking-to-listen.github.io/avspeech/},
abstract= {AVSpeech is a new, large-scale audio-visual dataset comprising speech video clips with no interfering background noise. The segments are 3-10 seconds long, and in each clip the audible sound in the soundtrack belongs to a single speaking person who is visible in the video. In total, the dataset contains roughly 4700 hours* of video segments, from a total of 290k YouTube videos, spanning a wide variety of people, languages, and face poses. For more details on how we created the dataset, see our paper, Looking to Listen at the Cocktail Party: A Speaker-Independent Audio-Visual Model for Speech Separation (https://arxiv.org/abs/1804.03619).
* UPLOADER'S NOTE: This torrent contains 3000 hours of video segments, not the full 4700 hours. The remaining 1700 hours were excluded because some videos no longer existed on YouTube, had copyright violations, were not available in the United States, or were of poor quality. Over 1 million segments are included in this torrent, each 3-10 seconds long and in 720p resolution. See the README for how to use this dataset.},
keywords= {speech isolation, lip reading, face detection},
terms= {},
license= {},
superseded= {}
}
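
The README mentioned in the uploader's note is not reproduced on this page, so the snippet below is only a minimal sketch of how the archives might be inspected in Python. The inner file name inside metadata.jsonl.zip, the JSONL field layout, and the assumption that each clips/*.tar member is an individual 720p clip file are guesses, not documented facts.

    import json
    import tarfile
    import zipfile

    # Peek at the metadata: the zip is assumed to hold a single JSONL
    # file with one JSON object per clip. Print a few records to
    # discover the actual schema.
    with zipfile.ZipFile("metadata.jsonl.zip") as zf:
        inner = zf.namelist()[0]  # avoid hard-coding the inner file name
        with zf.open(inner) as f:
            for i, line in enumerate(f):
                print(json.loads(line))
                if i == 4:
                    break

    # Shards can be scanned lazily without extracting the whole
    # archive; each member is assumed to be one clip file.
    with tarfile.open("clips/xpa.tar") as tar:
        for member in tar:
            print(member.name, member.size)
            break

Reading a single clip out of a shard without unpacking the multi-gigabyte tar can then be done with tar.extractfile(member), which returns a file object for just that member.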