@article{avspeech,
title= {AVSpeech: Large-scale Audio-Visual Speech Dataset},
journal= {},
author= {Ariel Ephrat and Inbar Mosseri and Oran Lang and Tali Dekel and Kevin Wilson and Avinatan Hassidim and William T. Freeman and Michael Rubinstein},
year= {2018},
url= {https://looking-to-listen.github.io/avspeech/},
abstract= {AVSpeech is a new, large-scale audio-visual dataset comprising speech video clips with no interfering background noise. The segments are 3-10 seconds long, and in each clip the audible sound in the soundtrack belongs to a single speaking person who is visible in the video. In total, the dataset contains roughly 4700 hours* of video segments from a total of 290k YouTube videos, spanning a wide variety of people, languages, and face poses. For more details on how we created the dataset, see our paper, Looking to Listen at the Cocktail Party: A Speaker-Independent Audio-Visual Model for Speech Separation (https://arxiv.org/abs/1804.03619).
* UPLOADER'S NOTE: This torrent contains 3000 hours of video segments, not the entire 4700 hours. The remaining 1700 hours were excluded because the corresponding videos no longer existed on YouTube, had copyright violations, were not available in the United States, or were of poor quality. Over 1 million segments are included in this torrent, each between 3 and 10 seconds long and in 720p resolution. See the README for how to use this dataset},
keywords= {speech isolation, lip reading, face detection},
terms= {},
license= {},
superseded= {}
}
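
Below is a minimal sketch in Python for reading the dataset's segment metadata. It assumes the CSV layout of the official AVSpeech release: one row per segment, containing the YouTube ID, segment start and end times in seconds, and the normalized x/y coordinates of the speaker's face center. The filename avspeech_train.csv is a placeholder; the README bundled with the torrent is the authoritative reference for the actual layout.

import csv

def load_segments(path):
    """Yield one dict per row of an AVSpeech segment CSV.

    Assumed columns (per the official release): YouTube ID, start time (s),
    end time (s), normalized face-center x, normalized face-center y.
    """
    with open(path, newline="") as f:
        for row in csv.reader(f):
            yt_id, start, end, face_x, face_y = row
            yield {
                "youtube_id": yt_id,
                "start_sec": float(start),
                "end_sec": float(end),
                "face_x": float(face_x),  # normalized to [0, 1]
                "face_y": float(face_y),  # normalized to [0, 1]
            }

if __name__ == "__main__":
    # Example: count segments and sanity-check the stated 3-10 second range.
    segments = list(load_segments("avspeech_train.csv"))  # placeholder path
    durations = [s["end_sec"] - s["start_sec"] for s in segments]
    print(f"{len(segments)} segments, "
          f"durations {min(durations):.1f}-{max(durations):.1f} s")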