diff --git a/README.md b/README.md index 6286441..b6dd5d3 100644 --- a/README.md +++ b/README.md @@ -1,57 +1,59 @@ -***Spoken Emotion Recognition Datasets:*** *A collection of datasets (count=49) for the purpose of emotion recognition/detection in speech. +***Speech Emotion Recognition (SER) Datasets:*** *A collection of datasets (count=51) for the purpose of emotion recognition/detection in speech. The table is chronologically ordered and includes a description of the content of each dataset along with the emotions included. The table can be browsed, sorted and searched under https://superkogito.github.io/SER-datasets/* -| Dataset | Year | Content | Emotions | Format | Size | Language | Paper | Access | License | -|:--------------------------------------------------------------------------------------------------------------------------------------------------|:----------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------|:------------------------|:------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------|:------------------------------------------------------------------------------------------------------------------------------------------| -| 
[Quechua-SER](https://figshare.com/articles/media/Quechua_Collao_for_Speech_Emotion_Recognition/20292516) | 2022 | 12420 audio recordings (~15 hours) and their transcriptions by 7 native speakers. | Emotional labels using dimensions: valence, arousal, and dominance. | Audio | 3.53 GB | Quechua Collao | [A speech corpus of Quechua Collao for automatic dimensional emotion recognition](https://www.nature.com/articles/s41597-022-01855-9) | Open | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) | -| [MESD](https://data.mendeley.com/datasets/cy34mh68j9/5) | 2022 | 864 audio files of single-word emotional utterances with Mexican cultural shaping. | 6 emotions provides single-word utterances for anger, disgust, fear, happiness, neutral, and sadness. | Audio | 0,097 GB | Spanish (Mexican) | [The Mexican Emotional Speech Database (MESD): elaboration and assessment based on machine learning](https://pubmed.ncbi.nlm.nih.gov/34891601/) | Open | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) | -| [SyntAct](https://zenodo.org/record/6573016#.ZAjy_9LMJpj) | 2022 | Synthesized database of three basic emotions and neutral expression based on rule-based manipulation for a diphone synthesizer which we release to the public | 997 utterances including 6 emotions: angry, bored, happy, neutral, sad and scared | Audio | 941 MB | German | [SyntAct: A Synthesized Database of Basic Emotions](http://felix.syntheticspeech.de/publications/synthetic_database.pdf) | Open | [CC BY-SA 4.0](https://creativecommons.org/licenses/by/4.0) | -| [LSSED](https://github.com/tobefans/LSSED) | 2021 | Large Scale Spanish Emotional Speech Database | 8 emotions provides Spanish spoken utterances for anger, boredom, disgust, fear, happiness, neutral, sadness, and surprise. 
| Audio | 90 GB | Spanish (Castilian) | [LSSED: A Large-Scale Spanish Emotional Speech Database for Speech Processing and Machine Learning](https://www.mdpi.com/1424-8220/21/23/6985) | Open | [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/) | -| [MLEnd](https://www.kaggle.com/datasets/jesusrequena/mlend-spoken-numerals) | 2021 | ~32700 audio recordings files produced by 154 speakers. Each audio recording corresponds to one English numeral (from "zero" to "billion") | Intonations: neutral, bored, excited and question | Audio | 2.27 GB | -- | -- | Open | Unknown | -| [ASVP-ESD](https://www.kaggle.com/datasets/dejolilandry/asvpesdspeech-nonspeech-emotional-utterances) | 2021 | ~13285 audio files collected from movies, tv shows and youtube containing speech and non-speech. | 12 different natural emotions (boredom, neutral, happiness, sadness, anger, fear, surprise, disgust, excitement, pleasure, pain, disappointment) with 2 levels of intensity. | Audio | 2 GB | Chinese, English, French, Russian and others | -- | Open | Unknown | -| [ESD](https://hltsingapore.github.io/ESD/) | 2021 | 29 hours, 3500 sentences, by 10 native English speakers and 10 native Chinese speakers. | 5 emotions: angry, happy, neutral, sad, and surprise. | Audio, Text | 2.4 GB (zip) | Chinese, English | [Seen And Unseen Emotional Style Transfer For Voice Conversion With A New Emotional Speech Dataset](https://arxiv.org/pdf/2010.14794.pdf) | Open | Academic License | -| [MuSe-CAR](https://zenodo.org/record/4134758) | 2021 | 40 hours, 6,000+ recordings of 25,000+ sentences by 70+ English speakers (see db link for details). | continuous emotion dimensions characterized using valence, arousal, and trustworthiness. 
| Audio, Video, Text | 15 GB | English | [The Multimodal Sentiment Analysis in Car Reviews (MuSe-CaR) Dataset: Collection, Insights and Improvements](https://arxiv.org/pdf/2101.06053.pdf) | Restricted | Academic License & Commercial License | -| [THAI SER](https://github.com/vistec-AI/dataset-releases/releases/tag/v1) | 2021 | The recordings are 41 hours, 36 minutes long (27,854 utterances), and were performed by 200 professional actors (112 female, 88 male). | 5 main emotions assigned to actors: Neutral, Anger, Happiness, Sadness, and Frustration. | Audio | 12 GB | Thai | -- | Open | [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0) | -| [French Emotional Speech Database - Oréau](https://zenodo.org/records/4405783#.Yqjq_9JBxph) | 2020 | 79 utterances with 10 to 13 utterances pro emotion by 32 non-professional speakers. | 7 emotions: sadness, anger, disgust, fear, surprise, joy, neutral. | Audio | 0.264 GB | French | -- | Open | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) | -| [Att-HACK ](http://www.openslr.org/88/) | 2020 | 25 speakers interpreting 100 utterances in 4 social attitudes, with 3-5 repetitions each per attitude for a total of around 30 hours of speech. | expressive speech in French, 100 phrases with multiple versions (3 to 5) in four social attitudes (friendly, distant, dominant and seductive). | Audio | 6.6 GB | French | [Att-HACK: An Expressive Speech Database with Social Attitudes](https://arxiv.org/abs/2004.04410) | Open | [CC BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/) | -| [MSP-Podcast corpus](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) | 2020 | 100 hours by over 100 speakers (see db link for details). | This corpus is annotated with emotional labels using attribute-based descriptors (activation, dominance and valence) and categorical labels (anger, happiness, sadness, disgust, surprised, fear, contempt, neutral and other). 
| Audio | -- | -- | [The MSP-Conversation Corpus](http://www.interspeech2020.org/index.php?m=content&c=index&a=show&catid=290&id=684) | Restricted | Academic License & Commercial License | -| [BEASC](https://doi.org/10.6084/m9.figshare.12498033) | 2020 | Bangla Emotional Audio-Speech Corpus | 6 emotions provides Bangla spoken utterances for anger, happiness, sadness, fear, surprise, and neutral. | Audio | 9 GB | Bangla | [BEASC: Bangla Emotional Audio-Speech Corpus - A Speech Emotion Recognition Corpus for the Low-Resource Bangla Language](https://www.mdpi.com/2076-3417/10/11/3704) | Open | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) | -| [emotiontts open db](https://github.com/emotiontts/emotiontts_open_db) | 2020 | Recordings and their associated transcriptions by a diverse group of speakers. | 4 emotions: general, joy, anger, and sadness. | Audio, Text | -- | Korean | -- | Partially open | [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) | -| [URDU-Dataset](https://github.com/siddiquelatif/urdu-dataset) | 2020 | 400 utterances by 38 speakers (27 male and 11 female). | 4 emotions: angry, happy, neutral, and sad. | Audio | 0.072 GB | Urdu | [Cross Lingual Speech Emotion Recognition: Urdu vs. Western Languages](https://arxiv.org/pdf/1812.10411.pdf) | Open | -- | -| [BAVED](https://www.kaggle.com/a13x10/basic-arabic-vocal-emotions-dataset) | 2020 | 1935 recording by 61 speakers (45 male and 16 female). | 3 levels of emotion. | Audio | 0.195 GB | Arabic | -- | Open | -- | -| [VIVAE](https://zenodo.org/record/4066235) | 2020 | non-speech, 1085 audio file by 12 speakers. | non-speech 6 emotions: achievement, anger, fear, pain, pleasure, and surprise with 3 emotional intensities (low, moderate, strong, peak). 
| Audio | -- | -- | -- | Restricted | [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) | -| [SEWA](https://db.sewaproject.eu/) | 2019 | more than 2000 minutes of audio-visual data of 398 people (201 male and 197 female) coming from 6 cultures. | emotions are characterized using valence and arousal. | Audio, Video | -- | Chinese, English, German, Greek, Hungarian and Serbian | [SEWA DB: A Rich Database for Audio-Visual Emotion and Sentiment Research in the Wild](https://arxiv.org/pdf/1901.02839.pdf) | Restricted | [SEWA EULA](https://db.sewaproject.eu/media/doc/eula.pdf) | -| [MELD](https://affective-meld.github.io/) | 2019 | 1400 dialogues and 14000 utterances from Friends TV series by multiple speakers. | 7 emotions: Anger, disgust, sadness, joy, neutral, surprise and fear. MELD also has sentiment (positive, negative and neutral) annotation for each utterance. | Audio, Video, Text | 10.1 GB | English | [MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversations](https://arxiv.org/pdf/1810.02508.pdf) | Open | [MELD: GPL-3.0 License](https://github.com/declare-lab/MELD/blob/master/LICENSE) | -| [ShEMO](https://github.com/mansourehk/ShEMO) | 2019 | 3000 semi-natural utterances, equivalent to 3 hours and 25 minutes of speech data from online radio plays by 87 native-Persian speakers. | 6 emotions: anger, fear, happiness, sadness, neutral and surprise. | Audio | 0.101 GB | Persian | [ShEMO: a large-scale validated database for Persian speech emotion detection](https://link.springer.com/article/10.1007/s10579-018-9427-x) | Open | -- | -| [DEMoS](https://zenodo.org/record/2544829) | 2019 | 9365 emotional and 332 neutral samples produced by 68 native speakers (23 females, 45 males). | 7/6 emotions: anger, sadness, happiness, fear, surprise, disgust, and the secondary emotion guilt. | Audio | -- | Italian | [DEMoS: An Italian emotional speech corpus. 
Elicitation methods, machine learning, and perception](https://link.springer.com/epdf/10.1007/s10579-019-09450-y?author_access_token=5pf0w_D4k9z28TM6n4PbVPe4RwlQNchNByi7wbcMAY5hiA-aXzXNbZYfsMDDq2CdHD-w5ArAxIwlsk2nC_26pSyEAcu1xlKJ1c9m3JZj2ZlFmlVoCZUTcG3Hq2_2ozMLo3Hq3Y0CHzLdTxihQwch5Q%3D%3D) | Restricted | EULA: End User License Agreement | -| [AESDD](http://m3c.web.auth.gr/research/aesdd-speech-emotion-recognition/) | 2018 | around 500 utterances by a diverse group of actors (over 5 actors) siumlating various emotions. | 5 emotions: anger, disgust, fear, happiness, and sadness. | Audio | 0.392 GB | Greek | [Speech Emotion Recognition for Performance Interaction](https://www.researchgate.net/publication/326005164_Speech_Emotion_Recognition_for_Performance_Interaction) | Open | -- | -| [Emov-DB](https://mega.nz/#F!KBp32apT!gLIgyWf9iQ-yqnWFUFuUHg!mYwUnI4K) | 2018 | Recordings for 4 speakers- 2 males and 2 females. | The emotional styles are neutral, sleepiness, anger, disgust and amused. | Audio | 5.88 GB | English | [The emotional voices database: Towards controlling the emotion dimension in voice generation systems](https://arxiv.org/pdf/1806.09514.pdf) | Open | -- | -| [RAVDESS](https://zenodo.org/record/1188976#.XrC7a5NKjOR) | 2018 | 7356 recordings by 24 actors. | 7 emotions: calm, happy, sad, angry, fearful, surprise, and disgust | Audio, Video | 24.8 GB | English | [The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0196391) | Open | [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) | -| [JL corpus](https://www.kaggle.com/tli725/jl-corpus) | 2018 | 2400 recording of 240 sentences by 4 actors (2 males and 2 females). | 5 primary emotions: angry, sad, neutral, happy, excited. 5 secondary emotions: anxious, apologetic, pensive, worried, enthusiastic. 
| Audio | -- | English | [An Open Source Emotional Speech Corpus for Human Robot Interaction Applications](https://www.isca-speech.org/archive/Interspeech_2018/pdfs/1349.pdf) | Open | [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/) | -| [CaFE](https://zenodo.org/record/1478765) | 2018 | 6 different sentences by 12 speakers (6 fmelaes + 6 males). | 7 emotions: happy, sad, angry, fearful, surprise, disgust and neutral. Each emotion is acted in 2 different intensities. | Audio | 2 GB | French (Canadian) | -- | Open | [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) | -| [EmoFilm](https://zenodo.org/record/1326428) | 2018 | 1115 audio instances sentences extracted from various films. | 5 emotions: anger, contempt, happiness, fear, and sadness. | Audio | -- | English, Italian & Spanish | [Categorical vs Dimensional Perception of Italian Emotional Speech](https://pdfs.semanticscholar.org/e70e/fcf7f5b4c366a7b7e2c16267d7f7691a5391.pdf) | Restricted | EULA: End User License Agreement | -| [ANAD](https://www.kaggle.com/suso172/arabic-natural-audio-dataset) | 2018 | 1384 recording by multiple speakers. | 3 emotions: angry, happy, surprised. | Audio | 2 GB | Arabic | [Arabic Natural Audio Dataset](https://data.mendeley.com/datasets/xm232yxf7t/1) | Open | [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) | -| [EmoSynth](https://zenodo.org/record/3727593) | 2018 | 144 audio file labelled by 40 listeners. | Emotion (no speech) defined in regard of valence and arousal. | Audio | 0.1034 GB | -- | [The Perceived Emotion of Isolated Synthetic Audio: The EmoSynth Dataset and Results](https://dl.acm.org/doi/10.1145/3243274.3243277) | Open | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) | -| [CMU-MOSEI](https://www.amir-zadeh.com/datasets) | 2018 | 65 hours of annotated video from more than 1000 speakers and 250 topics. | 6 Emotion (happiness, sadness, anger,fear, disgust, surprise) + Likert scale. 
| Audio, Video | -- | English | [Multi-attention Recurrent Network for Human Communication Comprehension](https://arxiv.org/pdf/1802.00923.pdf) | Open | [CMU-MOSEI License](https://github.com/A2Zadeh/CMU-MultimodalSDK/blob/master/LICENSE.txt) | -| [VERBO](https://sites.google.com/view/verbodatabase/home) | 2018 | 14 different phrases by 12 speakers (6 female + 6 male) for a total of 1167 recordings. | 7 emotions: Happiness, Disgust, Fear, Neutral, Anger, Surprise, Sadness | Audio | -- | Portuguese | [VERBO: Voice Emotion Recognition dataBase in Portuguese Language](https://thescipub.com/pdf/jcssp.2018.1420.1430.pdf) | Restricted | Available for research purposes only | -| [CMU-MOSI](https://www.amir-zadeh.com/datasets) | 2017 | 2199 opinion utterances with annotated sentiment. | Sentiment annotated between very negative to very positive in seven Likert steps. | Audio, Video | -- | English | [Multi-attention Recurrent Network for Human Communication Comprehension](https://arxiv.org/pdf/1802.00923.pdf) | Open | [CMU-MOSI License](https://github.com/A2Zadeh/CMU-MultimodalSDK/blob/master/LICENSE.txt) | -| [MSP-IMPROV](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Improv.html) | 2017 | 20 sentences by 12 actors. | 4 emotions: angry, sad, happy, neutral, other, without agreement | Audio, Video | -- | English | [MSP-IMPROV: An Acted Corpus of Dyadic Interactions to Study Emotion Perception](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Busso_2017.pdf) | Restricted | Academic License & Commercial License | -| [CREMA-D](https://github.com/CheyneyComputerScience/CREMA-D) | 2017 | 7442 clip of 12 sentences spoken by 91 actors (48 males and 43 females). 
| 6 emotions: angry, disgusted, fearful, happy, neutral, and sad | Audio, Video | -- | English | [CREMA-D: Crowd-sourced Emotional Multimodal Actors Dataset](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4313618/) | Open | [Open Database License & Database Content License](https://github.com/CheyneyComputerScience/CREMA-D/blob/master/LICENSE.txt) | -| [Example emotion videos used in investigation of emotion perception in schizophrenia](https://espace.library.uq.edu.au/view/UQ:446541) | 2017 | 6 videos:Two example videos from each emotion category (angry, happy and neutral) by one female speaker. | 3 emotions: angry, happy and neutral. | Audio, Video | 0.063 GB | English | -- | Open | [Permitted Non-commercial Re-use with Acknowledgment](https://guides.library.uq.edu.au/deposit_your_data/terms_and_conditions) | -| [EMOVO](http://voice.fub.it/activities/corpora/emovo/index.html) | 2014 | 6 actors who played 14 sentences. | 6 emotions: disgust, fear, anger, joy, surprise, sadness. | Audio | 0.355 GB | Italian | [EMOVO Corpus: an Italian Emotional Speech Database](https://core.ac.uk/download/pdf/53857389.pdf) | Open | -- | -| [RECOLA](https://diuf.unifr.ch/main/diva/recola/download.html) | 2013 | 3.8 hours of recordings by 46 participants. | negative and positive sentiment (valence and arousal). | Audio, Video | -- | -- | [Introducing the RECOLA Multimodal Corpus of Remote Collaborative and Affective Interactions](https://drive.google.com/file/d/0B2V_I9XKBODhNENKUnZWNFdVXzQ/view) | Restricted | Academic License & Commercial License | -| [GEMEP corpus](https://www.unige.ch/cisa/gemep) | 2012 | Videos10 actors portraying 10 states. | 12 emotions: amusement, anxiety, cold anger (irritation), despair, hot anger (rage), fear (panic), interest, joy (elation), pleasure(sensory), pride, relief, and sadness. Plus, 5 additional emotions: admiration, contempt, disgust, surprise, and tenderness. 
| Audio, Video | -- | French | [Introducing the Geneva Multimodal Expression Corpus for Experimental Research on Emotion Perception](https://www.researchgate.net/publication/51796867_Introducing_the_Geneva_Multimodal_Expression_Corpus_for_Experimental_Research_on_Emotion_Perception) | Restricted | -- | -| [OGVC](https://sites.google.com/site/ogcorpus/home/en) | 2012 | 9114 spontaneous utterances and 2656 acted utterances by 4 professional actors (two male and two female). | 9 emotional states: fear, surprise, sadness, disgust, anger, anticipation, joy, acceptance and the neutral state. | Audio | -- | Japanese | [Naturalistic emotional speech collectionparadigm with online game and its psychological and acoustical assessment](https://www.jstage.jst.go.jp/article/ast/33/6/33_E1175/_pdf) | Restricted | -- | -| [LEGO corpus](https://www.ultes.eu/ressources/lego-spoken-dialogue-corpus/) | 2012 | 347 dialogs with 9,083 system-user exchanges. | Emotions classified as garbage, non-angry, slightly angry and very angry. | Audio | 1.1 GB | -- | [A Parameterized and Annotated Spoken Dialog Corpus of the CMU Let’s Go Bus Information System](http://www.lrec-conf.org/proceedings/lrec2012/pdf/333_Paper.pdf) | Open | License available with the data. Free of charges for research purposes only. | -| [SEMAINE](https://semaine-db.eu/) | 2012 | 95 dyadic conversations from 21 subjects. Each subject converses with another playing one of four characters with emotions. | 5 FeelTrace annotations: activation, valence, dominance, power, intensity | Audio, Video, Text | 104 GB | English | [The SEMAINE Database: Annotated Multimodal Records of Emotionally Colored Conversations between a Person and a Limited Agent](https://ieeexplore.ieee.org/document/5959155) | Restricted | Academic EULA | -| [SAVEE](http://kahlan.eps.surrey.ac.uk/savee/Database.html) | 2011 | 480 British English utterances by 4 males actors. | 7 emotions: anger, disgust, fear, happiness, sadness, surprise and neutral. 
| Audio, Video | -- | English (British) | [Multimodal Emotion Recognition](http://personal.ee.surrey.ac.uk/Personal/P.Jackson/pub/ma10/HaqJackson_MachineAudition10_approved.pdf) | Restricted | Free of charges for research purposes only. | -| [TESS](https://tspace.library.utoronto.ca/handle/1807/24487) | 2010 | 2800 recording by 2 actresses. | 7 emotions: anger, disgust, fear, happiness, pleasant surprise, sadness, and neutral. | Audio | -- | English | [BEHAVIOURAL FINDINGS FROM THE TORONTO EMOTIONAL SPEECH SET](https://www.semanticscholar.org/paper/BEHAVIOURAL-FINDINGS-FROM-THE-TORONTO-EMOTIONAL-SET-Dupuis-Pichora-Fuller/d7f746b3aee801a353b6929a65d9a34a68e71c6f/figure/2) | Open | [CC BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/) | -| [EEKK](https://metashare.ut.ee/repository/download/4d42d7a8463411e2a6e4005056b40024a19021a316b54b7fb707757d43d1a889/) | 2007 | 26 text passage read by 10 speakers. | 4 main emotions: joy, sadness, anger and neutral. | -- | 0.352 GB | Estonian | [Estonian Emotional Speech Corpus](https://www.researchgate.net/publication/261724574_Estonian_Emotional_Speech_Corpus_Release_1) | Open | [CC-BY license](https://metashare.ut.ee/repository/download/4d42d7a8463411e2a6e4005056b40024a19021a316b54b7fb707757d43d1a889/) | -| [IEMOCAP](https://sail.usc.edu/iemocap/iemocap_release.htm) | 2007 | 12 hours of audiovisual data by 10 actors. | 5 emotions: happiness, anger, sadness, frustration and neutral. | -- | -- | English | [IEMOCAP: Interactive emotional dyadic motion capture database](https://sail.usc.edu/iemocap/Busso_2008_iemocap.pdf) | Restricted | [IEMOCAP license](https://sail.usc.edu/iemocap/Data_Release_Form_IEMOCAP.pdf) | -| [Keio-ESD](http://research.nii.ac.jp/src/en/Keio-ESD.html) | 2006 | A set of human speech with vocal emotion spoken by a Japanese male speaker. | 47 emotions including angry, joyful, disgusting, downgrading, funny, worried, gentle, relief, indignation, shameful, etc. 
| Audio | -- | Japanese | [EMOTIONAL SPEECH SYNTHESIS USING SUBSPACE CONSTRAINTS IN PROSODY](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.420.8899&rep=rep1&type=pdf) | Restricted | Available for research purposes only. | -| [EMO-DB](http://emodb.bilderbar.info/index-1280.html) | 2005 | 800 recording spoken by 10 actors (5 males and 5 females). | 7 emotions: anger, neutral, fear, boredom, happiness, sadness, disgust. | Audio | -- | German | [A Database of German Emotional Speech](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.130.8506&rep=rep1&type=pdf) | Open | -- | -| [eNTERFACE05](http://www.enterface.net/enterface05/docs/results/databases/project2_database.zip) | 2005 | Videos by 42 subjects, coming from 14 different nationalities. | 6 emotions: anger, fear, surprise, happiness, sadness and disgust. | Audio, Video | 0.8 GB | German | -- | Open | Free of charges for research purposes only. | -| [DES](http://kom.aau.dk/~tb/speech/Emotions/) | 2002 | 4 speakers (2 males and 2 females). 
| 5 emotions: neutral, surprise, happiness, sadness and anger | -- | -- | Danish | [Documentation of the Danish Emotional Speech Database](http://kom.aau.dk/~tb/speech/Emotions/des.pdf) | -- | -- |## References +| Dataset | Year | Content | Emotions | Format | Size | Language | Paper | Access | License | +|:--------------------------------------------------------------------------------------------------------------------------------------------------|:----------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------|:------------------------|:------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------|:------------------------------------------------------------------------------------------------------------------------------------------| +| [Audio-Speech-Sentiment](https://www.kaggle.com/imsparsh/audio-speech-sentiment-analysis) | 2021 | Audio Speech Sentiment Dataset | 4 emotions provides audio recordings of spoken sentences for anger, happiness, sadness, and neutral emotions. 
| Audio | 1.1 GB | English | -- | Open | [CC0: Public Domain](https://creativecommons.org/publicdomain/zero/1.0/) | +| [PMEmo](https://github.com/HuiZhangDB/PMEmo) | 2019 | PMEmo: A Multimodal Dataset for Emotion Recognition in Chronic Pain Patients | 4 emotions (happy, sad, angry, and neutral): audio and visual recordings of participants with chronic pain performing physical activities and self-reporting their emotional state. | Audio, Video, Annotations | 17.8 GB | English | [PMEmo: A Multimodal Dataset for Emotion Recognition in Chronic Pain Patients](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6453745/) | Open | [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/) | +| [Quechua-SER](https://figshare.com/articles/media/Quechua_Collao_for_Speech_Emotion_Recognition/20292516) | 2022 | 12420 audio recordings (~15 hours) and their transcriptions by 7 native speakers. | Emotional labels using dimensions: valence, arousal, and dominance. | Audio | 3.53 GB | Quechua Collao | [A speech corpus of Quechua Collao for automatic dimensional emotion recognition](https://www.nature.com/articles/s41597-022-01855-9) | Open | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) | +| [MESD](https://data.mendeley.com/datasets/cy34mh68j9/5) | 2022 | 864 audio files of single-word emotional utterances with Mexican cultural shaping. | 6 emotions in single-word utterances: anger, disgust, fear, happiness, neutral, and sadness.
| Audio | 0.097 GB | Spanish (Mexican) | [The Mexican Emotional Speech Database (MESD): elaboration and assessment based on machine learning](https://pubmed.ncbi.nlm.nih.gov/34891601/) | Open | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) | +| [SyntAct](https://zenodo.org/record/6573016#.ZAjy_9LMJpj) | 2022 | Synthesized database of three basic emotions and neutral expression based on rule-based manipulation for a diphone synthesizer which we release to the public | 997 utterances including 6 emotions: angry, bored, happy, neutral, sad and scared | Audio | 941 MB | German | [SyntAct: A Synthesized Database of Basic Emotions](http://felix.syntheticspeech.de/publications/synthetic_database.pdf) | Open | [CC BY-SA 4.0](https://creativecommons.org/licenses/by/4.0) | +| [LSSED](https://github.com/tobefans/LSSED) | 2021 | LSSED: A Large-Scale Dataset and Benchmark for Speech Emotion Recognition | Anger, happiness, sadness, disappointment, boredom, disgust, excitement, fear, surprise, normal, and other. | Audio | 90 GB | English | [LSSED: A Large-Scale Dataset and Benchmark for Speech Emotion Recognition](https://arxiv.org/abs/2102.01754) | Restricted | [EULA](https://github.com/tobefans/LSSED/blob/main/EULA.pdf) | +| [MLEnd](https://www.kaggle.com/datasets/jesusrequena/mlend-spoken-numerals) | 2021 | ~32700 audio recording files produced by 154 speakers. Each audio recording corresponds to one English numeral (from "zero" to "billion") | Intonations: neutral, bored, excited and question | Audio | 2.27 GB | -- | -- | Open | Unknown | +| [ASVP-ESD](https://www.kaggle.com/datasets/dejolilandry/asvpesdspeech-nonspeech-emotional-utterances) | 2021 | ~13285 audio files collected from movies, TV shows and YouTube containing speech and non-speech. | 12 different natural emotions (boredom, neutral, happiness, sadness, anger, fear, surprise, disgust, excitement, pleasure, pain, disappointment) with 2 levels of intensity.
| Audio | 2 GB | Chinese, English, French, Russian and others | -- | Open | Unknown | +| [ESD](https://hltsingapore.github.io/ESD/) | 2021 | 29 hours, 3500 sentences, by 10 native English speakers and 10 native Chinese speakers. | 5 emotions: angry, happy, neutral, sad, and surprise. | Audio, Text | 2.4 GB (zip) | Chinese, English | [Seen And Unseen Emotional Style Transfer For Voice Conversion With A New Emotional Speech Dataset](https://arxiv.org/pdf/2010.14794.pdf) | Open | Academic License | +| [MuSe-CAR](https://zenodo.org/record/4134758) | 2021 | 40 hours, 6,000+ recordings of 25,000+ sentences by 70+ English speakers (see db link for details). | continuous emotion dimensions characterized using valence, arousal, and trustworthiness. | Audio, Video, Text | 15 GB | English | [The Multimodal Sentiment Analysis in Car Reviews (MuSe-CaR) Dataset: Collection, Insights and Improvements](https://arxiv.org/pdf/2101.06053.pdf) | Restricted | Academic License & Commercial License | +| [THAI SER](https://github.com/vistec-AI/dataset-releases/releases/tag/v1) | 2021 | The recordings are 41 hours, 36 minutes long (27,854 utterances), and were performed by 200 professional actors (112 female, 88 male). | 5 main emotions assigned to actors: Neutral, Anger, Happiness, Sadness, and Frustration. | Audio | 12 GB | Thai | -- | Open | [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0) | +| [French Emotional Speech Database - Oréau](https://zenodo.org/records/4405783#.Yqjq_9JBxph) | 2020 | 79 utterances with 10 to 13 utterances per emotion by 32 non-professional speakers. | 7 emotions: sadness, anger, disgust, fear, surprise, joy, neutral. | Audio | 0.264 GB | French | -- | Open | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) | +| [Att-HACK](http://www.openslr.org/88/) | 2020 | 25 speakers interpreting 100 utterances in 4 social attitudes, with 3-5 repetitions each per attitude for a total of around 30 hours of speech.
| expressive speech in French, 100 phrases with multiple versions (3 to 5) in four social attitudes (friendly, distant, dominant and seductive). | Audio | 6.6 GB | French | [Att-HACK: An Expressive Speech Database with Social Attitudes](https://arxiv.org/abs/2004.04410) | Open | [CC BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/) | +| [MSP-Podcast corpus](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) | 2020 | 100 hours by over 100 speakers (see db link for details). | This corpus is annotated with emotional labels using attribute-based descriptors (activation, dominance and valence) and categorical labels (anger, happiness, sadness, disgust, surprised, fear, contempt, neutral and other). | Audio | -- | -- | [The MSP-Conversation Corpus](http://www.interspeech2020.org/index.php?m=content&c=index&a=show&catid=290&id=684) | Restricted | Academic License & Commercial License | +| [BEASC](https://doi.org/10.6084/m9.figshare.12498033) | 2020 | Bangla Emotional Audio-Speech Corpus | 6 emotions provides Bangla spoken utterances for anger, happiness, sadness, fear, surprise, and neutral. | Audio | 9 GB | Bangla | [BEASC: Bangla Emotional Audio-Speech Corpus - A Speech Emotion Recognition Corpus for the Low-Resource Bangla Language](https://easy.dans.knaw.nl/ui/datasets/id/easy-dataset:236649) | Open | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) | +| [emotiontts open db](https://github.com/emotiontts/emotiontts_open_db) | 2020 | Recordings and their associated transcriptions by a diverse group of speakers. | 4 emotions: general, joy, anger, and sadness. | Audio, Text | -- | Korean | -- | Partially open | [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) | +| [URDU-Dataset](https://github.com/siddiquelatif/urdu-dataset) | 2020 | 400 utterances by 38 speakers (27 male and 11 female). | 4 emotions: angry, happy, neutral, and sad. 
| Audio | 0.072 GB | Urdu | [Cross Lingual Speech Emotion Recognition: Urdu vs. Western Languages](https://arxiv.org/pdf/1812.10411.pdf) | Open | -- | +| [BAVED](https://www.kaggle.com/a13x10/basic-arabic-vocal-emotions-dataset) | 2020 | 1935 recording by 61 speakers (45 male and 16 female). | 3 levels of emotion. | Audio | 0.195 GB | Arabic | -- | Open | -- | +| [VIVAE](https://zenodo.org/record/4066235) | 2020 | non-speech, 1085 audio file by 12 speakers. | non-speech 6 emotions: achievement, anger, fear, pain, pleasure, and surprise with 3 emotional intensities (low, moderate, strong, peak). | Audio | -- | -- | -- | Restricted | [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) | +| [SEWA](https://db.sewaproject.eu/) | 2019 | more than 2000 minutes of audio-visual data of 398 people (201 male and 197 female) coming from 6 cultures. | emotions are characterized using valence and arousal. | Audio, Video | -- | Chinese, English, German, Greek, Hungarian and Serbian | [SEWA DB: A Rich Database for Audio-Visual Emotion and Sentiment Research in the Wild](https://arxiv.org/pdf/1901.02839.pdf) | Restricted | [SEWA EULA](https://db.sewaproject.eu/media/doc/eula.pdf) | +| [MELD](https://affective-meld.github.io/) | 2019 | 1400 dialogues and 14000 utterances from Friends TV series by multiple speakers. | 7 emotions: Anger, disgust, sadness, joy, neutral, surprise and fear. MELD also has sentiment (positive, negative and neutral) annotation for each utterance. | Audio, Video, Text | 10.1 GB | English | [MELD: A Multimodal Multi-Party Dataset for Emotion Recognition in Conversations](https://arxiv.org/pdf/1810.02508.pdf) | Open | [MELD: GPL-3.0 License](https://github.com/declare-lab/MELD/blob/master/LICENSE) | +| [ShEMO](https://github.com/mansourehk/ShEMO) | 2019 | 3000 semi-natural utterances, equivalent to 3 hours and 25 minutes of speech data from online radio plays by 87 native-Persian speakers. 
| 6 emotions: anger, fear, happiness, sadness, neutral and surprise. | Audio | 0.101 GB | Persian | [ShEMO: a large-scale validated database for Persian speech emotion detection](https://link.springer.com/article/10.1007/s10579-018-9427-x) | Open | -- | +| [DEMoS](https://zenodo.org/record/2544829) | 2019 | 9365 emotional and 332 neutral samples produced by 68 native speakers (23 females, 45 males). | 7/6 emotions: anger, sadness, happiness, fear, surprise, disgust, and the secondary emotion guilt. | Audio | -- | Italian | [DEMoS: An Italian emotional speech corpus. Elicitation methods, machine learning, and perception](https://link.springer.com/epdf/10.1007/s10579-019-09450-y?author_access_token=5pf0w_D4k9z28TM6n4PbVPe4RwlQNchNByi7wbcMAY5hiA-aXzXNbZYfsMDDq2CdHD-w5ArAxIwlsk2nC_26pSyEAcu1xlKJ1c9m3JZj2ZlFmlVoCZUTcG3Hq2_2ozMLo3Hq3Y0CHzLdTxihQwch5Q%3D%3D) | Restricted | EULA: End User License Agreement | +| [AESDD](http://m3c.web.auth.gr/research/aesdd-speech-emotion-recognition/) | 2018 | around 500 utterances by a diverse group of actors (over 5 actors) simulating various emotions. | 5 emotions: anger, disgust, fear, happiness, and sadness. | Audio | 0.392 GB | Greek | [Speech Emotion Recognition for Performance Interaction](https://www.researchgate.net/publication/326005164_Speech_Emotion_Recognition_for_Performance_Interaction) | Open | -- | +| [Emov-DB](https://mega.nz/#F!KBp32apT!gLIgyWf9iQ-yqnWFUFuUHg!mYwUnI4K) | 2018 | Recordings for 4 speakers- 2 males and 2 females. | The emotional styles are neutral, sleepiness, anger, disgust and amused. | Audio | 5.88 GB | English | [The emotional voices database: Towards controlling the emotion dimension in voice generation systems](https://arxiv.org/pdf/1806.09514.pdf) | Open | -- | +| [RAVDESS](https://zenodo.org/record/1188976#.XrC7a5NKjOR) | 2018 | 7356 recordings by 24 actors.
| 7 emotions: calm, happy, sad, angry, fearful, surprise, and disgust | Audio, Video | 24.8 GB | English | [The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0196391) | Open | [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) | +| [JL corpus](https://www.kaggle.com/tli725/jl-corpus) | 2018 | 2400 recordings of 240 sentences by 4 actors (2 males and 2 females). | 5 primary emotions: angry, sad, neutral, happy, excited. 5 secondary emotions: anxious, apologetic, pensive, worried, enthusiastic. | Audio | -- | English | [An Open Source Emotional Speech Corpus for Human Robot Interaction Applications](https://www.isca-speech.org/archive/Interspeech_2018/pdfs/1349.pdf) | Open | [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/) | +| [CaFE](https://zenodo.org/record/1478765) | 2018 | 6 different sentences by 12 speakers (6 females + 6 males). | 7 emotions: happy, sad, angry, fearful, surprise, disgust and neutral. Each emotion is acted in 2 different intensities. | Audio | 2 GB | French (Canadian) | -- | Open | [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) | +| [EmoFilm](https://zenodo.org/record/1326428) | 2018 | 1115 audio instances sentences extracted from various films. | 5 emotions: anger, contempt, happiness, fear, and sadness. | Audio | -- | English, Italian & Spanish | [Categorical vs Dimensional Perception of Italian Emotional Speech](https://pdfs.semanticscholar.org/e70e/fcf7f5b4c366a7b7e2c16267d7f7691a5391.pdf) | Restricted | EULA: End User License Agreement | +| [ANAD](https://www.kaggle.com/suso172/arabic-natural-audio-dataset) | 2018 | 1384 recordings by multiple speakers. | 3 emotions: angry, happy, surprised.
| Audio | 2 GB | Arabic | [Arabic Natural Audio Dataset](https://data.mendeley.com/datasets/xm232yxf7t/1) | Open | [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) | +| [EmoSynth](https://zenodo.org/record/3727593) | 2018 | 144 audio file labelled by 40 listeners. | Emotion (no speech) defined in regard of valence and arousal. | Audio | 0.1034 GB | -- | [The Perceived Emotion of Isolated Synthetic Audio: The EmoSynth Dataset and Results](https://dl.acm.org/doi/10.1145/3243274.3243277) | Open | [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) | +| [CMU-MOSEI](https://www.amir-zadeh.com/datasets) | 2018 | 65 hours of annotated video from more than 1000 speakers and 250 topics. | 6 Emotion (happiness, sadness, anger,fear, disgust, surprise) + Likert scale. | Audio, Video | -- | English | [Multi-attention Recurrent Network for Human Communication Comprehension](https://arxiv.org/pdf/1802.00923.pdf) | Open | [CMU-MOSEI License](https://github.com/A2Zadeh/CMU-MultimodalSDK/blob/master/LICENSE.txt) | +| [VERBO](https://sites.google.com/view/verbodatabase/home) | 2018 | 14 different phrases by 12 speakers (6 female + 6 male) for a total of 1167 recordings. | 7 emotions: Happiness, Disgust, Fear, Neutral, Anger, Surprise, Sadness | Audio | -- | Portuguese | [VERBO: Voice Emotion Recognition dataBase in Portuguese Language](https://thescipub.com/pdf/jcssp.2018.1420.1430.pdf) | Restricted | Available for research purposes only | +| [CMU-MOSI](https://www.amir-zadeh.com/datasets) | 2017 | 2199 opinion utterances with annotated sentiment. | Sentiment annotated between very negative to very positive in seven Likert steps. 
| Audio, Video | -- | English | [Multi-attention Recurrent Network for Human Communication Comprehension](https://arxiv.org/pdf/1802.00923.pdf) | Open | [CMU-MOSI License](https://github.com/A2Zadeh/CMU-MultimodalSDK/blob/master/LICENSE.txt) | +| [MSP-IMPROV](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Improv.html) | 2017 | 20 sentences by 12 actors. | 4 emotions: angry, sad, happy, neutral, other, without agreement | Audio, Video | -- | English | [MSP-IMPROV: An Acted Corpus of Dyadic Interactions to Study Emotion Perception](https://ecs.utdallas.edu/research/researchlabs/msp-lab/publications/Busso_2017.pdf) | Restricted | Academic License & Commercial License | +| [CREMA-D](https://github.com/CheyneyComputerScience/CREMA-D) | 2017 | 7442 clip of 12 sentences spoken by 91 actors (48 males and 43 females). | 6 emotions: angry, disgusted, fearful, happy, neutral, and sad | Audio, Video | -- | English | [CREMA-D: Crowd-sourced Emotional Multimodal Actors Dataset](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4313618/) | Open | [Open Database License & Database Content License](https://github.com/CheyneyComputerScience/CREMA-D/blob/master/LICENSE.txt) | +| [Example emotion videos used in investigation of emotion perception in schizophrenia](https://espace.library.uq.edu.au/view/UQ:446541) | 2017 | 6 videos:Two example videos from each emotion category (angry, happy and neutral) by one female speaker. | 3 emotions: angry, happy and neutral. | Audio, Video | 0.063 GB | English | -- | Open | [Permitted Non-commercial Re-use with Acknowledgment](https://guides.library.uq.edu.au/deposit_your_data/terms_and_conditions) | +| [EMOVO](http://voice.fub.it/activities/corpora/emovo/index.html) | 2014 | 6 actors who played 14 sentences. | 6 emotions: disgust, fear, anger, joy, surprise, sadness. 
| Audio | 0.355 GB | Italian | [EMOVO Corpus: an Italian Emotional Speech Database](https://core.ac.uk/download/pdf/53857389.pdf) | Open | -- | +| [RECOLA](https://diuf.unifr.ch/main/diva/recola/download.html) | 2013 | 3.8 hours of recordings by 46 participants. | negative and positive sentiment (valence and arousal). | Audio, Video | -- | -- | [Introducing the RECOLA Multimodal Corpus of Remote Collaborative and Affective Interactions](https://drive.google.com/file/d/0B2V_I9XKBODhNENKUnZWNFdVXzQ/view) | Restricted | Academic License & Commercial License | +| [GEMEP corpus](https://www.unige.ch/cisa/gemep) | 2012 | Videos of 10 actors portraying 10 states. | 12 emotions: amusement, anxiety, cold anger (irritation), despair, hot anger (rage), fear (panic), interest, joy (elation), pleasure (sensory), pride, relief, and sadness. Plus, 5 additional emotions: admiration, contempt, disgust, surprise, and tenderness. | Audio, Video | -- | French | [Introducing the Geneva Multimodal Expression Corpus for Experimental Research on Emotion Perception](https://www.researchgate.net/publication/51796867_Introducing_the_Geneva_Multimodal_Expression_Corpus_for_Experimental_Research_on_Emotion_Perception) | Restricted | -- | +| [OGVC](https://sites.google.com/site/ogcorpus/home/en) | 2012 | 9114 spontaneous utterances and 2656 acted utterances by 4 professional actors (two male and two female). | 9 emotional states: fear, surprise, sadness, disgust, anger, anticipation, joy, acceptance and the neutral state. | Audio | -- | Japanese | [Naturalistic emotional speech collection paradigm with online game and its psychological and acoustical assessment](https://www.jstage.jst.go.jp/article/ast/33/6/33_E1175/_pdf) | Restricted | -- | +| [LEGO corpus](https://www.ultes.eu/ressources/lego-spoken-dialogue-corpus/) | 2012 | 347 dialogs with 9,083 system-user exchanges. | Emotions classified as garbage, non-angry, slightly angry and very angry.
| Audio | 1.1 GB | -- | [A Parameterized and Annotated Spoken Dialog Corpus of the CMU Let’s Go Bus Information System](http://www.lrec-conf.org/proceedings/lrec2012/pdf/333_Paper.pdf) | Open | License available with the data. Free of charges for research purposes only. | +| [SEMAINE](https://semaine-db.eu/) | 2012 | 95 dyadic conversations from 21 subjects. Each subject converses with another playing one of four characters with emotions. | 5 FeelTrace annotations: activation, valence, dominance, power, intensity | Audio, Video, Text | 104 GB | English | [The SEMAINE Database: Annotated Multimodal Records of Emotionally Colored Conversations between a Person and a Limited Agent](https://ieeexplore.ieee.org/document/5959155) | Restricted | Academic EULA | +| [SAVEE](http://kahlan.eps.surrey.ac.uk/savee/Database.html) | 2011 | 480 British English utterances by 4 males actors. | 7 emotions: anger, disgust, fear, happiness, sadness, surprise and neutral. | Audio, Video | -- | English (British) | [Multimodal Emotion Recognition](http://personal.ee.surrey.ac.uk/Personal/P.Jackson/pub/ma10/HaqJackson_MachineAudition10_approved.pdf) | Restricted | Free of charges for research purposes only. | +| [TESS](https://tspace.library.utoronto.ca/handle/1807/24487) | 2010 | 2800 recording by 2 actresses. | 7 emotions: anger, disgust, fear, happiness, pleasant surprise, sadness, and neutral. | Audio | -- | English | [BEHAVIOURAL FINDINGS FROM THE TORONTO EMOTIONAL SPEECH SET](https://www.semanticscholar.org/paper/BEHAVIOURAL-FINDINGS-FROM-THE-TORONTO-EMOTIONAL-SET-Dupuis-Pichora-Fuller/d7f746b3aee801a353b6929a65d9a34a68e71c6f/figure/2) | Open | [CC BY-NC-ND 4.0](https://creativecommons.org/licenses/by-nc-nd/4.0/) | +| [EEKK](https://metashare.ut.ee/repository/download/4d42d7a8463411e2a6e4005056b40024a19021a316b54b7fb707757d43d1a889/) | 2007 | 26 text passage read by 10 speakers. | 4 main emotions: joy, sadness, anger and neutral. 
| -- | 0.352 GB | Estonian | [Estonian Emotional Speech Corpus](https://www.researchgate.net/publication/261724574_Estonian_Emotional_Speech_Corpus_Release_1) | Open | [CC-BY license](https://metashare.ut.ee/repository/download/4d42d7a8463411e2a6e4005056b40024a19021a316b54b7fb707757d43d1a889/) | +| [IEMOCAP](https://sail.usc.edu/iemocap/iemocap_release.htm) | 2007 | 12 hours of audiovisual data by 10 actors. | 5 emotions: happiness, anger, sadness, frustration and neutral. | -- | -- | English | [IEMOCAP: Interactive emotional dyadic motion capture database](https://sail.usc.edu/iemocap/Busso_2008_iemocap.pdf) | Restricted | [IEMOCAP license](https://sail.usc.edu/iemocap/Data_Release_Form_IEMOCAP.pdf) | +| [Keio-ESD](http://research.nii.ac.jp/src/en/Keio-ESD.html) | 2006 | A set of human speech with vocal emotion spoken by a Japanese male speaker. | 47 emotions including angry, joyful, disgusting, downgrading, funny, worried, gentle, relief, indignation, shameful, etc. | Audio | -- | Japanese | [EMOTIONAL SPEECH SYNTHESIS USING SUBSPACE CONSTRAINTS IN PROSODY](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.420.8899&rep=rep1&type=pdf) | Restricted | Available for research purposes only. | +| [EMO-DB](http://emodb.bilderbar.info/index-1280.html) | 2005 | 800 recording spoken by 10 actors (5 males and 5 females). | 7 emotions: anger, neutral, fear, boredom, happiness, sadness, disgust. | Audio | -- | German | [A Database of German Emotional Speech](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.130.8506&rep=rep1&type=pdf) | Open | -- | +| [eNTERFACE05](http://www.enterface.net/enterface05/docs/results/databases/project2_database.zip) | 2005 | Videos by 42 subjects, coming from 14 different nationalities. | 6 emotions: anger, fear, surprise, happiness, sadness and disgust. | Audio, Video | 0.8 GB | German | -- | Open | Free of charges for research purposes only. 
| +| [DES](http://kom.aau.dk/~tb/speech/Emotions/) | 2002 | 4 speakers (2 males and 2 females). | 5 emotions: neutral, surprise, happiness, sadness and anger | -- | -- | Danish | [Documentation of the Danish Emotional Speech Database](http://kom.aau.dk/~tb/speech/Emotions/des.pdf) | -- | -- |## References - Swain, Monorama & Routray, Aurobinda & Kabisatpathy, Prithviraj, Databases, features and classifiers for speech emotion recognition: a review, International Journal of Speech Technology, [paper](https://www.researchgate.net/publication/322602563_Databases_features_and_classifiers_for_speech_emotion_recognition_a_review#pf19) - Dimitrios Ververidis and Constantine Kotropoulos, A State of the Art Review on Emotional Speech Databases, Artificial Intelligence & Information Analysis Laboratory, Department of Informatics Aristotle, University of Thessaloniki, [paper](http://poseidon.csd.auth.gr/papers/PUBLISHED/CONFERENCE/pdf/Ververidis2003b.pdf) diff --git a/src/generate_files.py b/src/generate_files.py index b8618cd..632b563 100644 --- a/src/generate_files.py +++ b/src/generate_files.py @@ -13,7 +13,7 @@ keys = ["Dataset", "Year", "Content", "Emotions", "Format", "Size", "Language", "Paper", "Access", "License", "Dataset-link", "Paper-link", "License-link"] header = ["Dataset", "Year", "Content", "Emotions", "Format", "Size", "Language", "Paper", "Access", "License"] -md_0 = """***Spoken Emotion Recognition Datasets:*** *A collection of datasets (count={0})""".format(len(content.items())) +md_0 = """***Speech Emotion Recognition (SER) Datasets:*** *A collection of datasets (count={0})""".format(len(content.items())) md_1 = """ for the purpose of emotion recognition/detection in speech. The table is chronologically ordered and includes a description of the content of each dataset along with the emotions included. 
The table can be browsed, sorted and searched under https://superkogito.github.io/SER-datasets/* @@ -43,7 +43,10 @@ print(" -> Generate Markdown Text") def format_md_link(label, link): - res = "[{0}]({1})".format(label, link) if "http" in link else label + if link and "http" in link: + res = "[{0}]({1})".format(label, link) + else: + res = label return res # tabulate @@ -68,7 +71,10 @@ def format_md_link(label, link): print(" -> Generate Restructured Text") def format_rst_link(label, link): - res = "`{0} <{1}>`_".format(label, link) if "http" in link else label + if link and "http" in link: + res = "`{0} <{1}>`_".format(label, link) + else: + res = label return res # tabulate diff --git a/src/ser-datasets.csv b/src/ser-datasets.csv index 9c2d332..5794e70 100644 --- a/src/ser-datasets.csv +++ b/src/ser-datasets.csv @@ -1,8 +1,10 @@ Dataset,Year,Content,Emotions,Format,Size,Language,Paper,Access,License +`Audio-Speech-Sentiment `_,2021,Audio Speech Sentiment Dataset,"4 emotions provides audio recordings of spoken sentences for anger, happiness, sadness, and neutral emotions.",Audio,1.1 GB,English,,Open,`CC0: Public Domain `_ +`PMEmo `_,2019,PMEmo: A Multimodal Dataset for Emotion Recognition in Chronic Pain Patients,"4 emotions provides audio and visual recordings of participants with chronic pain performing physical activities and self-reporting their emotional state for happy, sad, angry, and neutral emotions.","Audio, video, and annotations",17.8 GB,English,`PMEmo: A Multimodal Dataset for Emotion Recognition in Chronic Pain Patients `_,Open,`CC BY-SA 4.0 `_ `Quechua-SER `_,2022,12420 audio recordings (~15 hours) and their transcriptions by 7 native speakers.,"Emotional labels using dimensions: valence, arousal, and dominance.",Audio,3.53 GB,Quechua Collao,`A speech corpus of Quechua Collao for automatic dimensional emotion recognition `_,Open,`CC BY 4.0 `_ `MESD `_,2022,864 audio files of single-word emotional utterances with Mexican cultural shaping.,"6 emotions 
provides single-word utterances for anger, disgust, fear, happiness, neutral, and sadness.",Audio,"0,097 GB",Spanish (Mexican),`The Mexican Emotional Speech Database (MESD): elaboration and assessment based on machine learning `_,Open,`CC BY 4.0 `_ `SyntAct `_,2022,Synthesized database of three basic emotions and neutral expression based on rule-based manipulation for a diphone synthesizer which we release to the public ,"997 utterances including 6 emotions: angry, bored, happy, neutral, sad and scared",Audio,941 MB,German,`SyntAct: A Synthesized Database of Basic Emotions `_,Open,`CC BY-SA 4.0 `_ -`LSSED `_,2021,Large Scale Spanish Emotional Speech Database,"8 emotions provides Spanish spoken utterances for anger, boredom, disgust, fear, happiness, neutral, sadness, and surprise.",Audio,90 GB,Spanish (Castilian),`LSSED: A Large-Scale Spanish Emotional Speech Database for Speech Processing and Machine Learning `_,Open,`CC BY-SA 4.0 `_ +`LSSED `_,2021,LSSED: A Large-Scale Dataset and Benchmark for Speech Emotion Recognition,"Anger, happiness, sadness, disappointment, boredom, disgust, excitement, fear, surprise, normal, and other.",Audio,90 GB,English,`LSSED: A Large-Scale Spanish Emotional Speech Database for Speech Processing and Machine Learning `_,Restricted,`- `_ `MLEnd `_,2021,"~32700 audio recordings files produced by 154 speakers. 
Each audio recording corresponds to one English numeral (from ""zero"" to ""billion"")","Intonations: neutral, bored, excited and question",Audio,2.27 GB,--,--,Open,Unknown `ASVP-ESD `_,2021,"~13285 audio files collected from movies, tv shows and youtube containing speech and non-speech.","12 different natural emotions (boredom, neutral, happiness, sadness, anger, fear, surprise, disgust, excitement, pleasure, pain, disappointment) with 2 levels of intensity.",Audio,2 GB,"Chinese, English, French, Russian and others",--,Open,Unknown `ESD `_,2021,"29 hours, 3500 sentences, by 10 native English speakers and 10 native Chinese speakers.","5 emotions: angry, happy, neutral, sad, and surprise.","Audio, Text",2.4 GB (zip),"Chinese, English",`Seen And Unseen Emotional Style Transfer For Voice Conversion With A New Emotional Speech Dataset `_,Open,Academic License @@ -11,7 +13,7 @@ Dataset,Year,Content,Emotions,Format,Size,Language,Paper,Access,License `French Emotional Speech Database - Oréau `_,2020,79 utterances with 10 to 13 utterances pro emotion by 32 non-professional speakers.,"7 emotions: sadness, anger, disgust, fear, surprise, joy, neutral.",Audio,0.264 GB,French,--,Open,`CC BY 4.0 `_ `Att-HACK `_,2020,"25 speakers interpreting 100 utterances in 4 social attitudes, with 3-5 repetitions each per attitude for a total of around 30 hours of speech.","expressive speech in French, 100 phrases with multiple versions (3 to 5) in four social attitudes (friendly, distant, dominant and seductive).",Audio,6.6 GB,French,`Att-HACK: An Expressive Speech Database with Social Attitudes `_,Open,`CC BY-NC-ND 4.0 `_ `MSP-Podcast corpus `_,2020,100 hours by over 100 speakers (see db link for details).,"This corpus is annotated with emotional labels using attribute-based descriptors (activation, dominance and valence) and categorical labels (anger, happiness, sadness, disgust, surprised, fear, contempt, neutral and other).",Audio,--,--,`The MSP-Conversation Corpus 
`_,Restricted,Academic License & Commercial License -`BEASC `_,2020,Bangla Emotional Audio-Speech Corpus,"6 emotions provides Bangla spoken utterances for anger, happiness, sadness, fear, surprise, and neutral.",Audio,9 GB,Bangla,`BEASC: Bangla Emotional Audio-Speech Corpus - A Speech Emotion Recognition Corpus for the Low-Resource Bangla Language `_,Open,`CC BY 4.0 `_ +`BEASC `_,2020,Bangla Emotional Audio-Speech Corpus,"6 emotions provides Bangla spoken utterances for anger, happiness, sadness, fear, surprise, and neutral.",Audio,9 GB,Bangla,`BEASC: Bangla Emotional Audio-Speech Corpus - A Speech Emotion Recognition Corpus for the Low-Resource Bangla Language `_,Open,`CC BY 4.0 `_ `emotiontts open db `_,2020,Recordings and their associated transcriptions by a diverse group of speakers.,"4 emotions: general, joy, anger, and sadness.","Audio, Text",--,Korean,--,Partially open,`CC BY-NC-SA 4.0 `_ `URDU-Dataset `_,2020,400 utterances by 38 speakers (27 male and 11 female).,"4 emotions: angry, happy, neutral, and sad.",Audio,0.072 GB,Urdu,`Cross Lingual Speech Emotion Recognition: Urdu vs. 
Western Languages `_,Open,-- `BAVED `_,2020,1935 recording by 61 speakers (45 male and 16 female).,3 levels of emotion.,Audio,0.195 GB,Arabic,--,Open,-- diff --git a/src/ser-datasets.json b/src/ser-datasets.json index 42de5b6..9bfb67f 100644 --- a/src/ser-datasets.json +++ b/src/ser-datasets.json @@ -1,4 +1,32 @@ { + "Audio-Speech-Sentiment": { + "Year": 2021, + "Content": "Audio Speech Sentiment Dataset", + "Emotions": "4 emotions provides audio recordings of spoken sentences for anger, happiness, sadness, and neutral emotions.", + "Format": "Audio", + "Size": "1.1 GB", + "Language": "English", + "Paper": null, + "Access": "Open", + "License": "CC0: Public Domain", + "Dataset-link": "https://www.kaggle.com/imsparsh/audio-speech-sentiment-analysis", + "Paper-link": null, + "License-link": "https://creativecommons.org/publicdomain/zero/1.0/" + }, + "PMEmo": { + "Year": 2019, + "Content": "PMEmo: A Multimodal Dataset for Emotion Recognition in Chronic Pain Patients", + "Emotions": "4 emotions provides audio and visual recordings of participants with chronic pain performing physical activities and self-reporting their emotional state for happy, sad, angry, and neutral emotions.", + "Format": "Audio, video, and annotations", + "Size": "17.8 GB", + "Language": "English", + "Paper": "PMEmo: A Multimodal Dataset for Emotion Recognition in Chronic Pain Patients", + "Access": "Open", + "License": "CC BY-SA 4.0", + "Dataset-link": "https://github.com/HuiZhangDB/PMEmo", + "Paper-link": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6453745/", + "License-link": "https://creativecommons.org/licenses/by-sa/4.0/" + }, "Quechua-SER": { "Year": 2022, "Content": "12420 audio recordings (~15 hours) and their transcriptions by 7 native speakers.", @@ -43,17 +71,17 @@ }, "LSSED": { "Year": 2021, - "Content": "Large Scale Spanish Emotional Speech Database", - "Emotions": "8 emotions provides Spanish spoken utterances for anger, boredom, disgust, fear, happiness, neutral, sadness, and 
surprise.", + "Content": "LSSED: A Large-Scale Dataset and Benchmark for Speech Emotion Recognition", + "Emotions": "Anger, happiness, sadness, disappointment, boredom, disgust, excitement, fear, surprise, normal, and other.", "Format": "Audio", "Size": "90 GB", - "Language": "Spanish (Castilian)", + "Language": "English", "Paper": "LSSED: A Large-Scale Spanish Emotional Speech Database for Speech Processing and Machine Learning", - "Access": "Open", - "License": "CC BY-SA 4.0", + "Access": "Restricted", + "License": "-", "Dataset-link": "https://github.com/tobefans/LSSED", - "Paper-link": "https://www.mdpi.com/1424-8220/21/23/6985", - "License-link": "https://creativecommons.org/licenses/by-sa/4.0/" + "Paper-link": "https://arxiv.org/abs/2102.01754", + "License-link": "https://github.com/tobefans/LSSED/blob/main/EULA.pdf" }, "MLEnd": { "Year": 2021, @@ -178,7 +206,7 @@ "Access": "Open", "License": "CC BY 4.0", "Dataset-link": "https://doi.org/10.6084/m9.figshare.12498033", - "Paper-link": "https://www.mdpi.com/2076-3417/10/11/3704", + "Paper-link": "https://easy.dans.knaw.nl/ui/datasets/id/easy-dataset:236649", "License-link": "https://creativecommons.org/licenses/by/4.0/" }, "emotiontts open db": {