"""Snapshot the Polish ASR speech-datasets catalog and taxonomy sheets.

Each Google Sheet tab is downloaded as TSV and written twice, relative to
the current working directory:

* ``./snapshots/<kind>/pl-asr-speech-datasets-<kind>-<YYYYMMDD>.tsv``
* ``./snapshots/pl-asr-speech-datasets-<kind>-latest.tsv``

Requires ``pandas`` and ``requests``.
"""
import os
import re
import sys
from io import BytesIO

import pandas as pd
import requests

CATALOG_URL = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=0"
TAXONOMY_URL = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI/edit#gid=2015613057"

# (kind, sheet URL) pairs; ``kind`` is used in directory and file names.
SHEETS = (
    ("catalog", CATALOG_URL),
    ("taxonomy", TAXONOMY_URL),
)

SNAPSHOT_ROOT = "./snapshots"


def _export_url(sheet_url):
    """Turn a Google Sheet "edit" URL into its TSV export URL."""
    sheet_id = re.search(r"/spreadsheets/d/[A-Za-z0-9_-]+", sheet_url)
    gid = re.search(r"[#?&]gid=(\d+)", sheet_url)
    if sheet_id and gid:
        # Also covers variants such as ``/edit?usp=sharing#gid=0``; for the
        # plain ``/edit#gid=N`` form the result equals the literal replace.
        return f"{sheet_url[:sheet_id.end()]}/export?format=tsv&gid={gid.group(1)}"
    return sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')


def download_tsv_from_google_sheet(sheet_url, timeout=30):
    """Download one Google Sheet tab as TSV and load it into a DataFrame.

    Args:
        sheet_url: Browser ("edit") URL of the sheet tab, including ``gid``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A DataFrame whose cells are all strings exactly as exported, or
        ``None`` if the request failed or did not return HTTP 200.
    """
    tsv_url = _export_url(sheet_url)

    try:
        response = requests.get(tsv_url, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to download the TSV file.")
        print(f"Request error for {tsv_url}: {exc}")
        return None

    if response.status_code != 200:
        print("Failed to download the TSV file.")
        print(f"HTTP status {response.status_code} for {tsv_url}")
        return None

    # Parse the raw bytes as UTF-8 instead of ``response.text``: requests
    # falls back to ISO-8859-1 for text/* responses without a charset.
    # dtype=str + keep_default_na=False keep every cell verbatim; otherwise
    # pandas turns literal values such as "N/A", "NA" and "None" into NaN
    # (written back as empty cells) and may reformat numbers.
    return pd.read_csv(
        BytesIO(response.content),
        sep='\t',
        encoding='utf-8',
        dtype=str,
        keep_default_na=False,
    )


def _save_snapshot(df, kind, today):
    """Write ``df`` as the dated snapshot and as the "latest" file."""
    dated_dir = f"{SNAPSHOT_ROOT}/{kind}"
    # The target directories may not exist on a fresh checkout.
    os.makedirs(dated_dir, exist_ok=True)

    filename = f"pl-asr-speech-datasets-{kind}-{today}.tsv"
    filepath = os.path.join(dated_dir, filename)
    df.to_csv(filepath, sep='\t', index=False)
    print(f"Saved the DataFrame to {filepath}")

    # save also as the "latest" file
    latest_filepath = os.path.join(SNAPSHOT_ROOT, f"pl-asr-speech-datasets-{kind}-latest.tsv")
    df.to_csv(latest_filepath, sep='\t', index=False)
    print(f"Saved the DataFrame to {latest_filepath}")


def main():
    """Download and save every sheet; return a process exit code.

    Returns:
        0 if all sheets were saved, 1 if at least one download failed.
    """
    # Current date in the format YYYYMMDD, shared by all files of this run.
    today = pd.Timestamp.now().strftime("%Y%m%d")

    failed = []
    for kind, sheet_url in SHEETS:
        df = download_tsv_from_google_sheet(sheet_url)
        if df is None:
            # Keep the existing snapshot files untouched for this sheet.
            failed.append(kind)
            continue

        # Show one random row as a sanity check; sample() raises on an
        # empty frame, so skip it when the sheet has no data rows.
        if not df.empty:
            print(df.sample(1))

        _save_snapshot(df, kind, today)

    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())
info Appen ID: POL_ASR001 no info Appen no info no info European Union no info no info train, valid, test 25 25 2 99 10,130 no no info no info no info no info no info 100 no info no info no info general read controlled paid contributors, volunteers home, office studio mic Sennheiser 440-6 wav pcm 1 16000 16 yes 20-39 yes M53%-F47% yes no N/A no no no no +Appen Mobile Speech appen-mobile-unk paid https://appen.com/products/pre-labeled-datasets/ yes Proprietary Appen Appen Pre-Labelled Datasets pl-PL no info no info no info Appen ID: POL_ASR002_CN no info Appen no info no info Appen no info no info no info 293 293 no info 353 106,674 no no info 168,544 no info yes no info 100 no info no info no info news read controlled paid contributors various mobile phone no info wav pcm 1 16000 16 no info no info no info no info no no N/A no no no no +Appen SpeechDat Phone appen-speechdat-10 paid https://appen.com/products/pre-labeled-datasets/ yes Proprietary Appen Appen Pre-Labelled Datasets pl-PL 2010 no info no info Appen ID: Polish SpeechDat(E) Database https://www.phonetik.uni-muenchen.de/forschung/BITS/TP1/Cookbook/node1.html Appen no info no info European Union no info no info train, valid, test 78 78 no info 1000 48,000 no no info no info yes yes no info 100 no info no info no info multi-domain read controlled paid contributors quiet landline phone no info wav pcm 1 8000 16 no info no info no info no info no no N/A no no no no +BABEL elra-babel-98 paid http://catalog.elra.info/en-us/repository/browse/ELRA-S0307/ yes ELRA ELRA ELRA pl-PL 1998 376-102-726-476-0 no info ELRA-S0307 no info Valerie Mapelli no info 2010 European Union 600 EUR 6000 EUR train, valid, test 16 16 no info 60 no info no no info no info no info no info no info no info no info no info no info numbers no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no no N/A no no no no +Baza nagrań mowy AGH agh-corpus-may-15 no info no info no 
Proprietary AGH None pl-PL 2015 no info no info no info https://link.springer.com/article/10.1007/s10579-015-9302-y Piotr Żelasko no info no info AGH no info no info train, valid, test 25 25 no info 166 no info no 117 450 13 784 OpenSJP dictionary yes subset of unspecified size 100 no info no info MLF multi-domain read controlled no info no info various no info wav pcm 1 16000 16 no info 20-35 yes M66%-F-34% no no N/A no no no no +Baza nagrań mowy AGH dla systemu SARMATA agh-corpus-sep-15 no info no info no Proprietary AGH None pl-PL 2015 no info no info no info https://www.researchgate.net/publication/281774738_SARMATA_20_Automatic_Polish_Language_Speech_Recognition_System Piotr Żelasko no info no info AGH no info no info train, valid, test 42 42 no info 391 no info no no info no info OpenSJP dictionary yes subset of unspecified size 100 no info no info MLF multi-domain read controlled no info no info various no info wav pcm 1 16000 16 no info 20-35 yes M55%-F45% no no N/A no no no no +Clarin cyfry clarin-pjatk-cyfry-16 free https://clarin-pl.eu/dspace/bitstream/handle/11321/317/cyfry.zip?sequence=1&isAllowed=y yes CC-BY-SA PJATK DSpace CLARIN PL pl-PL 2016 no info no info http://hdl.handle.net/11321/317 no info Daniel Korzinek no info 2021 Polish Ministry of Science and Higher Education (N N104 205039) free free train, valid, test 1 1 0.12 25 488 no no info no info no info no info no info 100 no info no info no info digits read controlled no info quiet no info no info raw pcm 1 16000 16 no info N/A no N/A no no N/A no no no no +Clarin mobile clarin-pjatk-mobile-15 free https://clarin-pl.eu/DSpace/handle/11321/237 yes CC-BY PJATK DSpace CLARIN PL, VLO pl-PL 2015 no info no info no info no info Daniel Korzinek no info 2015 CLARIN-PL free free train, valid, test 13 13 1.5 to check 3,552 no no info no info no info no info no info 0 no info no info no info no info read controlled volunteers quiet landline phone no info wav pcm 1 16000 16 no info N/A no N/A no no N/A 
no no no no +Clarin studio clarin-pjatk-studio-15 free https://clarin-pl.eu/DSpace/handle/11321/236 yes CC-BY PJATK DSpace CLARIN PL, VLO pl-PL 2015 no info no info no info https://www.clarin.eu/sites/default/files/02%20-%20KORZINEK-Polish.pdf Daniel Korzinek no info 2015 CLARIN-PL free free train, valid, test 56 56 4 to check no info no no info no info no info no info no info 100 no info no info no info no info read controlled volunteers quiet studio mic no info wav pcm 1 16000 16 no info N/A no N/A no no N/A no no no no +Corpora put-corpora-97 no info NA no no info PUT None pl-PL 1997 no info no info no info https://www.isca-speech.org/archive_v0/archive_papers/eurospeech_1997/e97_1735.pdf Stefan Grocholewski no info no info no info no info no info no info 6 6 NA 45 365 no no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no no N/A no no no no +CSLU: 22 Languages Corpus ldc-clsu-pl-05 paid https://catalog.ldc.upenn.edu/LDC2005S26 yes LDC LDC LDC multi 2005 no info 1-58563-356-9 LDC2005S26 no info T. Lander or Linguistic Data Consortium no info no info no info 150 USD 150 USD no info 4 4 no info no info 2,500 no no info no info no info yes Check if prompt instructions were followed. Manual transcription of 30% of data. 
30 no info no info no info no info read controlled no info no info landline phone no info RIFF uLaw 1 8000 8 no info no info no info no info no no N/A no no no no +DiaBiz clarin-diabiz-22 paid http://docs.pelcra.pl/doku.php?id=diabiz yes Proprietary UL DSpace CLARIN PL pl-PL 2022 no info no info no info https://aclanthology.org/2022.lrec-1.76/ Piotr Pęzik no info no info European Union, Poland, Clarin-Biz no info no info train, valid, test 410 410 no info 196 3,764 no 447576 no info no info yes transcription 100 no info no info json, xml, txt customer service conversational controlled paid contributors quiet landline phone no info wav pcm 2 8000 16 no info no info yes no info no no N/A no no no yes +Diabiz eval clarin-diabiz-eval-22 paid http://docs.pelcra.pl/doku.php?id=diabiz yes Proprietary UL DSpace CLARIN PL pl-PL 2022 no info no info no info no info Piotr Pęzik no info no info European Union, Poland, Clarin-Biz no info no info test 41 41 no info 146 no info no no info no info no info yes transcription 100 no info no info json, xml, txt customer service conversational controlled paid contributors quiet landline phone no info wav pcm 2 8000 16 no info no info no info no info yes no N/A no no no yes +DiaBiz sample clarin-diabiz-sample-22 no info http://docs.pelcra.pl/doku.php?id=diabiz yes Proprietary UL DSpace CLARIN PL pl-PL 2022 no info no info no info no info Piotr Pęzik no info no info European Union, Poland, Clarin-Biz no info no info none 1 1 no info no info 18 no no info no info no info yes transcription 100 no info no info json, xml, txt customer service conversational controlled paid contributors quiet landline phone no info wav pcm 2 8000 16 no info no info no info no info yes no N/A no no no yes +EASR Corpora of European Portuguese, French, Hungarian and Polish Elderly Speech elra-easr-14 no info no info no no info ELRA ELRA multi 2014 no info no info no info http://www.lrec-conf.org/proceedings/lrec2014/pdf/365_Paper.pdf Artur Kolesiński no info no 
info European Union no info no info train, valid, test 205 205 no info 781 no info no no info no info no info yes transcription 100 yes no info no info multi-domain read controlled paid contributors quiet headset no info wav pcm 1 16000 16 no info yes, but elderly group only (over 60) yes yes yes 100 no no no no no +ELRA Global Phone Polish elra-gphone-elra-02 paid http://catalog.elra.info/en-us/repository/browse/ELRA-S0320/ yes ELRA ELRA ELRA multi 2002 350-930-795-617-4 no info ELRA-S0320 no info Valerie Mapelli no info no info European Union 700 EUR 3700 train, valid, test 25 25 2 99 10,130 no no info no info no info no info no info 100 no info no info no info general read controlled paid contributors, volunteers home, office studio mic Sennheiser 440-6 wav pcm 1 16000 16 yes 20-39 yes M53%-F47% no no N/A no no no no +EU Parliament clarin-pjatk-pinc-21 free https://clarin-pl.eu/DSpace/handle/11321/821 yes CC-BY PJATK DSpace CLARIN PL pl-PL 2021 no info no info http://hdl.handle.net/11321/821 no info Daniel Korzinek no info 2021 Polish Ministry of Science and Higher Education (N N104 205039) free free none 32 32 3.7 no info 1,040 yes no info no info no info no info no info 100 no info no info no info multi-domain public speech corpus public speakers various various no info wav pcm 1 16000 16 no N/A no N/A no no N/A no no no yes +Exmeralda hzsk-exmeralda-pl-07 free https://corpora.uni-hamburg.de/hzsk/de/islandora/object/spoken-corpus:demo-1.0#corpus-metadata no HZSK-PUB HZSK HZSK multi 2007 no info no info http://hdl.handle.net/11022/0000-0000-4F70-A no info HZSK no info 2009 no info free not available none no info no info no info 5 no info no no info no info no info no info no info 100 no info no info no info broadcast conversational corpus paid contributors broadcast lavalier mic no info wav pcm 1 44100 16 no info no info yes M50%-F50% no no N/A no no no no +Gewiss hzsk-gewiss-pl-12 free https://gewiss.uni-leipzig.de/index.php?id=about_gewiss&L=1 no HZSK-PUB 
HZSK HZSK multi 2012 no info no info no info no info no info no info no info no info free free none 20 20 no info 10 no info no no info no info no info yes no info 100 GAT2 no info exmeralda academic lecture public speech corpus volunteers quiet no info no info no info no info no info no info no info no info no info no info no info no no N/A no no no no +Hamburg bilingual hzsk-hamcopolig-11 free https://corpora.uni-hamburg.de/hzsk/de/islandora/object/spoken-corpus:hamcopolig yes HZSK-PUB HZSK CLARIN EU, VLO multi 2011 no info no info http://hdl.handle.net/11022/0000-0000-6969-5 https://benjamins.com/catalog/hsm.14.10cza Agnieszka Czachor 0.2 2012 no info free no info no info no info no info no info no info no info no no info no info no info no info no info no info no info no info no info multi-domain conversational controlled volunteers no info no info no info no info no info no info no info no info yes yes no info no info yes yes no no no no no +Jurisdic uam-jurisdic-08 no info no info no no info AMU None pl-PL 2008 no info no info no info http://www.lrec-conf.org/proceedings/lrec2008/pdf/326_paper.pdf Grażyna Demenko no info no info Poland no info no info train, valid, test 855 855 no info 1000 494,933 no no info no info yes yes transcription, annotation 100 yes no info no info multi-domain various various government agents quiet no info Sennheiser ME-3, AKG C-1000S, Sennheiser ew300G2 wav pcm 2 16000 16 no info no info yes no info yes yes no no no yes no +Korpus mowy szeptanej Politechniki Poznańskiej put-whisper-16 no info no info no no info PUT None pl-PL 2016 no info no info no info https://yadda.icm.edu.pl/baztech/element/bwmeta1.element.baztech-19695dfb-03d8-401f-bb88-c90abebe2bf5 Piotr Kozierski no info no info Polish Ministry of Higher Education no info no info no info 9 9 no info no info no info no no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no 
info no info no info no info no info no info no info no N/A no no no no +Korpus radiowy clarin-radio-21 free https://clarin-pl.eu/DSpace/handle/11321/820 yes CC-BY PJATK DSpace CLARIN PL pl-PL 2021 no info no info no info no info Łukasz Brocki no info 2021 Polish Ministry of Higher Education free free none 7 0 0.75 200 192 no no info no info no info no info no info 0 N/A no info N/A no info public speech corpus public speakers mixed various no info raw pcm 1 16000 16 no N/A no N/A no no N/A no no no no +LUNA pjatk-luna-07 no info http://nlp.ipipan.waw.pl/NLP-SEMINAR/070423.pdf no no info PJATK None pl-PL 2007 no info no info no info no info no info no info no info European Union no info no info no info 11 11 no info 500 500 no no info no info no info yes annotation 100 yes no info no info public transport conversational controlled volunteers no info landline phone no info no info no info no info no info no info no info no info no info no info no info no N/A no no no no +M-AILABS speech dataset mailabs-19 free https://data.solak.de/data/Training/stt_tts/pl_PL.tgz yes Proprietary M-AILABS Coqui Free Corpora Catalog multi 2019 no info no info no info no info Imdat Solak no info 2019 M-AILABS free free no info 54 54 4.2 no info no info no no info no info no info no info no info 100 yes no info yes multi-domain read corpus volunteers quiet various various wav pcm 1 16000 16 no N/A yes no info no no N/A no no no no +Mozilla Common Voice mozilla-comm-voice-20 free commonvoice.mozilla.org/ yes CC-0 Mozilla Foundation Common Voice multi 2020 no info no info no info https://arxiv.org/pdf/1912.06670.pdf Mozilla Org 9.0 2022 Mozilla Foundation free free train, valid, test 148 148 4 3062 no info no no info no info no info yes up to 3 binary validations of each recording (2 if first 2 validations are consistent) 100 no info no info no info multi-domain read controlled crowd various various multiple mp3 mpeg-3 1 48000 16 yes yes yes M60%-F14% no no N/A no no no no +Multilingual 
librispeech fair-mls-20 free https://www.openslr.org/94/ yes CC-BY FAIR Github multi 2020 no info no info SLR94 https://arxiv.org/abs/2012.03411 Facebook (Meta) 2 2020 Facebook free free train, valid, test 137 137 6.2 16 28,860 no 492320 67100 yes yes manually transcribed and annotated dev, test subsets 100 yes no info no info books read corpus volunteers various various no info flac opus 1 16000 16 no N/A yes no info no no N/A no no no no +Pelcra EMI pelcra-emi-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-GB 2018 no info no info PELCRA_EMI http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free valid 18 9 12 44 22 no 96000 no info no info no info no info 100 no info no info JSON, EAF, ELAN, EMU, TextGrid interview conversational controlled volunteers no info no info no info no info no info no info no info no info no info no info no info no info no info no N/A no no no no +Pelcra EMO pelcra-emo-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info PELCRA_EMO http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free train, valid, test 28 26 2.2 80 40 no 252000 no info no info no info no info 100 no info no info JSON, EAF, ELAN, EMU, TextGrid interview conversational controlled volunteers no info no info no info no info no info no info no info no info no info no info no info no info no info no N/A no no no no +Pelcra learner English corpus pelcra-plec-11 free http://pelcra.pl/plec/downloads_fs yes no info UL PELCRA en-PL 2011 no info no info no info “Towards the PELCRA Learner English Corpus.” In Corpus Data across Languages and Disciplines, edited by Piotr Pęzik, 28:33–42. Łódź Studies in Language. Peter Lang, 2012. 
Piotr Pęzik no info 2012 Polish Ministry of Science and Higher Education (N N104 205039) free no info none no info no info no info no info no info no no info no info yes yes annotation 100 no no info xls general read controlled volunteers quiet no info no info no info no info no info no info no info no info no info no info no info no info no N/A no no no no +Pelcra LUZ pelcra-luz-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info PELCRA_LUZ http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 20 20 13.2 42 21 no 213000 no info no info no info no info 100 no no info JSON, EAF, ELAN, EMU, TextGrid interview conversational controlled volunteers quiet no info no info no info no info no info no info no info no info no info no info no info no info no N/A no no no no +Pelcra Mowa Miasta Kraków pelcra-mmk-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info MMK http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 2 2 no info 11 4 no 15900 no info no info no info no info no info no no info no info conversations conversational corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no info no N/A no no no no +Pelcra Mowa Miasta Wrocław 1 pelcra-mmw-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info MMW_1 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 8 7 no info 65 14 no 60000 no info no info no info no info no info no no info no info conversations conversational corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no info no N/A no no no no +Pelcra Mowa Miasta Wrocław 2 pelcra-mmw2-18 free 
http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info MMW_2 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 7 7 no info 38 14 no 70000 no info no info no info no info no info no no info no info conversations conversational corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no no N/A no no no no +Pelcra PARL pelcra-parl-15 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info PELCRA_PARL http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 14 12 no info 251 48 no 99000 no info no info no info no info no info no no info no info parliament speech public speech corpus public speakers mixed no info no info no info no info no info no info no info no info no info no info no info no no N/A no no no no +Pelcra Spelling and NUmbers Voice database pelcra-snuv-12 free http://metashare.elda.org/repository/browse/spelling-and-numbers-voice-database/f9e499c663f111e2bff4525400d761477c36ad442d124e6892bb3c8ce1a1ecdf/ yes CC-BY UL PELCRA pl-PL 2012 no info no info no info no info Piotr Pęzik no info 2012 European Union, Poland free free no info 220 220 no info 210 99,517 no 704625 no info yes yes no info 100 no no info no info numbers read controlled paid contributors various headset no info WAV pcm 1 22050 16 no info no info no info no info yes no N/A no no no no +Pelcra YouTube 1 pelcra-yt1-20 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2020 no info no info PELCRA_YT1 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 7 5 no info 106 25 no 49000 no info no info no info no info no info no no info no info multi-domain various corpus public speakers various no info no info no info no info no info no info no info 
no info no info no info no info no no N/A no no no no +Pelcra YouTube 2 pelcra-yt2-20 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2020 no info no info PELCRA_YT2 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 6 5 no info 45 23 no 49000 no info no info no info no info no info no no info no info multi-domain various corpus public speakers mixed no info no info no info no info no info no info no info no info no info no info no info no no N/A no no no no +PolEval 2019 pjatk-poleval-19 free http://2019.poleval.pl/index.php/tasks/task5 yes no info PJATK None pl-PL 2019 no info no info no info http://2019.poleval.pl/files/2019/11.pdf Daniel Korzinek no info 2019 PJATK free free test 1 1 0.8 no info 29 no no info no info no info no info no info 0 UTF-8, lowercase, no punctuation, no numbers, no abbreviations no info no info parliament speech public speech corpus public speakers various no info no info wav pcm 1 16000 16 no N/A no N/A no no N/A no no no no +Polish Sejm Senat speech corpus clarin-sejm_senat-18 free https://huggingface.co/datasets/jimregan/clarinpl_sejmsenat yes CC-BY PJATK DSpace CLARIN PL pl-PL 2018 no info no info no info https://acoustics.ippt.pan.pl/index.php/aa/article/view/327/pdf_32 Daniel Korzinek no info 2018 Polish Ministry of Higher Education free free train, test 97 97 no info 516 6,762 no no info no info no info no info no info no info no info no info no info parliament speech public speech corpus public speakers parliament no info no info wav pcm 1 no info no info no info no info no info no info no no N/A no no no no +Polish Speech Database ldc-polish-speech-db-19 paid https://catalog.ldc.upenn.edu/LDC2019S19 yes LDC LDC LDC pl-PL 2019 803-554-461-385-1 1-58563-903-6 LDC2019S19 no info Tomasz Szwelnik no info no info VoiceLab 3000 USD 3000 USD train, valid, test 280 280 no info 200 263,424 no 815000 no info yes yes transcription and 
annotation 100 no no info txt multi-domain read controlled paid contributors quiet headset no info flac flac 1 16000 16 yes 15-30 yes M51%-F49% yes no N/A no no no no +PolyAI MINDS-14 polyai-minds14-21 free http://poly-public-data.s3.amazonaws.com/MInDS-14/MInDS-14.zip%22 yes CC-BY PolyAI Hugging Face Data Catalog multi 2021 no info no info no info https://arxiv.org/abs/2104.08524 PolyAI no info 2022 PolyAI free free train,test 1 1 0.5 no info 578 no no info no info no info no info no info no info no info no info no info no info read controlled paid contributors no info various no info wav pcm 1 8000 16 no info no info no N/A no no N/A no no no no +PWR Atlas Zasobów Otwartej Nauki AZON nagrania kontrolowane pwr-azon-read-20 free https://zasobynauki.pl/zasoby/korpus-nagran-probek-mowy-do-celow-budowy-modeli-akustycznych-dla-automatycznego-rozpoznawania-mowy,53293/ yes CC-BY-SA WUST AZON pl-PL 2020 no info no info no info no info Teresa Sas no info 2022 Politechnika Wrocławska free free none 5 5 1.7 29 2,788 no no info no info no info no info no info 100 no info no info no info multi-domain read controlled volunteers quiet no info no info wav pcm 1 44100 16 no N/A yes M72%-F28% no no N/A no no no no +PWR Atlas Zasobów Otwartej Nauki AZON nagrania spontaniczne pwr-azon-spontaneous-20 free https://zasobynauki.pl/zasoby/korpus-nagran-probek-mowy-do-celow-budowy-modeli-akustycznych-dla-automatycznego-rozpoznawania-mowy,62687/ yes CC-BY-SA WUST AZON pl-PL 2020 no info no info no info no info Teresa Sas no info 2022 Politechnika Wrocławska free free none 2 2 0.65 27 456 no no info no info no info no info no info 100 no info no info no info multi-domain public speech corpus public speakers mixed no info no info wav pcm 1 44100 16 no N/A yes M72%-F28% no no N/A no no no no +PWR male set sample pwr-maleset-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska 
free free none 6 6 0.57 no info 4,738 no no info no info no info no info no info 100 no info no info no info no info read controlled no info no info studio mic various wav pcm 1 16000 16 no N/A no N/A no no N/A no no no no +PWR short words sample pwr-shortwords-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska free free none 1 1 0.35 no info 939 no no info no info no info no info no info 100 no info no info no info no info read controlled no info no info studio mic various wav pcm 1 44100 16 no N/A no N/A no no N/A no no no no +PWR Very Important Utterances pwr-viu-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska free free none 1 1 0.1 no info 2,703 no no info no info no info no info no info 100 no info no info no info no info read controlled no info no info studio mic various wav pcm 1 44100 16 no N/A no N/A no no N/A no no no no +Shaip Polish Mobile Speech Dataset shaip-mobile-speech-21 paid https://pl.shaip.com/offerings/speech-data-catalog/ yes Proprietary Shaip Shaip data catalog pl-PL 2021 no info no info no info no info Shaip no info 2021 Shaip no info no info no info 1482 1482 no info 2049 no info yes no info no info no info no info no info 100 no info no info no info no info read controlled paid contributors no info mobile phone no info wav pcm 1 48000 16 no info no info yes M35%-F65% no no N/A no no no no +Shaip Polish Speech Corpus shaip-media-corpus-21 paid https://pl.shaip.com/offerings/speech-data-catalog/ yes Proprietary Shaip Shaip data catalog pl-PL 2021 no info no info no info no info Shaip no info 2021 Shaip no info no info no info 269 269 no info 533 no info yes no info no info no info no info no info 100 no info no info no info multi-domain various corpus public speakers various various no info wav pcm 1 
16000 16 no info no info yes M66%-F33% no no N/A no no no no +Speecon Polish elra-speecon-pl-05 paid http://catalog.elra.info/en-us/repository/browse/ELRA-S0179/ yes ELRA ELRA ELRA multi 2005 697-702-806-588-8 no info ELRA-S0179 https://www.researchgate.net/publication/2494457_SPEECON_-_Speech_Databases_for_Consumer_Devices_Database_Specification_and_Validation/link/54b64e6b0cf28ebe92e7c713/download ELRA 1.0 2007 European Union 67 000 EUR 75 000 EUR train, valid, test 248 248 135 600 no info no no info no info no info yes Scope: - documentation - completeness of the database - file formats - signal quality - transcription quality - lexicon - speaker and environment distribution Processes: - prompt sheet and lexicon validation - first 10-speaker database validation 100 yes no info no info multi-domain read controlled no info office, public space, broadcast, car studio mic no info wav pcm 4 16000 16 yes yes yes yes yes no N/A no no no no +Vox populi fair-voxpopuli-pl-21 free https://github.com/facebookresearch/voxpopuli yes CC-0 FAIR Github multi 2021 no info no info no info https://arxiv.org/abs/2101.00390 Changhan Wang (changhan@fb.com), Morgane Rivière (mriviere@fb.com), Ann Lee (annl@fb.com) 2 2022 European Union free free train, valid, test 21200 111 no info 282 no info no 802000 no info yes yes transcription 0.52% no info no info no info parliament speech public speech corpus public speakers various studio mic no info ogg vorbis 1 16000 16 no N/A no M76%-F24% yes no N/A no no no no +Fleurs google-fleurs-22 free https://huggingface.co/datasets/google/fleurs yes CC-BY Google Hugging Face Data Catalog multi 2022 no info no info no info https://arxiv.org/pdf/2205.12446.pdf Alexis Conneau 2 2022 Google free free train, valid, test 12.1 12.1 2.1 no info 3937 no no info no info no yes recordings 100% no yes tsv wikipedia articles read controlled paid contributors various various no info wav pcm 1 16000 16 no N/A yes M70%-F30% yes no N/A no no no no +Spokes Biz 
pelcra-spokesbiz-23 free http://docs.pelcra.pl/doku.php?id=spokesbiz yes CC-BY-NC-ND UL PELCRA pl-PL 2023 no info no info no info http://arxiv.org/abs/2312.12364 Piotr Pęzik 1 2023 CLARIN-PL free no info no info 650 650 no info 590 925 yes 5911420 no info yes yes transcription, annotation 100% no info no info no info multi-domain conversational corpus Polish Speech Database various various no info wav pcm 1 16000 16 yes yes yes yes yes yes yes yes no no yes \ No newline at end of file diff --git a/snapshots/catalog/pl-asr-speech-datasets-catalog-20240218.tsv b/snapshots/catalog/pl-asr-speech-datasets-catalog-20240218.tsv new file mode 100644 index 0000000..01fae07 --- /dev/null +++ b/snapshots/catalog/pl-asr-speech-datasets-catalog-20240218.tsv @@ -0,0 +1,54 @@ +Dataset name Dataset ID Access type Access link Available online License Publisher Repository Languages Creation year ISLRN ISBN LR catalog ID Reference publication Contact point Latest version Last update year Sponsor Price - non-commercial usage Price - commercial usage Purpose and split Size audio total [hours] Size audio transcribed [hours] Size [GB] Speakers Audio recordings Audio segmentation Tokens Unique tokens Automatic QA Manual QA Manual QA scope Transcription coverage Transcription protocol Denormalized transcriptions Transcription and annotation format Domain Speech type Audio collection process Speech recordings source Acoustic environment Audio device Device model Audio format Audio codec Audio channels Sampling rate [Hz] Bits per sample Age info Age balance Gender info Gender balance Nativity info Accent info Accent representative Education info Occupation info Health info Time alignement annotation +Appen Global Phone Polish appen-gphone-02 paid https://appen.com/products/pre-labeled-datasets/ yes Proprietary Appen Appen Pre-Labelled Datasets pl-PL 2002 no info no info Appen ID: POL_ASR001 no info Appen no info no info European Union no info no info train, valid, test 25 25 2 99 10,130 no 
no info no info no info no info no info 100 no info no info no info general read controlled paid contributors, volunteers home, office studio mic Sennheiser 440-6 wav pcm 1 16000 16 yes 20-39 yes M53%-F47% yes no no no no no +Appen Mobile Speech appen-mobile-unk paid https://appen.com/products/pre-labeled-datasets/ yes Proprietary Appen Appen Pre-Labelled Datasets pl-PL no info no info no info Appen ID: POL_ASR002_CN no info Appen no info no info Appen no info no info no info 293 293 no info 353 106,674 no no info 168,544 no info yes no info 100 no info no info no info news read controlled paid contributors various mobile phone no info wav pcm 1 16000 16 no info no info no info no info no no no no no no +Appen SpeechDat Phone appen-speechdat-10 paid https://appen.com/products/pre-labeled-datasets/ yes Proprietary Appen Appen Pre-Labelled Datasets pl-PL 2010 no info no info Appen ID: Polish SpeechDat(E) Database https://www.phonetik.uni-muenchen.de/forschung/BITS/TP1/Cookbook/node1.html Appen no info no info European Union no info no info train, valid, test 78 78 no info 1000 48,000 no no info no info yes yes no info 100 no info no info no info multi-domain read controlled paid contributors quiet landline phone no info wav pcm 1 8000 16 no info no info no info no info no no no no no no +BABEL elra-babel-98 paid http://catalog.elra.info/en-us/repository/browse/ELRA-S0307/ yes ELRA ELRA ELRA pl-PL 1998 376-102-726-476-0 no info ELRA-S0307 no info Valerie Mapelli no info 2010 European Union 600 EUR 6000 EUR train, valid, test 16 16 no info 60 no info no no info no info no info no info no info no info no info no info no info numbers no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no no no no no no +Baza nagrań mowy AGH agh-corpus-may-15 no info no info no Proprietary AGH pl-PL 2015 no info no info no info https://link.springer.com/article/10.1007/s10579-015-9302-y Piotr Å»elasko no info no info AGH 
no info no info train, valid, test 25 25 no info 166 no info no 117 450 13 784 OpenSJP dictionary yes subset of unspecified size 100 no info no info MLF multi-domain read controlled no info no info various no info wav pcm 1 16000 16 no info 20-35 yes M66%-F-34% no no no no no no +Baza nagrań mowy AGH dla systemu SARMATA agh-corpus-sep-15 no info no info no Proprietary AGH pl-PL 2015 no info no info no info https://www.researchgate.net/publication/281774738_SARMATA_20_Automatic_Polish_Language_Speech_Recognition_System Piotr Å»elasko no info no info AGH no info no info train, valid, test 42 42 no info 391 no info no no info no info OpenSJP dictionary yes subset of unspecified size 100 no info no info MLF multi-domain read controlled no info no info various no info wav pcm 1 16000 16 no info 20-35 yes M55%-F45% no no no no no no +Clarin cyfry clarin-pjatk-cyfry-16 free https://clarin-pl.eu/dspace/bitstream/handle/11321/317/cyfry.zip?sequence=1&isAllowed=y yes CC-BY-SA PJATK DSpace CLARIN PL pl-PL 2016 no info no info http://hdl.handle.net/11321/317 no info Daniel Korzinek no info 2021 Polish Ministry of Science and Higher Education (N N104 205039) free free train, valid, test 1 1 0.12 25 488 no no info no info no info no info no info 100 no info no info no info digits read controlled no info quiet no info no info raw pcm 1 16000 16 no info no no no no no no no +Clarin mobile clarin-pjatk-mobile-15 free https://clarin-pl.eu/DSpace/handle/11321/237 yes CC-BY PJATK DSpace CLARIN PL, VLO pl-PL 2015 no info no info no info no info Daniel Korzinek no info 2015 CLARIN-PL free free train, valid, test 13 13 1.5 to check 3,552 no no info no info no info no info no info 0 no info no info no info no info read controlled volunteers quiet landline phone no info wav pcm 1 16000 16 no info no no no no no no no +Clarin studio clarin-pjatk-studio-15 free https://clarin-pl.eu/DSpace/handle/11321/236 yes CC-BY PJATK DSpace CLARIN PL, VLO pl-PL 2015 no info no info no info 
https://www.clarin.eu/sites/default/files/02%20-%20KORZINEK-Polish.pdf Daniel Korzinek no info 2015 CLARIN-PL free free train, valid, test 56 56 4 to check no info no no info no info no info no info no info 100 no info no info no info no info read controlled volunteers quiet studio mic no info wav pcm 1 16000 16 no info no no no no no no no +Corpora put-corpora-97 no info no no info PUT pl-PL 1997 no info no info no info https://www.isca-speech.org/archive_v0/archive_papers/eurospeech_1997/e97_1735.pdf Stefan Grocholewski no info no info no info no info no info no info 6 6 45 365 no no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no no no no no no +CSLU: 22 Languages Corpus ldc-clsu-pl-05 paid https://catalog.ldc.upenn.edu/LDC2005S26 yes LDC LDC LDC multi 2005 no info 1-58563-356-9 LDC2005S26 no info T. Lander or Linguistic Data Consortium no info no info no info 150 USD 150 USD no info 4 4 no info no info 2,500 no no info no info no info yes Check if prompt instructions were followed. Manual transcription of 30% of data. 
30 no info no info no info no info read controlled no info no info landline phone no info RIFF uLaw 1 8000 8 no info no info no info no info no no no no no no +DiaBiz clarin-diabiz-22 paid http://docs.pelcra.pl/doku.php?id=diabiz yes Proprietary UL DSpace CLARIN PL pl-PL 2022 no info no info no info https://aclanthology.org/2022.lrec-1.76/ Piotr Pęzik no info no info European Union, Poland, Clarin-Biz no info no info train, valid, test 410 410 no info 196 3,764 no 447576 no info no info yes transcription 100 no info no info json, xml, txt customer service conversational controlled paid contributors quiet landline phone no info wav pcm 2 8000 16 no info no info yes no info no no no no no yes +Diabiz eval clarin-diabiz-eval-22 paid http://docs.pelcra.pl/doku.php?id=diabiz yes Proprietary UL DSpace CLARIN PL pl-PL 2022 no info no info no info no info Piotr Pęzik no info no info European Union, Poland, Clarin-Biz no info no info test 41 41 no info 146 no info no no info no info no info yes transcription 100 no info no info json, xml, txt customer service conversational controlled paid contributors quiet landline phone no info wav pcm 2 8000 16 no info no info no info no info yes no no no no yes +DiaBiz sample clarin-diabiz-sample-22 no info http://docs.pelcra.pl/doku.php?id=diabiz yes Proprietary UL DSpace CLARIN PL pl-PL 2022 no info no info no info no info Piotr Pęzik no info no info European Union, Poland, Clarin-Biz no info no info none 1 1 no info no info 18 no no info no info no info yes transcription 100 no info no info json, xml, txt customer service conversational controlled paid contributors quiet landline phone no info wav pcm 2 8000 16 no info no info no info no info yes no no no no yes +EASR Corpora of European Portuguese, French, Hungarian and Polish Elderly Speech elra-easr-14 no info no info no no info ELRA ELRA multi 2014 no info no info no info http://www.lrec-conf.org/proceedings/lrec2014/pdf/365_Paper.pdf Artur Kolesiński no info no info European 
Union no info no info train, valid, test 205 205 no info 781 no info no no info no info no info yes transcription 100 yes no info no info multi-domain read controlled paid contributors quiet headset no info wav pcm 1 16000 16 no info yes, but elderly group only (over 60) yes yes yes 100 no no no no no +ELRA Global Phone Polish elra-gphone-elra-02 paid http://catalog.elra.info/en-us/repository/browse/ELRA-S0320/ yes ELRA ELRA ELRA multi 2002 350-930-795-617-4 no info ELRA-S0320 no info Valerie Mapelli no info no info European Union 700 EUR 3700 train, valid, test 25 25 2 99 10,130 no no info no info no info no info no info 100 no info no info no info general read controlled paid contributors, volunteers home, office studio mic Sennheiser 440-6 wav pcm 1 16000 16 yes 20-39 yes M53%-F47% no no no no no no +EU Parliament clarin-pjatk-pinc-21 free https://clarin-pl.eu/DSpace/handle/11321/821 yes CC-BY PJATK DSpace CLARIN PL pl-PL 2021 no info no info http://hdl.handle.net/11321/821 no info Daniel Korzinek no info 2021 Polish Ministry of Science and Higher Education (N N104 205039) free free none 32 32 3.7 no info 1,040 yes no info no info no info no info no info 100 no info no info no info multi-domain public speech corpus public speakers various various no info wav pcm 1 16000 16 no no no no no no no yes +Exmeralda hzsk-exmeralda-pl-07 free https://corpora.uni-hamburg.de/hzsk/de/islandora/object/spoken-corpus:demo-1.0#corpus-metadata no HZSK-PUB HZSK HZSK multi 2007 no info no info http://hdl.handle.net/11022/0000-0000-4F70-A no info HZSK no info 2009 no info free not available none no info no info no info 5 no info no no info no info no info no info no info 100 no info no info no info broadcast conversational corpus paid contributors broadcast lavalier mic no info wav pcm 1 44100 16 no info no info yes M50%-F50% no no no no no no +Gewiss hzsk-gewiss-pl-12 free https://gewiss.uni-leipzig.de/index.php?id=about_gewiss&L=1 no HZSK-PUB HZSK HZSK multi 2012 no info no info 
no info no info no info no info no info no info free free none 20 20 no info 10 no info no no info no info no info yes no info 100 GAT2 no info exmeralda academic lecture public speech corpus volunteers queit no info no info no info no info no info no info no info no info no info no info no info no no no no no no +Hamburg bilingual hzsk-hamcopolig-11 free https://corpora.uni-hamburg.de/hzsk/de/islandora/object/spoken-corpus:hamcopolig yes HZSK-PUB HZSK CLARIN EU, VLO multi 2011 no info no info http://hdl.handle.net/11022/0000-0000-6969-5 https://benjamins.com/catalog/hsm.14.10cza Agnieszka Czachor 0.2 2012 no info free no info no info no info no info no info no info no info no no info no info no info no info no info no info no info no info no info multi-domain conversational controlled volunteers no info no info no info no info no info no info no info no info yes yes no info no info yes yes no no no no no +Jurisdic uam-jurisdic-08 no info no info no no info AMU pl-PL 2008 no info no info no info http://www.lrec-conf.org/proceedings/lrec2008/pdf/326_paper.pdf Grażyna Demenko no info no info Poland no info no info train, valid, test 855 855 no info 1000 494,933 no no info no info yes yes transcription, annotation 100 yes no info no info multi-domain various various government agents quiet no info Sennheiser ME-3, AKG C-1000S, Sennheiser ew300G2 wav pcm 2 16000 16 no info no info yes no info yes yes no no no yes no +Korpus mowy szeptanej Politechniki Poznańskiej put-whisper-16 no info no info no no info PUT pl-PL 2016 no info no info no info https://yadda.icm.edu.pl/baztech/element/bwmeta1.element.baztech-19695dfb-03d8-401f-bb88-c90abebe2bf5 Piotr Kozierski no info no info Polish Ministry of Higher Education no info no info no info 9 9 no info no info no info no no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info 
no no no no no +Korpus radiowy clarin-radio-21 free https://clarin-pl.eu/DSpace/handle/11321/820 yes CC-BY PJATK DSpace CLARIN PL pl-PL 2021 no info no info no info no info Łukasz Brocki no info 2021 Polish Ministry of Higher Education free free none 7 0 0.75 200 192 no no info no info no info no info no info 0 no info no info public speech corpus public speakers mixed various no info raw pcm 1 16000 16 no no no no no no no no +LUNA pjatk-luna-07 no info http://nlp.ipipan.waw.pl/NLP-SEMINAR/070423.pdf no no info PJATK pl-PL 2007 no info no info no info no info no info no info no info European Union no info no info no info 11 11 no info 500 500 no no info no info no info yes annotation 100 yes no info no info public transport conversational controlled volunteers no info landline phone no info no info no info no info no info no info no info no info no info no info no info no no no no no +M-AILABS speech dataset mailabs-19 free https://data.solak.de/data/Training/stt_tts/pl_PL.tgz yes Proprietary M-AILABS Coqui Free Corpora Catalog multi 2019 no info no info no info no info Imdat Solak no info 2019 M-AILABS free free no info 54 54 4.2 no info no info no no info no info no info no info no info 100 yes no info yes multi-domain read corpus volunteers quiet various various wav pcm 1 16000 16 no yes no info no no no no no no +Mozilla Common Voice mozilla-comm-voice-20 free commonvoice.mozilla.org/ yes CC-0 Mozilla Foundation Common Voice multi 2020 no info no info no info https://arxiv.org/pdf/1912.06670.pdf Mozilla Org 9.0 2022 Mozilla Foundation free free train, valid, test 148 148 4 3062 no info no no info no info no info yes up to 3 binary validations of each recording (2 if first 2 validations are consistent) 100 no info no info no info multi-domain read controlled crowd various various multiple mp3 mpeg-3 1 48000 16 yes yes yes M60%-F14% no no no no no no +Multilingual librispeech fair-mls-20 free https://www.openslr.org/94/ yes CC-BY FAIR Github multi 2020 no info 
no info SLR94 https://arxiv.org/abs/2012.03411 Facebook (Meta) 2 2020 Facebook free free train, valid, test 137 137 6.2 16 28,860 no 492320 67100 yes yes manually transcribed and annotated dev, test subsets 100 yes no info no info books read corpus volunteers various various no info flac opus 1 16000 16 no yes no info no no no no no no +Pelcra EMI pelcra-emi-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-GB 2018 no info no info PELCRA_EMI http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free valid 18 9 12 44 22 no 96000 no info no info no info no info 100 no info no info JSON, EAF, ELAN, EMU, TextGrid interview conversational controlled volunteers no info no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra EMO pelcra-emo-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info PELCRA_EMO http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free train, valid, test 28 26 2.2 80 40 no 252000 no info no info no info no info 100 no info no info JSON, EAF, ELAN, EMU, TextGrid interview conversational controlled volunteers no info no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra learner English corpus pelcra-plec-11 free http://pelcra.pl/plec/downloads_fs yes no info UL PELCRA en-PL 2011 no info no info no info “Towards the PELCRA Learner English Corpus.” In Corpus Data across Languages and Disciplines, edited by Piotr Pęzik, 28:33–42. Łódź Studies in Language. Peter Lang, 2012. 
Piotr Pęzik no info 2012 Polish Ministry of Science and Higher Education (N N104 205039) free no info none no info no info no info no info no info no no info no info yes yes annotation 100 no no info xls general read controlled volunteers quiet no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra LUZ pelcra-luz-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info PELCRA_LUZ http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 20 20 13.2 42 21 no 213000 no info no info no info no info 100 no no info JSON, EAF, ELAN, EMU, TextGrid interview conversational controlled volunteers quiet no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra Mowa Miasta Kraków pelcra-mmk-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info MMK http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 2 2 no info 11 4 no 15900 no info no info no info no info no info no no info no info conversations conversational corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra Mowa Miasta Wrocław 1 pelcra-mmw-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info MMW_1 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 8 7 no info 65 14 no 60000 no info no info no info no info no info no no info no info conversations conversational corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra Mowa Miasta Wrocław 2 pelcra-mmw2-18 free 
http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info MMW_2 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 7 7 no info 38 14 no 70000 no info no info no info no info no info no no info no info conversations conversational corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no no no no no no +Pelcra PARL pelcra-parl-15 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info PELCRA_PARL http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 14 12 no info 251 48 no 99000 no info no info no info no info no info no no info no info parliament speech public speech corpus public speakers mixed no info no info no info no info no info no info no info no info no info no info no info no no no no no no +Pelcra Spelling and NUmbers Voice database pelcra-snuv-12 free http://metashare.elda.org/repository/browse/spelling-and-numbers-voice-database/f9e499c663f111e2bff4525400d761477c36ad442d124e6892bb3c8ce1a1ecdf/ yes CC-BY UL PELCRA pl-PL 2012 no info no info no info no info Piotr Pęzik no info 2012 European Union, Poland free free no info 220 220 no info 210 99,517 no 704625 no info yes yes no info 100 no no info no info numbers read controlled paid contributors various headset no info WAV pcm 1 22050 16 no info no info no info no info yes no no no no no +Pelcra YouTube 1 pelcra-yt1-20 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2020 no info no info PELCRA_YT1 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 7 5 no info 106 25 no 49000 no info no info no info no info no info no no info no info multi-domain various corpus public speakers various no info no info no info no info no info no info no info no info no 
info no info no info no no no no no no +Pelcra YouTube 2 pelcra-yt2-20 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2020 no info no info PELCRA_YT2 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 6 5 no info 45 23 no 49000 no info no info no info no info no info no no info no info multi-domain various corpus public speakers mixed no info no info no info no info no info no info no info no info no info no info no info no no no no no no +PolEval 2019 pjatk-poleval-19 free http://2019.poleval.pl/index.php/tasks/task5 yes no info PJATK pl-PL 2019 no info no info no info http://2019.poleval.pl/files/2019/11.pdf Daniel Korzinek no info 2019 PJATK free free test 1 1 0.8 no info 29 no no info no info no info no info no info 0 UTF-8, lowercase, no punctuation, no numbers, no abbreviations no info no info parliament speech public speech corpus public speakers various no info no info wav pcm 1 16000 16 no no no no no no no no +Polish Sejm Senat speech corpus clarin-sejm_senat-18 free https://huggingface.co/datasets/jimregan/clarinpl_sejmsenat yes CC-BY PJATK DSpace CLARIN PL pl-PL 2018 no info no info no info https://acoustics.ippt.pan.pl/index.php/aa/article/view/327/pdf_32 Daniel Korzinek no info 2018 Polish Ministry of Higher Education free free train, test 97 97 no info 516 6,762 no no info no info no info no info no info no info no info no info no info parliament speech public speech corpus public speakers parliament no info no info wav pcm 1 no info no info no info no info no info no info no no no no no no +Polish Speech Database ldc-polish-speech-db-19 paid https://catalog.ldc.upenn.edu/LDC2019S19 yes LDC LDC LDC pl-PL 2019 803-554-461-385-1 1-58563-903-6 LDC2019S19 no info Tomasz Szwelnik no info no info VoiceLab 3000 USD 3000 USD train, valid, test 280 280 no info 200 263,424 no 815000 no info yes yes transcription and annotation 100 no no info txt multi-domain read 
controlled paid contributors quiet headset no info flac flac 1 16000 16 yes 15-30 yes M51%-F49% yes no no no no no +PolyAI MINDS-14 polyai-minds14-21 free http://poly-public-data.s3.amazonaws.com/MInDS-14/MInDS-14.zip%22 yes CC-BY PolyAI Hugging Face Data Catalog multi 2021 no info no info no info https://arxiv.org/abs/2104.08524 PolyAI no info 2022 PolyAI free free train,test 1 1 0.5 no info 578 no no info no info no info no info no info no info no info no info no info no info read controlled paid contributors no info various no info wav pcm 1 8000 16 no info no info no no no no no no no +PWR Atlas Zasobów Otwartej Nauki AZON nagrania kontrolowane pwr-azon-read-20 free https://zasobynauki.pl/zasoby/korpus-nagran-probek-mowy-do-celow-budowy-modeli-akustycznych-dla-automatycznego-rozpoznawania-mowy,53293/ yes CC-BY-SA WUST AZON pl-PL 2020 no info no info no info no info Teresa Sas no info 2022 Politechnika Wrocławska free free none 5 5 1.7 29 2,788 no no info no info no info no info no info 100 no info no info no info multi-domain read controlled volunteers quiet no info no info wav pcm 1 44100 16 no yes M72%-F28% no no no no no no +PWR Atlas Zasobów Otwartej Nauki AZON nagrania spontaniczne pwr-azon-spontaneous-20 free https://zasobynauki.pl/zasoby/korpus-nagran-probek-mowy-do-celow-budowy-modeli-akustycznych-dla-automatycznego-rozpoznawania-mowy,62687/ yes CC-BY-SA WUST AZON pl-PL 2020 no info no info no info no info Teresa Sas no info 2022 Politechnika Wrocławska free free none 2 2 0.65 27 456 no no info no info no info no info no info 100 no info no info no info multi-domain public speech corpus public speakers mixed no info no info wav pcm 1 44100 16 no yes M72%-F28% no no no no no no +PWR male set sample pwr-maleset-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska free free none 6 6 0.57 no info 4,738 no no info no info no info no info no 
info 100 no info no info no info no info read controlled no info no info studio mic various wav pcm 1 16000 16 no no no no no no no no +PWR short words sample pwr-shortwords-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska free free none 1 1 0.35 no info 939 no no info no info no info no info no info 100 no info no info no info no info read controlled no info no info studio mic various wav pcm 1 44100 16 no no no no no no no no +PWR Very Important Utterances pwr-viu-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska free free none 1 1 0.1 no info 2,703 no no info no info no info no info no info 100 no info no info no info no info read controlled no info no info studio mic various wav pcm 1 44100 16 no no no no no no no no +Shaip Polish Mobile Speech Dataset shaip-mobile-speech-21 paid https://pl.shaip.com/offerings/speech-data-catalog/ yes Proprietary Shaip Shaip data catalog pl-PL 2021 no info no info no info no info Shaip no info 2021 Shaip no info no info no info 1482 1482 no info 2049 no info yes no info no info no info no info no info 100 no info no info no info no info read controlled paid contributors no info mobile phone no info wav pcm 1 48000 16 no info no info yes M35%-F65% no no no no no no +Shaip Polish Speech Corpus shaip-media-corpus-21 paid https://pl.shaip.com/offerings/speech-data-catalog/ yes Proprietary Shaip Shaip data catalog pl-PL 2021 no info no info no info no info Shaip no info 2021 Shaip no info no info no info 269 269 no info 533 no info yes no info no info no info no info no info 100 no info no info no info multi-domain various corpus public speakers various various no info wav pcm 1 16000 16 no info no info yes M66%-F33% no no no no no no +Speecon Polish elra-speecon-pl-05 paid 
http://catalog.elra.info/en-us/repository/browse/ELRA-S0179/ yes ELRA ELRA ELRA multi 2005 697-702-806-588-8 no info ELRA-S0179 https://www.researchgate.net/publication/2494457_SPEECON_-_Speech_Databases_for_Consumer_Devices_Database_Specification_and_Validation/link/54b64e6b0cf28ebe92e7c713/download ELRA 1.0 2007 European Union 67 000 EUR 75 000 EUR train, valid, test 248 248 135 600 no info no no info no info no info yes Scope: - documentation - completeness of the database - file formats - signal quality - transcription quality - lexicon - speaker and environment distribution Processes: - prompt sheet and lexicon validation - first 10-speaker database validation 100 yes no info no info multi-domain read controlled no info office, public space, broadcast, car studio mic no info wav pcm 4 16000 16 yes yes yes yes yes no no no no no +Vox populi fair-voxpopuli-pl-21 free https://github.com/facebookresearch/voxpopuli yes CC-0 FAIR Github multi 2021 no info no info no info https://arxiv.org/abs/2101.00390 Changhan Wang (changhan@fb.com), Morgane Rivière (mriviere@fb.com), Ann Lee (annl@fb.com) 2 2022 European Union free free train, valid, test 21200 111 no info 282 no info no 802000 no info yes yes transcription 0.52% no info no info no info parliament speech public speech corpus public speakers various studio mic no info ogg vorbis 1 16000 16 no no M76%-F24% yes no no no no no +Fleurs google-fleurs-22 free https://huggingface.co/datasets/google/fleurs yes CC-BY Google Hugging Face Data Catalog multi 2022 no info no info no info https://arxiv.org/pdf/2205.12446.pdf Alexis Conneau 2 2022 Google free free train, valid, test 12.1 12.1 2.1 no info 3937 no no info no info no yes recordings 100% no yes tsv wikipedia articles read controlled paid contributors various various no info wav pcm 1 16000 16 no yes M70%-F30% yes no no no no no +Spokes Biz pelcra-spokesbiz-23 free http://docs.pelcra.pl/doku.php?id=spokesbiz yes CC-BY-NC-ND UL PELCRA pl-PL 2023 no info no info no 
info http://arxiv.org/abs/2312.12364 Piotr Pęzik 1 2023 CLARIN-PL free no info no info 650 650 no info 590 925 yes 5911420 no info yes yes transcription, annotation 100% no info no info no info multi-domain conversational corpus Polish Speech Database various various no info wav pcm 1 16000 16 yes yes yes yes yes yes yes yes no no yes diff --git a/snapshots/pl-asr-speech-datasets-catalog-latest.tsv b/snapshots/pl-asr-speech-datasets-catalog-latest.tsv index b60b0d6..01fae07 100644 --- a/snapshots/pl-asr-speech-datasets-catalog-latest.tsv +++ b/snapshots/pl-asr-speech-datasets-catalog-latest.tsv @@ -1,52 +1,54 @@ -Dataset name Codename Access type Access link Available online License Publisher Repository Languages Creation year ISLRN ISBN LR catalogue ID Reference publication Contact point Latest version Last update year Sponsor Price - non commercial usage Price - commercial usage Purpose and split Size audio total [hours] Size audio transcribed [hours] Size [GB] Speakers Audio recordings Audio segmentation Tokens Unique tokens Automatic QA Manual QA Manual QA scope Transcription coverage Transcription protocol Denormalized transcriptions Transcription and annotation format Domain Speech type Audio collection process Speech recordings providers Acoustic environment Audio device Device model Audio format Audio codec Audio channels Sampling rate [Hz] Bits per sample Age info Age balance Gender info Gender balance Nativity info Accent info Accent representative Time alignement annot Named entities annot Part of speech annot Occupation coverage -CORPORA agh-corpora-93 no info NA no no info AGH None pl-PL 1993 no info no info no info no info Stefan Grocholewski no info no info no info no info no info no info 2 2 NA 45 365 no no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info 
no info -BABEL elra-babel-98 paid http://catalog.elra.info/en-us/repository/browse/ELRA-S0307/ yes ELRA ELRA ELRA pl-PL 1998 376-102-726-476-0 no info ELRA-S0307 no info Valerie Mapelli no info 2010 European Union 600 EUR 6000 EUR train, valid, test 16 16 no info 60 no info no no info no info no info no info no info no info no info no info no info numbers no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -Appen Global Phone Polish appen-gphone-02 paid https://appen.com/products/pre-labeled-datasets/ yes Proprietary Appen Appen Pre-Labelled Datasets pl-PL 2002 no info no info Appen ID: POL_ASR001 no info Appen no info no info European Union no info no info train, valid, test 25 25 2 99 10,130 no no info no info no info no info no info 100 no info no info no info common read controlled paid contributors, volunteers home, office studio mic Sennheiser 440-6 wav pcm 1 16000 16 yes 20-39 yes M53%-F47% yes no info no info no no no no -ELRA Global Phone Polish elra-gphone-elra-02 paid http://catalog.elra.info/en-us/repository/browse/ELRA-S0320/ yes ELRA ELRA ELRA multi 2002 350-930-795-617-4 no info ELRA-S0320 no info Valerie Mapelli no info no info European Union 700 EUR 3700 train, valid, test 25 25 2 99 10,130 no no info no info no info no info no info 100 no info no info no info common read controlled paid contributors, volunteers home, office studio mic Sennheiser 440-6 wav pcm 1 16000 16 yes 20-39 yes M53%-F47% s no info no info no no no no -Speecon Polish elra-speecon-pl-05 paid http://catalog.elra.info/en-us/repository/browse/ELRA-S0179/ yes ELRA ELRA ELRA multi 2005 697-702-806-588-8 no info ELRA-S0179 https://www.researchgate.net/publication/2494457_SPEECON_-_Speech_Databases_for_Consumer_Devices_Database_Specification_and_Validation/link/54b64e6b0cf28ebe92e7c713/download ELRA 1.0 2007 European Union 67 000 EUR 75 000 EUR train, valid, 
test 248 248 135 600 no info no no info no info no info yes Scope: - documentation - completeness of the database - file formats - signal quality - transcription quality - lexicon - speaker and environment distribution Processes: - prompt sheet and lexicon validation - first 10-speaker database validation 100 yes no info no info multi read controlled no info office, public places, entertainment, car, children studio mic no info wav pcm 4 16000 16 yes yes yes yes yes no info no info no info no info no info no info -CSLU: 22 Languages Corpus ldc-clsu-pl-05 paid https://catalog.ldc.upenn.edu/LDC2005S26 yes LDC LDC LDC multi 2005 no info 1-58563-356-9 LDC2005S26 no info T. Lander or Linguistic Data Consortium no info no info no info 150 USD 150 USD no info 4 4 no info no info 2,500 no no info no info no info yes Check if prompt instructions were followed. Manual transcription of 30% of data. 30 no info no info no info no info read controlled no info no info landline phone no info RIFF uLaw 1 8000 8 no info no info no info no info no info no info no info no info no info no info no info -Exmeralda hzsk-exmeralda-pl-07 free https://corpora.uni-hamburg.de/hzsk/de/islandora/object/spoken-corpus:demo-1.0#corpus-metadata no HZSK-PUB HZSK HZSK multi 2007 no info no info http://hdl.handle.net/11022/0000-0000-4F70-A no info HZSK no info 2009 no info free not available none no info no info no info 5 no info no no info no info no info no info no info 100 no info no info no info tv show conversation corpus paid contributors tv show lavalier mic no info wav pcm 1 44100 16 no info no info yes M50%-F50% no no no no no no no -LUNA pjatk-luna-07 no info http://nlp.ipipan.waw.pl/NLP-SEMINAR/070423.pdf no info no info PJATK None pl-PL 2007 no info no info no info no info no info no info no info European Union no info no info no info 11 11 no info 500 500 no no info no info no info yes annotation 100 yes no info no info public transport dialogs controlled volunteers no info phone no info 
no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -Jurisdic uam-jurisdic-08 no info no info no no info AMU None pl-PL 2008 no info no info no info http://www.lrec-conf.org/proceedings/lrec2008/pdf/326_paper.pdf Grażyna Demenko no info no info Poland no info no info train, valid, test 855 855 no info 1000 494,933 no no info no info yes yes transcription, annotation 100 yes no info no info multi various (read, spontenous) various government agents quiet no info Sennheiser ME-3, AKG C-1000S, Sennheiser ew300G2 wav pcm 2 16000 16 no info no info yes no info yes yes no info no info NA NA no info -Appen SpeechDat Phone appen-speechdat-10 paid https://appen.com/products/pre-labeled-datasets/ yes Proprietary Appen Appen Pre-Labelled Datasets pl-PL 2010 no info no info Appen ID: Polish SpeechDat(E) Database https://www.phonetik.uni-muenchen.de/forschung/BITS/TP1/Cookbook/node1.html Appen no info no info European Union no info no info train, valid, test 78 78 no info 1000 48,000 no no info no info yes yes no info 100 no info no info no info multi read controlled paid contributors quiet landline phone no info wav pcm 1 8000 16 no info no info no info no info no info no info no info no info no info no info no info -Hamburg bilingual hzsk-hamcopolig-11 free https://corpora.uni-hamburg.de/hzsk/de/islandora/object/spoken-corpus:hamcopolig yes HZSK-PUB HZSK CLARIN EU, VLO multi 2011 no info no info http://hdl.handle.net/11022/0000-0000-6969-5 https://benjamins.com/catalog/hsm.14.10cza Agnieszka Czachor 0.2 2012 no info free not available no info no info no info no info no info no info no no info no info no info no info no info no info no info no info no info multiple conversational, spontaneous controlled volunteers no info no info no info no info no info no info no info no info yes yes no info no info yes yes no info no no no no -Pelcra learner English corpus pelcra-plec-11 free 
http://pelcra.pl/plec/downloads_fs yes no info PELCRA PELCRA en-PL 2011 no info no info no info “Towards the PELCRA Learner English Corpus.” In Corpus Data across Languages and Disciplines, edited by Piotr Pęzik, 28:33–42. Łódź Studies in Language. Peter Lang, 2012. Piotr Pęzik no info 2012 Polish Ministry of Science and Higher Education (N N104 205039) free no info none no info no info no info no info no info no no info no info yes yes annotation 100 no no info xls general read controlled volunteers quiet no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -Pelcra Spelling and NUmbers Voice database pelcra-snuv-12 free http://metashare.elda.org/repository/browse/spelling-and-numbers-voice-database/f9e499c663f111e2bff4525400d761477c36ad442d124e6892bb3c8ce1a1ecdf/ yes CC-BY PELCRA PELCRA pl-PL 2012 no info no info no info no info Piotr Pęzik no info 2012 European Union, Poland free free no info 220 220 no info 210 no info no 704625 no info yes yes no info 100 no no info no info numbers, spelling read controlled crowd various headset no info WAV pcm 1 22050 16 no info no info no info no info yes no info no info no info no info no info no info -Gewiss hzsk-gewiss-pl-12 free https://gewiss.uni-leipzig.de/index.php?id=about_gewiss&L=1 no info HZSK-PUB UL HZSK multi 2012 no info no info no info no info no info no info no info no info free free none 20 20 no info 10 no info no no info no info no info yes no info 100 GAT2 no info exmeralda academic speech spoken controlled volunteers queit no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -EASR Corpora of European Portuguese, French, Hungarian and Polish Elderly Speech elra-easr-14 no info no info no no info ELRA ELRA multi 2014 no info no info no info http://www.lrec-conf.org/proceedings/lrec2014/pdf/365_Paper.pdf Artur Kolesiński no info 
no info European Union no info no info train, valid, test 205 205 no info 781 no info no no info no info no info yes transcription 100 yes no info no info multiple read controlled paid contributors quiet headset no info wav pcm 1 16000 16 no info yes, but elderly group only (over 60) yes yes yes 100 yes no info no info no info no info -Clarin studio clarin-pjatk-studio-15 free https://clarin-pl.eu/DSpace/handle/11321/236 yes CC-BY PJATK DSpace CLARIN PL, VLO pl-PL 2015 no info no info no info https://www.clarin.eu/sites/default/files/02%20-%20KORZINEK-Polish.pdf Daniel Korzinek no info 2015 CLARIN-PL free free train, valid, test 56 56 4 to check no info no no info no info no info no info no info 100 no info no info no info no info read recordings volunteers quiet studio mic no info wav pcm 1 16000 16 no info N/A no N/A no no no no no no no -Clarin mobile clarin-pjatk-mobile-15 free https://clarin-pl.eu/DSpace/handle/11321/237 yes CC-BY PJATK DSpace CLARIN PL, VLO pl-PL 2015 no info no info no info no info Daniel Korzinek no info 2015 CLARIN-PL free free train, valid, test 13 13 1.5 to check 3,552 no no info no info no info no info no info 0 no info no info no info no info lecture recordings volunteers quiet landline phone no info wav pcm 1 16000 16 no info N/A no N/A no no no no no no no -Baza nagrań mowy AGH agh-corpus-may-15 no info no info no info Proprietary AGH None pl-PL 2015 no info no info no info https://link.springer.com/article/10.1007/s10579-015-9302-y Piotr Żelasko no info no info AGH no info no info train, valid, test 25 25 no info 166 no info no 117 450 13 784 OpenSJP dictionary yes subset of unspecified size 100 no info no info MLF multi-domain various controlled no info no info various no info wav pcm 1 16000 16 no info 20-35 yes M66%-F-34% no info no info no info no info no info no info no info -Baza nagrań mowy AGH dla systemu SARMATA agh-corpus-sep-15 no info no info no info Proprietary AGH None pl-PL 2015 no info no info no info 
https://www.researchgate.net/publication/281774738_SARMATA_20_Automatic_Polish_Language_Speech_Recognition_System Piotr Żelasko no info no info AGH no info no info train, valid, test 42 42 no info 391 no info no no info no info OpenSJP dictionary yes subset of unspecified size 100 no info no info MLF multi-domain various controlled no info no info various no info wav pcm 1 16000 16 no info 20-35 yes M55%-F45% no info no info no info no info no info no info no info -Clarin cyfry clarin-pjatk-cyfry-16 free https://clarin-pl.eu/DSpace/handle/11321/317 yes CC-BY-SA PJATK DSpace CLARIN PL pl-PL 2016 no info no info http://hdl.handle.net/11321/317 no info Daniel Korzinek no info 2021 Polish Ministry of Science and Higher Education (N N104 205039) free free train, valid, test 1 1 0.12 25 488 no no info no info no info no info no info 100 no info no info no info digits read controlled no info quiet no info no info raw pcm 1 16000 16 no info N/A no N/A no no no no no no no -Korpus mowy szeptanej Politechniki Poznańskiej put-whisper-16 no info no info no info no info PUT None pl-PL 2016 no info no info no info https://yadda.icm.edu.pl/baztech/element/bwmeta1.element.baztech-19695dfb-03d8-401f-bb88-c90abebe2bf5 Piotr Kozierski no info no info Polish Ministry of Higher Education no info no info no info 9 9 no info no info no info no no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -Pelcra EMI pelcra-emi-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY PELCRA PELCRA pl-GB 2018 no info no info PELCRA_EMI http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free valid 18 9 12 44 22 no 96000 no info no info no info no info 100 no info no info JSON, EAF, ELAN, EMU, TextGrid interviews with polish emmigrants 
in Scotlant spontenous controlled volunteers no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -Polish Sejm Senat speech corpus clarin-sejm_senat-18 free https://huggingface.co/datasets/jimregan/clarinpl_sejmsenat yes CC-BY PJATK DSpace CLARIN PL pl-PL 2018 no info no info no info https://acoustics.ippt.pan.pl/index.php/aa/article/view/327/pdf_32 Daniel Korzinek no info 2018 Polish Ministry of Higher Education free free train, test 97 97 no info 516 6,762 no no info no info no info no info no info no info no info no info no info parliamentry speech lecture corpus public speakers parliament no info no info wav pcm 1 no info no info no info no info no info no info no info no info no info no info no info no info no info -Pelcra EMO pelcra-emo-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY PELCRA PELCRA pl-PL 2018 no info no info PELCRA_EMO http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free train, valid, test 28 26 2.2 80 40 no 252000 no info no info no info no info 100 no info no info JSON, EAF, ELAN, EMU, TextGrid interiews focused on reflecting on emotions spontenous controlled volunteers no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -Pelcra LUZ pelcra-luz-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY PELCRA PELCRA pl-PL 2018 no info no info PELCRA_LUZ http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 20 20 13.2 42 21 no 213000 no info no info no info no info 100 no no info JSON, EAF, ELAN, EMU, TextGrid open interviews spontenous controlled volunteers quiet no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no 
info -Pelcra Mowa Miasta Kraków pelcra-mmk-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY PELCRA PELCRA pl-PL 2018 no info no info MMK http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 2 2 no info 11 4 no 15900 no info no info no info no info no info no no info no info conversations spontaneous corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -Pelcra Mowa Miasta Wrocław 1 pelcra-mmw-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY PELCRA PELCRA pl-PL 2018 no info no info MMW_1 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 8 7 no info 65 14 no 60000 no info no info no info no info no info no no info no info conversations spontaneous corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -Pelcra Mowa Miasta Wrocłąw 2 pelcra-mmw2-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY PELCRA PELCRA pl-PL 2018 no info no info MMW_2 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 7 7 no info 38 14 no 70000 no info no info no info no info no info no no info no info parliamentary speeches lecture corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -Pelcra PARL pelcra-parl-15 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY PELCRA PELCRA pl-PL 2018 no info no info PELCRA_PARL http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 14 12 no info 251 48 no 99000 no info no info no info no info no info no no info no info 
conversations spontaneous corpus public speakers mixed no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -Polish Speech Database ldc-polish-speech-db-19 paid https://catalog.ldc.upenn.edu/LDC2019S19 yes LDC LDC LDC pl-PL 2019 803-554-461-385-1 1-58563-903-6 LDC2019S19 no info Tomasz Szwelnik no info no info VoiceLab 3000 USD 3000 USD train, valid, test 280 280 no info 200 263,424 no 815000 no info yes yes transcription and annotation 100 no no info txt multi read controlled paid contributors quiet headset no info flac flac 1 16000 16 yes 15-30 yes M51%-F49% yes no info no info no info no info no info no info -Pelcra YouTube 2 pelcra-yt2-19 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY PELCRA PELCRA pl-PL 2019 no info no info PELCRA_YT2 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 6 5 no info 45 23 no 49000 no info no info no info no info no info no no info no info multi various corpus public speakers mixed no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -PolEval 2019 pjatk-poleval-2019 free http://2019.poleval.pl/index.php/tasks/task5 yes no info PJATK None pl-PL 2019 no info no info no info http://2019.poleval.pl/files/2019/11.pdf Daniel Korzinek no info 2019 PJATK free free test 1 1 0.8 no info 29 no no info no info no info no info no info 0 UTF-8, lowercase, no punctuation, no numbers, no abbreviations no info no info parliamentary speeches lecture corpus public speakers various no info no info wav pcm 1 16000 16 no N/A no N/A no no N/A no no no no -M-AILABS speech dataset mailabs-19 free https://www.caito.de/2019/01/03/the-m-ailabs-speech-dataset/ yes Proprietary M-AILABS Coqui Free Corpora Catalog multi 2019 no info no info no info no info Imdat Solak no info 2019 M-AILABS free free no 
info 54 54 4.2 no info no info no no info no info no info no info no info 100 yes no info yes multiple read corpus volunteers quiet various various wav pcm 1 16000 16 no N/A yes no info no no N/A no no no no -PWR Atlas Zasobów Otwartej Nauki AZON nagrania kontrolowane pwr-azon-read-21 free https://zasobynauki.pl/zasoby/korpus-nagran-probek-mowy-do-celow-budowy-modeli-akustycznych-dla-automatycznego-rozpoznawania-mowy,62687/ yes CC-BY-SA WUST AZON pl-PL 2020 no info no info no info no info Teresa Sas no info 2022 Politechnika Wrocławska free free none 5 5 1.7 29 2,788 no no info no info no info no info no info 100 no info no info no info multiple read controlled volunteers quiet no info no info wav pcm 1 44100 16 no N/A yes M72%-F28% no no N/A no no no no -PWR Atlas Zasobów Otwartej Nauki AZON nagrania spontaniczne pwr-azon-spontaneous-21 free https://zasobynauki.pl/zasoby/korpus-nagran-probek-mowy-do-celow-budowy-modeli-akustycznych-dla-automatycznego-rozpoznawania-mowy,62687/ yes CC-BY-SA WUST AZON pl-PL 2020 no info no info no info no info Teresa Sas no info 2022 Politechnika Wrocławska free free none 2 2 0.65 27 456 no no info no info no info no info no info 100 no info no info no info multiple spontaneous corpus public speakers mixed no info no info wav pcm 1 44100 16 no N/A yes M72%-F28% no no N/A no no no no -Pelcra YouTube 1 pelcra-yt1-20 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY PELCRA PELCRA pl-PL 2020 no info no info PELCRA_YT1 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 7 5 no info 106 25 no 49000 no info no info no info no info no info no no info no info multi various corpus public speakers various no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info -Mozilla Common Voice mozilla-comm-voice-22 free commonvoice.mozilla.org/ yes CC-0 Mozilla Foundation Common Voice multi 
2020 no info no info no info https://arxiv.org/pdf/1912.06670.pdf Mozilla Org 9.0 2022 Mozilla Foundation free free train, valid, test 148 148 4 3062 no info no no info no info no info yes up to 3 binary validations of each recording (2 if first 2 validations are consistent) 100 no info no info no info news, literature read open crowd various various multiple mp3 mpeg-3 1 48000 16 yes yes yes M60%-F14% no no N/A no info no no N/A -Multilingual librispeech fair-mls-20 free https://www.openslr.org/94/ yes CC-BY FAIR Github multi 2020 no info no info SLR94 https://arxiv.org/abs/2012.03411 Facebook (Meta) 2 2020 Facebook free free train, valid, test 137 137 6.2 16 28,860 no 492320 67100 yes yes manually transcribed and annotated dev, test subsets 100 yes no info no info books read corpus volunteers various various no info flac opus 1 16000 16 no N/A yes no info no no N/A no no no N/A -Vox populi fair-voxpopuli-pl-21 free https://github.com/facebookresearch/voxpopuli yes CC-0 FAIR Github multi 2021 no info no info no info https://arxiv.org/abs/2101.00390 Changhan Wang (changhan@fb.com), Morgane Rivière (mriviere@fb.com), Ann Lee (annl@fb.com) 2 2022 European Union free free train, valid, test 21200 111 no info 282 no info no 802000 no info yes yes transcription 0.52% no info no info no info parliament speech lecture corpus public speakers various studio mic no info ogg vorbis 1 16000 16 no N/A no M76%-F24% yes no N/A no info no no no -Korpus radiowy clarin-radio-21 free https://clarin-pl.eu/DSpace/handle/11321/820 yes CC-BY PJATK DSpace CLARIN PL pl-PL 2021 no info no info no info no info Łukasz Brocki no info 2021 Polish Ministry of Higher Education free free none 7 0 0.75 200 192 no no info no info no info no info no info 0 N/A no info N/A no info spontaneus, scripted corpus public speakers mixed various no info raw pcm 1 16000 16 no N/A no N/A no no N/A no no no N/A -EU Parliament clarin-pjatk-pinc-21 free https://clarin-pl.eu/DSpace/handle/11321/821 yes CC-BY PJATK 
DSpace CLARIN PL pl-PL 2021 no info no info http://hdl.handle.net/11321/821 no info Daniel Korzinek no info 2021 Polish Ministry of Science and Higher Education (N N104 205039) free free none 32 32 3.7 no info 1,040 yes no info no info no info no info no info 100 no info no info no info multiple lecture corpus public speakers various various no info wav pcm 1 16000 16 no N/A no N/A no no no yes no yes no -DiaBiz sample clarin-diabiz-sample-22 free http://docs.pelcra.pl/doku.php?id=diabiz yes ? PELCRA DSpace CLARIN PL pl-PL 2022 no info no info no info no info Piotr Pęzik no info no info European Union, Poland, Clarin-Biz free free none 1 1 no info no info 18 no no info no info no info yes transcription 100 no info no info json, xml, txt customer service dialogs controlled paid contributors quiet landline phone no info wav pcm 1 8000 16 no info no info no info no info yes no info no info yes no no no -DiaBiz clarin-diabiz-22 paid http://docs.pelcra.pl/doku.php?id=diabiz yes Proprietary PELCRA DSpace CLARIN PL pl-PL 2022 no info no info no info DiaBiz – an Annotated Corpus of Polish Call Center Dialogs Piotr Pęzik no info no info European Union, Poland, Clarin-Biz no info no info train, valid, test 410 410 no info 196 3,764 no 447576 no info no info yes transcription 100 no info no info json, xml, txt customer service dialogs controlled paid contributors quiet landline phone no info wav pcm 2 8000 16 no info no info yes no info no no no yes no no no -Diabiz eval clarin-diabiz-eval-22 paid http://docs.pelcra.pl/doku.php?id=diabiz yes Proprietary PELCRA DSpace CLARIN PL pl-PL 2022 no info no info no info no info Piotr Pęzik no info no info European Union, Poland, Clarin-Biz no info no info test 41 41 no info 146 no info no no info no info no info yes transcription 100 no info no info json, xml, txt customer service dialogs controlled paid contributors quiet landline phone no info wav pcm 2 8000 16 no info no info no info no info yes no info no info yes no no no -PWR 
male set sample pwr-male-sample-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska free free none 6 6 0.57 no info 4,738 no no info no info no info no info no info 100 no info no info no info no info read no info no info no info studio mic various wav pcm 1 16000 16 no N/A no N/A no no N/A no no no no -PWR short words sample pwr-short-words-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska free free none 1 1 0.35 no info 939 no no info no info no info no info no info 100 no info no info no info no info read no info no info no info studio mic various wav pcm 1 44100 16 no N/A no N/A no no N/A no no no no -PWR Very Important Utterances pwr-vui-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska free free none 1 1 0.1 no info 2,703 no no info no info no info no info no info 100 no info no info no info no info read no info no info no info studio mic various wav pcm 1 44100 16 no N/A no N/A no no N/A no no no no -Appen Mobile Speech appen-mobile-unk paid https://appen.com/products/pre-labeled-datasets/ yes Proprietary Appen Appen Pre-Labelled Datasets pl-PL no info no info no info Appen ID: POL_ASR002_CN no info Appen no info no info Appen no info no info no info 293 293 no info 353 106,674 no no info 168,544 no info yes no info 100 no info no info no info news read controlled paid contributors various mobile phone no info wav pcm 1 16000 16 no info no info no info no info no info no info no info no info no info no info no info -Shaip Polish Speech Corpus shaip-media-corpus-21 paid https://pl.shaip.com/offerings/speech-data-catalog/ yes Proprietary Shaip Shaip data catalog pl-PL 2021 no info no info no info no info 
Shaip no info 2021 Shaip no info no info no info 269 269 no info 533 no info yes no info no info no info no info no info 100 no info no info no info news, interviews, podcasts, various corpus public speakers various various no info wav pcm 1 16000 16 no info no info yes M66%-F33% no info no info no info no info no info no info no info -Shaip Polish Mobile Speech Dataset shaip-mobile-speech-21 paid https://pl.shaip.com/offerings/speech-data-catalog/ yes Proprietary Shaip Shaip data catalog pl-PL 2021 no info no info no info no info Shaip no info 2021 Shaip no info no info no info 1482 1482 no info 2049 no info yes no info no info no info no info no info 100 no info no info no info no info read controlled paid contributors no info mobile phone no info wav pcm 1 48000 16 no info no info yes M35%-F65% no info no info no info no info no info no info no info -PolyAI MINDS-14 polyai-minds14-21 free http://poly-public-data.s3.amazonaws.com/MInDS-14/MInDS-14.zip%22 yes CC-BY PolyAI Hugging Face Data Catalog multi 2021 no info no info no info https://arxiv.org/abs/2104.08524 PolyAI no info 2022 PolyAI free free train,test no info no info 0.5 no info 578 no no info no info no info no info no info no info no info no info no info no info spoken controlled paid contributors no info various no info wav pcm 1 16000 16 no info no info no N/A no info no info no info no info no info no info no info \ No newline at end of file +Dataset name Dataset ID Access type Access link Available online License Publisher Repository Languages Creation year ISLRN ISBN LR catalog ID Reference publication Contact point Latest version Last update year Sponsor Price - non-commercial usage Price - commercial usage Purpose and split Size audio total [hours] Size audio transcribed [hours] Size [GB] Speakers Audio recordings Audio segmentation Tokens Unique tokens Automatic QA Manual QA Manual QA scope Transcription coverage Transcription protocol Denormalized transcriptions Transcription and annotation 
format Domain Speech type Audio collection process Speech recordings source Acoustic environment Audio device Device model Audio format Audio codec Audio channels Sampling rate [Hz] Bits per sample Age info Age balance Gender info Gender balance Nativity info Accent info Accent representative Education info Occupation info Health info Time alignement annotation +Appen Global Phone Polish appen-gphone-02 paid https://appen.com/products/pre-labeled-datasets/ yes Proprietary Appen Appen Pre-Labelled Datasets pl-PL 2002 no info no info Appen ID: POL_ASR001 no info Appen no info no info European Union no info no info train, valid, test 25 25 2 99 10,130 no no info no info no info no info no info 100 no info no info no info general read controlled paid contributors, volunteers home, office studio mic Sennheiser 440-6 wav pcm 1 16000 16 yes 20-39 yes M53%-F47% yes no no no no no +Appen Mobile Speech appen-mobile-unk paid https://appen.com/products/pre-labeled-datasets/ yes Proprietary Appen Appen Pre-Labelled Datasets pl-PL no info no info no info Appen ID: POL_ASR002_CN no info Appen no info no info Appen no info no info no info 293 293 no info 353 106,674 no no info 168,544 no info yes no info 100 no info no info no info news read controlled paid contributors various mobile phone no info wav pcm 1 16000 16 no info no info no info no info no no no no no no +Appen SpeechDat Phone appen-speechdat-10 paid https://appen.com/products/pre-labeled-datasets/ yes Proprietary Appen Appen Pre-Labelled Datasets pl-PL 2010 no info no info Appen ID: Polish SpeechDat(E) Database https://www.phonetik.uni-muenchen.de/forschung/BITS/TP1/Cookbook/node1.html Appen no info no info European Union no info no info train, valid, test 78 78 no info 1000 48,000 no no info no info yes yes no info 100 no info no info no info multi-domain read controlled paid contributors quiet landline phone no info wav pcm 1 8000 16 no info no info no info no info no no no no no no +BABEL elra-babel-98 paid 
http://catalog.elra.info/en-us/repository/browse/ELRA-S0307/ yes ELRA ELRA ELRA pl-PL 1998 376-102-726-476-0 no info ELRA-S0307 no info Valerie Mapelli no info 2010 European Union 600 EUR 6000 EUR train, valid, test 16 16 no info 60 no info no no info no info no info no info no info no info no info no info no info numbers no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no no no no no no +Baza nagrań mowy AGH agh-corpus-may-15 no info no info no Proprietary AGH pl-PL 2015 no info no info no info https://link.springer.com/article/10.1007/s10579-015-9302-y Piotr Å»elasko no info no info AGH no info no info train, valid, test 25 25 no info 166 no info no 117 450 13 784 OpenSJP dictionary yes subset of unspecified size 100 no info no info MLF multi-domain read controlled no info no info various no info wav pcm 1 16000 16 no info 20-35 yes M66%-F-34% no no no no no no +Baza nagrań mowy AGH dla systemu SARMATA agh-corpus-sep-15 no info no info no Proprietary AGH pl-PL 2015 no info no info no info https://www.researchgate.net/publication/281774738_SARMATA_20_Automatic_Polish_Language_Speech_Recognition_System Piotr Å»elasko no info no info AGH no info no info train, valid, test 42 42 no info 391 no info no no info no info OpenSJP dictionary yes subset of unspecified size 100 no info no info MLF multi-domain read controlled no info no info various no info wav pcm 1 16000 16 no info 20-35 yes M55%-F45% no no no no no no +Clarin cyfry clarin-pjatk-cyfry-16 free https://clarin-pl.eu/dspace/bitstream/handle/11321/317/cyfry.zip?sequence=1&isAllowed=y yes CC-BY-SA PJATK DSpace CLARIN PL pl-PL 2016 no info no info http://hdl.handle.net/11321/317 no info Daniel Korzinek no info 2021 Polish Ministry of Science and Higher Education (N N104 205039) free free train, valid, test 1 1 0.12 25 488 no no info no info no info no info no info 100 no info no info no info digits read controlled no info quiet no info no info 
raw pcm 1 16000 16 no info no no no no no no no +Clarin mobile clarin-pjatk-mobile-15 free https://clarin-pl.eu/DSpace/handle/11321/237 yes CC-BY PJATK DSpace CLARIN PL, VLO pl-PL 2015 no info no info no info no info Daniel Korzinek no info 2015 CLARIN-PL free free train, valid, test 13 13 1.5 to check 3,552 no no info no info no info no info no info 0 no info no info no info no info read controlled volunteers quiet landline phone no info wav pcm 1 16000 16 no info no no no no no no no +Clarin studio clarin-pjatk-studio-15 free https://clarin-pl.eu/DSpace/handle/11321/236 yes CC-BY PJATK DSpace CLARIN PL, VLO pl-PL 2015 no info no info no info https://www.clarin.eu/sites/default/files/02%20-%20KORZINEK-Polish.pdf Daniel Korzinek no info 2015 CLARIN-PL free free train, valid, test 56 56 4 to check no info no no info no info no info no info no info 100 no info no info no info no info read controlled volunteers quiet studio mic no info wav pcm 1 16000 16 no info no no no no no no no +Corpora put-corpora-97 no info no no info PUT pl-PL 1997 no info no info no info https://www.isca-speech.org/archive_v0/archive_papers/eurospeech_1997/e97_1735.pdf Stefan Grocholewski no info no info no info no info no info no info 6 6 45 365 no no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no no no no no no +CSLU: 22 Languages Corpus ldc-clsu-pl-05 paid https://catalog.ldc.upenn.edu/LDC2005S26 yes LDC LDC LDC multi 2005 no info 1-58563-356-9 LDC2005S26 no info T. Lander or Linguistic Data Consortium no info no info no info 150 USD 150 USD no info 4 4 no info no info 2,500 no no info no info no info yes Check if prompt instructions were followed. Manual transcription of 30% of data. 
30 no info no info no info no info read controlled no info no info landline phone no info RIFF uLaw 1 8000 8 no info no info no info no info no no no no no no +DiaBiz clarin-diabiz-22 paid http://docs.pelcra.pl/doku.php?id=diabiz yes Proprietary UL DSpace CLARIN PL pl-PL 2022 no info no info no info https://aclanthology.org/2022.lrec-1.76/ Piotr Pęzik no info no info European Union, Poland, Clarin-Biz no info no info train, valid, test 410 410 no info 196 3,764 no 447576 no info no info yes transcription 100 no info no info json, xml, txt customer service conversational controlled paid contributors quiet landline phone no info wav pcm 2 8000 16 no info no info yes no info no no no no no yes +Diabiz eval clarin-diabiz-eval-22 paid http://docs.pelcra.pl/doku.php?id=diabiz yes Proprietary UL DSpace CLARIN PL pl-PL 2022 no info no info no info no info Piotr Pęzik no info no info European Union, Poland, Clarin-Biz no info no info test 41 41 no info 146 no info no no info no info no info yes transcription 100 no info no info json, xml, txt customer service conversational controlled paid contributors quiet landline phone no info wav pcm 2 8000 16 no info no info no info no info yes no no no no yes +DiaBiz sample clarin-diabiz-sample-22 no info http://docs.pelcra.pl/doku.php?id=diabiz yes Proprietary UL DSpace CLARIN PL pl-PL 2022 no info no info no info no info Piotr Pęzik no info no info European Union, Poland, Clarin-Biz no info no info none 1 1 no info no info 18 no no info no info no info yes transcription 100 no info no info json, xml, txt customer service conversational controlled paid contributors quiet landline phone no info wav pcm 2 8000 16 no info no info no info no info yes no no no no yes +EASR Corpora of European Portuguese, French, Hungarian and Polish Elderly Speech elra-easr-14 no info no info no no info ELRA ELRA multi 2014 no info no info no info http://www.lrec-conf.org/proceedings/lrec2014/pdf/365_Paper.pdf Artur Kolesiński no info no info European 
Union no info no info train, valid, test 205 205 no info 781 no info no no info no info no info yes transcription 100 yes no info no info multi-domain read controlled paid contributors quiet headset no info wav pcm 1 16000 16 no info yes, but elderly group only (over 60) yes yes yes 100 no no no no no +ELRA Global Phone Polish elra-gphone-elra-02 paid http://catalog.elra.info/en-us/repository/browse/ELRA-S0320/ yes ELRA ELRA ELRA multi 2002 350-930-795-617-4 no info ELRA-S0320 no info Valerie Mapelli no info no info European Union 700 EUR 3700 train, valid, test 25 25 2 99 10,130 no no info no info no info no info no info 100 no info no info no info general read controlled paid contributors, volunteers home, office studio mic Sennheiser 440-6 wav pcm 1 16000 16 yes 20-39 yes M53%-F47% no no no no no no +EU Parliament clarin-pjatk-pinc-21 free https://clarin-pl.eu/DSpace/handle/11321/821 yes CC-BY PJATK DSpace CLARIN PL pl-PL 2021 no info no info http://hdl.handle.net/11321/821 no info Daniel Korzinek no info 2021 Polish Ministry of Science and Higher Education (N N104 205039) free free none 32 32 3.7 no info 1,040 yes no info no info no info no info no info 100 no info no info no info multi-domain public speech corpus public speakers various various no info wav pcm 1 16000 16 no no no no no no no yes +Exmeralda hzsk-exmeralda-pl-07 free https://corpora.uni-hamburg.de/hzsk/de/islandora/object/spoken-corpus:demo-1.0#corpus-metadata no HZSK-PUB HZSK HZSK multi 2007 no info no info http://hdl.handle.net/11022/0000-0000-4F70-A no info HZSK no info 2009 no info free not available none no info no info no info 5 no info no no info no info no info no info no info 100 no info no info no info broadcast conversational corpus paid contributors broadcast lavalier mic no info wav pcm 1 44100 16 no info no info yes M50%-F50% no no no no no no +Gewiss hzsk-gewiss-pl-12 free https://gewiss.uni-leipzig.de/index.php?id=about_gewiss&L=1 no HZSK-PUB HZSK HZSK multi 2012 no info no info 
no info no info no info no info no info no info free free none 20 20 no info 10 no info no no info no info no info yes no info 100 GAT2 no info exmeralda academic lecture public speech corpus volunteers queit no info no info no info no info no info no info no info no info no info no info no info no no no no no no +Hamburg bilingual hzsk-hamcopolig-11 free https://corpora.uni-hamburg.de/hzsk/de/islandora/object/spoken-corpus:hamcopolig yes HZSK-PUB HZSK CLARIN EU, VLO multi 2011 no info no info http://hdl.handle.net/11022/0000-0000-6969-5 https://benjamins.com/catalog/hsm.14.10cza Agnieszka Czachor 0.2 2012 no info free no info no info no info no info no info no info no info no no info no info no info no info no info no info no info no info no info multi-domain conversational controlled volunteers no info no info no info no info no info no info no info no info yes yes no info no info yes yes no no no no no +Jurisdic uam-jurisdic-08 no info no info no no info AMU pl-PL 2008 no info no info no info http://www.lrec-conf.org/proceedings/lrec2008/pdf/326_paper.pdf Grażyna Demenko no info no info Poland no info no info train, valid, test 855 855 no info 1000 494,933 no no info no info yes yes transcription, annotation 100 yes no info no info multi-domain various various government agents quiet no info Sennheiser ME-3, AKG C-1000S, Sennheiser ew300G2 wav pcm 2 16000 16 no info no info yes no info yes yes no no no yes no +Korpus mowy szeptanej Politechniki Poznańskiej put-whisper-16 no info no info no no info PUT pl-PL 2016 no info no info no info https://yadda.icm.edu.pl/baztech/element/bwmeta1.element.baztech-19695dfb-03d8-401f-bb88-c90abebe2bf5 Piotr Kozierski no info no info Polish Ministry of Higher Education no info no info no info 9 9 no info no info no info no no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info no info 
no no no no no +Korpus radiowy clarin-radio-21 free https://clarin-pl.eu/DSpace/handle/11321/820 yes CC-BY PJATK DSpace CLARIN PL pl-PL 2021 no info no info no info no info Łukasz Brocki no info 2021 Polish Ministry of Higher Education free free none 7 0 0.75 200 192 no no info no info no info no info no info 0 no info no info public speech corpus public speakers mixed various no info raw pcm 1 16000 16 no no no no no no no no +LUNA pjatk-luna-07 no info http://nlp.ipipan.waw.pl/NLP-SEMINAR/070423.pdf no no info PJATK pl-PL 2007 no info no info no info no info no info no info no info European Union no info no info no info 11 11 no info 500 500 no no info no info no info yes annotation 100 yes no info no info public transport conversational controlled volunteers no info landline phone no info no info no info no info no info no info no info no info no info no info no info no no no no no +M-AILABS speech dataset mailabs-19 free https://data.solak.de/data/Training/stt_tts/pl_PL.tgz yes Proprietary M-AILABS Coqui Free Corpora Catalog multi 2019 no info no info no info no info Imdat Solak no info 2019 M-AILABS free free no info 54 54 4.2 no info no info no no info no info no info no info no info 100 yes no info yes multi-domain read corpus volunteers quiet various various wav pcm 1 16000 16 no yes no info no no no no no no +Mozilla Common Voice mozilla-comm-voice-20 free commonvoice.mozilla.org/ yes CC-0 Mozilla Foundation Common Voice multi 2020 no info no info no info https://arxiv.org/pdf/1912.06670.pdf Mozilla Org 9.0 2022 Mozilla Foundation free free train, valid, test 148 148 4 3062 no info no no info no info no info yes up to 3 binary validations of each recording (2 if first 2 validations are consistent) 100 no info no info no info multi-domain read controlled crowd various various multiple mp3 mpeg-3 1 48000 16 yes yes yes M60%-F14% no no no no no no +Multilingual librispeech fair-mls-20 free https://www.openslr.org/94/ yes CC-BY FAIR Github multi 2020 no info 
no info SLR94 https://arxiv.org/abs/2012.03411 Facebook (Meta) 2 2020 Facebook free free train, valid, test 137 137 6.2 16 28,860 no 492320 67100 yes yes manually transcribed and annotated dev, test subsets 100 yes no info no info books read corpus volunteers various various no info flac opus 1 16000 16 no yes no info no no no no no no +Pelcra EMI pelcra-emi-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-GB 2018 no info no info PELCRA_EMI http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free valid 18 9 12 44 22 no 96000 no info no info no info no info 100 no info no info JSON, EAF, ELAN, EMU, TextGrid interview conversational controlled volunteers no info no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra EMO pelcra-emo-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info PELCRA_EMO http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free train, valid, test 28 26 2.2 80 40 no 252000 no info no info no info no info 100 no info no info JSON, EAF, ELAN, EMU, TextGrid interview conversational controlled volunteers no info no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra learner English corpus pelcra-plec-11 free http://pelcra.pl/plec/downloads_fs yes no info UL PELCRA en-PL 2011 no info no info no info “Towards the PELCRA Learner English Corpus.” In Corpus Data across Languages and Disciplines, edited by Piotr Pęzik, 28:33–42. Łódź Studies in Language. Peter Lang, 2012. 
Piotr Pęzik no info 2012 Polish Ministry of Science and Higher Education (N N104 205039) free no info none no info no info no info no info no info no no info no info yes yes annotation 100 no no info xls general read controlled volunteers quiet no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra LUZ pelcra-luz-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info PELCRA_LUZ http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 20 20 13.2 42 21 no 213000 no info no info no info no info 100 no no info JSON, EAF, ELAN, EMU, TextGrid interview conversational controlled volunteers quiet no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra Mowa Miasta Kraków pelcra-mmk-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info MMK http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 2 2 no info 11 4 no 15900 no info no info no info no info no info no no info no info conversations conversational corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra Mowa Miasta Wrocław 1 pelcra-mmw-18 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info MMW_1 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 8 7 no info 65 14 no 60000 no info no info no info no info no info no no info no info conversations conversational corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no info no no no no no +Pelcra Mowa Miasta Wrocław 2 pelcra-mmw2-18 free 
http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info MMW_2 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 7 7 no info 38 14 no 70000 no info no info no info no info no info no no info no info conversations conversational corpus no info mixed no info no info no info no info no info no info no info no info no info no info no info no no no no no no +Pelcra PARL pelcra-parl-15 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2018 no info no info PELCRA_PARL http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 14 12 no info 251 48 no 99000 no info no info no info no info no info no no info no info parliament speech public speech corpus public speakers mixed no info no info no info no info no info no info no info no info no info no info no info no no no no no no +Pelcra Spelling and NUmbers Voice database pelcra-snuv-12 free http://metashare.elda.org/repository/browse/spelling-and-numbers-voice-database/f9e499c663f111e2bff4525400d761477c36ad442d124e6892bb3c8ce1a1ecdf/ yes CC-BY UL PELCRA pl-PL 2012 no info no info no info no info Piotr Pęzik no info 2012 European Union, Poland free free no info 220 220 no info 210 99,517 no 704625 no info yes yes no info 100 no no info no info numbers read controlled paid contributors various headset no info WAV pcm 1 22050 16 no info no info no info no info yes no no no no no +Pelcra YouTube 1 pelcra-yt1-20 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2020 no info no info PELCRA_YT1 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 7 5 no info 106 25 no 49000 no info no info no info no info no info no no info no info multi-domain various corpus public speakers various no info no info no info no info no info no info no info no info no 
info no info no info no no no no no no +Pelcra YouTube 2 pelcra-yt2-20 free http://docs.pelcra.pl/doku.php?id=spoken_offline_corpora yes CC-BY UL PELCRA pl-PL 2020 no info no info PELCRA_YT2 http://www.lrec-conf.org/proceedings/lrec2018/pdf/888.pdf Piotr Pęzik no info 2018 CLARIN-PL free free none 6 5 no info 45 23 no 49000 no info no info no info no info no info no no info no info multi-domain various corpus public speakers mixed no info no info no info no info no info no info no info no info no info no info no info no no no no no no +PolEval 2019 pjatk-poleval-19 free http://2019.poleval.pl/index.php/tasks/task5 yes no info PJATK pl-PL 2019 no info no info no info http://2019.poleval.pl/files/2019/11.pdf Daniel Korzinek no info 2019 PJATK free free test 1 1 0.8 no info 29 no no info no info no info no info no info 0 UTF-8, lowercase, no punctuation, no numbers, no abbreviations no info no info parliament speech public speech corpus public speakers various no info no info wav pcm 1 16000 16 no no no no no no no no +Polish Sejm Senat speech corpus clarin-sejm_senat-18 free https://huggingface.co/datasets/jimregan/clarinpl_sejmsenat yes CC-BY PJATK DSpace CLARIN PL pl-PL 2018 no info no info no info https://acoustics.ippt.pan.pl/index.php/aa/article/view/327/pdf_32 Daniel Korzinek no info 2018 Polish Ministry of Higher Education free free train, test 97 97 no info 516 6,762 no no info no info no info no info no info no info no info no info no info parliament speech public speech corpus public speakers parliament no info no info wav pcm 1 no info no info no info no info no info no info no no no no no no +Polish Speech Database ldc-polish-speech-db-19 paid https://catalog.ldc.upenn.edu/LDC2019S19 yes LDC LDC LDC pl-PL 2019 803-554-461-385-1 1-58563-903-6 LDC2019S19 no info Tomasz Szwelnik no info no info VoiceLab 3000 USD 3000 USD train, valid, test 280 280 no info 200 263,424 no 815000 no info yes yes transcription and annotation 100 no no info txt multi-domain read 
controlled paid contributors quiet headset no info flac flac 1 16000 16 yes 15-30 yes M51%-F49% yes no no no no no +PolyAI MINDS-14 polyai-minds14-21 free http://poly-public-data.s3.amazonaws.com/MInDS-14/MInDS-14.zip%22 yes CC-BY PolyAI Hugging Face Data Catalog multi 2021 no info no info no info https://arxiv.org/abs/2104.08524 PolyAI no info 2022 PolyAI free free train,test 1 1 0.5 no info 578 no no info no info no info no info no info no info no info no info no info no info read controlled paid contributors no info various no info wav pcm 1 8000 16 no info no info no no no no no no no +PWR Atlas Zasobów Otwartej Nauki AZON nagrania kontrolowane pwr-azon-read-20 free https://zasobynauki.pl/zasoby/korpus-nagran-probek-mowy-do-celow-budowy-modeli-akustycznych-dla-automatycznego-rozpoznawania-mowy,53293/ yes CC-BY-SA WUST AZON pl-PL 2020 no info no info no info no info Teresa Sas no info 2022 Politechnika Wrocławska free free none 5 5 1.7 29 2,788 no no info no info no info no info no info 100 no info no info no info multi-domain read controlled volunteers quiet no info no info wav pcm 1 44100 16 no yes M72%-F28% no no no no no no +PWR Atlas Zasobów Otwartej Nauki AZON nagrania spontaniczne pwr-azon-spontaneous-20 free https://zasobynauki.pl/zasoby/korpus-nagran-probek-mowy-do-celow-budowy-modeli-akustycznych-dla-automatycznego-rozpoznawania-mowy,62687/ yes CC-BY-SA WUST AZON pl-PL 2020 no info no info no info no info Teresa Sas no info 2022 Politechnika Wrocławska free free none 2 2 0.65 27 456 no no info no info no info no info no info 100 no info no info no info multi-domain public speech corpus public speakers mixed no info no info wav pcm 1 44100 16 no yes M72%-F28% no no no no no no +PWR male set sample pwr-maleset-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska free free none 6 6 0.57 no info 4,738 no no info no info no info no info no 
info 100 no info no info no info no info read controlled no info no info studio mic various wav pcm 1 16000 16 no no no no no no no no +PWR short words sample pwr-shortwords-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska free free none 1 1 0.35 no info 939 no no info no info no info no info no info 100 no info no info no info no info read controlled no info no info studio mic various wav pcm 1 44100 16 no no no no no no no no +PWR Very Important Utterances pwr-viu-unk free https://www.ii.pwr.edu.pl/~sas/ASR/ yes no info WUST Author's page pl-PL no info no info no info no info no info no info no info no info Politechnika Wrocławska free free none 1 1 0.1 no info 2,703 no no info no info no info no info no info 100 no info no info no info no info read controlled no info no info studio mic various wav pcm 1 44100 16 no no no no no no no no +Shaip Polish Mobile Speech Dataset shaip-mobile-speech-21 paid https://pl.shaip.com/offerings/speech-data-catalog/ yes Proprietary Shaip Shaip data catalog pl-PL 2021 no info no info no info no info Shaip no info 2021 Shaip no info no info no info 1482 1482 no info 2049 no info yes no info no info no info no info no info 100 no info no info no info no info read controlled paid contributors no info mobile phone no info wav pcm 1 48000 16 no info no info yes M35%-F65% no no no no no no +Shaip Polish Speech Corpus shaip-media-corpus-21 paid https://pl.shaip.com/offerings/speech-data-catalog/ yes Proprietary Shaip Shaip data catalog pl-PL 2021 no info no info no info no info Shaip no info 2021 Shaip no info no info no info 269 269 no info 533 no info yes no info no info no info no info no info 100 no info no info no info multi-domain various corpus public speakers various various no info wav pcm 1 16000 16 no info no info yes M66%-F33% no no no no no no +Speecon Polish elra-speecon-pl-05 paid 
http://catalog.elra.info/en-us/repository/browse/ELRA-S0179/ yes ELRA ELRA ELRA multi 2005 697-702-806-588-8 no info ELRA-S0179 https://www.researchgate.net/publication/2494457_SPEECON_-_Speech_Databases_for_Consumer_Devices_Database_Specification_and_Validation/link/54b64e6b0cf28ebe92e7c713/download ELRA 1.0 2007 European Union 67 000 EUR 75 000 EUR train, valid, test 248 248 135 600 no info no no info no info no info yes Scope: - documentation - completeness of the database - file formats - signal quality - transcription quality - lexicon - speaker and environment distribution Processes: - prompt sheet and lexicon validation - first 10-speaker database validation 100 yes no info no info multi-domain read controlled no info office, public space, broadcast, car studio mic no info wav pcm 4 16000 16 yes yes yes yes yes no no no no no +Vox populi fair-voxpopuli-pl-21 free https://github.com/facebookresearch/voxpopuli yes CC-0 FAIR Github multi 2021 no info no info no info https://arxiv.org/abs/2101.00390 Changhan Wang (changhan@fb.com), Morgane Rivière (mriviere@fb.com), Ann Lee (annl@fb.com) 2 2022 European Union free free train, valid, test 21200 111 no info 282 no info no 802000 no info yes yes transcription 0.52% no info no info no info parliament speech public speech corpus public speakers various studio mic no info ogg vorbis 1 16000 16 no no M76%-F24% yes no no no no no +Fleurs google-fleurs-22 free https://huggingface.co/datasets/google/fleurs yes CC-BY Google Hugging Face Data Catalog multi 2022 no info no info no info https://arxiv.org/pdf/2205.12446.pdf Alexis Conneau 2 2022 Google free free train, valid, test 12.1 12.1 2.1 no info 3937 no no info no info no yes recordings 100% no yes tsv wikipedia articles read controlled paid contributors various various no info wav pcm 1 16000 16 no yes M70%-F30% yes no no no no no +Spokes Biz pelcra-spokesbiz-23 free http://docs.pelcra.pl/doku.php?id=spokesbiz yes CC-BY-NC-ND UL PELCRA pl-PL 2023 no info no info no 
info http://arxiv.org/abs/2312.12364 Piotr Pęzik 1 2023 CLARIN-PL free no info no info 650 650 no info 590 925 yes 5911420 no info yes yes transcription, annotation 100% no info no info no info multi-domain conversational corpus Polish Speech Database various various no info wav pcm 1 16000 16 yes yes yes yes yes yes yes yes no no yes diff --git a/snapshots/pl-asr-speech-datasets-taxonomy-latest.tsv b/snapshots/pl-asr-speech-datasets-taxonomy-latest.tsv index 2f60e07..f41b6cc 100644 --- a/snapshots/pl-asr-speech-datasets-taxonomy-latest.tsv +++ b/snapshots/pl-asr-speech-datasets-taxonomy-latest.tsv @@ -1,62 +1,62 @@ -Dataset attribute Purpose Allowed values -Dataset name Full name of a speech dataset [a-z A-Z0-9_\-] -Dataset ID Dataset unique identifier for reporting [a-z0-9\-] -Access type Dataset access type from the cost perspective. free, paid, no-info -Access link Web reference for accessing or purchasing a dataset URL format -Available online Validated access status as of March 2023 yes, no -License Dataset license type Apache, CC-0, CC-BY, CC-BY-SA, ELRA, HZSK-PUB, LDC, Proprietary -Publisher Creator or publisher of a dataset [a-z A-Z\-] -Repository Main repository hosting a dataset [a-z A-Z\-] -Languages Language and country code of speakers recorded $lang(ISO-639-1)-$country code(ISO-3166-2), multi -Creation year Year a dataset was created or published \d{4} -ISLRN International Standard Language Resource Number ISRLN -ISBN International Standard Book Number ISBN -LR catalog ID Language data repository ID URL, [a-z A-Z\-\_0-9] -Reference publication Link to relevant publication describing a dataset URL -Contact point Contact point referenced in the documentation [a-z A-Z\-\_0-9\@] -Latest version The latest version of the released dataset [0-9\.] 
-Last update year Last update date (year) \d{4} -Sponsor Institution which funded the creation of dataset [a-z A-Z\-\_0-9] -Price - non-commercial usage Price for non-commercial usage [free|\d+] -Price - commercial usage Price for commercial usage [free|\d+] -Purpose and split Target usage and available data splits train, valid, test, none -Size audio total [hours] Total amount of audio data in hours [\d+\.] -Size audio transcribed [hours] Total amount of transcribed speech data [\d+\.] -Size [GB] Size of a dataset in gigabytes [\d+\.] -Speakers Number of speakers recordings originate from [\d+] -Audio recordings Number of voice recordings in the corpus \d+ -Audio segmentation Are audio recordings segmented yes, no -Tokens Number of tokens in the corpus [\d+] -Unique tokens Number of unique tokens [\d+] -Automatic QA Type of automatic quality assurance process applied yes, no -Manual QA Type of manual quality assurance process applied yes, no -Manual QA scope Scope of manual QA applied [a-zA-Z \d+] -Transcription coverage Ratio of transcribed recordings % -Transcription protocol Is a transcription protocol specified or described? yes, no, description -Denormalized transcriptions Are there available transcriptions without abbreviations, numerals, punctuation etc. yes, no -Transcription and annotation format Format of transcription files [a-z A-Z0-9\.] 
-Domain Domain of utterances academic lecture, books, broadcast, conversations, customer service, digits, general, interview, multi-domain, news, numbers, parliament speech, public transport -Speech type Type of speech dialog, isolated words, lecture, monolog, read, spontaneous, various -Audio collection process Audio collection process controlled, corpus, various -Speech recordings source Speech recordings source volunteers, university employees, crowd, public speakers, paid contributors -Acoustic environment Acoustic conditions audio was collected in broadcast, car, home, mixed, quiet space, office, public space, various -Audio device Audio devices used for speech collection condenser mic, headset, mobile phone, landline phone, various -Device model Recording device(s) and model(s) [a-zA-Z\- ] -Audio format Audio storage format flac, mp3, raw, riff, wav -Audio codec Audio encoding format mp3, ogg, opus, vorbis -Audio channels Number of audio recording channels [1-16] -Sampling rate [Hz] Sampling rate of recorded audio \d?\d{4} -Bits per sample Number of bits used for encoding each audio sample 8,16,24,32 -Age info Annotation of speakers age yes, no -Age balance Is speakers age distribution balanced across demographics groups free text -Gender info Annotation of speakers gender yes, no -Gender balance Is speakers gender distribution balanced across demographics groups free text -Nativity info Annotation of speakers nativity yes, no -Accent info Annotation of speakers accent yes, no -Accent representative Is dataset balanced in terms of speakers' accent yes, no, N/A -Education info Information about speaker education level yes,no -Occupation info Information about professional occupation of speakers yes, no -Health info Information about health condition of speakers yes, no -Time alignement annotation Information about time- alignment of speech signal yes, no -Named entities annotation Transcriptions with Named Entities annotations yes, no -Part of speech 
annotation Transcriptions with POS (Part of speech) annotations yes, no \ No newline at end of file +Dataset attribute Purpose Allowed values +Dataset name Full name of a speech dataset [a-z A-Z0-9_\-] +Dataset ID Dataset unique identifier for reporting [a-z0-9\-] +Access type Dataset access type from the cost perspective. free, paid, no-info +Access link Web reference for accessing or purchasing a dataset URL format +Available online Validated access status as of March 2023 yes, no +License Dataset license type Apache, CC-0, CC-BY, CC-BY-SA, ELRA, HZSK-PUB, LDC, Proprietary +Publisher Creator or publisher of a dataset [a-z A-Z\-] +Repository Main repository hosting a dataset [a-z A-Z\-] +Languages Language and country code of speakers recorded $lang(ISO-639-1)-$country code(ISO-3166-2), multi +Creation year Year a dataset was created or published \d{4} +ISLRN International Standard Language Resource Number ISRLN +ISBN International Standard Book Number ISBN +LR catalog ID Language data repository ID URL, [a-z A-Z\-\_0-9] +Reference publication Link to relevant publication describing a dataset URL +Contact point Contact point referenced in the documentation [a-z A-Z\-\_0-9\@] +Latest version The latest version of the released dataset [0-9\.] +Last update year Last update date (year) \d{4} +Sponsor Institution which funded the creation of dataset [a-z A-Z\-\_0-9] +Price - non-commercial usage Price for non-commercial usage [free|\d+] +Price - commercial usage Price for commercial usage [free|\d+] +Purpose and split Target usage and available data splits train, valid, test, none +Size audio total [hours] Total amount of audio data in hours [\d+\.] +Size audio transcribed [hours] Total amount of transcribed speech data [\d+\.] +Size [GB] Size of a dataset in gigabytes [\d+\.] 
+Speakers Number of speakers recordings originate from [\d+] +Audio recordings Number of voice recordings in the corpus \d+ +Audio segmentation Are audio recordings segmented yes, no +Tokens Number of tokens in the corpus [\d+] +Unique tokens Number of unique tokens [\d+] +Automatic QA Type of automatic quality assurance process applied yes, no +Manual QA Type of manual quality assurance process applied yes, no +Manual QA scope Scope of manual QA applied [a-zA-Z \d+] +Transcription coverage Ratio of transcribed recordings % +Transcription protocol Is a transcription protocol specified or described? yes, no, description +Denormalized transcriptions Are there available transcriptions without abbreviations, numerals, punctuation etc. yes, no +Transcription and annotation format Format of transcription files [a-z A-Z0-9\.] +Domain Domain of utterances academic lecture, books, broadcast, conversations, customer service, digits, general, interview, multi-domain, news, numbers, parliament speech, public transport +Speech type Type of speech conversational, read, public speech, various +Audio collection process Audio collection process controlled, corpus, various +Speech recordings source Speech recordings source volunteers, university employees, crowd, public speakers, paid contributors +Acoustic environment Acoustic conditions audio was collected in broadcast, car, home, mixed, quiet space, office, public space, various +Audio device Audio devices used for speech collection condenser mic, headset, mobile phone, landline phone, various +Device model Recording device(s) and model(s) [a-zA-Z\- ] +Audio format Audio storage format flac, mp3, raw, riff, wav +Audio codec Audio encoding format mp3, ogg, opus, vorbis +Audio channels Number of audio recording channels [1-16] +Sampling rate [Hz] Sampling rate of recorded audio \d?\d{4} +Bits per sample Number of bits used for encoding each audio sample 8,16,24,32 +Age info Annotation of speakers age yes, no +Age balance Is 
speakers age distribution balanced across demographics groups free text +Gender info Annotation of speakers gender yes, no +Gender balance Is speakers gender distribution balanced across demographics groups free text +Nativity info Annotation of speakers nativity yes, no +Accent info Annotation of speakers accent yes, no +Accent representative Is dataset balanced in terms of speakers' accent yes, no, N/A +Education info Information about speaker education level yes,no +Occupation info Information about professional occupation of speakers yes, no +Health info Information about health condition of speakers yes, no +Time alignement annotation Information about time- alignment of speech signal yes, no +Named entities annotation Transcriptions with Named Entities annotations yes, no +Part of speech annotation Transcriptions with POS (Part of speech) annotations yes, no diff --git a/snapshots/taxonomy/pl-asr-speech-datasets-taxonomy-20240218.tsv b/snapshots/taxonomy/pl-asr-speech-datasets-taxonomy-20240218.tsv new file mode 100644 index 0000000..f41b6cc --- /dev/null +++ b/snapshots/taxonomy/pl-asr-speech-datasets-taxonomy-20240218.tsv @@ -0,0 +1,62 @@ +Dataset attribute Purpose Allowed values +Dataset name Full name of a speech dataset [a-z A-Z0-9_\-] +Dataset ID Dataset unique identifier for reporting [a-z0-9\-] +Access type Dataset access type from the cost perspective. 
free, paid, no-info +Access link Web reference for accessing or purchasing a dataset URL format +Available online Validated access status as of March 2023 yes, no +License Dataset license type Apache, CC-0, CC-BY, CC-BY-SA, ELRA, HZSK-PUB, LDC, Proprietary +Publisher Creator or publisher of a dataset [a-z A-Z\-] +Repository Main repository hosting a dataset [a-z A-Z\-] +Languages Language and country code of speakers recorded $lang(ISO-639-1)-$country code(ISO-3166-2), multi +Creation year Year a dataset was created or published \d{4} +ISLRN International Standard Language Resource Number ISRLN +ISBN International Standard Book Number ISBN +LR catalog ID Language data repository ID URL, [a-z A-Z\-\_0-9] +Reference publication Link to relevant publication describing a dataset URL +Contact point Contact point referenced in the documentation [a-z A-Z\-\_0-9\@] +Latest version The latest version of the released dataset [0-9\.] +Last update year Last update date (year) \d{4} +Sponsor Institution which funded the creation of dataset [a-z A-Z\-\_0-9] +Price - non-commercial usage Price for non-commercial usage [free|\d+] +Price - commercial usage Price for commercial usage [free|\d+] +Purpose and split Target usage and available data splits train, valid, test, none +Size audio total [hours] Total amount of audio data in hours [\d+\.] +Size audio transcribed [hours] Total amount of transcribed speech data [\d+\.] +Size [GB] Size of a dataset in gigabytes [\d+\.] 
+Speakers Number of speakers recordings originate from [\d+] +Audio recordings Number of voice recordings in the corpus \d+ +Audio segmentation Are audio recordings segmented yes, no +Tokens Number of tokens in the corpus [\d+] +Unique tokens Number of unique tokens [\d+] +Automatic QA Type of automatic quality assurance process applied yes, no +Manual QA Type of manual quality assurance process applied yes, no +Manual QA scope Scope of manual QA applied [a-zA-Z \d+] +Transcription coverage Ratio of transcribed recordings % +Transcription protocol Is a transcription protocol specified or described? yes, no, description +Denormalized transcriptions Are there available transcriptions without abbreviations, numerals, punctuation etc. yes, no +Transcription and annotation format Format of transcription files [a-z A-Z0-9\.] +Domain Domain of utterances academic lecture, books, broadcast, conversations, customer service, digits, general, interview, multi-domain, news, numbers, parliament speech, public transport +Speech type Type of speech conversational, read, public speech, various +Audio collection process Audio collection process controlled, corpus, various +Speech recordings source Speech recordings source volunteers, university employees, crowd, public speakers, paid contributors +Acoustic environment Acoustic conditions audio was collected in broadcast, car, home, mixed, quiet space, office, public space, various +Audio device Audio devices used for speech collection condenser mic, headset, mobile phone, landline phone, various +Device model Recording device(s) and model(s) [a-zA-Z\- ] +Audio format Audio storage format flac, mp3, raw, riff, wav +Audio codec Audio encoding format mp3, ogg, opus, vorbis +Audio channels Number of audio recording channels [1-16] +Sampling rate [Hz] Sampling rate of recorded audio \d?\d{4} +Bits per sample Number of bits used for encoding each audio sample 8,16,24,32 +Age info Annotation of speakers age yes, no +Age balance Is 
speakers age distribution balanced across demographics groups free text +Gender info Annotation of speakers gender yes, no +Gender balance Is speakers gender distribution balanced across demographics groups free text +Nativity info Annotation of speakers nativity yes, no +Accent info Annotation of speakers accent yes, no +Accent representative Is dataset balanced in terms of speakers' accent yes, no, N/A +Education info Information about speaker education level yes,no +Occupation info Information about professional occupation of speakers yes, no +Health info Information about health condition of speakers yes, no +Time alignement annotation Information about time- alignment of speech signal yes, no +Named entities annotation Transcriptions with Named Entities annotations yes, no +Part of speech annotation Transcriptions with POS (Part of speech) annotations yes, no