Added script for updating snapshots

goodmike31 · Feb 18, 2024 · 74af704 · 74af704
1 parent 041c6fc
commit 74af704
Show file tree

Hide file tree

Showing 7 changed files with 348 additions and 114 deletions.
diff --git a/requirements b/requirements
@@ -0,0 +1,2 @@
+pandas
+requests
diff --git a/scripts/update-snapshots.py b/scripts/update-snapshots.py
@@ -0,0 +1,60 @@
+import requests
+import pandas as pd
+import os
+
+def download_tsv_from_google_sheet(sheet_url):
+    """Download one Google Sheet tab as TSV and return it as a DataFrame.
+
+    sheet_url is the browser "edit" URL containing '/edit#gid=<tab id>'; that
+    part is rewritten to '/export?format=tsv&gid=<tab id>' before fetching.
+    Returns None (after printing a message) when the HTTP status is not 200;
+    exceptions raised by requests (connection errors, timeouts) propagate.
+    """
+    from io import BytesIO
+    tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')
+    response = requests.get(tsv_url, timeout=30)  # never hang on a stalled connection
+    if response.status_code != 200:
+        print("Failed to download the TSV file.")
+        return None
+    # Parse the raw bytes as UTF-8: response.text falls back to ISO-8859-1 for
+    # text/* responses without a declared charset, garbling non-ASCII cell text.
+    return pd.read_csv(BytesIO(response.content), sep='\t', encoding='utf-8')
+
+# Snapshot every tab listed below: download it, then write a dated copy under
+# ./snapshots/<name>/ and refresh the "latest" copy directly in ./snapshots/.
+# Each entry is (snapshot name, browser "edit" URL of the Google Sheet tab).
+sheet_base = "https://docs.google.com/spreadsheets/d/181EDfwZNtHgHFOMaKNtgKssrYDX4tXTJ9POMzBsCRlI"
+sheets = [
+    ("catalog", sheet_base + "/edit#gid=0"),
+    ("taxonomy", sheet_base + "/edit#gid=2015613057"),
+]
+
+# Get the current date in the format YYYYMMDD (once, so all snapshots agree)
+today = pd.Timestamp.now().strftime("%Y%m%d")
+
+for name, url in sheets:
+    df = download_tsv_from_google_sheet(url)
+    if df is None:
+        # The download helper returns None on failure; stop here rather than
+        # crash on df.sample() below or leave a half-updated set of snapshots.
+        raise SystemExit(f"Could not download the {name} sheet from {url}")
+
+    if not df.empty:
+        # Preview only; sample(1) raises ValueError on a frame with no rows
+        print(df.sample(1))
+
+    # The dated snapshot directory may not exist yet in a fresh checkout
+    snapshot_dir = f"./snapshots/{name}"
+    os.makedirs(snapshot_dir, exist_ok=True)
+
+    filename = f"pl-asr-speech-datasets-{name}-{today}.tsv"
+    filepath = os.path.join(snapshot_dir, filename)
+    df.to_csv(filepath, sep='\t', index=False)
+    print(f"Saved the DataFrame to {filepath}")
+
+    # save also as the "latest" file
+    # (same content as the dated copy; overwritten on every run)
+    latest_filepath = os.path.join("./snapshots", f"pl-asr-speech-datasets-{name}-latest.tsv")
+    df.to_csv(latest_filepath, sep='\t', index=False)
+    print(f"Saved the DataFrame to {latest_filepath}")
+
diff --git a/snapshots/catalog/pl-asr-speech-datasets-catalog-20231221.tsv b/snapshots/catalog/pl-asr-speech-datasets-catalog-20231221.tsv
diff --git a/snapshots/catalog/pl-asr-speech-datasets-catalog-20240218.tsv b/snapshots/catalog/pl-asr-speech-datasets-catalog-20240218.tsv
diff --git a/snapshots/pl-asr-speech-datasets-catalog-latest.tsv b/snapshots/pl-asr-speech-datasets-catalog-latest.tsv
diff --git a/snapshots/pl-asr-speech-datasets-taxonomy-latest.tsv b/snapshots/pl-asr-speech-datasets-taxonomy-latest.tsv
@@ -1,62 +1,62 @@
-Dataset attribute	Purpose	Allowed values
-Dataset name	Full name of a speech dataset	[a-z A-Z0-9_\-]
-Dataset ID	Dataset unique identifier for reporting	[a-z0-9\-]
-Access type	Dataset access type from the cost perspective.	free, paid, no-info
-Access link	Web reference for accessing or purchasing a dataset	URL format
-Available online	Validated access status as of March 2023	yes, no
-License	Dataset license type	Apache,  CC-0, CC-BY, CC-BY-SA, ELRA, HZSK-PUB, LDC, Proprietary
-Publisher	Creator or publisher of a dataset	[a-z A-Z\-]
-Repository	Main repository hosting a dataset	[a-z A-Z\-]
-Languages	Language and country code of speakers recorded	$lang(ISO-639-1)-$country code(ISO-3166-2), multi
-Creation year	Year a dataset was created or published	\d{4}
-ISLRN	International Standard Language Resource Number	ISRLN
-ISBN	International Standard Book Number	ISBN
-LR catalog ID	Language data repository ID	URL,  [a-z A-Z\-\_0-9]
-Reference publication	Link to relevant publication describing a dataset 	URL
-Contact point	Contact point referenced in the documentation	[a-z A-Z\-\_0-9\@]
-Latest version	The latest version of the released dataset	[0-9\.]
-Last update year	Last update date (year)	\d{4}
-Sponsor	Institution which funded the creation of dataset	[a-z A-Z\-\_0-9]
-Price - non-commercial usage	Price for non-commercial usage	[free|\d+]
-Price - commercial usage	Price for commercial usage	[free|\d+]
-Purpose and split	Target usage and available data splits	train, valid, test, none
-Size audio total [hours]	Total amount of audio data in hours	[\d+\.]
-Size audio transcribed [hours]	Total amount of transcribed speech data	[\d+\.]
-Size [GB]	Size of a dataset in gigabytes	[\d+\.]
-Speakers	Number of speakers recordings originate from	[\d+]
-Audio recordings	Number of voice recordings in the corpus	\d+
-Audio segmentation	Are audio recordings segmented	yes, no
-Tokens	Number of tokens in the corpus	[\d+]
-Unique tokens	Number of unique tokens	[\d+]
-Automatic QA	Type of automatic quality assurance process applied	yes, no
-Manual QA	Type of manual quality assurance process applied	yes, no
-Manual QA scope	Scope of manual QA applied	[a-zA-Z \d+]
-Transcription coverage	Ratio of transcribed recordings	%
-Transcription protocol	Is a transcription protocol specified or described?	yes, no, description
-Denormalized transcriptions	Are there available transcriptions without abbreviations, numerals, punctuation etc.	yes, no
-Transcription and annotation format	Format of transcription files	[a-z A-Z0-9\.]
-Domain	Domain of utterances	academic lecture, books, broadcast, conversations, customer service, digits, general, interview, multi-domain, news, numbers, parliament speech, public transport
-Speech type	Type of speech	dialog, isolated words, lecture, monolog, read, spontaneous, various
-Audio collection process	Audio collection process	controlled, corpus, various
-Speech recordings source	Speech recordings source	volunteers, university employees, crowd, public speakers, paid contributors
-Acoustic environment	Acoustic conditions audio was collected in	broadcast, car, home, mixed, quiet space, office, public space, various
-Audio device	Audio devices used for speech collection	condenser mic, headset, mobile phone, landline phone, various
-Device model	Recording device(s) and model(s)	[a-zA-Z\- ]
-Audio format	Audio storage format	flac, mp3, raw, riff, wav
-Audio codec	Audio encoding format	mp3, ogg, opus, vorbis 
-Audio channels	Number of audio recording channels	[1-16]
-Sampling rate [Hz]	Sampling rate of recorded audio	\d?\d{4}
-Bits per sample	Number of bits used for encoding each audio sample	8,16,24,32
-Age info	Annotation of speakers age	yes, no
-Age balance	Is speakers age distribution balanced across demographics groups	free text
-Gender info	Annotation of speakers gender	yes, no
-Gender balance	Is speakers gender distribution balanced across demographics groups	free text
-Nativity info	Annotation of speakers nativity	yes, no
-Accent info	Annotation of speakers accent	yes, no
-Accent representative	Is dataset balanced in terms of speakers' accent	yes, no, N/A
-Education info	Information about speaker education level	yes,no
-Occupation info	Information about professional occupation of speakers	yes, no
-Health info	Information about health condition of speakers	yes, no
-Time alignement annotation	Information about time- alignment of speech signal	yes, no
-Named entities annotation	Transcriptions with Named Entities annotations	yes, no
-Part of speech annotation	Transcriptions with POS (Part of speech) annotations	yes, no
+Dataset attribute	Purpose	Allowed values
+Dataset name	Full name of a speech dataset	[a-z A-Z0-9_\-]
+Dataset ID	Dataset unique identifier for reporting	[a-z0-9\-]
+Access type	Dataset access type from the cost perspective.	free, paid, no-info
+Access link	Web reference for accessing or purchasing a dataset	URL format
+Available online	Validated access status as of March 2023	yes, no
+License	Dataset license type	Apache,  CC-0, CC-BY, CC-BY-SA, ELRA, HZSK-PUB, LDC, Proprietary
+Publisher	Creator or publisher of a dataset	[a-z A-Z\-]
+Repository	Main repository hosting a dataset	[a-z A-Z\-]
+Languages	Language and country code of speakers recorded	$lang(ISO-639-1)-$country code(ISO-3166-2), multi
+Creation year	Year a dataset was created or published	\d{4}
+ISLRN	International Standard Language Resource Number	ISRLN
+ISBN	International Standard Book Number	ISBN
+LR catalog ID	Language data repository ID	URL,  [a-z A-Z\-\_0-9]
+Reference publication	Link to relevant publication describing a dataset 	URL
+Contact point	Contact point referenced in the documentation	[a-z A-Z\-\_0-9\@]
+Latest version	The latest version of the released dataset	[0-9\.]
+Last update year	Last update date (year)	\d{4}
+Sponsor	Institution which funded the creation of dataset	[a-z A-Z\-\_0-9]
+Price - non-commercial usage	Price for non-commercial usage	[free|\d+]
+Price - commercial usage	Price for commercial usage	[free|\d+]
+Purpose and split	Target usage and available data splits	train, valid, test, none
+Size audio total [hours]	Total amount of audio data in hours	[\d+\.]
+Size audio transcribed [hours]	Total amount of transcribed speech data	[\d+\.]
+Size [GB]	Size of a dataset in gigabytes	[\d+\.]
+Speakers	Number of speakers recordings originate from	[\d+]
+Audio recordings	Number of voice recordings in the corpus	\d+
+Audio segmentation	Are audio recordings segmented	yes, no
+Tokens	Number of tokens in the corpus	[\d+]
+Unique tokens	Number of unique tokens	[\d+]
+Automatic QA	Type of automatic quality assurance process applied	yes, no
+Manual QA	Type of manual quality assurance process applied	yes, no
+Manual QA scope	Scope of manual QA applied	[a-zA-Z \d+]
+Transcription coverage	Ratio of transcribed recordings	%
+Transcription protocol	Is a transcription protocol specified or described?	yes, no, description
+Denormalized transcriptions	Are there available transcriptions without abbreviations, numerals, punctuation etc.	yes, no
+Transcription and annotation format	Format of transcription files	[a-z A-Z0-9\.]
+Domain	Domain of utterances	academic lecture, books, broadcast, conversations, customer service, digits, general, interview, multi-domain, news, numbers, parliament speech, public transport
+Speech type	Type of speech	conversational, read,  public speech, various
+Audio collection process	Audio collection process	controlled, corpus, various
+Speech recordings source	Speech recordings source	volunteers, university employees, crowd, public speakers, paid contributors
+Acoustic environment	Acoustic conditions audio was collected in	broadcast, car, home, mixed, quiet space, office, public space, various
+Audio device	Audio devices used for speech collection	condenser mic, headset, mobile phone, landline phone, various
+Device model	Recording device(s) and model(s)	[a-zA-Z\- ]
+Audio format	Audio storage format	flac, mp3, raw, riff, wav
+Audio codec	Audio encoding format	mp3, ogg, opus, vorbis 
+Audio channels	Number of audio recording channels	[1-16]
+Sampling rate [Hz]	Sampling rate of recorded audio	\d?\d{4}
+Bits per sample	Number of bits used for encoding each audio sample	8,16,24,32
+Age info	Annotation of speakers age	yes, no
+Age balance	Is speakers age distribution balanced across demographics groups	free text
+Gender info	Annotation of speakers gender	yes, no
+Gender balance	Is speakers gender distribution balanced across demographics groups	free text
+Nativity info	Annotation of speakers nativity	yes, no
+Accent info	Annotation of speakers accent	yes, no
+Accent representative	Is dataset balanced in terms of speakers' accent	yes, no, N/A
+Education info	Information about speaker education level	yes,no
+Occupation info	Information about professional occupation of speakers	yes, no
+Health info	Information about health condition of speakers	yes, no
+Time alignement annotation	Information about time- alignment of speech signal	yes, no
+Named entities annotation	Transcriptions with Named Entities annotations	yes, no
+Part of speech annotation	Transcriptions with POS (Part of speech) annotations	yes, no
diff --git a/snapshots/taxonomy/pl-asr-speech-datasets-taxonomy-20240218.tsv b/snapshots/taxonomy/pl-asr-speech-datasets-taxonomy-20240218.tsv
@@ -0,0 +1,62 @@
+Dataset attribute	Purpose	Allowed values
+Dataset name	Full name of a speech dataset	[a-z A-Z0-9_\-]
+Dataset ID	Dataset unique identifier for reporting	[a-z0-9\-]
+Access type	Dataset access type from the cost perspective.	free, paid, no-info
+Access link	Web reference for accessing or purchasing a dataset	URL format
+Available online	Validated access status as of March 2023	yes, no
+License	Dataset license type	Apache,  CC-0, CC-BY, CC-BY-SA, ELRA, HZSK-PUB, LDC, Proprietary
+Publisher	Creator or publisher of a dataset	[a-z A-Z\-]
+Repository	Main repository hosting a dataset	[a-z A-Z\-]
+Languages	Language and country code of speakers recorded	$lang(ISO-639-1)-$country code(ISO-3166-2), multi
+Creation year	Year a dataset was created or published	\d{4}
+ISLRN	International Standard Language Resource Number	ISRLN
+ISBN	International Standard Book Number	ISBN
+LR catalog ID	Language data repository ID	URL,  [a-z A-Z\-\_0-9]
+Reference publication	Link to relevant publication describing a dataset 	URL
+Contact point	Contact point referenced in the documentation	[a-z A-Z\-\_0-9\@]
+Latest version	The latest version of the released dataset	[0-9\.]
+Last update year	Last update date (year)	\d{4}
+Sponsor	Institution which funded the creation of dataset	[a-z A-Z\-\_0-9]
+Price - non-commercial usage	Price for non-commercial usage	[free|\d+]
+Price - commercial usage	Price for commercial usage	[free|\d+]
+Purpose and split	Target usage and available data splits	train, valid, test, none
+Size audio total [hours]	Total amount of audio data in hours	[\d+\.]
+Size audio transcribed [hours]	Total amount of transcribed speech data	[\d+\.]
+Size [GB]	Size of a dataset in gigabytes	[\d+\.]
+Speakers	Number of speakers recordings originate from	[\d+]
+Audio recordings	Number of voice recordings in the corpus	\d+
+Audio segmentation	Are audio recordings segmented	yes, no
+Tokens	Number of tokens in the corpus	[\d+]
+Unique tokens	Number of unique tokens	[\d+]
+Automatic QA	Type of automatic quality assurance process applied	yes, no
+Manual QA	Type of manual quality assurance process applied	yes, no
+Manual QA scope	Scope of manual QA applied	[a-zA-Z \d+]
+Transcription coverage	Ratio of transcribed recordings	%
+Transcription protocol	Is a transcription protocol specified or described?	yes, no, description
+Denormalized transcriptions	Are there available transcriptions without abbreviations, numerals, punctuation etc.	yes, no
+Transcription and annotation format	Format of transcription files	[a-z A-Z0-9\.]
+Domain	Domain of utterances	academic lecture, books, broadcast, conversations, customer service, digits, general, interview, multi-domain, news, numbers, parliament speech, public transport
+Speech type	Type of speech	conversational, read,  public speech, various
+Audio collection process	Audio collection process	controlled, corpus, various
+Speech recordings source	Speech recordings source	volunteers, university employees, crowd, public speakers, paid contributors
+Acoustic environment	Acoustic conditions audio was collected in	broadcast, car, home, mixed, quiet space, office, public space, various
+Audio device	Audio devices used for speech collection	condenser mic, headset, mobile phone, landline phone, various
+Device model	Recording device(s) and model(s)	[a-zA-Z\- ]
+Audio format	Audio storage format	flac, mp3, raw, riff, wav
+Audio codec	Audio encoding format	mp3, ogg, opus, vorbis 
+Audio channels	Number of audio recording channels	[1-16]
+Sampling rate [Hz]	Sampling rate of recorded audio	\d?\d{4}
+Bits per sample	Number of bits used for encoding each audio sample	8,16,24,32
+Age info	Annotation of speakers age	yes, no
+Age balance	Is speakers age distribution balanced across demographics groups	free text
+Gender info	Annotation of speakers gender	yes, no
+Gender balance	Is speakers gender distribution balanced across demographics groups	free text
+Nativity info	Annotation of speakers nativity	yes, no
+Accent info	Annotation of speakers accent	yes, no
+Accent representative	Is dataset balanced in terms of speakers' accent	yes, no, N/A
+Education info	Information about speaker education level	yes,no
+Occupation info	Information about professional occupation of speakers	yes, no
+Health info	Information about health condition of speakers	yes, no
+Time alignement annotation	Information about time- alignment of speech signal	yes, no
+Named entities annotation	Transcriptions with Named Entities annotations	yes, no
+Part of speech annotation	Transcriptions with POS (Part of speech) annotations	yes, no