From 5ef046c9575bbcad5dd3a82a41aa8d6b460ddbbe Mon Sep 17 00:00:00 2001 From: Karsten Date: Sun, 10 Jan 2021 04:19:26 +0100 Subject: [PATCH 1/2] ignoring reported.tsv from common voice --- audiomate/corpus/io/common_voice.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/audiomate/corpus/io/common_voice.py b/audiomate/corpus/io/common_voice.py index a830fa6..9ca21ed 100644 --- a/audiomate/corpus/io/common_voice.py +++ b/audiomate/corpus/io/common_voice.py @@ -97,7 +97,8 @@ def get_subset_ids(path): # We don't want to include the invalidated files # since there maybe corrupt files - if basename != 'invalidated': + # reported utterances are also causing issue, e.g. empty entries + if basename != 'invalidated' and basename != 'reported': subset_ids.append(basename) return subset_ids From fcebae595127fd0ca8f6bba61841d7d6b55bdfa8 Mon Sep 17 00:00:00 2001 From: Karsten Date: Sun, 10 Jan 2021 05:30:42 +0100 Subject: [PATCH 2/2] simplified basename check --- audiomate/corpus/io/common_voice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audiomate/corpus/io/common_voice.py b/audiomate/corpus/io/common_voice.py index 9ca21ed..aef984a 100644 --- a/audiomate/corpus/io/common_voice.py +++ b/audiomate/corpus/io/common_voice.py @@ -98,7 +98,7 @@ def get_subset_ids(path): # We don't want to include the invalidated files # since there maybe corrupt files # reported utterances are also causing issue, e.g. empty entries - if basename != 'invalidated' and basename != 'reported': + if basename not in ('invalidated', 'reported'): subset_ids.append(basename) return subset_ids