diff --git a/datasets/doc/source/recommended-fl-datasets.rst b/datasets/doc/source/recommended-fl-datasets.rst new file mode 100644 index 000000000000..6cd2d37b27cc --- /dev/null +++ b/datasets/doc/source/recommended-fl-datasets.rst @@ -0,0 +1,163 @@ +Recommended FL Datasets +======================= + +This page lists the recommended datasets for federated learning research, which can be used with Flower Datasets (``flwr-datasets``). + +.. note:: + + All datasets from HuggingFace Hub can be used with our library. This page presents just a set of datasets we collected that you might find useful. + +For more information about any dataset, visit its page by clicking the dataset name. + +Image Datasets +-------------- + +.. list-table:: Image Datasets + :widths: 40 40 20 + :header-rows: 1 + + * - Name + - Size + - Image Shape + * - `ylecun/mnist <https://huggingface.co/datasets/ylecun/mnist>`_ + - train 60k; + test 10k + - 28x28 + * - `uoft-cs/cifar10 <https://huggingface.co/datasets/uoft-cs/cifar10>`_ + - train 50k; + test 10k + - 32x32x3 + * - `uoft-cs/cifar100 <https://huggingface.co/datasets/uoft-cs/cifar100>`_ + - train 50k; + test 10k + - 32x32x3 + * - `zalando-datasets/fashion_mnist <https://huggingface.co/datasets/zalando-datasets/fashion_mnist>`_ + - train 60k; + test 10k + - 28x28 + * - `flwrlabs/femnist <https://huggingface.co/datasets/flwrlabs/femnist>`_ + - train 814k + - 28x28 + * - `zh-plus/tiny-imagenet <https://huggingface.co/datasets/zh-plus/tiny-imagenet>`_ + - train 100k; + valid 10k + - 64x64x3 + * - `flwrlabs/usps <https://huggingface.co/datasets/flwrlabs/usps>`_ + - train 7.3k; + test 2k + - 16x16 + * - `flwrlabs/pacs <https://huggingface.co/datasets/flwrlabs/pacs>`_ + - train 10k + - 227x227 + * - `flwrlabs/cinic10 <https://huggingface.co/datasets/flwrlabs/cinic10>`_ + - train 90k; + valid 90k; + test 90k + - 32x32x3 + * - `flwrlabs/caltech101 <https://huggingface.co/datasets/flwrlabs/caltech101>`_ + - train 8.7k + - varies + * - `flwrlabs/office-home <https://huggingface.co/datasets/flwrlabs/office-home>`_ + - train 15.6k + - varies + * - `flwrlabs/fed-isic2019 <https://huggingface.co/datasets/flwrlabs/fed-isic2019>`_ + - train 18.6k; + test 4.7k + - varies + * - `ufldl-stanford/svhn <https://huggingface.co/datasets/ufldl-stanford/svhn>`_ + - train 73.3k; + test 26k; + extra 531k + - 32x32x3 + * - `sasha/dog-food <https://huggingface.co/datasets/sasha/dog-food>`_ + - train 2.1k; + test 0.9k + - varies + * - `Mike0307/MNIST-M <https://huggingface.co/datasets/Mike0307/MNIST-M>`_ + - train 59k; + test 9k + - 32x32 + +Audio Datasets +-------------- + +.. 
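list-table:: Audio Datasets + :widths: 35 30 15 + :header-rows: 1 + + * - Name + - Size + - Subset + * - `google/speech_commands <https://huggingface.co/datasets/google/speech_commands>`_ + - train 64.7k + - v0.01 + * - `google/speech_commands <https://huggingface.co/datasets/google/speech_commands>`_ + - train 105.8k + - v0.02 + * - `flwrlabs/ambient-acoustic-context <https://huggingface.co/datasets/flwrlabs/ambient-acoustic-context>`_ + - train 70.3k + - + * - `fixie-ai/common_voice_17_0 <https://huggingface.co/datasets/fixie-ai/common_voice_17_0>`_ + - varies + - 14 versions + * - `fixie-ai/librispeech_asr <https://huggingface.co/datasets/fixie-ai/librispeech_asr>`_ + - varies + - clean/other + +Tabular Datasets +---------------- + +.. list-table:: Tabular Datasets + :widths: 35 30 + :header-rows: 1 + + * - Name + - Size + * - `scikit-learn/adult-census-income <https://huggingface.co/datasets/scikit-learn/adult-census-income>`_ + - train 32.6k + * - `jlh/uci-mushrooms <https://huggingface.co/datasets/jlh/uci-mushrooms>`_ + - train 8.1k + * - `scikit-learn/iris <https://huggingface.co/datasets/scikit-learn/iris>`_ + - train 150 + +Text Datasets +------------- + +.. list-table:: Text Datasets + :widths: 40 30 30 + :header-rows: 1 + + * - Name + - Size + - Category + * - `sentiment140 <https://huggingface.co/datasets/stanfordnlp/sentiment140>`_ + - train 1.6M; + test 0.5k + - Sentiment + * - `google-research-datasets/mbpp <https://huggingface.co/datasets/google-research-datasets/mbpp>`_ + - full 974; sanitized 427 + - General + * - `openai/openai_humaneval <https://huggingface.co/datasets/openai/openai_humaneval>`_ + - test 164 + - General + * - `lukaemon/mmlu <https://huggingface.co/datasets/lukaemon/mmlu>`_ + - varies + - General + * - `takala/financial_phrasebank <https://huggingface.co/datasets/takala/financial_phrasebank>`_ + - train 4.8k + - Financial + * - `pauri32/fiqa-2018 <https://huggingface.co/datasets/pauri32/fiqa-2018>`_ + - train 0.9k; validation 0.1k; test 0.2k + - Financial + * - `zeroshot/twitter-financial-news-sentiment <https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment>`_ + - train 9.5k; validation 2.4k + - Financial + * - `bigbio/pubmed_qa <https://huggingface.co/datasets/bigbio/pubmed_qa>`_ + - train 2M; validation 11k + - Medical + * - `openlifescienceai/medmcqa <https://huggingface.co/datasets/openlifescienceai/medmcqa>`_ + - train 183k; validation 4.3k; test 6.2k + - Medical + * - `bigbio/med_qa <https://huggingface.co/datasets/bigbio/med_qa>`_ + - train 10.1k; test 1.3k; validation 1.3k + - Medical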