From 18c838eac2496708dcc45d6865ec2c0721bb7f24 Mon Sep 17 00:00:00 2001
From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com>
Date: Thu, 23 Nov 2023 11:08:46 +0100
Subject: [PATCH 1/3] Automate FDS reference doc generation (#2562)

---
 datasets/dev/build-flwr-datasets-docs.sh      | 30 +++++++++
 .../source/_templates/autosummary/class.rst   | 33 ++++++++++
 .../source/_templates/autosummary/module.rst  | 66 +++++++++++++++++++
 datasets/doc/source/conf.py                   | 34 ++++++++++
 datasets/doc/source/index.rst                 | 10 ++-
 datasets/doc/source/ref-api-flwr-datasets.rst | 27 --------
 dev/build-docs.sh                             |  3 +-
 7 files changed, 171 insertions(+), 32 deletions(-)
 create mode 100755 datasets/dev/build-flwr-datasets-docs.sh
 create mode 100644 datasets/doc/source/_templates/autosummary/class.rst
 create mode 100644 datasets/doc/source/_templates/autosummary/module.rst
 delete mode 100644 datasets/doc/source/ref-api-flwr-datasets.rst

diff --git a/datasets/dev/build-flwr-datasets-docs.sh b/datasets/dev/build-flwr-datasets-docs.sh
new file mode 100755
index 000000000000..dc3cd979d5c8
--- /dev/null
+++ b/datasets/dev/build-flwr-datasets-docs.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Generate the docs, then rename and move the files so that they meet the convention used in Flower.
+# Note that two runs of sphinx-build are necessary.
+# The first run generates the .rst files (its HTML output is discarded).
+# The second run happens after the files are renamed and moved into place; it generates the final HTML.
+
+set -e
+
+cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"/../doc
+
+# Remove the old docs from source/ref-api
+REF_API_DIR="source/ref-api"
+if [[ -d "$REF_API_DIR" ]]; then
+
+  echo "Removing ${REF_API_DIR}"
+  rm -r "$REF_API_DIR"
+fi
+
+# Remove the old html files
+if [[ -d build ]]; then
+  echo "Removing ./build"
+  rm -r build
+fi
+
+# Docs generation: generate new .rst files
+# It starts at the __init__ in the main directory and recursively generates the documentation for the
+# classes/modules/packages specified in __all__.
+# Note: if a package cannot be reached via the recursive traversal, it won't be documented, even if it has __all__.
echo "Generating the docs based only on the functionality given in __all__."
+sphinx-build -M html source build
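A note on the `__all__`-driven traversal the script comments describe: only names re-exported from each package's `__init__.py` are picked up. A minimal sketch of such an `__init__.py` follows; the exact contents of `flwr_datasets/__init__.py` are an assumption here, shown only to illustrate the mechanism::

    # flwr_datasets/__init__.py (illustrative, not the actual file)
    from flwr_datasets.federated_dataset import FederatedDataset
    from flwr_datasets import partitioner

    # Only the names listed here are documented by autosummary once
    # autosummary_ignore_module_all = False is set in conf.py; anything
    # omitted from __all__ is skipped.
    __all__ = [
        "FederatedDataset",
        "partitioner",
    ]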
diff --git a/datasets/doc/source/_templates/autosummary/class.rst b/datasets/doc/source/_templates/autosummary/class.rst
new file mode 100644
index 000000000000..b4b35789bc6f
--- /dev/null
+++ b/datasets/doc/source/_templates/autosummary/class.rst
@@ -0,0 +1,33 @@
+{{ name | escape | underline}}
+
+.. currentmodule:: {{ module }}
+
+.. autoclass:: {{ objname }}
+   :members:
+   :show-inheritance:
+   :inherited-members:
+
+   {% block methods %}
+
+   {% if methods %}
+   .. rubric:: {{ _('Methods') }}
+
+   .. autosummary::
+   {% for item in methods %}
+      {% if item != "__init__" %}
+      ~{{ name }}.{{ item }}
+      {% endif %}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block attributes %}
+   {% if attributes %}
+   .. rubric:: {{ _('Attributes') }}
+
+   .. autosummary::
+   {% for item in attributes %}
+      ~{{ name }}.{{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}

diff --git a/datasets/doc/source/_templates/autosummary/module.rst b/datasets/doc/source/_templates/autosummary/module.rst
new file mode 100644
index 000000000000..571db198d27c
--- /dev/null
+++ b/datasets/doc/source/_templates/autosummary/module.rst
@@ -0,0 +1,66 @@
+{{ name | escape | underline}}
+
+.. automodule:: {{ fullname }}
+
+   {% block attributes %}
+   {% if attributes %}
+   .. rubric:: Module Attributes
+
+   .. autosummary::
+      :toctree:
+   {% for item in attributes %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block functions %}
+   {% if functions %}
+   .. rubric:: {{ _('Functions') }}
+
+   .. autosummary::
+      :toctree:
+   {% for item in functions %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block classes %}
+   {% if classes %}
+   .. rubric:: {{ _('Classes') }}
+
+   .. autosummary::
+      :toctree:
+      :template: autosummary/class.rst
+   {% for item in classes %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block exceptions %}
+   {% if exceptions %}
+   .. rubric:: {{ _('Exceptions') }}
+
+   .. autosummary::
+      :toctree:
+   {% for item in exceptions %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+{% block modules %}
+{% if modules %}
+.. rubric:: Modules
+
+.. autosummary::
+   :toctree:
+   :template: autosummary/module.rst
+   :recursive:
+{% for item in modules %}
+   {{ item }}
+{%- endfor %}
+{% endif %}
+{% endblock %}

diff --git a/datasets/doc/source/conf.py b/datasets/doc/source/conf.py
index 4fccaf0ef084..32baa6dd1471 100644
--- a/datasets/doc/source/conf.py
+++ b/datasets/doc/source/conf.py
@@ -61,8 +61,42 @@
     "nbsphinx",
 ]
 
+# Generate .rst files
 autosummary_generate = True
 
+# Document ONLY the objects from __all__ (present in __init__ files).
+# It is done recursively, starting from flwr_datasets.__init__
+# It's controlled in the index.rst file.
+autosummary_ignore_module_all = False
+
+# Each class and function page starts with the full path to the object.
+# Make flwr_datasets.federated_dataset.FederatedDataset appear as FederatedDataset
+# (the full name is still shown at the top of the page).
+add_module_names = False
+
+def find_test_modules(package_path):
+    """Go through the python files and collect every *_test.py module path."""
+    full_path_modules = []
+    for root, dirs, files in os.walk(package_path):
+        for file in files:
+            if file.endswith('_test.py'):
+                # Construct the module path relative to the package directory
+                full_path = os.path.join(root, file)
+                relative_path = os.path.relpath(full_path, package_path)
+                # Convert file path to dotted module path
+                module_path = os.path.splitext(relative_path)[0].replace(os.sep, '.')
+                full_path_modules.append(module_path)
+    modules = []
+    for full_path_module in full_path_modules:
+        parts = full_path_module.split('.')
+        for i in range(len(parts)):
+            modules.append('.'.join(parts[i:]))
+    return modules
+
+# Stop Sphinx from documenting the *_test.py files.
+# Registering them as mock imports is the only way to exclude them with autosummary.
+autodoc_mock_imports = find_test_modules(os.path.abspath("../../"))
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
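As a quick illustration of what `find_test_modules` feeds into `autodoc_mock_imports`, the snippet below builds a throwaway tree and prints the module paths the function returns. The directory layout is made up for illustration and is not part of the patch::

    import os
    import tempfile

    # Build a throwaway package tree containing a single *_test.py file
    root = tempfile.mkdtemp()
    pkg = os.path.join(root, "flwr_datasets", "partitioner")
    os.makedirs(pkg)
    open(os.path.join(pkg, "iid_partitioner_test.py"), "w").close()

    # Every dotted suffix of the module path is returned, so Sphinx mocks
    # the test module regardless of how it is imported
    print(sorted(find_test_modules(root)))
    # ['flwr_datasets.partitioner.iid_partitioner_test',
    #  'iid_partitioner_test',
    #  'partitioner.iid_partitioner_test']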
diff --git a/datasets/doc/source/index.rst b/datasets/doc/source/index.rst
index 7b19624b341a..ae7e7259f504 100644
--- a/datasets/doc/source/index.rst
+++ b/datasets/doc/source/index.rst
@@ -38,11 +38,15 @@ References
 
 Information-oriented API reference and other reference material.
 
-.. toctree::
-   :maxdepth: 2
+.. autosummary::
+   :toctree: ref-api
+   :template: autosummary/module.rst
    :caption: API reference
+   :recursive:
+
+   flwr_datasets
+
 
-   ref-api-flwr-datasets
 
 Main features
 -------------

diff --git a/datasets/doc/source/ref-api-flwr-datasets.rst b/datasets/doc/source/ref-api-flwr-datasets.rst
deleted file mode 100644
index 2e6a9e731add..000000000000
--- a/datasets/doc/source/ref-api-flwr-datasets.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-flwr\_datasets (Python API reference)
-======================
-
-Federated Dataset
------------------
-.. autoclass:: flwr_datasets.federated_dataset.FederatedDataset
-    :members:
-
-
-partitioner
------------
-
-.. automodule:: flwr_datasets.partitioner
-
-
-Partitioner
------------
-
-.. autoclass:: flwr_datasets.partitioner.Partitioner
-    :members:
-
-
-IID Partitioner
----------------
-
-.. autoclass:: flwr_datasets.partitioner.IidPartitioner
-    :members:

diff --git a/dev/build-docs.sh b/dev/build-docs.sh
index 0c913c6fc1d8..45a4dfca0adf 100755
--- a/dev/build-docs.sh
+++ b/dev/build-docs.sh
@@ -13,8 +13,7 @@ cd examples/doc
 make docs
 cd $ROOT
 
-cd datasets/doc
-make docs
+./datasets/dev/build-flwr-datasets-docs.sh
 cd $ROOT
 
 cd doc

From 961367049b849a3635685eb566bac5461531cd98 Mon Sep 17 00:00:00 2001
From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com>
Date: Thu, 23 Nov 2023 11:37:07 +0100
Subject: [PATCH 2/3] Update the transforms section for DataLoader (#2628)

---
 datasets/doc/source/tutorial-quickstart.rst | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/datasets/doc/source/tutorial-quickstart.rst b/datasets/doc/source/tutorial-quickstart.rst
index 8a70ee8854be..b4578fae0de9 100644
--- a/datasets/doc/source/tutorial-quickstart.rst
+++ b/datasets/doc/source/tutorial-quickstart.rst
@@ -51,20 +51,22 @@ For more detailed instructions, go to :doc:`how-to-use-with-pytorch`.
 
 PyTorch DataLoader
 ^^^^^^^^^^^^^^^^^^
-Transform the Dataset directly into the DataLoader::
+To transform the Dataset into a DataLoader, use PyTorch transforms (`Compose` and all the other transforms
+are also supported)::
 
     from torch.utils.data import DataLoader
     from torchvision.transforms import ToTensor
 
     transforms = ToTensor()
-    partition_torch = partition.map(
-        lambda img: {"img": transforms(img)}, input_columns="img"
-    ).with_format("torch")
+    def apply_transforms(batch):
+        batch["img"] = [transforms(img) for img in batch["img"]]
+        return batch
+    partition_torch = partition.with_transform(apply_transforms)
     dataloader = DataLoader(partition_torch, batch_size=64)
 
 NumPy
 ^^^^^
-NumPy can be used as input to the TensorFlow model and is very straightforward::
+NumPy can be used as input to TensorFlow and scikit-learn models, and it is very straightforward::
 
     partition_np = partition.with_format("numpy")
    X_train, y_train = partition_np["img"], partition_np["label"]
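For context, the updated `with_transform` snippet makes the DataLoader yield dictionary batches of tensors. A minimal end-to-end sketch, assuming a CIFAR-10-style partition with "img" and "label" columns (the `FederatedDataset` setup mirrors the quickstart)::

    from flwr_datasets import FederatedDataset
    from torch.utils.data import DataLoader
    from torchvision.transforms import ToTensor

    fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
    partition = fds.load_partition(0, "train")

    transforms = ToTensor()

    def apply_transforms(batch):
        # with_transform passes batches as dicts of lists, so transform each image
        batch["img"] = [transforms(img) for img in batch["img"]]
        return batch

    partition_torch = partition.with_transform(apply_transforms)
    dataloader = DataLoader(partition_torch, batch_size=64)

    batch = next(iter(dataloader))
    print(batch["img"].shape)    # e.g. torch.Size([64, 3, 32, 32])
    print(batch["label"].shape)  # e.g. torch.Size([64])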
From 3e07f97dd37af92bb9c0b7714720a27cb0393eca Mon Sep 17 00:00:00 2001
From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com>
Date: Thu, 23 Nov 2023 12:38:52 +0100
Subject: [PATCH 3/3] Update the dataset creation docs (#2629)

Co-authored-by: Javier
---
 datasets/doc/source/how-to-use-with-pytorch.rst | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/datasets/doc/source/how-to-use-with-pytorch.rst b/datasets/doc/source/how-to-use-with-pytorch.rst
index 497266dd1e69..85e7833b0869 100644
--- a/datasets/doc/source/how-to-use-with-pytorch.rst
+++ b/datasets/doc/source/how-to-use-with-pytorch.rst
@@ -10,7 +10,7 @@ Standard setup - download the dataset, choose the partitioning::
 
     partition = fds.load_partition(0, "train")
     centralized_dataset = fds.load_full("test")
 
-Determine the names of our features (you can alternatively do that directly on the Hugging Face website). The name can
+Determine the names of the features (you can alternatively do that directly on the Hugging Face website). The name can
 vary e.g. "img" or "image", "label" or "labels"::
 
     partition.features
@@ -38,7 +38,7 @@ That is why we iterate over all the samples from this batch and apply our transf
         return batch
 
     partition_torch = partition.with_transform(apply_transforms)
-    # At this point, you can check if you didn't make any mistakes by calling partition_torch[0]
+    # Now, you can check if you didn't make any mistakes by calling partition_torch[0]
     dataloader = DataLoader(partition_torch, batch_size=64)
 
@@ -70,8 +70,10 @@ If you want to divide the dataset, you can use (at any point before passing the
 Or you can simply calculate the indices yourself::
 
     partition_len = len(partition)
-    partition_train = partition[:int(0.8 * partition_len)]
-    partition_test = partition[int(0.8 * partition_len):]
+    # Split `partition` 80:20
+    num_train_examples = int(0.8 * partition_len)
+    partition_train = partition.select(range(num_train_examples))  # use first 80%
+    partition_test = partition.select(range(num_train_examples, partition_len))  # use last 20%
 
 And during the training loop, you need to apply one change. With a typical dataloader, you get a list
 returned for each iteration::
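The change the final context lines refer to is the batch type: wrapped with `with_transform`, the Hugging Face Dataset makes the DataLoader yield dictionaries rather than lists. A sketch of the adjustment, assuming the "img" and "label" columns used throughout this guide::

    # Typical PyTorch dataset: each iteration yields a list/tuple
    for images, labels in dataloader:
        ...

    # Hugging Face Dataset with with_transform: each iteration yields a dict,
    # so unpack the columns by name instead
    for batch in dataloader:
        images, labels = batch["img"], batch["label"]
        ...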