From 18c838eac2496708dcc45d6865ec2c0721bb7f24 Mon Sep 17 00:00:00 2001
From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com>
Date: Thu, 23 Nov 2023 11:08:46 +0100
Subject: [PATCH 1/3] Automate FDS reference doc generation (#2562)

---
 datasets/dev/build-flwr-datasets-docs.sh      | 30 +++++++++
 .../source/_templates/autosummary/class.rst   | 33 ++++++++++
 .../source/_templates/autosummary/module.rst  | 66 +++++++++++++++++++
 datasets/doc/source/conf.py                   | 34 ++++++++++
 datasets/doc/source/index.rst                 | 10 ++-
 datasets/doc/source/ref-api-flwr-datasets.rst | 27 --------
 dev/build-docs.sh                             |  3 +-
 7 files changed, 171 insertions(+), 32 deletions(-)
 create mode 100755 datasets/dev/build-flwr-datasets-docs.sh
 create mode 100644 datasets/doc/source/_templates/autosummary/class.rst
 create mode 100644 datasets/doc/source/_templates/autosummary/module.rst
 delete mode 100644 datasets/doc/source/ref-api-flwr-datasets.rst

diff --git a/datasets/dev/build-flwr-datasets-docs.sh b/datasets/dev/build-flwr-datasets-docs.sh
new file mode 100755
index 000000000000..dc3cd979d5c8
--- /dev/null
+++ b/datasets/dev/build-flwr-datasets-docs.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Generate the docs, then rename and move the files so that they meet the convention used in Flower.
+# Note that two runs of sphinx-build are necessary.
+# The first run generates the .rst files (its HTML output is discarded).
+# The second run happens after the files are renamed and moved into place; it generates the final HTML.
+
+set -e
+
+cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"/../doc
+
+# Remove the old docs from source/ref-api
+REF_API_DIR="source/ref-api"
+if [[ -d "$REF_API_DIR" ]]; then
+
+  echo "Removing ${REF_API_DIR}"
+  rm -r "$REF_API_DIR"
+fi
+
+# Remove the old html files
+if [[ -d build ]]; then
+  echo "Removing ./build"
+  rm -r build
+fi
+
+# Docs generation: generate new .rst files
+# It starts at the __init__ in the main directory and recursively generates the documentation for the
+# classes/modules/packages specified in __all__.
+# Note: if a package cannot be reached via the recursive traversal, it won't be documented, even if it has __all__.
echo "Generating the docs based only on the functionality given in __all__."
+sphinx-build -M html source build
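A note on the `__all__`-driven traversal the script comments describe: only names re-exported from each package's `__init__.py` are picked up. A minimal sketch of such an `__init__.py` follows; the exact contents of `flwr_datasets/__init__.py` are an assumption here, shown only to illustrate the mechanism::

    # flwr_datasets/__init__.py (illustrative, not the actual file)
    from flwr_datasets.federated_dataset import FederatedDataset
    from flwr_datasets import partitioner

    # Only the names listed here are documented by autosummary once
    # autosummary_ignore_module_all = False is set in conf.py; anything
    # omitted from __all__ is skipped.
    __all__ = [
        "FederatedDataset",
        "partitioner",
    ]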
diff --git a/datasets/doc/source/_templates/autosummary/class.rst b/datasets/doc/source/_templates/autosummary/class.rst
new file mode 100644
index 000000000000..b4b35789bc6f
--- /dev/null
+++ b/datasets/doc/source/_templates/autosummary/class.rst
@@ -0,0 +1,33 @@
+{{ name | escape | underline}}
+
+.. currentmodule:: {{ module }}
+
+.. autoclass:: {{ objname }}
+   :members:
+   :show-inheritance:
+   :inherited-members:
+
+   {% block methods %}
+
+   {% if methods %}
+   .. rubric:: {{ _('Methods') }}
+
+   .. autosummary::
+   {% for item in methods %}
+      {% if item != "__init__" %}
+      ~{{ name }}.{{ item }}
+      {% endif %}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block attributes %}
+   {% if attributes %}
+   .. rubric:: {{ _('Attributes') }}
+
+   .. autosummary::
+   {% for item in attributes %}
+      ~{{ name }}.{{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}

diff --git a/datasets/doc/source/_templates/autosummary/module.rst b/datasets/doc/source/_templates/autosummary/module.rst
new file mode 100644
index 000000000000..571db198d27c
--- /dev/null
+++ b/datasets/doc/source/_templates/autosummary/module.rst
@@ -0,0 +1,66 @@
+{{ name | escape | underline}}
+
+.. automodule:: {{ fullname }}
+
+   {% block attributes %}
+   {% if attributes %}
+   .. rubric:: Module Attributes
+
+   .. autosummary::
+      :toctree:
+   {% for item in attributes %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block functions %}
+   {% if functions %}
+   .. rubric:: {{ _('Functions') }}
+
+   .. autosummary::
+      :toctree:
+   {% for item in functions %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block classes %}
+   {% if classes %}
+   .. rubric:: {{ _('Classes') }}
+
+   .. autosummary::
+      :toctree:
+      :template: autosummary/class.rst
+   {% for item in classes %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+   {% block exceptions %}
+   {% if exceptions %}
+   .. rubric:: {{ _('Exceptions') }}
+
+   .. autosummary::
+      :toctree:
+   {% for item in exceptions %}
+      {{ item }}
+   {%- endfor %}
+   {% endif %}
+   {% endblock %}
+
+{% block modules %}
+{% if modules %}
+.. rubric:: Modules
+
+.. autosummary::
+   :toctree:
+   :template: autosummary/module.rst
+   :recursive:
+{% for item in modules %}
+   {{ item }}
+{%- endfor %}
+{% endif %}
+{% endblock %}

diff --git a/datasets/doc/source/conf.py b/datasets/doc/source/conf.py
index 4fccaf0ef084..32baa6dd1471 100644
--- a/datasets/doc/source/conf.py
+++ b/datasets/doc/source/conf.py
@@ -61,8 +61,42 @@
     "nbsphinx",
 ]
 
+# Generate .rst files
 autosummary_generate = True
 
+# Document ONLY the objects from __all__ (present in __init__ files).
+# It is done recursively, starting from flwr_datasets.__init__
+# It's controlled in the index.rst file.
+autosummary_ignore_module_all = False
+
+# Each class and function page starts with the full path to the object.
+# Make flwr_datasets.federated_dataset.FederatedDataset appear as FederatedDataset
+# (the full name is still shown at the top of the page).
+add_module_names = False
+
+def find_test_modules(package_path):
+    """Go through the python files and collect every *_test.py module path."""
+    full_path_modules = []
+    for root, dirs, files in os.walk(package_path):
+        for file in files:
+            if file.endswith('_test.py'):
+                # Construct the module path relative to the package directory
+                full_path = os.path.join(root, file)
+                relative_path = os.path.relpath(full_path, package_path)
+                # Convert file path to dotted module path
+                module_path = os.path.splitext(relative_path)[0].replace(os.sep, '.')
+                full_path_modules.append(module_path)
+    modules = []
+    for full_path_module in full_path_modules:
+        parts = full_path_module.split('.')
+        for i in range(len(parts)):
+            modules.append('.'.join(parts[i:]))
+    return modules
+
+# Stop Sphinx from documenting the *_test.py files.
+# Registering them as mock imports is the only way to exclude them with autosummary.
+autodoc_mock_imports = find_test_modules(os.path.abspath("../../"))
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
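As a quick illustration of what `find_test_modules` feeds into `autodoc_mock_imports`, the snippet below builds a throwaway tree and prints the module paths the function returns. The directory layout is made up for illustration and is not part of the patch::

    import os
    import tempfile

    # Build a throwaway package tree containing a single *_test.py file
    root = tempfile.mkdtemp()
    pkg = os.path.join(root, "flwr_datasets", "partitioner")
    os.makedirs(pkg)
    open(os.path.join(pkg, "iid_partitioner_test.py"), "w").close()

    # Every dotted suffix of the module path is returned, so Sphinx mocks
    # the test module regardless of how it is imported
    print(sorted(find_test_modules(root)))
    # ['flwr_datasets.partitioner.iid_partitioner_test',
    #  'iid_partitioner_test',
    #  'partitioner.iid_partitioner_test']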
diff --git a/datasets/doc/source/index.rst b/datasets/doc/source/index.rst
index 7b19624b341a..ae7e7259f504 100644
--- a/datasets/doc/source/index.rst
+++ b/datasets/doc/source/index.rst
@@ -38,11 +38,15 @@ References
 
 Information-oriented API reference and other reference material.
 
-.. toctree::
-   :maxdepth: 2
+.. autosummary::
+   :toctree: ref-api
+   :template: autosummary/module.rst
    :caption: API reference
+   :recursive:
+
+   flwr_datasets
+
 
-   ref-api-flwr-datasets
 
 Main features
 -------------

diff --git a/datasets/doc/source/ref-api-flwr-datasets.rst b/datasets/doc/source/ref-api-flwr-datasets.rst
deleted file mode 100644
index 2e6a9e731add..000000000000
--- a/datasets/doc/source/ref-api-flwr-datasets.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-flwr\_datasets (Python API reference)
-======================
-
-Federated Dataset
------------------
-.. autoclass:: flwr_datasets.federated_dataset.FederatedDataset
-    :members:
-
-
-partitioner
------------
-
-.. automodule:: flwr_datasets.partitioner
-
-
-Partitioner
------------
-
-.. autoclass:: flwr_datasets.partitioner.Partitioner
-    :members:
-
-
-IID Partitioner
----------------
-
-.. autoclass:: flwr_datasets.partitioner.IidPartitioner
-    :members:

diff --git a/dev/build-docs.sh b/dev/build-docs.sh
index 0c913c6fc1d8..45a4dfca0adf 100755
--- a/dev/build-docs.sh
+++ b/dev/build-docs.sh
@@ -13,8 +13,7 @@ cd examples/doc
 make docs
 cd $ROOT
 
-cd datasets/doc
-make docs
+./datasets/dev/build-flwr-datasets-docs.sh
 cd $ROOT
 
 cd doc

From 961367049b849a3635685eb566bac5461531cd98 Mon Sep 17 00:00:00 2001
From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com>
Date: Thu, 23 Nov 2023 11:37:07 +0100
Subject: [PATCH 2/3] Update the transforms section for DataLoader (#2628)

---
 datasets/doc/source/tutorial-quickstart.rst | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/datasets/doc/source/tutorial-quickstart.rst b/datasets/doc/source/tutorial-quickstart.rst
index 8a70ee8854be..b4578fae0de9 100644
--- a/datasets/doc/source/tutorial-quickstart.rst
+++ b/datasets/doc/source/tutorial-quickstart.rst
@@ -51,20 +51,22 @@ For more detailed instructions, go to :doc:`how-to-use-with-pytorch`.
 
 PyTorch DataLoader
 ^^^^^^^^^^^^^^^^^^
-Transform the Dataset directly into the DataLoader::
+To transform the Dataset into a DataLoader, use PyTorch transforms (`Compose` and all the other transforms
+are also supported)::
 
     from torch.utils.data import DataLoader
     from torchvision.transforms import ToTensor
 
     transforms = ToTensor()
-    partition_torch = partition.map(
-        lambda img: {"img": transforms(img)}, input_columns="img"
-    ).with_format("torch")
+    def apply_transforms(batch):
+        batch["img"] = [transforms(img) for img in batch["img"]]
+        return batch
+    partition_torch = partition.with_transform(apply_transforms)
     dataloader = DataLoader(partition_torch, batch_size=64)
 
 NumPy
 ^^^^^
-NumPy can be used as input to the TensorFlow model and is very straightforward::
+NumPy can be used as input to TensorFlow and scikit-learn models, and it is very straightforward::
 
     partition_np = partition.with_format("numpy")
    X_train, y_train = partition_np["img"], partition_np["label"]
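For context, the updated `with_transform` snippet makes the DataLoader yield dictionary batches of tensors. A minimal end-to-end sketch, assuming a CIFAR-10-style partition with "img" and "label" columns (the `FederatedDataset` setup mirrors the quickstart)::

    from flwr_datasets import FederatedDataset
    from torch.utils.data import DataLoader
    from torchvision.transforms import ToTensor

    fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
    partition = fds.load_partition(0, "train")

    transforms = ToTensor()

    def apply_transforms(batch):
        # with_transform passes batches as dicts of lists, so transform each image
        batch["img"] = [transforms(img) for img in batch["img"]]
        return batch

    partition_torch = partition.with_transform(apply_transforms)
    dataloader = DataLoader(partition_torch, batch_size=64)

    batch = next(iter(dataloader))
    print(batch["img"].shape)    # e.g. torch.Size([64, 3, 32, 32])
    print(batch["label"].shape)  # e.g. torch.Size([64])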
From 3e07f97dd37af92bb9c0b7714720a27cb0393eca Mon Sep 17 00:00:00 2001
From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com>
Date: Thu, 23 Nov 2023 12:38:52 +0100
Subject: [PATCH 3/3] Update the dataset creation docs (#2629)

Co-authored-by: Javier
---
 datasets/doc/source/how-to-use-with-pytorch.rst | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/datasets/doc/source/how-to-use-with-pytorch.rst b/datasets/doc/source/how-to-use-with-pytorch.rst
index 497266dd1e69..85e7833b0869 100644
--- a/datasets/doc/source/how-to-use-with-pytorch.rst
+++ b/datasets/doc/source/how-to-use-with-pytorch.rst
@@ -10,7 +10,7 @@ Standard setup - download the dataset, choose the partitioning::
 
     partition = fds.load_partition(0, "train")
     centralized_dataset = fds.load_full("test")
 
-Determine the names of our features (you can alternatively do that directly on the Hugging Face website). The name can
+Determine the names of the features (you can alternatively do that directly on the Hugging Face website). The name can
 vary e.g. "img" or "image", "label" or "labels"::
 
     partition.features
@@ -38,7 +38,7 @@ That is why we iterate over all the samples from this batch and apply our transf
         return batch
 
     partition_torch = partition.with_transform(apply_transforms)
-    # At this point, you can check if you didn't make any mistakes by calling partition_torch[0]
+    # Now, you can check if you didn't make any mistakes by calling partition_torch[0]
     dataloader = DataLoader(partition_torch, batch_size=64)
 
@@ -70,8 +70,10 @@ If you want to divide the dataset, you can use (at any point before passing the
 Or you can simply calculate the indices yourself::
 
     partition_len = len(partition)
-    partition_train = partition[:int(0.8 * partition_len)]
-    partition_test = partition[int(0.8 * partition_len):]
+    # Split `partition` 80:20
+    num_train_examples = int(0.8 * partition_len)
+    partition_train = partition.select(range(num_train_examples))  # use first 80%
+    partition_test = partition.select(range(num_train_examples, partition_len))  # use last 20%
 
 And during the training loop, you need to apply one change. With a typical dataloader, you get a list
 returned for each iteration::
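The change the final context lines refer to is the batch type: wrapped with `with_transform`, the Hugging Face Dataset makes the DataLoader yield dictionaries rather than lists. A sketch of the adjustment, assuming the "img" and "label" columns used throughout this guide::

    # Typical PyTorch dataset: each iteration yields a list/tuple
    for images, labels in dataloader:
        ...

    # Hugging Face Dataset with with_transform: each iteration yields a dict,
    # so unpack the columns by name instead
    for batch in dataloader:
        images, labels = batch["img"], batch["label"]
        ...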