diff --git a/docs/images/dataset_and_dataframe.svg b/docs/images/dataset_and_dataframe.svg new file mode 100644 index 000000000..9286208c0 --- /dev/null +++ b/docs/images/dataset_and_dataframe.svg @@ -0,0 +1,1196 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + 1 + + 3 + + 5 + + 2 + + 4 + + 6 + Merlin Dataset Object + + + + + col1 + DataFrame 1 + + col 2 + + col3 + + + + + col1 + DataFrame 3 + + col 2 + + col3 + + + + + col1 + DataFrame 5 + + col 2 + + col3 + + + + + + col1 + DataFrame 2 + + col 2 + + col3 + + + col1 + DataFrame 4 + + col 2 + + col3 + + + col1 + DataFrame 6 + + col 2 + + col3 + + + + + + + + + + GPU 0 + GPU 1 + + diff --git a/docs/images/graph_schema.svg b/docs/images/graph_schema.svg new file mode 100644 index 000000000..c42c5b78a --- /dev/null +++ b/docs/images/graph_schema.svg @@ -0,0 +1,410 @@ + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Graph + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/images/graph_simple.svg b/docs/images/graph_simple.svg new file mode 100644 index 000000000..464975f7e --- /dev/null +++ b/docs/images/graph_simple.svg @@ -0,0 +1,369 @@ + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Graph + + diff --git a/docs/images/parquet_and_dataset.svg b/docs/images/parquet_and_dataset.svg new file mode 100644 index 000000000..1b1a149e7 --- /dev/null +++ b/docs/images/parquet_and_dataset.svg @@ -0,0 +1,555 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + image/svg+xml + + + + + + + + + + 1 + + 2 + + 3 + + 4 + + 5 + + 6 + + + 1 + + 3 + + 5 + + 2 + + 4 + + 6 + Directory of Parquet Files on Disk + Merlin Dataset Object + + + + diff --git a/docs/requirements-doc.txt b/docs/requirements-doc.txt index e5229f453..bf03f3dcb 100644 --- a/docs/requirements-doc.txt +++ b/docs/requirements-doc.txt @@ -18,5 +18,4 @@ mergedeep<1.4 docker<5.1 PyGithub<1.56 semver>=2,<3 -pytest<7.3 -coverage<6.6 + diff --git a/docs/source/about-dag.md b/docs/source/about-dag.md new file mode 100644 index 000000000..2826fc59d --- /dev/null +++ b/docs/source/about-dag.md @@ -0,0 +1,107 @@ +# About the Merlin Graph + +```{contents} +--- +depth: 2 +local: true +backlinks: none +--- +``` + +## Purpose of the Merlin Graph + +Merlin uses a directed acyclic graph (DAG) to represent operations on data such as normalizing or clipping values and to represent operations in a recommender system such as creating an ensemble or filtering candidate items during inference. + +Understanding the Merlin DAG is helpful if you want to develop your own Operator or build a recommender system with Merlin. + +## Graph Terminology + +node +: A node in the DAG is a group of columns and at least one _operator_. + The columns are specified with a _column selector_. + A node has an _input schema_ and an _output schema_. + Resolution of the schemas is delayed until you run `fit` or `transform` on a dataset. + +column selector +: A column selector specifies the columns to select from a dataset using column names or _tags_. + +operator +: An operator performs a transformation on data and returns a new _node_. + The data is identified by the _column selector_. + Some simple operators like `+` and `-` add or remove columns. + More complex operations are applied by shifting the operators onto the column selector with the `>>` notation. + +schema +: A Merlin schema is metadata that describes the columns in a dataset.
+ Each column has its own schema that identifies the column name and can specify _tags_ and properties. + +tag +: A Merlin tag categorizes information about a column. + Adding a tag to a column enables you to select columns for operations by tag rather than name. + + For example, you can add the `CONTINUOUS` or `CATEGORICAL` tags to columns. + Feature engineering Operators, modeling, and inference operations can use that information to operate accordingly on the dataset. + +## Introduction to Operators, Columns, Nodes, and Schema + +The NVTabular library uses Operators for feature engineering. +One example of an NVTabular Operator is `Normalize`. +The Operator normalizes continuous variables between `0` and `1`. + +The Merlin Systems library uses Operators for building ensembles and performing inference. +The library includes Operators such as `FilterCandidates` and `PredictTensorflow`. +You use these Operators to put your models into production and serve recommendations. + +Merlin enables you to chain together Operators with the `>>` syntax to create feature-processing workflows. +The `>>` syntax means "take the output columns from the left-hand side and feed them as the input columns to the right-hand side." + +You can specify an explicit list of column names for an Operator. +The following code block shows the syntax for explicit column names: + +```python +result = ["col1", "col2",] >> SomeOperator(...) +``` + +Or, you can use the `>>` syntax between Operators to run one Operator on all the output columns from the preceding Operator: + +```python +result = AnOperator(...) >> OtherOperator(...) +``` + +Chaining Operators together builds a graph. +The following figure shows how each node in the graph has an Operator. + +![A directed graph with two nodes. The first node is a Selection Operator and selects columns "col1" and "col2." The second node receives the two columns as its input.
+ The second node has a fictional SomeOperator Operator.](../images/graph_simple.svg) + +```{tip} +After you build an NVTabular workflow or Merlin Systems transform workflow, you can visualize the graph and create an image like the preceding example by running the `graph` method. +``` + +Each node in a graph has an input schema and an output schema that describe the input columns to the Operator and the output columns produced by the Operator. +The following figure represents an Operator, `SomeOperator`, that adds `colB` to a dataset. + +![Part of a directed graph that shows the input schema to a fictional SomeOperator Operator as "colA". The fictional Operator adds "colB" and the result is an output schema with "colA" and "colB."](../images/graph_schema.svg) + +In practice, when Merlin first builds the graph, the workflow does not initially know which columns are input or output. +This is for two reasons: + +1. Merlin enables you to build graphs that process categories of columns. + The categories are specified by _tags_ instead of an explicit list of column names. + + For example, you can select the continuous columns from your dataset with code like the following example: + + ```python + [Tags.CONTINUOUS] >> Operator(...) + ``` + +1. You can chain Operators together into a graph, such as an NVTabular workflow, before you specify a dataset. + The graph, Operators, and schema do not know which columns will be selected by tag until the software accesses the dataset and determines the column names.
+ +## Reference Documentation + +- {py:class}`nvtabular.ops.Normalize` +- {py:class}`nvtabular.workflow.workflow.Workflow` +- {py:class}`merlin.systems.dag.ops.workflow.TransformWorkflow` +- {py:class}`merlin.systems.dag.Ensemble` +- {py:class}`merlin.systems.dag.ops.session_filter.FilterCandidates` +- {py:class}`merlin.systems.dag.ops.tensorflow.PredictTensorflow` \ No newline at end of file diff --git a/docs/source/about-dataset.md b/docs/source/about-dataset.md new file mode 100644 index 000000000..ab53475af --- /dev/null +++ b/docs/source/about-dataset.md @@ -0,0 +1,58 @@ +# About the Merlin Dataset + +```{contents} +--- +depth: 2 +local: true +backlinks: none +--- +``` + +## On-disk Representation + +The Apache Parquet file format is the most frequently used file format for Merlin datasets. + +Parquet is a columnar storage format. +The format arranges the values for each column in a long list. +This format is in contrast with a row-oriented format---such as a comma-separated values format---that arranges all the data for one row together. + +As an analogy, columnar storage is like a dictionary of columns, whereas row-oriented storage is like a list of rows. + +In most cases, a Parquet dataset includes multiple files in one or more directories. + +![The Merlin dataset class can read a directory of Parquet files for data access.](../images/parquet_and_dataset.svg) + +The Merlin dataset class, `merlin.io.Dataset`, treats a collection of many Parquet files as a single dataset. +By treating the collection as a single dataset, Merlin simplifies distributing computation over multiple GPUs or multiple machines. + +The dataset class is not a copy of the data or a modification of the Parquet files. +An instance of the class is similar to a collection of pointers to the Parquet files. + +When you create an instance of the dataset class, Merlin attempts to infer a schema by reading one record of the data. +Merlin attempts to determine the column names and data types.
+ +## Processing Data: Dataset and DataFrame + +When you perform a computation on a Merlin dataset, the dataset reads from the files on disk and converts them into a set of DataFrames. +The DataFrames, like Parquet files, use a columnar storage format. +The API for a DataFrame is similar to a Python dictionary---you can reference a column with syntax like `dataframe['col1']`. + +![A Merlin dataset reads data from disk and becomes several DataFrames.](../images/dataset_and_dataframe.svg) + +Merlin processes each DataFrame individually and aggregates the results across the DataFrames as needed. +There are two kinds of computations that you can perform on a dataset: `fit` and `transform`. + +The `fit` computations perform a full pass over the dataset to compute statistics, find unique values, perform grouping, or perform another operation that requires information from multiple DataFrames. + +The `transform` computations process each DataFrame individually. +These computations use the information gathered from `fit` to alter the DataFrame. +For example, the `Normalize` and `Clip` Operators compute new values for columns and the `Rename` Operator adds and removes columns. + +More information about the `fit` and `transform` methods is provided in [](./about-operators.md).
+ +## Reference Documentation + +- {py:class}`merlin.io.Dataset` +- {py:class}`nvtabular.ops.Normalize` +- {py:class}`nvtabular.ops.Clip` +- {py:class}`nvtabular.ops.Rename` \ No newline at end of file diff --git a/docs/source/about-model-blocks.md b/docs/source/about-model-blocks.md new file mode 100644 index 000000000..f8850fce7 --- /dev/null +++ b/docs/source/about-model-blocks.md @@ -0,0 +1,3 @@ +# About Merlin Model Blocks + +FIXME \ No newline at end of file diff --git a/docs/source/about-operators.md b/docs/source/about-operators.md new file mode 100644 index 000000000..fc2a8b86a --- /dev/null +++ b/docs/source/about-operators.md @@ -0,0 +1,89 @@ +# About Merlin Operators + +```{contents} +--- +depth: 2 +local: true +backlinks: none +--- +``` + +## Understanding Operators + +Merlin uses Operators to perform computation on datasets such as normalizing continuous variables, bucketing continuous variables, clipping variables between minimum and maximum values, and so on. + +An Operator implements two key methods: + +Fit +: The `fit` method performs any pre-computation steps that are required before modifying the data. + + For example, the `Normalize` Operator normalizes the values of a continuous variable between `0` and `1`. + The `fit` method determines the minimum and maximum values. + + The method is optional. + For example, the `Bucketize` and `Clip` Operators do not implement the method because you specify the bucket boundaries or the minimum and maximum values for clipping. + These Operators do not need to access the data to perform any pre-computation steps. + +Transform +: The `transform` method operates on the dataset such as normalizing values, bucketing, or clipping. + This method modifies the data. + +Another difference between the two methods is that the `fit` method accepts a Merlin dataset object and the `transform` method accepts a DataFrame object. 
+The difference is an implementation detail---the `fit` method must access all the data and the `transform` method processes each part of the dataset one at a time. + +```{code-block} python +--- +emphasize-lines: 5, 12 +--- +# Typical signature of a fit method. +def fit( + self, + selector: ColumnSelector, + dataset: Dataset +) -> Any + +# Typical signature of a transform method. +def transform( + self, + selector: ColumnSelector, + df: DataFrame +) -> DataFrame +``` + +## Operators and Columns: Column Selector + +In most cases, you want an Operator to process a subset of the columns in your input dataset. +Both the `fit` and `transform` methods have a `selector` argument that specifies the columns to operate on. +Merlin uses a `ColumnSelector` class to represent the columns. + +The simplest column selector is a list of strings that specify some column names. +In the following sample code, `["col1", "col2"]` becomes an instance of a `ColumnSelector` class. + +```python +result = ["col1", "col2"] >> SomeOperator(...) +``` + +Column selectors also offer a more powerful and flexible way to specify columns. +You can specify the input columns to an Operator with tags. +In the following sample code, the Operator processes all the continuous variables in a dataset. + +```python +result = [Tags.CONTINUOUS] >> SomeOperator(...) +``` + +Using tags to create a column selector offers the following advantages: + +- Enables you to apply several Operators to the same kind of columns, such as categorical or continuous variables. +- Reduces code maintenance by enabling your code to automatically operate on newly added columns in a dataset. +- Simplifies code by avoiding lists of strings for column names. + +## How to Build an Operator + +Blah.
+ +## Reference Documentation + +- {py:class}`merlin.dag.BaseOperator` +- {py:class}`merlin.dag.ColumnSelector` +- {py:class}`merlin.schema.Tags` +- {py:class}`merlin.io.Dataset` \ No newline at end of file diff --git a/docs/source/about-schema.md b/docs/source/about-schema.md new file mode 100644 index 000000000..aac112ae9 --- /dev/null +++ b/docs/source/about-schema.md @@ -0,0 +1,3 @@ +# About the Merlin Schema + +FIXME \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 779dc587d..32821dde3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -71,7 +71,7 @@ # html_theme = "sphinx_rtd_theme" html_theme_options = { - "navigation_depth": 3, + "titles_only": True, "analytics_id": "G-NVJ1Y1YJHK", } html_copy_source = False @@ -118,6 +118,14 @@ autosummary_generate = True +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "merlin-core": ("https://nvidia-merlin.github.io/core/main", None), + "merlin-systems": ("https://nvidia-merlin.github.io/systems/main", None), + "merlin-models": ("https://nvidia-merlin.github.io/models/main", None), + "NVTabular": ("https://nvidia-merlin.github.io/NVTabular/main", None), +} + copydirs_additional_dirs = ["../../examples/", "../../README.md"] copydirs_file_rename = { diff --git a/docs/source/technical-concepts.md b/docs/source/technical-concepts.md new file mode 100644 index 000000000..290d1da7f --- /dev/null +++ b/docs/source/technical-concepts.md @@ -0,0 +1,4 @@ +# Merlin Technical Concepts + +The following pages provide a deeper technical understanding of Merlin concepts. +These concepts can help you to develop your own operator to implement a more sophisticated recommender system.
\ No newline at end of file diff --git a/docs/source/toc.yaml b/docs/source/toc.yaml index aa384075a..3687a58d8 100644 --- a/docs/source/toc.yaml +++ b/docs/source/toc.yaml @@ -46,5 +46,13 @@ subtrees: title: Deploy the HugeCTR Model with Triton - file: examples/scaling-criteo/04-Triton-Inference-with-Merlin-Models-TensorFlow.ipynb title: Deploy the TensorFlow Model with Triton + - title: Merlin Technical Concepts + file: technical-concepts.md + entries: + - file: about-dag.md + - file: about-dataset.md + - file: about-schema.md + - file: about-operators.md + - file: about-model-blocks.md - file: containers.rst - file: support_matrix/index.rst \ No newline at end of file diff --git a/requirements/docs.txt b/requirements/docs.txt new file mode 100644 index 000000000..e5229f453 --- /dev/null +++ b/requirements/docs.txt @@ -0,0 +1,22 @@ +# docs +ipython==8.2.0 +Sphinx==3.5.4 +jinja2<3.1 +markupsafe==2.0.1 +natsort==8.1.0 +sphinx_rtd_theme +sphinx_markdown_tables +sphinx-multiversion@git+https://github.com/mikemckiernan/sphinx-multiversion.git@v0.3.0 +sphinxcontrib-copydirs@git+https://github.com/mikemckiernan/sphinxcontrib-copydirs.git@v0.3.3 +sphinx-external-toc<0.4 +myst-nb +linkify-it-py +Markdown==3.3.7 + +# smx +mergedeep<1.4 +docker<5.1 +PyGithub<1.56 +semver>=2,<3 +pytest<7.3 +coverage<6.6 diff --git a/tox.ini b/tox.ini index 80406dc4f..5a76a130c 100644 --- a/tox.ini +++ b/tox.ini @@ -36,14 +36,14 @@ commands = ; Generates documentation with sphinx. There are other steps in the Github Actions workflow ; to publish the documentation on release. changedir = {toxinidir} -deps = -rrequirements/docs.txt +deps = -r requirements/docs.txt commands = - python -m sphinx.cmd.build -P -b html docs/source docs/build/html + python -m sphinx.cmd.build -P -b {posargs:html} docs/source docs/build/{posargs:html} [testenv:docs-multi] ; Run the multi-version build that is shown on GitHub Pages. 
changedir = {toxinidir} -deps = -rrequirements/docs.txt +deps = -r requirements/docs.txt commands = sphinx-multiversion --dump-metadata docs/source docs/build/html | jq "keys" sphinx-multiversion docs/source docs/build/html