From 5470be37ee0ae1c463b6101a3efc81779087467b Mon Sep 17 00:00:00 2001 From: Mike McKiernan Date: Mon, 14 Nov 2022 11:24:31 -0500 Subject: [PATCH] Add information about the Merlin DAG Define the important terms of the DAG. --- docs/source/about-dag.md | 69 ++++++++++++++++++++++++++++++++++++++++ docs/source/conf.py | 8 +++++ docs/source/toc.yaml | 2 ++ 3 files changed, 79 insertions(+) create mode 100644 docs/source/about-dag.md diff --git a/docs/source/about-dag.md b/docs/source/about-dag.md new file mode 100644 index 000000000..9978f1d1c --- /dev/null +++ b/docs/source/about-dag.md @@ -0,0 +1,69 @@ +# About the Merlin Directed Acyclic Graph + +Merlin uses a directed acyclic graph (DAG) to represent operations on data such as filtering or bucketing and to represent operations in a recommender system such as creating an ensemble or filtering candidate items during inference. + +Understanding the Merlin DAG is helpful if you want to develop your own operator (Op) or build a recommender system with Merlin. + +## DAG Terminology + +node +: A node in the DAG is a group of columns and at least one _operator_. + The columns are specified with a _column selector_. + A node has an _input schema_ and an _output schema_. + Resolution of the schemas is delayed until you run `fit` or `transform` on a dataset. + +column selector +: A column selector specifies the columns to select from a dataset using column names or _tags_. + +operator +: An operator performs a transformation on data and returns a new _node_. + The data is identified by the _column selector_. + Some simple operators like `+` and `-` add or remove columns. + More complex operations are applied by shifting the operators onto the column selector with the `>>` notation. + +schema +: A Merlin schema is metadata that describes the columns in a dataset. + Each column has its own schema that identifies the column name and can specify _tags_ and properties. 
+ +tag +: A Merlin tag categorizes information about a column. + Adding a tag to a column enables you to select columns for operations by tag rather than name. + + For example, you can add the `USER` and `ITEM` tags to columns. + Modeling and inference operations can use that information to act accordingly on the dataset. + +## Syntax and Sample Code + +The following code block shows the typical syntax for building a workflow that operates on DAG components. + +```{rubric} Syntax +``` + +```python +result = [column_selector, ...] >> op1 >> op2 >> ... +``` + +Starting with the `column_selector`, the brackets group one or more column selectors that identify columns in the input data. + +The `op1` and `op2` represent operators. +When an operator performs its operation on the input data, the operator returns a node. + +The `result` object is the graph. +It contains the sequence of operations to perform. + +```{rubric} Sample Code +``` + +```python +item_features = ( + ["item_category", "item_shop", "item_brand"] >> Categorify(dtype="int32") >> TagAsItemFeatures() +) +``` + +In the sample code, the column selector is created by specifying the item-related column names. + +The {py:class}`~nvtabular.ops.Categorify` operator transforms the categorical features into unique integer values, adds the {py:attr}`~merlin.schema.Tags.CATEGORICAL` tag, and returns a node. + +The {py:class}`~nvtabular.ops.TagAsItemFeatures` operator applies the {py:attr}`~merlin.schema.Tags.ITEM` tag and returns a node. + +When the `item_features` variable is included in a transformation and applied to input data, the nodes are traversed in order and the data transformation and tagging are applied. 
diff --git a/docs/source/conf.py b/docs/source/conf.py index 779dc587d..a51bd04eb 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -118,6 +118,14 @@ autosummary_generate = True +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "merlin-core": ("https://nvidia-merlin.github.io/core/main", None), + "merlin-systems": ("https://nvidia-merlin.github.io/systems/main", None), + "merlin-models": ("https://nvidia-merlin.github.io/models/main", None), + "NVTabular": ("https://nvidia-merlin.github.io/NVTabular/main", None), +} + copydirs_additional_dirs = ["../../examples/", "../../README.md"] copydirs_file_rename = { diff --git a/docs/source/toc.yaml b/docs/source/toc.yaml index aa384075a..3226c5806 100644 --- a/docs/source/toc.yaml +++ b/docs/source/toc.yaml @@ -46,5 +46,7 @@ subtrees: title: Deploy the HugeCTR Model with Triton - file: examples/scaling-criteo/04-Triton-Inference-with-Merlin-Models-TensorFlow.ipynb title: Deploy the TensorFlow Model with Triton + - file: about-dag.md + title: Merlin DAG - file: containers.rst - file: support_matrix/index.rst \ No newline at end of file