Add docstrings (#1106)
* Add docstrings to Encoder

* Add docstrings to ItemRetrievalScorer

* Add docstrings to Model

* Fix docstring for TwoTowerModel

* Add docstring to YoutubeDNNRetrievalModelV2

* Add docstrings to L2Norm

* Add docstrings to ContinuousFeatures

* Add docstrings to AverageEmbeddingsByWeightFeature

* Add docstrings to ReplaceMaskedEmbeddings

* Add docstrings to SequenceEmbeddingFeatures

* Add docstrings to EmbeddingTable

* lint

* lint

* lint
edknv authored May 23, 2023
1 parent 7ee8e84 commit db4483c
Showing 9 changed files with 600 additions and 55 deletions.
45 changes: 42 additions & 3 deletions merlin/models/tf/blocks/retrieval/base.py
@@ -135,14 +135,15 @@ class ItemRetrievalScorer(Block):
"""Block for ItemRetrieval, which expects query/user and item embeddings as input and
uses dot product to score the positive item (inputs["item"]) and also sampled negative
items (during training).
Parameters
----------
samplers: List[ItemSampler], optional
List of item samplers that provide negative samples when `training=True`
sampling_downscore_false_negatives: bool, optional
Whether to identify false negatives (sampled item ids equal to the positive item)
and downscore them to the `sampling_downscore_false_negatives_value`, by default True
sampling_downscore_false_negatives_value: int, optional
Value to be used to downscore false negatives when
`sampling_downscore_false_negatives=True`, by default `np.finfo(np.float32).min / 100.0`
item_id_feature_name: str
@@ -174,6 +175,7 @@ def __init__(
store_negative_ids: bool = False,
**kwargs,
):
"""Initializes the `ItemRetrievalScorer` class."""
super().__init__(**kwargs)

self.downscore_false_negatives = sampling_downscore_false_negatives
@@ -193,6 +195,13 @@ def __init__(
self.set_required_features()

def build(self, input_shapes):
"""Builds the block.
Parameters
----------
input_shapes: tuple or dict
Shape of the input tensor.
"""
if isinstance(input_shapes, dict):
query_shape = input_shapes[self.query_name]
self.context.add_weight(
@@ -206,6 +215,13 @@ def build(self, input_shapes):
super().build(input_shapes)

def _check_input_from_two_tower(self, inputs):
"""Checks if the inputs from the two towers (query and item) are correctly provided.
Parameters
----------
inputs: dict
Dictionary of inputs.
"""
if set(inputs.keys()) != set([self.query_name, self.item_name]):
raise ValueError(
f"Wrong input-names, expected: {[self.query_name, self.item_name]} "
@@ -223,13 +239,15 @@ def call(
the positive item (inputs["item"]).
For the sampled-softmax mode, logits are computed by multiplying the query vector
and the item embeddings matrix (self.context.get_embedding(self.item_domain))
Parameters
----------
inputs : Union[tf.Tensor, TabularData]
Dict with the query and item embeddings (e.g. `{"query": <emb>, "item": <emb>}`),
where embeddings are 2D tensors (batch size, embedding size)
training : bool, optional
Flag that indicates whether in training mode, by default True
Returns
-------
tf.Tensor
@@ -273,13 +291,15 @@ def call_outputs(
) -> "PredictionOutput":
"""Based on the user/query embedding (inputs[self.query_name]), uses dot product to score
the positive item and also sampled negative items (during training).
Parameters
----------
inputs : TabularData
Dict with the query and item embeddings (e.g. `{"query": <emb>, "item": <emb>}`),
where embeddings are 2D tensors (batch size, embedding size)
training : bool, optional
Flag that indicates whether in training mode, by default True
Returns
-------
[tf.Tensor,tf.Tensor]
@@ -431,6 +451,7 @@ def _prepare_query_item_vectors_for_sampled_softmax(
return predictions

def set_required_features(self):
"""Sets the required features for the samplers."""
required_features = set()
if self.downscore_false_negatives:
required_features.add(self.item_id_feature_name)
@@ -442,6 +463,13 @@ def set_required_features(self):
self._required_features = list(required_features)

def get_config(self):
"""Returns the configuration of the model as a dictionary.
Returns
-------
dict
The configuration of the model.
"""
config = super().get_config()
config = maybe_serialize_keras_objects(self, config, ["samplers"])
config["sampling_downscore_false_negatives"] = self.downscore_false_negatives
@@ -458,6 +486,17 @@ def get_config(self):

@classmethod
def from_config(cls, config):
"""Creates a new instance of the class from its config.
Parameters
----------
config: dict
A dictionary, typically the output of get_config.
Returns
-------
A new instance of the `ItemRetrievalScorer` class.
"""
config = maybe_deserialize_keras_objects(config, ["samplers"])

return super().from_config(config)
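The call and call_outputs docstrings above describe the scoring scheme: a dot product between the query embedding and the positive item embedding, plus dot products against the sampled negative items, while the sampled-softmax mode multiplies the query by the full item embedding matrix. A minimal TensorFlow sketch of that arithmetic (the tensor names and shapes are illustrative assumptions, not the block's actual implementation):

import tensorflow as tf

batch_size, emb_dim, num_negatives = 4, 16, 10

# Illustrative embeddings; in the real block these come from the query and item towers.
query = tf.random.normal((batch_size, emb_dim))              # user/query embeddings
positive_item = tf.random.normal((batch_size, emb_dim))      # embeddings of the positive items
negative_items = tf.random.normal((num_negatives, emb_dim))  # sampled negative item embeddings

# Positive score: row-wise dot product between each query and its positive item.
positive_scores = tf.reduce_sum(query * positive_item, axis=-1, keepdims=True)  # (batch, 1)

# Negative scores: each query against every sampled negative.
negative_scores = tf.matmul(query, negative_items, transpose_b=True)  # (batch, num_negatives)

# Logits with the positive item in the first column, followed by the negatives.
logits = tf.concat([positive_scores, negative_scores], axis=-1)  # (batch, 1 + num_negatives)

# Sampled-softmax mode instead scores the query against the whole item embedding matrix:
# logits = tf.matmul(query, item_embedding_matrix, transpose_b=True)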
107 changes: 104 additions & 3 deletions merlin/models/tf/core/encoder.py
@@ -1,5 +1,5 @@
#
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -88,6 +88,22 @@ def encode(
batch_size: int,
**kwargs,
) -> merlin.io.Dataset:
"""Encodes the given dataset and index.
Parameters
----------
dataset: merlin.io.Dataset
The dataset to encode.
index: Union[str, ColumnSchema, Schema, Tags]
The index to use for encoding.
batch_size: int
The batch size for encoding.
Returns
-------
merlin.io.Dataset
The encoded dataset.
"""
if isinstance(index, Schema):
output_schema = index
elif isinstance(index, ColumnSchema):
@@ -117,12 +133,14 @@ def batch_predict(
**kwargs,
) -> merlin.io.Dataset:
"""Batched prediction using Dask.
Parameters
----------
dataset: merlin.io.Dataset
Dataset to predict on.
batch_size: int
Batch size to use for prediction.
Returns
-------
merlin.io.Dataset
Expand Down Expand Up @@ -166,6 +184,23 @@ def batch_predict(
return merlin.io.Dataset(predictions)

def call(self, inputs, *, targets=None, training=False, testing=False, **kwargs):
"""Calls the model on new inputs and returns the outputs as tensors.
Parameters
----------
inputs : tensor-like or dict/tuple of tensors.
Tensors or dict/tuple of tensors representing the input batch.
targets : tensor-like, optional
Tensors representing the target data.
training : bool, optional
Whether the model is in training mode.
testing : bool, optional
Whether the model is in testing mode.
Returns
-------
A tensor or dict of tensors corresponding to the result of calling the layer.
"""
inputs = self._prepare_features(inputs, targets=targets)
if isinstance(inputs, tuple):
inputs, targets = inputs
@@ -180,6 +215,17 @@ def __call__(self, inputs, **kwargs)
)

def __call__(self, inputs, **kwargs):
"""Overrides the default __call__ method to remove "features" from inputs.
Parameters
----------
inputs : tensor-like or dict/tuple of tensors.
Tensors or dict/tuple of tensors representing the input batch.
Returns
-------
A tensor or dict of tensors corresponding to the result of calling the layer.
"""
# We remove features here since we don't expect them at inference time
# Inside the `call` method, we will add them back by assuming inputs=features
if "features" in kwargs:
@@ -188,6 +234,13 @@ def __call__(self, inputs, **kwargs):
return super().__call__(inputs, **kwargs)

def build(self, input_shape):
"""Creates the variables of the layer.
Parameters
----------
input_shape: Tuple[int]
The shape of the input data.
"""
self._prepare_features.build(input_shape)
input_shape = self._prepare_features.compute_output_shape(input_shape)

@@ -196,18 +249,40 @@ def build(self, input_shape):
self._build_input_shape = input_shape

def compute_output_shape(self, input_shape):
"""Computes the output shape of the layer.
Parameters
----------
input_shape: Tuple[int]
The shape of the input data.
Returns
-------
Tuple[int]
The output shape of the layer.
"""
input_shape = self._prepare_features.compute_output_shape(input_shape)
return combinators.compute_output_shape_sequentially(list(self.to_call), input_shape)

def train_step(self, data):
"""Train step"""
"""Performs a training step.
Train step method is not implemented and Raises an error as the
Encoder block is not meant to be trained by itself and can only be
trained as part of a model.
"""
raise NotImplementedError(
"This block is not meant to be trained by itself. ",
"It can only be trained as part of a model.",
)

def fit(self, *args, **kwargs):
"""Fit model"""
"""Fits the model.
Fit method is not implemented and Raises an error as the Encoder block
is not meant to be trained by itself and can only be trained as part
of a model.
"""
raise NotImplementedError(
"This block is not meant to be trained by itself. ",
"It can only be trained as part of a model.",
@@ -245,6 +320,7 @@ def save(

@property
def to_call(self):
"""Provides the list of blocks to be called during the execution of the model."""
if self.pre:
yield self.pre

@@ -256,22 +332,40 @@ def to_call(self):

@property
def has_schema(self) -> bool:
"""Returns True as this class does contain a schema."""
return True

@property
def schema(self) -> Schema:
"""Returns the schema of the model."""
return self._schema

@property
def first(self):
"""Returns the first block of the model."""
return self.blocks[0]

@property
def last(self):
"""Returns the last block of the model."""
return self.blocks[-1]

@classmethod
def from_config(cls, config, custom_objects=None):
"""Creates a new instance of the class by deserializing.
Parameters
----------
config: dict
A dictionary, typically the output of get_config.
custom_objects: dict, optional
A dictionary mapping the names of layers to the corresponding
functions and classes.
Returns
-------
A new instance of Encoder.
"""
pre = config.pop("pre", None)
post = config.pop("post", None)
layers = [
Expand All @@ -291,6 +385,13 @@ def from_config(cls, config, custom_objects=None):
return output

def get_config(self):
"""Returns the configuration of the model as a dictionary.
Returns
-------
dict
The configuration of the model.
"""
config = tf_utils.maybe_serialize_keras_objects(self, {}, ["pre", "post"])
for i, layer in enumerate(self.blocks):
config[i] = tf.keras.utils.serialize_keras_object(layer)
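The encode and batch_predict methods documented above both run batched inference over a merlin.io.Dataset, and get_config/from_config follow the standard Keras serialization contract. A hedged usage sketch — the encoder and dataset variables, and the choice of Tags.ITEM_ID as the index, are assumptions rather than the only valid configuration:

from merlin.schema import Tags

# `item_encoder` is assumed to be a built Encoder (e.g. the item tower of a trained
# two-tower model) and `item_dataset` a merlin.io.Dataset of item features.
item_embeddings = item_encoder.encode(
    item_dataset,
    index=Tags.ITEM_ID,  # column(s) used to index the encoded output
    batch_size=1024,
)

# Raw batched predictions, without re-indexing the output.
predictions = item_encoder.batch_predict(item_dataset, batch_size=1024)

# Keras-style serialization round trip; the config captures the architecture, not the weights.
restored = type(item_encoder).from_config(item_encoder.get_config())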
4 changes: 3 additions & 1 deletion merlin/models/tf/inputs/base.py
@@ -44,7 +44,6 @@ def InputBlock(
post: Optional[BlockType] = None,
aggregation: Optional[TabularAggregationType] = None,
seq: bool = False,
max_seq_length: Optional[int] = None,
add_continuous_branch: bool = True,
continuous_tags: Optional[Union[TagsType, Tuple[Tags]]] = (Tags.CONTINUOUS,),
continuous_projection: Optional[Block] = None,
@@ -75,6 +74,9 @@ def InputBlock(
Next to this, it's also possible to construct it manually.
branches: Dict[str, Block], optional
Dictionary of branches to use inside the InputBlock.
pre: Optional[BlockType]
Transformations to apply on the inputs before the module is
called (before 'forward'). Default is None.
post: Optional[BlockType]
Transformations to apply on the inputs after the module is
called (so **after** `forward`).
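The newly documented pre parameter mirrors the existing post: a transformation applied to the inputs before the InputBlock's branches run. A hedged construction sketch (the schema variable and the chosen options are assumptions, not the only valid configuration):

import merlin.models.tf as mm

# `schema` is assumed to be a merlin Schema describing the input features.
# Continuous features get their own branch and are projected through an MLP;
# all branch outputs are concatenated into a single input representation.
# A `pre` block could additionally be passed to transform the raw inputs first.
inputs = mm.InputBlock(
    schema,
    continuous_projection=mm.MLPBlock([64]),
    aggregation="concat",
)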
