Commit 8a2ef26

[fix] Pickling issues with xformer models (#290) (#309)
* Tentatively fixing pickling issues, lazy init
1 parent 3ec97d9 · commit 8a2ef26

File tree

3 files changed (+43 / -17 lines)


tests/test_block_factory.py (+6 / -6)

@@ -27,13 +27,13 @@
 VOCAB_SIZE = 64


-@pytest.mark.parametrize("attn_dropout", [0.0, 0.1])
-@pytest.mark.parametrize("residual_dropout", [0.0, 0.1])
+@pytest.mark.parametrize("attn_dropout", [0.1])
+@pytest.mark.parametrize("residual_dropout", [0.1])
 @pytest.mark.parametrize("heads", [1, 2])
 @pytest.mark.parametrize("activation", [a.value for a in Activation])
 @pytest.mark.parametrize("attention_name", ATTENTION_REGISTRY.keys())
 @pytest.mark.parametrize("feedforward_name", FEEDFORWARD_REGISTRY.keys())
-@pytest.mark.parametrize("layer_norm_style", ["pre", "post"])
+@pytest.mark.parametrize("layer_norm_style", ["pre", "post", "deepnorm"])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("reversible", [True, False])
 @pytest.mark.skipif(

@@ -127,15 +127,15 @@ def test_xformer_encoder_block(
     _ = block(inputs, input_mask=input_mask)


-@pytest.mark.parametrize("attn_dropout", [0.0, 0.1])
-@pytest.mark.parametrize("residual_dropout", [0.0, 0.1])
+@pytest.mark.parametrize("attn_dropout", [0.1])
+@pytest.mark.parametrize("residual_dropout", [0.1])
 @pytest.mark.parametrize("causal", [True, False])
 @pytest.mark.parametrize("heads", [1, 2])
 @pytest.mark.parametrize("activation", [a.value for a in Activation])
 @pytest.mark.parametrize("rotary_embeddings", [False, True])
 @pytest.mark.parametrize("attention_name", ATTENTION_REGISTRY.keys())
 @pytest.mark.parametrize("feedforward_name", FEEDFORWARD_REGISTRY.keys())
-@pytest.mark.parametrize("layer_norm_style", ["pre", "post"])
+@pytest.mark.parametrize("layer_norm_style", ["pre", "post", "deepnorm"])
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.skipif(
     not torch.cuda.is_available(), reason="This test requires a CUDA device"
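
Context for the trimmed dropout values: stacked pytest.mark.parametrize decorators expand to the full cross product of their values, so every extra value multiplies the number of generated cases across all the other decorators. A small illustrative sketch of that mechanism (standalone example, not part of this test file):

import pytest


# Stacked parametrize decorators combine multiplicatively: 2 x 2 = 4 cases here.
@pytest.mark.parametrize("attn_dropout", [0.0, 0.1])
@pytest.mark.parametrize("residual_dropout", [0.0, 0.1])
def test_cross_product(attn_dropout, residual_dropout):
    assert 0.0 <= attn_dropout <= 1.0
    assert 0.0 <= residual_dropout <= 1.0


# With a single value per decorator, as in the change above, only 1 case is
# generated instead of 4; the saving compounds with every stacked decorator.
@pytest.mark.parametrize("attn_dropout", [0.1])
@pytest.mark.parametrize("residual_dropout", [0.1])
def test_single_value(attn_dropout, residual_dropout):
    assert attn_dropout == residual_dropout == 0.1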

tests/test_pickling.py (+19 / -7)

@@ -7,14 +7,17 @@
 # https://github.com/facebookresearch/xformers/issues/203

 import pickle
+from copy import deepcopy

+import pytest
 from torch import nn

+from xformers import _is_triton_available
 from xformers.factory import xFormer, xFormerConfig

 test_config = [
     {
-        "reversible": False,  # Turn on to test the effect of using reversible layers
+        "reversible": False,
         "block_type": "encoder",
         "num_layers": 2,
         "dim_model": 768,

@@ -30,7 +33,7 @@
             },
         },
         "feedforward_config": {
-            "name": "MLP",  # FIXME: Test with FusedMLP also
+            "name": "FusedMLP",
             "dropout": 0.1,
             "activation": "gelu",
             "hidden_layer_multiplier": 4,

@@ -40,11 +43,20 @@


 class ViT(nn.Module):
-    def __init__(self):
+    def __init__(self, mlp):
         super().__init__()
-        self.xformer = xFormer.from_config(xFormerConfig(test_config))
+        test_config[0]["feedforward_config"]["name"] = mlp
+        xformer_config = xFormerConfig(test_config)
+        self.xformer = xFormer.from_config(xformer_config)


-def test_pickling():
-    test = ViT()
-    pickle.dumps(test)
+MLPs = ["MLP"]
+if _is_triton_available:
+    MLPs.append("FusedMLP")
+
+
+@pytest.mark.parametrize("mlp", MLPs)
+def test_pickling(mlp):
+    test = ViT(mlp)
+    _ = pickle.dumps(test)
+    _ = deepcopy(test)
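
The reworked test targets the failure reported in facebookresearch/xformers#290: when a module stores a non-picklable callable at construction time (for example a JIT-compiled Triton kernel handle), pickle.dumps on the whole model raises. A minimal, self-contained sketch of that failure mode, with a local lambda standing in for the kernel handle (illustrative code only, not the xformers implementation):

import pickle

import torch


class EagerKernelModule(torch.nn.Module):
    """Builds a non-picklable helper eagerly, in __init__."""

    def __init__(self):
        super().__init__()
        # A local lambda cannot be pickled, standing in here for a compiled kernel handle.
        self.kernel = lambda x: x * 2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.kernel(x)


try:
    pickle.dumps(EagerKernelModule())
except Exception as exc:  # PicklingError or AttributeError, depending on the Python version
    print(f"pickling fails: {exc}")

The parametrized test also runs deepcopy on the model, presumably because copying a model (for example before handing it to worker processes) exercises a closely related serialization path.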

xformers/triton/dropout.py (+18 / -4)

@@ -7,7 +7,7 @@
 # CREDITS: This comes almost as-is from the Triton dropout tutorial
 # https://raw.githubusercontent.com/openai/triton/master/python/tutorials/04-low-memory-dropout.py

-from typing import Optional
+from typing import Any, Optional

 import torch
 import triton

@@ -196,6 +196,11 @@ def dropout(


 class FusedDropoutBias(torch.nn.Module):
+    """
+    A layer which fuses the computation of Dropout(Activation(x))
+    in a single GPU kernel
+    """
+
     def __init__(
         self,
         p: float,

@@ -216,15 +221,24 @@ def __init__(
             if bias_shape is not None
             else None
         )
-        self.activation = get_triton_activation_kernel(activation)
-        self.pytorch_activation = build_activation(self.activation_type)
-        self.activation_grad = get_triton_activation_bwd_kernel(activation)
+
+        self.activation: Optional[Any] = None
+        self.activation_grad: Optional[Any] = None
+        self.activation_pytorch: Optional[Any] = None

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         # Convenience, catch a possible type or device mismatch
         if self.bias is not None:
             self.bias = self.bias.to(dtype=x.dtype, device=x.device)  # type: ignore

+        # Lazy init (helps with pickling)
+        if self.activation is None:
+            self.activation = get_triton_activation_kernel(self.activation_type)
+            self.pytorch_activation = build_activation(self.activation_type)
+            self.activation_grad = get_triton_activation_bwd_kernel(
+                self.activation_type
+            )
+
         # Train/inference
         p = self.p if self.training else 0.0
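
The dropout change is the core of the fix: the Triton activation kernels are no longer created in __init__, so a freshly constructed module holds only plain, picklable attributes, and the handles are rebuilt on the first forward call. A generic sketch of this lazy-init pattern (a standalone stand-in, not the FusedDropoutBias class; the lambda again plays the role of kernel compilation):

import pickle
from copy import deepcopy
from typing import Any, Optional

import torch


class LazyKernelModule(torch.nn.Module):
    """Defers building non-picklable helpers until the first forward call."""

    def __init__(self, p: float):
        super().__init__()
        self.p = p
        # Placeholder only; nothing non-picklable is created at construction time.
        self.kernel: Optional[Any] = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Lazy init, mirroring the change to FusedDropoutBias above.
        if self.kernel is None:
            self.kernel = lambda t: torch.nn.functional.dropout(t, self.p, self.training)
        return self.kernel(x)


module = LazyKernelModule(p=0.1)
_ = pickle.dumps(module)  # succeeds: the kernel handle has not been built yet
_ = deepcopy(module)      # likewise, which is what the updated test checks

The trade-off, as in the real change, is that the helpers are built on first use rather than up front; in this sketch, pickling after a forward pass would fail again, which is acceptable when models are serialized or copied right after construction.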
