diff --git a/.travis.yml b/.travis.yml index e4728659..7378c56b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,7 +33,7 @@ install: - sudo apt-get install libblas-dev liblapack-dev libatlas-base-dev gfortran # python libs - sudo pip3 install --upgrade pip - - sudo pip3 install numpy scipy h5py "tensorflow==2.0.0" + - sudo pip3 install numpy scipy h5py "tensorflow==2.1.0" - echo "Version numbers of TensorFlow and Keras:" - python3 -c "import tensorflow as tf; import tensorflow.keras; print(tf.__version__); print(tensorflow.keras.__version__)" # FunctionalPlus diff --git a/CMakeLists.txt b/CMakeLists.txt index 81a83e28..d4f0560e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ list(APPEND CMAKE_MODULE_PATH "${FDEEP_TOP_DIR}/cmake") include(cmake/hunter.cmake) # default off -project(frugally-deep VERSION 0.12.1) +project(frugally-deep VERSION 0.13.0) message(STATUS "===( ${PROJECT_NAME} ${PROJECT_VERSION} )===") diff --git a/INSTALL.md b/INSTALL.md index 860af1bc..ed4c9262 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -63,7 +63,7 @@ Just add a *conanfile.txt* with frugally-deep as a requirement and chose the gen ``` [requires] -frugally-deep/v0.12.1-p0@dobiasd/stable +frugally-deep/v0.13.0-p0@dobiasd/stable [generators] cmake diff --git a/README.md b/README.md index 09aa8f16..7eb2d613 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ Requirements and Installation - A **C++14**-compatible compiler: Compilers from these versions on are fine: GCC 4.9, Clang 3.7 (libc++ 3.7) and Visual C++ 2015. - Python 3.5 or higher. -- TensorFlow 2.0.0 +- TensorFlow 2.1.0 Guides for different ways to install frugally-deep can be found in [`INSTALL.md`](INSTALL.md). 
diff --git a/include/fdeep/layers/bidirectional_layer.hpp b/include/fdeep/layers/bidirectional_layer.hpp index 31d46a57..4290fdff 100644 --- a/include/fdeep/layers/bidirectional_layer.hpp +++ b/include/fdeep/layers/bidirectional_layer.hpp @@ -56,17 +56,23 @@ class bidirectional_layer : public layer forward_state_h_(stateful ? tensor(tensor_shape(n_units), static_cast(0)) : fplus::nothing()), forward_state_c_(stateful && wrapped_layer_type_has_state_c(wrapped_layer_type) ? tensor(tensor_shape(n_units), static_cast(0)) : fplus::nothing()), backward_state_h_(stateful ? tensor(tensor_shape(n_units), static_cast(0)) : fplus::nothing()), - backward_state_c_(stateful && wrapped_layer_type_has_state_c(wrapped_layer_type) ? tensor(tensor_shape(n_units), static_cast(0)) : fplus::nothing()) + backward_state_c_(stateful && wrapped_layer_type_has_state_c(wrapped_layer_type) ? tensor(tensor_shape(n_units), static_cast(0)) : fplus::nothing()), + use_avail_input_state_for_stateful_(true) + { } void reset_states() override { + // TF 2.1 Bug: reset_states() does nothing in TF 2.1. + // the implementation below is how TF 2.1 should behave. + // to match TF 2.1, just comment out the code below. if (is_stateful()) { forward_state_h_ = tensor(tensor_shape(n_units_), static_cast(0)); forward_state_c_ = tensor(tensor_shape(n_units_), static_cast(0)); backward_state_h_ = tensor(tensor_shape(n_units_), static_cast(0)); backward_state_c_ = tensor(tensor_shape(n_units_), static_cast(0)); + use_avail_input_state_for_stateful_ = true; } } @@ -110,29 +116,26 @@ class bidirectional_layer : public layer assertion(inputs.size() == 1 || inputs.size() == 5, "Invalid number of input tensors."); - tensor forward_state_h = inputs.size() == 5 - ? inputs[1] - : is_stateful() - ? forward_state_h_.unsafe_get_just() - : tensor(tensor_shape(n_units_), static_cast(0)); - - tensor forward_state_c = inputs.size() == 5 - ? inputs[2] - : is_stateful() - ? 
forward_state_c_.unsafe_get_just() - : tensor(tensor_shape(n_units_), static_cast(0)); - - tensor backward_state_h = inputs.size() == 5 - ? inputs[3] - : is_stateful() - ? backward_state_h_.unsafe_get_just() - : tensor(tensor_shape(n_units_), static_cast(0)); - - tensor backward_state_c = inputs.size() == 5 - ? inputs[4] - : is_stateful() - ? backward_state_c_.unsafe_get_just() - : tensor(tensor_shape(n_units_), static_cast(0)); + bool initial_state_provided = inputs.size() == 5; + bool use_last_state_for_initial_state = is_stateful() && !use_avail_input_state_for_stateful_; + bool use_input_initial_state = initial_state_provided && !use_last_state_for_initial_state; + // bool use_zero_initial_state = !use_input_initial_state && !use_last_state_for_initial_state; + + tensor forward_state_h = use_input_initial_state ? inputs[1] : + use_last_state_for_initial_state ? forward_state_h_.unsafe_get_just() : + tensor(tensor_shape(n_units_), static_cast(0)); // use_zero_initial_state + + tensor forward_state_c = use_input_initial_state ? inputs[2] : + use_last_state_for_initial_state ? forward_state_c_.unsafe_get_just() : + tensor(tensor_shape(n_units_), static_cast(0)); // use_zero_initial_state + + tensor backward_state_h = use_input_initial_state ? inputs[3] : + use_last_state_for_initial_state ? backward_state_h_.unsafe_get_just() : + tensor(tensor_shape(n_units_), static_cast(0)); // use_zero_initial_state + + tensor backward_state_c = use_input_initial_state ? inputs[4] : + use_last_state_for_initial_state ? 
backward_state_c_.unsafe_get_just() : + tensor(tensor_shape(n_units_), static_cast(0)); // use_zero_initial_state result_forward = lstm_impl(input, forward_state_h, forward_state_c, n_units_, use_bias_, return_sequences_, stateful_, @@ -147,6 +150,7 @@ class bidirectional_layer : public layer forward_state_c_ = forward_state_c; backward_state_h_ = backward_state_h; backward_state_c_ = backward_state_c; + use_avail_input_state_for_stateful_ = false; } } else if (wrapped_layer_type_ == "GRU" || wrapped_layer_type_ == "CuDNNGRU") @@ -154,17 +158,18 @@ class bidirectional_layer : public layer assertion(inputs.size() == 1 || inputs.size() == 3, "Invalid number of input tensors."); - tensor forward_state_h = inputs.size() == 3 - ? inputs[1] - : is_stateful() - ? forward_state_h_.unsafe_get_just() - : tensor(tensor_shape(n_units_), static_cast(0)); + bool initial_state_provided = inputs.size() == 3; + bool use_last_state_for_initial_state = is_stateful() && !use_avail_input_state_for_stateful_; + bool use_input_initial_state = initial_state_provided && !use_last_state_for_initial_state; + // bool use_zero_initial_state = !use_input_initial_state && !use_last_state_for_initial_state; + + tensor forward_state_h = use_input_initial_state ? inputs[1] : + use_last_state_for_initial_state ? forward_state_h_.unsafe_get_just() : + tensor(tensor_shape(n_units_), static_cast(0)); // use_zero_initial_state - tensor backward_state_h = inputs.size() == 3 - ? inputs[2] - : is_stateful() - ? backward_state_h_.unsafe_get_just() - : tensor(tensor_shape(n_units_), static_cast(0)); + tensor backward_state_h = use_input_initial_state ? inputs[2] : + use_last_state_for_initial_state ? 
backward_state_h_.unsafe_get_just() : + tensor(tensor_shape(n_units_), static_cast(0)); // use_zero_initial_state result_forward = gru_impl(input, forward_state_h, n_units_, use_bias_, reset_after_, return_sequences_, false, forward_weights_, forward_recurrent_weights_, @@ -175,6 +180,7 @@ class bidirectional_layer : public layer if (is_stateful()) { forward_state_h_ = forward_state_h; backward_state_h_ = backward_state_h; + use_avail_input_state_for_stateful_ = false; } } else @@ -223,6 +229,7 @@ class bidirectional_layer : public layer mutable fplus::maybe forward_state_c_; mutable fplus::maybe backward_state_h_; mutable fplus::maybe backward_state_c_; + mutable bool use_avail_input_state_for_stateful_; }; } // namespace internal diff --git a/include/fdeep/layers/gru_layer.hpp b/include/fdeep/layers/gru_layer.hpp index e561f772..3e5125db 100644 --- a/include/fdeep/layers/gru_layer.hpp +++ b/include/fdeep/layers/gru_layer.hpp @@ -44,7 +44,8 @@ class gru_layer : public layer weights_(weights), recurrent_weights_(recurrent_weights), bias_(bias), - state_h_(stateful ? tensor(tensor_shape(n_units), static_cast(0)) : fplus::nothing()) + state_h_(stateful ? tensor(tensor_shape(n_units), static_cast(0)) : fplus::nothing()), + use_avail_input_state_for_stateful_(true) { } @@ -53,6 +54,7 @@ class gru_layer : public layer { if (is_stateful()) { state_h_ = tensor(tensor_shape(n_units_), static_cast(0)); + use_avail_input_state_for_stateful_ = true; } } @@ -77,17 +79,26 @@ class gru_layer : public layer assertion(inputs.size() == 1 || inputs.size() == 2, "Invalid number of input tensors."); - tensor state_h = inputs.size() == 2 - ? inputs[1] - : is_stateful() - ? 
state_h_.unsafe_get_just() - : tensor(tensor_shape(n_units_), static_cast(0)); - + // RNN behavior since TF 2.1: + // If an *initial state input is provided*, this is used always for non-stateful models + // but only on reset for stateful models (including the very first call) + // If *no input state is provided*, then initial state is 0 for non-stateful + // and, for stateful, it carries the state from previous call, unless state-reset, in which case it is set to 0 + bool initial_state_provided = inputs.size() == 2; + bool use_last_state_for_initial_state = is_stateful() && !use_avail_input_state_for_stateful_; + bool use_input_initial_state = initial_state_provided && !use_last_state_for_initial_state; + // bool use_zero_initial_state = !use_input_initial_state && !use_last_state_for_initial_state; + + tensor state_h = use_input_initial_state ? inputs[1] : + use_last_state_for_initial_state ? state_h_.unsafe_get_just() : + tensor(tensor_shape(n_units_), static_cast(0)); // use_zero_initial_state + const auto result = gru_impl(input, state_h, n_units_, use_bias_, reset_after_, return_sequences_, return_state_, weights_, recurrent_weights_, bias_, activation_, recurrent_activation_); if (is_stateful()) { state_h_ = state_h; + use_avail_input_state_for_stateful_ = false; } return result; } @@ -104,6 +115,7 @@ class gru_layer : public layer const float_vec recurrent_weights_; const float_vec bias_; mutable fplus::maybe state_h_; + mutable bool use_avail_input_state_for_stateful_; }; } // namespace internal diff --git a/include/fdeep/layers/lstm_layer.hpp b/include/fdeep/layers/lstm_layer.hpp index 9dade051..acd464d9 100644 --- a/include/fdeep/layers/lstm_layer.hpp +++ b/include/fdeep/layers/lstm_layer.hpp @@ -43,7 +43,9 @@ class lstm_layer : public layer recurrent_weights_(recurrent_weights), bias_(bias), state_h_(stateful ? tensor(tensor_shape(n_units), static_cast(0)) : fplus::nothing()), - state_c_(stateful ? 
tensor(tensor_shape(n_units), static_cast(0)) : fplus::nothing()) + state_c_(stateful ? tensor(tensor_shape(n_units), static_cast(0)) : fplus::nothing()), + use_avail_input_state_for_stateful_(true) + { } @@ -52,6 +54,7 @@ class lstm_layer : public layer if (is_stateful()) { state_h_ = tensor(tensor_shape(n_units_), static_cast(0)); state_c_ = tensor(tensor_shape(n_units_), static_cast(0)); + use_avail_input_state_for_stateful_ = true; } } @@ -74,17 +77,23 @@ class lstm_layer : public layer assertion(inputs.size() == 1 || inputs.size() == 3, "Invalid number of input tensors."); - tensor state_h = inputs.size() == 3 - ? inputs[1] - : is_stateful() - ? state_h_.unsafe_get_just() - : tensor(tensor_shape(n_units_), static_cast(0)); + // RNN behavior since TF 2.1: + // If an *initial state input is provided*, this is used always for non-stateful models + // but only on reset for stateful models (including the very first call) + // If *no input state is provided*, then initial state is 0 for non-stateful + // and, for stateful, it carries the state from previous call, unless state-reset, in which case it is set to 0 + bool initial_state_provided = inputs.size() == 3; + bool use_last_state_for_initial_state = is_stateful() && !use_avail_input_state_for_stateful_; + bool use_input_initial_state = initial_state_provided && !use_last_state_for_initial_state; + // bool use_zero_initial_state = !use_input_initial_state && !use_last_state_for_initial_state; + + tensor state_h = use_input_initial_state ? inputs[1] : + use_last_state_for_initial_state ? state_h_.unsafe_get_just() : + tensor(tensor_shape(n_units_), static_cast(0)); // use_zero_initial_state - tensor state_c = inputs.size() == 3 - ? inputs[2] - : is_stateful() - ? state_c_.unsafe_get_just() - : tensor(tensor_shape(n_units_), static_cast(0)); + tensor state_c = use_input_initial_state ? inputs[2] : + use_last_state_for_initial_state ? 
state_c_.unsafe_get_just() : + tensor(tensor_shape(n_units_), static_cast(0)); // use_zero_initial_state const auto result = lstm_impl(input, state_h, state_c, n_units_, use_bias_, return_sequences_, return_state_, weights_, @@ -92,6 +101,7 @@ class lstm_layer : public layer if (is_stateful()) { state_h_ = state_h; state_c_ = state_c; + use_avail_input_state_for_stateful_ = false; } return result; } @@ -108,6 +118,7 @@ class lstm_layer : public layer const float_vec bias_; mutable fplus::maybe state_h_; mutable fplus::maybe state_c_; + mutable bool use_avail_input_state_for_stateful_; }; } // namespace internal diff --git a/test/stateful_test/stateful_recurrent_tests.cpp b/test/stateful_test/stateful_recurrent_tests.cpp index 57ad200e..cc3a0f6b 100644 --- a/test/stateful_test/stateful_recurrent_tests.cpp +++ b/test/stateful_test/stateful_recurrent_tests.cpp @@ -1,5 +1,5 @@ #include "fdeep/fdeep.hpp" -#include // looks like we need this too (edit by π) +#include using namespace fdeep; @@ -16,13 +16,23 @@ int main() const std::vector x_inf_0 = {2.1, -1.2, 3.14, 1.2}; const std::vector x_inf_1 = {1, 3, -2, 10}; - const std::vector state_0 = {1.1, -2.1}; - const std::vector state_1 = {2.7, 3.1}; - const std::vector state_2 = {-2.5, 3.0}; - const std::vector state_3 = {-2.0, -10.0}; + const std::vector state_0 = {40.1, -25.1}; + const std::vector state_1 = {34.7, 56.1}; + const std::vector state_2 = {-62.5, 12.0}; + const std::vector state_3 = {-33.0, -100.0}; + + + + // const std::vector state_0 = {1.1, -2.1}; + // const std::vector state_1 = {2.7, 3.1}; + // const std::vector state_2 = {-2.5, 3.0}; + // const std::vector state_3 = {-2.0, -10.0}; std::vector all_results = {}; std::vector one_result = {}; +// [40.1, -25.1, 34.7, 56.1, -62.5, 12.0, -33.0, -100.0] +// [1.1, -2.1, 2.7, 3.1, -2.5, 3.0, -2.0, -10.0] + const shared_float_vec xt0(fplus::make_shared_ref(x_inf_0)); const shared_float_vec xt1(fplus::make_shared_ref(x_inf_1)); const shared_float_vec 
st0(fplus::make_shared_ref(state_0)); @@ -158,16 +168,16 @@ int main() vec_append(all_results, *result[0].as_vector()); // ************************* BIDIRECTIONAL TESTS ************************* // - #define TF_BIDI_BUG_FIXED false + #define TF_BIDI_STATE_RESET_WORKS false // *********** TEST 9: "bidi-GRU_nonstateful_no_init_state.json" *********** model = load_model("./models/bidi-GRU_nonstateful_no_init_state.json"); /// state_reset = true result = model.predict({test_in_0}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); result = model.predict({test_in_1}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); /// state_reset = false result = model.predict({test_in_0}); vec_append(all_results, *result[0].as_vector()); @@ -179,10 +189,10 @@ int main() /// state_reset = true result = model.predict({test_in_0, test_state_0, test_state_1}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); result = model.predict({test_in_1, test_state_0, test_state_1}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); /// state_reset = false result = model.predict({test_in_0, test_state_0, test_state_1}); vec_append(all_results, *result[0].as_vector()); @@ -194,10 +204,10 @@ int main() /// state_reset = true result = model.predict_stateful({test_in_0}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); result = model.predict_stateful({test_in_1}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); /// 
state_reset = false result = model.predict_stateful({test_in_0}); vec_append(all_results, *result[0].as_vector()); @@ -209,10 +219,10 @@ int main() /// state_reset = true result = model.predict_stateful({test_in_0, test_state_0, test_state_1}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); result = model.predict_stateful({test_in_1, test_state_0, test_state_1}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); /// state_reset = false result = model.predict_stateful({test_in_0, test_state_0, test_state_1}); vec_append(all_results, *result[0].as_vector()); @@ -224,10 +234,10 @@ int main() /// state_reset = true result = model.predict({test_in_0}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); result = model.predict({test_in_1}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); /// state_reset = false result = model.predict({test_in_0}); vec_append(all_results, *result[0].as_vector()); @@ -239,10 +249,10 @@ int main() /// state_reset = true result = model.predict({test_in_0, test_state_0, test_state_1, test_state_2, test_state_3}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); result = model.predict({test_in_1, test_state_0, test_state_1, test_state_2, test_state_3}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); /// state_reset = false result = model.predict({test_in_0, test_state_0, test_state_1, test_state_2, test_state_3}); vec_append(all_results, 
*result[0].as_vector()); @@ -254,10 +264,10 @@ int main() /// state_reset = true result = model.predict_stateful({test_in_0}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); result = model.predict_stateful({test_in_1}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); /// state_reset = false result = model.predict_stateful({test_in_0}); vec_append(all_results, *result[0].as_vector()); @@ -265,21 +275,21 @@ int main() vec_append(all_results, *result[0].as_vector()); // *********** TEST 16: "bidi-LSTM_stateful_init_state.json" *********** - model = load_model("./models/bidi-LSTM_nonstateful_init_state.json"); + model = load_model("./models/bidi-LSTM_stateful_init_state.json"); /// state_reset = true result = model.predict_stateful({test_in_0, test_state_0, test_state_1, test_state_2, test_state_3}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); result = model.predict_stateful({test_in_1, test_state_0, test_state_1, test_state_2, test_state_3}); vec_append(all_results, *result[0].as_vector()); - if(TF_BIDI_BUG_FIXED) model.reset_states(); + if(TF_BIDI_STATE_RESET_WORKS) model.reset_states(); /// state_reset = false result = model.predict_stateful({test_in_0, test_state_0, test_state_1, test_state_2, test_state_3}); vec_append(all_results, *result[0].as_vector()); result = model.predict_stateful({test_in_1, test_state_0, test_state_1, test_state_2, test_state_3}); vec_append(all_results, *result[0].as_vector()); - #undef TF_BIDI_BUG_FIXED + #undef TF_BIDI_STATE_RESET_WORKS if(verbose){ std::cout << "\n\nOUTPUT ***" << std::endl; diff --git a/test/stateful_test/stateful_recurrent_tests.py b/test/stateful_test/stateful_recurrent_tests.py index 5d5defb0..d5940016 100644 
--- a/test/stateful_test/stateful_recurrent_tests.py +++ b/test/stateful_test/stateful_recurrent_tests.py @@ -1,3 +1,8 @@ +# to hide any GPUs. +# import os +# os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID' +# os.environ['CUDA_VISIBLE_DEVICES']='' + import errno import os import sys @@ -194,7 +199,8 @@ def main(): x_inf = np.asarray([2.1, -1.2, 3.14, 1.2, 1, 3, -2, 10], dtype=np.float32) # simple x_inf = x_inf.reshape((2, train_seq_length, 1)) - initial_states = np.asarray([1.1, -2.1, 2.7, 3.1, -2.5, 3.0, -2.0, -10.0], dtype=np.float32) + initial_states = np.asarray([40.1, -25.1, 34.7, 56.1, -62.5, 12.0, -33.0, -100.0], dtype=np.float32) + # initial_states = np.asarray([1.1, -2.1, 2.7, 3.1, -2.5, 3.0, -2.0, -10.0], dtype=np.float32) initial_states = initial_states.reshape((4, 1, 2)) model_file_names = [] @@ -253,6 +259,7 @@ def main(): print('********* FAILED !!!!!!!!!!!!\n\n') print('Keras: ', all_results[i], '\n') print('Frugally-deep: ', frugally_deep_results[i], '\n') + print('Difference: ', all_results[i] - frugally_deep_results[i], '\n') all_tests_passed = False if not all_tests_passed: @@ -260,6 +267,5 @@ def main(): sys.exit(errno.EIO) print('\n\nPassed all stateful tests') - if __name__ == "__main__": main() diff --git a/test/stateful_test/tf_behaivor_scripts/temp_bidi_no_state_in.py b/test/stateful_test/tf_behaivor_scripts/temp_bidi_no_state_in.py new file mode 100644 index 00000000..3ec648d5 --- /dev/null +++ b/test/stateful_test/tf_behaivor_scripts/temp_bidi_no_state_in.py @@ -0,0 +1,48 @@ +import os +os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID' +os.environ['CUDA_VISIBLE_DEVICES']='' + +import numpy as np +from tensorflow.keras.layers import Input, Dense, SimpleRNN, GRU, LSTM, Bidirectional +from tensorflow.keras.models import Model + +REC = LSTM + +sequence_length = 3 +feature_dim = 1 +features_in = Input(batch_shape=(1, sequence_length, feature_dim)) + +rnn_out = Bidirectional( REC(1, activation=None, use_bias=False, return_sequences=True, 
return_state=False, stateful=False))(features_in) +stateless_model = Model(inputs=[features_in], outputs=[rnn_out]) + +stateful_rnn_out = Bidirectional( REC(1, activation=None, use_bias=False, return_sequences=True, return_state=False, stateful=True))(features_in) +stateful_model = Model(inputs=features_in, outputs=stateful_rnn_out) + +stateful_model.set_weights( stateless_model.get_weights() ) + +x_in = np.random.normal(0,10,sequence_length) +x_in = x_in.reshape( (1, sequence_length, feature_dim) ) + +def print_bidi_out(non_stateful_out, stateful_out): + fb = ['FWD::', 'BWD::'] + + for i in range(2): + print(fb[i]) + print(f'non_stateful: {non_stateful_out.T[i]}') + print(f'stateful: {stateful_out.T[i]}') + print(f'delta: {stateful_out.T[i]-non_stateful_out.T[i]}') + + +non_stateful_out = stateless_model.predict(x_in).reshape((sequence_length,2)) +stateful_out = stateful_model.predict(x_in).reshape((sequence_length,2)) +print_bidi_out(non_stateful_out, stateful_out) + +non_stateful_out = stateless_model.predict(x_in).reshape((sequence_length,2)) +stateful_out = stateful_model.predict(x_in).reshape((sequence_length,2)) +print_bidi_out(non_stateful_out, stateful_out) + +print('\n** RESETING STATES in STATEFUL MODEL **\n') +stateful_model.reset_states() +non_stateful_out = stateless_model.predict(x_in).reshape((sequence_length,2)) +stateful_out = stateful_model.predict(x_in).reshape((sequence_length,2)) +print_bidi_out(non_stateful_out, stateful_out) diff --git a/test/stateful_test/tf_behaivor_scripts/temp_bidi_state_in.cpp b/test/stateful_test/tf_behaivor_scripts/temp_bidi_state_in.cpp new file mode 100644 index 00000000..3e266a9f --- /dev/null +++ b/test/stateful_test/tf_behaivor_scripts/temp_bidi_state_in.cpp @@ -0,0 +1,72 @@ +#include "fdeep/fdeep.hpp" +#include + +using namespace fdeep; + +int main() +{ + + // x_in = np.random.normal(0,10,sequence_length) + // x_in = np.asarray([1,0,0]) + // x_in = x_in.reshape( (1, sequence_length, feature_dim) ) + + // 
fwd_initial_h = np.asarray(2.75).reshape(1,1) + // fwd_initial_c = np.asarray(1.3).reshape(1,1) + // bwd_initial_h = np.asarray(-2.0).reshape(1,1) + // bwd_initial_c = np.asarray(-1.2).reshape(1,1) + + const std::vector x_inf_0 = {1.0, 0.0, 0.0}; + const std::vector state_0 = {2.75}; + const std::vector state_1 = {1.3}; + const std::vector state_2 = {-2.0}; + const std::vector state_3 = {-1.2}; + + const shared_float_vec xt0(fplus::make_shared_ref(x_inf_0)); + const shared_float_vec st0(fplus::make_shared_ref(state_0)); + const shared_float_vec st1(fplus::make_shared_ref(state_1)); + const shared_float_vec st2(fplus::make_shared_ref(state_2)); + const shared_float_vec st3(fplus::make_shared_ref(state_3)); + + const tensor test_in_0(tensor_shape(3, 1), xt0); + const tensor test_state_0(tensor_shape(static_cast(1)), st0); + const tensor test_state_1(tensor_shape(static_cast(1)), st1); + const tensor test_state_2(tensor_shape(static_cast(1)), st2); + const tensor test_state_3(tensor_shape(static_cast(1)), st3); + + + std::cout << "loading models" << std::endl; + auto stateful_model = load_model("temp_stateful.json"); + auto stateless_model = load_model("temp_stateless.json"); + + // input for GRU: {test_in_0, test_state_0, test_state_2}; + // input for LSTM: {test_in_0, test_state_0, test_state_1, test_state_2, test_state_3} + + // A + std::cout << "starting A" << std::endl; + auto non_stateful_out = stateless_model.predict({test_in_0, test_state_0, test_state_1, test_state_2, test_state_3}); + auto stateful_out = stateful_model.predict_stateful({test_in_0, test_state_0, test_state_1, test_state_2, test_state_3}); + std::cout << "Non-Stateful" << std::endl; + std::cout << fdeep::show_tensors(non_stateful_out) << std::endl; + std::cout << "Stateful" << std::endl; + std::cout << fdeep::show_tensors(stateful_out) << std::endl; + + // B + std::cout << "starting B" << std::endl; + non_stateful_out = stateless_model.predict({test_in_0, test_state_0, test_state_1, 
test_state_2, test_state_3}); + stateful_out = stateful_model.predict_stateful({test_in_0, test_state_0, test_state_1, test_state_2, test_state_3}); + std::cout << "Non-Stateful" << std::endl; + std::cout << fdeep::show_tensors(non_stateful_out) << std::endl; + std::cout << "Stateful" << std::endl; + std::cout << fdeep::show_tensors(stateful_out) << std::endl; + + // C + std::cout << "starting C" << std::endl; + // stateful_model.reset_states(); + non_stateful_out = stateless_model.predict({test_in_0, test_state_0, test_state_1, test_state_2, test_state_3}); + stateful_out = stateful_model.predict_stateful({test_in_0, test_state_0, test_state_1, test_state_2, test_state_3}); + std::cout << "Non-Stateful" << std::endl; + std::cout << fdeep::show_tensors(non_stateful_out) << std::endl; + std::cout << "Stateful" << std::endl; + std::cout << fdeep::show_tensors(stateful_out) << std::endl; +} + diff --git a/test/stateful_test/tf_behaivor_scripts/temp_bidi_state_in.py b/test/stateful_test/tf_behaivor_scripts/temp_bidi_state_in.py new file mode 100644 index 00000000..2d51b5f0 --- /dev/null +++ b/test/stateful_test/tf_behaivor_scripts/temp_bidi_state_in.py @@ -0,0 +1,90 @@ +import os +os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID' +os.environ['CUDA_VISIBLE_DEVICES']='' + +import numpy as np +from tensorflow.keras.layers import Input, Dense, SimpleRNN, GRU, LSTM, Bidirectional +from tensorflow.keras.models import Model + +REC = LSTM + +sequence_length = 3 +feature_dim = 1 +features_in = Input(batch_shape=(1, sequence_length, feature_dim)) +state_h_fwd_in = Input(batch_shape=(1, 1)) +state_h_bwd_in = Input(batch_shape=(1, 1)) +state_c_fwd_in = Input(batch_shape=(1, 1)) +state_c_bwd_in = Input(batch_shape=(1, 1)) + +four_state_shape = [state_h_fwd_in, state_c_fwd_in, state_h_bwd_in, state_c_bwd_in] +two_state_shape = [state_h_fwd_in, state_h_bwd_in] + +if REC == LSTM: + rnn_out = Bidirectional( REC(1, activation='linear', use_bias=False, return_sequences=True, 
return_state=False, stateful=False))(features_in, initial_state=four_state_shape) + stateful_rnn_out = Bidirectional( REC(1, activation='linear', use_bias=False, return_sequences=True, return_state=False, stateful=True))(features_in, initial_state=four_state_shape) + rnn_inputs = [features_in, state_h_fwd_in, state_c_fwd_in, state_h_bwd_in, state_c_bwd_in] +else: + if REC == SimpleRNN: + rnn_out = Bidirectional( REC(1, activation='linear', use_bias=False, return_sequences=True, return_state=False, stateful=False))(features_in, initial_state=two_state_shape) + stateful_rnn_out = Bidirectional( REC(1, activation='linear', use_bias=False, return_sequences=True, return_state=False, stateful=True))(features_in, initial_state=two_state_shape) + else: + rnn_out = Bidirectional( REC(1, activation='linear', use_bias=False, return_sequences=True, return_state=False, stateful=False))(features_in, initial_state=two_state_shape) + stateful_rnn_out = Bidirectional( REC(1, activation='linear', use_bias=False, return_sequences=True, return_state=False, stateful=True))(features_in, initial_state=two_state_shape) + rnn_inputs = [features_in, state_h_fwd_in, state_h_bwd_in] + +stateless_model = Model(inputs=rnn_inputs, outputs=rnn_out) +stateful_model = Model(inputs=rnn_inputs, outputs=stateful_rnn_out) + + +# toy_weights = [np.asarray([[ 1.0]], dtype=np.float32), np.asarray([[0.5 ]], dtype=np.float32), np.asarray([[ -1.0 ]], dtype=np.float32), np.asarray([[ -0.5 ]], dtype=np.float32)] +# stateless_model.set_weights(toy_weights) +# stateful_model.set_weights(toy_weights) + +stateful_model.set_weights( stateless_model.get_weights() ) + +stateful_model.save('temp_stateful.h5') +stateless_model.save('temp_stateless.h5') + +x_in = np.random.normal(0,10,sequence_length) +x_in = np.asarray([1,0,0]) +x_in = x_in.reshape( (1, sequence_length, feature_dim) ) + +fwd_initial_h = np.asarray(2.75).reshape(1,1) +fwd_initial_c = np.asarray(1.3).reshape(1,1) +bwd_initial_h = 
np.asarray(-2.0).reshape(1,1) +bwd_initial_c = np.asarray(-1.2).reshape(1,1) + +# fwd_initial_h = np.asarray(np.random.normal(0,10)).reshape(1,1) +# fwd_initial_h = np.asarray(np.random.normal(0,10)).reshape(1,1) +# bwd_initial_h = np.asarray(np.random.normal(0,10)).reshape(1,1) +# fwd_initial_c = np.asarray(np.random.normal(0,10)).reshape(1,1) +# bwd_initial_c = np.asarray(np.random.normal(0,10)).reshape(1,1) + +if REC == LSTM: + rnn_input = [x_in, fwd_initial_h, fwd_initial_c, bwd_initial_h, bwd_initial_c] +else: + rnn_input = [x_in, fwd_initial_h, bwd_initial_h] + + +def print_bidi_out(non_stateful_out, stateful_out): + fb = ['FWD::', 'BWD::'] + + for i in range(2): + print(fb[i]) + print(f'non_stateful: {non_stateful_out.T[i]}') + print(f'stateful: {stateful_out.T[i]}') + print(f'delta: {stateful_out.T[i]-non_stateful_out.T[i]}') + +non_stateful_out = stateless_model.predict(rnn_input).reshape((sequence_length,2)) +stateful_out = stateful_model.predict(rnn_input).reshape((sequence_length,2)) +print_bidi_out(non_stateful_out, stateful_out) + +non_stateful_out = stateless_model.predict(rnn_input).reshape((sequence_length,2)) +stateful_out = stateful_model.predict(rnn_input).reshape((sequence_length,2)) +print_bidi_out(non_stateful_out, stateful_out) + +print('\n** RESETING STATES in STATEFUL MODEL **\n') +stateful_model.reset_states() +non_stateful_out = stateless_model.predict(rnn_input).reshape((sequence_length,2)) +stateful_out = stateful_model.predict(rnn_input).reshape((sequence_length,2)) +print_bidi_out(non_stateful_out, stateful_out) diff --git a/test/stateful_test/tf_behaivor_scripts/temp_rnn_test.cpp b/test/stateful_test/tf_behaivor_scripts/temp_rnn_test.cpp new file mode 100644 index 00000000..5e627b83 --- /dev/null +++ b/test/stateful_test/tf_behaivor_scripts/temp_rnn_test.cpp @@ -0,0 +1,57 @@ +#include "fdeep/fdeep.hpp" +#include + +using namespace fdeep; + +int main() +{ + const std::vector x_inf_0 = {1.0, 0.0, 0.0}; + const std::vector state_0 = 
{10.0}; + + const shared_float_vec xt0(fplus::make_shared_ref(x_inf_0)); + const shared_float_vec st0(fplus::make_shared_ref(state_0)); + + std::cout << "convert to tensors" << std::endl; + const tensor test_in_0(tensor_shape(3, 1), xt0); + std::cout << "convert to tensors" << std::endl; + const tensor test_state_0(tensor_shape(static_cast(1)), st0); + + std::cout << "loading models" << std::endl; + auto stateful_model = load_model("temp_stateful.json"); + auto stateless_model = load_model("temp_stateless.json"); + + // A + std::cout << "starting A" << std::endl; + auto non_stateful_out = stateless_model.predict({test_in_0, test_state_0}); + auto stateful_out = stateful_model.predict_stateful({test_in_0, test_state_0}); + std::cout << "Non-Stateful" << std::endl; + std::cout << fdeep::show_tensors(non_stateful_out) << std::endl; + std::cout << "Stateful" << std::endl; + std::cout << fdeep::show_tensors(stateful_out) << std::endl; + + // B + non_stateful_out = stateless_model.predict({test_in_0, test_state_0}); + stateful_out = stateful_model.predict_stateful({test_in_0, test_state_0}); + std::cout << "Non-Stateful" << std::endl; + std::cout << fdeep::show_tensors(non_stateful_out) << std::endl; + std::cout << "Stateful" << std::endl; + std::cout << fdeep::show_tensors(stateful_out) << std::endl; + + // C + std::cout << "** RESETTING STATES in STATEFUL MODEL **" << std::endl; + stateful_model.reset_states(); + non_stateful_out = stateless_model.predict({test_in_0, test_state_0}); + stateful_out = stateful_model.predict_stateful({test_in_0, test_state_0}); + std::cout << "Non-Stateful" << std::endl; + std::cout << fdeep::show_tensors(non_stateful_out) << std::endl; + std::cout << "Stateful" << std::endl; + std::cout << fdeep::show_tensors(stateful_out) << std::endl; + + //D + non_stateful_out = stateless_model.predict({test_in_0, test_state_0}); + stateful_out = stateful_model.predict_stateful({test_in_0, test_state_0}); + std::cout << "Non-Stateful" << std::endl; + 
std::cout << fdeep::show_tensors(non_stateful_out) << std::endl; + std::cout << "Stateful" << std::endl; + std::cout << fdeep::show_tensors(stateful_out) << std::endl; +} \ No newline at end of file diff --git a/test/stateful_test/tf_behaivor_scripts/temp_rnn_test.py b/test/stateful_test/tf_behaivor_scripts/temp_rnn_test.py new file mode 100644 index 00000000..acf9dedf --- /dev/null +++ b/test/stateful_test/tf_behaivor_scripts/temp_rnn_test.py @@ -0,0 +1,68 @@ +import os +os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID' +os.environ['CUDA_VISIBLE_DEVICES']='' + +import numpy as np +from tensorflow.keras.layers import Input, Dense, SimpleRNN, GRU, LSTM, Bidirectional +from tensorflow.keras.models import Model + +USE_TOY_WEIGHTS = True +REC_LAYER = GRU +sequence_length = 3 +feature_dim = 1 +features_in = Input(batch_shape=(1, sequence_length, feature_dim)) +state_h_in = Input(batch_shape=(1, 1)) + +rnn_out = REC_LAYER(1, activation=None, use_bias=False, return_sequences=True, return_state=False, stateful=False)(features_in, initial_state=state_h_in) +stateless_model = Model(inputs=[features_in, state_h_in], outputs=rnn_out) + +stateful_rnn_out = REC_LAYER(1, activation=None, use_bias=False, return_sequences=True, return_state=False, stateful=True)(features_in, initial_state=state_h_in) +stateful_model = Model(inputs=[features_in, state_h_in], outputs=stateful_rnn_out) + +if USE_TOY_WEIGHTS: + if REC_LAYER == SimpleRNN: + toy_weights = [ np.asarray([[1.0]], dtype=np.float32), np.asarray([[-0.5]], dtype=np.float32)] + + elif REC_LAYER == GRU: + # for a GRU, the first are the non-recurrent kernels W, and the second are the recurrent kernels U (V) + toy_weights = [np.asarray([[ 1.0, -2.0, 3.0 ]], dtype=np.float32), np.asarray([[ -0.5 , 2.0, -1.1 ]], dtype=np.float32)] + + stateless_model.set_weights(toy_weights) + stateful_model.set_weights(toy_weights) + +# w = stateless_model.get_weights() +# print(w) + +stateless_model.save('temp_stateless.h5', include_optimizer=False) 
+stateful_model.save('temp_stateful.h5', include_optimizer=False) + +x_in = np.zeros(sequence_length) +x_in[0] = 1 +x_in = x_in.reshape( (1, sequence_length, feature_dim) ) +initial_state = np.asarray( [10]) +initial_state = initial_state.reshape((1,1)) + +def print_rnn_out(non_stateful_out, stateful_out): + fb = ['FWD::', 'BWD::'] + + print(f'non_stateful: {non_stateful_out}') + print(f'stateful: {stateful_out}') + print(f'delta: {stateful_out-non_stateful_out}') + +non_stateful_out = stateless_model.predict([x_in, initial_state]).reshape((sequence_length)) +stateful_out = stateful_model.predict([x_in, initial_state]).reshape((sequence_length)) +print_rnn_out(non_stateful_out, stateful_out) + +non_stateful_out = stateless_model.predict([x_in, initial_state]).reshape((sequence_length)) +stateful_out = stateful_model.predict([x_in, initial_state]).reshape((sequence_length)) +print_rnn_out(non_stateful_out, stateful_out) + +print('\n** RESETTING STATES in STATEFUL MODEL **\n') +stateful_model.reset_states() +non_stateful_out = stateless_model.predict([x_in, initial_state]).reshape((sequence_length)) +stateful_out = stateful_model.predict([x_in, initial_state]).reshape((sequence_length)) +print_rnn_out(non_stateful_out, stateful_out) + +non_stateful_out = stateless_model.predict([x_in, initial_state]).reshape((sequence_length)) +stateful_out = stateful_model.predict([x_in, initial_state]).reshape((sequence_length)) +print_rnn_out(non_stateful_out, stateful_out)