From a6700c3c0e3f0a593b82d7d185b211f95511b038 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Mon, 9 Dec 2024 18:24:44 +0100 Subject: [PATCH] changes for Flux v0.15 (#550) --- GNNGraphs/Project.toml | 4 +- GNNGraphs/src/GNNGraphs.jl | 2 - GNNGraphs/src/datastore.jl | 2 - GNNGraphs/src/gnngraph.jl | 2 - .../src/gnnheterograph/gnnheterograph.jl | 2 - GNNGraphs/src/temporalsnapshotsgnngraph.jl | 2 - GNNGraphs/test/runtests.jl | 2 +- GNNLux/Project.toml | 6 +- GNNLux/docs/make.jl | 8 +- GNNLux/src/layers/conv.jl | 2 +- GNNLux/test/runtests.jl | 8 + GNNlib/Project.toml | 6 +- GNNlib/docs/make.jl | 6 +- GNNlib/src/layers/conv.jl | 2 +- GNNlib/src/layers/pool.jl | 5 +- GNNlib/test/runtests.jl | 7 + GraphNeuralNetworks/Project.toml | 8 +- GraphNeuralNetworks/docs/make.jl | 8 +- GraphNeuralNetworks/src/layers/pool.jl | 21 +- .../src/layers/temporalconv.jl | 1118 ++++++++--------- GraphNeuralNetworks/test/Project.toml | 2 + GraphNeuralNetworks/test/layers/pool.jl | 2 +- .../test/layers/temporalconv.jl | 167 +-- GraphNeuralNetworks/test/runtests.jl | 8 + GraphNeuralNetworks/test/test_module.jl | 2 +- 25 files changed, 689 insertions(+), 713 deletions(-) diff --git a/GNNGraphs/Project.toml b/GNNGraphs/Project.toml index ed8548d79..3b5f6c6d4 100644 --- a/GNNGraphs/Project.toml +++ b/GNNGraphs/Project.toml @@ -1,7 +1,7 @@ name = "GNNGraphs" uuid = "aed8fd31-079b-4b5a-b342-a13352159b8c" authors = ["Carlo Lucibello and contributors"] -version = "1.3.1" +version = "1.4.0-DEV" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" @@ -31,7 +31,7 @@ GNNGraphsSimpleWeightedGraphsExt = "SimpleWeightedGraphs" Adapt = "4" CUDA = "5" ChainRulesCore = "1" -Functors = "0.4.1, 0.5" +Functors = "0.5" Graphs = "1.4" KrylovKit = "0.8" LinearAlgebra = "1" diff --git a/GNNGraphs/src/GNNGraphs.jl b/GNNGraphs/src/GNNGraphs.jl index e6a79407f..736af601a 100644 --- a/GNNGraphs/src/GNNGraphs.jl +++ b/GNNGraphs/src/GNNGraphs.jl @@ -1,7 +1,6 @@ module GNNGraphs using SparseArrays -using Functors: @functor import Graphs using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree, has_self_loops, is_directed, induced_subgraph, has_edge @@ -13,7 +12,6 @@ using ChainRulesCore using LinearAlgebra, Random, Statistics import MLUtils using MLUtils: getobs, numobs, ones_like, zeros_like, chunk, batch, rand_like -import Functors using MLDataDevices: get_device, cpu_device, CPUDevice include("chainrules.jl") # hacks for differentiability diff --git a/GNNGraphs/src/datastore.jl b/GNNGraphs/src/datastore.jl index 7bc1bd29b..b38ef0acc 100644 --- a/GNNGraphs/src/datastore.jl +++ b/GNNGraphs/src/datastore.jl @@ -70,8 +70,6 @@ struct DataStore end end -@functor DataStore - DataStore(data) = DataStore(-1, data) DataStore(n::Int, data::NamedTuple) = DataStore(n, Dict{Symbol, Any}(pairs(data))) DataStore(n::Int, data) = DataStore(n, Dict{Symbol, Any}(data)) diff --git a/GNNGraphs/src/gnngraph.jl b/GNNGraphs/src/gnngraph.jl index c19483332..9934229f7 100644 --- a/GNNGraphs/src/gnngraph.jl +++ b/GNNGraphs/src/gnngraph.jl @@ -116,8 +116,6 @@ struct GNNGraph{T <: Union{COO_T, ADJMAT_T}} <: AbstractGNNGraph{T} gdata::DataStore end -@functor GNNGraph - function GNNGraph(data::D; num_nodes = nothing, graph_indicator = nothing, diff --git a/GNNGraphs/src/gnnheterograph/gnnheterograph.jl b/GNNGraphs/src/gnnheterograph/gnnheterograph.jl index 7e55e76e7..9d65cbe67 100644 --- a/GNNGraphs/src/gnnheterograph/gnnheterograph.jl +++ b/GNNGraphs/src/gnnheterograph/gnnheterograph.jl @@ -95,8 +95,6 @@ struct GNNHeteroGraph{T <: Union{COO_T, 
ADJMAT_T}} <: AbstractGNNGraph{T} etypes::Vector{EType} end -@functor GNNHeteroGraph - GNNHeteroGraph(data; kws...) = GNNHeteroGraph(Dict(data); kws...) GNNHeteroGraph(data::Pair...; kws...) = GNNHeteroGraph(Dict(data...); kws...) diff --git a/GNNGraphs/src/temporalsnapshotsgnngraph.jl b/GNNGraphs/src/temporalsnapshotsgnngraph.jl index 362689650..53983e4c2 100644 --- a/GNNGraphs/src/temporalsnapshotsgnngraph.jl +++ b/GNNGraphs/src/temporalsnapshotsgnngraph.jl @@ -240,5 +240,3 @@ function print_feature_t(io::IO, feature) print(io, "no") end end - -@functor TemporalSnapshotsGNNGraph diff --git a/GNNGraphs/test/runtests.jl b/GNNGraphs/test/runtests.jl index 147d3fccc..1dbb84bc2 100644 --- a/GNNGraphs/test/runtests.jl +++ b/GNNGraphs/test/runtests.jl @@ -1,7 +1,7 @@ using CUDA, cuDNN using GNNGraphs using GNNGraphs: getn, getdata -using Functors +using Functors: Functors using LinearAlgebra, Statistics, Random using NNlib import MLUtils diff --git a/GNNLux/Project.toml b/GNNLux/Project.toml index c3d2689d0..eac8b5468 100644 --- a/GNNLux/Project.toml +++ b/GNNLux/Project.toml @@ -1,7 +1,7 @@ name = "GNNLux" uuid = "e8545f4d-a905-48ac-a8c4-ca114b98986d" authors = ["Carlo Lucibello and contributors"] -version = "0.1.1" +version = "0.2.0-DEV" [deps] ConcreteStructs = "2569d6c7-a4a2-43d3-a901-331e8e4be471" @@ -17,8 +17,8 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] ConcreteStructs = "0.2.3" -GNNGraphs = "1.3" -GNNlib = "0.2.3" +GNNGraphs = "1.4" +GNNlib = "1" Lux = "1" LuxCore = "1" NNlib = "0.9.21" diff --git a/GNNLux/docs/make.jl b/GNNLux/docs/make.jl index d1f0bbadb..feae0f3c5 100644 --- a/GNNLux/docs/make.jl +++ b/GNNLux/docs/make.jl @@ -1,8 +1,10 @@ using Pkg Pkg.activate(@__DIR__) -Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNGraphs")) -Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNlib")) -Pkg.develop(path=joinpath(@__DIR__, "..")) +Pkg.develop([ + PackageSpec(path=joinpath(@__DIR__, "..", "..", "GNNGraphs")), + PackageSpec(path=joinpath(@__DIR__, "..", "..", "GNNlib")), + PackageSpec(path=joinpath(@__DIR__, "..")), +]) Pkg.instantiate() using Documenter diff --git a/GNNLux/src/layers/conv.jl b/GNNLux/src/layers/conv.jl index 2587b130d..f92dd1ec6 100644 --- a/GNNLux/src/layers/conv.jl +++ b/GNNLux/src/layers/conv.jl @@ -1261,7 +1261,7 @@ LuxCore.parameterlength(l::GatedGraphConv) = parameterlength(l.gru) + l.dims^2*l function (l::GatedGraphConv)(g, x, ps, st) gru = StatefulLuxLayer{true}(l.gru, ps.gru, _getstate(st, :gru)) - fgru = (h, x) -> gru((x, (h,))) # make the forward compatible with Flux.GRUCell style + fgru = (x, h) -> gru((x, (h,)))[1] # make the forward compatible with Flux.GRUCell style m = (; gru=fgru, ps.weight, l.num_layers, l.aggr, l.dims) return GNNlib.gated_graph_conv(m, g, x), st end diff --git a/GNNLux/test/runtests.jl b/GNNLux/test/runtests.jl index 91f74105f..0483eff61 100644 --- a/GNNLux/test/runtests.jl +++ b/GNNLux/test/runtests.jl @@ -1,3 +1,11 @@ +## The test environment is instantiated as follows: +# using Pkg +# Pkg.activate(@__DIR__) +# Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNGraphs")) +# Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNlib")) +# Pkg.develop(path=joinpath(@__DIR__, "..")) +# Pkg.instantiate() + using TestItemRunner ## See https://www.julia-vscode.org/docs/stable/userguide/testitems/ diff --git a/GNNlib/Project.toml b/GNNlib/Project.toml index 985fb52bd..5bb541eab 100644 --- a/GNNlib/Project.toml +++ b/GNNlib/Project.toml @@ -1,7 +1,7 @@ name = "GNNlib" uuid = "a6a84749-d869-43f8-aacc-be26a1996e48" 
authors = ["Carlo Lucibello and contributors"] -version = "0.2.5" +version = "1.0.0-DEV" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" @@ -28,10 +28,10 @@ GNNlibCUDAExt = "CUDA" [compat] AMDGPU = "1" -CUDA = "4, 5" +CUDA = "5" ChainRulesCore = "1.24" DataStructures = "0.18" -GNNGraphs = "1.0" +GNNGraphs = "1.4" GPUArraysCore = "0.1" LinearAlgebra = "1" MLUtils = "0.4" diff --git a/GNNlib/docs/make.jl b/GNNlib/docs/make.jl index 0141ad809..bcf6aa352 100644 --- a/GNNlib/docs/make.jl +++ b/GNNlib/docs/make.jl @@ -1,7 +1,9 @@ using Pkg Pkg.activate(@__DIR__) -Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNGraphs")) -Pkg.develop(path=joinpath(@__DIR__, "..")) +Pkg.develop([ + PackageSpec(path=joinpath(@__DIR__, "..", "..", "GNNGraphs")), + PackageSpec(path=joinpath(@__DIR__, "..")), +]) Pkg.instantiate() using Documenter diff --git a/GNNlib/src/layers/conv.jl b/GNNlib/src/layers/conv.jl index 8b378cbdc..fe7c27d9c 100644 --- a/GNNlib/src/layers/conv.jl +++ b/GNNlib/src/layers/conv.jl @@ -228,7 +228,7 @@ function gated_graph_conv(l, g::GNNGraph, x::AbstractMatrix) m = view(l.weight, :, :, i) * h m = propagate(copy_xj, g, l.aggr; xj = m) # in gru forward, hidden state is first argument, input is second - h, _ = l.gru(h, m) + h = l.gru(m, h) end return h end diff --git a/GNNlib/src/layers/pool.jl b/GNNlib/src/layers/pool.jl index 4a6735a06..991e18465 100644 --- a/GNNlib/src/layers/pool.jl +++ b/GNNlib/src/layers/pool.jl @@ -29,8 +29,11 @@ topk_index(y::Adjoint, k::Int) = topk_index(y', k) function set2set_pool(l, g::GNNGraph, x::AbstractMatrix) n_in = size(x, 1) qstar = zeros_like(x, (2*n_in, g.num_graphs)) + h = zeros_like(l.lstm.Wh, size(l.lstm.Wh, 2)) + c = zeros_like(l.lstm.Wh, size(l.lstm.Wh, 2)) for t in 1:l.num_iters - q = l.lstm(qstar) # [n_in, n_graphs] + h, c = l.lstm(qstar, (h, c)) # [n_in, n_graphs] + q = h qn = broadcast_nodes(g, q) # [n_in, n_nodes] α = softmax_nodes(g, sum(qn .* x, dims = 1)) # [1, n_nodes] r = reduce_nodes(+, g, x .* α) # [n_in, n_graphs] diff --git a/GNNlib/test/runtests.jl b/GNNlib/test/runtests.jl index d420f5a6c..25f2ca5aa 100644 --- a/GNNlib/test/runtests.jl +++ b/GNNlib/test/runtests.jl @@ -1,3 +1,10 @@ +## The test environment is instantiated as follows: +# using Pkg +# Pkg.activate(@__DIR__) +# Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNGraphs")) +# Pkg.develop(path=joinpath(@__DIR__, "..")) +# Pkg.instantiate() + using TestItemRunner ## See https://www.julia-vscode.org/docs/stable/userguide/testitems/ diff --git a/GraphNeuralNetworks/Project.toml b/GraphNeuralNetworks/Project.toml index 1467b9b43..b659338a8 100644 --- a/GraphNeuralNetworks/Project.toml +++ b/GraphNeuralNetworks/Project.toml @@ -1,7 +1,7 @@ name = "GraphNeuralNetworks" uuid = "cffab07f-9bc2-4db1-8861-388f63bf7694" authors = ["Carlo Lucibello and contributors"] -version = "0.6.23" +version = "1.0.0-DEV" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" @@ -18,9 +18,9 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] ChainRulesCore = "1" -Flux = "0.14" -GNNGraphs = "1.0" -GNNlib = "0.2" +Flux = "0.15" +GNNGraphs = "1.4" +GNNlib = "1" LinearAlgebra = "1" MLUtils = "0.4" MacroTools = "0.5" diff --git a/GraphNeuralNetworks/docs/make.jl b/GraphNeuralNetworks/docs/make.jl index a5ff601ff..fd41f333d 100644 --- a/GraphNeuralNetworks/docs/make.jl +++ b/GraphNeuralNetworks/docs/make.jl @@ -1,8 +1,10 @@ using Pkg Pkg.activate(@__DIR__) -Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNGraphs")) -Pkg.develop(path=joinpath(@__DIR__, "..", "..", 
"GNNlib")) -Pkg.develop(path=joinpath(@__DIR__, "..")) +Pkg.develop([ + PackageSpec(path=joinpath(@__DIR__, "..", "..", "GNNGraphs")), + PackageSpec(path=joinpath(@__DIR__, "..", "..", "GNNlib")), + PackageSpec(path=joinpath(@__DIR__, "..")), +]) Pkg.instantiate() using Documenter diff --git a/GraphNeuralNetworks/src/layers/pool.jl b/GraphNeuralNetworks/src/layers/pool.jl index 59164e199..493ef6715 100644 --- a/GraphNeuralNetworks/src/layers/pool.jl +++ b/GraphNeuralNetworks/src/layers/pool.jl @@ -149,24 +149,19 @@ end Flux.@layer Set2Set function Set2Set(n_in::Int, n_iters::Int, n_layers::Int = 1) - @assert n_layers >= 1 + @assert n_layers == 1 "multiple layers not implemented yet" #TODO n_out = 2 * n_in - - if n_layers == 1 - lstm = LSTM(n_out => n_in) - else - layers = [LSTM(n_out => n_in)] - for _ in 2:n_layers - push!(layers, LSTM(n_in => n_in)) - end - lstm = Chain(layers...) - end - + lstm = LSTMCell(n_out => n_in) return Set2Set(lstm, n_iters) end +function initialstates(cell::LSTMCell) + h = zeros_like(cell.Wh, size(cell.Wh, 2)) + c = zeros_like(cell.Wh, size(cell.Wh, 2)) + return h, c +end + function (l::Set2Set)(g, x) - Flux.reset!(l.lstm) return GNNlib.set2set_pool(l, g, x) end diff --git a/GraphNeuralNetworks/src/layers/temporalconv.jl b/GraphNeuralNetworks/src/layers/temporalconv.jl index 2f6292f28..67e85356d 100644 --- a/GraphNeuralNetworks/src/layers/temporalconv.jl +++ b/GraphNeuralNetworks/src/layers/temporalconv.jl @@ -1,21 +1,8 @@ -# Adapting Flux.Recur to work with GNNGraphs -function (m::Flux.Recur)(g::GNNGraph, x) - m.state, y = m.cell(m.state, g, x) - return y -end - -function (m::Flux.Recur)(g::GNNGraph, x::AbstractArray{T, 3}) where T - h = [m(g, x_t) for x_t in Flux.eachlastdim(x)] - sze = size(h[1]) - reshape(reduce(hcat, h), sze[1], sze[2], length(h)) -end - -struct TGCNCell <: GNNLayer - conv::GCNConv - gru::Flux.GRUv3Cell - state0 - in::Int - out::Int +struct TGCNCell{C,G} <: GNNLayer + conv::C + gru::G + din::Int + dout::Int end Flux.@layer TGCNCell @@ -23,29 +10,29 @@ Flux.@layer TGCNCell function TGCNCell(ch::Pair{Int, Int}; bias::Bool = true, init = Flux.glorot_uniform, - init_state = Flux.zeros32, - add_self_loops = false, - use_edge_weight = true) - in, out = ch - conv = GCNConv(in => out, sigmoid; init, bias, add_self_loops, - use_edge_weight) - gru = Flux.GRUv3Cell(out, out) - state0 = init_state(out,1) - return TGCNCell(conv, gru, state0, in,out) + add_self_loops = false) + din, dout = ch + conv = GCNConv(din => dout, sigmoid; init, bias, add_self_loops) + gru = GRUCell(dout => dout) + return TGCNCell(conv, gru, din, dout) end -function (tgcn::TGCNCell)(h, g::GNNGraph, x::AbstractArray) - x̃ = tgcn.conv(g, x) - h, x̃ = tgcn.gru(h, x̃) - return h, x̃ +initialstates(cell::GRUCell) = zeros_like(cell.Wh, size(cell.Wh, 2)) +initialstates(cell::TGCNCell) = initialstates(cell.gru) +(cell::TGCNCell)(g::GNNGraph, x::AbstractVecOrMat) = cell(g, x, initialstates(cell)) + +function (cell::TGCNCell)(g::GNNGraph, x::AbstractVecOrMat, h::AbstractVecOrMat) + x = cell.conv(g, x) + h = cell.gru(x, h) + return h end -function Base.show(io::IO, tgcn::TGCNCell) - print(io, "TGCNCell($(tgcn.in) => $(tgcn.out))") +function Base.show(io::IO, cell::TGCNCell) + print(io, "TGCNCell($(cell.din) => $(cell.dout))") end """ - TGCN(in => out; [bias, init, init_state, add_self_loops, use_edge_weight]) + TGCN(din => dout; [bias, init, add_self_loops]) Temporal Graph Convolutional Network (T-GCN) recurrent layer from the paper [T-GCN: A Temporal Graph Convolutional Network for Traffic 
Prediction](https://arxiv.org/pdf/1811.05320.pdf). @@ -53,578 +40,539 @@ Performs a layer of GCNConv to model spatial dependencies, followed by a Gated R # Arguments -- `in`: Number of input features. -- `out`: Number of output features. -- `bias`: Add learnable bias. Default `true`. -- `init`: Weights' initializer. Default `glorot_uniform`. -- `init_state`: Initial state of the hidden stat of the GRU layer. Default `zeros32`. -- `add_self_loops`: Add self loops to the graph before performing the convolution. Default `false`. -- `use_edge_weight`: If `true`, consider the edge weights in the input graph (if available). - If `add_self_loops=true` the new weights will be set to 1. - This option is ignored if the `edge_weight` is explicitly provided in the forward pass. - Default `false`. -# Examples - -```jldoctest -julia> tgcn = TGCN(2 => 6) -Recur( - TGCNCell( - GCNConv(2 => 6, σ), # 18 parameters - GRUv3Cell(6 => 6), # 240 parameters - Float32[0.0; 0.0; … ; 0.0; 0.0;;], # 6 parameters (all zero) - 2, - 6, - ), -) # Total: 8 trainable arrays, 264 parameters, - # plus 1 non-trainable, 6 parameters, summarysize 1.492 KiB. - -julia> g, x = rand_graph(5, 10), rand(Float32, 2, 5); - -julia> y = tgcn(g, x); - -julia> size(y) -(6, 5) - -julia> Flux.reset!(tgcn); - -julia> tgcn(rand_graph(5, 10), rand(Float32, 2, 5, 20)) |> size # batch size of 20 -(6, 5, 20) -``` - -!!! warning "Batch size changes" - Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. -""" -TGCN(ch; kwargs...) = Flux.Recur(TGCNCell(ch; kwargs...)) - -Flux.Recur(tgcn::TGCNCell) = Flux.Recur(tgcn, tgcn.state0) - -# make TGCN compatible with GNNChain -(l::Flux.Recur{TGCNCell})(g::GNNGraph) = GNNGraph(g, ndata = l(g, node_features(g))) -_applylayer(l::Flux.Recur{TGCNCell}, g::GNNGraph, x) = l(g, x) -_applylayer(l::Flux.Recur{TGCNCell}, g::GNNGraph) = l(g) - - -""" - A3TGCN(in => out; [bias, init, init_state, add_self_loops, use_edge_weight]) - -Attention Temporal Graph Convolutional Network (A3T-GCN) model from the paper [A3T-GCN: Attention Temporal Graph -Convolutional Network for Traffic Forecasting](https://arxiv.org/pdf/2006.11583.pdf). - -Performs a TGCN layer, followed by a soft attention layer. - -# Arguments - -- `in`: Number of input features. -- `out`: Number of output features. +- `din`: Number of input features. +- `dout`: Number of output features. - `bias`: Add learnable bias. Default `true`. -- `init`: Weights' initializer. Default `glorot_uniform`. -- `init_state`: Initial state of the hidden stat of the GRU layer. Default `zeros32`. +- `init`: Convolution's weights initializer. Default `glorot_uniform`. - `add_self_loops`: Add self loops to the graph before performing the convolution. Default `false`. -- `use_edge_weight`: If `true`, consider the edge weights in the input graph (if available). - If `add_self_loops=true` the new weights will be set to 1. - This option is ignored if the `edge_weight` is explicitly provided in the forward pass. - Default `false`. -# Examples - -```jldoctest -julia> a3tgcn = A3TGCN(2 => 6) -A3TGCN(2 => 6) - -julia> g, x = rand_graph(5, 10), rand(Float32, 2, 5); - -julia> y = a3tgcn(g,x); - -julia> size(y) -(6, 5) - -julia> Flux.reset!(a3tgcn); - -julia> y = a3tgcn(rand_graph(5, 10), rand(Float32, 2, 5, 20)); - -julia> size(y) -(6, 5) -``` - -!!! warning "Batch size changes" - Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. 
-""" -struct A3TGCN <: GNNLayer - tgcn::Flux.Recur{TGCNCell} - dense1::Dense - dense2::Dense - in::Int - out::Int -end - -Flux.@layer A3TGCN - -function A3TGCN(ch::Pair{Int, Int}, - bias::Bool = true, - init = Flux.glorot_uniform, - init_state = Flux.zeros32, - add_self_loops = false, - use_edge_weight = true) - in, out = ch - tgcn = TGCN(in => out; bias, init, init_state, add_self_loops, use_edge_weight) - dense1 = Dense(out, out) - dense2 = Dense(out, out) - return A3TGCN(tgcn, dense1, dense2, in, out) -end - -function (a3tgcn::A3TGCN)(g::GNNGraph, x::AbstractArray) - h = a3tgcn.tgcn(g, x) - e = a3tgcn.dense1(h) - e = a3tgcn.dense2(e) - a = softmax(e, dims = 3) - c = sum(a .* h , dims = 3) - if length(size(c)) == 3 - c = dropdims(c, dims = 3) - end - return c -end - -function Base.show(io::IO, a3tgcn::A3TGCN) - print(io, "A3TGCN($(a3tgcn.in) => $(a3tgcn.out))") -end - -struct GConvGRUCell <: GNNLayer - conv_x_r::ChebConv - conv_h_r::ChebConv - conv_x_z::ChebConv - conv_h_z::ChebConv - conv_x_h::ChebConv - conv_h_h::ChebConv - k::Int - state0 - in::Int - out::Int -end - -Flux.@layer GConvGRUCell - -function GConvGRUCell(ch::Pair{Int, Int}, k::Int, n::Int; - bias::Bool = true, - init = Flux.glorot_uniform, - init_state = Flux.zeros32) - in, out = ch - # reset gate - conv_x_r = ChebConv(in => out, k; bias, init) - conv_h_r = ChebConv(out => out, k; bias, init) - # update gate - conv_x_z = ChebConv(in => out, k; bias, init) - conv_h_z = ChebConv(out => out, k; bias, init) - # new gate - conv_x_h = ChebConv(in => out, k; bias, init) - conv_h_h = ChebConv(out => out, k; bias, init) - state0 = init_state(out, n) - return GConvGRUCell(conv_x_r, conv_h_r, conv_x_z, conv_h_z, conv_x_h, conv_h_h, k, state0, in, out) -end - -function (ggru::GConvGRUCell)(h, g::GNNGraph, x) - r = ggru.conv_x_r(g, x) .+ ggru.conv_h_r(g, h) - r = Flux.sigmoid_fast(r) - z = ggru.conv_x_z(g, x) .+ ggru.conv_h_z(g, h) - z = Flux.sigmoid_fast(z) - h̃ = ggru.conv_x_h(g, x) .+ ggru.conv_h_h(g, r .* h) - h̃ = Flux.tanh_fast(h̃) - h = (1 .- z) .* h̃ .+ z .* h - return h, h -end - -function Base.show(io::IO, ggru::GConvGRUCell) - print(io, "GConvGRUCell($(ggru.in) => $(ggru.out))") -end - -""" - GConvGRU(in => out, k, n; [bias, init, init_state]) - -Graph Convolutional Gated Recurrent Unit (GConvGRU) recurrent layer from the paper [Structured Sequence Modeling with Graph Convolutional Recurrent Networks](https://arxiv.org/pdf/1612.07659). - -Performs a layer of ChebConv to model spatial dependencies, followed by a Gated Recurrent Unit (GRU) cell to model temporal dependencies. - -# Arguments - -- `in`: Number of input features. -- `out`: Number of output features. -- `k`: Chebyshev polynomial order. -- `n`: Number of nodes in the graph. -- `bias`: Add learnable bias. Default `true`. -- `init`: Weights' initializer. Default `glorot_uniform`. -- `init_state`: Initial state of the hidden stat of the GRU layer. Default `zeros32`. - -# Examples - -```jldoctest -julia> g1, x1 = rand_graph(5, 10), rand(Float32, 2, 5); - -julia> ggru = GConvGRU(2 => 5, 2, g1.num_nodes); - -julia> y = ggru(g1, x1); - -julia> size(y) -(5, 5) - -julia> g2, x2 = rand_graph(5, 10), rand(Float32, 2, 5, 30); - -julia> z = ggru(g2, x2); - -julia> size(z) -(5, 5, 30) -``` -""" -GConvGRU(ch, k, n; kwargs...) 
= Flux.Recur(GConvGRUCell(ch, k, n; kwargs...)) -Flux.Recur(ggru::GConvGRUCell) = Flux.Recur(ggru, ggru.state0) - -(l::Flux.Recur{GConvGRUCell})(g::GNNGraph) = GNNGraph(g, ndata = l(g, node_features(g))) -_applylayer(l::Flux.Recur{GConvGRUCell}, g::GNNGraph, x) = l(g, x) -_applylayer(l::Flux.Recur{GConvGRUCell}, g::GNNGraph) = l(g) - -struct GConvLSTMCell <: GNNLayer - conv_x_i::ChebConv - conv_h_i::ChebConv - w_i - b_i - conv_x_f::ChebConv - conv_h_f::ChebConv - w_f - b_f - conv_x_c::ChebConv - conv_h_c::ChebConv - w_c - b_c - conv_x_o::ChebConv - conv_h_o::ChebConv - w_o - b_o - k::Int - state0 - in::Int - out::Int -end - -Flux.@layer GConvLSTMCell - -function GConvLSTMCell(ch::Pair{Int, Int}, k::Int, n::Int; - bias::Bool = true, - init = Flux.glorot_uniform, - init_state = Flux.zeros32) - in, out = ch - # input gate - conv_x_i = ChebConv(in => out, k; bias, init) - conv_h_i = ChebConv(out => out, k; bias, init) - w_i = init(out, 1) - b_i = bias ? Flux.create_bias(w_i, true, out) : false - # forget gate - conv_x_f = ChebConv(in => out, k; bias, init) - conv_h_f = ChebConv(out => out, k; bias, init) - w_f = init(out, 1) - b_f = bias ? Flux.create_bias(w_f, true, out) : false - # cell state - conv_x_c = ChebConv(in => out, k; bias, init) - conv_h_c = ChebConv(out => out, k; bias, init) - w_c = init(out, 1) - b_c = bias ? Flux.create_bias(w_c, true, out) : false - # output gate - conv_x_o = ChebConv(in => out, k; bias, init) - conv_h_o = ChebConv(out => out, k; bias, init) - w_o = init(out, 1) - b_o = bias ? Flux.create_bias(w_o, true, out) : false - state0 = (init_state(out, n), init_state(out, n)) - return GConvLSTMCell(conv_x_i, conv_h_i, w_i, b_i, - conv_x_f, conv_h_f, w_f, b_f, - conv_x_c, conv_h_c, w_c, b_c, - conv_x_o, conv_h_o, w_o, b_o, - k, state0, in, out) -end - -function (gclstm::GConvLSTMCell)((h, c), g::GNNGraph, x) - # input gate - i = gclstm.conv_x_i(g, x) .+ gclstm.conv_h_i(g, h) .+ gclstm.w_i .* c .+ gclstm.b_i - i = Flux.sigmoid_fast(i) - # forget gate - f = gclstm.conv_x_f(g, x) .+ gclstm.conv_h_f(g, h) .+ gclstm.w_f .* c .+ gclstm.b_f - f = Flux.sigmoid_fast(f) - # cell state - c = f .* c .+ i .* Flux.tanh_fast(gclstm.conv_x_c(g, x) .+ gclstm.conv_h_c(g, h) .+ gclstm.w_c .* c .+ gclstm.b_c) - # output gate - o = gclstm.conv_x_o(g, x) .+ gclstm.conv_h_o(g, h) .+ gclstm.w_o .* c .+ gclstm.b_o - o = Flux.sigmoid_fast(o) - h = o .* Flux.tanh_fast(c) - return (h,c), h -end - -function Base.show(io::IO, gclstm::GConvLSTMCell) - print(io, "GConvLSTMCell($(gclstm.in) => $(gclstm.out))") -end -""" - GConvLSTM(in => out, k, n; [bias, init, init_state]) - -Graph Convolutional Long Short-Term Memory (GConvLSTM) recurrent layer from the paper [Structured Sequence Modeling with Graph Convolutional Recurrent Networks](https://arxiv.org/pdf/1612.07659). +# Forward -Performs a layer of ChebConv to model spatial dependencies, followed by a Long Short-Term Memory (LSTM) cell to model temporal dependencies. + tgcn(g::GNNGraph, x, [h]) -# Arguments - -- `in`: Number of input features. -- `out`: Number of output features. -- `k`: Chebyshev polynomial order. -- `n`: Number of nodes in the graph. -- `bias`: Add learnable bias. Default `true`. -- `init`: Weights' initializer. Default `glorot_uniform`. -- `init_state`: Initial state of the hidden stat of the LSTM layer. Default `zeros32`. +- `g`: The input graph. +- `x`: The input to the TGCN. It should be a matrix size `din x timesteps` or an array of size `din x timesteps x num_nodes`. +- `h`: The initial hidden state of the GRU cell. 
If given, it is a vector of size `out` or a matrix of size `dout x num_nodes`. + If not provided, it is assumed to be a vector of zeros. # Examples ```jldoctest -julia> g1, x1 = rand_graph(5, 10), rand(Float32, 2, 5); - -julia> gclstm = GConvLSTM(2 => 5, 2, g1.num_nodes); +julia> din, dout = 2, 3; -julia> y = gclstm(g1, x1); +julia> tgcn = TGCN(din => dout) +TGCN( + TGCNCell( + GCNConv(2 => 3, σ), # 9 parameters + GRUCell(3 => 3), # 63 parameters + ), +) # Total: 5 arrays, 72 parameters, 560 bytes. -julia> size(y) -(5, 5) +julia> num_nodes = 5; num_edges = 10; timesteps = 4; -julia> g2, x2 = rand_graph(5, 10), rand(Float32, 2, 5, 30); +julia> g = rand_graph(num_nodes, num_edges); -julia> z = gclstm(g2, x2); +julia> x = rand(Float32, din, timesteps, num_nodes); -julia> size(z) -(5, 5, 30) +julia> tgcn(g, x) |> size +(3, 4, 5) ``` """ -GConvLSTM(ch, k, n; kwargs...) = Flux.Recur(GConvLSTMCell(ch, k, n; kwargs...)) -Flux.Recur(tgcn::GConvLSTMCell) = Flux.Recur(tgcn, tgcn.state0) - -(l::Flux.Recur{GConvLSTMCell})(g::GNNGraph) = GNNGraph(g, ndata = l(g, node_features(g))) -_applylayer(l::Flux.Recur{GConvLSTMCell}, g::GNNGraph, x) = l(g, x) -_applylayer(l::Flux.Recur{GConvLSTMCell}, g::GNNGraph) = l(g) - -struct DCGRUCell - in::Int - out::Int - state0 - k::Int - dconv_u::DConv - dconv_r::DConv - dconv_c::DConv +struct TGCN{C<:TGCNCell} <: GNNLayer + cell::C end -Flux.@layer DCGRUCell - -function DCGRUCell(ch::Pair{Int,Int}, k::Int, n::Int; bias = true, init = glorot_uniform, init_state = Flux.zeros32) - in, out = ch - dconv_u = DConv((in + out) => out, k; bias=bias, init=init) - dconv_r = DConv((in + out) => out, k; bias=bias, init=init) - dconv_c = DConv((in + out) => out, k; bias=bias, init=init) - state0 = init_state(out, n) - return DCGRUCell(in, out, state0, k, dconv_u, dconv_r, dconv_c) -end - -function (dcgru::DCGRUCell)(h, g::GNNGraph, x) - h̃ = vcat(x, h) - z = dcgru.dconv_u(g, h̃) - z = NNlib.sigmoid_fast.(z) - r = dcgru.dconv_r(g, h̃) - r = NNlib.sigmoid_fast.(r) - ĥ = vcat(x, h .* r) - c = dcgru.dconv_c(g, ĥ) - c = tanh.(c) - h = z.* h + (1 .- z) .* c - return h, h -end - -function Base.show(io::IO, dcgru::DCGRUCell) - print(io, "DCGRUCell($(dcgru.in) => $(dcgru.out), $(dcgru.k))") -end - -""" - DCGRU(in => out, k, n; [bias, init, init_state]) - -Diffusion Convolutional Recurrent Neural Network (DCGRU) layer from the paper [Diffusion Convolutional Recurrent Neural -Network: Data-driven Traffic Forecasting](https://arxiv.org/pdf/1707.01926). - -Performs a Diffusion Convolutional layer to model spatial dependencies, followed by a Gated Recurrent Unit (GRU) cell to model temporal dependencies. +Flux.@layer TGCN -# Arguments - -- `in`: Number of input features. -- `out`: Number of output features. -- `k`: Diffusion step. -- `n`: Number of nodes in the graph. -- `bias`: Add learnable bias. Default `true`. -- `init`: Weights' initializer. Default `glorot_uniform`. -- `init_state`: Initial state of the hidden stat of the LSTM layer. Default `zeros32`. - -# Examples - -```jldoctest -julia> g1, x1 = rand_graph(5, 10), rand(Float32, 2, 5); - -julia> dcgru = DCGRU(2 => 5, 2, g1.num_nodes); - -julia> y = dcgru(g1, x1); - -julia> size(y) -(5, 5) - -julia> g2, x2 = rand_graph(5, 10), rand(Float32, 2, 5, 30); +TGCN(ch::Pair{Int, Int}; kws...) = TGCN(TGCNCell(ch; kws...)) -julia> z = dcgru(g2, x2); +initialstates(tgcn::TGCN) = initialstates(tgcn.cell) -julia> size(z) -(5, 5, 30) -``` -""" -DCGRU(ch, k, n; kwargs...) 
= Flux.Recur(DCGRUCell(ch, k, n; kwargs...)) -Flux.Recur(dcgru::DCGRUCell) = Flux.Recur(dcgru, dcgru.state0) - -(l::Flux.Recur{DCGRUCell})(g::GNNGraph) = GNNGraph(g, ndata = l(g, node_features(g))) -_applylayer(l::Flux.Recur{DCGRUCell}, g::GNNGraph, x) = l(g, x) -_applylayer(l::Flux.Recur{DCGRUCell}, g::GNNGraph) = l(g) - -""" - EvolveGCNO(ch; bias = true, init = glorot_uniform, init_state = Flux.zeros32) - -Evolving Graph Convolutional Network (EvolveGCNO) layer from the paper [EvolveGCN: Evolving Graph Convolutional Networks for Dynamic Graphs](https://arxiv.org/pdf/1902.10191). - -Perfoms a Graph Convolutional layer with parameters derived from a Long Short-Term Memory (LSTM) layer across the snapshots of the temporal graph. - - -# Arguments +(tgcn::TGCN)(g::GNNGraph, x) = tgcn(g, x, initialstates(tgcn)) -- `in`: Number of input features. -- `out`: Number of output features. -- `bias`: Add learnable bias. Default `true`. -- `init`: Weights' initializer. Default `glorot_uniform`. -- `init_state`: Initial state of the hidden stat of the LSTM layer. Default `zeros32`. - -# Examples - -```jldoctest -julia> tg = TemporalSnapshotsGNNGraph([rand_graph(10,20; ndata = rand(4,10)), rand_graph(10,14; ndata = rand(4,10)), rand_graph(10,22; ndata = rand(4,10))]) -TemporalSnapshotsGNNGraph: - num_nodes: [10, 10, 10] - num_edges: [20, 14, 22] - num_snapshots: 3 - -julia> ev = EvolveGCNO(4 => 5) -EvolveGCNO(4 => 5) - -julia> size(ev(tg, tg.ndata.x)) -(3,) - -julia> size(ev(tg, tg.ndata.x)[1]) -(5, 10) -``` -""" -struct EvolveGCNO - conv - W_init - init_state - in::Int - out::Int - Wf - Uf - Bf - Wi - Ui - Bi - Wo - Uo - Bo - Wc - Uc - Bc -end - -Flux.@functor EvolveGCNO - -function EvolveGCNO(ch; bias = true, init = glorot_uniform, init_state = Flux.zeros32) - in, out = ch - W = init(out, in) - conv = GCNConv(ch; bias = bias, init = init) - Wf = init(out, in) - Uf = init(out, in) - Bf = bias ? init(out, in) : nothing - Wi = init(out, in) - Ui = init(out, in) - Bi = bias ? init(out, in) : nothing - Wo = init(out, in) - Uo = init(out, in) - Bo = bias ? init(out, in) : nothing - Wc = init(out, in) - Uc = init(out, in) - Bc = bias ? 
init(out, in) : nothing - return EvolveGCNO(conv, W, init_state, in, out, Wf, Uf, Bf, Wi, Ui, Bi, Wo, Uo, Bo, Wc, Uc, Bc) -end - -function (egcno::EvolveGCNO)(tg::TemporalSnapshotsGNNGraph, x) - H = egcno.init_state(egcno.out, egcno.in) - C = egcno.init_state(egcno.out, egcno.in) - W = egcno.W_init - X = map(1:tg.num_snapshots) do i - F = Flux.sigmoid_fast.(egcno.Wf .* W + egcno.Uf .* H + egcno.Bf) - I = Flux.sigmoid_fast.(egcno.Wi .* W + egcno.Ui .* H + egcno.Bi) - O = Flux.sigmoid_fast.(egcno.Wo .* W + egcno.Uo .* H + egcno.Bo) - C̃ = Flux.tanh_fast.(egcno.Wc .* W + egcno.Uc .* H + egcno.Bc) - C = F .* C + I .* C̃ - H = O .* tanh_fast.(C) - W = H - egcno.conv(tg.snapshots[i], x[i]; conv_weight = H) +function (tgcn::TGCN)(g::GNNGraph, x::AbstractArray, h) + @assert ndims(x) == 2 || ndims(x) == 3 + # [x] = [din, timesteps] or [din, timesteps, num_nodes] + # y = AbstractArray[] # issue https://github.com/JuliaLang/julia/issues/56771 + y = [] + for xt in eachslice(x, dims = 2) + h = tgcn.cell(g, xt, h) + y = vcat(y, [h]) end - return X -end + return stack(y, dims = 2) # [dout, timesteps, num_nodes] +end + +Base.show(io::IO, tgcn::TGCN) = print(io, "TGCN($(tgcn.cell.din) => $(tgcn.cell.dout))") + +######## TO BE PORTED TO FLUX v0.15 from here ############################ + +# """ +# A3TGCN(din => dout; [bias, init, add_self_loops]) + +# Attention Temporal Graph Convolutional Network (A3T-GCN) model from the paper [A3T-GCN: Attention Temporal Graph +# Convolutional Network for Traffic Forecasting](https://arxiv.org/pdf/2006.11583.pdf). + +# Performs a TGCN layer, followed by a soft attention layer. + +# # Arguments + +# - `din`: Number of input features. +# - `dout`: Number of output features. +# - `bias`: Add learnable bias. Default `true`. +# - `init`: Convolution's weights initializer. Default `glorot_uniform`. +# - `add_self_loops`: Add self loops to the graph before performing the convolution. Default `false`. + +# # Examples + +# ```jldoctest +# julia> din, dout = 2, 3; + +# julia> model = A3TGCN(din => dout) +# TGCN( +# TGCNCell( +# GCNConv(2 => 3, σ), # 9 parameters +# GRUCell(3 => 3), # 63 parameters +# ), +# ) # Total: 5 arrays, 72 parameters, 560 bytes. + +# julia> num_nodes = 5; num_edges = 10; timesteps = 4; + +# julia> g = rand_graph(num_nodes, num_edges); + +# julia> x = rand(Float32, din, timesteps, num_nodes); + +# julia> model(g, x) |> size +# (3, 4, 5) +# ``` + +# !!! warning "Batch size changes" +# Failing to call `reset!` when the input batch size changes can lead to unexpected behavior. +# """ +# struct A3TGCN <: GNNLayer +# tgcn::TGCN +# dense1::Dense +# dense2::Dense +# din::Int +# dout::Int +# end + +# Flux.@layer A3TGCN + +# function A3TGCN(ch::Pair{Int, Int}, +# bias::Bool = true, +# init = Flux.glorot_uniform, +# add_self_loops = false) +# din, dout = ch +# tgcn = TGCN(din => dout; bias, init, init_state, add_self_loops) +# dense1 = Dense(dout => dout) +# dense2 = Dense(dout => dout) +# return A3TGCN(tgcn, dense1, dense2, din, dout) +# end + +# function (a3tgcn::A3TGCN)(g::GNNGraph, x::AbstractArray, h) +# h = a3tgcn.tgcn(g, x, h) +# e = a3tgcn.dense1(h) # WHY NOT RELU? 
+# e = a3tgcn.dense2(e) +# a = softmax(e, dims = 2) +# c = sum(a .* h , dims = 2) +# if length(size(c)) == 3 +# c = dropdims(c, dims = 2) +# end +# return c +# end + +# function Base.show(io::IO, a3tgcn::A3TGCN) +# print(io, "A3TGCN($(a3tgcn.din) => $(a3tgcn.dout))") +# end + +# struct GConvGRUCell <: GNNLayer +# conv_x_r::ChebConv +# conv_h_r::ChebConv +# conv_x_z::ChebConv +# conv_h_z::ChebConv +# conv_x_h::ChebConv +# conv_h_h::ChebConv +# k::Int +# state0 +# in::Int +# out::Int +# end + +# Flux.@layer GConvGRUCell + +# function GConvGRUCell(ch::Pair{Int, Int}, k::Int, n::Int; +# bias::Bool = true, +# init = Flux.glorot_uniform, +# init_state = Flux.zeros32) +# in, out = ch +# # reset gate +# conv_x_r = ChebConv(in => out, k; bias, init) +# conv_h_r = ChebConv(out => out, k; bias, init) +# # update gate +# conv_x_z = ChebConv(in => out, k; bias, init) +# conv_h_z = ChebConv(out => out, k; bias, init) +# # new gate +# conv_x_h = ChebConv(in => out, k; bias, init) +# conv_h_h = ChebConv(out => out, k; bias, init) +# state0 = init_state(out, n) +# return GConvGRUCell(conv_x_r, conv_h_r, conv_x_z, conv_h_z, conv_x_h, conv_h_h, k, state0, in, out) +# end + +# function (ggru::GConvGRUCell)(h, g::GNNGraph, x) +# r = ggru.conv_x_r(g, x) .+ ggru.conv_h_r(g, h) +# r = Flux.sigmoid_fast(r) +# z = ggru.conv_x_z(g, x) .+ ggru.conv_h_z(g, h) +# z = Flux.sigmoid_fast(z) +# h̃ = ggru.conv_x_h(g, x) .+ ggru.conv_h_h(g, r .* h) +# h̃ = Flux.tanh_fast(h̃) +# h = (1 .- z) .* h̃ .+ z .* h +# return h, h +# end + +# function Base.show(io::IO, ggru::GConvGRUCell) +# print(io, "GConvGRUCell($(ggru.in) => $(ggru.out))") +# end + +# """ +# GConvGRU(in => out, k, n; [bias, init, init_state]) + +# Graph Convolutional Gated Recurrent Unit (GConvGRU) recurrent layer from the paper [Structured Sequence Modeling with Graph Convolutional Recurrent Networks](https://arxiv.org/pdf/1612.07659). + +# Performs a layer of ChebConv to model spatial dependencies, followed by a Gated Recurrent Unit (GRU) cell to model temporal dependencies. + +# # Arguments + +# - `in`: Number of input features. +# - `out`: Number of output features. +# - `k`: Chebyshev polynomial order. +# - `n`: Number of nodes in the graph. +# - `bias`: Add learnable bias. Default `true`. +# - `init`: Weights' initializer. Default `glorot_uniform`. +# - `init_state`: Initial state of the hidden stat of the GRU layer. Default `zeros32`. + +# # Examples + +# ```jldoctest +# julia> g1, x1 = rand_graph(5, 10), rand(Float32, 2, 5); + +# julia> ggru = GConvGRU(2 => 5, 2, g1.num_nodes); + +# julia> y = ggru(g1, x1); + +# julia> size(y) +# (5, 5) + +# julia> g2, x2 = rand_graph(5, 10), rand(Float32, 2, 5, 30); + +# julia> z = ggru(g2, x2); + +# julia> size(z) +# (5, 5, 30) +# ``` +# """ +# # GConvGRU(ch, k, n; kwargs...) 
= Flux.Recur(GConvGRUCell(ch, k, n; kwargs...)) +# # Flux.Recur(ggru::GConvGRUCell) = Flux.Recur(ggru, ggru.state0) + +# # (l::Flux.Recur{GConvGRUCell})(g::GNNGraph) = GNNGraph(g, ndata = l(g, node_features(g))) +# # _applylayer(l::Flux.Recur{GConvGRUCell}, g::GNNGraph, x) = l(g, x) +# # _applylayer(l::Flux.Recur{GConvGRUCell}, g::GNNGraph) = l(g) + +# struct GConvLSTMCell <: GNNLayer +# conv_x_i::ChebConv +# conv_h_i::ChebConv +# w_i +# b_i +# conv_x_f::ChebConv +# conv_h_f::ChebConv +# w_f +# b_f +# conv_x_c::ChebConv +# conv_h_c::ChebConv +# w_c +# b_c +# conv_x_o::ChebConv +# conv_h_o::ChebConv +# w_o +# b_o +# k::Int +# state0 +# in::Int +# out::Int +# end + +# Flux.@layer GConvLSTMCell + +# function GConvLSTMCell(ch::Pair{Int, Int}, k::Int, n::Int; +# bias::Bool = true, +# init = Flux.glorot_uniform, +# init_state = Flux.zeros32) +# in, out = ch +# # input gate +# conv_x_i = ChebConv(in => out, k; bias, init) +# conv_h_i = ChebConv(out => out, k; bias, init) +# w_i = init(out, 1) +# b_i = bias ? Flux.create_bias(w_i, true, out) : false +# # forget gate +# conv_x_f = ChebConv(in => out, k; bias, init) +# conv_h_f = ChebConv(out => out, k; bias, init) +# w_f = init(out, 1) +# b_f = bias ? Flux.create_bias(w_f, true, out) : false +# # cell state +# conv_x_c = ChebConv(in => out, k; bias, init) +# conv_h_c = ChebConv(out => out, k; bias, init) +# w_c = init(out, 1) +# b_c = bias ? Flux.create_bias(w_c, true, out) : false +# # output gate +# conv_x_o = ChebConv(in => out, k; bias, init) +# conv_h_o = ChebConv(out => out, k; bias, init) +# w_o = init(out, 1) +# b_o = bias ? Flux.create_bias(w_o, true, out) : false +# state0 = (init_state(out, n), init_state(out, n)) +# return GConvLSTMCell(conv_x_i, conv_h_i, w_i, b_i, +# conv_x_f, conv_h_f, w_f, b_f, +# conv_x_c, conv_h_c, w_c, b_c, +# conv_x_o, conv_h_o, w_o, b_o, +# k, state0, in, out) +# end + +# function (gclstm::GConvLSTMCell)((h, c), g::GNNGraph, x) +# # input gate +# i = gclstm.conv_x_i(g, x) .+ gclstm.conv_h_i(g, h) .+ gclstm.w_i .* c .+ gclstm.b_i +# i = Flux.sigmoid_fast(i) +# # forget gate +# f = gclstm.conv_x_f(g, x) .+ gclstm.conv_h_f(g, h) .+ gclstm.w_f .* c .+ gclstm.b_f +# f = Flux.sigmoid_fast(f) +# # cell state +# c = f .* c .+ i .* Flux.tanh_fast(gclstm.conv_x_c(g, x) .+ gclstm.conv_h_c(g, h) .+ gclstm.w_c .* c .+ gclstm.b_c) +# # output gate +# o = gclstm.conv_x_o(g, x) .+ gclstm.conv_h_o(g, h) .+ gclstm.w_o .* c .+ gclstm.b_o +# o = Flux.sigmoid_fast(o) +# h = o .* Flux.tanh_fast(c) +# return (h,c), h +# end + +# function Base.show(io::IO, gclstm::GConvLSTMCell) +# print(io, "GConvLSTMCell($(gclstm.in) => $(gclstm.out))") +# end + +# """ +# GConvLSTM(in => out, k, n; [bias, init, init_state]) + +# Graph Convolutional Long Short-Term Memory (GConvLSTM) recurrent layer from the paper [Structured Sequence Modeling with Graph Convolutional Recurrent Networks](https://arxiv.org/pdf/1612.07659). + +# Performs a layer of ChebConv to model spatial dependencies, followed by a Long Short-Term Memory (LSTM) cell to model temporal dependencies. + +# # Arguments + +# - `in`: Number of input features. +# - `out`: Number of output features. +# - `k`: Chebyshev polynomial order. +# - `n`: Number of nodes in the graph. +# - `bias`: Add learnable bias. Default `true`. +# - `init`: Weights' initializer. Default `glorot_uniform`. +# - `init_state`: Initial state of the hidden stat of the LSTM layer. Default `zeros32`. 
+ +# # Examples + +# ```jldoctest +# julia> g1, x1 = rand_graph(5, 10), rand(Float32, 2, 5); + +# julia> gclstm = GConvLSTM(2 => 5, 2, g1.num_nodes); + +# julia> y = gclstm(g1, x1); + +# julia> size(y) +# (5, 5) + +# julia> g2, x2 = rand_graph(5, 10), rand(Float32, 2, 5, 30); + +# julia> z = gclstm(g2, x2); + +# julia> size(z) +# (5, 5, 30) +# ``` +# """ +# # GConvLSTM(ch, k, n; kwargs...) = Flux.Recur(GConvLSTMCell(ch, k, n; kwargs...)) +# # Flux.Recur(tgcn::GConvLSTMCell) = Flux.Recur(tgcn, tgcn.state0) + +# # (l::Flux.Recur{GConvLSTMCell})(g::GNNGraph) = GNNGraph(g, ndata = l(g, node_features(g))) +# # _applylayer(l::Flux.Recur{GConvLSTMCell}, g::GNNGraph, x) = l(g, x) +# # _applylayer(l::Flux.Recur{GConvLSTMCell}, g::GNNGraph) = l(g) + +# struct DCGRUCell +# in::Int +# out::Int +# state0 +# k::Int +# dconv_u::DConv +# dconv_r::DConv +# dconv_c::DConv +# end + +# Flux.@layer DCGRUCell + +# function DCGRUCell(ch::Pair{Int,Int}, k::Int, n::Int; bias = true, init = glorot_uniform, init_state = Flux.zeros32) +# in, out = ch +# dconv_u = DConv((in + out) => out, k; bias=bias, init=init) +# dconv_r = DConv((in + out) => out, k; bias=bias, init=init) +# dconv_c = DConv((in + out) => out, k; bias=bias, init=init) +# state0 = init_state(out, n) +# return DCGRUCell(in, out, state0, k, dconv_u, dconv_r, dconv_c) +# end + +# function (dcgru::DCGRUCell)(h, g::GNNGraph, x) +# h̃ = vcat(x, h) +# z = dcgru.dconv_u(g, h̃) +# z = NNlib.sigmoid_fast.(z) +# r = dcgru.dconv_r(g, h̃) +# r = NNlib.sigmoid_fast.(r) +# ĥ = vcat(x, h .* r) +# c = dcgru.dconv_c(g, ĥ) +# c = tanh.(c) +# h = z.* h + (1 .- z) .* c +# return h, h +# end + +# function Base.show(io::IO, dcgru::DCGRUCell) +# print(io, "DCGRUCell($(dcgru.in) => $(dcgru.out), $(dcgru.k))") +# end + +# """ +# DCGRU(in => out, k, n; [bias, init, init_state]) + +# Diffusion Convolutional Recurrent Neural Network (DCGRU) layer from the paper [Diffusion Convolutional Recurrent Neural +# Network: Data-driven Traffic Forecasting](https://arxiv.org/pdf/1707.01926). + +# Performs a Diffusion Convolutional layer to model spatial dependencies, followed by a Gated Recurrent Unit (GRU) cell to model temporal dependencies. + +# # Arguments + +# - `in`: Number of input features. +# - `out`: Number of output features. +# - `k`: Diffusion step. +# - `n`: Number of nodes in the graph. +# - `bias`: Add learnable bias. Default `true`. +# - `init`: Weights' initializer. Default `glorot_uniform`. +# - `init_state`: Initial state of the hidden stat of the LSTM layer. Default `zeros32`. + +# # Examples + +# ```jldoctest +# julia> g1, x1 = rand_graph(5, 10), rand(Float32, 2, 5); + +# julia> dcgru = DCGRU(2 => 5, 2, g1.num_nodes); + +# julia> y = dcgru(g1, x1); + +# julia> size(y) +# (5, 5) + +# julia> g2, x2 = rand_graph(5, 10), rand(Float32, 2, 5, 30); + +# julia> z = dcgru(g2, x2); + +# julia> size(z) +# (5, 5, 30) +# ``` +# """ +# # DCGRU(ch, k, n; kwargs...) = Flux.Recur(DCGRUCell(ch, k, n; kwargs...)) +# # Flux.Recur(dcgru::DCGRUCell) = Flux.Recur(dcgru, dcgru.state0) + +# # (l::Flux.Recur{DCGRUCell})(g::GNNGraph) = GNNGraph(g, ndata = l(g, node_features(g))) +# # _applylayer(l::Flux.Recur{DCGRUCell}, g::GNNGraph, x) = l(g, x) +# # _applylayer(l::Flux.Recur{DCGRUCell}, g::GNNGraph) = l(g) + +# """ +# EvolveGCNO(ch; bias = true, init = glorot_uniform, init_state = Flux.zeros32) + +# Evolving Graph Convolutional Network (EvolveGCNO) layer from the paper [EvolveGCN: Evolving Graph Convolutional Networks for Dynamic Graphs](https://arxiv.org/pdf/1902.10191). 
+ +# Perfoms a Graph Convolutional layer with parameters derived from a Long Short-Term Memory (LSTM) layer across the snapshots of the temporal graph. + + +# # Arguments + +# - `in`: Number of input features. +# - `out`: Number of output features. +# - `bias`: Add learnable bias. Default `true`. +# - `init`: Weights' initializer. Default `glorot_uniform`. +# - `init_state`: Initial state of the hidden stat of the LSTM layer. Default `zeros32`. + +# # Examples + +# ```jldoctest +# julia> tg = TemporalSnapshotsGNNGraph([rand_graph(10,20; ndata = rand(4,10)), rand_graph(10,14; ndata = rand(4,10)), rand_graph(10,22; ndata = rand(4,10))]) +# TemporalSnapshotsGNNGraph: +# num_nodes: [10, 10, 10] +# num_edges: [20, 14, 22] +# num_snapshots: 3 + +# julia> ev = EvolveGCNO(4 => 5) +# EvolveGCNO(4 => 5) + +# julia> size(ev(tg, tg.ndata.x)) +# (3,) + +# julia> size(ev(tg, tg.ndata.x)[1]) +# (5, 10) +# ``` +# """ +# struct EvolveGCNO +# conv +# W_init +# init_state +# in::Int +# out::Int +# Wf +# Uf +# Bf +# Wi +# Ui +# Bi +# Wo +# Uo +# Bo +# Wc +# Uc +# Bc +# end -function Base.show(io::IO, egcno::EvolveGCNO) - print(io, "EvolveGCNO($(egcno.in) => $(egcno.out))") -end - -function (l::GINConv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end - -function (l::ChebConv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end - -function (l::GATConv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end - -function (l::GATv2Conv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end - -function (l::GatedGraphConv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end - -function (l::CGConv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end - -function (l::SGConv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end - -function (l::TransformerConv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end - -function (l::GCNConv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end - -function (l::ResGatedGraphConv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end - -function (l::SAGEConv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end - -function (l::GraphConv)(tg::TemporalSnapshotsGNNGraph, x::AbstractVector) - return l.(tg.snapshots, x) -end +# function EvolveGCNO(ch; bias = true, init = glorot_uniform, init_state = Flux.zeros32) +# in, out = ch +# W = init(out, in) +# conv = GCNConv(ch; bias = bias, init = init) +# Wf = init(out, in) +# Uf = init(out, in) +# Bf = bias ? init(out, in) : nothing +# Wi = init(out, in) +# Ui = init(out, in) +# Bi = bias ? init(out, in) : nothing +# Wo = init(out, in) +# Uo = init(out, in) +# Bo = bias ? init(out, in) : nothing +# Wc = init(out, in) +# Uc = init(out, in) +# Bc = bias ? 
init(out, in) : nothing +# return EvolveGCNO(conv, W, init_state, in, out, Wf, Uf, Bf, Wi, Ui, Bi, Wo, Uo, Bo, Wc, Uc, Bc) +# end + +# function (egcno::EvolveGCNO)(tg::TemporalSnapshotsGNNGraph, x) +# H = egcno.init_state(egcno.out, egcno.in) +# C = egcno.init_state(egcno.out, egcno.in) +# W = egcno.W_init +# X = map(1:tg.num_snapshots) do i +# F = Flux.sigmoid_fast.(egcno.Wf .* W + egcno.Uf .* H + egcno.Bf) +# I = Flux.sigmoid_fast.(egcno.Wi .* W + egcno.Ui .* H + egcno.Bi) +# O = Flux.sigmoid_fast.(egcno.Wo .* W + egcno.Uo .* H + egcno.Bo) +# C̃ = Flux.tanh_fast.(egcno.Wc .* W + egcno.Uc .* H + egcno.Bc) +# C = F .* C + I .* C̃ +# H = O .* tanh_fast.(C) +# W = H +# egcno.conv(tg.snapshots[i], x[i]; conv_weight = H) +# end +# return X +# end + +# function Base.show(io::IO, egcno::EvolveGCNO) +# print(io, "EvolveGCNO($(egcno.in) => $(egcno.out))") +# end diff --git a/GraphNeuralNetworks/test/Project.toml b/GraphNeuralNetworks/test/Project.toml index d6f77b391..ebdb52172 100644 --- a/GraphNeuralNetworks/test/Project.toml +++ b/GraphNeuralNetworks/test/Project.toml @@ -3,6 +3,8 @@ ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" +GNNGraphs = "aed8fd31-079b-4b5a-b342-a13352159b8c" +GNNlib = "a6a84749-d869-43f8-aacc-be26a1996e48" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" GraphNeuralNetworks = "cffab07f-9bc2-4db1-8861-388f63bf7694" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" diff --git a/GraphNeuralNetworks/test/layers/pool.jl b/GraphNeuralNetworks/test/layers/pool.jl index 382a728ea..fa1475b20 100644 --- a/GraphNeuralNetworks/test/layers/pool.jl +++ b/GraphNeuralNetworks/test/layers/pool.jl @@ -76,7 +76,7 @@ end n_in = 3 n_iters = 2 - n_layers = 1 + n_layers = 1 #TODO test with more layers g = batch([rand_graph(10, 40, graph_type = GRAPH_T) for _ in 1:5]) g = GNNGraph(g, ndata = rand(Float32, n_in, g.num_nodes)) l = Set2Set(n_in, n_iters, n_layers) diff --git a/GraphNeuralNetworks/test/layers/temporalconv.jl b/GraphNeuralNetworks/test/layers/temporalconv.jl index ab7d28a3a..277783d4f 100644 --- a/GraphNeuralNetworks/test/layers/temporalconv.jl +++ b/GraphNeuralNetworks/test/layers/temporalconv.jl @@ -1,106 +1,115 @@ @testmodule TemporalConvTestModule begin using GraphNeuralNetworks - export in_channel, out_channel, N, S, T, g1, tg + export in_channel, out_channel, N, timesteps, g, tg, RTOL_LOW, RTOL_HIGH, ATOL_LOW + + RTOL_LOW = 1e-2 + RTOL_HIGH = 1e-5 + ATOL_LOW = 1e-3 + in_channel = 3 out_channel = 5 N = 4 - S = 5 - T = Float32 + timesteps = 5 + + g = GNNGraph(rand_graph(N, 8), + ndata = rand(Float32, in_channel, N), + graph_type = :coo) - g1 = GNNGraph(rand_graph(N,8), - ndata = rand(T, in_channel, N), - graph_type = :sparse) + tg = TemporalSnapshotsGNNGraph([g for _ in 1:timesteps]) - tg = TemporalSnapshotsGNNGraph([g1 for _ in 1:S]) end @testitem "TGCNCell" setup=[TemporalConvTestModule, TestModule] begin using .TemporalConvTestModule, .TestModule - tgcn = GraphNeuralNetworks.TGCNCell(in_channel => out_channel) - h, x̃ = tgcn(tgcn.state0, g1, g1.ndata.x) - @test size(h) == (out_channel, N) - @test size(x̃) == (out_channel, N) - @test h == x̃ + cell = GraphNeuralNetworks.TGCNCell(in_channel => out_channel) + h = cell(g, g.x) + @test size(h) == (out_channel, g.num_nodes) + test_gradients(cell, g, g.x, rtol = RTOL_HIGH) end @testitem "TGCN" setup=[TemporalConvTestModule, TestModule] begin using 
.TemporalConvTestModule, .TestModule tgcn = TGCN(in_channel => out_channel) - @test size(Flux.gradient(x -> sum(tgcn(g1, x)), g1.ndata.x)[1]) == (in_channel, N) - model = GNNChain(TGCN(in_channel => out_channel), Dense(out_channel, 1)) - @test size(model(g1, g1.ndata.x)) == (1, N) - @test model(g1) isa GNNGraph -end + x = rand(Float32, in_channel, timesteps, g.num_nodes) + h = tgcn(g, x) + @test size(h) == (out_channel, timesteps, g.num_nodes) + test_gradients(tgcn, g, x, rtol = RTOL_HIGH) + test_gradients(tgcn, g, x, h[:,1,:], rtol = RTOL_HIGH) -@testitem "A3TGCN" setup=[TemporalConvTestModule, TestModule] begin - using .TemporalConvTestModule, .TestModule - a3tgcn = A3TGCN(in_channel => out_channel) - @test size(Flux.gradient(x -> sum(a3tgcn(g1, x)), g1.ndata.x)[1]) == (in_channel, N) - model = GNNChain(A3TGCN(in_channel => out_channel), Dense(out_channel, 1)) - @test size(model(g1, g1.ndata.x)) == (1, N) - @test model(g1) isa GNNGraph + # model = GNNChain(TGCN(in_channel => out_channel), Dense(out_channel, 1)) + # @test size(model(g1, g1.ndata.x)) == (1, N) + # @test model(g1) isa GNNGraph end -@testitem "GConvLSTMCell" setup=[TemporalConvTestModule, TestModule] begin - using .TemporalConvTestModule, .TestModule - gconvlstm = GraphNeuralNetworks.GConvLSTMCell(in_channel => out_channel, 2, g1.num_nodes) - (h, c), h = gconvlstm(gconvlstm.state0, g1, g1.ndata.x) - @test size(h) == (out_channel, N) - @test size(c) == (out_channel, N) -end +# @testitem "A3TGCN" setup=[TemporalConvTestModule, TestModule] begin +# using .TemporalConvTestModule, .TestModule +# a3tgcn = A3TGCN(in_channel => out_channel) +# @test size(Flux.gradient(x -> sum(a3tgcn(g1, x)), g1.ndata.x)[1]) == (in_channel, N) +# model = GNNChain(A3TGCN(in_channel => out_channel), Dense(out_channel, 1)) +# @test size(model(g1, g1.ndata.x)) == (1, N) +# @test model(g1) isa GNNGraph +# end -@testitem "GConvLSTM" setup=[TemporalConvTestModule, TestModule] begin - using .TemporalConvTestModule, .TestModule - gconvlstm = GConvLSTM(in_channel => out_channel, 2, g1.num_nodes) - @test size(Flux.gradient(x -> sum(gconvlstm(g1, x)), g1.ndata.x)[1]) == (in_channel, N) - model = GNNChain(GConvLSTM(in_channel => out_channel, 2, g1.num_nodes), Dense(out_channel, 1)) -end +# @testitem "GConvLSTMCell" setup=[TemporalConvTestModule, TestModule] begin +# using .TemporalConvTestModule, .TestModule +# gconvlstm = GraphNeuralNetworks.GConvLSTMCell(in_channel => out_channel, 2, g1.num_nodes) +# (h, c), h = gconvlstm(gconvlstm.state0, g1, g1.ndata.x) +# @test size(h) == (out_channel, N) +# @test size(c) == (out_channel, N) +# end -@testitem "GConvGRUCell" setup=[TemporalConvTestModule, TestModule] begin - using .TemporalConvTestModule, .TestModule - gconvlstm = GraphNeuralNetworks.GConvGRUCell(in_channel => out_channel, 2, g1.num_nodes) - h, h = gconvlstm(gconvlstm.state0, g1, g1.ndata.x) - @test size(h) == (out_channel, N) -end +# @testitem "GConvLSTM" setup=[TemporalConvTestModule, TestModule] begin +# using .TemporalConvTestModule, .TestModule +# gconvlstm = GConvLSTM(in_channel => out_channel, 2, g1.num_nodes) +# @test size(Flux.gradient(x -> sum(gconvlstm(g1, x)), g1.ndata.x)[1]) == (in_channel, N) +# model = GNNChain(GConvLSTM(in_channel => out_channel, 2, g1.num_nodes), Dense(out_channel, 1)) +# end -@testitem "GConvGRU" setup=[TemporalConvTestModule, TestModule] begin - using .TemporalConvTestModule, .TestModule - gconvlstm = GConvGRU(in_channel => out_channel, 2, g1.num_nodes) - @test size(Flux.gradient(x -> sum(gconvlstm(g1, x)), g1.ndata.x)[1]) 
== (in_channel, N) - model = GNNChain(GConvGRU(in_channel => out_channel, 2, g1.num_nodes), Dense(out_channel, 1)) - @test size(model(g1, g1.ndata.x)) == (1, N) - @test model(g1) isa GNNGraph -end +# @testitem "GConvGRUCell" setup=[TemporalConvTestModule, TestModule] begin +# using .TemporalConvTestModule, .TestModule +# gconvlstm = GraphNeuralNetworks.GConvGRUCell(in_channel => out_channel, 2, g1.num_nodes) +# h, h = gconvlstm(gconvlstm.state0, g1, g1.ndata.x) +# @test size(h) == (out_channel, N) +# end -@testitem "DCGRU" setup=[TemporalConvTestModule, TestModule] begin - using .TemporalConvTestModule, .TestModule - dcgru = DCGRU(in_channel => out_channel, 2, g1.num_nodes) - @test size(Flux.gradient(x -> sum(dcgru(g1, x)), g1.ndata.x)[1]) == (in_channel, N) - model = GNNChain(DCGRU(in_channel => out_channel, 2, g1.num_nodes), Dense(out_channel, 1)) - @test size(model(g1, g1.ndata.x)) == (1, N) - @test model(g1) isa GNNGraph -end +# @testitem "GConvGRU" setup=[TemporalConvTestModule, TestModule] begin +# using .TemporalConvTestModule, .TestModule +# gconvlstm = GConvGRU(in_channel => out_channel, 2, g1.num_nodes) +# @test size(Flux.gradient(x -> sum(gconvlstm(g1, x)), g1.ndata.x)[1]) == (in_channel, N) +# model = GNNChain(GConvGRU(in_channel => out_channel, 2, g1.num_nodes), Dense(out_channel, 1)) +# @test size(model(g1, g1.ndata.x)) == (1, N) +# @test model(g1) isa GNNGraph +# end -@testitem "EvolveGCNO" setup=[TemporalConvTestModule, TestModule] begin - using .TemporalConvTestModule, .TestModule - evolvegcno = EvolveGCNO(in_channel => out_channel) - @test length(Flux.gradient(x -> sum(sum(evolvegcno(tg, x))), tg.ndata.x)[1]) == S - @test size(evolvegcno(tg, tg.ndata.x)[1]) == (out_channel, N) -end +# @testitem "DCGRU" setup=[TemporalConvTestModule, TestModule] begin +# using .TemporalConvTestModule, .TestModule +# dcgru = DCGRU(in_channel => out_channel, 2, g1.num_nodes) +# @test size(Flux.gradient(x -> sum(dcgru(g1, x)), g1.ndata.x)[1]) == (in_channel, N) +# model = GNNChain(DCGRU(in_channel => out_channel, 2, g1.num_nodes), Dense(out_channel, 1)) +# @test size(model(g1, g1.ndata.x)) == (1, N) +# @test model(g1) isa GNNGraph +# end -@testitem "GINConv" setup=[TemporalConvTestModule, TestModule] begin - using .TemporalConvTestModule, .TestModule - ginconv = GINConv(Dense(in_channel => out_channel),0.3) - @test length(ginconv(tg, tg.ndata.x)) == S - @test size(ginconv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(ginconv(tg, x))), tg.ndata.x)[1]) == S -end +# @testitem "EvolveGCNO" setup=[TemporalConvTestModule, TestModule] begin +# using .TemporalConvTestModule, .TestModule +# evolvegcno = EvolveGCNO(in_channel => out_channel) +# @test length(Flux.gradient(x -> sum(sum(evolvegcno(tg, x))), tg.ndata.x)[1]) == S +# @test size(evolvegcno(tg, tg.ndata.x)[1]) == (out_channel, N) +# end -@testitem "GraphConv" setup=[TemporalConvTestModule, TestModule] begin - using .TemporalConvTestModule, .TestModule - graphconv = GraphConv(in_channel => out_channel, tanh) - @test length(graphconv(tg, tg.ndata.x)) == S - @test size(graphconv(tg, tg.ndata.x)[1]) == (out_channel, N) - @test length(Flux.gradient(x ->sum(sum(graphconv(tg, x))), tg.ndata.x)[1]) == S -end +# @testitem "GINConv" setup=[TemporalConvTestModule, TestModule] begin +# using .TemporalConvTestModule, .TestModule +# ginconv = GINConv(Dense(in_channel => out_channel),0.3) +# @test length(ginconv(tg, tg.ndata.x)) == S +# @test size(ginconv(tg, tg.ndata.x)[1]) == (out_channel, N) +# @test 
length(Flux.gradient(x ->sum(sum(ginconv(tg, x))), tg.ndata.x)[1]) == S +# end + +# @testitem "GraphConv" setup=[TemporalConvTestModule, TestModule] begin +# using .TemporalConvTestModule, .TestModule +# graphconv = GraphConv(in_channel => out_channel, tanh) +# @test length(graphconv(tg, tg.ndata.x)) == S +# @test size(graphconv(tg, tg.ndata.x)[1]) == (out_channel, N) +# @test length(Flux.gradient(x ->sum(sum(graphconv(tg, x))), tg.ndata.x)[1]) == S +# end diff --git a/GraphNeuralNetworks/test/runtests.jl b/GraphNeuralNetworks/test/runtests.jl index 649eaeaf2..b6faea378 100644 --- a/GraphNeuralNetworks/test/runtests.jl +++ b/GraphNeuralNetworks/test/runtests.jl @@ -1,3 +1,11 @@ +## The test environment is instantiated as follows: +# using Pkg +# Pkg.activate(@__DIR__) +# Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNGraphs")) +# Pkg.develop(path=joinpath(@__DIR__, "..", "..", "GNNlib")) +# Pkg.develop(path=joinpath(@__DIR__, "..")) +# Pkg.instantiate() + using TestItemRunner ## See https://www.julia-vscode.org/docs/stable/userguide/testitems/ diff --git a/GraphNeuralNetworks/test/test_module.jl b/GraphNeuralNetworks/test/test_module.jl index c3ed96f05..f21e0a298 100644 --- a/GraphNeuralNetworks/test/test_module.jl +++ b/GraphNeuralNetworks/test/test_module.jl @@ -31,7 +31,7 @@ using Flux using Functors: fmapstructure_with_path using Graphs using ChainRulesTestUtils, FiniteDifferences -using Zygote +using Zygote: Zygote using SparseArrays