diff --git a/README.md b/README.md
index 872a854..2252ce6 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,13 @@ via
 and 
 via 
 [pyHSICLasso](https://pypi.org/project/pyHSICLasso/).
+The performance has been measured 
+on the computation 
+of Gram matrices required 
+by HSIC Lasso 
+for the selection 
+from a dataset of 300 features 
+with as many samples as reported on the x-axis. 
 
 ![gramtimes](gramtimes.png)
 
diff --git a/gramtimes.png b/gramtimes.png
index 29afd1d..d19d4af 100644
Binary files a/gramtimes.png and b/gramtimes.png differ
diff --git a/hisel/lar/lar.py b/hisel/lar/lar.py
index f74f59b..4dee74d 100644
--- a/hisel/lar/lar.py
+++ b/hisel/lar/lar.py
@@ -98,4 +98,9 @@ def solve(
 
     lassopath = path[:k, :].toarray()
 
+    del xx
+    del gamma
+    del g
+    del beta
+
     return active, lassopath
diff --git a/hisel/select.py b/hisel/select.py
index f54b68b..4a05178 100644
--- a/hisel/select.py
+++ b/hisel/select.py
@@ -427,6 +427,7 @@ def _run(
     ny, dy = y.shape
     assert nx == ny
     num_batches: int = nx // batch_size
+    assert num_batches >= 1
     gram_dim: int = num_batches * batch_size**2
     lx = 1.
     ly = np.sqrt(dy)
diff --git a/profiling/gram_comparison.py b/profiling/gram_comparison.py
index ebf6046..7ef48d4 100644
--- a/profiling/gram_comparison.py
+++ b/profiling/gram_comparison.py
@@ -193,7 +193,7 @@ def test_num_samples():
     num_features = 300
     batch_size = 800
     num_samples = 1600 * np.arange(2, 8, dtype=int)
-    num_runs = 8
+    num_runs = 5
     data = []
     Result = make_dataclass("Result",
                             [
@@ -211,14 +211,17 @@ def test_num_samples():
             'experiment.run_hisel()',
             globals=locals(),
             number=num_runs)
+        hisel_cpu_time /= num_runs
         hisel_gpu_time = timeit.timeit(
             'experiment.run_cudahisel()',
             globals=locals(),
             number=num_runs)
+        hisel_gpu_time /= num_runs
         pyhsiclasso_time = timeit.timeit(
             'experiment.run_pyhsiclasso()',
             globals=locals(),
             number=num_runs)
+        pyhsiclasso_time /= num_runs
         del experiment
         result = Result(
             n,
diff --git a/profiling/lar_comparison.py b/profiling/lar_comparison.py
new file mode 100644
index 0000000..a022bb2
--- /dev/null
+++ b/profiling/lar_comparison.py
@@ -0,0 +1,74 @@
+import pstats
+from pstats import SortKey
+import cProfile
+import numpy as np
+import cupy as cp
+from hisel import lar
+from pyHSICLasso import nlars
+
+
+class Experiment:
+    def __init__(self, n, d, a):
+        x = np.random.uniform(size=(n, d))
+        beta = np.random.permutation(np.vstack(
+            [
+                np.random.uniform(low=0., high=1., size=(a, 1)),
+                np.zeros((d-a, 1), dtype=np.float32)
+            ]
+        ))
+        y = x @ beta + np.random.uniform(low=-1e-2, high=1e-2, size=(n, 1))
+
+        self.a = a
+        self.x = x
+        self.x_ = cp.array(x)
+        self.y = y
+        self.y_ = cp.array(y)
+        self.beta = beta
+
+    def run_hisel(self):
+        feats, _ = lar.solve(self.x, self.y, self.a)
+
+    def run_hisel_from_cupy_arrays(self):
+        x = cp.asnumpy(self.x_)
+        y = cp.asnumpy(self.y_)
+        feats, _ = lar.solve(x, y, self.a)
+
+    def run_pyhsiclasso(self):
+        _ = nlars.nlars(self.x, self.x.T @ self.y, self.a, 3)
+
+
+def main():
+    n = 100000
+    d = 500
+    a = 100
+    experiment = Experiment(n, d, a)
+    cProfile.runctx(
+        'experiment.run_hisel()',
+        locals=locals(),
+        globals=globals(),
+        filename='hisel_lar',
+    )
+    hisel_lar = pstats.Stats('hisel_lar')
+    hisel_lar.sort_stats(SortKey.CUMULATIVE).print_stats(20)
+
+    cProfile.runctx(
+        'experiment.run_hisel_from_cupy_arrays()',
+        locals=locals(),
+        globals=globals(),
+        filename='hisel_lar_from_cupy_arrays',
+    )
+    hisel_lar_from_cupy_arrays = pstats.Stats('hisel_lar_from_cupy_arrays')
+    hisel_lar_from_cupy_arrays.sort_stats(SortKey.CUMULATIVE).print_stats(20)
+
+    cProfile.runctx(
+        'experiment.run_pyhsiclasso()',
+        locals=locals(),
+        globals=globals(),
+        filename='pyhsiclasso_lar',
+    )
+    pyhsiclasso_lar = pstats.Stats('pyhsiclasso_lar')
+    pyhsiclasso_lar.sort_stats(SortKey.CUMULATIVE).print_stats(20)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/profiling/select_comparison.py b/profiling/select_comparison.py
index 9f84953..1884ea6 100644
--- a/profiling/select_comparison.py
+++ b/profiling/select_comparison.py
@@ -21,7 +21,10 @@ def pyhsiclasso(x, y, xfeattype,  yfeattype,
     else:
         lasso.regression(n_features, B=batch_size,
                          discrete_x=discrete_x, M=number_of_epochs)
-    return lasso.A
+
+    selection = np.array(lasso.A, copy=True)
+    del lasso
+    return selection
 
 
 class Experiment:
@@ -34,10 +37,9 @@ def __init__(
         apply_transform: bool = False,
         batch_size: int = 500,
         number_of_epochs: int = 3,
-        device: Device = Device.CPU,
     ):
         print('\n\n\n##############################################################')
-        print('Test selection of features in a linear transformation setting')
+        print('# Test selection of features in a linear transformation setting')
         print('##############################################################')
         print(f'Feature type of x: {xfeattype}')
         print(f'Feature type of y: {yfeattype}')
@@ -45,11 +47,11 @@ def __init__(
         print(f'Noisy target: {add_noise}')
         print(f'Number of epochs: {number_of_epochs}')
         print(f'Batch size: {batch_size}')
-        print(f'device: {device}')
 
-        d: int = np.random.randint(low=50, high=100)
-        n: int = np.random.randint(low=15000, high=20000)
-        n_features: int = d // 6
+        d: int = np.random.randint(low=300, high=400)
+        n: int = batch_size * \
+            (np.random.randint(low=10000, high=12000) // batch_size)
+        n_features: int = d // np.random.randint(low=15, high=20)
         features = list(np.random.choice(d, replace=False, size=n_features))
         x: np.ndarray
         y: np.ndarray
@@ -73,7 +75,7 @@ def __init__(
             scaler = .01 if yfeattype == FeatureType.DISCR else .1
             u += scaler * np.std(u) * np.random.uniform(size=u.shape)
         if yfeattype == FeatureType.CONT:
-            y = u
+            y = np.sum(u, axis=1, keepdims=True)
         elif yfeattype == FeatureType.DISCR:
             y = np.zeros(shape=(n, 1), dtype=int)
             for i in range(1, n_features):
@@ -81,7 +83,6 @@ def __init__(
         else:
             raise ValueError(yfeattype)
 
-        self.device = device
         self.number_of_epochs = number_of_epochs
         self.batch_size = batch_size
         self.d = d
@@ -110,7 +111,13 @@ def run_pyhsiclasso(self):
 
         return pyhsiclasso_selection
 
-    def run_hisel(self):
+    def run_hisel_on_cpu(self):
+        self._run_hisel(Device.CPU)
+
+    def run_hisel_on_gpu(self):
+        self._run_hisel(Device.GPU)
+
+    def _run_hisel(self, device: Device = Device.CPU):
         selector = Selector(
             self.x, self.y,
             xfeattype=self.xfeattype,
@@ -121,7 +128,7 @@ def run_hisel(self):
             batch_size=len(self.x),
             minibatch_size=self.batch_size,
             number_of_epochs=self.number_of_epochs,
-            device=self.device,
+            device=device,
             return_index=True,
         )
         print(
@@ -134,16 +141,18 @@ def run_hisel(self):
             print(
                 f'WARNING: hisel did not perform an exact selection:\n{msg}')
 
+        del selector
         return selection
 
 
 def test_regression_with_noise():
     xfeattype = FeatureType.CONT
     yfeattype = FeatureType.CONT
-    batch_size = 1000
+    batch_size = 750
     number_of_epochs = 1
     return Experiment(xfeattype, yfeattype,
                       add_noise=True,
+                      apply_transform=False,
                       batch_size=batch_size,
                       number_of_epochs=number_of_epochs)
 
@@ -151,34 +160,43 @@ def test_regression_with_noise():
 def test_regression_with_noise_with_transform():
     xfeattype = FeatureType.CONT
     yfeattype = FeatureType.CONT
-    batch_size = 1000
+    batch_size = 750
     number_of_epochs = 1
     return Experiment(xfeattype, yfeattype,
                       add_noise=True,
+                      apply_transform=True,
                       batch_size=batch_size,
                       number_of_epochs=number_of_epochs)
 
 
-regression_experiment = test_regression_with_noise()
-
-
 def main():
+    regression_experiment = test_regression_with_noise()
+
     pyhsiclasso_time = timeit.timeit(
         'regression_experiment.run_pyhsiclasso()',
-        number=3,
-        globals=globals(),
+        number=1,
+        globals=locals(),
     )
     print('\n#################################################################')
     print(f'# pyhsiclasso_time: {round(pyhsiclasso_time, 6)}')
     print('#################################################################\n\n\n')
 
-    hisel_time = timeit.timeit(
-        'regression_experiment.run_hisel()',
-        number=3,
-        globals=globals(),
+    hisel_cpu_time = timeit.timeit(
+        'regression_experiment.run_hisel_on_cpu()',
+        number=1,
+        globals=locals(),
+    )
+    print('\n#################################################################')
+    print(f'# hisel_cpu_time: {round(hisel_cpu_time, 6)}')
+    print('#################################################################\n\n')
+
+    hisel_gpu_time = timeit.timeit(
+        'regression_experiment.run_hisel_on_gpu()',
+        number=1,
+        globals=locals(),
     )
     print('\n#################################################################')
-    print(f'# hisel_time: {round(hisel_time, 6)}')
+    print(f'# hisel_gpu_time: {round(hisel_gpu_time, 6)}')
     print('#################################################################\n\n')
 
 
diff --git a/profiling/select_profile.py b/profiling/select_profile.py
index 168081c..0804d58 100644
--- a/profiling/select_profile.py
+++ b/profiling/select_profile.py
@@ -2,16 +2,148 @@
 from pstats import SortKey
 import cProfile
 from tests.select_test import SelectorTest
+import numpy as np
+from scipy.stats import special_ortho_group
+
+from hisel.select import HSICSelector as Selector, FeatureType
+from hisel.kernels import Device
+
+
+class SelectProfiler:
+    def __init__(
+        self,
+        xfeattype: FeatureType,
+        yfeattype: FeatureType,
+        add_noise: bool = False,
+        apply_transform: bool = False,
+        apply_non_linear_transform: bool = False,
+    ):
+        print('\n\n\n##############################################################################')
+        print('Test selection of features in a (non-)linear  transformation setting')
+        print(
+            '##############################################################################')
+        print(f'Feature type of x: {xfeattype}')
+        print(f'Feature type of y: {yfeattype}')
+        print(f'Apply linear transform: {apply_transform}')
+        print(f'Apply non-linear transform: {apply_non_linear_transform}')
+        print(f'Noisy target: {add_noise}')
+        d: int = 200
+        minibatch_size: int = 500
+        n: int = 4000
+        batch_size: int = n
+        n_features: int = 30
+        features = list(np.random.choice(d, replace=False, size=n_features))
+        x: np.ndarray
+        y: np.ndarray
+        if xfeattype == FeatureType.DISCR:
+            ms = np.random.randint(low=2, high=2*n_features, size=(d,))
+            xs = [np.random.randint(m, size=(n, 1)) for m in ms]
+            x = np.concatenate(xs, axis=1)
+            split = None
+        elif xfeattype == FeatureType.BOTH:
+            split: int = np.random.randint(low=3, high=d-1)
+            xcat = np.random.randint(10, size=(n, split))
+            xcont = np.random.uniform(size=(n, d-split))
+            x = np.concatenate((xcat, xcont), axis=1)
+        else:
+            x = np.random.uniform(size=(n, d))
+            split = None
+        z: np.array = x[:, features]
+        if (apply_transform or yfeattype == FeatureType.DISCR):
+            tt = np.expand_dims(
+                special_ortho_group.rvs(n_features),
+                axis=0
+            )
+            zz = np.expand_dims(z, axis=2)
+            u = (tt @ zz)[:, :, 0]
+        else:
+            u = z
+        if add_noise:
+            scaler = .01 if yfeattype == FeatureType.DISCR else .1
+            u += scaler * np.std(u) * np.random.uniform(size=u.shape)
+        if yfeattype == FeatureType.CONT:
+            if apply_non_linear_transform:
+                u = np.sum(u, axis=1, keepdims=True)
+                u /= np.max(np.abs(u), axis=None)
+                y = np.sin(4 * np.pi * u)
+            else:
+                y = u
+        elif yfeattype == FeatureType.DISCR:
+            y = np.zeros(shape=(n, 1), dtype=int)
+            for i in range(1, n_features):
+                y += np.asarray(u[:, [i-1]] > u[:, [i]], dtype=int)
+        else:
+            raise ValueError(yfeattype)
+        print(f'Expected features:\n{sorted(features)}\n')
+
+        self.x = x
+        self.y = y
+        self.xfeattype = xfeattype
+        self.yfeattype = yfeattype
+        self.split = split
+        self.features = features
+        self.n_features = n_features
+        self.batch_size = batch_size
+        self.minibatch_size = minibatch_size
+
+    def run_on_cpu(self):
+        self._run(Device.CPU)
+
+    def run_on_gpu(self):
+        self._run(Device.GPU)
+
+    def _run(self, device: Device = Device.CPU):
+
+        selector = Selector(
+            self.x, self.y,
+            xfeattype=self.xfeattype,
+            yfeattype=self.yfeattype,
+            catcont_split=self.split,
+        )
+        num_to_select = self.n_features
+        selection = selector.select(
+            num_to_select,
+            batch_size=self.batch_size,
+            minibatch_size=self.minibatch_size,
+            number_of_epochs=3,
+            device=device,
+            return_index=True)
+        print(f'Expected features:\n{sorted(self.features)}')
+        print(
+            f'hisel selected features:\n{sorted(selection)}')
+        del selector
 
 
 def main():
-    cProfile.run('SelectorTest().test_regression_no_noise_with_transform()',
-                 'select_profile')
-    p = pstats.Stats('select_profile')
+    profiler = SelectProfiler(
+        xfeattype=FeatureType.CONT,
+        yfeattype=FeatureType.CONT,
+        add_noise=True,
+        apply_transform=False,
+    )
+    cProfile.runctx('profiler.run_on_cpu()', globals=globals(),
+                    locals=locals(), filename='cpu_select_profile')
+    p = pstats.Stats('cpu_select_profile')
+    p.sort_stats(SortKey.CUMULATIVE).print_stats(
+        30)
     p.sort_stats(SortKey.CUMULATIVE).print_stats(
-        'hisel/select.py:', 50)
+        'hisel/select.py:', 20)
     p.sort_stats(SortKey.CUMULATIVE).print_stats(
-        'hisel/kernels.py:', 50)
+        'hisel/kernels.py:', 20)
+    p.sort_stats(SortKey.CUMULATIVE).print_stats(
+        'lar.py:', 20)
+
+    cProfile.runctx('profiler.run_on_gpu()',  globals=globals(),
+                    locals=locals(), filename='gpu_select_profile')
+    p_gpu = pstats.Stats('gpu_select_profile')
+    p_gpu.sort_stats(SortKey.CUMULATIVE).print_stats(
+        30)
+    p_gpu.sort_stats(SortKey.CUMULATIVE).print_stats(
+        'hisel/select.py:', 20)
+    p_gpu.sort_stats(SortKey.CUMULATIVE).print_stats(
+        'hisel/cudakernels.py:', 20)
+    p_gpu.sort_stats(SortKey.CUMULATIVE).print_stats(
+        'lar.py:', 20)
 
 
 if __name__ == '__main__':
diff --git a/tests/lar_performance.py b/tests/lar_performance.py
deleted file mode 100644
index 79cc379..0000000
--- a/tests/lar_performance.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import datetime
-import numpy as np
-from hisel import lar
-use_pyhsiclasso = True
-try:
-    from pyHSICLasso import nlars
-except (ModuleNotFoundError, ImportError):
-    use_pyhsiclasso = False
-
-
-def speed_test(
-        n: int,
-        d: int,
-        a: int,
-        number_of_experiments: int = 1,
-):
-    x = np.random.uniform(size=(n, d))
-    beta = np.random.permutation(np.vstack(
-        [
-            np.random.uniform(low=0., high=1., size=(a, 1)),
-            np.zeros((d-a, 1), dtype=np.float32)
-        ]
-    ))
-    y = x @ beta + np.random.uniform(low=-1e-2, high=1e-2, size=(n, 1))
-    hisel_runtimes = []
-    for _ in range(number_of_experiments):
-        t0 = datetime.datetime.now()
-        feats, _ = lar.solve(x, y, a)
-        t1 = datetime.datetime.now()
-        hisel_runtime = t1 - t0
-        hisel_runtimes.append(
-            hisel_runtime.seconds + 1e-6 * hisel_runtime.microseconds
-        )
-    hisel_mean = np.mean(hisel_runtimes)
-    hisel_std = np.std(hisel_runtimes)
-    print(
-        f'hisel run time: {hisel_mean:.4f} +/- {hisel_std:.4f}  seconds\n')
-
-    if use_pyhsiclasso:
-        pyhsic_runtimes = []
-        for _ in range(number_of_experiments):
-            t0 = datetime.datetime.now()
-            feats = nlars.nlars(x, x.T @ y, a, 3)
-            t1 = datetime.datetime.now()
-            pyhsic_runtime = t1 - t0
-            pyhsic_runtimes.append(
-                pyhsic_runtime.seconds + 1e-6 * pyhsic_runtime.microseconds
-            )
-        pyhsic_mean = np.mean(pyhsic_runtimes)
-        pyhsic_std = np.std(pyhsic_runtimes)
-        print(
-            f'pyhsic run time: {pyhsic_mean:.4f} +/- {pyhsic_std:.4f}  seconds\n')
-
-
-if __name__ == '__main__':
-    n = 100000
-    d = 500
-    a = 100
-    number_of_experiments = 5
-    # With these parameters, on my laptop, I get:
-    # hisel run time: 14.1813 +/- 0.0975  seconds
-    # pyhsic run time: 15.9536 +/- 0.2564  seconds
-    speed_test(n, d, a, number_of_experiments)
diff --git a/tests/select_test.py b/tests/select_test.py
index 626ed8c..7c06c45 100644
--- a/tests/select_test.py
+++ b/tests/select_test.py
@@ -19,15 +19,19 @@
 SKLEARN_RECON = True
 
 
-def pyhsiclasso(x, y, xfeattype,  yfeattype, n_features: int, batch_size=500):
+def pyhsiclasso(x, y, xfeattype,  yfeattype, n_features: int, minibatch_size=500):
     lasso = pyHSICLasso.HSICLasso()
     lasso.X_in = x.T
     lasso.Y_in = y.T
     discrete_x = False  # xfeattype == FeatureType.DISCR
     if yfeattype == FeatureType.DISCR:
-        lasso.classification(n_features, B=batch_size, discrete_x=discrete_x)
+        lasso.classification(n_features,
+                             B=minibatch_size,
+                             discrete_x=discrete_x)
     else:
-        lasso.regression(n_features, B=batch_size, discrete_x=discrete_x)
+        lasso.regression(n_features,
+                         B=minibatch_size,
+                         discrete_x=discrete_x)
     return lasso.A
 
 
@@ -90,13 +94,11 @@ def test_regression_with_noise_with_transform(self):
         self._test_selection(xfeattype, yfeattype,
                              add_noise=True, apply_transform=True)
 
-    # @unittest.skipIf(QUICK_TEST, 'Skipping for faster test')
     def test_mixed_feature_regression_no_noise(self):
         xfeattype = FeatureType.BOTH
         yfeattype = FeatureType.CONT
         self._test_selection(xfeattype, yfeattype, add_noise=False)
 
-    # @unittest.skipIf(QUICK_TEST, 'Skipping for faster test')
     def test_mixed_feature_regression_with_transform(self):
         xfeattype = FeatureType.BOTH
         yfeattype = FeatureType.CONT
@@ -183,7 +185,10 @@ def _test_selection(
         print(f'Noisy target: {add_noise}')
         print(f'device: {device}')
         d: int = np.random.randint(low=15, high=25)
-        n: int = np.random.randint(low=5000, high=10000)
+        minibatch_size: int = np.random.randint(low=500, high=1000)
+        n: int = minibatch_size * \
+            (np.random.randint(low=5000, high=10000) // minibatch_size)
+        batch_size: int = n
         n_features: int = d // 3
         features = list(np.random.choice(d, replace=False, size=n_features))
         x: np.ndarray
@@ -231,7 +236,7 @@ def _test_selection(
         if USE_PYHSICLASSO and (xfeattype == FeatureType.CONT):
             print('Using pyHSICLasso for reconciliation purposes')
             pyhsiclasso_selection = pyhsiclasso(
-                x, y, xfeattype, yfeattype, n_features, 500)
+                x, y, xfeattype, yfeattype, n_features, minibatch_size)
             print(
                 f'pyHSICLasso selected features:\n{sorted(pyhsiclasso_selection)}')
             self.assertEqual(
@@ -258,7 +263,7 @@ def _test_selection(
         )
         num_to_select = n_features
         selected_features = selector.select(
-            num_to_select, batch_size=len(x), minibatch_size=800,  number_of_epochs=3, device=device)
+            num_to_select, batch_size=batch_size, minibatch_size=minibatch_size,  number_of_epochs=3, device=device)
         selection = [int(feat.split('f')[-1])
                      for feat in selected_features]
         print(f'Expected features:\n{sorted(features)}')