From faad60ec05b5224d029203b4ce805a31d2f53a51 Mon Sep 17 00:00:00 2001
From: Egor Krivov <e.a.krivov@gmail.com>
Date: Wed, 27 Sep 2023 20:10:02 +0200
Subject: [PATCH 01/23] Register modin in benchplot dict, solutions.csv, launcher and run.conf

---
 _benchplot/benchplot-dict.R | 32 ++++++++++++++++++++++++++++++++
 _control/solutions.csv      |  2 ++
 _launcher/solution.R        |  2 +-
 run.conf                    |  2 +-
 4 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/_benchplot/benchplot-dict.R b/_benchplot/benchplot-dict.R
index 14cb6964..01c95bcd 100644
--- a/_benchplot/benchplot-dict.R
+++ b/_benchplot/benchplot-dict.R
@@ -34,6 +34,7 @@ solution.dict = {list(
   "data.table" = list(name=c(short="data.table", long="data.table"), color=c(strong="blue", light="#7777FF")),
   "dplyr" = list(name=c(short="dplyr", long="dplyr"), color=c(strong="red", light="#FF7777")),
   "pandas" = list(name=c(short="pandas", long="pandas"), color=c(strong="green4", light="#77FF77")),
+  "modin" = list(name=c(short="modin", long="modin"), color=c(strong="green4", light="#77FF77")), # TODO: colors are still the pandas ones; pick a distinct pair
   "pydatatable" = list(name=c(short="pydatatable", long="(py)datatable"), color=c(strong="darkorange", light="orange")),
   "spark" = list(name=c(short="spark", long="spark"), color=c(strong="#8000FFFF", light="#CC66FF")),
   "dask" = list(name=c(short="dask", long="dask"), color=c(strong="slategrey", light="lightgrey")),
@@ -102,6 +103,19 @@ groupby.syntax.dict = {list(
     "regression v1 v2 by id2 id4" = "DF[['id2','id4','v1','v2']].groupby(['id2','id4'], as_index=False, sort=False, observed=True, dropna=False).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}))",
     "sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3':'sum', 'v1':'size'})"
   )},
+  # TODO: update later
+  "modin" = {c(
+    "sum v1 by id1" = "DF.groupby('id1', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum'})",
+    "sum v1 by id1:id2" = "DF.groupby(['id1','id2'], as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum'})",
+    "sum v1 mean v3 by id3" = "DF.groupby('id3', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum', 'v3':'mean'})",
+    "mean v1:v3 by id4" = "DF.groupby('id4', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})",
+    "sum v1:v3 by id6" = "DF.groupby('id6', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})",
+    "median v3 sd v3 by id4 id5" = "DF.groupby(['id4','id5'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3': ['median','std']})",
+    "max v1 - min v2 by id3" = "DF.groupby('id3', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'max', 'v2':'min'}).assign(range_v1_v2=lambda x: x['v1']-x['v2'])[['id3','range_v1_v2']]",
+    "largest two v3 by id6" = "DF[~DF['v3'].isna()][['id6','v3']].sort_values('v3', ascending=False).groupby('id6', as_index=False, sort=False, observed=True, dropna=False).head(2)",
+    "regression v1 v2 by id2 id4" = "DF[['id2','id4','v1','v2']].groupby(['id2','id4'], as_index=False, sort=False, observed=True, dropna=False).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}))",
+    "sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3':'sum', 'v1':'size'})"
+  )},
   "pydatatable" = {c(
     "sum v1 by id1" = "DT[:, {'v1': sum(f.v1)}, by(f.id1)]",
     "sum v1 by id1:id2" = "DT[:, {'v1': sum(f.v1)}, by(f.id1, f.id2)]",
@@ -239,6 +253,7 @@ groupby.query.exceptions = {list(
   "data.table" =  list(),
   "dplyr" =       list(),
   "pandas" =      list(),
+  "modin" =      list(),
   "pydatatable" = list(),
   "spark" =       list("not yet implemented: SPARK-26589" = "median v3 sd v3 by id4 id5"),
   "dask" =        list("not yet implemented: dask#4362" = "median v3 sd v3 by id4 id5"),
@@ -265,6 +280,10 @@ groupby.data.exceptions = {list(
   "pandas" = {list(
     "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") # read_csv #9
   )},
+  # TODO: fix later
+  "modin" = {list(
+    "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") # read_csv #9
+  )},
   "pydatatable" = {list(
     "csv reader NAs bug: datatable#2808" = c("G1_1e9_1e2_5_0")
   )},
@@ -362,6 +381,14 @@ join.syntax.dict = {list(
     "medium inner on factor" = "DF.merge(medium, on='id5')",
     "big inner on int" = "DF.merge(big, on='id3')"
   )},
+  # TODO: update later
+  "modin" = {c(
+    "small inner on int" = "DF.merge(small, on='id1')",
+    "medium inner on int" = "DF.merge(medium, on='id2')",
+    "medium outer on int" = "DF.merge(medium, how='left', on='id2')",
+    "medium inner on factor" = "DF.merge(medium, on='id5')",
+    "big inner on int" = "DF.merge(big, on='id3')"
+  )},
   "pydatatable" = {c(
     "small inner on int" = "y.key = 'id1'; DT[:, :, join(y)][isfinite(f.v2), :]",
     "medium inner on int" = "y.key = 'id2'; DT[:, :, join(y)][isfinite(f.v2), :]",
@@ -423,6 +450,7 @@ join.query.exceptions = {list(
   "data.table" =  list(),
   "dplyr" =       list(),
   "pandas" =      list(),
+  "modin" =      list(),
   "pydatatable" = list(),
   "spark" =       list(),
   "dask" =        list(),
@@ -445,6 +473,10 @@ join.data.exceptions = {list(
   "pandas" = {list(
     "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")                  # read_csv
   )},
+  # TODO: update later
+  "modin" = {list(
+    "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")                  # read_csv
+  )},
   "pydatatable" = {list(
     "csv reader NAs bug: datatable#2808" = "J1_1e9_NA_5_0",
     "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_0_1")                                  # q5 out of memory due to a deep copy
diff --git a/_control/solutions.csv b/_control/solutions.csv
index ac996de0..1df22a18 100644
--- a/_control/solutions.csv
+++ b/_control/solutions.csv
@@ -8,6 +8,8 @@ dplyr,groupby2014
 pandas,groupby
 pandas,join
 pandas,groupby2014
+modin,groupby
+modin,join
 pydatatable,groupby
 pydatatable,join
 spark,groupby
diff --git a/_launcher/solution.R b/_launcher/solution.R
index c419a2c5..4b400d57 100755
--- a/_launcher/solution.R
+++ b/_launcher/solution.R
@@ -111,7 +111,7 @@ file.ext = function(x) {
   ans = switch(
     x,
     "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
-    "pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
+    "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py",
     "clickhouse"="sql",
     "juliadf"="jl", "juliads"="jl"
   )
diff --git a/run.conf b/run.conf
index 5347df80..06e26757 100644
--- a/run.conf
+++ b/run.conf
@@ -1,7 +1,7 @@
 # task, used in init-setup-iteration.R
 export RUN_TASKS="groupby"
 # solution, used in init-setup-iteration.R
-export RUN_SOLUTIONS="data.table juliads dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb datafusion"
+export RUN_SOLUTIONS="data.table juliads dplyr pandas modin pydatatable spark dask clickhouse polars arrow duckdb datafusion"
 
  # juliadf clickhouse"
 

From 1943eed6412dd3f9d194417a54e1a2fac0933be7 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Mon, 2 Oct 2023 13:14:29 +0000
Subject: [PATCH 02/23] Port groupby-modin.py from Ray to the HDK backend

---
 modin/groupby-modin.py | 164 +++++++++++++++++++++++++++++------------
 1 file changed, 117 insertions(+), 47 deletions(-)

diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py
index d4e45a79..dd568214 100755
--- a/modin/groupby-modin.py
+++ b/modin/groupby-modin.py
@@ -3,20 +3,78 @@
 print("# groupby-modin.py", flush=True)
 
 import os
+
+os.environ["MODIN_ENGINE"] = "native"
+os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
+os.environ["MODIN_EXPERIMENTAL"] = "True"
+# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000"
+os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "4000000"
+print("Pandas backend: Modin on HDK")
+
 import gc
 import timeit
 import modin as modin
 import modin.pandas as pd
-import ray
+
+import pyhdk
+pyhdk.init()
+
+def init_modin_on_hdk(pd):
+    from modin.experimental.sql import query
+
+    # Calcite initialization
+    data = {"a": [1, 2, 3]}
+    df = pd.DataFrame(data)
+    query("SELECT * FROM df", df=df)
+
+
+init_modin_on_hdk(pd)
+gb_params = dict(as_index=False, sort=False, observed=True)
+
+
+def trigger_import(df: pd.DataFrame):
+    """
+    Trigger import execution for DataFrame obtained by HDK engine.
+    Parameters
+    ----------
+    df : DataFrame
+        DataFrame for trigger import.
+    """
+    modin_frame = df._query_compiler._modin_frame
+    if hasattr(modin_frame, "force_import"):
+        modin_frame.force_import()
+        return
+
+    # The code below has been kept for backwards compatibility and will be removed in the future.
+
+    from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import (
+        DbWorker,
+    )
+
+    df.shape  # to trigger real execution
+
+    p = modin_frame._partitions[0][0]
+    if (
+        p.frame_id is None
+        and modin_frame._has_arrow_table()
+        and not isinstance(table := p.get(), pd.DataFrame)
+    ):
+        p.frame_id = DbWorker().import_arrow_table(table)  # to trigger real execution
+
+
+def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
+    if trigger_hdk_import:
+        trigger_import(df)
+    else:
+        df._query_compiler._modin_frame._execute()
+    return df
 
 exec(open("./_helpers/helpers.py").read())
 
 ver = modin.__version__
 
-ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}})
-warnings.filterwarnings('ignore')
 
-os.environ["MODIN_ENGINE"] = "ray"
+# warnings.filterwarnings('ignore')
 
 git = ""
 task = "groupby"
@@ -29,8 +87,11 @@
 src_grp = os.path.join("data", data_name+".csv")
 print("loading dataset %s" % data_name, flush=True)
 
-x = pd.read_csv(src_grp, dtype={'id1':'category', 'id2':'category', 'id3':'category'})
+x = pd.read_csv(src_grp, dtype={'id1':'category', 'id2':'category', 'id3':'category',
+    **{n: "int32" for n in ["id4", "id5", "id6", "v1", "v2"]},
+    "v3": "float64",})
 print(len(x.index), flush=True)
+execute(x, trigger_hdk_import=True)
 
 task_init = timeit.default_timer()
 print("grouping...", flush=True)
@@ -38,8 +99,8 @@
 question = "sum v1 by id1" # q1
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1'], observed=True).agg({'v1':'sum'})
-ans.reset_index(inplace=True) # #68
+ans = x.groupby(['id1'], **gb_params).agg({'v1':'sum'})
+# ans.reset_index(inplace=True) # #68
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -50,8 +111,8 @@
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1'], observed=True).agg({'v1':'sum'})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id1'], **gb_params).agg({'v1':'sum'})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -66,8 +127,8 @@
 question = "sum v1 by id1:id2" # q2
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1','id2'], observed=True).agg({'v1':'sum'})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -78,8 +139,8 @@
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1','id2'], observed=True).agg({'v1':'sum'})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -94,8 +155,8 @@
 question = "sum v1 mean v3 by id3" # q3
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id3'], observed=True).agg({'v1':'sum', 'v3':'mean'})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -106,8 +167,8 @@
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id3'], observed=True).agg({'v1':'sum', 'v3':'mean'})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -122,8 +183,8 @@
 question = "mean v1:v3 by id4" # q4
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id4'], observed=True).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -134,8 +195,8 @@
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id4'], observed=True).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -150,8 +211,8 @@
 question = "sum v1:v3 by id6" # q5
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id6'], observed=True).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -162,8 +223,8 @@
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id6'], observed=True).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -178,36 +239,37 @@
 question = "median v3 sd v3 by id4 id5" # q6
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id4','id5'], observed=True).agg({'v3': ['median','std']})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v3']['median'].sum(), ans['v3']['std'].sum()]
+chk = [ans[('v3', 'median')].sum(), ans[('v3', 'std')].sum()]
 chkt = timeit.default_timer() - t_start
 write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id4','id5'], observed=True).agg({'v3': ['median','std']})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v3']['median'].sum(), ans['v3']['std'].sum()]
+chk = [ans[('v3', 'median')].sum(), ans[('v3', 'std')].sum()]
 chkt = timeit.default_timer() - t_start
 write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
+# TODO: change impl
 question = "max v1 - min v2 by id3" # q7
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id3'], observed=True).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']]
-ans.reset_index(inplace=True)
+ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['id3', 'range_v1_v2']]
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -218,8 +280,8 @@
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id3'], observed=True).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']]
-ans.reset_index(inplace=True)
+ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['id3', 'range_v1_v2']]
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -231,11 +293,12 @@
 print(ans.tail(3), flush=True)
 del ans
 
+# TODO: change impl
 question = "largest two v3 by id6" # q8
 gc.collect()
 t_start = timeit.default_timer()
-ans = x[['id6','v3']].sort_values('v3', ascending=False).groupby(['id6'], observed=True).head(2)
-ans.reset_index(drop=True, inplace=True)
+ans = x.groupby('id6')['v3'].nlargest(2).reset_index()[['id6', 'v3']]
+# ans.reset_index(drop=True, inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -246,8 +309,8 @@
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x[['id6','v3']].sort_values('v3', ascending=False).groupby(['id6'], observed=True).head(2)
-ans.reset_index(drop=True, inplace=True)
+ans = x.groupby('id6')['v3'].nlargest(2).reset_index()[['id6', 'v3']]
+# ans.reset_index(drop=True, inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -259,12 +322,19 @@
 print(ans.tail(3), flush=True)
 del ans
 
+# TODO: change impl
 question = "regression v1 v2 by id2 id4" # q9
 #ans = x[['id2','id4','v1','v2']].groupby(['id2','id4']).corr().iloc[0::2][['v2']]**2 # slower, 76s vs 47s on 1e8 1e2
 gc.collect()
 t_start = timeit.default_timer()
-ans = x[['id2','id4','v1','v2']].groupby(['id2','id4'], observed=True).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}))
-ans.reset_index(inplace=True)
+from modin.experimental.sql import query
+sql = """
+SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2
+FROM df
+GROUP BY id2, id4;
+"""
+ans = query(sql, df=x)
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -275,8 +345,8 @@
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x[['id2','id4','v1','v2']].groupby(['id2','id4'], observed=True).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}))
-ans.reset_index(inplace=True)
+ans = query(sql, df=x)
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -291,8 +361,8 @@
 question = "sum v3 count by id1:id6" # q10
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1','id2','id3','id4','id5','id6'], observed=True).agg({'v3':'sum', 'v1':'count'})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -303,8 +373,8 @@
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1','id2','id3','id4','id5','id6'], observed=True).agg({'v3':'sum', 'v1':'count'})
-ans.reset_index(inplace=True)
+ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'})
+# ans.reset_index(inplace=True)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()

From fad2249116ea673882a2bffa38958ec2df57ae63 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Mon, 2 Oct 2023 13:15:00 +0000
Subject: [PATCH 03/23] Removed FRAGMENT SIZE hardcode

---
 modin/groupby-modin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py
index dd568214..5a6f3db6 100755
--- a/modin/groupby-modin.py
+++ b/modin/groupby-modin.py
@@ -8,7 +8,7 @@
 os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
 os.environ["MODIN_EXPERIMENTAL"] = "True"
 # os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000"
-os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "4000000"
+# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "4000000"
 print("Pandas backend: Modin on HDK")
 
 import gc

From 826c93245990b7ed710c40aa5799f84c510a6f52 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Mon, 2 Oct 2023 14:08:46 +0000
Subject: [PATCH 04/23] Rewrite join-modin.py for the HDK backend and the five join questions

---
 modin/join-modin.py | 232 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 206 insertions(+), 26 deletions(-)

diff --git a/modin/join-modin.py b/modin/join-modin.py
index b7c69650..38a8eb3b 100755
--- a/modin/join-modin.py
+++ b/modin/join-modin.py
@@ -5,65 +5,245 @@
 import os
 import gc
 import timeit
+
+os.environ["MODIN_ENGINE"] = "native"
+os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
+os.environ["MODIN_EXPERIMENTAL"] = "True"
+# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000"
+# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "4000000"
+print("Pandas backend: Modin on HDK")
+
+
+import modin
 import modin.pandas as pd
 
-exec(open("./helpers.py").read())
 
-src_x = os.environ['SRC_X_LOCAL']
-src_y = os.environ['SRC_Y_LOCAL']
+import pyhdk
+pyhdk.init()
+
+def init_modin_on_hdk(pd):
+    from modin.experimental.sql import query
+
+    # Calcite initialization
+    data = {"a": [1, 2, 3]}
+    df = pd.DataFrame(data)
+    query("SELECT * FROM df", df=df)
+
+
+init_modin_on_hdk(pd)
+
+
+def trigger_import(df: pd.DataFrame):
+    """
+    Trigger import execution for DataFrame obtained by HDK engine.
+    Parameters
+    ----------
+    df : DataFrame
+        DataFrame for trigger import.
+    """
+    modin_frame = df._query_compiler._modin_frame
+    if hasattr(modin_frame, "force_import"):
+        modin_frame.force_import()
+        return
+
+    # The code below has been kept for backwards compatibility and will be removed in the future.
 
-ver = "" #pd.__version__
+    from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import (
+        DbWorker,
+    )
+
+    df.shape  # to trigger real execution
+
+    p = modin_frame._partitions[0][0]
+    if (
+        p.frame_id is None
+        and modin_frame._has_arrow_table()
+        and not isinstance(table := p.get(), pd.DataFrame)
+    ):
+        p.frame_id = DbWorker().import_arrow_table(table)  # to trigger real execution
+
+
+def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
+    if trigger_hdk_import:
+        trigger_import(df)
+    else:
+        df._query_compiler._modin_frame._execute()
+    return df
+
+exec(open("./_helpers/helpers.py").read())
+
+ver = modin.__version__
 git = ""
 task = "join"
-question = "inner join"
-l = [os.path.basename(src_x), os.path.basename(src_y)]
-data_name = '-'.join(l)
 solution = "modin"
-fun = "merge"
+fun = ".merge"
 cache = "TRUE"
+on_disk = "FALSE"
+
+
+data_name = os.environ['SRC_DATANAME']
+src_jn_x = os.path.join("data", data_name+".csv")
+y_data_name = join_to_tbls(data_name)
+src_jn_y = [os.path.join("data", y_data_name[0]+".csv"), os.path.join("data", y_data_name[1]+".csv"), os.path.join("data", y_data_name[2]+".csv")]
+if len(src_jn_y) != 3:
+  raise Exception("Something went wrong in preparing files used for join")
+
 
-print("loading datasets...")
+print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2], flush=True)
 
-x = pd.read_csv(os.path.basename(src_x))
-y = pd.read_csv(os.path.basename(src_y))
+x = pd.read_csv(src_jn_x, dtype={
+                **{n: "int32" for n in ["id1", "id2", "id3"]},
+                **{n: "category" for n in ["id4", "id5", "id6"]},
+                "v1": "float64",
+            })
 
-print("joining...")
+small = pd.read_csv(src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"})
+medium = pd.read_csv(src_jn_y[1], dtype={
+                **{n: "int32" for n in ["id1", "id2"]},
+                **{n: "category" for n in ["id4", "id5"]},
+                "v2": "float64",
+            })
+big = pd.read_csv(src_jn_y[2], dtype={
+                **{n: "int32" for n in ["id1", "id2", "id3"]},
+                **{n: "category" for n in ["id4", "id5", "id6"]},
+                "v2": "float64",
+            },)
 
-# NotImplementedError: To contribute to Pandas on Ray, please visit github.com/modin-project/modin
+[execute(df, trigger_hdk_import=True) for df in [x, small, medium, big]]
+
+task_init = timeit.default_timer()
+print("joining...", flush=True)
+
+question = "small inner on int" # q1
+gc.collect()
+t_start = timeit.default_timer()
+ans = x.merge(small, on='id1')
+print(ans.shape, flush=True)
+t = timeit.default_timer() - t_start
+m = memory_usage()
+t_start = timeit.default_timer()
+chk = [ans['v1'].sum(), ans['v2'].sum()]
+chkt = timeit.default_timer() - t_start
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(y, how='inner', on='KEY')
-print(ans.shape)
+ans = x.merge(small, on='id1')
+print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['X2'].sum(), ans['Y2'].sum()]
+chk = [ans['v1'].sum(), ans['v2'].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+print(ans.head(3), flush=True)
+print(ans.tail(3), flush=True)
 del ans
 
+question = "medium inner on int" # q2
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(y, how='inner', on='KEY')
-print(ans.shape)
+ans = x.merge(medium, on='id2')
+print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['X2'].sum(), ans['Y2'].sum()]
+chk = [ans['v1'].sum(), ans['v2'].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+t_start = timeit.default_timer()
+ans = x.merge(medium, on='id2')
+print(ans.shape, flush=True)
+t = timeit.default_timer() - t_start
+m = memory_usage()
+t_start = timeit.default_timer()
+chk = [ans['v1'].sum(), ans['v2'].sum()]
+chkt = timeit.default_timer() - t_start
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+print(ans.head(3), flush=True)
+print(ans.tail(3), flush=True)
 del ans
 
+question = "medium outer on int" # q3
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(y, how='inner', on='KEY')
-print(ans.shape)
+ans = x.merge(medium, how='left', on='id2')
+print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['X2'].sum(), ans['Y2'].sum()]
+chk = [ans['v1'].sum(), ans['v2'].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
+gc.collect()
+t_start = timeit.default_timer()
+ans = x.merge(medium, how='left', on='id2')
+print(ans.shape, flush=True)
+t = timeit.default_timer() - t_start
+m = memory_usage()
+t_start = timeit.default_timer()
+chk = [ans['v1'].sum(), ans['v2'].sum()]
+chkt = timeit.default_timer() - t_start
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+print(ans.head(3), flush=True)
+print(ans.tail(3), flush=True)
+del ans
+
+question = "medium inner on factor" # q4
+gc.collect()
+t_start = timeit.default_timer()
+ans = x.merge(medium, on='id5')
+print(ans.shape, flush=True)
+t = timeit.default_timer() - t_start
+m = memory_usage()
+t_start = timeit.default_timer()
+chk = [ans['v1'].sum(), ans['v2'].sum()]
+chkt = timeit.default_timer() - t_start
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+t_start = timeit.default_timer()
+ans = x.merge(medium, on='id5')
+print(ans.shape, flush=True)
+t = timeit.default_timer() - t_start
+m = memory_usage()
+t_start = timeit.default_timer()
+chk = [ans['v1'].sum(), ans['v2'].sum()]
+chkt = timeit.default_timer() - t_start
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+print(ans.head(3), flush=True)
+print(ans.tail(3), flush=True)
+del ans
+
+question = "big inner on int" # q5
+gc.collect()
+t_start = timeit.default_timer()
+ans = x.merge(big, on='id3')
+print(ans.shape, flush=True)
+t = timeit.default_timer() - t_start
+m = memory_usage()
+t_start = timeit.default_timer()
+chk = [ans['v1'].sum(), ans['v2'].sum()]
+chkt = timeit.default_timer() - t_start
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+del ans
+gc.collect()
+t_start = timeit.default_timer()
+ans = x.merge(big, on='id3')
+print(ans.shape, flush=True)
+t = timeit.default_timer() - t_start
+m = memory_usage()
+t_start = timeit.default_timer()
+chk = [ans['v1'].sum(), ans['v2'].sum()]
+chkt = timeit.default_timer() - t_start
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+print(ans.head(3), flush=True)
+print(ans.tail(3), flush=True)
+del ans
+
+print("joining finished, took %0.fs" % (timeit.default_timer()-task_init), flush=True)
 
-exit(0)
+exit(0)
\ No newline at end of file

From 6ed39e2f5259f6f689c715f7d44e8b23cdf22dfa Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Tue, 10 Oct 2023 13:31:41 +0000
Subject: [PATCH 05/23] Move shared Modin-on-HDK setup into modin/modin-helpers.py; map hdk to .py in launchers

---
 _launcher/launcher.R   |   2 +-
 _launcher/solution.R   |   2 +-
 modin/groupby-modin.py | 116 +++++++++++------------------------------
 modin/join-modin.py    |  72 +++++--------------------
 modin/modin-helpers.py |  52 ++++++++++++++++++
 5 files changed, 96 insertions(+), 148 deletions(-)
 create mode 100644 modin/modin-helpers.py

diff --git a/_launcher/launcher.R b/_launcher/launcher.R
index 2f4b07d2..a152075e 100644
--- a/_launcher/launcher.R
+++ b/_launcher/launcher.R
@@ -15,7 +15,7 @@ file.ext = function(x) {
   ans = switch(
     x,
     "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
-    "pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
+    "pandas"=, "spark"=, "pydatatable"=, "modin"="py", "hdk"="py", "dask"=, "datafusion"=, "polars"="py",
     "clickhouse"="sql",
     "juliadf"="jl", "juliads"="jl",
   )
diff --git a/_launcher/solution.R b/_launcher/solution.R
index 4b400d57..bb469d0c 100755
--- a/_launcher/solution.R
+++ b/_launcher/solution.R
@@ -111,7 +111,7 @@ file.ext = function(x) {
   ans = switch(
     x,
     "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
-    "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py",
+    "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "hdk"="py", "dask"=, "datafusion"=, "polars"="py",
     "clickhouse"="sql",
     "juliadf"="jl", "juliads"="jl"
   )
diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py
index 5a6f3db6..752043d5 100755
--- a/modin/groupby-modin.py
+++ b/modin/groupby-modin.py
@@ -2,83 +2,22 @@
 
 print("# groupby-modin.py", flush=True)
 
-import os
-
-os.environ["MODIN_ENGINE"] = "native"
-os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
-os.environ["MODIN_EXPERIMENTAL"] = "True"
-# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000"
-# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "4000000"
-print("Pandas backend: Modin on HDK")
-
 import gc
 import timeit
-import modin as modin
-import modin.pandas as pd
-
-import pyhdk
-pyhdk.init()
-
-def init_modin_on_hdk(pd):
-    from modin.experimental.sql import query
 
-    # Calcite initialization
-    data = {"a": [1, 2, 3]}
-    df = pd.DataFrame(data)
-    query("SELECT * FROM df", df=df)
+exec(open("./modin/modin-helpers.py").read())
 
+import modin as modin
+import modin.pandas as pd
 
 init_modin_on_hdk(pd)
-gb_params = dict(as_index=False, sort=False, observed=True)
-
-
-def trigger_import(df: pd.DataFrame):
-    """
-    Trigger import execution for DataFrame obtained by HDK engine.
-    Parameters
-    ----------
-    df : DataFrame
-        DataFrame for trigger import.
-    """
-    modin_frame = df._query_compiler._modin_frame
-    if hasattr(modin_frame, "force_import"):
-        modin_frame.force_import()
-        return
-
-    # The code below has been kept for backwards compatibility and will be removed in the future.
-
-    from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import (
-        DbWorker,
-    )
-
-    df.shape  # to trigger real execution
-
-    p = modin_frame._partitions[0][0]
-    if (
-        p.frame_id is None
-        and modin_frame._has_arrow_table()
-        and not isinstance(table := p.get(), pd.DataFrame)
-    ):
-        p.frame_id = DbWorker().import_arrow_table(table)  # to trigger real execution
-
-
-def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
-    if trigger_hdk_import:
-        trigger_import(df)
-    else:
-        df._query_compiler._modin_frame._execute()
-    return df
 
 exec(open("./_helpers/helpers.py").read())
 
 ver = modin.__version__
-
-
-# warnings.filterwarnings('ignore')
-
 git = ""
 task = "groupby"
-solution = "modin"
+solution = solution_txt
 fun = ".groupby"
 cache = "TRUE"
 on_disk = "FALSE"
@@ -91,8 +30,11 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
     **{n: "int32" for n in ["id4", "id5", "id6", "v1", "v2"]},
     "v3": "float64",})
 print(len(x.index), flush=True)
+# To trigger non-lazy loading
 execute(x, trigger_hdk_import=True)
 
+gb_params = dict(as_index=False, sort=False, observed=True)
+
 task_init = timeit.default_timer()
 print("grouping...", flush=True)
 
@@ -100,7 +42,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id1'], **gb_params).agg({'v1':'sum'})
-# ans.reset_index(inplace=True) # #68
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -112,7 +54,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id1'], **gb_params).agg({'v1':'sum'})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -128,7 +70,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -140,7 +82,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -156,7 +98,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -168,7 +110,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -184,7 +126,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -196,7 +138,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -212,7 +154,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -224,7 +166,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -240,7 +182,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -252,7 +194,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -269,7 +211,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']]
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -281,7 +223,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']]
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -297,8 +239,8 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 question = "largest two v3 by id6" # q8
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby('id6')['v3'].nlargest(2).reset_index()[['id6', 'v3']]
-# ans.reset_index(drop=True, inplace=True)
+ans = x.groupby('id6', sort=False, observed=True)['v3'].nlargest(2).reset_index()[['id6', 'v3']]
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -309,8 +251,8 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby('id6')['v3'].nlargest(2).reset_index()[['id6', 'v3']]
-# ans.reset_index(drop=True, inplace=True)
+ans = x.groupby('id6', sort=False, observed=True)['v3'].nlargest(2).reset_index()[['id6', 'v3']]
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -334,7 +276,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 GROUP BY id2, id4;
 """
 ans = query(sql, df=x)
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -346,7 +288,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = query(sql, df=x)
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -362,7 +304,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -374,7 +316,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'})
-# ans.reset_index(inplace=True)
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
diff --git a/modin/join-modin.py b/modin/join-modin.py
index 38a8eb3b..f9b10d31 100755
--- a/modin/join-modin.py
+++ b/modin/join-modin.py
@@ -6,70 +6,13 @@
 import gc
 import timeit
 
-os.environ["MODIN_ENGINE"] = "native"
-os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
-os.environ["MODIN_EXPERIMENTAL"] = "True"
-# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000"
-# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "4000000"
-print("Pandas backend: Modin on HDK")
+exec(open("./modin/modin-helpers.py").read())
 
-
-import modin
+import modin as modin
 import modin.pandas as pd
 
-
-import pyhdk
-pyhdk.init()
-
-def init_modin_on_hdk(pd):
-    from modin.experimental.sql import query
-
-    # Calcite initialization
-    data = {"a": [1, 2, 3]}
-    df = pd.DataFrame(data)
-    query("SELECT * FROM df", df=df)
-
-
 init_modin_on_hdk(pd)
 
-
-def trigger_import(df: pd.DataFrame):
-    """
-    Trigger import execution for DataFrame obtained by HDK engine.
-    Parameters
-    ----------
-    df : DataFrame
-        DataFrame for trigger import.
-    """
-    modin_frame = df._query_compiler._modin_frame
-    if hasattr(modin_frame, "force_import"):
-        modin_frame.force_import()
-        return
-
-    # The code below has been kept for backwards compatibility and will be removed in the future.
-
-    from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import (
-        DbWorker,
-    )
-
-    df.shape  # to trigger real execution
-
-    p = modin_frame._partitions[0][0]
-    if (
-        p.frame_id is None
-        and modin_frame._has_arrow_table()
-        and not isinstance(table := p.get(), pd.DataFrame)
-    ):
-        p.frame_id = DbWorker().import_arrow_table(table)  # to trigger real execution
-
-
-def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
-    if trigger_hdk_import:
-        trigger_import(df)
-    else:
-        df._query_compiler._modin_frame._execute()
-    return df
-
 exec(open("./_helpers/helpers.py").read())
 
 ver = modin.__version__
@@ -109,6 +52,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
                 "v2": "float64",
             },)
 
+# To trigger non-lazy loading
 [execute(df, trigger_hdk_import=True) for df in [x, small, medium, big]]
 
 task_init = timeit.default_timer()
@@ -118,6 +62,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.merge(small, on='id1')
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -129,6 +74,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.merge(small, on='id1')
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -144,6 +90,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.merge(medium, on='id2')
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -155,6 +102,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.merge(medium, on='id2')
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -170,6 +118,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.merge(medium, how='left', on='id2')
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -181,6 +130,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.merge(medium, how='left', on='id2')
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -196,6 +146,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.merge(medium, on='id5')
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -207,6 +158,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.merge(medium, on='id5')
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -222,6 +174,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.merge(big, on='id3')
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
@@ -233,6 +186,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False):
 gc.collect()
 t_start = timeit.default_timer()
 ans = x.merge(big, on='id3')
+execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
diff --git a/modin/modin-helpers.py b/modin/modin-helpers.py
new file mode 100644
index 00000000..5154e66e
--- /dev/null
+++ b/modin/modin-helpers.py
@@ -0,0 +1,52 @@
+import os
+
+# Run configuration
+os.environ["MODIN_CPUS"] = "40"
+
+do_execute=True
+# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000"
+solution_txt = (
+    f"modin_{os.environ.get('MODIN_HDK_FRAGMENT_SIZE', 'nfs')}_" +
+    ("exec" if do_execute else "noexec")
+)
+
+# Set up HDK backend
+os.environ["MODIN_ENGINE"] = "native"
+os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
+os.environ["MODIN_EXPERIMENTAL"] = "True"
+
+import pyhdk
+pyhdk.init()
+# pyhdk.init(enable_non_lazy_data_import=True)
+
+
+def init_modin_on_hdk(pd):
+    from modin.experimental.sql import query
+
+    # Calcite initialization
+    data = {"a": [1, 2, 3]}
+    df = pd.DataFrame(data)
+    query("SELECT * FROM df", df=df)
+
+
+def execute(df, *, trigger_hdk_import: bool = False):
+    if trigger_hdk_import:
+        trigger_import(df)
+    else:
+        if do_execute:
+            df._query_compiler._modin_frame._execute()
+    return df
+
+
+def trigger_import(df):
+    """
+    Trigger import execution for DataFrame obtained by HDK engine.
+    Parameters
+    ----------
+    df : DataFrame
+        DataFrame for trigger import.
+    """
+    modin_frame = df._query_compiler._modin_frame
+    if hasattr(modin_frame, "force_import"):
+        modin_frame.force_import()
+        return

From c36ee558f80e05331a41282e5bf8d4a2c5874c8c Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Thu, 12 Oct 2023 12:02:50 +0000
Subject: [PATCH 06/23] modin: install modin-hdk with miniconda instead of pip

---
 modin/setup-modin.sh | 18 +++++++++++-------
 modin/upg-modin.sh   |  4 ++--
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/modin/setup-modin.sh b/modin/setup-modin.sh
index 4ef46d87..30ef1d75 100755
--- a/modin/setup-modin.sh
+++ b/modin/setup-modin.sh
@@ -1,16 +1,20 @@
 #!/bin/bash
 set -e
 
-virtualenv modin/py-modin --python=python3
-source modin/py-modin/bin/activate
+curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    sh install_miniconda.sh -u -b -p ./modin/miniconda && \
+    rm -f install_miniconda.sh
+
+source ./modin/miniconda/bin/activate
+conda install -y conda-libmamba-solver
 
 # install binaries
-python3 -m pip install --upgrade modin[all]
+conda create -y --prefix ./modin/py-modin -c conda-forge python=3.10 --experimental-solver=libmamba
+conda install -y -p ./modin/py-modin -c conda-forge modin-hdk --experimental-solver=libmamba
+
+conda activate modin/py-modin
 
 # check
-python3
-import modin
-modin.__version__
-quit()
+conda run -p modin/py-modin python3 -c "import modin; print(modin.__version__)"
 
 deactivate
diff --git a/modin/upg-modin.sh b/modin/upg-modin.sh
index 80ca5591..f0ed4093 100755
--- a/modin/upg-modin.sh
+++ b/modin/upg-modin.sh
@@ -3,6 +3,6 @@ set -e
 
 echo 'upgrading modin...'
 
-source ./modin/py-modin/bin/activate
+source ./modin/miniconda/bin/activate
 
-python -m pip install --upgrade modin[all] > /dev/null
+conda update modin-hdk -p ./modin/py-modin -y

From da91c4813cc2f052c7515746a9f2c5c19a43f9bb Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Thu, 12 Oct 2023 13:47:39 +0000
Subject: [PATCH 07/23] ignore modin conda dirs; move hdk entry in file.ext

---
 .gitignore           | 2 ++
 _launcher/launcher.R | 3 ++-
 _launcher/solution.R | 5 +++--
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 18572a17..294f3b82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,8 @@ metastore_db/*
 *.md5
 .Rproj.user
 .Rhistory
+py-modin
+miniconda
 db-benchmark.Rproj
 */REVISION
 */VERSION
diff --git a/_launcher/launcher.R b/_launcher/launcher.R
index a152075e..d3d2b53e 100644
--- a/_launcher/launcher.R
+++ b/_launcher/launcher.R
@@ -15,9 +15,10 @@ file.ext = function(x) {
   ans = switch(
     x,
     "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
-    "pandas"=, "spark"=, "pydatatable"=, "modin"="py", "hdk"="py", "dask"=, "datafusion"=, "polars"="py",
+    "pandas"=, "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py",
     "clickhouse"="sql",
     "juliadf"="jl", "juliads"="jl",
+    "hdk"="py"
   )
   if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x))
   ans
diff --git a/_launcher/solution.R b/_launcher/solution.R
index bb469d0c..af0e161b 100755
--- a/_launcher/solution.R
+++ b/_launcher/solution.R
@@ -111,9 +111,10 @@ file.ext = function(x) {
   ans = switch(
     x,
     "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
-    "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "hdk"="py", "dask"=, "datafusion"=, "polars"="py",
+    "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py",
     "clickhouse"="sql",
-    "juliadf"="jl", "juliads"="jl"
+    "juliadf"="jl", "juliads"="jl",
+    "hdk"="py"
   )
   if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x))
   ans

From 7405a9585edf6f40274a76f6dd494ae64b1a85b3 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Thu, 12 Oct 2023 14:06:34 +0000
Subject: [PATCH 08/23] modin: refresh benchplot dict, inline q9 SQL, simplify setup

---
 _benchplot/benchplot-dict.R | 14 +++++---------
 modin/groupby-modin.py      |  9 ++-------
 modin/setup-modin.sh        | 17 +++++++----------
 modin/upg-modin.sh          |  5 ++---
 4 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/_benchplot/benchplot-dict.R b/_benchplot/benchplot-dict.R
index 5242436b..33f6643a 100644
--- a/_benchplot/benchplot-dict.R
+++ b/_benchplot/benchplot-dict.R
@@ -34,7 +34,7 @@ solution.dict = {list(
   "data.table" = list(name=c(short="data.table", long="data.table"), color=c(strong="blue", light="#7777FF")),
   "dplyr" = list(name=c(short="dplyr", long="dplyr"), color=c(strong="red", light="#FF7777")),
   "pandas" = list(name=c(short="pandas", long="pandas"), color=c(strong="green4", light="#77FF77")),
-  "modin" = list(name=c(short="pandas", long="pandas"), color=c(strong="green4", light="#77FF77")),
+  "modin" = list(name=c(short="modin", long="modin"), color=c(strong="blue4", light="#7799ff")),
   "pydatatable" = list(name=c(short="pydatatable", long="(py)datatable"), color=c(strong="darkorange", light="orange")),
   "spark" = list(name=c(short="spark", long="spark"), color=c(strong="#8000FFFF", light="#CC66FF")),
   "dask" = list(name=c(short="dask", long="dask"), color=c(strong="slategrey", light="lightgrey")),
@@ -103,7 +103,6 @@ groupby.syntax.dict = {list(
     "regression v1 v2 by id2 id4" = "DF[['id2','id4','v1','v2']].groupby(['id2','id4'], as_index=False, sort=False, observed=True, dropna=False).apply(lambda x: x['v1'].corr(x['v2'])**2).rename(columns={None: 'r2'})",
     "sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3':'sum', 'v1':'size'})"
   )},
-  # TODO: update later
   "modin" = {c(
     "sum v1 by id1" = "DF.groupby('id1', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum'})",
     "sum v1 by id1:id2" = "DF.groupby(['id1','id2'], as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum'})",
@@ -112,8 +111,8 @@ groupby.syntax.dict = {list(
     "sum v1:v3 by id6" = "DF.groupby('id6', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})",
     "median v3 sd v3 by id4 id5" = "DF.groupby(['id4','id5'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3': ['median','std']})",
     "max v1 - min v2 by id3" = "DF.groupby('id3', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'max', 'v2':'min'}).assign(range_v1_v2=lambda x: x['v1']-x['v2'])[['id3','range_v1_v2']]",
-    "largest two v3 by id6" = "DF[~DF['v3'].isna()][['id6','v3']].sort_values('v3', ascending=False).groupby('id6', as_index=False, sort=False, observed=True, dropna=False).head(2)",
-    "regression v1 v2 by id2 id4" = "DF[['id2','id4','v1','v2']].groupby(['id2','id4'], as_index=False, sort=False, observed=True, dropna=False).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}))",
+    "largest two v3 by id6" = "DF.groupby('id6', sort=False, observed=True)['v3'].nlargest(2).reset_index()[['id6', 'v3']]",
+    "regression v1 v2 by id2 id4" = "query('SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;', df=x)",
     "sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3':'sum', 'v1':'size'})"
   )},
   "pydatatable" = {c(
@@ -280,9 +279,8 @@ groupby.data.exceptions = {list(
   "pandas" = {list(
     "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") # read_csv #9
   )},
-  # TODO: fix later
   "modin" = {list(
-    "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") # read_csv #9
+    "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1")
   )},
   "pydatatable" = {list(
     "csv reader NAs bug: datatable#2808" = c("G1_1e9_1e2_5_0")
@@ -381,7 +379,6 @@ join.syntax.dict = {list(
     "medium inner on factor" = "DF.merge(medium, on='id5')",
     "big inner on int" = "DF.merge(big, on='id3')"
   )},
-  # TODO: update later
   "modin" = {c(
     "small inner on int" = "DF.merge(small, on='id1')",
     "medium inner on int" = "DF.merge(medium, on='id2')",
@@ -473,9 +470,8 @@ join.data.exceptions = {list(
   "pandas" = {list(
     "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")                  # read_csv
   )},
-  # TODO: update later
   "modin" = {list(
-    "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")                  # read_csv
+    "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
   )},
   "pydatatable" = {list(
     "csv reader NAs bug: datatable#2808" = "J1_1e9_NA_5_0",
diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py
index 752043d5..846c5e26 100755
--- a/modin/groupby-modin.py
+++ b/modin/groupby-modin.py
@@ -270,12 +270,7 @@
 gc.collect()
 t_start = timeit.default_timer()
 from modin.experimental.sql import query
-sql = """
-SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2
-FROM df
-GROUP BY id2, id4;
-"""
-ans = query(sql, df=x)
+ans = query('SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;', df=x)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
@@ -287,7 +282,7 @@
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = query(sql, df=x)
+ans = query('SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;', df=x)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
diff --git a/modin/setup-modin.sh b/modin/setup-modin.sh
index 30ef1d75..1eb9d56b 100755
--- a/modin/setup-modin.sh
+++ b/modin/setup-modin.sh
@@ -1,20 +1,17 @@
 #!/bin/bash
 set -e
 
-curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
-    sh install_miniconda.sh -u -b -p ./modin/miniconda && \
+curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh && \
+    sh install_miniconda.sh -u -b -p ./modin/py-modin && \
     rm -f install_miniconda.sh
 
-source ./modin/miniconda/bin/activate
-conda install -y conda-libmamba-solver
+source ./modin/py-modin/bin/activate
+conda install -y -c conda-forge conda-libmamba-solver
 
 # install binaries
-conda create -y --prefix ./modin/py-modin -c conda-forge python=3.10 --experimental-solver=libmamba
-conda install -y -p ./modin/py-modin -c conda-forge modin-hdk --experimental-solver=libmamba
-
-conda activate modin/py-modin
+conda install -y -c conda-forge modin-hdk --solver=libmamba
 
 # check
-conda run -p modin/py-modin python3 -c "import modin; print(modin.__version__)"
+python3 -c "import modin; print(modin.__version__)"
 
-deactivate
+conda deactivate
diff --git a/modin/upg-modin.sh b/modin/upg-modin.sh
index f0ed4093..7c6ae50d 100755
--- a/modin/upg-modin.sh
+++ b/modin/upg-modin.sh
@@ -3,6 +3,5 @@ set -e
 
 echo 'upgrading modin...'
 
-source ./modin/miniconda/bin/activate
-
-conda update modin-hdk -p ./modin/py-modin -y
+source ./modin/py-modin/bin/activate
+conda update modin-hdk -y -c conda-forge --solver=libmamba

From 01bdc14706adcf77c500f437ab22bc738dbb4b88 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Fri, 13 Oct 2023 13:54:09 +0000
Subject: [PATCH 09/23] solution.R: move hdk entry before julia in file.ext

---
 _launcher/solution.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_launcher/solution.R b/_launcher/solution.R
index af0e161b..3b233691 100755
--- a/_launcher/solution.R
+++ b/_launcher/solution.R
@@ -113,8 +113,8 @@ file.ext = function(x) {
     "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
     "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py",
     "clickhouse"="sql",
-    "juliadf"="jl", "juliads"="jl",
-    "hdk"="py"
+    "hdk"="py",
+    "juliadf"="jl", "juliads"="jl"
   )
   if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x))
   ans

From 379c099fd92431675be89506cc0bcaea5a598c97 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Fri, 13 Oct 2023 13:57:44 +0000
Subject: [PATCH 10/23] modin: reformat read_csv calls, report solution as modin

---
 modin/groupby-modin.py | 14 +++++++++-----
 modin/join-modin.py    | 24 ++++++++++++------------
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py
index 846c5e26..3846c7be 100755
--- a/modin/groupby-modin.py
+++ b/modin/groupby-modin.py
@@ -17,7 +17,7 @@
 ver = modin.__version__
 git = ""
 task = "groupby"
-solution = solution_txt
+solution = "modin"
 fun = ".groupby"
 cache = "TRUE"
 on_disk = "FALSE"
@@ -26,10 +26,14 @@
 src_grp = os.path.join("data", data_name+".csv")
 print("loading dataset %s" % data_name, flush=True)
 
-x = pd.read_csv(src_grp, dtype={'id1':'category', 'id2':'category', 'id3':'category',
-    **{n: "int32" for n in ["id4", "id5", "id6", "v1", "v2"]},
-    "v3": "float64",})
-print(len(x.index), flush=True)
+x = pd.read_csv(
+    src_grp,
+    dtype={
+        'id1':'category', 'id2':'category', 'id3':'category',
+        **{n: "int32" for n in ["id4", "id5", "id6", "v1", "v2"]},
+        "v3": "float64",
+    }
+)
 # To trigger non-lazy loading
 execute(x, trigger_hdk_import=True)
 
diff --git a/modin/join-modin.py b/modin/join-modin.py
index f9b10d31..7abbf252 100755
--- a/modin/join-modin.py
+++ b/modin/join-modin.py
@@ -35,22 +35,22 @@
 print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2], flush=True)
 
 x = pd.read_csv(src_jn_x, dtype={
-                **{n: "int32" for n in ["id1", "id2", "id3"]},
-                **{n: "category" for n in ["id4", "id5", "id6"]},
-                "v1": "float64",
-            })
+  **{n: "int32" for n in ["id1", "id2", "id3"]},
+  **{n: "category" for n in ["id4", "id5", "id6"]},
+  "v1": "float64",
+})
 
 small = pd.read_csv(src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"})
 medium = pd.read_csv(src_jn_y[1], dtype={
-                **{n: "int32" for n in ["id1", "id2"]},
-                **{n: "category" for n in ["id4", "id5"]},
-                "v2": "float64",
-            })
+  **{n: "int32" for n in ["id1", "id2"]},
+  **{n: "category" for n in ["id4", "id5"]},
+  "v2": "float64",
+})
 big = pd.read_csv(src_jn_y[2], dtype={
-                **{n: "int32" for n in ["id1", "id2", "id3"]},
-                **{n: "category" for n in ["id4", "id5", "id6"]},
-                "v2": "float64",
-            },)
+  **{n: "int32" for n in ["id1", "id2", "id3"]},
+  **{n: "category" for n in ["id4", "id5", "id6"]},
+  "v2": "float64",
+})
 
 # To trigger non-lazy loading
 [execute(df, trigger_hdk_import=True) for df in [x, small, medium, big]]

From 52d22baf5376f996b13cb523a432ade2e3d88636 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Mon, 16 Oct 2023 12:02:02 +0000
Subject: [PATCH 11/23] modin groupby: keep id3 in q7 result; drop hdk from launcher

---
 _launcher/launcher.R   | 1 -
 modin/groupby-modin.py | 7 ++-----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/_launcher/launcher.R b/_launcher/launcher.R
index d3d2b53e..fd58be9d 100644
--- a/_launcher/launcher.R
+++ b/_launcher/launcher.R
@@ -18,7 +18,6 @@ file.ext = function(x) {
     "pandas"=, "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py",
     "clickhouse"="sql",
     "juliadf"="jl", "juliads"="jl",
-    "hdk"="py"
   )
   if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x))
   ans
diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py
index 3846c7be..9b71d417 100755
--- a/modin/groupby-modin.py
+++ b/modin/groupby-modin.py
@@ -210,11 +210,10 @@
 print(ans.tail(3), flush=True)
 del ans
 
-# TODO: change impl
 question = "max v1 - min v2 by id3" # q7
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']]
+ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['id3', 'range_v1_v2']]
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
@@ -226,7 +225,7 @@
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']]
+ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['id3', 'range_v1_v2']]
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
@@ -239,7 +238,6 @@
 print(ans.tail(3), flush=True)
 del ans
 
-# TODO: change impl
 question = "largest two v3 by id6" # q8
 gc.collect()
 t_start = timeit.default_timer()
@@ -268,7 +266,6 @@
 print(ans.tail(3), flush=True)
 del ans
 
-# TODO: change impl
 question = "regression v1 v2 by id2 id4" # q9
 #ans = x[['id2','id4','v1','v2']].groupby(['id2','id4']).corr().iloc[0::2][['v2']]**2 # slower, 76s vs 47s on 1e8 1e2
 gc.collect()

From a1bbb3bc43812972172db44546a045ef969154da Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Fri, 27 Oct 2023 12:29:45 +0200
Subject: [PATCH 12/23] better name

---
 modin/join-modin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modin/join-modin.py b/modin/join-modin.py
index 7abbf252..e433999c 100755
--- a/modin/join-modin.py
+++ b/modin/join-modin.py
@@ -19,7 +19,7 @@
 git = ""
 task = "join"
 solution = "modin"
-fun = ".merge"
+fun = "merge"
 cache = "TRUE"
 on_disk = "FALSE"
 

From d51206de1c6b0ff267d856c4a00b28e9e6b51178 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Tue, 7 Nov 2023 09:47:53 +0100
Subject: [PATCH 13/23] Update to latest modin

---
 _utils/repro.sh        |  21 +-
 modin/groupby-modin.py | 542 +++++++++++++++++++++++++++++++++++------
 modin/join-modin.py    | 322 +++++++++++++++++++-----
 modin/modin-helpers.py |  38 +--
 modin/setup-modin.sh   |   4 +-
 5 files changed, 750 insertions(+), 177 deletions(-)
 mode change 100644 => 100755 _utils/repro.sh

diff --git a/_utils/repro.sh b/_utils/repro.sh
old mode 100644
new mode 100755
index a8df441f..20ddefd4
--- a/_utils/repro.sh
+++ b/_utils/repro.sh
@@ -31,8 +31,21 @@ cd pydatatable
 virtualenv py-pydatatable --python=/usr/bin/python3.10
 cd ../pandas
 virtualenv py-pandas --python=/usr/bin/python3.10
+#################
+# Install modin #
+#################
 cd ../modin
-virtualenv py-modin --python=/usr/bin/python3.10
+curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh && \
+    sh install_miniconda.sh -u -b -p ./py-modin && \
+    rm -f install_miniconda.sh
+
+source ./py-modin/bin/activate
+conda install -y conda-libmamba-solver
+
+# install binaries
+conda install -y -c conda-forge modin-hdk --solver=libmamba
+
+conda deactivate
 cd ..
 
 
@@ -45,8 +58,8 @@ python3 -m pip install --upgrade pandas
 deactivate
 
 source ./modin/py-modin/bin/activate
-python3 -m pip install --upgrade modin
-deactivate
+conda update modin-hdk -y -c conda-forge --solver=libmamba
+conda deactivate
 
 source ./pydatatable/py-pydatatable/bin/activate
 python3 -m pip install --upgrade git+https://github.com/h2oai/datatable
@@ -72,7 +85,7 @@ mv G1_1e7_1e2_0_0.csv data/
 echo "Changing run.conf and _control/data.csv to run only groupby at 0.5GB"
 cp run.conf run.conf.original
 sed -i 's/groupby join groupby2014/groupby/g' run.conf
-sed -i 's/data.table dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf 
+sed -i 's/data.table dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf
 sed -i 's/DO_PUBLISH=true/DO_PUBLISH=false/g' run.conf
 
 # set sizes
diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py
index 9b71d417..53510316 100755
--- a/modin/groupby-modin.py
+++ b/modin/groupby-modin.py
@@ -9,6 +9,7 @@
 
 import modin as modin
 import modin.pandas as pd
+from modin.utils import execute
 
 init_modin_on_hdk(pd)
 
@@ -22,17 +23,19 @@
 cache = "TRUE"
 on_disk = "FALSE"
 
-data_name = os.environ['SRC_DATANAME']
-src_grp = os.path.join("data", data_name+".csv")
+data_name = os.environ["SRC_DATANAME"]
+src_grp = os.path.join("data", data_name + ".csv")
 print("loading dataset %s" % data_name, flush=True)
 
 x = pd.read_csv(
     src_grp,
     dtype={
-        'id1':'category', 'id2':'category', 'id3':'category',
+        "id1": "category",
+        "id2": "category",
+        "id3": "category",
         **{n: "int32" for n in ["id4", "id5", "id6", "v1", "v2"]},
         "v3": "float64",
-    }
+    },
 )
 # To trigger non-lazy loading
 execute(x, trigger_hdk_import=True)
@@ -42,288 +45,675 @@
 task_init = timeit.default_timer()
 print("grouping...", flush=True)
 
-question = "sum v1 by id1" # q1
+question = "sum v1 by id1"  # q1
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1'], **gb_params).agg({'v1':'sum'})
+ans = x.groupby(["id1"], **gb_params).agg({"v1": "sum"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum()]
+chk = [ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1'], **gb_params).agg({'v1':'sum'})
+ans = x.groupby(["id1"], **gb_params).agg({"v1": "sum"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum()]
+chk = [ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "sum v1 by id1:id2" # q2
+question = "sum v1 by id1:id2"  # q2
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'})
+ans = x.groupby(["id1", "id2"], **gb_params).agg({"v1": "sum"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum()]
+chk = [ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'})
+ans = x.groupby(["id1", "id2"], **gb_params).agg({"v1": "sum"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum()]
+chk = [ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "sum v1 mean v3 by id3" # q3
+question = "sum v1 mean v3 by id3"  # q3
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'})
+ans = x.groupby(["id3"], **gb_params).agg({"v1": "sum", "v3": "mean"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v3'].sum()]
+chk = [ans["v1"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'})
+ans = x.groupby(["id3"], **gb_params).agg({"v1": "sum", "v3": "mean"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v3'].sum()]
+chk = [ans["v1"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "mean v1:v3 by id4" # q4
+question = "mean v1:v3 by id4"  # q4
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})
+ans = x.groupby(["id4"], **gb_params).agg({"v1": "mean", "v2": "mean", "v3": "mean"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum(), ans['v3'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})
+ans = x.groupby(["id4"], **gb_params).agg({"v1": "mean", "v2": "mean", "v3": "mean"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum(), ans['v3'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "sum v1:v3 by id6" # q5
+question = "sum v1:v3 by id6"  # q5
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})
+ans = x.groupby(["id6"], **gb_params).agg({"v1": "sum", "v2": "sum", "v3": "sum"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum(), ans['v3'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})
+ans = x.groupby(["id6"], **gb_params).agg({"v1": "sum", "v2": "sum", "v3": "sum"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum(), ans['v3'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "median v3 sd v3 by id4 id5" # q6
+question = "median v3 sd v3 by id4 id5"  # q6
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']})
+ans = x.groupby(["id4", "id5"], **gb_params).agg({"v3": ["median", "std"]})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans[('v3', 'median')].sum(), ans[('v3', 'std')].sum()]
+chk = [ans[("v3", "median")].sum(), ans[("v3", "std")].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']})
+ans = x.groupby(["id4", "id5"], **gb_params).agg({"v3": ["median", "std"]})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans[('v3', 'median')].sum(), ans[('v3', 'std')].sum()]
+chk = [ans[("v3", "median")].sum(), ans[("v3", "std")].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "max v1 - min v2 by id3" # q7
+question = "max v1 - min v2 by id3"  # q7
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['id3', 'range_v1_v2']]
+ans = (
+    x.groupby(["id3"], **gb_params)
+    .agg({"v1": "max", "v2": "min"})
+    .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]]
+)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['range_v1_v2'].sum()]
+chk = [ans["range_v1_v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['id3', 'range_v1_v2']]
+ans = (
+    x.groupby(["id3"], **gb_params)
+    .agg({"v1": "max", "v2": "min"})
+    .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]]
+)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['range_v1_v2'].sum()]
+chk = [ans["range_v1_v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "largest two v3 by id6" # q8
+question = "largest two v3 by id6"  # q8
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby('id6', sort=False, observed=True)['v3'].nlargest(2).reset_index()[['id6', 'v3']]
+ans = (
+    x.groupby("id6", sort=False, observed=True)["v3"]
+    .nlargest(2)
+    .reset_index()[["id6", "v3"]]
+)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v3'].sum()]
+chk = [ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby('id6', sort=False, observed=True)['v3'].nlargest(2).reset_index()[['id6', 'v3']]
+ans = (
+    x.groupby("id6", sort=False, observed=True)["v3"]
+    .nlargest(2)
+    .reset_index()[["id6", "v3"]]
+)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v3'].sum()]
+chk = [ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "regression v1 v2 by id2 id4" # q9
-#ans = x[['id2','id4','v1','v2']].groupby(['id2','id4']).corr().iloc[0::2][['v2']]**2 # slower, 76s vs 47s on 1e8 1e2
+question = "regression v1 v2 by id2 id4"  # q9
+# ans = x[['id2','id4','v1','v2']].groupby(['id2','id4']).corr().iloc[0::2][['v2']]**2 # slower, 76s vs 47s on 1e8 1e2
 gc.collect()
 t_start = timeit.default_timer()
 from modin.experimental.sql import query
-ans = query('SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;', df=x)
+
+ans = query(
+    "SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x
+)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['r2'].sum()]
+chk = [ans["r2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = query('SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;', df=x)
+ans = query(
+    "SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x
+)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['r2'].sum()]
+chk = [ans["r2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "sum v3 count by id1:id6" # q10
+question = "sum v3 count by id1:id6"  # q10
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'})
+ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg(
+    {"v3": "sum", "v1": "count"}
+)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v3'].sum(), ans['v1'].sum()]
+chk = [ans["v3"].sum(), ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'})
+ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg(
+    {"v3": "sum", "v1": "count"}
+)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v3'].sum(), ans['v1'].sum()]
+chk = [ans["v3"].sum(), ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-print("grouping finished, took %0.fs" % (timeit.default_timer()-task_init), flush=True)
+print(
+    "grouping finished, took %0.fs" % (timeit.default_timer() - task_init), flush=True
+)
 
 exit(0)
diff --git a/modin/join-modin.py b/modin/join-modin.py
index e433999c..09bbad32 100755
--- a/modin/join-modin.py
+++ b/modin/join-modin.py
@@ -10,6 +10,7 @@
 
 import modin as modin
 import modin.pandas as pd
+from modin.utils import execute
 
 init_modin_on_hdk(pd)
 
@@ -24,33 +25,58 @@
 on_disk = "FALSE"
 
 
-data_name = os.environ['SRC_DATANAME']
-src_jn_x = os.path.join("data", data_name+".csv")
+data_name = os.environ["SRC_DATANAME"]
+src_jn_x = os.path.join("data", data_name + ".csv")
 y_data_name = join_to_tbls(data_name)
-src_jn_y = [os.path.join("data", y_data_name[0]+".csv"), os.path.join("data", y_data_name[1]+".csv"), os.path.join("data", y_data_name[2]+".csv")]
+src_jn_y = [
+    os.path.join("data", y_data_name[0] + ".csv"),
+    os.path.join("data", y_data_name[1] + ".csv"),
+    os.path.join("data", y_data_name[2] + ".csv"),
+]
 if len(src_jn_y) != 3:
-  raise Exception("Something went wrong in preparing files used for join")
+    raise Exception("Something went wrong in preparing files used for join")
 
 
-print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2], flush=True)
+print(
+    "loading datasets "
+    + data_name
+    + ", "
+    + y_data_name[0]
+    + ", "
+    + y_data_name[1]
+    + ", "
+    + y_data_name[2],
+    flush=True,
+)
 
-x = pd.read_csv(src_jn_x, dtype={
-  **{n: "int32" for n in ["id1", "id2", "id3"]},
-  **{n: "category" for n in ["id4", "id5", "id6"]},
-  "v1": "float64",
-})
+x = pd.read_csv(
+    src_jn_x,
+    dtype={
+        **{n: "int32" for n in ["id1", "id2", "id3"]},
+        **{n: "category" for n in ["id4", "id5", "id6"]},
+        "v1": "float64",
+    },
+)
 
-small = pd.read_csv(src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"})
-medium = pd.read_csv(src_jn_y[1], dtype={
-  **{n: "int32" for n in ["id1", "id2"]},
-  **{n: "category" for n in ["id4", "id5"]},
-  "v2": "float64",
-})
-big = pd.read_csv(src_jn_y[2], dtype={
-  **{n: "int32" for n in ["id1", "id2", "id3"]},
-  **{n: "category" for n in ["id4", "id5", "id6"]},
-  "v2": "float64",
-})
+small = pd.read_csv(
+    src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"}
+)
+medium = pd.read_csv(
+    src_jn_y[1],
+    dtype={
+        **{n: "int32" for n in ["id1", "id2"]},
+        **{n: "category" for n in ["id4", "id5"]},
+        "v2": "float64",
+    },
+)
+big = pd.read_csv(
+    src_jn_y[2],
+    dtype={
+        **{n: "int32" for n in ["id1", "id2", "id3"]},
+        **{n: "category" for n in ["id4", "id5", "id6"]},
+        "v2": "float64",
+    },
+)
 
 # To trigger non-lazy loading
 [execute(df, trigger_hdk_import=True) for df in [x, small, medium, big]]
@@ -58,146 +84,326 @@
 task_init = timeit.default_timer()
 print("joining...", flush=True)
 
-question = "small inner on int" # q1
+question = "small inner on int"  # q1
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(small, on='id1')
+ans = x.merge(small, on="id1")
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(small, on='id1')
+ans = x.merge(small, on="id1")
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "medium inner on int" # q2
+question = "medium inner on int"  # q2
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(medium, on='id2')
+ans = x.merge(medium, on="id2")
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(medium, on='id2')
+ans = x.merge(medium, on="id2")
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "medium outer on int" # q3
+question = "medium outer on int"  # q3
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(medium, how='left', on='id2')
+ans = x.merge(medium, how="left", on="id2")
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(medium, how='left', on='id2')
+ans = x.merge(medium, how="left", on="id2")
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "medium inner on factor" # q4
+question = "medium inner on factor"  # q4
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(medium, on='id5')
+ans = x.merge(medium, on="id5")
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(medium, on='id5')
+ans = x.merge(medium, on="id5")
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-question = "big inner on int" # q5
+question = "big inner on int"  # q5
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(big, on='id3')
+ans = x.merge(big, on="id3")
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=1,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.merge(big, on='id3')
+ans = x.merge(big, on="id3")
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
 m = memory_usage()
 t_start = timeit.default_timer()
-chk = [ans['v1'].sum(), ans['v2'].sum()]
+chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
+write_log(
+    task=task,
+    data=data_name,
+    in_rows=x.shape[0],
+    question=question,
+    out_rows=ans.shape[0],
+    out_cols=ans.shape[1],
+    solution=solution,
+    version=ver,
+    git=git,
+    fun=fun,
+    run=2,
+    time_sec=t,
+    mem_gb=m,
+    cache=cache,
+    chk=make_chk(chk),
+    chk_time_sec=chkt,
+    on_disk=on_disk,
+)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-print("joining finished, took %0.fs" % (timeit.default_timer()-task_init), flush=True)
+print("joining finished, took %0.fs" % (timeit.default_timer() - task_init), flush=True)
 
-exit(0)
\ No newline at end of file
+exit(0)
diff --git a/modin/modin-helpers.py b/modin/modin-helpers.py
index 5154e66e..bba58d71 100644
--- a/modin/modin-helpers.py
+++ b/modin/modin-helpers.py
@@ -1,52 +1,16 @@
 import os
 
-# Run configuration
-os.environ["MODIN_CPUS"] = "40"
-
-do_execute=True
-# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000"
-solution_txt = (
-    f"modin_{os.environ.get('MODIN_HDK_FRAGMENT_SIZE', 'nfs')}_" +
-    ("exec" if do_execute else "noexec")
-)
 
 # Set up HDK backend
 os.environ["MODIN_ENGINE"] = "native"
 os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
 os.environ["MODIN_EXPERIMENTAL"] = "True"
 
-import pyhdk
-pyhdk.init()
-# pyhdk.init(enable_non_lazy_data_import=True)
-
 
 def init_modin_on_hdk(pd):
+    """Modin on HDK warmup before benchmarking for calcite"""
     from modin.experimental.sql import query
 
-    # Calcite initialization
     data = {"a": [1, 2, 3]}
     df = pd.DataFrame(data)
     query("SELECT * FROM df", df=df)
-
-
-def execute(df, *, trigger_hdk_import: bool = False):
-    if trigger_hdk_import:
-        trigger_import(df)
-    else:
-        if do_execute:
-            df._query_compiler._modin_frame._execute()
-    return df
-
-
-def trigger_import(df):
-    """
-    Trigger import execution for DataFrame obtained by HDK engine.
-    Parameters
-    ----------
-    df : DataFrame
-        DataFrame for trigger import.
-    """
-    modin_frame = df._query_compiler._modin_frame
-    if hasattr(modin_frame, "force_import"):
-        modin_frame.force_import()
-        return
diff --git a/modin/setup-modin.sh b/modin/setup-modin.sh
index 1eb9d56b..6e237158 100755
--- a/modin/setup-modin.sh
+++ b/modin/setup-modin.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 set -e
 
-curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh && \
+curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh && \
     sh install_miniconda.sh -u -b -p ./modin/py-modin && \
     rm -f install_miniconda.sh
 
 source ./modin/py-modin/bin/activate
-conda install -y -c conda-forge conda-libmamba-solver
+conda install -y conda-libmamba-solver
 
 # install binaries
 conda install -y -c conda-forge modin-hdk --solver=libmamba

From 8768466ea70e20506bd6b478381445ba030b0d2b Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Wed, 8 Nov 2023 09:38:26 +0100
Subject: [PATCH 14/23] Update to latest HDK

---
 _utils/repro.sh        |  6 ++++++
 modin/groupby-modin.py | 17 ++++++++++++++++-
 modin/join-modin.py    | 18 ++++++++++++++++--
 modin/modin-helpers.py | 16 ----------------
 modin/setup-modin.sh   |  5 +++++
 5 files changed, 43 insertions(+), 19 deletions(-)
 delete mode 100644 modin/modin-helpers.py

diff --git a/_utils/repro.sh b/_utils/repro.sh
index 20ddefd4..e4c7a0a1 100755
--- a/_utils/repro.sh
+++ b/_utils/repro.sh
@@ -42,10 +42,16 @@ curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-p
 source ./py-modin/bin/activate
 conda install -y conda-libmamba-solver
 
+conda create --name modin -y
+conda activate modin
+echo "conda activate modin" >> ./py-modin/bin/activate
+
 # install binaries
 conda install -y -c conda-forge modin-hdk --solver=libmamba
 
 conda deactivate
+conda deactivate
+
 cd ..
 
 
diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py
index 53510316..bcf86dad 100755
--- a/modin/groupby-modin.py
+++ b/modin/groupby-modin.py
@@ -2,15 +2,30 @@
 
 print("# groupby-modin.py", flush=True)
 
+import os
 import gc
 import timeit
 
-exec(open("./modin/modin-helpers.py").read())
+# Set up HDK backend
+os.environ["MODIN_ENGINE"] = "native"
+os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
+os.environ["MODIN_EXPERIMENTAL"] = "True"
+
 
 import modin as modin
 import modin.pandas as pd
 from modin.utils import execute
 
+
+def init_modin_on_hdk():
+    """Modin on HDK warmup before benchmarking for calcite"""
+    from modin.experimental.sql import query
+
+    data = {"a": [1, 2, 3]}
+    df = pd.DataFrame(data)
+    query("SELECT * FROM df", df=df)
+
+
 init_modin_on_hdk(pd)
 
 exec(open("./_helpers/helpers.py").read())
diff --git a/modin/join-modin.py b/modin/join-modin.py
index 09bbad32..50c13cc4 100755
--- a/modin/join-modin.py
+++ b/modin/join-modin.py
@@ -6,13 +6,27 @@
 import gc
 import timeit
 
-exec(open("./modin/modin-helpers.py").read())
+# Set up HDK backend
+os.environ["MODIN_ENGINE"] = "native"
+os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
+os.environ["MODIN_EXPERIMENTAL"] = "True"
+
 
 import modin as modin
 import modin.pandas as pd
 from modin.utils import execute
 
-init_modin_on_hdk(pd)
+
+def init_modin_on_hdk():
+    """Modin on HDK warmup before benchmarking for calcite"""
+    from modin.experimental.sql import query
+
+    data = {"a": [1, 2, 3]}
+    df = pd.DataFrame(data)
+    query("SELECT * FROM df", df=df)
+
+
+init_modin_on_hdk()
 
 exec(open("./_helpers/helpers.py").read())
 
diff --git a/modin/modin-helpers.py b/modin/modin-helpers.py
deleted file mode 100644
index bba58d71..00000000
--- a/modin/modin-helpers.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import os
-
-
-# Set up HDK backend
-os.environ["MODIN_ENGINE"] = "native"
-os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
-os.environ["MODIN_EXPERIMENTAL"] = "True"
-
-
-def init_modin_on_hdk(pd):
-    """Modin on HDK warmup before benchmarking for calcite"""
-    from modin.experimental.sql import query
-
-    data = {"a": [1, 2, 3]}
-    df = pd.DataFrame(data)
-    query("SELECT * FROM df", df=df)
diff --git a/modin/setup-modin.sh b/modin/setup-modin.sh
index 6e237158..c8b831bc 100755
--- a/modin/setup-modin.sh
+++ b/modin/setup-modin.sh
@@ -8,6 +8,10 @@ curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-p
 source ./modin/py-modin/bin/activate
 conda install -y conda-libmamba-solver
 
+conda create --name modin -y
+conda activate modin
+echo "conda activate modin" >> ./modin/py-modin/bin/activate
+
 # install binaries
 conda install -y -c conda-forge modin-hdk --solver=libmamba
 
@@ -15,3 +19,4 @@ conda install -y -c conda-forge modin-hdk --solver=libmamba
 python3 -c "import modin; print(modin.__version__)"
 
 conda deactivate
+conda deactivate

From 2242d835a7dc778e3f60b2494e8ef916a0fce703 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Wed, 8 Nov 2023 10:20:20 +0100
Subject: [PATCH 15/23] Fixed style

---
 _benchplot/benchplot-dict.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_benchplot/benchplot-dict.R b/_benchplot/benchplot-dict.R
index 8241f6b7..12f6ce4e 100644
--- a/_benchplot/benchplot-dict.R
+++ b/_benchplot/benchplot-dict.R
@@ -266,7 +266,7 @@ groupby.syntax.dict = {list(
   "data.table" =  list(),
   "dplyr" =       list(),
   "pandas" =      list(),
-  "modin" =      list(),
+  "modin" =       list(),
   "pydatatable" = list(),
   "spark" =       list("not yet implemented: SPARK-26589" = "median v3 sd v3 by id4 id5"),
   "dask" =        list("not yet implemented: dask#4362" = "median v3 sd v3 by id4 id5"),

From e5a1e0c6d23e47793f852c0ef658e8d0660f6480 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Wed, 8 Nov 2023 10:42:47 +0100
Subject: [PATCH 16/23] codestyle fix

---
 _benchplot/benchplot-dict.R |   3 +-
 modin/groupby-modin.py      | 444 +++---------------------------------
 modin/join-modin.py         | 213 +----------------
 3 files changed, 42 insertions(+), 618 deletions(-)

diff --git a/_benchplot/benchplot-dict.R b/_benchplot/benchplot-dict.R
index 12f6ce4e..50d1f9ae 100644
--- a/_benchplot/benchplot-dict.R
+++ b/_benchplot/benchplot-dict.R
@@ -296,7 +296,6 @@ groupby.data.exceptions = {list(
     "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") # read_csv #9
   )},
   "modin" = {list(
-    "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1")
   )},
   "pydatatable" = {list(
     "csv reader NAs bug: datatable#2808" = c("G1_1e9_1e2_5_0")
@@ -471,7 +470,7 @@ join.query.exceptions = {list(
   "data.table" =  list(),
   "dplyr" =       list(),
   "pandas" =      list(),
-  "modin" =      list(),
+  "modin" =       list(),
   "pydatatable" = list(),
   "spark" =       list(),
   "dask" =        list(),
diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py
index bcf86dad..7eb5fd59 100755
--- a/modin/groupby-modin.py
+++ b/modin/groupby-modin.py
@@ -71,25 +71,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
@@ -101,25 +83,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -135,25 +99,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
@@ -165,25 +111,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -199,25 +127,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
@@ -229,25 +139,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -263,25 +155,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
@@ -293,25 +167,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -327,25 +183,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
@@ -357,25 +195,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -391,25 +211,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans[("v3", "median")].sum(), ans[("v3", "std")].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
@@ -421,25 +223,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans[("v3", "median")].sum(), ans[("v3", "std")].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -447,11 +231,7 @@ def init_modin_on_hdk():
 question = "max v1 - min v2 by id3"  # q7
 gc.collect()
 t_start = timeit.default_timer()
-ans = (
-    x.groupby(["id3"], **gb_params)
-    .agg({"v1": "max", "v2": "min"})
-    .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]]
-)
+ans = x.groupby(["id3"], **gb_params).agg({"v1": "max", "v2": "min"}).assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]]
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
@@ -459,33 +239,11 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["range_v1_v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = (
-    x.groupby(["id3"], **gb_params)
-    .agg({"v1": "max", "v2": "min"})
-    .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]]
-)
+ans = x.groupby(["id3"], **gb_params).agg({"v1": "max", "v2": "min"}).assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]]
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
@@ -493,25 +251,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["range_v1_v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -519,11 +259,7 @@ def init_modin_on_hdk():
 question = "largest two v3 by id6"  # q8
 gc.collect()
 t_start = timeit.default_timer()
-ans = (
-    x.groupby("id6", sort=False, observed=True)["v3"]
-    .nlargest(2)
-    .reset_index()[["id6", "v3"]]
-)
+ans = x.groupby("id6", sort=False, observed=True)["v3"].nlargest(2).reset_index()[["id6", "v3"]]
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
@@ -531,33 +267,11 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = (
-    x.groupby("id6", sort=False, observed=True)["v3"]
-    .nlargest(2)
-    .reset_index()[["id6", "v3"]]
-)
+ans = x.groupby("id6", sort=False, observed=True)["v3"].nlargest(2).reset_index()[["id6", "v3"]]
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
@@ -565,25 +279,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v3"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -594,9 +290,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 from modin.experimental.sql import query
 
-ans = query(
-    "SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x
-)
+ans = query("SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
@@ -604,31 +298,11 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["r2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = query(
-    "SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x
-)
+ans = query("SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x)
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
@@ -636,25 +310,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["r2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -662,9 +318,7 @@ def init_modin_on_hdk():
 question = "sum v3 count by id1:id6"  # q10
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg(
-    {"v3": "sum", "v1": "count"}
-)
+ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg({"v3": "sum", "v1": "count"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
@@ -672,31 +326,11 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v3"].sum(), ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
-ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg(
-    {"v3": "sum", "v1": "count"}
-)
+ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg({"v3": "sum", "v1": "count"})
 execute(ans)
 print(ans.shape, flush=True)
 t = timeit.default_timer() - t_start
@@ -704,31 +338,11 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v3"].sum(), ans["v1"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
 
-print(
-    "grouping finished, took %0.fs" % (timeit.default_timer() - task_init), flush=True
-)
+print("grouping finished, took %0.fs" % (timeit.default_timer() - task_init), flush=True)
 
 exit(0)
diff --git a/modin/join-modin.py b/modin/join-modin.py
index 50c13cc4..dd99d323 100755
--- a/modin/join-modin.py
+++ b/modin/join-modin.py
@@ -52,14 +52,7 @@ def init_modin_on_hdk():
 
 
 print(
-    "loading datasets "
-    + data_name
-    + ", "
-    + y_data_name[0]
-    + ", "
-    + y_data_name[1]
-    + ", "
-    + y_data_name[2],
+    "loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2],
     flush=True,
 )
 
@@ -72,9 +65,7 @@ def init_modin_on_hdk():
     },
 )
 
-small = pd.read_csv(
-    src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"}
-)
+small = pd.read_csv(src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"})
 medium = pd.read_csv(
     src_jn_y[1],
     dtype={
@@ -109,25 +100,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
@@ -139,25 +112,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -173,25 +128,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
@@ -203,25 +140,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -237,25 +156,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
@@ -267,25 +168,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -301,25 +184,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
@@ -331,25 +196,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans
@@ -365,25 +212,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=1,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 del ans
 gc.collect()
 t_start = timeit.default_timer()
@@ -395,25 +224,7 @@ def init_modin_on_hdk():
 t_start = timeit.default_timer()
 chk = [ans["v1"].sum(), ans["v2"].sum()]
 chkt = timeit.default_timer() - t_start
-write_log(
-    task=task,
-    data=data_name,
-    in_rows=x.shape[0],
-    question=question,
-    out_rows=ans.shape[0],
-    out_cols=ans.shape[1],
-    solution=solution,
-    version=ver,
-    git=git,
-    fun=fun,
-    run=2,
-    time_sec=t,
-    mem_gb=m,
-    cache=cache,
-    chk=make_chk(chk),
-    chk_time_sec=chkt,
-    on_disk=on_disk,
-)
+write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
 print(ans.head(3), flush=True)
 print(ans.tail(3), flush=True)
 del ans

From 59fb1191cfb48ca6293285c7a143bb8b9f0b1d67 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Wed, 8 Nov 2023 10:50:37 +0100
Subject: [PATCH 17/23] Call init_modin_on_hdk() without arguments

---
 modin/groupby-modin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py
index 7eb5fd59..a2f9f46c 100755
--- a/modin/groupby-modin.py
+++ b/modin/groupby-modin.py
@@ -26,7 +26,7 @@ def init_modin_on_hdk():
     query("SELECT * FROM df", df=df)
 
 
-init_modin_on_hdk(pd)
+init_modin_on_hdk()
 
 exec(open("./_helpers/helpers.py").read())
 

From 2e8388c9993a79ba04a93064ee0f7fadd7e9acc0 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Wed, 8 Nov 2023 11:31:41 +0100
Subject: [PATCH 18/23] Fix malformed --solver flag in repro.sh modin update

---
 _utils/repro.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/_utils/repro.sh b/_utils/repro.sh
index e4c7a0a1..2e9c008e 100755
--- a/_utils/repro.sh
+++ b/_utils/repro.sh
@@ -64,7 +64,8 @@ python3 -m pip install --upgrade pandas
 deactivate
 
 source ./modin/py-modin/bin/activate
-conda update modin-hdk -y -c conda-forge --solver=libmambapython3 -m pip install --upgrade modin
+conda update modin-hdk -y -c conda-forge --solver=libmamba
+conda deactivate
 conda deactivate
 
 source ./pydatatable/py-pydatatable/bin/activate

From 4f171389a284b5c8468d56dcbc85fa961e517a27 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Wed, 8 Nov 2023 11:37:23 +0100
Subject: [PATCH 19/23] Add modin to the solution-list sed pattern in repro.sh

---
 _utils/repro.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_utils/repro.sh b/_utils/repro.sh
index 2e9c008e..c870a478 100755
--- a/_utils/repro.sh
+++ b/_utils/repro.sh
@@ -92,7 +92,7 @@ mv G1_1e7_1e2_0_0.csv data/
 echo "Changing run.conf and _control/data.csv to run only groupby at 0.5GB"
 cp run.conf run.conf.original
 sed -i 's/groupby join groupby2014/groupby/g' run.conf
-sed -i 's/data.table dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf
+sed -i 's/data.table dplyr pandas modin pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf
 sed -i 's/DO_PUBLISH=true/DO_PUBLISH=false/g' run.conf
 
 # set sizes

From e7aad1517358d731622925ecb6b1aae0334b691d Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Thu, 9 Nov 2023 15:03:04 +0100
Subject: [PATCH 20/23] cleaned up gitignore

---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index f3b8ff70..53f4251d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,7 +8,6 @@ metastore_db/*
 *.md5
 .Rproj.user
 .Rhistory
-py-modin
 miniconda
 db-benchmark.Rproj
 */REVISION

From 655bba30eeb2f46ccb6cde3d5353ded1ae7276e3 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Mon, 13 Nov 2023 12:16:53 +0100
Subject: [PATCH 21/23] Let modin fall through to the shared "py" case in file.ext

---
 _launcher/launcher.R | 2 +-
 _launcher/solution.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/_launcher/launcher.R b/_launcher/launcher.R
index 06ae4d12..0a7bc36c 100644
--- a/_launcher/launcher.R
+++ b/_launcher/launcher.R
@@ -15,7 +15,7 @@ file.ext = function(x) {
   ans = switch(
     x,
     "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
-    "pandas"=, "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py",
+    "pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
     "clickhouse"="sql",
     "juliadf"="jl", "juliads"="jl",
   )
diff --git a/_launcher/solution.R b/_launcher/solution.R
index 4979594f..f66b4311 100755
--- a/_launcher/solution.R
+++ b/_launcher/solution.R
@@ -111,7 +111,7 @@ file.ext = function(x) {
   ans = switch(
     x,
     "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R",
-    "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py",
+    "pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py",
     "clickhouse"="sql",
     "juliadf"="jl", "juliads"="jl"
   )

From aba390f377dd5fb3533d4785df3fd4890ab23b5e Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Mon, 13 Nov 2023 12:18:16 +0100
Subject: [PATCH 22/23] Add modin to the regression workflow solution matrix

---
 .github/workflows/regression.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml
index 53a7684e..33e3e4bc 100644
--- a/.github/workflows/regression.yml
+++ b/.github/workflows/regression.yml
@@ -17,7 +17,7 @@ jobs:
   strategy:
     fail-fast: false
     matrix:
-      solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion]
+      solution: [data.table, collapse, dplyr, pandas, modin, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion]
   name: Regression Tests solo solutions
   runs-on: ubuntu-20.04
   env:
@@ -91,7 +91,7 @@ jobs:
         name: ${{ matrix.solution }}-out.zip
         path: ${{ matrix.solution }}-out.zip
         if-no-files-found: error
- 
+
  regression-test-benchmark-runner-all-solutions:
   needs: regression-test-benchmark-runner-solo-solutions
   name: Regression Tests all solutions

From f22b57790c90e6934f5e4ed57900a496263fcad2 Mon Sep 17 00:00:00 2001
From: Egor Krivov <egor.krivov@intel.com>
Date: Fri, 24 Nov 2023 19:36:37 +0100
Subject: [PATCH 23/23] Fixed CPU count & miniconda activation

---
 _utils/repro.sh        | 2 +-
 modin/groupby-modin.py | 1 +
 modin/join-modin.py    | 1 +
 modin/setup-modin.sh   | 2 +-
 4 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/_utils/repro.sh b/_utils/repro.sh
index c870a478..2ce011c9 100755
--- a/_utils/repro.sh
+++ b/_utils/repro.sh
@@ -39,7 +39,7 @@ curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-p
     sh install_miniconda.sh -u -b -p ./py-modin && \
     rm -f install_miniconda.sh
 
-source ./py-modin/bin/activate
+eval source ./modin/py-modin/bin/activate
 conda install -y conda-libmamba-solver
 
 conda create --name modin -y
diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py
index a2f9f46c..93a5115e 100755
--- a/modin/groupby-modin.py
+++ b/modin/groupby-modin.py
@@ -10,6 +10,7 @@
 os.environ["MODIN_ENGINE"] = "native"
 os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
 os.environ["MODIN_EXPERIMENTAL"] = "True"
+os.environ["MODIN_CPUS"] = "40"
 
 
 import modin as modin
diff --git a/modin/join-modin.py b/modin/join-modin.py
index dd99d323..3c4ce57e 100755
--- a/modin/join-modin.py
+++ b/modin/join-modin.py
@@ -10,6 +10,7 @@
 os.environ["MODIN_ENGINE"] = "native"
 os.environ["MODIN_STORAGE_FORMAT"] = "hdk"
 os.environ["MODIN_EXPERIMENTAL"] = "True"
+os.environ["MODIN_CPUS"] = "40"
 
 
 import modin as modin
diff --git a/modin/setup-modin.sh b/modin/setup-modin.sh
index c8b831bc..34a9b32f 100755
--- a/modin/setup-modin.sh
+++ b/modin/setup-modin.sh
@@ -5,7 +5,7 @@ curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-p
     sh install_miniconda.sh -u -b -p ./modin/py-modin && \
     rm -f install_miniconda.sh
 
-source ./modin/py-modin/bin/activate
+eval source ./modin/py-modin/bin/activate
 conda install -y conda-libmamba-solver
 
 conda create --name modin -y