From faad60ec05b5224d029203b4ce805a31d2f53a51 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Wed, 27 Sep 2023 20:10:02 +0200 Subject: [PATCH 01/23] update --- _benchplot/benchplot-dict.R | 32 ++++++++++++++++++++++++++++++++ _control/solutions.csv | 2 ++ _launcher/solution.R | 2 +- run.conf | 2 +- 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/_benchplot/benchplot-dict.R b/_benchplot/benchplot-dict.R index 14cb6964..01c95bcd 100644 --- a/_benchplot/benchplot-dict.R +++ b/_benchplot/benchplot-dict.R @@ -34,6 +34,7 @@ solution.dict = {list( "data.table" = list(name=c(short="data.table", long="data.table"), color=c(strong="blue", light="#7777FF")), "dplyr" = list(name=c(short="dplyr", long="dplyr"), color=c(strong="red", light="#FF7777")), "pandas" = list(name=c(short="pandas", long="pandas"), color=c(strong="green4", light="#77FF77")), + "modin" = list(name=c(short="pandas", long="pandas"), color=c(strong="green4", light="#77FF77")), "pydatatable" = list(name=c(short="pydatatable", long="(py)datatable"), color=c(strong="darkorange", light="orange")), "spark" = list(name=c(short="spark", long="spark"), color=c(strong="#8000FFFF", light="#CC66FF")), "dask" = list(name=c(short="dask", long="dask"), color=c(strong="slategrey", light="lightgrey")), @@ -102,6 +103,19 @@ groupby.syntax.dict = {list( "regression v1 v2 by id2 id4" = "DF[['id2','id4','v1','v2']].groupby(['id2','id4'], as_index=False, sort=False, observed=True, dropna=False).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}))", "sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3':'sum', 'v1':'size'})" )}, + # TODO: update later + "modin" = {c( + "sum v1 by id1" = "DF.groupby('id1', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum'})", + "sum v1 by id1:id2" = "DF.groupby(['id1','id2'], as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum'})", + "sum v1 
mean v3 by id3" = "DF.groupby('id3', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum', 'v3':'mean'})", + "mean v1:v3 by id4" = "DF.groupby('id4', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})", + "sum v1:v3 by id6" = "DF.groupby('id6', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})", + "median v3 sd v3 by id4 id5" = "DF.groupby(['id4','id5'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3': ['median','std']})", + "max v1 - min v2 by id3" = "DF.groupby('id3', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'max', 'v2':'min'}).assign(range_v1_v2=lambda x: x['v1']-x['v2'])[['id3','range_v1_v2']]", + "largest two v3 by id6" = "DF[~DF['v3'].isna()][['id6','v3']].sort_values('v3', ascending=False).groupby('id6', as_index=False, sort=False, observed=True, dropna=False).head(2)", + "regression v1 v2 by id2 id4" = "DF[['id2','id4','v1','v2']].groupby(['id2','id4'], as_index=False, sort=False, observed=True, dropna=False).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}))", + "sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3':'sum', 'v1':'size'})" + )}, "pydatatable" = {c( "sum v1 by id1" = "DT[:, {'v1': sum(f.v1)}, by(f.id1)]", "sum v1 by id1:id2" = "DT[:, {'v1': sum(f.v1)}, by(f.id1, f.id2)]", @@ -239,6 +253,7 @@ groupby.query.exceptions = {list( "data.table" = list(), "dplyr" = list(), "pandas" = list(), + "modin" = list(), "pydatatable" = list(), "spark" = list("not yet implemented: SPARK-26589" = "median v3 sd v3 by id4 id5"), "dask" = list("not yet implemented: dask#4362" = "median v3 sd v3 by id4 id5"), @@ -265,6 +280,10 @@ groupby.data.exceptions = {list( "pandas" = {list( "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") # read_csv #9 )}, + # TODO: fix later 
+ "modin" = {list( + "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") # read_csv #9 + )}, "pydatatable" = {list( "csv reader NAs bug: datatable#2808" = c("G1_1e9_1e2_5_0") )}, @@ -362,6 +381,14 @@ join.syntax.dict = {list( "medium inner on factor" = "DF.merge(medium, on='id5')", "big inner on int" = "DF.merge(big, on='id3')" )}, + # TODO: update later + "modin" = {c( + "small inner on int" = "DF.merge(small, on='id1')", + "medium inner on int" = "DF.merge(medium, on='id2')", + "medium outer on int" = "DF.merge(medium, how='left', on='id2')", + "medium inner on factor" = "DF.merge(medium, on='id5')", + "big inner on int" = "DF.merge(big, on='id3')" + )}, "pydatatable" = {c( "small inner on int" = "y.key = 'id1'; DT[:, :, join(y)][isfinite(f.v2), :]", "medium inner on int" = "y.key = 'id2'; DT[:, :, join(y)][isfinite(f.v2), :]", @@ -423,6 +450,7 @@ join.query.exceptions = {list( "data.table" = list(), "dplyr" = list(), "pandas" = list(), + "modin" = list(), "pydatatable" = list(), "spark" = list(), "dask" = list(), @@ -445,6 +473,10 @@ join.data.exceptions = {list( "pandas" = {list( "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") # read_csv )}, + # TODO: update later + "modin" = {list( + "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") # read_csv + )}, "pydatatable" = {list( "csv reader NAs bug: datatable#2808" = "J1_1e9_NA_5_0", "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_0_1") # q5 out of memory due to a deep copy diff --git a/_control/solutions.csv b/_control/solutions.csv index ac996de0..1df22a18 100644 --- a/_control/solutions.csv +++ b/_control/solutions.csv @@ -8,6 +8,8 @@ dplyr,groupby2014 pandas,groupby pandas,join pandas,groupby2014 +modin,groupby +modin,join pydatatable,groupby pydatatable,join spark,groupby diff --git a/_launcher/solution.R b/_launcher/solution.R index c419a2c5..4b400d57 100755 --- a/_launcher/solution.R +++ b/_launcher/solution.R @@ -111,7 +111,7 @@ 
file.ext = function(x) { ans = switch( x, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", - "pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py", + "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", "juliadf"="jl", "juliads"="jl" ) diff --git a/run.conf b/run.conf index 5347df80..06e26757 100644 --- a/run.conf +++ b/run.conf @@ -1,7 +1,7 @@ # task, used in init-setup-iteration.R export RUN_TASKS="groupby" # solution, used in init-setup-iteration.R -export RUN_SOLUTIONS="data.table juliads dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb datafusion" +export RUN_SOLUTIONS="data.table juliads dplyr pandas modin pydatatable spark dask clickhouse polars arrow duckdb datafusion" # juliadf clickhouse" From 1943eed6412dd3f9d194417a54e1a2fac0933be7 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 2 Oct 2023 13:14:29 +0000 Subject: [PATCH 02/23] Working version --- modin/groupby-modin.py | 164 +++++++++++++++++++++++++++++------------ 1 file changed, 117 insertions(+), 47 deletions(-) diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py index d4e45a79..dd568214 100755 --- a/modin/groupby-modin.py +++ b/modin/groupby-modin.py @@ -3,20 +3,78 @@ print("# groupby-modin.py", flush=True) import os + +os.environ["MODIN_ENGINE"] = "native" +os.environ["MODIN_STORAGE_FORMAT"] = "hdk" +os.environ["MODIN_EXPERIMENTAL"] = "True" +# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000" +os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "4000000" +print("Pandas backend: Modin on HDK") + import gc import timeit import modin as modin import modin.pandas as pd -import ray + +import pyhdk +pyhdk.init() + +def init_modin_on_hdk(pd): + from modin.experimental.sql import query + + # Calcite initialization + data = {"a": [1, 2, 3]} + df = pd.DataFrame(data) + query("SELECT * FROM df", df=df) + + +init_modin_on_hdk(pd) +gb_params = dict(as_index=False, 
sort=False, observed=True) + + +def trigger_import(df: pd.DataFrame): + """ + Trigger import execution for DataFrame obtained by HDK engine. + Parameters + ---------- + df : DataFrame + DataFrame for trigger import. + """ + modin_frame = df._query_compiler._modin_frame + if hasattr(modin_frame, "force_import"): + modin_frame.force_import() + return + + # The code below has been kept for backwards compatibility and will be removed in the future. + + from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import ( + DbWorker, + ) + + df.shape # to trigger real execution + + p = modin_frame._partitions[0][0] + if ( + p.frame_id is None + and modin_frame._has_arrow_table() + and not isinstance(table := p.get(), pd.DataFrame) + ): + p.frame_id = DbWorker().import_arrow_table(table) # to trigger real execution + + +def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): + if trigger_hdk_import: + trigger_import(df) + else: + df._query_compiler._modin_frame._execute() + return df exec(open("./_helpers/helpers.py").read()) ver = modin.__version__ -ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}}) -warnings.filterwarnings('ignore') -os.environ["MODIN_ENGINE"] = "ray" +# warnings.filterwarnings('ignore') git = "" task = "groupby" @@ -29,8 +87,11 @@ src_grp = os.path.join("data", data_name+".csv") print("loading dataset %s" % data_name, flush=True) -x = pd.read_csv(src_grp, dtype={'id1':'category', 'id2':'category', 'id3':'category'}) +x = pd.read_csv(src_grp, dtype={'id1':'category', 'id2':'category', 'id3':'category', + **{n: "int32" for n in ["id4", "id5", "id6", "v1", "v2"]}, + "v3": "float64",}) print(len(x.index), flush=True) +execute(x, trigger_hdk_import=True) task_init = timeit.default_timer() print("grouping...", flush=True) @@ -38,8 +99,8 @@ question = "sum v1 by id1" # q1 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id1'], observed=True).agg({'v1':'sum'}) 
-ans.reset_index(inplace=True) # #68 +ans = x.groupby(['id1'], **gb_params).agg({'v1':'sum'}) +# ans.reset_index(inplace=True) # #68 print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -50,8 +111,8 @@ del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id1'], observed=True).agg({'v1':'sum'}) -ans.reset_index(inplace=True) +ans = x.groupby(['id1'], **gb_params).agg({'v1':'sum'}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -66,8 +127,8 @@ question = "sum v1 by id1:id2" # q2 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id1','id2'], observed=True).agg({'v1':'sum'}) -ans.reset_index(inplace=True) +ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -78,8 +139,8 @@ del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id1','id2'], observed=True).agg({'v1':'sum'}) -ans.reset_index(inplace=True) +ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -94,8 +155,8 @@ question = "sum v1 mean v3 by id3" # q3 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id3'], observed=True).agg({'v1':'sum', 'v3':'mean'}) -ans.reset_index(inplace=True) +ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -106,8 +167,8 @@ del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id3'], observed=True).agg({'v1':'sum', 'v3':'mean'}) -ans.reset_index(inplace=True) +ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = 
timeit.default_timer() - t_start m = memory_usage() @@ -122,8 +183,8 @@ question = "mean v1:v3 by id4" # q4 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id4'], observed=True).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'}) -ans.reset_index(inplace=True) +ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -134,8 +195,8 @@ del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id4'], observed=True).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'}) -ans.reset_index(inplace=True) +ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -150,8 +211,8 @@ question = "sum v1:v3 by id6" # q5 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id6'], observed=True).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'}) -ans.reset_index(inplace=True) +ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -162,8 +223,8 @@ del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id6'], observed=True).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'}) -ans.reset_index(inplace=True) +ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -178,36 +239,37 @@ question = "median v3 sd v3 by id4 id5" # q6 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id4','id5'], observed=True).agg({'v3': ['median','std']}) -ans.reset_index(inplace=True) +ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']}) +# ans.reset_index(inplace=True) 
print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v3']['median'].sum(), ans['v3']['std'].sum()] +chk = [ans[('v3', 'median')].sum(), ans[('v3', 'std')].sum()] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id4','id5'], observed=True).agg({'v3': ['median','std']}) -ans.reset_index(inplace=True) +ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v3']['median'].sum(), ans['v3']['std'].sum()] +chk = [ans[('v3', 'median')].sum(), ans[('v3', 'std')].sum()] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans +# TODO: change impl question = "max v1 - min v2 by id3" # q7 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id3'], observed=True).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']] -ans.reset_index(inplace=True) +ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']] +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -218,8 +280,8 @@ del ans 
gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id3'], observed=True).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']] -ans.reset_index(inplace=True) +ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']] +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -231,11 +293,12 @@ print(ans.tail(3), flush=True) del ans +# TODO: change impl question = "largest two v3 by id6" # q8 gc.collect() t_start = timeit.default_timer() -ans = x[['id6','v3']].sort_values('v3', ascending=False).groupby(['id6'], observed=True).head(2) -ans.reset_index(drop=True, inplace=True) +ans = x.groupby('id6')['v3'].nlargest(2).reset_index()[['id6', 'v3']] +# ans.reset_index(drop=True, inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -246,8 +309,8 @@ del ans gc.collect() t_start = timeit.default_timer() -ans = x[['id6','v3']].sort_values('v3', ascending=False).groupby(['id6'], observed=True).head(2) -ans.reset_index(drop=True, inplace=True) +ans = x.groupby('id6')['v3'].nlargest(2).reset_index()[['id6', 'v3']] +# ans.reset_index(drop=True, inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -259,12 +322,19 @@ print(ans.tail(3), flush=True) del ans +# TODO: change impl question = "regression v1 v2 by id2 id4" # q9 #ans = x[['id2','id4','v1','v2']].groupby(['id2','id4']).corr().iloc[0::2][['v2']]**2 # slower, 76s vs 47s on 1e8 1e2 gc.collect() t_start = timeit.default_timer() -ans = x[['id2','id4','v1','v2']].groupby(['id2','id4'], observed=True).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2})) -ans.reset_index(inplace=True) +from modin.experimental.sql import query +sql = """ +SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 +FROM df +GROUP BY id2, id4; +""" +ans = query(sql, 
df=x) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -275,8 +345,8 @@ del ans gc.collect() t_start = timeit.default_timer() -ans = x[['id2','id4','v1','v2']].groupby(['id2','id4'], observed=True).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2})) -ans.reset_index(inplace=True) +ans = query(sql, df=x) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -291,8 +361,8 @@ question = "sum v3 count by id1:id6" # q10 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id1','id2','id3','id4','id5','id6'], observed=True).agg({'v3':'sum', 'v1':'count'}) -ans.reset_index(inplace=True) +ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -303,8 +373,8 @@ del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id1','id2','id3','id4','id5','id6'], observed=True).agg({'v3':'sum', 'v1':'count'}) -ans.reset_index(inplace=True) +ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'}) +# ans.reset_index(inplace=True) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() From fad2249116ea673882a2bffa38958ec2df57ae63 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 2 Oct 2023 13:15:00 +0000 Subject: [PATCH 03/23] Removed FRAGMENT SIZE hardcode --- modin/groupby-modin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py index dd568214..5a6f3db6 100755 --- a/modin/groupby-modin.py +++ b/modin/groupby-modin.py @@ -8,7 +8,7 @@ os.environ["MODIN_STORAGE_FORMAT"] = "hdk" os.environ["MODIN_EXPERIMENTAL"] = "True" # os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000" -os.environ['MODIN_HDK_FRAGMENT_SIZE'] = 
"4000000" +# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "4000000" print("Pandas backend: Modin on HDK") import gc From 826c93245990b7ed710c40aa5799f84c510a6f52 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 2 Oct 2023 14:08:46 +0000 Subject: [PATCH 04/23] join update --- modin/join-modin.py | 232 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 206 insertions(+), 26 deletions(-) diff --git a/modin/join-modin.py b/modin/join-modin.py index b7c69650..38a8eb3b 100755 --- a/modin/join-modin.py +++ b/modin/join-modin.py @@ -5,65 +5,245 @@ import os import gc import timeit + +os.environ["MODIN_ENGINE"] = "native" +os.environ["MODIN_STORAGE_FORMAT"] = "hdk" +os.environ["MODIN_EXPERIMENTAL"] = "True" +# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000" +# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "4000000" +print("Pandas backend: Modin on HDK") + + +import modin import modin.pandas as pd -exec(open("./helpers.py").read()) -src_x = os.environ['SRC_X_LOCAL'] -src_y = os.environ['SRC_Y_LOCAL'] +import pyhdk +pyhdk.init() + +def init_modin_on_hdk(pd): + from modin.experimental.sql import query + + # Calcite initialization + data = {"a": [1, 2, 3]} + df = pd.DataFrame(data) + query("SELECT * FROM df", df=df) + + +init_modin_on_hdk(pd) + + +def trigger_import(df: pd.DataFrame): + """ + Trigger import execution for DataFrame obtained by HDK engine. + Parameters + ---------- + df : DataFrame + DataFrame for trigger import. + """ + modin_frame = df._query_compiler._modin_frame + if hasattr(modin_frame, "force_import"): + modin_frame.force_import() + return + + # The code below has been kept for backwards compatibility and will be removed in the future. 
-ver = "" #pd.__version__ + from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import ( + DbWorker, + ) + + df.shape # to trigger real execution + + p = modin_frame._partitions[0][0] + if ( + p.frame_id is None + and modin_frame._has_arrow_table() + and not isinstance(table := p.get(), pd.DataFrame) + ): + p.frame_id = DbWorker().import_arrow_table(table) # to trigger real execution + + +def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): + if trigger_hdk_import: + trigger_import(df) + else: + df._query_compiler._modin_frame._execute() + return df + +exec(open("./_helpers/helpers.py").read()) + +ver = modin.__version__ git = "" task = "join" -question = "inner join" -l = [os.path.basename(src_x), os.path.basename(src_y)] -data_name = '-'.join(l) solution = "modin" -fun = "merge" +fun = ".merge" cache = "TRUE" +on_disk = "FALSE" + + +data_name = os.environ['SRC_DATANAME'] +src_jn_x = os.path.join("data", data_name+".csv") +y_data_name = join_to_tbls(data_name) +src_jn_y = [os.path.join("data", y_data_name[0]+".csv"), os.path.join("data", y_data_name[1]+".csv"), os.path.join("data", y_data_name[2]+".csv")] +if len(src_jn_y) != 3: + raise Exception("Something went wrong in preparing files used for join") + -print("loading datasets...") +print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2], flush=True) -x = pd.read_csv(os.path.basename(src_x)) -y = pd.read_csv(os.path.basename(src_y)) +x = pd.read_csv(src_jn_x, dtype={ + **{n: "int32" for n in ["id1", "id2", "id3"]}, + **{n: "category" for n in ["id4", "id5", "id6"]}, + "v1": "float64", + }) -print("joining...") +small = pd.read_csv(src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"}) +medium = pd.read_csv(src_jn_y[1], dtype={ + **{n: "int32" for n in ["id1", "id2"]}, + **{n: "category" for n in ["id4", "id5"]}, + "v2": "float64", + }) +big = pd.read_csv(src_jn_y[2], dtype={ + **{n: "int32" 
for n in ["id1", "id2", "id3"]}, + **{n: "category" for n in ["id4", "id5", "id6"]}, + "v2": "float64", + },) -# NotImplementedError: To contribute to Pandas on Ray, please visit github.com/modin-project/modin +[execute(df, trigger_hdk_import=True) for df in [x, small, medium, big]] + +task_init = timeit.default_timer() +print("joining...", flush=True) + +question = "small inner on int" # q1 +gc.collect() +t_start = timeit.default_timer() +ans = x.merge(small, on='id1') +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['v1'].sum(), ans['v2'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +del ans gc.collect() t_start = timeit.default_timer() -ans = x.merge(y, how='inner', on='KEY') -print(ans.shape) +ans = x.merge(small, on='id1') +print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['X2'].sum(), ans['Y2'].sum()] +chk = [ans['v1'].sum(), ans['v2'].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(ans.head(3), flush=True) +print(ans.tail(3), flush=True) del ans +question = "medium inner on int" # q2 gc.collect() t_start = 
timeit.default_timer() -ans = x.merge(y, how='inner', on='KEY') -print(ans.shape) +ans = x.merge(medium, on='id2') +print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['X2'].sum(), ans['Y2'].sum()] +chk = [ans['v1'].sum(), ans['v2'].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +del ans +gc.collect() +t_start = timeit.default_timer() +ans = x.merge(medium, on='id2') +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['v1'].sum(), ans['v2'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(ans.head(3), flush=True) +print(ans.tail(3), flush=True) del ans +question = "medium outer on int" # q3 gc.collect() t_start = timeit.default_timer() -ans = x.merge(y, how='inner', on='KEY') -print(ans.shape) +ans = x.merge(medium, how='left', on='id2') +print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['X2'].sum(), ans['Y2'].sum()] +chk = [ans['v1'].sum(), ans['v2'].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, 
in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans +gc.collect() +t_start = timeit.default_timer() +ans = x.merge(medium, how='left', on='id2') +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['v1'].sum(), ans['v2'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(ans.head(3), flush=True) +print(ans.tail(3), flush=True) +del ans + +question = "medium inner on factor" # q4 +gc.collect() +t_start = timeit.default_timer() +ans = x.merge(medium, on='id5') +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['v1'].sum(), ans['v2'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +del ans +gc.collect() +t_start = timeit.default_timer() +ans = x.merge(medium, on='id5') +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['v1'].sum(), ans['v2'].sum()] +chkt = 
timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(ans.head(3), flush=True) +print(ans.tail(3), flush=True) +del ans + +question = "big inner on int" # q5 +gc.collect() +t_start = timeit.default_timer() +ans = x.merge(big, on='id3') +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['v1'].sum(), ans['v2'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +del ans +gc.collect() +t_start = timeit.default_timer() +ans = x.merge(big, on='id3') +print(ans.shape, flush=True) +t = timeit.default_timer() - t_start +m = memory_usage() +t_start = timeit.default_timer() +chk = [ans['v1'].sum(), ans['v2'].sum()] +chkt = timeit.default_timer() - t_start +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(ans.head(3), flush=True) +print(ans.tail(3), flush=True) +del ans + +print("joining finished, took %0.fs" % (timeit.default_timer()-task_init), flush=True) -exit(0) +exit(0) \ No newline at end of file From 6ed39e2f5259f6f689c715f7d44e8b23cdf22dfa Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 10 Oct 2023 13:31:41 +0000 Subject: [PATCH 05/23] current solution --- _launcher/launcher.R | 2 +- _launcher/solution.R | 2 +- modin/groupby-modin.py | 116 
+++++++++++------------------------------ modin/join-modin.py | 72 +++++-------------------- modin/modin-helpers.py | 52 ++++++++++++++++++ 5 files changed, 96 insertions(+), 148 deletions(-) create mode 100644 modin/modin-helpers.py diff --git a/_launcher/launcher.R b/_launcher/launcher.R index 2f4b07d2..a152075e 100644 --- a/_launcher/launcher.R +++ b/_launcher/launcher.R @@ -15,7 +15,7 @@ file.ext = function(x) { ans = switch( x, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", - "pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py", + "pandas"=, "spark"=, "pydatatable"=, "modin"="py", "hdk"="py", "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", "juliadf"="jl", "juliads"="jl", ) diff --git a/_launcher/solution.R b/_launcher/solution.R index 4b400d57..bb469d0c 100755 --- a/_launcher/solution.R +++ b/_launcher/solution.R @@ -111,7 +111,7 @@ file.ext = function(x) { ans = switch( x, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", - "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py", + "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "hdk"="py", "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", "juliadf"="jl", "juliads"="jl" ) diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py index 5a6f3db6..752043d5 100755 --- a/modin/groupby-modin.py +++ b/modin/groupby-modin.py @@ -2,83 +2,22 @@ print("# groupby-modin.py", flush=True) -import os - -os.environ["MODIN_ENGINE"] = "native" -os.environ["MODIN_STORAGE_FORMAT"] = "hdk" -os.environ["MODIN_EXPERIMENTAL"] = "True" -# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000" -# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "4000000" -print("Pandas backend: Modin on HDK") - import gc import timeit -import modin as modin -import modin.pandas as pd - -import pyhdk -pyhdk.init() - -def init_modin_on_hdk(pd): - from modin.experimental.sql import query - # Calcite 
initialization - data = {"a": [1, 2, 3]} - df = pd.DataFrame(data) - query("SELECT * FROM df", df=df) +exec(open("./modin/modin-helpers.py").read()) +import modin as modin +import modin.pandas as pd init_modin_on_hdk(pd) -gb_params = dict(as_index=False, sort=False, observed=True) - - -def trigger_import(df: pd.DataFrame): - """ - Trigger import execution for DataFrame obtained by HDK engine. - Parameters - ---------- - df : DataFrame - DataFrame for trigger import. - """ - modin_frame = df._query_compiler._modin_frame - if hasattr(modin_frame, "force_import"): - modin_frame.force_import() - return - - # The code below has been kept for backwards compatibility and will be removed in the future. - - from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import ( - DbWorker, - ) - - df.shape # to trigger real execution - - p = modin_frame._partitions[0][0] - if ( - p.frame_id is None - and modin_frame._has_arrow_table() - and not isinstance(table := p.get(), pd.DataFrame) - ): - p.frame_id = DbWorker().import_arrow_table(table) # to trigger real execution - - -def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): - if trigger_hdk_import: - trigger_import(df) - else: - df._query_compiler._modin_frame._execute() - return df exec(open("./_helpers/helpers.py").read()) ver = modin.__version__ - - -# warnings.filterwarnings('ignore') - git = "" task = "groupby" -solution = "modin" +solution = solution_txt fun = ".groupby" cache = "TRUE" on_disk = "FALSE" @@ -91,8 +30,11 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): **{n: "int32" for n in ["id4", "id5", "id6", "v1", "v2"]}, "v3": "float64",}) print(len(x.index), flush=True) +# To trigger non-lazy loading execute(x, trigger_hdk_import=True) +gb_params = dict(as_index=False, sort=False, observed=True) + task_init = timeit.default_timer() print("grouping...", flush=True) @@ -100,7 +42,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): 
gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id1'], **gb_params).agg({'v1':'sum'}) -# ans.reset_index(inplace=True) # #68 +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -112,7 +54,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id1'], **gb_params).agg({'v1':'sum'}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -128,7 +70,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -140,7 +82,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -156,7 +98,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -168,7 +110,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -184,7 +126,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = 
timeit.default_timer() ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -196,7 +138,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -212,7 +154,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -224,7 +166,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -240,7 +182,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -252,7 +194,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -269,7 +211,7 @@ def execute(df: 
pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']] -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -281,7 +223,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']] -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -297,8 +239,8 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): question = "largest two v3 by id6" # q8 gc.collect() t_start = timeit.default_timer() -ans = x.groupby('id6')['v3'].nlargest(2).reset_index()[['id6', 'v3']] -# ans.reset_index(drop=True, inplace=True) +ans = x.groupby('id6', sort=False, observed=True)['v3'].nlargest(2).reset_index()[['id6', 'v3']] +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -309,8 +251,8 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby('id6')['v3'].nlargest(2).reset_index()[['id6', 'v3']] -# ans.reset_index(drop=True, inplace=True) +ans = x.groupby('id6', sort=False, observed=True)['v3'].nlargest(2).reset_index()[['id6', 'v3']] +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -334,7 +276,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): GROUP BY id2, id4; """ ans = query(sql, df=x) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -346,7 +288,7 @@ def 
execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = query(sql, df=x) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -362,7 +304,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -374,7 +316,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'}) -# ans.reset_index(inplace=True) +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() diff --git a/modin/join-modin.py b/modin/join-modin.py index 38a8eb3b..f9b10d31 100755 --- a/modin/join-modin.py +++ b/modin/join-modin.py @@ -6,70 +6,13 @@ import gc import timeit -os.environ["MODIN_ENGINE"] = "native" -os.environ["MODIN_STORAGE_FORMAT"] = "hdk" -os.environ["MODIN_EXPERIMENTAL"] = "True" -# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000" -# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "4000000" -print("Pandas backend: Modin on HDK") +exec(open("./modin/modin-helpers.py").read()) - -import modin +import modin as modin import modin.pandas as pd - -import pyhdk -pyhdk.init() - -def init_modin_on_hdk(pd): - from modin.experimental.sql import query - - # Calcite initialization - data = {"a": [1, 2, 3]} - df = pd.DataFrame(data) - query("SELECT * FROM df", df=df) - - init_modin_on_hdk(pd) - -def trigger_import(df: pd.DataFrame): - """ - Trigger import execution for DataFrame obtained by HDK engine. - Parameters - ---------- - df : DataFrame - DataFrame for trigger import. 
- """ - modin_frame = df._query_compiler._modin_frame - if hasattr(modin_frame, "force_import"): - modin_frame.force_import() - return - - # The code below has been kept for backwards compatibility and will be removed in the future. - - from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import ( - DbWorker, - ) - - df.shape # to trigger real execution - - p = modin_frame._partitions[0][0] - if ( - p.frame_id is None - and modin_frame._has_arrow_table() - and not isinstance(table := p.get(), pd.DataFrame) - ): - p.frame_id = DbWorker().import_arrow_table(table) # to trigger real execution - - -def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): - if trigger_hdk_import: - trigger_import(df) - else: - df._query_compiler._modin_frame._execute() - return df - exec(open("./_helpers/helpers.py").read()) ver = modin.__version__ @@ -109,6 +52,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): "v2": "float64", },) +# To trigger non-lazy loading [execute(df, trigger_hdk_import=True) for df in [x, small, medium, big]] task_init = timeit.default_timer() @@ -118,6 +62,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.merge(small, on='id1') +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -129,6 +74,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.merge(small, on='id1') +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -144,6 +90,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.merge(medium, on='id2') +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -155,6 +102,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = 
False): gc.collect() t_start = timeit.default_timer() ans = x.merge(medium, on='id2') +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -170,6 +118,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.merge(medium, how='left', on='id2') +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -181,6 +130,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.merge(medium, how='left', on='id2') +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -196,6 +146,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.merge(medium, on='id5') +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -207,6 +158,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.merge(medium, on='id5') +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -222,6 +174,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.merge(big, on='id3') +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() @@ -233,6 +186,7 @@ def execute(df: pd.DataFrame, *, trigger_hdk_import: bool = False): gc.collect() t_start = timeit.default_timer() ans = x.merge(big, on='id3') +execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() diff --git a/modin/modin-helpers.py b/modin/modin-helpers.py new file mode 100644 index 00000000..5154e66e --- /dev/null +++ b/modin/modin-helpers.py @@ -0,0 +1,52 @@ +import os + +# Run configuration 
+os.environ["MODIN_CPUS"] = "40" + +do_execute=True +# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000" +solution_txt = ( + f"modin_{os.environ.get('MODIN_HDK_FRAGMENT_SIZE', 'nfs')}_" + + ("exec" if do_execute else "noexec") +) + +# Set up HDK backend +os.environ["MODIN_ENGINE"] = "native" +os.environ["MODIN_STORAGE_FORMAT"] = "hdk" +os.environ["MODIN_EXPERIMENTAL"] = "True" + +import pyhdk +pyhdk.init() +# pyhdk.init(enable_non_lazy_data_import=True) + + +def init_modin_on_hdk(pd): + from modin.experimental.sql import query + + # Calcite initialization + data = {"a": [1, 2, 3]} + df = pd.DataFrame(data) + query("SELECT * FROM df", df=df) + + +def execute(df, *, trigger_hdk_import: bool = False): + if trigger_hdk_import: + trigger_import(df) + else: + if do_execute: + df._query_compiler._modin_frame._execute() + return df + + +def trigger_import(df): + """ + Trigger import execution for DataFrame obtained by HDK engine. + Parameters + ---------- + df : DataFrame + DataFrame for trigger import. 
+ """ + modin_frame = df._query_compiler._modin_frame + if hasattr(modin_frame, "force_import"): + modin_frame.force_import() + return From c36ee558f80e05331a41282e5bf8d4a2c5874c8c Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 12 Oct 2023 12:02:50 +0000 Subject: [PATCH 06/23] setup script --- modin/setup-modin.sh | 18 +++++++++++------- modin/upg-modin.sh | 4 ++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/modin/setup-modin.sh b/modin/setup-modin.sh index 4ef46d87..30ef1d75 100755 --- a/modin/setup-modin.sh +++ b/modin/setup-modin.sh @@ -1,16 +1,20 @@ #!/bin/bash set -e -virtualenv modin/py-modin --python=python3 -source modin/py-modin/bin/activate +curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + sh install_miniconda.sh -u -b -p ./modin/miniconda && \ + rm -f install_miniconda.sh + +source ./modin/miniconda/bin/activate +conda install -y conda-libmamba-solver # install binaries -python3 -m pip install --upgrade modin[all] +conda create -y --prefix ./modin/py-modin -c conda-forge python=3.10 --experimental-solver=libmamba +conda install -y -p ./modin/py-modin -c conda-forge modin-hdk --experimental-solver=libmamba + +conda activate modin/py-modin # check -python3 -import modin -modin.__version__ -quit() +conda run -p modin/py-modin python3 -c "import modin; print(modin.__version__)" deactivate diff --git a/modin/upg-modin.sh b/modin/upg-modin.sh index 80ca5591..f0ed4093 100755 --- a/modin/upg-modin.sh +++ b/modin/upg-modin.sh @@ -3,6 +3,6 @@ set -e echo 'upgrading modin...' 
-source ./modin/py-modin/bin/activate +source ./modin/miniconda/bin/activate -python -m pip install --upgrade modin[all] > /dev/null +conda update modin-hdk -p ./modin/py-modin -y From da91c4813cc2f052c7515746a9f2c5c19a43f9bb Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 12 Oct 2023 13:47:39 +0000 Subject: [PATCH 07/23] fixed --- .gitignore | 2 ++ _launcher/launcher.R | 3 ++- _launcher/solution.R | 5 +++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 18572a17..294f3b82 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ metastore_db/* *.md5 .Rproj.user .Rhistory +py-modin +miniconda db-benchmark.Rproj */REVISION */VERSION diff --git a/_launcher/launcher.R b/_launcher/launcher.R index a152075e..d3d2b53e 100644 --- a/_launcher/launcher.R +++ b/_launcher/launcher.R @@ -15,9 +15,10 @@ file.ext = function(x) { ans = switch( x, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", - "pandas"=, "spark"=, "pydatatable"=, "modin"="py", "hdk"="py", "dask"=, "datafusion"=, "polars"="py", + "pandas"=, "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", "juliadf"="jl", "juliads"="jl", + "hdk"="py" ) if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x)) ans diff --git a/_launcher/solution.R b/_launcher/solution.R index bb469d0c..af0e161b 100755 --- a/_launcher/solution.R +++ b/_launcher/solution.R @@ -111,9 +111,10 @@ file.ext = function(x) { ans = switch( x, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", - "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "hdk"="py", "dask"=, "datafusion"=, "polars"="py", + "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", - "juliadf"="jl", "juliads"="jl" + "juliadf"="jl", "juliads"="jl", + "hdk"="py" ) if (is.null(ans)) stop(sprintf("solution %s does not 
have file extension defined in file.ext helper function", x)) ans From 7405a9585edf6f40274a76f6dd494ae64b1a85b3 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 12 Oct 2023 14:06:34 +0000 Subject: [PATCH 08/23] Updates --- _benchplot/benchplot-dict.R | 14 +++++--------- modin/groupby-modin.py | 9 ++------- modin/setup-modin.sh | 17 +++++++---------- modin/upg-modin.sh | 5 ++--- 4 files changed, 16 insertions(+), 29 deletions(-) diff --git a/_benchplot/benchplot-dict.R b/_benchplot/benchplot-dict.R index 5242436b..33f6643a 100644 --- a/_benchplot/benchplot-dict.R +++ b/_benchplot/benchplot-dict.R @@ -34,7 +34,7 @@ solution.dict = {list( "data.table" = list(name=c(short="data.table", long="data.table"), color=c(strong="blue", light="#7777FF")), "dplyr" = list(name=c(short="dplyr", long="dplyr"), color=c(strong="red", light="#FF7777")), "pandas" = list(name=c(short="pandas", long="pandas"), color=c(strong="green4", light="#77FF77")), - "modin" = list(name=c(short="pandas", long="pandas"), color=c(strong="green4", light="#77FF77")), + "modin" = list(name=c(short="modin", long="modin"), color=c(strong="blue4", light="#7799ff")), "pydatatable" = list(name=c(short="pydatatable", long="(py)datatable"), color=c(strong="darkorange", light="orange")), "spark" = list(name=c(short="spark", long="spark"), color=c(strong="#8000FFFF", light="#CC66FF")), "dask" = list(name=c(short="dask", long="dask"), color=c(strong="slategrey", light="lightgrey")), @@ -103,7 +103,6 @@ groupby.syntax.dict = {list( "regression v1 v2 by id2 id4" = "DF[['id2','id4','v1','v2']].groupby(['id2','id4'], as_index=False, sort=False, observed=True, dropna=False).apply(lambda x: x['v1'].corr(x['v2'])**2).rename(columns={None: 'r2'})", "sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3':'sum', 'v1':'size'})" )}, - # TODO: update later "modin" = {c( "sum v1 by id1" = "DF.groupby('id1', as_index=False, sort=False, 
observed=True, dropna=False).agg({'v1':'sum'})", "sum v1 by id1:id2" = "DF.groupby(['id1','id2'], as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum'})", @@ -112,8 +111,8 @@ groupby.syntax.dict = {list( "sum v1:v3 by id6" = "DF.groupby('id6', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})", "median v3 sd v3 by id4 id5" = "DF.groupby(['id4','id5'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3': ['median','std']})", "max v1 - min v2 by id3" = "DF.groupby('id3', as_index=False, sort=False, observed=True, dropna=False).agg({'v1':'max', 'v2':'min'}).assign(range_v1_v2=lambda x: x['v1']-x['v2'])[['id3','range_v1_v2']]", - "largest two v3 by id6" = "DF[~DF['v3'].isna()][['id6','v3']].sort_values('v3', ascending=False).groupby('id6', as_index=False, sort=False, observed=True, dropna=False).head(2)", - "regression v1 v2 by id2 id4" = "DF[['id2','id4','v1','v2']].groupby(['id2','id4'], as_index=False, sort=False, observed=True, dropna=False).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}))", + "largest two v3 by id6" = "DF.groupby('id6', sort=False, observed=True)['v3'].nlargest(2).reset_index()[['id6', 'v3']]", + "regression v1 v2 by id2 id4" = "query('SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;', df=x)", "sum v3 count by id1:id6" = "DF.groupby(['id1','id2','id3','id4','id5','id6'], as_index=False, sort=False, observed=True, dropna=False).agg({'v3':'sum', 'v1':'size'})" )}, "pydatatable" = {c( @@ -280,9 +279,8 @@ groupby.data.exceptions = {list( "pandas" = {list( "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") # read_csv #9 )}, - # TODO: fix later "modin" = {list( - "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") # read_csv #9 + "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") )}, "pydatatable" = {list( "csv reader NAs 
bug: datatable#2808" = c("G1_1e9_1e2_5_0") @@ -381,7 +379,6 @@ join.syntax.dict = {list( "medium inner on factor" = "DF.merge(medium, on='id5')", "big inner on int" = "DF.merge(big, on='id3')" )}, - # TODO: update later "modin" = {c( "small inner on int" = "DF.merge(small, on='id1')", "medium inner on int" = "DF.merge(medium, on='id2')", @@ -473,9 +470,8 @@ join.data.exceptions = {list( "pandas" = {list( "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") # read_csv )}, - # TODO: update later "modin" = {list( - "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") # read_csv + "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") )}, "pydatatable" = {list( "csv reader NAs bug: datatable#2808" = "J1_1e9_NA_5_0", diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py index 752043d5..846c5e26 100755 --- a/modin/groupby-modin.py +++ b/modin/groupby-modin.py @@ -270,12 +270,7 @@ gc.collect() t_start = timeit.default_timer() from modin.experimental.sql import query -sql = """ -SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 -FROM df -GROUP BY id2, id4; -""" -ans = query(sql, df=x) +ans = query('SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;', df=x) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start @@ -287,7 +282,7 @@ del ans gc.collect() t_start = timeit.default_timer() -ans = query(sql, df=x) +ans = query('SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;', df=x) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start diff --git a/modin/setup-modin.sh b/modin/setup-modin.sh index 30ef1d75..1eb9d56b 100755 --- a/modin/setup-modin.sh +++ b/modin/setup-modin.sh @@ -1,20 +1,17 @@ #!/bin/bash set -e -curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - sh install_miniconda.sh -u -b -p ./modin/miniconda && \ +curl -o install_miniconda.sh -L 
https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh && \ + sh install_miniconda.sh -u -b -p ./modin/py-modin && \ rm -f install_miniconda.sh -source ./modin/miniconda/bin/activate -conda install -y conda-libmamba-solver +source ./modin/py-modin/bin/activate +conda install -y -c conda-forge conda-libmamba-solver # install binaries -conda create -y --prefix ./modin/py-modin -c conda-forge python=3.10 --experimental-solver=libmamba -conda install -y -p ./modin/py-modin -c conda-forge modin-hdk --experimental-solver=libmamba - -conda activate modin/py-modin +conda install -y -c conda-forge modin-hdk --solver=libmamba # check -conda run -p modin/py-modin python3 -c "import modin; print(modin.__version__)" +python3 -c "import modin; print(modin.__version__)" -deactivate +conda deactivate diff --git a/modin/upg-modin.sh b/modin/upg-modin.sh index f0ed4093..7c6ae50d 100755 --- a/modin/upg-modin.sh +++ b/modin/upg-modin.sh @@ -3,6 +3,5 @@ set -e echo 'upgrading modin...' 
-source ./modin/miniconda/bin/activate - -conda update modin-hdk -p ./modin/py-modin -y +source ./modin/py-modin/bin/activate +conda update modin-hdk -y -c conda-forge --solver=libmamba From 01bdc14706adcf77c500f437ab22bc738dbb4b88 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 13 Oct 2023 13:54:09 +0000 Subject: [PATCH 09/23] moved HDK --- _launcher/solution.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_launcher/solution.R b/_launcher/solution.R index af0e161b..3b233691 100755 --- a/_launcher/solution.R +++ b/_launcher/solution.R @@ -113,8 +113,8 @@ file.ext = function(x) { "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", - "juliadf"="jl", "juliads"="jl", - "hdk"="py" + "hdk"="py", + "juliadf"="jl", "juliads"="jl" ) if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x)) ans From 379c099fd92431675be89506cc0bcaea5a598c97 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 13 Oct 2023 13:57:44 +0000 Subject: [PATCH 10/23] Fixed formatting --- modin/groupby-modin.py | 14 +++++++++----- modin/join-modin.py | 24 ++++++++++++------------ 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py index 846c5e26..3846c7be 100755 --- a/modin/groupby-modin.py +++ b/modin/groupby-modin.py @@ -17,7 +17,7 @@ ver = modin.__version__ git = "" task = "groupby" -solution = solution_txt +solution = "modin" fun = ".groupby" cache = "TRUE" on_disk = "FALSE" @@ -26,10 +26,14 @@ src_grp = os.path.join("data", data_name+".csv") print("loading dataset %s" % data_name, flush=True) -x = pd.read_csv(src_grp, dtype={'id1':'category', 'id2':'category', 'id3':'category', - **{n: "int32" for n in ["id4", "id5", "id6", "v1", "v2"]}, - "v3": "float64",}) -print(len(x.index), flush=True) +x = pd.read_csv( + 
src_grp, + dtype={ + 'id1':'category', 'id2':'category', 'id3':'category', + **{n: "int32" for n in ["id4", "id5", "id6", "v1", "v2"]}, + "v3": "float64", + } +) # To trigger non-lazy loading execute(x, trigger_hdk_import=True) diff --git a/modin/join-modin.py b/modin/join-modin.py index f9b10d31..7abbf252 100755 --- a/modin/join-modin.py +++ b/modin/join-modin.py @@ -35,22 +35,22 @@ print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2], flush=True) x = pd.read_csv(src_jn_x, dtype={ - **{n: "int32" for n in ["id1", "id2", "id3"]}, - **{n: "category" for n in ["id4", "id5", "id6"]}, - "v1": "float64", - }) + **{n: "int32" for n in ["id1", "id2", "id3"]}, + **{n: "category" for n in ["id4", "id5", "id6"]}, + "v1": "float64", +}) small = pd.read_csv(src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"}) medium = pd.read_csv(src_jn_y[1], dtype={ - **{n: "int32" for n in ["id1", "id2"]}, - **{n: "category" for n in ["id4", "id5"]}, - "v2": "float64", - }) + **{n: "int32" for n in ["id1", "id2"]}, + **{n: "category" for n in ["id4", "id5"]}, + "v2": "float64", +}) big = pd.read_csv(src_jn_y[2], dtype={ - **{n: "int32" for n in ["id1", "id2", "id3"]}, - **{n: "category" for n in ["id4", "id5", "id6"]}, - "v2": "float64", - },) + **{n: "int32" for n in ["id1", "id2", "id3"]}, + **{n: "category" for n in ["id4", "id5", "id6"]}, + "v2": "float64", +}) # To trigger non-lazy loading [execute(df, trigger_hdk_import=True) for df in [x, small, medium, big]] From 52d22baf5376f996b13cb523a432ade2e3d88636 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 16 Oct 2023 12:02:02 +0000 Subject: [PATCH 11/23] Updated groupby --- _launcher/launcher.R | 1 - modin/groupby-modin.py | 7 ++----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/_launcher/launcher.R b/_launcher/launcher.R index d3d2b53e..fd58be9d 100644 --- a/_launcher/launcher.R +++ b/_launcher/launcher.R @@ -18,7 +18,6 @@ file.ext = 
function(x) { "pandas"=, "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", "juliadf"="jl", "juliads"="jl", - "hdk"="py" ) if (is.null(ans)) stop(sprintf("solution %s does not have file extension defined in file.ext helper function", x)) ans diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py index 3846c7be..9b71d417 100755 --- a/modin/groupby-modin.py +++ b/modin/groupby-modin.py @@ -210,11 +210,10 @@ print(ans.tail(3), flush=True) del ans -# TODO: change impl question = "max v1 - min v2 by id3" # q7 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']] +ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['id3', 'range_v1_v2']] execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start @@ -226,7 +225,7 @@ del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['range_v1_v2']] +ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['id3', 'range_v1_v2']] execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start @@ -239,7 +238,6 @@ print(ans.tail(3), flush=True) del ans -# TODO: change impl question = "largest two v3 by id6" # q8 gc.collect() t_start = timeit.default_timer() @@ -268,7 +266,6 @@ print(ans.tail(3), flush=True) del ans -# TODO: change impl question = "regression v1 v2 by id2 id4" # q9 #ans = x[['id2','id4','v1','v2']].groupby(['id2','id4']).corr().iloc[0::2][['v2']]**2 # slower, 76s vs 47s on 1e8 1e2 gc.collect() From a1bbb3bc43812972172db44546a045ef969154da Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 27 Oct 2023 12:29:45 +0200 Subject: [PATCH 12/23] better name --- 
modin/join-modin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/join-modin.py b/modin/join-modin.py index 7abbf252..e433999c 100755 --- a/modin/join-modin.py +++ b/modin/join-modin.py @@ -19,7 +19,7 @@ git = "" task = "join" solution = "modin" -fun = ".merge" +fun = "merge" cache = "TRUE" on_disk = "FALSE" From d51206de1c6b0ff267d856c4a00b28e9e6b51178 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Tue, 7 Nov 2023 09:47:53 +0100 Subject: [PATCH 13/23] Update to latest modin --- _utils/repro.sh | 21 +- modin/groupby-modin.py | 542 +++++++++++++++++++++++++++++++++++------ modin/join-modin.py | 322 +++++++++++++++++++----- modin/modin-helpers.py | 38 +-- modin/setup-modin.sh | 4 +- 5 files changed, 750 insertions(+), 177 deletions(-) mode change 100644 => 100755 _utils/repro.sh diff --git a/_utils/repro.sh b/_utils/repro.sh old mode 100644 new mode 100755 index a8df441f..20ddefd4 --- a/_utils/repro.sh +++ b/_utils/repro.sh @@ -31,8 +31,21 @@ cd pydatatable virtualenv py-pydatatable --python=/usr/bin/python3.10 cd ../pandas virtualenv py-pandas --python=/usr/bin/python3.10 +################# +# Install modin # +################# cd ../modin -virtualenv py-modin --python=/usr/bin/python3.10 +curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh && \ + sh install_miniconda.sh -u -b -p ./py-modin && \ + rm -f install_miniconda.sh + +source ./py-modin/bin/activate +conda install -y conda-libmamba-solver + +# install binaries +conda install -y -c conda-forge modin-hdk --solver=libmamba + +conda deactivate cd .. 
@@ -45,8 +58,8 @@ python3 -m pip install --upgrade pandas deactivate source ./modin/py-modin/bin/activate -python3 -m pip install --upgrade modin -deactivate +conda update modin-hdk -y -c conda-forge --solver=libmamba +conda deactivate source ./pydatatable/py-pydatatable/bin/activate python3 -m pip install --upgrade git+https://github.com/h2oai/datatable @@ -72,7 +85,7 @@ mv G1_1e7_1e2_0_0.csv data/ echo "Changing run.conf and _control/data.csv to run only groupby at 0.5GB" cp run.conf run.conf.original sed -i 's/groupby join groupby2014/groupby/g' run.conf -sed -i 's/data.table dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf +sed -i 's/data.table dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf sed -i 's/DO_PUBLISH=true/DO_PUBLISH=false/g' run.conf # set sizes diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py index 9b71d417..53510316 100755 --- a/modin/groupby-modin.py +++ b/modin/groupby-modin.py @@ -9,6 +9,7 @@ import modin as modin import modin.pandas as pd +from modin.utils import execute init_modin_on_hdk(pd) @@ -22,17 +23,19 @@ cache = "TRUE" on_disk = "FALSE" -data_name = os.environ['SRC_DATANAME'] -src_grp = os.path.join("data", data_name+".csv") +data_name = os.environ["SRC_DATANAME"] +src_grp = os.path.join("data", data_name + ".csv") print("loading dataset %s" % data_name, flush=True) x = pd.read_csv( src_grp, dtype={ - 'id1':'category', 'id2':'category', 'id3':'category', + "id1": "category", + "id2": "category", + "id3": "category", **{n: "int32" for n in ["id4", "id5", "id6", "v1", "v2"]}, "v3": "float64", - } + }, ) # To trigger non-lazy loading execute(x, trigger_hdk_import=True) @@ -42,288 +45,675 @@ task_init = timeit.default_timer() print("grouping...", flush=True) -question = "sum v1 by id1" # q1 +question = "sum v1 by id1" # q1 gc.collect() t_start = timeit.default_timer() -ans = 
x.groupby(['id1'], **gb_params).agg({'v1':'sum'}) +ans = x.groupby(["id1"], **gb_params).agg({"v1": "sum"}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum()] +chk = [ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id1'], **gb_params).agg({'v1':'sum'}) +ans = x.groupby(["id1"], **gb_params).agg({"v1": "sum"}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum()] +chk = [ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -question = "sum v1 by id1:id2" # q2 +question = "sum 
v1 by id1:id2" # q2 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'}) +ans = x.groupby(["id1", "id2"], **gb_params).agg({"v1": "sum"}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum()] +chk = [ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id1','id2'], **gb_params).agg({'v1':'sum'}) +ans = x.groupby(["id1", "id2"], **gb_params).agg({"v1": "sum"}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum()] +chk = [ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), 
flush=True) print(ans.tail(3), flush=True) del ans -question = "sum v1 mean v3 by id3" # q3 +question = "sum v1 mean v3 by id3" # q3 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'}) +ans = x.groupby(["id3"], **gb_params).agg({"v1": "sum", "v3": "mean"}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v3'].sum()] +chk = [ans["v1"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id3'], **gb_params).agg({'v1':'sum', 'v3':'mean'}) +ans = x.groupby(["id3"], **gb_params).agg({"v1": "sum", "v3": "mean"}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v3'].sum()] +chk = [ans["v1"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + 
out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -question = "mean v1:v3 by id4" # q4 +question = "mean v1:v3 by id4" # q4 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'}) +ans = x.groupby(["id4"], **gb_params).agg({"v1": "mean", "v2": "mean", "v3": "mean"}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum(), ans['v3'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id4'], **gb_params).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'}) +ans = x.groupby(["id4"], **gb_params).agg({"v1": "mean", "v2": "mean", "v3": "mean"}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum(), ans['v3'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], 
question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -question = "sum v1:v3 by id6" # q5 +question = "sum v1:v3 by id6" # q5 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'}) +ans = x.groupby(["id6"], **gb_params).agg({"v1": "sum", "v2": "sum", "v3": "sum"}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum(), ans['v3'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id6'], **gb_params).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'}) +ans = x.groupby(["id6"], **gb_params).agg({"v1": "sum", "v2": "sum", "v3": "sum"}) execute(ans) print(ans.shape, 
flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum(), ans['v3'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -question = "median v3 sd v3 by id4 id5" # q6 +question = "median v3 sd v3 by id4 id5" # q6 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']}) +ans = x.groupby(["id4", "id5"], **gb_params).agg({"v3": ["median", "std"]}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans[('v3', 'median')].sum(), ans[('v3', 'std')].sum()] +chk = [ans[("v3", "median")].sum(), ans[("v3", "std")].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + 
cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id4','id5'], **gb_params).agg({'v3': ['median','std']}) +ans = x.groupby(["id4", "id5"], **gb_params).agg({"v3": ["median", "std"]}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans[('v3', 'median')].sum(), ans[('v3', 'std')].sum()] +chk = [ans[("v3", "median")].sum(), ans[("v3", "std")].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -question = "max v1 - min v2 by id3" # q7 +question = "max v1 - min v2 by id3" # q7 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['id3', 'range_v1_v2']] +ans = ( + x.groupby(["id3"], **gb_params) + .agg({"v1": "max", "v2": "min"}) + .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]] +) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['range_v1_v2'].sum()] +chk = [ans["range_v1_v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], 
out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id3'], **gb_params).agg({'v1': 'max', 'v2': 'min'}).assign(range_v1_v2=lambda x: x['v1'] - x['v2'])[['id3', 'range_v1_v2']] +ans = ( + x.groupby(["id3"], **gb_params) + .agg({"v1": "max", "v2": "min"}) + .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]] +) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['range_v1_v2'].sum()] +chk = [ans["range_v1_v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -question = "largest two v3 by id6" # q8 +question = "largest two v3 by id6" # q8 gc.collect() t_start = timeit.default_timer() -ans = x.groupby('id6', sort=False, observed=True)['v3'].nlargest(2).reset_index()[['id6', 'v3']] +ans = ( + x.groupby("id6", sort=False, 
observed=True)["v3"] + .nlargest(2) + .reset_index()[["id6", "v3"]] +) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v3'].sum()] +chk = [ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby('id6', sort=False, observed=True)['v3'].nlargest(2).reset_index()[['id6', 'v3']] +ans = ( + x.groupby("id6", sort=False, observed=True)["v3"] + .nlargest(2) + .reset_index()[["id6", "v3"]] +) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v3'].sum()] +chk = [ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del 
ans -question = "regression v1 v2 by id2 id4" # q9 -#ans = x[['id2','id4','v1','v2']].groupby(['id2','id4']).corr().iloc[0::2][['v2']]**2 # slower, 76s vs 47s on 1e8 1e2 +question = "regression v1 v2 by id2 id4" # q9 +# ans = x[['id2','id4','v1','v2']].groupby(['id2','id4']).corr().iloc[0::2][['v2']]**2 # slower, 76s vs 47s on 1e8 1e2 gc.collect() t_start = timeit.default_timer() from modin.experimental.sql import query -ans = query('SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;', df=x) + +ans = query( + "SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x +) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['r2'].sum()] +chk = [ans["r2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = query('SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;', df=x) +ans = query( + "SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x +) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['r2'].sum()] +chk = [ans["r2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], 
out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -question = "sum v3 count by id1:id6" # q10 +question = "sum v3 count by id1:id6" # q10 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'}) +ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg( + {"v3": "sum", "v1": "count"} +) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v3'].sum(), ans['v1'].sum()] +chk = [ans["v3"].sum(), ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(['id1','id2','id3','id4','id5','id6'], **gb_params).agg({'v3':'sum', 'v1':'count'}) +ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg( + {"v3": "sum", "v1": "count"} +) 
execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v3'].sum(), ans['v1'].sum()] +chk = [ans["v3"].sum(), ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -print("grouping finished, took %0.fs" % (timeit.default_timer()-task_init), flush=True) +print( + "grouping finished, took %0.fs" % (timeit.default_timer() - task_init), flush=True +) exit(0) diff --git a/modin/join-modin.py b/modin/join-modin.py index e433999c..09bbad32 100755 --- a/modin/join-modin.py +++ b/modin/join-modin.py @@ -10,6 +10,7 @@ import modin as modin import modin.pandas as pd +from modin.utils import execute init_modin_on_hdk(pd) @@ -24,33 +25,58 @@ on_disk = "FALSE" -data_name = os.environ['SRC_DATANAME'] -src_jn_x = os.path.join("data", data_name+".csv") +data_name = os.environ["SRC_DATANAME"] +src_jn_x = os.path.join("data", data_name + ".csv") y_data_name = join_to_tbls(data_name) -src_jn_y = [os.path.join("data", y_data_name[0]+".csv"), os.path.join("data", y_data_name[1]+".csv"), os.path.join("data", y_data_name[2]+".csv")] +src_jn_y = [ + os.path.join("data", y_data_name[0] + ".csv"), + os.path.join("data", y_data_name[1] + ".csv"), + os.path.join("data", y_data_name[2] + ".csv"), +] if len(src_jn_y) != 3: - raise Exception("Something went wrong in 
preparing files used for join") + raise Exception("Something went wrong in preparing files used for join") -print("loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2], flush=True) +print( + "loading datasets " + + data_name + + ", " + + y_data_name[0] + + ", " + + y_data_name[1] + + ", " + + y_data_name[2], + flush=True, +) -x = pd.read_csv(src_jn_x, dtype={ - **{n: "int32" for n in ["id1", "id2", "id3"]}, - **{n: "category" for n in ["id4", "id5", "id6"]}, - "v1": "float64", -}) +x = pd.read_csv( + src_jn_x, + dtype={ + **{n: "int32" for n in ["id1", "id2", "id3"]}, + **{n: "category" for n in ["id4", "id5", "id6"]}, + "v1": "float64", + }, +) -small = pd.read_csv(src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"}) -medium = pd.read_csv(src_jn_y[1], dtype={ - **{n: "int32" for n in ["id1", "id2"]}, - **{n: "category" for n in ["id4", "id5"]}, - "v2": "float64", -}) -big = pd.read_csv(src_jn_y[2], dtype={ - **{n: "int32" for n in ["id1", "id2", "id3"]}, - **{n: "category" for n in ["id4", "id5", "id6"]}, - "v2": "float64", -}) +small = pd.read_csv( + src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"} +) +medium = pd.read_csv( + src_jn_y[1], + dtype={ + **{n: "int32" for n in ["id1", "id2"]}, + **{n: "category" for n in ["id4", "id5"]}, + "v2": "float64", + }, +) +big = pd.read_csv( + src_jn_y[2], + dtype={ + **{n: "int32" for n in ["id1", "id2", "id3"]}, + **{n: "category" for n in ["id4", "id5", "id6"]}, + "v2": "float64", + }, +) # To trigger non-lazy loading [execute(df, trigger_hdk_import=True) for df in [x, small, medium, big]] @@ -58,146 +84,326 @@ task_init = timeit.default_timer() print("joining...", flush=True) -question = "small inner on int" # q1 +question = "small inner on int" # q1 gc.collect() t_start = timeit.default_timer() -ans = x.merge(small, on='id1') +ans = x.merge(small, on="id1") execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - 
t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.merge(small, on='id1') +ans = x.merge(small, on="id1") execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -question = "medium inner on int" # q2 +question = "medium inner on int" # q2 gc.collect() t_start = timeit.default_timer() -ans = x.merge(medium, on='id2') +ans = x.merge(medium, on="id2") execute(ans) 
print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.merge(medium, on='id2') +ans = x.merge(medium, on="id2") execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -question = "medium outer on int" # q3 +question = "medium outer on int" # q3 gc.collect() t_start = timeit.default_timer() -ans = x.merge(medium, 
how='left', on='id2') +ans = x.merge(medium, how="left", on="id2") execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.merge(medium, how='left', on='id2') +ans = x.merge(medium, how="left", on="id2") execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -question = "medium inner on factor" # q4 +question 
= "medium inner on factor" # q4 gc.collect() t_start = timeit.default_timer() -ans = x.merge(medium, on='id5') +ans = x.merge(medium, on="id5") execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.merge(medium, on='id5') +ans = x.merge(medium, on="id5") execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans 
-question = "big inner on int" # q5 +question = "big inner on int" # q5 gc.collect() t_start = timeit.default_timer() -ans = x.merge(big, on='id3') +ans = x.merge(big, on="id3") execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) del ans gc.collect() t_start = timeit.default_timer() -ans = x.merge(big, on='id3') +ans = x.merge(big, on="id3") execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() -chk = [ans['v1'].sum(), ans['v2'].sum()] +chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +write_log( + task=task, + data=data_name, + in_rows=x.shape[0], + question=question, + out_rows=ans.shape[0], + out_cols=ans.shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=2, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk(chk), + chk_time_sec=chkt, + on_disk=on_disk, +) print(ans.head(3), flush=True) 
print(ans.tail(3), flush=True) del ans -print("joining finished, took %0.fs" % (timeit.default_timer()-task_init), flush=True) +print("joining finished, took %0.fs" % (timeit.default_timer() - task_init), flush=True) -exit(0) \ No newline at end of file +exit(0) diff --git a/modin/modin-helpers.py b/modin/modin-helpers.py index 5154e66e..bba58d71 100644 --- a/modin/modin-helpers.py +++ b/modin/modin-helpers.py @@ -1,52 +1,16 @@ import os -# Run configuration -os.environ["MODIN_CPUS"] = "40" - -do_execute=True -# os.environ['MODIN_HDK_FRAGMENT_SIZE'] = "32000000" -solution_txt = ( - f"modin_{os.environ.get('MODIN_HDK_FRAGMENT_SIZE', 'nfs')}_" + - ("exec" if do_execute else "noexec") -) # Set up HDK backend os.environ["MODIN_ENGINE"] = "native" os.environ["MODIN_STORAGE_FORMAT"] = "hdk" os.environ["MODIN_EXPERIMENTAL"] = "True" -import pyhdk -pyhdk.init() -# pyhdk.init(enable_non_lazy_data_import=True) - def init_modin_on_hdk(pd): + """Modin on HDK warmup before benchmarking for calcite""" from modin.experimental.sql import query - # Calcite initialization data = {"a": [1, 2, 3]} df = pd.DataFrame(data) query("SELECT * FROM df", df=df) - - -def execute(df, *, trigger_hdk_import: bool = False): - if trigger_hdk_import: - trigger_import(df) - else: - if do_execute: - df._query_compiler._modin_frame._execute() - return df - - -def trigger_import(df): - """ - Trigger import execution for DataFrame obtained by HDK engine. - Parameters - ---------- - df : DataFrame - DataFrame for trigger import. 
- """ - modin_frame = df._query_compiler._modin_frame - if hasattr(modin_frame, "force_import"): - modin_frame.force_import() - return diff --git a/modin/setup-modin.sh b/modin/setup-modin.sh index 1eb9d56b..6e237158 100755 --- a/modin/setup-modin.sh +++ b/modin/setup-modin.sh @@ -1,12 +1,12 @@ #!/bin/bash set -e -curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh && \ +curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh && \ sh install_miniconda.sh -u -b -p ./modin/py-modin && \ rm -f install_miniconda.sh source ./modin/py-modin/bin/activate -conda install -y -c conda-forge conda-libmamba-solver +conda install -y conda-libmamba-solver # install binaries conda install -y -c conda-forge modin-hdk --solver=libmamba From 8768466ea70e20506bd6b478381445ba030b0d2b Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Wed, 8 Nov 2023 09:38:26 +0100 Subject: [PATCH 14/23] Update to latest HDK --- _utils/repro.sh | 6 ++++++ modin/groupby-modin.py | 17 ++++++++++++++++- modin/join-modin.py | 18 ++++++++++++++++-- modin/modin-helpers.py | 16 ---------------- modin/setup-modin.sh | 5 +++++ 5 files changed, 43 insertions(+), 19 deletions(-) delete mode 100644 modin/modin-helpers.py diff --git a/_utils/repro.sh b/_utils/repro.sh index 20ddefd4..e4c7a0a1 100755 --- a/_utils/repro.sh +++ b/_utils/repro.sh @@ -42,10 +42,16 @@ curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-p source ./py-modin/bin/activate conda install -y conda-libmamba-solver +conda create --name modin -y +conda activate modin +echo "conda activate modin" >> ./py-modin/bin/activate + # install binaries conda install -y -c conda-forge modin-hdk --solver=libmamba conda deactivate +conda deactivate + cd .. 
diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py index 53510316..bcf86dad 100755 --- a/modin/groupby-modin.py +++ b/modin/groupby-modin.py @@ -2,15 +2,30 @@ print("# groupby-modin.py", flush=True) +import os import gc import timeit -exec(open("./modin/modin-helpers.py").read()) +# Set up HDK backend +os.environ["MODIN_ENGINE"] = "native" +os.environ["MODIN_STORAGE_FORMAT"] = "hdk" +os.environ["MODIN_EXPERIMENTAL"] = "True" + import modin as modin import modin.pandas as pd from modin.utils import execute + +def init_modin_on_hdk(): + """Modin on HDK warmup before benchmarking for calcite""" + from modin.experimental.sql import query + + data = {"a": [1, 2, 3]} + df = pd.DataFrame(data) + query("SELECT * FROM df", df=df) + + init_modin_on_hdk(pd) exec(open("./_helpers/helpers.py").read()) diff --git a/modin/join-modin.py b/modin/join-modin.py index 09bbad32..50c13cc4 100755 --- a/modin/join-modin.py +++ b/modin/join-modin.py @@ -6,13 +6,27 @@ import gc import timeit -exec(open("./modin/modin-helpers.py").read()) +# Set up HDK backend +os.environ["MODIN_ENGINE"] = "native" +os.environ["MODIN_STORAGE_FORMAT"] = "hdk" +os.environ["MODIN_EXPERIMENTAL"] = "True" + import modin as modin import modin.pandas as pd from modin.utils import execute -init_modin_on_hdk(pd) + +def init_modin_on_hdk(): + """Modin on HDK warmup before benchmarking for calcite""" + from modin.experimental.sql import query + + data = {"a": [1, 2, 3]} + df = pd.DataFrame(data) + query("SELECT * FROM df", df=df) + + +init_modin_on_hdk() exec(open("./_helpers/helpers.py").read()) diff --git a/modin/modin-helpers.py b/modin/modin-helpers.py deleted file mode 100644 index bba58d71..00000000 --- a/modin/modin-helpers.py +++ /dev/null @@ -1,16 +0,0 @@ -import os - - -# Set up HDK backend -os.environ["MODIN_ENGINE"] = "native" -os.environ["MODIN_STORAGE_FORMAT"] = "hdk" -os.environ["MODIN_EXPERIMENTAL"] = "True" - - -def init_modin_on_hdk(pd): - """Modin on HDK warmup before benchmarking for 
calcite""" - from modin.experimental.sql import query - - data = {"a": [1, 2, 3]} - df = pd.DataFrame(data) - query("SELECT * FROM df", df=df) diff --git a/modin/setup-modin.sh b/modin/setup-modin.sh index 6e237158..c8b831bc 100755 --- a/modin/setup-modin.sh +++ b/modin/setup-modin.sh @@ -8,6 +8,10 @@ curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-p source ./modin/py-modin/bin/activate conda install -y conda-libmamba-solver +conda create --name modin -y +conda activate modin +echo "conda activate modin" >> ./modin/py-modin/bin/activate + # install binaries conda install -y -c conda-forge modin-hdk --solver=libmamba @@ -15,3 +19,4 @@ conda install -y -c conda-forge modin-hdk --solver=libmamba python3 -c "import modin; print(modin.__version__)" conda deactivate +conda deactivate From 2242d835a7dc778e3f60b2494e8ef916a0fce703 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Wed, 8 Nov 2023 10:20:20 +0100 Subject: [PATCH 15/23] Fixed style --- _benchplot/benchplot-dict.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_benchplot/benchplot-dict.R b/_benchplot/benchplot-dict.R index 8241f6b7..12f6ce4e 100644 --- a/_benchplot/benchplot-dict.R +++ b/_benchplot/benchplot-dict.R @@ -266,7 +266,7 @@ groupby.syntax.dict = {list( "data.table" = list(), "dplyr" = list(), "pandas" = list(), - "modin" = list(), + "modin" = list(), "pydatatable" = list(), "spark" = list("not yet implemented: SPARK-26589" = "median v3 sd v3 by id4 id5"), "dask" = list("not yet implemented: dask#4362" = "median v3 sd v3 by id4 id5"), From e5a1e0c6d23e47793f852c0ef658e8d0660f6480 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Wed, 8 Nov 2023 10:42:47 +0100 Subject: [PATCH 16/23] codestyle fix --- _benchplot/benchplot-dict.R | 3 +- modin/groupby-modin.py | 444 +++--------------------------------- modin/join-modin.py | 213 +---------------- 3 files changed, 42 insertions(+), 618 deletions(-) diff --git a/_benchplot/benchplot-dict.R 
b/_benchplot/benchplot-dict.R index 12f6ce4e..50d1f9ae 100644 --- a/_benchplot/benchplot-dict.R +++ b/_benchplot/benchplot-dict.R @@ -296,7 +296,6 @@ groupby.data.exceptions = {list( "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") # read_csv #9 )}, "modin" = {list( - "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1") )}, "pydatatable" = {list( "csv reader NAs bug: datatable#2808" = c("G1_1e9_1e2_5_0") @@ -471,7 +470,7 @@ join.query.exceptions = {list( "data.table" = list(), "dplyr" = list(), "pandas" = list(), - "modin" = list(), + "modin" = list(), "pydatatable" = list(), "spark" = list(), "dask" = list(), diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py index bcf86dad..7eb5fd59 100755 --- a/modin/groupby-modin.py +++ b/modin/groupby-modin.py @@ -71,25 +71,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() @@ -101,25 +83,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - 
cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -135,25 +99,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() @@ -165,25 +111,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -199,25 +127,7 
@@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() @@ -229,25 +139,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -263,25 +155,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - 
cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() @@ -293,25 +167,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -327,25 +183,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans 
gc.collect() t_start = timeit.default_timer() @@ -357,25 +195,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -391,25 +211,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans[("v3", "median")].sum(), ans[("v3", "std")].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() @@ -421,25 +223,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans[("v3", "median")].sum(), ans[("v3", "std")].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - 
out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -447,11 +231,7 @@ def init_modin_on_hdk(): question = "max v1 - min v2 by id3" # q7 gc.collect() t_start = timeit.default_timer() -ans = ( - x.groupby(["id3"], **gb_params) - .agg({"v1": "max", "v2": "min"}) - .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]] -) +ans = x.groupby(["id3"], **gb_params).agg({"v1": "max", "v2": "min"}).assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]] execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start @@ -459,33 +239,11 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["range_v1_v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() -ans = ( - x.groupby(["id3"], **gb_params) - .agg({"v1": "max", "v2": "min"}) - .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]] -) +ans = 
x.groupby(["id3"], **gb_params).agg({"v1": "max", "v2": "min"}).assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]] execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start @@ -493,25 +251,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["range_v1_v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -519,11 +259,7 @@ def init_modin_on_hdk(): question = "largest two v3 by id6" # q8 gc.collect() t_start = timeit.default_timer() -ans = ( - x.groupby("id6", sort=False, observed=True)["v3"] - .nlargest(2) - .reset_index()[["id6", "v3"]] -) +ans = x.groupby("id6", sort=False, observed=True)["v3"].nlargest(2).reset_index()[["id6", "v3"]] execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start @@ -531,33 +267,11 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], 
out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() -ans = ( - x.groupby("id6", sort=False, observed=True)["v3"] - .nlargest(2) - .reset_index()[["id6", "v3"]] -) +ans = x.groupby("id6", sort=False, observed=True)["v3"].nlargest(2).reset_index()[["id6", "v3"]] execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start @@ -565,25 +279,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v3"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -594,9 +290,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() from modin.experimental.sql import query -ans = query( - "SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x -) +ans = query("SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start @@ -604,31 +298,11 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["r2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - 
version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() -ans = query( - "SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x -) +ans = query("SELECT id2, id4, POWER(CORR(v1, v2), 2) AS r2 FROM df GROUP BY id2, id4;", df=x) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start @@ -636,25 +310,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["r2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -662,9 +318,7 @@ def init_modin_on_hdk(): question = "sum v3 count by id1:id6" # q10 gc.collect() t_start = timeit.default_timer() -ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg( - {"v3": "sum", "v1": "count"} -) +ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg({"v3": "sum", "v1": "count"}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start @@ -672,31 +326,11 @@ def 
init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v3"].sum(), ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() -ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg( - {"v3": "sum", "v1": "count"} -) +ans = x.groupby(["id1", "id2", "id3", "id4", "id5", "id6"], **gb_params).agg({"v3": "sum", "v1": "count"}) execute(ans) print(ans.shape, flush=True) t = timeit.default_timer() - t_start @@ -704,31 +338,11 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v3"].sum(), ans["v1"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans -print( - "grouping finished, took %0.fs" % (timeit.default_timer() - task_init), flush=True -) +print("grouping finished, 
took %0.fs" % (timeit.default_timer() - task_init), flush=True) exit(0) diff --git a/modin/join-modin.py b/modin/join-modin.py index 50c13cc4..dd99d323 100755 --- a/modin/join-modin.py +++ b/modin/join-modin.py @@ -52,14 +52,7 @@ def init_modin_on_hdk(): print( - "loading datasets " - + data_name - + ", " - + y_data_name[0] - + ", " - + y_data_name[1] - + ", " - + y_data_name[2], + "loading datasets " + data_name + ", " + y_data_name[0] + ", " + y_data_name[1] + ", " + y_data_name[2], flush=True, ) @@ -72,9 +65,7 @@ def init_modin_on_hdk(): }, ) -small = pd.read_csv( - src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"} -) +small = pd.read_csv(src_jn_y[0], dtype={"id1": "int32", "id4": "category", "v2": "float64"}) medium = pd.read_csv( src_jn_y[1], dtype={ @@ -109,25 +100,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() @@ -139,25 +112,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - 
chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -173,25 +128,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() @@ -203,25 +140,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del 
ans @@ -237,25 +156,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() @@ -267,25 +168,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -301,25 +184,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - 
cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() @@ -331,25 +196,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans @@ -365,25 +212,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=1, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = 
timeit.default_timer() @@ -395,25 +224,7 @@ def init_modin_on_hdk(): t_start = timeit.default_timer() chk = [ans["v1"].sum(), ans["v2"].sum()] chkt = timeit.default_timer() - t_start -write_log( - task=task, - data=data_name, - in_rows=x.shape[0], - question=question, - out_rows=ans.shape[0], - out_cols=ans.shape[1], - solution=solution, - version=ver, - git=git, - fun=fun, - run=2, - time_sec=t, - mem_gb=m, - cache=cache, - chk=make_chk(chk), - chk_time_sec=chkt, - on_disk=on_disk, -) +write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3), flush=True) print(ans.tail(3), flush=True) del ans From 59fb1191cfb48ca6293285c7a143bb8b9f0b1d67 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Wed, 8 Nov 2023 10:50:37 +0100 Subject: [PATCH 17/23] fixed interface --- modin/groupby-modin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py index 7eb5fd59..a2f9f46c 100755 --- a/modin/groupby-modin.py +++ b/modin/groupby-modin.py @@ -26,7 +26,7 @@ def init_modin_on_hdk(): query("SELECT * FROM df", df=df) -init_modin_on_hdk(pd) +init_modin_on_hdk() exec(open("./_helpers/helpers.py").read()) From 2e8388c9993a79ba04a93064ee0f7fadd7e9acc0 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Wed, 8 Nov 2023 11:31:41 +0100 Subject: [PATCH 18/23] fixed solver --- _utils/repro.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/_utils/repro.sh b/_utils/repro.sh index e4c7a0a1..2e9c008e 100755 --- a/_utils/repro.sh +++ b/_utils/repro.sh @@ -64,7 +64,8 @@ python3 -m pip install --upgrade pandas deactivate source ./modin/py-modin/bin/activate -conda update modin-hdk -y -c conda-forge --solver=libmambapython3 -m pip install --upgrade modin +conda update modin-hdk -y -c conda-forge 
--solver=libmamba +conda deactivate conda deactivate source ./pydatatable/py-pydatatable/bin/activate From 4f171389a284b5c8468d56dcbc85fa961e517a27 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Wed, 8 Nov 2023 11:37:23 +0100 Subject: [PATCH 19/23] added modin --- _utils/repro.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_utils/repro.sh b/_utils/repro.sh index 2e9c008e..c870a478 100755 --- a/_utils/repro.sh +++ b/_utils/repro.sh @@ -92,7 +92,7 @@ mv G1_1e7_1e2_0_0.csv data/ echo "Changing run.conf and _control/data.csv to run only groupby at 0.5GB" cp run.conf run.conf.original sed -i 's/groupby join groupby2014/groupby/g' run.conf -sed -i 's/data.table dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf +sed -i 's/data.table dplyr pandas modin pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf sed -i 's/DO_PUBLISH=true/DO_PUBLISH=false/g' run.conf # set sizes From e7aad1517358d731622925ecb6b1aae0334b691d Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Thu, 9 Nov 2023 15:03:04 +0100 Subject: [PATCH 20/23] cleaned up gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index f3b8ff70..53f4251d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,6 @@ metastore_db/* *.md5 .Rproj.user .Rhistory -py-modin miniconda db-benchmark.Rproj */REVISION From 655bba30eeb2f46ccb6cde3d5353ded1ae7276e3 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 13 Nov 2023 12:16:53 +0100 Subject: [PATCH 21/23] removed switch --- _launcher/launcher.R | 2 +- _launcher/solution.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/_launcher/launcher.R b/_launcher/launcher.R index 06ae4d12..0a7bc36c 100644 --- a/_launcher/launcher.R +++ b/_launcher/launcher.R @@ -15,7 +15,7 @@ file.ext = function(x) { ans = switch( x, "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", - 
"pandas"=, "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py", + "pandas"=, "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", "juliadf"="jl", "juliads"="jl", ) diff --git a/_launcher/solution.R b/_launcher/solution.R index 4979594f..f66b4311 100755 --- a/_launcher/solution.R +++ b/_launcher/solution.R @@ -111,7 +111,7 @@ file.ext = function(x) { ans = switch( x, "collapse"=, "data.table"=, "dplyr"=, "h2o"=, "arrow"=, "duckdb"="R", "duckdb-latest"="R", - "pandas"="py", "spark"=, "pydatatable"=, "modin"="py", "dask"=, "datafusion"=, "polars"="py", + "pandas"="py", "spark"=, "pydatatable"=, "modin"=, "dask"=, "datafusion"=, "polars"="py", "clickhouse"="sql", "juliadf"="jl", "juliads"="jl" ) From aba390f377dd5fb3533d4785df3fd4890ab23b5e Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 13 Nov 2023 12:18:16 +0100 Subject: [PATCH 22/23] added regression test --- .github/workflows/regression.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 53a7684e..33e3e4bc 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion] + solution: [data.table, collapse, dplyr, pandas, modin, pydatatable, spark, juliadf, juliads, polars, arrow, duckdb, duckdb-latest, datafusion] name: Regression Tests solo solutions runs-on: ubuntu-20.04 env: @@ -91,7 +91,7 @@ jobs: name: ${{ matrix.solution }}-out.zip path: ${{ matrix.solution }}-out.zip if-no-files-found: error - + regression-test-benchmark-runner-all-solutions: needs: regression-test-benchmark-runner-solo-solutions name: Regression Tests all solutions From f22b57790c90e6934f5e4ed57900a496263fcad2 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Fri, 
24 Nov 2023 19:36:37 +0100 Subject: [PATCH 23/23] Fixed CPU count & miniconda activation --- _utils/repro.sh | 2 +- modin/groupby-modin.py | 1 + modin/join-modin.py | 1 + modin/setup-modin.sh | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/_utils/repro.sh b/_utils/repro.sh index c870a478..2ce011c9 100755 --- a/_utils/repro.sh +++ b/_utils/repro.sh @@ -39,7 +39,7 @@ curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-p sh install_miniconda.sh -u -b -p ./py-modin && \ rm -f install_miniconda.sh -source ./py-modin/bin/activate +eval source ./modin/py-modin/bin/activate conda install -y conda-libmamba-solver conda create --name modin -y diff --git a/modin/groupby-modin.py b/modin/groupby-modin.py index a2f9f46c..93a5115e 100755 --- a/modin/groupby-modin.py +++ b/modin/groupby-modin.py @@ -10,6 +10,7 @@ os.environ["MODIN_ENGINE"] = "native" os.environ["MODIN_STORAGE_FORMAT"] = "hdk" os.environ["MODIN_EXPERIMENTAL"] = "True" +os.environ["MODIN_CPUS"] = "40" import modin as modin diff --git a/modin/join-modin.py b/modin/join-modin.py index dd99d323..3c4ce57e 100755 --- a/modin/join-modin.py +++ b/modin/join-modin.py @@ -10,6 +10,7 @@ os.environ["MODIN_ENGINE"] = "native" os.environ["MODIN_STORAGE_FORMAT"] = "hdk" os.environ["MODIN_EXPERIMENTAL"] = "True" +os.environ["MODIN_CPUS"] = "40" import modin as modin diff --git a/modin/setup-modin.sh b/modin/setup-modin.sh index c8b831bc..34a9b32f 100755 --- a/modin/setup-modin.sh +++ b/modin/setup-modin.sh @@ -5,7 +5,7 @@ curl -o install_miniconda.sh -L https://repo.anaconda.com/miniconda/Miniconda3-p sh install_miniconda.sh -u -b -p ./modin/py-modin && \ rm -f install_miniconda.sh -source ./modin/py-modin/bin/activate +eval source ./modin/py-modin/bin/activate conda install -y conda-libmamba-solver conda create --name modin -y