diff --git a/Cargo.lock b/Cargo.lock index 3abb6a12..032a6c4c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2457,7 +2457,7 @@ dependencies = [ [[package]] name = "polars_ds" -version = "0.6.3" +version = "0.7.0" dependencies = [ "ahash", "approx", diff --git a/Cargo.toml b/Cargo.toml index 03c816ad..3a9a39a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "polars_ds" -version = "0.6.3" +version = "0.7.0" edition = "2021" [lib] diff --git a/README.md b/README.md index 077412aa..0af79fa9 100644 --- a/README.md +++ b/README.md @@ -216,7 +216,7 @@ Generally speaking, the more expressions you want to evaluate simultaneously, th Why does speed matter? -If your code already executes under 1s, then maybe it doesn't. But as your data grow, having a 5s run vs. a 1s run will make a lot of difference in your iterations for your project. Speed of execution becomes a bigger issues if you are building reports on demand, or if you need to pay extra for additional compute. +If your code already executes under 1s and you only use your code in non-production, ad-hoc environments, then maybe it doesn't. Even so, as your data grow, having a 5s run vs. a 1s run will make a lot of difference in your iterations for your project. Speed of execution becomes a bigger issue if you are building reports on demand, if you need to pay extra for additional compute, or when you have a production pipeline that has to deliver the data under a time constraint. ## HELP WANTED! diff --git a/benchmarks/benchmarks.ipynb b/benchmarks/benchmarks.ipynb index 383f4891..f7d92f7e 100644 --- a/benchmarks/benchmarks.ipynb +++ b/benchmarks/benchmarks.ipynb @@ -31,7 +31,7 @@ "\n", "Comparison: \n", "\n", - "Polars + PDS vs Pandas + Sklearn" + "Polars + PDS vs. 
Pandas + Sklearn" ] }, { @@ -86,32 +86,32 @@ " \n", " \n", " 0\n", - " 0.413767\n", - " 1\n", + " 0.621657\n", + " 0\n", " 2020-01-01\n", " \n", " \n", " 1\n", - " 0.125783\n", + " 0.502729\n", " 1\n", " 2020-01-02\n", " \n", " \n", " 2\n", - " 0.382943\n", - " 1\n", + " 0.084236\n", + " 0\n", " 2020-01-03\n", " \n", " \n", " 3\n", - " 0.690455\n", - " 0\n", + " 0.818261\n", + " 1\n", " 2020-01-04\n", " \n", " \n", " 4\n", - " 0.492488\n", - " 0\n", + " 0.742475\n", + " 1\n", " 2020-01-05\n", " \n", " \n", @@ -122,31 +122,31 @@ " \n", " \n", " 1731\n", - " 0.365318\n", - " 1\n", + " 0.225007\n", + " 0\n", " 2024-09-27\n", " \n", " \n", " 1732\n", - " 0.635105\n", - " 1\n", + " 0.550625\n", + " 0\n", " 2024-09-28\n", " \n", " \n", " 1733\n", - " 0.156054\n", + " 0.351283\n", " 1\n", " 2024-09-29\n", " \n", " \n", " 1734\n", - " 0.736704\n", + " 0.430682\n", " 1\n", " 2024-09-30\n", " \n", " \n", " 1735\n", - " 0.660525\n", + " 0.683423\n", " 1\n", " 2024-10-01\n", " \n", @@ -157,17 +157,17 @@ ], "text/plain": [ " predicted actual_target dates\n", - "0 0.413767 1 2020-01-01\n", - "1 0.125783 1 2020-01-02\n", - "2 0.382943 1 2020-01-03\n", - "3 0.690455 0 2020-01-04\n", - "4 0.492488 0 2020-01-05\n", + "0 0.621657 0 2020-01-01\n", + "1 0.502729 1 2020-01-02\n", + "2 0.084236 0 2020-01-03\n", + "3 0.818261 1 2020-01-04\n", + "4 0.742475 1 2020-01-05\n", "... ... ... ...\n", - "1731 0.365318 1 2024-09-27\n", - "1732 0.635105 1 2024-09-28\n", - "1733 0.156054 1 2024-09-29\n", - "1734 0.736704 1 2024-09-30\n", - "1735 0.660525 1 2024-10-01\n", + "1731 0.225007 0 2024-09-27\n", + "1732 0.550625 0 2024-09-28\n", + "1733 0.351283 1 2024-09-29\n", + "1734 0.430682 1 2024-09-30\n", + "1735 0.683423 1 2024-10-01\n", "\n", "[1736 rows x 3 columns]" ] @@ -200,7 +200,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "10.3 ms ± 83 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "5.8 ms ± 15.4 μs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" ] } ], @@ -219,14 +219,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2.13 ms ± 67.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "1.32 ms ± 1.72 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], @@ -237,43 +237,491 @@ " roc_auc = pds.query_roc_auc(\"actual_target\", \"predicted\"),\n", " log_loss = pds.query_log_loss(\"actual_target\", \"predicted\")\n", ").sort(\"dates\")\n", - "# 1/5 of the time, less lines of code + easier to understand syntax" + "# 1/4 of the time, less lines of code + easier to understand syntax" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Common Traditional ML Pipelines\n", + "\n", + "Use cases:\n", + "\n", + "1. Data Transformation before model training\n", + "2. Feature Engineering pipelines, etc.\n", + "\n", + "Comparison: \n", + "\n", + "Polars + PDS vs. Pandas + Sklearn vs. Polars + Sklearn" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# A random Dataframe with 50k records\n", + "size = 50_000\n", + "df_pl = pds.frame(size=size).select(\n", + " pds.random(0.0, 1.0).alias(\"x1\"),\n", + " pds.random(0.0, 1.0).alias(\"x2\"),\n", + " pds.random(0.0, 1.0).alias(\"x3\"),\n", + ").with_columns(\n", + " x4 = pl.when(pl.col(\"x3\") > 0.3).then(None).otherwise(pl.col(\"x3\")),\n", + " x5 = pl.when(pl.col(\"x2\") > 0.5).then(None).otherwise(pl.col(\"x2\")),\n", + ")\n", + "df_pd = df_pl.to_pandas()" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 5)
x1x2x3x4x5
f64f64f64f64f64
0.576860.7969510.479145nullnull
0.7037580.8156890.970173nullnull
0.3304150.9524430.30547nullnull
0.4196660.4021720.65559null0.402172
0.0990820.5652920.715153nullnull
0.6915350.2977780.752498null0.297778
0.9238420.5093010.976943nullnull
0.706760.8952960.773036nullnull
0.1517060.3458590.892369null0.345859
0.2013880.7467210.885525nullnull
" + ], + "text/plain": [ + "shape: (10, 5)\n", + "┌──────────┬──────────┬──────────┬──────┬──────────┐\n", + "│ x1 ┆ x2 ┆ x3 ┆ x4 ┆ x5 │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════╪══════════╪══════════╪══════╪══════════╡\n", + "│ 0.57686 ┆ 0.796951 ┆ 0.479145 ┆ null ┆ null │\n", + "│ 0.703758 ┆ 0.815689 ┆ 0.970173 ┆ null ┆ null │\n", + "│ 0.330415 ┆ 0.952443 ┆ 0.30547 ┆ null ┆ null │\n", + "│ 0.419666 ┆ 0.402172 ┆ 0.65559 ┆ null ┆ 0.402172 │\n", + "│ 0.099082 ┆ 0.565292 ┆ 0.715153 ┆ null ┆ null │\n", + "│ 0.691535 ┆ 0.297778 ┆ 0.752498 ┆ null ┆ 0.297778 │\n", + "│ 0.923842 ┆ 0.509301 ┆ 0.976943 ┆ null ┆ null │\n", + "│ 0.70676 ┆ 0.895296 ┆ 0.773036 ┆ null ┆ null │\n", + "│ 0.151706 ┆ 0.345859 ┆ 0.892369 ┆ null ┆ 0.345859 │\n", + "│ 0.201388 ┆ 0.746721 ┆ 0.885525 ┆ null ┆ null │\n", + "└──────────┴──────────┴──────────┴──────┴──────────┘" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_pl.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pandas + Sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": 9, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from sklearn import set_config\n", + "set_config(transform_output=\"pandas\")" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "impute_step = ColumnTransformer(\n", + " [(\"MedianImputer1\", SimpleImputer(strategy=\"median\"), [3]),\n", + " (\"MedianImputer2\", SimpleImputer(strategy=\"median\"), [4])],\n", + 
" remainder = \"passthrough\",\n", + " verbose_feature_names_out = False,\n", + ")\n", + "\n", + "pipe = Pipeline(steps = [\n", + " (\"Imputer\", impute_step), # impute only column 3 and 4\n", + " (\"StandardScaler\", StandardScaler()), # Scale all columns\n", + "])" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x1x2x3x4x5
00.2582821.036672-0.064459-0.003164-0.005449
10.6987421.1016851.633356-0.003164-0.005449
2-0.5971231.576182-0.664973-0.003164-0.005449
3-0.287334-0.3330940.545629-0.0031641.495760
4-1.4000740.2328840.751579-0.003164-0.005449
50.656316-0.6953130.880706-0.0031640.476784
61.4626460.0386121.656764-0.003164-0.005449
70.7091611.3779000.951719-0.003164-0.005449
8-1.217417-0.5284841.364335-0.0031640.946099
9-1.0449710.8623881.340669-0.003164-0.005449
\n", + "
" + ], + "text/plain": [ + " x1 x2 x3 x4 x5\n", + "0 0.258282 1.036672 -0.064459 -0.003164 -0.005449\n", + "1 0.698742 1.101685 1.633356 -0.003164 -0.005449\n", + "2 -0.597123 1.576182 -0.664973 -0.003164 -0.005449\n", + "3 -0.287334 -0.333094 0.545629 -0.003164 1.495760\n", + "4 -1.400074 0.232884 0.751579 -0.003164 -0.005449\n", + "5 0.656316 -0.695313 0.880706 -0.003164 0.476784\n", + "6 1.462646 0.038612 1.656764 -0.003164 -0.005449\n", + "7 0.709161 1.377900 0.951719 -0.003164 -0.005449\n", + "8 -1.217417 -0.528484 1.364335 -0.003164 0.946099\n", + "9 -1.044971 0.862388 1.340669 -0.003164 -0.005449" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe.fit_transform(df_pd)[[\"x1\", \"x2\", \"x3\", \"x4\", \"x5\"]].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8.66 ms ± 20.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "pipe.fit_transform(df_pd)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Polars + Sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from sklearn import set_config\n", + "set_config(transform_output=\"polars\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 5)
x1x2x3x4x5
f64f64f64f64f64
0.2582821.036672-0.064459-0.003164-0.005449
0.6987421.1016851.633356-0.003164-0.005449
-0.5971231.576182-0.664973-0.003164-0.005449
-0.287334-0.3330940.545629-0.0031641.49576
-1.4000740.2328840.751579-0.003164-0.005449
0.656316-0.6953130.880706-0.0031640.476784
1.4626460.0386121.656764-0.003164-0.005449
0.7091611.37790.951719-0.003164-0.005449
-1.217417-0.5284841.364335-0.0031640.946099
-1.0449710.8623881.340669-0.003164-0.005449
" + ], + "text/plain": [ + "shape: (10, 5)\n", + "┌───────────┬───────────┬───────────┬───────────┬───────────┐\n", + "│ x1 ┆ x2 ┆ x3 ┆ x4 ┆ x5 │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡\n", + "│ 0.258282 ┆ 1.036672 ┆ -0.064459 ┆ -0.003164 ┆ -0.005449 │\n", + "│ 0.698742 ┆ 1.101685 ┆ 1.633356 ┆ -0.003164 ┆ -0.005449 │\n", + "│ -0.597123 ┆ 1.576182 ┆ -0.664973 ┆ -0.003164 ┆ -0.005449 │\n", + "│ -0.287334 ┆ -0.333094 ┆ 0.545629 ┆ -0.003164 ┆ 1.49576 │\n", + "│ -1.400074 ┆ 0.232884 ┆ 0.751579 ┆ -0.003164 ┆ -0.005449 │\n", + "│ 0.656316 ┆ -0.695313 ┆ 0.880706 ┆ -0.003164 ┆ 0.476784 │\n", + "│ 1.462646 ┆ 0.038612 ┆ 1.656764 ┆ -0.003164 ┆ -0.005449 │\n", + "│ 0.709161 ┆ 1.3779 ┆ 0.951719 ┆ -0.003164 ┆ -0.005449 │\n", + "│ -1.217417 ┆ -0.528484 ┆ 1.364335 ┆ -0.003164 ┆ 0.946099 │\n", + "│ -1.044971 ┆ 0.862388 ┆ 1.340669 ┆ -0.003164 ┆ -0.005449 │\n", + "└───────────┴───────────┴───────────┴───────────┴───────────┘" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipe.fit_transform(df_pl).select([\"x1\", \"x2\", \"x3\", \"x4\", \"x5\"]).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.35 ms ± 51.6 μs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "pipe.fit_transform(df_pl)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# If you use sklearn, there is not a lot of time difference because the underlying engine\n", + "# is not parallel (there are options but they don't work properly on Linux, which is basically\n", + "# all cloud compute nowadays.)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Polars + Polars DS " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from polars_ds.pipeline import Pipeline, Blueprint" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 5)
x1x2x3x4x5
f64f64f64f64f64
0.2582821.036672-0.064459-0.003164-0.005449
0.6987421.1016851.633356-0.003164-0.005449
-0.5971231.576182-0.664973-0.003164-0.005449
-0.287334-0.3330940.545629-0.0031641.49576
-1.4000740.2328840.751579-0.003164-0.005449
0.656316-0.6953130.880706-0.0031640.476784
1.4626460.0386121.656764-0.003164-0.005449
0.7091611.37790.951719-0.003164-0.005449
-1.217417-0.5284841.364335-0.0031640.946099
-1.0449710.8623881.340669-0.003164-0.005449
" + ], + "text/plain": [ + "shape: (10, 5)\n", + "┌───────────┬───────────┬───────────┬───────────┬───────────┐\n", + "│ x1 ┆ x2 ┆ x3 ┆ x4 ┆ x5 │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡\n", + "│ 0.258282 ┆ 1.036672 ┆ -0.064459 ┆ -0.003164 ┆ -0.005449 │\n", + "│ 0.698742 ┆ 1.101685 ┆ 1.633356 ┆ -0.003164 ┆ -0.005449 │\n", + "│ -0.597123 ┆ 1.576182 ┆ -0.664973 ┆ -0.003164 ┆ -0.005449 │\n", + "│ -0.287334 ┆ -0.333094 ┆ 0.545629 ┆ -0.003164 ┆ 1.49576 │\n", + "│ -1.400074 ┆ 0.232884 ┆ 0.751579 ┆ -0.003164 ┆ -0.005449 │\n", + "│ 0.656316 ┆ -0.695313 ┆ 0.880706 ┆ -0.003164 ┆ 0.476784 │\n", + "│ 1.462646 ┆ 0.038612 ┆ 1.656764 ┆ -0.003164 ┆ -0.005449 │\n", + "│ 0.709161 ┆ 1.3779 ┆ 0.951719 ┆ -0.003164 ┆ -0.005449 │\n", + "│ -1.217417 ┆ -0.528484 ┆ 1.364335 ┆ -0.003164 ┆ 0.946099 │\n", + "│ -1.044971 ┆ 0.862388 ┆ 1.340669 ┆ -0.003164 ┆ -0.005449 │\n", + "└───────────┴───────────┴───────────┴───────────┴───────────┘" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bp = (\n", + " Blueprint(df_pl, name = \"example_pipeline\") \n", + " .impute([\"x4\", \"x5\"], method = \"median\")\n", + " .scale(pl.all(), method = \"standard\")\n", + ")\n", + "\n", + "pipe = bp.materialize() # bp.fit() also works\n", + "pipe.transform(df_pl).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "750 μs ± 1.31 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "pipe = bp.materialize() # bp.fit() also works\n", + "pipe.transform(df_pl)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# This reason for this incredible speedup is\n", + "# 1. 
PDS run natively in Polars, which means free parallelization\n", + "# 2. Impute, despite being a very common data transformation, is very slow in Sklearn\n", + "# but is extremely fast in Polars. (This is because SimpleImputer uses NumPy Array to run imputation,\n", + "# while Polars uses ChunkedArray which has tiny overhead when it comes to finding and filling nulls.)" + ] }, { "cell_type": "code", diff --git a/benchmarks/linear_regression.ipynb b/benchmarks/linear_regression.ipynb index 7aa62f58..32db65d8 100644 --- a/benchmarks/linear_regression.ipynb +++ b/benchmarks/linear_regression.ipynb @@ -2,9 +2,17 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7.0\n" + ] + } + ], "source": [ "import polars as pl\n", "import pandas as pd\n", @@ -16,12 +24,44 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 8)
x1x2x3x4x5codeidy
f64f64f64f64f64i32i64f64
0.0231530.2188930.1654740.0652970.43763610-0.002283
0.2121670.8211210.7266890.4847750.97551310.172509
0.5875990.4322260.8254910.144750.80575120.202238
0.2780520.5474040.5442410.781110.119928330.334958
0.657510.1114540.7678590.6618470.278934240.337549
" + ], + "text/plain": [ + "shape: (5, 8)\n", + "┌──────────┬──────────┬──────────┬──────────┬──────────┬──────┬─────┬───────────┐\n", + "│ x1 ┆ x2 ┆ x3 ┆ x4 ┆ x5 ┆ code ┆ id ┆ y │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ i64 ┆ f64 │\n", + "╞══════════╪══════════╪══════════╪══════════╪══════════╪══════╪═════╪═══════════╡\n", + "│ 0.023153 ┆ 0.218893 ┆ 0.165474 ┆ 0.065297 ┆ 0.437636 ┆ 1 ┆ 0 ┆ -0.002283 │\n", + "│ 0.212167 ┆ 0.821121 ┆ 0.726689 ┆ 0.484775 ┆ 0.97551 ┆ 3 ┆ 1 ┆ 0.172509 │\n", + "│ 0.587599 ┆ 0.432226 ┆ 0.825491 ┆ 0.14475 ┆ 0.80575 ┆ 1 ┆ 2 ┆ 0.202238 │\n", + "│ 0.278052 ┆ 0.547404 ┆ 0.544241 ┆ 0.78111 ┆ 0.119928 ┆ 3 ┆ 3 ┆ 0.334958 │\n", + "│ 0.65751 ┆ 0.111454 ┆ 0.767859 ┆ 0.661847 ┆ 0.278934 ┆ 2 ┆ 4 ┆ 0.337549 │\n", + "└──────────┴──────────┴──────────┴──────────┴──────────┴──────┴─────┴───────────┘" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "size = 50_000\n", - "df = pds.random_data(size=size, n_cols=0).select(\n", + "df = pds.frame(size=size).select(\n", " pds.random(0.0, 1.0).alias(\"x1\"),\n", " pds.random(0.0, 1.0).alias(\"x2\"),\n", " pds.random(0.0, 1.0).alias(\"x3\"),\n", @@ -37,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -56,16 +96,35 @@ "\n", "I did not invent any of the algorithms that solves the linear regression problem. Not did I make any improvement to existing algorithms. I only rewrote them in Rust, using Faer, and brought the algorithms alive with Polars.\n", "\n", - "1. Polars DS In-DataFrame Linear Regression vs. Polars DS + NumPy LinearRegression vs. Scikit learn + NumPy LinearRegression\n", - "2. Polars DS In-DataFrame Ridge Regression vs. Polars DS + NumPy LinearRegression vs. Scikit learn + NumPy Ridge\n", - "3. Polars DS In-DataFrame Lasso Regression vs. Polars DS + NumPy LinearRegression vs. 
Scikit learn + NumPy Lasso" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "1. Polars DS In-DataFrame Linear Regression vs. Polars DS + NumPy LinearRegression vs. Scikit learn + NumPy LinearRegression" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Polars DS: shape: (5,)\n", + "Series: '' [f64]\n", + "[\n", + "\t0.500019\n", + "\t0.250019\n", + "\t-0.149981\n", + "\t0.200018\n", + "\t-0.129981\n", + "]\n", + "PDS LR: \n", + "Sklearn: [[ 0.50001865 0.25001948 -0.14998127 0.20001819 -0.12998134]]\n" + ] + } + ], "source": [ "# Polars DS way\n", "print(\n", @@ -74,7 +133,6 @@ " pds.lin_reg(\n", " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n", " target = \"y\",\n", - " method = \"normal\",\n", " )\n", " ).item(0, 0)\n", ")\n", @@ -82,8 +140,9 @@ "# Fit is done implicitly because X and y are passed at initialization\n", "# You can also don't put X and y here and do a lr.fit(X,y) later.\n", "lr = pds_linear.LR(\n", - " X=X, y=y, add_bias=False, method=\"normal\"\n", + " fit_bias=False\n", ") \n", + "lr.fit(X, y)\n", "print(\"PDS LR: \", lr.coeffs)\n", "\n", "# Sklearn\n", @@ -94,236 +153,67 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "787 μs ± 10.3 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + ] + } + ], "source": [ "%%timeit \n", "df.select(\n", " pds.lin_reg(\n", " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n", " target = \"y\",\n", - " method = \"normal\",\n", " )\n", ")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "631 μs ± 1.89 μs per loop (mean ± std. dev. 
of 7 runs, 1,000 loops each)\n" + ] + } + ], "source": [ "%%timeit\n", "lr = pds_linear.LR(\n", - " add_bias=False, method=\"normal\"\n", + " fit_bias=False,\n", ")\n", "lr.fit(X, y)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.42 ms ± 2.87 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" + ] + } + ], "source": [ "%%timeit\n", "reg = LinearRegression(fit_intercept=False, copy_X=False)\n", "reg.fit(X, y)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Polars DS way\n", - "print(\n", - " \"Polars DS: \",\n", - " df.select(\n", - " pds.lin_reg(\n", - " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n", - " target = \"y\",\n", - " method = \"l1\",\n", - " l1_reg = 0.1\n", - " )\n", - " ).item(0, 0)\n", - ")\n", - "\n", - "# Fit is done implicitly because X and y are passed at initialization\n", - "# You can also don't put X and y here and do a lr.fit(X,y) later.\n", - "lr = pds_linear.LR(\n", - " X=X, y=y, add_bias=False, method=\"l1\", lambda_ = 0.1,\n", - ") \n", - "print(\"PDS LR: \", lr.coeffs)\n", - "\n", - "# Sklearn\n", - "reg = Lasso(alpha = 0.1, fit_intercept=False)\n", - "reg.fit(X, y)\n", - "print(\"Sklearn: \", reg.coef_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%timeit\n", - "df.select(\n", - " pds.lin_reg(\n", - " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n", - " target = \"y\",\n", - " method = \"l1\",\n", - " l1_reg = 0.1\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%timeit\n", - "lr = pds_linear.LR(\n", - " add_bias=False, method=\"l1\", lambda_=0.1\n", - ") \n", - "# This is faster than the in-dataframe ver because this uses NumPy data directly, which skips a copy.\n", 
- "# This is faster than sklearn because the underlying linalg library is faster. The convergence criterion is also simpler, though \n", - "# less rigourous, than sklearn's. However, you can set tol = 1e-7 and still be faster.\n", - "lr.fit(X, y)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%timeit\n", - "reg = Lasso(alpha = 0.1, fit_intercept=False, copy_X=False)\n", - "reg.fit(X, y)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Polars DS way\n", - "print(\n", - " \"Polars DS: \",\n", - " df.select(\n", - " pds.lin_reg(\n", - " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n", - " target = \"y\",\n", - " method = \"l2\",\n", - " l2_reg = 0.1\n", - " )\n", - " ).item(0, 0)\n", - ")\n", - "\n", - "# Fit is done implicitly because X and y are passed at initialization\n", - "# You can also don't put X and y here and do a lr.fit(X,y) later.\n", - "lr = pds_linear.LR(\n", - " X=X, y=y, add_bias=False, method=\"l2\", lambda_ = 0.1,\n", - ") \n", - "print(\"PDS LR: \", lr.coeffs)\n", - "\n", - "# Sklearn\n", - "reg = Ridge(alpha = 0.1, fit_intercept=False)\n", - "reg.fit(X, y)\n", - "print(\"Sklearn: \", reg.coef_)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%timeit\n", - "df.select(\n", - " pds.lin_reg(\n", - " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n", - " target = \"y\",\n", - " method = \"l2\",\n", - " l2_reg = 0.1\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%timeit\n", - "lr = pds_linear.LR(\n", - " add_bias=False, method=\"l2\", lambda_=0.1\n", - ") \n", - "lr.fit(X, y)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%timeit\n", - "reg = Ridge(alpha = 0.1, fit_intercept=False, copy_X=False)\n", - 
"reg.fit(X, y)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# What you can do with Polars DS but will be hard for Scikit-learn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Train a linear regression model on each category. And return the predictions\n", - "df.select(\n", - " pl.col(\"id\"),\n", - " pds.lin_reg(\n", - " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n", - " target = \"y\",\n", - " method = \"l2\",\n", - " l2_reg = 0.1,\n", - " return_pred = True\n", - " ).over(\"code\").alias(\"predictions\")\n", - ").unnest(\"predictions\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Train a linear regression model on each category. And return only the coefficients\n", - "df.group_by(\"code\").agg(\n", - " pds.lin_reg(\n", - " \"x1\", \"x2\", \"x3\",\n", - " target = \"y\",\n", - " method = \"l2\",\n", - " l2_reg = 0.1,\n", - " )\n", - ").sort(\"code\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -339,8 +229,16 @@ "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.11.8" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" } }, "nbformat": 4, diff --git a/docs/index.md b/docs/index.md index a879b90a..de3bd728 100644 --- a/docs/index.md +++ b/docs/index.md @@ -205,7 +205,7 @@ Generally speaking, the more expressions you want to evaluate simultaneously, th Why does speed matter? -If your code already executes under 1s, then maybe it doesn't. But as your data grow, having a 5s run vs. a 1s run will make a lot of difference in your iterations for your project. Speed of execution becomes a bigger issues if you are building reports on demand, or if you need to pay extra for additional compute. 
+If your code already executes under 1s and you only use your code in non-production, ad-hoc environments, then maybe it doesn't. Even so, as your data grow, having a 5s run vs. a 1s run will make a lot of difference in your iterations for your project. Speed of execution becomes a bigger issue if you are building reports on demand, if you need to pay extra for additional compute, or when you have a production pipeline that has to deliver the data under a time constraint. ## HELP WANTED! diff --git a/examples/basics.ipynb b/examples/basics.ipynb index 50efabcc..00ba148c 100644 --- a/examples/basics.ipynb +++ b/examples/basics.ipynb @@ -46,21 +46,21 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 13)
ftime_idxdummyactualpredicteddummy_groupsx1x2x3abyy2
f64i64stri32f64strf64f64f64f64f64f64f64
0.00"a"00.091583"a"0.0275160.0250680.0735830.6103640.579474-0.0986840.007555
0.8414711"a"10.585465"a"0.9470790.9175480.5393840.2482190.909225-0.3916860.482099
0.9092972"a"10.098363"a"0.1988410.1135980.8667510.523130.392237-1.236188-0.009659
0.141123"a"10.03237"a"0.4434980.2081410.1378990.3517430.354237-0.0778790.137624
-0.7568024"a"10.324095"a"0.8042340.1033710.8854420.4134730.870453-1.1764780.062609
" + "shape: (5, 13)
ftime_idxdummyactualpredicteddummy_groupsx1x2x3abyy2
f64i64stri32f64strf64f64f64f64f64f64f64
0.00"a"10.836972"a"0.1811840.8331640.2428370.9633570.645792-0.0871060.374231
0.8414711"a"00.160224"a"0.7160970.72390.0061440.5237030.3082670.3154130.418257
0.9092972"a"00.289834"a"0.8035620.1386190.3991910.4808380.035734-0.4366030.127021
0.141123"a"00.192884"a"0.9242350.082840.0717270.8540510.9430420.0559430.150295
-0.7568024"a"00.370113"a"0.4608230.0854750.9671260.9650460.556006-1.3558570.00166
" ], "text/plain": [ "shape: (5, 13)\n", - "┌───────────┬──────────┬───────┬────────┬───┬──────────┬──────────┬───────────┬───────────┐\n", - "│ f ┆ time_idx ┆ dummy ┆ actual ┆ … ┆ a ┆ b ┆ y ┆ y2 │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ f64 ┆ i64 ┆ str ┆ i32 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞═══════════╪══════════╪═══════╪════════╪═══╪══════════╪══════════╪═══════════╪═══════════╡\n", - "│ 0.0 ┆ 0 ┆ a ┆ 0 ┆ … ┆ 0.610364 ┆ 0.579474 ┆ -0.098684 ┆ 0.007555 │\n", - "│ 0.841471 ┆ 1 ┆ a ┆ 1 ┆ … ┆ 0.248219 ┆ 0.909225 ┆ -0.391686 ┆ 0.482099 │\n", - "│ 0.909297 ┆ 2 ┆ a ┆ 1 ┆ … ┆ 0.52313 ┆ 0.392237 ┆ -1.236188 ┆ -0.009659 │\n", - "│ 0.14112 ┆ 3 ┆ a ┆ 1 ┆ … ┆ 0.351743 ┆ 0.354237 ┆ -0.077879 ┆ 0.137624 │\n", - "│ -0.756802 ┆ 4 ┆ a ┆ 1 ┆ … ┆ 0.413473 ┆ 0.870453 ┆ -1.176478 ┆ 0.062609 │\n", - "└───────────┴──────────┴───────┴────────┴───┴──────────┴──────────┴───────────┴───────────┘" + "┌───────────┬──────────┬───────┬────────┬───┬──────────┬──────────┬───────────┬──────────┐\n", + "│ f ┆ time_idx ┆ dummy ┆ actual ┆ … ┆ a ┆ b ┆ y ┆ y2 │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ i64 ┆ str ┆ i32 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════════╪══════════╪═══════╪════════╪═══╪══════════╪══════════╪═══════════╪══════════╡\n", + "│ 0.0 ┆ 0 ┆ a ┆ 1 ┆ … ┆ 0.963357 ┆ 0.645792 ┆ -0.087106 ┆ 0.374231 │\n", + "│ 0.841471 ┆ 1 ┆ a ┆ 0 ┆ … ┆ 0.523703 ┆ 0.308267 ┆ 0.315413 ┆ 0.418257 │\n", + "│ 0.909297 ┆ 2 ┆ a ┆ 0 ┆ … ┆ 0.480838 ┆ 0.035734 ┆ -0.436603 ┆ 0.127021 │\n", + "│ 0.14112 ┆ 3 ┆ a ┆ 0 ┆ … ┆ 0.854051 ┆ 0.943042 ┆ 0.055943 ┆ 0.150295 │\n", + "│ -0.756802 ┆ 4 ┆ a ┆ 0 ┆ … ┆ 0.965046 ┆ 0.556006 ┆ -1.355857 ┆ 0.00166 │\n", + "└───────────┴──────────┴───────┴────────┴───┴──────────┴──────────┴───────────┴──────────┘" ] }, "execution_count": 2, @@ -218,7 +218,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
fab
f64f64f64
1.3944e-15-0.610364-0.579474
-0.841471-0.248219-0.909225
-0.909297-0.52313-0.392237
-0.14112-0.351743-0.354237
0.7568020.196891-0.290979
" + "shape: (5, 3)
fab
f64f64f64
1.3944e-15-0.963357-0.645792
-0.841471-0.523703-0.308267
-0.909297-0.480838-0.035734
-0.14112-0.854051-0.943042
0.756802-0.0016880.089786
" ], "text/plain": [ "shape: (5, 3)\n", @@ -227,11 +227,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 │\n", "╞════════════╪═══════════╪═══════════╡\n", - "│ 1.3944e-15 ┆ -0.610364 ┆ -0.579474 │\n", - "│ -0.841471 ┆ -0.248219 ┆ -0.909225 │\n", - "│ -0.909297 ┆ -0.52313 ┆ -0.392237 │\n", - "│ -0.14112 ┆ -0.351743 ┆ -0.354237 │\n", - "│ 0.756802 ┆ 0.196891 ┆ -0.290979 │\n", + "│ 1.3944e-15 ┆ -0.963357 ┆ -0.645792 │\n", + "│ -0.841471 ┆ -0.523703 ┆ -0.308267 │\n", + "│ -0.909297 ┆ -0.480838 ┆ -0.035734 │\n", + "│ -0.14112 ┆ -0.854051 ┆ -0.943042 │\n", + "│ 0.756802 ┆ -0.001688 ┆ 0.089786 │\n", "└────────────┴───────────┴───────────┘" ] }, @@ -268,17 +268,17 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 1)
coeffs
list[f64]
[-0.498886, -0.35376]
" + "shape: (1, 1)
coeffs
list[f64]
[-0.500734, -0.338584]
" ], "text/plain": [ "shape: (1, 1)\n", - "┌───────────────────────┐\n", - "│ coeffs │\n", - "│ --- │\n", - "│ list[f64] │\n", - "╞═══════════════════════╡\n", - "│ [-0.498886, -0.35376] │\n", - "└───────────────────────┘" + "┌────────────────────────┐\n", + "│ coeffs │\n", + "│ --- │\n", + "│ list[f64] │\n", + "╞════════════════════════╡\n", + "│ [-0.500734, -0.338584] │\n", + "└────────────────────────┘" ] }, "execution_count": 7, @@ -313,17 +313,17 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 2)
target_0target_1
list[f64]list[f64]
[-0.498886, -0.35376][0.086782, 0.406454]
" + "shape: (1, 2)
target_0target_1
list[f64]list[f64]
[-0.500734, -0.338584][0.086658, 0.407468]
" ], "text/plain": [ "shape: (1, 2)\n", - "┌───────────────────────┬──────────────────────┐\n", - "│ target_0 ┆ target_1 │\n", - "│ --- ┆ --- │\n", - "│ list[f64] ┆ list[f64] │\n", - "╞═══════════════════════╪══════════════════════╡\n", - "│ [-0.498886, -0.35376] ┆ [0.086782, 0.406454] │\n", - "└───────────────────────┴──────────────────────┘" + "┌────────────────────────┬──────────────────────┐\n", + "│ target_0 ┆ target_1 │\n", + "│ --- ┆ --- │\n", + "│ list[f64] ┆ list[f64] │\n", + "╞════════════════════════╪══════════════════════╡\n", + "│ [-0.500734, -0.338584] ┆ [0.086658, 0.407468] │\n", + "└────────────────────────┴──────────────────────┘" ] }, "execution_count": 8, @@ -358,7 +358,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (4, 7)
featuresbetastd_errtp>|t|0.0250.975
strf64f64f64f64f64f64
"ln(x1+1)"0.2190350.001699128.8879870.00.2157040.222366
"exp(x2)"0.1736410.000686253.1377880.00.1722960.174986
"sin(x3)"-1.7434040.001351-1290.2529470.0-1.746052-1.740755
"__bias__"-0.1062270.001517-70.0202820.0-0.109201-0.103253
" + "shape: (4, 7)
featuresbetastd_errtp>|t|0.0250.975
strf64f64f64f64f64f64
"ln(x1+1)"0.2200870.001678131.1407370.00.2167970.223376
"exp(x2)"0.1744490.000676258.1797750.00.1731250.175774
"sin(x3)"-1.7457810.001346-1297.0839540.0-1.748419-1.743142
"__bias__"-0.1069510.0015-71.2928130.0-0.109891-0.10401
" ], "text/plain": [ "shape: (4, 7)\n", @@ -367,10 +367,10 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════╪═══════════╪══════════╪══════════════╪═══════╪═══════════╪═══════════╡\n", - "│ ln(x1+1) ┆ 0.219035 ┆ 0.001699 ┆ 128.887987 ┆ 0.0 ┆ 0.215704 ┆ 0.222366 │\n", - "│ exp(x2) ┆ 0.173641 ┆ 0.000686 ┆ 253.137788 ┆ 0.0 ┆ 0.172296 ┆ 0.174986 │\n", - "│ sin(x3) ┆ -1.743404 ┆ 0.001351 ┆ -1290.252947 ┆ 0.0 ┆ -1.746052 ┆ -1.740755 │\n", - "│ __bias__ ┆ -0.106227 ┆ 0.001517 ┆ -70.020282 ┆ 0.0 ┆ -0.109201 ┆ -0.103253 │\n", + "│ ln(x1+1) ┆ 0.220087 ┆ 0.001678 ┆ 131.140737 ┆ 0.0 ┆ 0.216797 ┆ 0.223376 │\n", + "│ exp(x2) ┆ 0.174449 ┆ 0.000676 ┆ 258.179775 ┆ 0.0 ┆ 0.173125 ┆ 0.175774 │\n", + "│ sin(x3) ┆ -1.745781 ┆ 0.001346 ┆ -1297.083954 ┆ 0.0 ┆ -1.748419 ┆ -1.743142 │\n", + "│ __bias__ ┆ -0.106951 ┆ 0.0015 ┆ -71.292813 ┆ 0.0 ┆ -0.109891 ┆ -0.10401 │\n", "└──────────┴───────────┴──────────┴──────────────┴───────┴───────────┴───────────┘" ] }, @@ -407,7 +407,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (10_000, 2)
dummycoeffs
strlist[f64]
"a"[-0.508478, -0.340153]
"a"[-0.508478, -0.340153]
"a"[-0.508478, -0.340153]
"a"[-0.508478, -0.340153]
"a"[-0.508478, -0.340153]
"b"[-0.489381, -0.36711]
"b"[-0.489381, -0.36711]
"b"[-0.489381, -0.36711]
"b"[-0.489381, -0.36711]
"b"[-0.489381, -0.36711]
" + "shape: (10_000, 2)
dummycoeffs
strlist[f64]
"a"[-0.479674, -0.344547]
"a"[-0.479674, -0.344547]
"a"[-0.479674, -0.344547]
"a"[-0.479674, -0.344547]
"a"[-0.479674, -0.344547]
"b"[-0.5218, -0.33279]
"b"[-0.5218, -0.33279]
"b"[-0.5218, -0.33279]
"b"[-0.5218, -0.33279]
"b"[-0.5218, -0.33279]
" ], "text/plain": [ "shape: (10_000, 2)\n", @@ -416,17 +416,17 @@ "│ --- ┆ --- │\n", "│ str ┆ list[f64] │\n", "╞═══════╪════════════════════════╡\n", - "│ a ┆ [-0.508478, -0.340153] │\n", - "│ a ┆ [-0.508478, -0.340153] │\n", - "│ a ┆ [-0.508478, -0.340153] │\n", - "│ a ┆ [-0.508478, -0.340153] │\n", - "│ a ┆ [-0.508478, -0.340153] │\n", + "│ a ┆ [-0.479674, -0.344547] │\n", + "│ a ┆ [-0.479674, -0.344547] │\n", + "│ a ┆ [-0.479674, -0.344547] │\n", + "│ a ┆ [-0.479674, -0.344547] │\n", + "│ a ┆ [-0.479674, -0.344547] │\n", "│ … ┆ … │\n", - "│ b ┆ [-0.489381, -0.36711] │\n", - "│ b ┆ [-0.489381, -0.36711] │\n", - "│ b ┆ [-0.489381, -0.36711] │\n", - "│ b ┆ [-0.489381, -0.36711] │\n", - "│ b ┆ [-0.489381, -0.36711] │\n", + "│ b ┆ [-0.5218, -0.33279] │\n", + "│ b ┆ [-0.5218, -0.33279] │\n", + "│ b ┆ [-0.5218, -0.33279] │\n", + "│ b ┆ [-0.5218, -0.33279] │\n", + "│ b ┆ [-0.5218, -0.33279] │\n", "└───────┴────────────────────────┘" ] }, @@ -462,7 +462,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 5)
x1x2ypredresid
f64f64f64f64f64
0.0275160.025068-0.098684-0.022595-0.076089
0.9470790.917548-0.391686-0.7970760.405389
0.1988410.113598-1.236188-0.139386-1.096802
0.4434980.208141-0.077879-0.2948870.217008
0.8042340.103371-1.176478-0.437789-0.738689
" + "shape: (5, 5)
x1x2ypredresid
f64f64f64f64f64
0.1811840.833164-0.087106-0.3728210.285715
0.7160970.72390.315413-0.6036740.919088
0.8035620.138619-0.436603-0.4493040.012702
0.9242350.082840.055943-0.4908440.546787
0.4608230.085475-1.355857-0.25969-1.096167
" ], "text/plain": [ "shape: (5, 5)\n", @@ -471,11 +471,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════╪══════════╪═══════════╪═══════════╪═══════════╡\n", - "│ 0.027516 ┆ 0.025068 ┆ -0.098684 ┆ -0.022595 ┆ -0.076089 │\n", - "│ 0.947079 ┆ 0.917548 ┆ -0.391686 ┆ -0.797076 ┆ 0.405389 │\n", - "│ 0.198841 ┆ 0.113598 ┆ -1.236188 ┆ -0.139386 ┆ -1.096802 │\n", - "│ 0.443498 ┆ 0.208141 ┆ -0.077879 ┆ -0.294887 ┆ 0.217008 │\n", - "│ 0.804234 ┆ 0.103371 ┆ -1.176478 ┆ -0.437789 ┆ -0.738689 │\n", + "│ 0.181184 ┆ 0.833164 ┆ -0.087106 ┆ -0.372821 ┆ 0.285715 │\n", + "│ 0.716097 ┆ 0.7239 ┆ 0.315413 ┆ -0.603674 ┆ 0.919088 │\n", + "│ 0.803562 ┆ 0.138619 ┆ -0.436603 ┆ -0.449304 ┆ 0.012702 │\n", + "│ 0.924235 ┆ 0.08284 ┆ 0.055943 ┆ -0.490844 ┆ 0.546787 │\n", + "│ 0.460823 ┆ 0.085475 ┆ -1.355857 ┆ -0.25969 ┆ -1.096167 │\n", "└──────────┴──────────┴───────────┴───────────┴───────────┘" ] }, @@ -515,7 +515,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
dummycoeffs
strlist[f64]
"b"[-0.489381, -0.36711]
"a"[-0.508478, -0.340153]
" + "shape: (2, 2)
dummycoeffs
strlist[f64]
"a"[-0.479674, -0.344547]
"b"[-0.5218, -0.33279]
" ], "text/plain": [ "shape: (2, 2)\n", @@ -524,8 +524,8 @@ "│ --- ┆ --- │\n", "│ str ┆ list[f64] │\n", "╞═══════╪════════════════════════╡\n", - "│ b ┆ [-0.489381, -0.36711] │\n", - "│ a ┆ [-0.508478, -0.340153] │\n", + "│ a ┆ [-0.479674, -0.344547] │\n", + "│ b ┆ [-0.5218, -0.33279] │\n", "└───────┴────────────────────────┘" ] }, @@ -560,7 +560,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
dummycoeffs
strlist[f64]
"a"[-0.343272, -0.157735]
"b"[-0.315854, -0.193592]
" + "shape: (2, 2)
dummycoeffs
strlist[f64]
"a"[-0.299928, -0.187761]
"b"[-0.347887, -0.161111]
" ], "text/plain": [ "shape: (2, 2)\n", @@ -569,8 +569,8 @@ "│ --- ┆ --- │\n", "│ str ┆ list[f64] │\n", "╞═══════╪════════════════════════╡\n", - "│ a ┆ [-0.343272, -0.157735] │\n", - "│ b ┆ [-0.315854, -0.193592] │\n", + "│ a ┆ [-0.299928, -0.187761] │\n", + "│ b ┆ [-0.347887, -0.161111] │\n", "└───────┴────────────────────────┘" ] }, @@ -607,7 +607,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
dummylasso_r2
strf64
"a"-0.54074
"b"-0.548295
" + "shape: (2, 2)
dummylasso_r2
strf64
"a"-0.533955
"b"-0.547336
" ], "text/plain": [ "shape: (2, 2)\n", @@ -616,8 +616,8 @@ "│ --- ┆ --- │\n", "│ str ┆ f64 │\n", "╞═══════╪═══════════╡\n", - "│ a ┆ -0.54074 │\n", - "│ b ┆ -0.548295 │\n", + "│ a ┆ -0.533955 │\n", + "│ b ┆ -0.547336 │\n", "└───────┴───────────┘" ] }, @@ -658,7 +658,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (10_000, 5)
yx1x2coeffspred
f64f64f64list[f64]f64
-0.0986840.0275160.025068nullnull
-0.3916860.9470790.917548nullnull
-1.2361880.1988410.113598nullnull
-0.0778790.4434980.208141nullnull
-1.1764780.8042340.103371[-1.609748, 1.186046]-1.172012
0.2376180.717380.729978[-0.418405, -0.473687]-0.645937
-0.8797490.3889870.291635[-0.813367, -0.190164]-0.371848
-0.3020750.008090.953496[-0.891931, -0.00105]-0.008217
-1.0378870.2299350.373374[-1.01028, -0.033456]-0.244791
0.1634980.68660.724015[0.116038, -0.35731]-0.179026
" + "shape: (10_000, 5)
yx1x2coeffspred
f64f64f64list[f64]f64
-0.0871060.1811840.833164nullnull
0.3154130.7160970.7239nullnull
-0.4366030.8035620.138619nullnull
0.0559430.9242350.08284nullnull
-1.3558570.4608230.085475[-0.434778, 0.298689]-0.174825
-0.7858110.2978070.209961[-1.72997, -0.058576]-0.527496
-0.3917380.261320.040594[-1.856937, 0.204615]-0.476948
-0.4727050.8816790.572239[0.021692, -1.521368]-0.851461
-0.413730.9334420.189696[-0.181844, -1.202663]-0.397881
-0.0586460.1536290.836968[-0.610511, -0.035761]-0.123723
" ], "text/plain": [ "shape: (10_000, 5)\n", @@ -667,17 +667,17 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ list[f64] ┆ f64 │\n", "╞═══════════╪══════════╪══════════╪════════════════════════╪═══════════╡\n", - "│ -0.098684 ┆ 0.027516 ┆ 0.025068 ┆ null ┆ null │\n", - "│ -0.391686 ┆ 0.947079 ┆ 0.917548 ┆ null ┆ null │\n", - "│ -1.236188 ┆ 0.198841 ┆ 0.113598 ┆ null ┆ null │\n", - "│ -0.077879 ┆ 0.443498 ┆ 0.208141 ┆ null ┆ null │\n", - "│ -1.176478 ┆ 0.804234 ┆ 0.103371 ┆ [-1.609748, 1.186046] ┆ -1.172012 │\n", + "│ -0.087106 ┆ 0.181184 ┆ 0.833164 ┆ null ┆ null │\n", + "│ 0.315413 ┆ 0.716097 ┆ 0.7239 ┆ null ┆ null │\n", + "│ -0.436603 ┆ 0.803562 ┆ 0.138619 ┆ null ┆ null │\n", + "│ 0.055943 ┆ 0.924235 ┆ 0.08284 ┆ null ┆ null │\n", + "│ -1.355857 ┆ 0.460823 ┆ 0.085475 ┆ [-0.434778, 0.298689] ┆ -0.174825 │\n", "│ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 0.237618 ┆ 0.71738 ┆ 0.729978 ┆ [-0.418405, -0.473687] ┆ -0.645937 │\n", - "│ -0.879749 ┆ 0.388987 ┆ 0.291635 ┆ [-0.813367, -0.190164] ┆ -0.371848 │\n", - "│ -0.302075 ┆ 0.00809 ┆ 0.953496 ┆ [-0.891931, -0.00105] ┆ -0.008217 │\n", - "│ -1.037887 ┆ 0.229935 ┆ 0.373374 ┆ [-1.01028, -0.033456] ┆ -0.244791 │\n", - "│ 0.163498 ┆ 0.6866 ┆ 0.724015 ┆ [0.116038, -0.35731] ┆ -0.179026 │\n", + "│ -0.785811 ┆ 0.297807 ┆ 0.209961 ┆ [-1.72997, -0.058576] ┆ -0.527496 │\n", + "│ -0.391738 ┆ 0.26132 ┆ 0.040594 ┆ [-1.856937, 0.204615] ┆ -0.476948 │\n", + "│ -0.472705 ┆ 0.881679 ┆ 0.572239 ┆ [0.021692, -1.521368] ┆ -0.851461 │\n", + "│ -0.41373 ┆ 0.933442 ┆ 0.189696 ┆ [-0.181844, -1.202663] ┆ -0.397881 │\n", + "│ -0.058646 ┆ 0.153629 ┆ 0.836968 ┆ [-0.610511, -0.035761] ┆ -0.123723 │\n", "└───────────┴──────────┴──────────┴────────────────────────┴───────────┘" ] }, @@ -758,7 +758,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 1)
a
list[f64]
[28.850744, 28.801703, 28.618474]
" + "shape: (1, 1)
a
list[f64]
[29.073839, 28.893157, 28.404245]
" ], "text/plain": [ "shape: (1, 1)\n", @@ -767,7 +767,7 @@ "│ --- │\n", "│ list[f64] │\n", "╞═════════════════════════════════╡\n", - "│ [28.850744, 28.801703, 28.6184… │\n", + "│ [29.073839, 28.893157, 28.4042… │\n", "└─────────────────────────────────┘" ] }, @@ -799,7 +799,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 2)
singular_valueweight_vector
f64list[f64]
28.820497[0.995171, 0.098156]
28.76974[-0.098156, 0.995171]
" + "shape: (2, 2)
singular_valueweight_vector
f64list[f64]
29.015447[0.568763, 0.822502]
28.458258[0.822502, -0.568763]
" ], "text/plain": [ "shape: (2, 2)\n", @@ -808,8 +808,8 @@ "│ --- ┆ --- │\n", "│ f64 ┆ list[f64] │\n", "╞════════════════╪═══════════════════════╡\n", - "│ 28.820497 ┆ [0.995171, 0.098156] │\n", - "│ 28.76974 ┆ [-0.098156, 0.995171] │\n", + "│ 29.015447 ┆ [0.568763, 0.822502] │\n", + "│ 28.458258 ┆ [0.822502, -0.568763] │\n", "└────────────────┴───────────────────────┘" ] }, @@ -841,7 +841,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
pc1
f64
0.11709
-0.210939
0.011899
-0.162391
-0.050289
" + "shape: (5, 1)
pc1
f64
0.380626
-0.147048
-0.395586
0.562945
0.307737
" ], "text/plain": [ "shape: (5, 1)\n", @@ -850,11 +850,11 @@ "│ --- │\n", "│ f64 │\n", "╞═══════════╡\n", - "│ 0.11709 │\n", - "│ -0.210939 │\n", - "│ 0.011899 │\n", - "│ -0.162391 │\n", - "│ -0.050289 │\n", + "│ 0.380626 │\n", + "│ -0.147048 │\n", + "│ -0.395586 │\n", + "│ 0.562945 │\n", + "│ 0.307737 │\n", "└───────────┘" ] }, @@ -894,7 +894,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (2, 8)
dummy_groupsl2log lossprecisionrecallfaverage_precisionroc_auc
strf64f64f64f64f64f64f64
"b"0.3346761.0027540.4902680.4841810.4872050.5031040.500475
"a"0.3284010.9852680.5087090.5125650.510630.5046450.507109
" + "shape: (2, 8)
dummy_groupsl2log lossprecisionrecallfaverage_precisionroc_auc
strf64f64f64f64f64f64f64
"a"0.335440.9979450.4964790.5015810.4990170.4984350.493508
"b"0.3329561.0010330.5188980.5092740.5140410.5125840.500236
" ], "text/plain": [ "shape: (2, 8)\n", @@ -904,8 +904,8 @@ "│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ --- ┆ f64 │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ │\n", "╞══════════════╪══════════╪══════════╪═══════════╪══════════╪══════════╪════════════════╪══════════╡\n", - "│ b ┆ 0.334676 ┆ 1.002754 ┆ 0.490268 ┆ 0.484181 ┆ 0.487205 ┆ 0.503104 ┆ 0.500475 │\n", - "│ a ┆ 0.328401 ┆ 0.985268 ┆ 0.508709 ┆ 0.512565 ┆ 0.51063 ┆ 0.504645 ┆ 0.507109 │\n", + "│ a ┆ 0.33544 ┆ 0.997945 ┆ 0.496479 ┆ 0.501581 ┆ 0.499017 ┆ 0.498435 ┆ 0.493508 │\n", + "│ b ┆ 0.332956 ┆ 1.001033 ┆ 0.518898 ┆ 0.509274 ┆ 0.514041 ┆ 0.512584 ┆ 0.500236 │\n", "└──────────────┴──────────┴──────────┴───────────┴──────────┴──────────┴────────────────┴──────────┘" ] }, @@ -993,7 +993,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
sen
str
"hello"
"going"
"world"
"church"
"to"
" + "shape: (5, 1)
sen
str
"world"
"church"
"hello"
"going"
"to"
" ], "text/plain": [ "shape: (5, 1)\n", @@ -1002,10 +1002,10 @@ "│ --- │\n", "│ str │\n", "╞════════╡\n", - "│ hello │\n", - "│ going │\n", "│ world │\n", "│ church │\n", + "│ hello │\n", + "│ going │\n", "│ to │\n", "└────────┘" ] @@ -1038,7 +1038,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
sen
str
"go"
"hello"
"world"
"church"
""
" + "shape: (5, 1)
sen
str
"hello"
""
"world"
"go"
"church"
" ], "text/plain": [ "shape: (5, 1)\n", @@ -1047,11 +1047,11 @@ "│ --- │\n", "│ str │\n", "╞════════╡\n", - "│ go │\n", "│ hello │\n", + "│ │\n", "│ world │\n", + "│ go │\n", "│ church │\n", - "│ │\n", "└────────┘" ] }, @@ -1421,21 +1421,21 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 1)
a
f64
null
null
0.166248
1.339555
1.29705
" + "shape: (5, 1)
a
f64
null
null
-0.667205
-0.004369
-1.539039
" ], "text/plain": [ "shape: (5, 1)\n", - "┌──────────┐\n", - "│ a │\n", - "│ --- │\n", - "│ f64 │\n", - "╞══════════╡\n", - "│ null │\n", - "│ null │\n", - "│ 0.166248 │\n", - "│ 1.339555 │\n", - "│ 1.29705 │\n", - "└──────────┘" + "┌───────────┐\n", + "│ a │\n", + "│ --- │\n", + "│ f64 │\n", + "╞═══════════╡\n", + "│ null │\n", + "│ null │\n", + "│ -0.667205 │\n", + "│ -0.004369 │\n", + "│ -1.539039 │\n", + "└───────────┘" ] }, "execution_count": 32, @@ -1468,21 +1468,21 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
arandom_normalrandom_normal_that_respects_null_of_a
f64f64f64
null-0.819894null
null-0.106444null
0.1662480.3111770.46741
1.3395551.5076271.895496
1.29705-0.404330.528693
" + "shape: (5, 3)
arandom_normalrandom_normal_that_respects_null_of_a
f64f64f64
null0.766622null
null0.626792null
-0.6672051.568425-1.028465
-0.004369-0.8467361.131894
-1.539039-0.9944061.053838
" ], "text/plain": [ "shape: (5, 3)\n", - "┌──────────┬───────────────┬─────────────────────────────────┐\n", - "│ a ┆ random_normal ┆ random_normal_that_respects_nu… │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ f64 ┆ f64 ┆ f64 │\n", - "╞══════════╪═══════════════╪═════════════════════════════════╡\n", - "│ null ┆ -0.819894 ┆ null │\n", - "│ null ┆ -0.106444 ┆ null │\n", - "│ 0.166248 ┆ 0.311177 ┆ 0.46741 │\n", - "│ 1.339555 ┆ 1.507627 ┆ 1.895496 │\n", - "│ 1.29705 ┆ -0.40433 ┆ 0.528693 │\n", - "└──────────┴───────────────┴─────────────────────────────────┘" + "┌───────────┬───────────────┬─────────────────────────────────┐\n", + "│ a ┆ random_normal ┆ random_normal_that_respects_nu… │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 │\n", + "╞═══════════╪═══════════════╪═════════════════════════════════╡\n", + "│ null ┆ 0.766622 ┆ null │\n", + "│ null ┆ 0.626792 ┆ null │\n", + "│ -0.667205 ┆ 1.568425 ┆ -1.028465 │\n", + "│ -0.004369 ┆ -0.846736 ┆ 1.131894 │\n", + "│ -1.539039 ┆ -0.994406 ┆ 1.053838 │\n", + "└───────────┴───────────────┴─────────────────────────────────┘" ] }, "execution_count": 33, @@ -1516,21 +1516,21 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
arandom_strrandom_str_that_respects_null_of_a
f64strstr
null"cC"null
null"HxBp"null
0.166248"qjl""RMFn"
1.339555"3N""n6Al"
1.29705"cHD""MF"
" + "shape: (5, 3)
arandom_strrandom_str_that_respects_null_of_a
f64strstr
null"FL"null
null"ftnwd"null
-0.667205"7YV""o1"
-0.004369"G7""Ys"
-1.539039"Jd4""3umWr"
" ], "text/plain": [ "shape: (5, 3)\n", - "┌──────────┬────────────┬─────────────────────────────────┐\n", - "│ a ┆ random_str ┆ random_str_that_respects_null_… │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ f64 ┆ str ┆ str │\n", - "╞══════════╪════════════╪═════════════════════════════════╡\n", - "│ null ┆ cC ┆ null │\n", - "│ null ┆ HxBp ┆ null │\n", - "│ 0.166248 ┆ qjl ┆ RMFn │\n", - "│ 1.339555 ┆ 3N ┆ n6Al │\n", - "│ 1.29705 ┆ cHD ┆ MF │\n", - "└──────────┴────────────┴─────────────────────────────────┘" + "┌───────────┬────────────┬─────────────────────────────────┐\n", + "│ a ┆ random_str ┆ random_str_that_respects_null_… │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ str ┆ str │\n", + "╞═══════════╪════════════╪═════════════════════════════════╡\n", + "│ null ┆ FL ┆ null │\n", + "│ null ┆ ftnwd ┆ null │\n", + "│ -0.667205 ┆ 7YV ┆ o1 │\n", + "│ -0.004369 ┆ G7 ┆ Ys │\n", + "│ -1.539039 ┆ Jd4 ┆ 3umWr │\n", + "└───────────┴────────────┴─────────────────────────────────┘" ] }, "execution_count": 34, @@ -1564,21 +1564,21 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 2)
arandom_str
f64str
nullnull
nullnull
0.166248"uhpES"
1.339555"1AzJe"
1.29705"EpZWF"
" + "shape: (5, 2)
arandom_str
f64str
nullnull
nullnull
-0.667205"hIQx3"
-0.004369"OZsZn"
-1.539039"OXelh"
" ], "text/plain": [ "shape: (5, 2)\n", - "┌──────────┬────────────┐\n", - "│ a ┆ random_str │\n", - "│ --- ┆ --- │\n", - "│ f64 ┆ str │\n", - "╞══════════╪════════════╡\n", - "│ null ┆ null │\n", - "│ null ┆ null │\n", - "│ 0.166248 ┆ uhpES │\n", - "│ 1.339555 ┆ 1AzJe │\n", - "│ 1.29705 ┆ EpZWF │\n", - "└──────────┴────────────┘" + "┌───────────┬────────────┐\n", + "│ a ┆ random_str │\n", + "│ --- ┆ --- │\n", + "│ f64 ┆ str │\n", + "╞═══════════╪════════════╡\n", + "│ null ┆ null │\n", + "│ null ┆ null │\n", + "│ -0.667205 ┆ hIQx3 │\n", + "│ -0.004369 ┆ OZsZn │\n", + "│ -1.539039 ┆ OXelh │\n", + "└───────────┴────────────┘" ] }, "execution_count": 35, @@ -1611,21 +1611,21 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 4)
atest1literaltest1_perturbed
f64f64f64f64
null0.175841null0.176231
null-0.816172null-0.815861
0.1662481.9556281.9952671.955387
1.3395550.5769813.0111820.57688
1.29705-1.0834621.005481-1.083934
" + "shape: (5, 4)
atest1literaltest1_perturbed
f64f64f64f64
null-0.647906null-0.648101
null0.721174null0.721425
-0.6672050.6104711.4766930.610372
-0.004369-0.0545580.705794-0.054194
-1.5390390.2661830.0603740.266557
" ], "text/plain": [ "shape: (5, 4)\n", - "┌──────────┬───────────┬──────────┬─────────────────┐\n", - "│ a ┆ test1 ┆ literal ┆ test1_perturbed │\n", - "│ --- ┆ --- ┆ --- ┆ --- │\n", - "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", - "╞══════════╪═══════════╪══════════╪═════════════════╡\n", - "│ null ┆ 0.175841 ┆ null ┆ 0.176231 │\n", - "│ null ┆ -0.816172 ┆ null ┆ -0.815861 │\n", - "│ 0.166248 ┆ 1.955628 ┆ 1.995267 ┆ 1.955387 │\n", - "│ 1.339555 ┆ 0.576981 ┆ 3.011182 ┆ 0.57688 │\n", - "│ 1.29705 ┆ -1.083462 ┆ 1.005481 ┆ -1.083934 │\n", - "└──────────┴───────────┴──────────┴─────────────────┘" + "┌───────────┬───────────┬──────────┬─────────────────┐\n", + "│ a ┆ test1 ┆ literal ┆ test1_perturbed │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════════╪═══════════╪══════════╪═════════════════╡\n", + "│ null ┆ -0.647906 ┆ null ┆ -0.648101 │\n", + "│ null ┆ 0.721174 ┆ null ┆ 0.721425 │\n", + "│ -0.667205 ┆ 0.610471 ┆ 1.476693 ┆ 0.610372 │\n", + "│ -0.004369 ┆ -0.054558 ┆ 0.705794 ┆ -0.054194 │\n", + "│ -1.539039 ┆ 0.266183 ┆ 0.060374 ┆ 0.266557 │\n", + "└───────────┴───────────┴──────────┴─────────────────┘" ] }, "execution_count": 36, @@ -1663,21 +1663,21 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 4)
a[0, 1)NormalInt from [0, 10)
f64f64f64i32
null0.714802-0.546072
null0.3555130.8275999
0.1662480.0022820.5702960
1.3395550.786958-0.0310613
1.297050.320615-0.1586688
" + "shape: (5, 4)
a[0, 1)NormalInt from [0, 10)
f64f64f64i32
null0.6850050.3662168
null0.24877-1.1890344
-0.6672050.890820.9462594
-0.0043690.9515030.6068959
-1.5390390.707259-0.9071757
" ], "text/plain": [ "shape: (5, 4)\n", - "┌──────────┬──────────┬───────────┬──────────────────┐\n", - "│ a ┆ [0, 1) ┆ Normal ┆ Int from [0, 10) │\n", - "│ --- ┆ --- ┆ --- ┆ --- │\n", - "│ f64 ┆ f64 ┆ f64 ┆ i32 │\n", - "╞══════════╪══════════╪═══════════╪══════════════════╡\n", - "│ null ┆ 0.714802 ┆ -0.54607 ┆ 2 │\n", - "│ null ┆ 0.355513 ┆ 0.827599 ┆ 9 │\n", - "│ 0.166248 ┆ 0.002282 ┆ 0.570296 ┆ 0 │\n", - "│ 1.339555 ┆ 0.786958 ┆ -0.031061 ┆ 3 │\n", - "│ 1.29705 ┆ 0.320615 ┆ -0.158668 ┆ 8 │\n", - "└──────────┴──────────┴───────────┴──────────────────┘" + "┌───────────┬──────────┬───────────┬──────────────────┐\n", + "│ a ┆ [0, 1) ┆ Normal ┆ Int from [0, 10) │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ f64 ┆ f64 ┆ f64 ┆ i32 │\n", + "╞═══════════╪══════════╪═══════════╪══════════════════╡\n", + "│ null ┆ 0.685005 ┆ 0.366216 ┆ 8 │\n", + "│ null ┆ 0.24877 ┆ -1.189034 ┆ 4 │\n", + "│ -0.667205 ┆ 0.89082 ┆ 0.946259 ┆ 4 │\n", + "│ -0.004369 ┆ 0.951503 ┆ 0.606895 ┆ 9 │\n", + "│ -1.539039 ┆ 0.707259 ┆ -0.907175 ┆ 7 │\n", + "└───────────┴──────────┴───────────┴──────────────────┘" ] }, "execution_count": 37, @@ -1711,7 +1711,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 4)
t-tests: statisticst-tests: pvaluenormality_test: statisticsnormality_test: pvalue
f64f64f64f64
-0.4252430.6707220.7828710.676086
" + "shape: (1, 4)
t-tests: statisticst-tests: pvaluenormality_test: statisticsnormality_test: pvalue
f64f64f64f64
-0.9410260.3468441.4200340.491636
" ], "text/plain": [ "shape: (1, 4)\n", @@ -1720,7 +1720,7 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════╪═════════════════╪════════════════════════════╪════════════════════════╡\n", - "│ -0.425243 ┆ 0.670722 ┆ 0.782871 ┆ 0.676086 │\n", + "│ -0.941026 ┆ 0.346844 ┆ 1.420034 ┆ 0.491636 │\n", "└─────────────────────┴─────────────────┴────────────────────────────┴────────────────────────┘" ] }, @@ -1764,7 +1764,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 5)
market_idvar1var2category_1category_2
i64f64f64i32i32
00.5983210.07541545
10.0732960.78989323
20.8180230.50497424
00.9851040.15305306
10.4408520.86290621
" + "shape: (5, 5)
market_idvar1var2category_1category_2
i64f64f64i32i32
00.8429720.45036434
10.6256630.09508306
20.0292550.51552834
00.7825690.66447814
10.4871030.93536107
" ], "text/plain": [ "shape: (5, 5)\n", @@ -1773,11 +1773,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ f64 ┆ i32 ┆ i32 │\n", "╞═══════════╪══════════╪══════════╪════════════╪════════════╡\n", - "│ 0 ┆ 0.598321 ┆ 0.075415 ┆ 4 ┆ 5 │\n", - "│ 1 ┆ 0.073296 ┆ 0.789893 ┆ 2 ┆ 3 │\n", - "│ 2 ┆ 0.818023 ┆ 0.504974 ┆ 2 ┆ 4 │\n", - "│ 0 ┆ 0.985104 ┆ 0.153053 ┆ 0 ┆ 6 │\n", - "│ 1 ┆ 0.440852 ┆ 0.862906 ┆ 2 ┆ 1 │\n", + "│ 0 ┆ 0.842972 ┆ 0.450364 ┆ 3 ┆ 4 │\n", + "│ 1 ┆ 0.625663 ┆ 0.095083 ┆ 0 ┆ 6 │\n", + "│ 2 ┆ 0.029255 ┆ 0.515528 ┆ 3 ┆ 4 │\n", + "│ 0 ┆ 0.782569 ┆ 0.664478 ┆ 1 ┆ 4 │\n", + "│ 1 ┆ 0.487103 ┆ 0.935361 ┆ 0 ┆ 7 │\n", "└───────────┴──────────┴──────────┴────────────┴────────────┘" ] }, @@ -1817,17 +1817,17 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1, 3)
t-testchi2-testf-test
struct[2]struct[2]struct[2]
{0.356596,0.721402}{31.810658,0.668157}{0.744296,0.5617}
" + "shape: (1, 3)
t-testchi2-testf-test
struct[2]struct[2]struct[2]
{1.550123,0.121144}{22.889401,0.955922}{1.746959,0.136718}
" ], "text/plain": [ "shape: (1, 3)\n", - "┌─────────────────────┬──────────────────────┬───────────────────┐\n", - "│ t-test ┆ chi2-test ┆ f-test │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ struct[2] ┆ struct[2] ┆ struct[2] │\n", - "╞═════════════════════╪══════════════════════╪═══════════════════╡\n", - "│ {0.356596,0.721402} ┆ {31.810658,0.668157} ┆ {0.744296,0.5617} │\n", - "└─────────────────────┴──────────────────────┴───────────────────┘" + "┌─────────────────────┬──────────────────────┬─────────────────────┐\n", + "│ t-test ┆ chi2-test ┆ f-test │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ struct[2] ┆ struct[2] ┆ struct[2] │\n", + "╞═════════════════════╪══════════════════════╪═════════════════════╡\n", + "│ {1.550123,0.121144} ┆ {22.889401,0.955922} ┆ {1.746959,0.136718} │\n", + "└─────────────────────┴──────────────────────┴─────────────────────┘" ] }, "execution_count": 40, @@ -1860,9 +1860,9 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ struct[2] ┆ struct[2] ┆ struct[2] │\n", "╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡\n", - "│ 0 ┆ {0.782406,0.434031} ┆ {32.80012,0.621581} ┆ {2.027486,0.088156} │\n", - "│ 1 ┆ {-1.168306,0.242767} ┆ {34.251982,0.551894} ┆ {0.414089,0.798598} │\n", - "│ 2 ┆ {0.988312,0.323072} ┆ {35.722092,0.481702} ┆ {1.335438,0.254489} │\n", + "│ 0 ┆ {2.182824,0.029118} ┆ {38.555846,0.35473} ┆ {1.597169,0.172493} │\n", + "│ 1 ┆ {0.972503,0.330871} ┆ {26.35273,0.880507} ┆ {0.864623,0.484451} │\n", + "│ 2 ┆ {-0.471616,0.637232} ┆ {34.267769,0.551134} ┆ {0.783499,0.535832} │\n", "└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘\n" ] } @@ -1894,7 +1894,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (9, 2)
first_digit_cntfirst_digit_distribution
u32f64
5370.1074
5340.1068
5770.1154
6050.121
5460.1092
5720.1144
5050.101
5610.1122
5630.1126
" + "shape: (9, 2)
first_digit_cntfirst_digit_distribution
u32f64
5560.1112
5660.1132
5560.1112
5250.105
5920.1184
5200.104
5660.1132
5520.1104
5670.1134
" ], "text/plain": [ "shape: (9, 2)\n", @@ -1903,15 +1903,15 @@ "│ --- ┆ --- │\n", "│ u32 ┆ f64 │\n", "╞═════════════════╪══════════════════════════╡\n", - "│ 537 ┆ 0.1074 │\n", - "│ 534 ┆ 0.1068 │\n", - "│ 577 ┆ 0.1154 │\n", - "│ 605 ┆ 0.121 │\n", - "│ 546 ┆ 0.1092 │\n", - "│ 572 ┆ 0.1144 │\n", - "│ 505 ┆ 0.101 │\n", - "│ 561 ┆ 0.1122 │\n", - "│ 563 ┆ 0.1126 │\n", + "│ 556 ┆ 0.1112 │\n", + "│ 566 ┆ 0.1132 │\n", + "│ 556 ┆ 0.1112 │\n", + "│ 525 ┆ 0.105 │\n", + "│ 592 ┆ 0.1184 │\n", + "│ 520 ┆ 0.104 │\n", + "│ 566 ┆ 0.1132 │\n", + "│ 552 ┆ 0.1104 │\n", + "│ 567 ┆ 0.1134 │\n", "└─────────────────┴──────────────────────────┘" ] }, @@ -1977,7 +1977,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 7)
idvar1var2var3rrhnb_l_inf_cnt
u32f64f64f64f64f64u32
00.7200430.7620570.08020.8533731.18488816
10.7468590.7747830.9698850.0279926.01137215
20.210970.0291060.5229270.3174769.59637514
30.7017920.5273460.3522970.9123834.87447416
40.7238150.5447530.3116940.2104745.69628117
" + "shape: (5, 7)
idvar1var2var3rrhnb_l_inf_cnt
u32f64f64f64f64f64u32
00.3477840.130260.3340190.6984917.90958615
10.482210.0509910.7361850.8920897.8234518
20.7866480.6397780.7747210.1342843.5151420
30.9447630.1294090.4603580.7158578.13377816
40.5976980.7476960.8853920.6708412.39268719
" ], "text/plain": [ "shape: (5, 7)\n", @@ -1986,11 +1986,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ u32 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════════╡\n", - "│ 0 ┆ 0.720043 ┆ 0.762057 ┆ 0.0802 ┆ 0.853373 ┆ 1.184888 ┆ 16 │\n", - "│ 1 ┆ 0.746859 ┆ 0.774783 ┆ 0.969885 ┆ 0.027992 ┆ 6.011372 ┆ 15 │\n", - "│ 2 ┆ 0.21097 ┆ 0.029106 ┆ 0.522927 ┆ 0.317476 ┆ 9.596375 ┆ 14 │\n", - "│ 3 ┆ 0.701792 ┆ 0.527346 ┆ 0.352297 ┆ 0.912383 ┆ 4.874474 ┆ 16 │\n", - "│ 4 ┆ 0.723815 ┆ 0.544753 ┆ 0.311694 ┆ 0.210474 ┆ 5.696281 ┆ 17 │\n", + "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ 15 │\n", + "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ 8 │\n", + "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ 20 │\n", + "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ 16 │\n", + "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ 19 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────────┘" ] }, @@ -2027,7 +2027,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 7)
idvar1var2var3rrhnb_l1_r_cnt
u32f64f64f64f64f64u32
00.7200430.7620570.08020.8533731.184888690
10.7468590.7747830.9698850.0279926.0113721
20.210970.0291060.5229270.3174769.59637556
30.7017920.5273460.3522970.9123834.8744741289
40.7238150.5447530.3116940.2104745.69628123
" + "shape: (5, 7)
idvar1var2var3rrhnb_l1_r_cnt
u32f64f64f64f64f64u32
00.3477840.130260.3340190.6984917.909586538
10.482210.0509910.7361850.8920897.823451783
20.7866480.6397780.7747210.1342843.5151410
30.9447630.1294090.4603580.7158578.133778389
40.5976980.7476960.8853920.6708412.392687483
" ], "text/plain": [ "shape: (5, 7)\n", @@ -2036,11 +2036,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ u32 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪═════════════╡\n", - "│ 0 ┆ 0.720043 ┆ 0.762057 ┆ 0.0802 ┆ 0.853373 ┆ 1.184888 ┆ 690 │\n", - "│ 1 ┆ 0.746859 ┆ 0.774783 ┆ 0.969885 ┆ 0.027992 ┆ 6.011372 ┆ 1 │\n", - "│ 2 ┆ 0.21097 ┆ 0.029106 ┆ 0.522927 ┆ 0.317476 ┆ 9.596375 ┆ 56 │\n", - "│ 3 ┆ 0.701792 ┆ 0.527346 ┆ 0.352297 ┆ 0.912383 ┆ 4.874474 ┆ 1289 │\n", - "│ 4 ┆ 0.723815 ┆ 0.544753 ┆ 0.311694 ┆ 0.210474 ┆ 5.696281 ┆ 23 │\n", + "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ 538 │\n", + "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ 783 │\n", + "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ 10 │\n", + "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ 389 │\n", + "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ 483 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴─────────────┘" ] }, @@ -2076,7 +2076,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 7)
idvar1var2var3rrhbest friends
u32f64f64f64f64f64list[u32]
00.7200430.7620570.08020.8533731.184888[0, 1171, … 1754]
10.7468590.7747830.9698850.0279926.011372[1, 906, … 1751]
20.210970.0291060.5229270.3174769.596375[2, 50, … 853]
30.7017920.5273460.3522970.9123834.874474[3, 1558, … 921]
40.7238150.5447530.3116940.2104745.696281[4, 3, … 485]
" + "shape: (5, 7)
idvar1var2var3rrhbest friends
u32f64f64f64f64f64list[u32]
00.3477840.130260.3340190.6984917.909586[0, 502, … 115]
10.482210.0509910.7361850.8920897.823451[1, 1527, … 400]
20.7866480.6397780.7747210.1342843.51514[2, 1430, … 1451]
30.9447630.1294090.4603580.7158578.133778[3, 598, … 711]
40.5976980.7476960.8853920.6708412.392687[4, 650, … 213]
" ], "text/plain": [ "shape: (5, 7)\n", @@ -2085,11 +2085,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ list[u32] │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪═══════════════════╡\n", - "│ 0 ┆ 0.720043 ┆ 0.762057 ┆ 0.0802 ┆ 0.853373 ┆ 1.184888 ┆ [0, 1171, … 1754] │\n", - "│ 1 ┆ 0.746859 ┆ 0.774783 ┆ 0.969885 ┆ 0.027992 ┆ 6.011372 ┆ [1, 906, … 1751] │\n", - "│ 2 ┆ 0.21097 ┆ 0.029106 ┆ 0.522927 ┆ 0.317476 ┆ 9.596375 ┆ [2, 50, … 853] │\n", - "│ 3 ┆ 0.701792 ┆ 0.527346 ┆ 0.352297 ┆ 0.912383 ┆ 4.874474 ┆ [3, 1558, … 921] │\n", - "│ 4 ┆ 0.723815 ┆ 0.544753 ┆ 0.311694 ┆ 0.210474 ┆ 5.696281 ┆ [4, 3, … 485] │\n", + "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ [0, 502, … 115] │\n", + "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ [1, 1527, … 400] │\n", + "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ [2, 1430, … 1451] │\n", + "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ [3, 598, … 711] │\n", + "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ [4, 650, … 213] │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴───────────────────┘" ] }, @@ -2128,11 +2128,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ u32 ┆ list[u32] ┆ u32 │\n", "╞═════╪══════════════════╪════════════════════╡\n", - "│ 0 ┆ [0, 1171, … 912] ┆ 9 │\n", - "│ 1 ┆ [1, 906, … 831] ┆ 5 │\n", - "│ 2 ┆ [2, 50, … 1682] ┆ 8 │\n", - "│ 3 ┆ [3, 1558, … 66] ┆ 7 │\n", - "│ 4 ┆ [4, 3, … 1370] ┆ 6 │\n", + "│ 0 ┆ [0, 502, … 875] ┆ 10 │\n", + "│ 1 ┆ [1, 1527, … 400] ┆ 3 │\n", + "│ 2 ┆ [2, 1430, … 549] ┆ 10 │\n", + "│ 3 ┆ [3, 598, … 1768] ┆ 9 │\n", + "│ 4 ┆ [4, 650, … 803] ┆ 6 │\n", "└─────┴──────────────────┴────────────────────┘\n" ] } @@ -2173,7 +2173,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 8)
idvar1var2var3rrhidxdist
u32f64f64f64f64f64list[u32]list[f64]
00.7200430.7620570.08020.8533731.184888[0, 1171, … 1754][0.0, 0.054683, … 0.077248]
10.7468590.7747830.9698850.0279926.011372[1, 906, … 1751][0.0, 0.042337, … 0.053288]
20.210970.0291060.5229270.3174769.596375[2, 50, … 853][0.0, 0.059335, … 0.06505]
30.7017920.5273460.3522970.9123834.874474[3, 1558, … 921][0.0, 0.015422, … 0.067852]
40.7238150.5447530.3116940.2104745.696281[4, 3, … 485][0.0, 0.049363, … 0.060237]
" + "shape: (5, 8)
idvar1var2var3rrhidxdist
u32f64f64f64f64f64list[u32]list[f64]
00.3477840.130260.3340190.6984917.909586[0, 502, … 115][0.0, 0.066443, … 0.072798]
10.482210.0509910.7361850.8920897.823451[1, 1527, … 400][0.0, 0.049926, … 0.063975]
20.7866480.6397780.7747210.1342843.51514[2, 1430, … 1451][0.0, 0.02861, … 0.057878]
30.9447630.1294090.4603580.7158578.133778[3, 598, … 711][0.0, 0.032508, … 0.046937]
40.5976980.7476960.8853920.6708412.392687[4, 650, … 213][0.0, 0.068048, … 0.076969]
" ], "text/plain": [ "shape: (5, 8)\n", @@ -2182,16 +2182,16 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ list[u32] ┆ list[f64] │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════════════╪══════════════════╡\n", - "│ 0 ┆ 0.720043 ┆ 0.762057 ┆ 0.0802 ┆ 0.853373 ┆ 1.184888 ┆ [0, 1171, … ┆ [0.0, 0.054683, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ 1754] ┆ … 0.077248] │\n", - "│ 1 ┆ 0.746859 ┆ 0.774783 ┆ 0.969885 ┆ 0.027992 ┆ 6.011372 ┆ [1, 906, … 1751] ┆ [0.0, 0.042337, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.053288] │\n", - "│ 2 ┆ 0.21097 ┆ 0.029106 ┆ 0.522927 ┆ 0.317476 ┆ 9.596375 ┆ [2, 50, … 853] ┆ [0.0, 0.059335, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.06505] │\n", - "│ 3 ┆ 0.701792 ┆ 0.527346 ┆ 0.352297 ┆ 0.912383 ┆ 4.874474 ┆ [3, 1558, … 921] ┆ [0.0, 0.015422, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.067852] │\n", - "│ 4 ┆ 0.723815 ┆ 0.544753 ┆ 0.311694 ┆ 0.210474 ┆ 5.696281 ┆ [4, 3, … 485] ┆ [0.0, 0.049363, │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.060237] │\n", + "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ [0, 502, … 115] ┆ [0.0, 0.066443, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.072798] │\n", + "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ [1, 1527, … 400] ┆ [0.0, 0.049926, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.063975] │\n", + "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ [2, 1430, … ┆ [0.0, 0.02861, … │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ 1451] ┆ 0.057878] │\n", + "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ [3, 598, … 711] ┆ [0.0, 0.032508, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.046937] │\n", + "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ [4, 650, … 213] ┆ [0.0, 0.068048, │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.076969] │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────────────┴──────────────────┘" ] }, @@ -2231,7 +2231,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
30.7017920.5273460.3522970.9123834.874474
40.7238150.5447530.3116940.2104745.696281
60.7367240.7761740.6935740.5321663.944928
70.6176420.7889390.4883180.275192.900536
80.3272070.4561770.4690140.1288779.917232
" + "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
00.3477840.130260.3340190.6984917.909586
20.7866480.6397780.7747210.1342843.51514
50.7126330.284850.3291330.5433386.065003
70.4057690.4433430.8922050.7317089.658069
100.8369910.4285170.4042040.4400194.264234
" ], "text/plain": [ "shape: (5, 6)\n", @@ -2240,11 +2240,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n", - "│ 3 ┆ 0.701792 ┆ 0.527346 ┆ 0.352297 ┆ 0.912383 ┆ 4.874474 │\n", - "│ 4 ┆ 0.723815 ┆ 0.544753 ┆ 0.311694 ┆ 0.210474 ┆ 5.696281 │\n", - "│ 6 ┆ 0.736724 ┆ 0.776174 ┆ 0.693574 ┆ 0.532166 ┆ 3.944928 │\n", - "│ 7 ┆ 0.617642 ┆ 0.788939 ┆ 0.488318 ┆ 0.27519 ┆ 2.900536 │\n", - "│ 8 ┆ 0.327207 ┆ 0.456177 ┆ 0.469014 ┆ 0.128877 ┆ 9.917232 │\n", + "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 │\n", + "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 │\n", + "│ 5 ┆ 0.712633 ┆ 0.28485 ┆ 0.329133 ┆ 0.543338 ┆ 6.065003 │\n", + "│ 7 ┆ 0.405769 ┆ 0.443343 ┆ 0.892205 ┆ 0.731708 ┆ 9.658069 │\n", + "│ 10 ┆ 0.836991 ┆ 0.428517 ┆ 0.404204 ┆ 0.440019 ┆ 4.264234 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘" ] }, @@ -2281,7 +2281,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
360.5684330.4569690.058860.0067660.753822
1170.5211210.5218920.1293220.3777080.731702
1380.5318770.4880150.0408610.9059181.466601
1690.4657040.5019990.2534890.6254596.367421
1760.4737390.4913080.8941730.6525566.084469
" + "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
610.4575860.5075030.6142960.558337.064511
1640.5505850.5645620.9172420.7561738.670795
2570.5712680.544020.1108260.167148.900079
3340.5513340.4582450.403990.4943471.872597
3520.5353940.4594570.4672950.9867023.193573
" ], "text/plain": [ "shape: (5, 6)\n", @@ -2290,11 +2290,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n", - "│ 36 ┆ 0.568433 ┆ 0.456969 ┆ 0.05886 ┆ 0.006766 ┆ 0.753822 │\n", - "│ 117 ┆ 0.521121 ┆ 0.521892 ┆ 0.129322 ┆ 0.377708 ┆ 0.731702 │\n", - "│ 138 ┆ 0.531877 ┆ 0.488015 ┆ 0.040861 ┆ 0.905918 ┆ 1.466601 │\n", - "│ 169 ┆ 0.465704 ┆ 0.501999 ┆ 0.253489 ┆ 0.625459 ┆ 6.367421 │\n", - "│ 176 ┆ 0.473739 ┆ 0.491308 ┆ 0.894173 ┆ 0.652556 ┆ 6.084469 │\n", + "│ 61 ┆ 0.457586 ┆ 0.507503 ┆ 0.614296 ┆ 0.55833 ┆ 7.064511 │\n", + "│ 164 ┆ 0.550585 ┆ 0.564562 ┆ 0.917242 ┆ 0.756173 ┆ 8.670795 │\n", + "│ 257 ┆ 0.571268 ┆ 0.54402 ┆ 0.110826 ┆ 0.16714 ┆ 8.900079 │\n", + "│ 334 ┆ 0.551334 ┆ 0.458245 ┆ 0.40399 ┆ 0.494347 ┆ 1.872597 │\n", + "│ 352 ┆ 0.535394 ┆ 0.459457 ┆ 0.467295 ┆ 0.986702 ┆ 3.193573 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘" ] }, @@ -2331,7 +2331,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
1690.4657040.5019990.2534890.6254596.367421
1760.4737390.4913080.8941730.6525566.084469
2350.5645890.4843920.0571020.9750238.699902
3670.4782390.5663790.6206460.3849229.836408
3830.5018910.463470.1358890.6168737.838947
" + "shape: (5, 6)
idvar1var2var3rrh
u32f64f64f64f64f64
610.4575860.5075030.6142960.558337.064511
3540.4429610.5194260.6919720.944377.440443
4060.5204020.4435650.0155720.8146729.903239
4110.5230370.5695130.9751170.1634148.785701
4880.45470.4532890.3886350.3912559.463455
" ], "text/plain": [ "shape: (5, 6)\n", @@ -2340,11 +2340,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n", - "│ 169 ┆ 0.465704 ┆ 0.501999 ┆ 0.253489 ┆ 0.625459 ┆ 6.367421 │\n", - "│ 176 ┆ 0.473739 ┆ 0.491308 ┆ 0.894173 ┆ 0.652556 ┆ 6.084469 │\n", - "│ 235 ┆ 0.564589 ┆ 0.484392 ┆ 0.057102 ┆ 0.975023 ┆ 8.699902 │\n", - "│ 367 ┆ 0.478239 ┆ 0.566379 ┆ 0.620646 ┆ 0.384922 ┆ 9.836408 │\n", - "│ 383 ┆ 0.501891 ┆ 0.46347 ┆ 0.135889 ┆ 0.616873 ┆ 7.838947 │\n", + "│ 61 ┆ 0.457586 ┆ 0.507503 ┆ 0.614296 ┆ 0.55833 ┆ 7.064511 │\n", + "│ 354 ┆ 0.442961 ┆ 0.519426 ┆ 0.691972 ┆ 0.94437 ┆ 7.440443 │\n", + "│ 406 ┆ 0.520402 ┆ 0.443565 ┆ 0.015572 ┆ 0.814672 ┆ 9.903239 │\n", + "│ 411 ┆ 0.523037 ┆ 0.569513 ┆ 0.975117 ┆ 0.163414 ┆ 8.785701 │\n", + "│ 488 ┆ 0.4547 ┆ 0.453289 ┆ 0.388635 ┆ 0.391255 ┆ 9.463455 │\n", "└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘" ] }, @@ -2381,7 +2381,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 3)
idfriendscount
u64list[u32]u32
0[0, 1345, … 304]4
1[1, 6, 278]3
2[2, 934, 853]3
3[3, 1584, … 159]5
4[4, 1939, … 392]5
" + "shape: (5, 3)
idfriendscount
u64list[u32]u32
0[0, 459, … 1058]6
1[1]1
2[2, 1077]2
3[3, 104]2
4[4, 781, … 650]4
" ], "text/plain": [ "shape: (5, 3)\n", @@ -2390,11 +2390,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ u64 ┆ list[u32] ┆ u32 │\n", "╞═════╪══════════════════╪═══════╡\n", - "│ 0 ┆ [0, 1345, … 304] ┆ 4 │\n", - "│ 1 ┆ [1, 6, 278] ┆ 3 │\n", - "│ 2 ┆ [2, 934, 853] ┆ 3 │\n", - "│ 3 ┆ [3, 1584, … 159] ┆ 5 │\n", - "│ 4 ┆ [4, 1939, … 392] ┆ 5 │\n", + "│ 0 ┆ [0, 459, … 1058] ┆ 6 │\n", + "│ 1 ┆ [1] ┆ 1 │\n", + "│ 2 ┆ [2, 1077] ┆ 2 │\n", + "│ 3 ┆ [3, 104] ┆ 2 │\n", + "│ 4 ┆ [4, 781, … 650] ┆ 4 │\n", "└─────┴──────────────────┴───────┘" ] }, @@ -2439,7 +2439,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_19683/3354819425.py:3: UserWarning: The compatibility layer is considered experimental.\n", + "/tmp/ipykernel_28864/3354819425.py:3: UserWarning: The compatibility layer is considered experimental.\n", " from polars_ds.compat import compat as pds2\n" ] }, @@ -2453,7 +2453,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 6)
actualpredicted0-20-9s1s2
f64f64i32i32strstr
1.00.7890801"7J""k"
1.00.50348515"S""yj"
1.00.73686828"iB""p"
1.00.90439714"R""Js"
1.00.84337912"A""WR"
" + "shape: (5, 6)
actualpredicted0-20-9s1s2
f64f64i32i32strstr
1.00.96565326"I0""nR"
0.00.4403722"6""Mz"
1.00.93195512"1""kg"
0.00.55819714"R6""m"
1.00.23553521"bF""RO"
" ], "text/plain": [ "shape: (5, 6)\n", @@ -2462,11 +2462,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ i32 ┆ i32 ┆ str ┆ str │\n", "╞════════╪═══════════╪═════╪═════╪═════╪═════╡\n", - "│ 1.0 ┆ 0.78908 ┆ 0 ┆ 1 ┆ 7J ┆ k │\n", - "│ 1.0 ┆ 0.503485 ┆ 1 ┆ 5 ┆ S ┆ yj │\n", - "│ 1.0 ┆ 0.736868 ┆ 2 ┆ 8 ┆ iB ┆ p │\n", - "│ 1.0 ┆ 0.904397 ┆ 1 ┆ 4 ┆ R ┆ Js │\n", - "│ 1.0 ┆ 0.843379 ┆ 1 ┆ 2 ┆ A ┆ WR │\n", + "│ 1.0 ┆ 0.965653 ┆ 2 ┆ 6 ┆ I0 ┆ nR │\n", + "│ 0.0 ┆ 0.44037 ┆ 2 ┆ 2 ┆ 6 ┆ Mz │\n", + "│ 1.0 ┆ 0.931955 ┆ 1 ┆ 2 ┆ 1 ┆ kg │\n", + "│ 0.0 ┆ 0.558197 ┆ 1 ┆ 4 ┆ R6 ┆ m │\n", + "│ 1.0 ┆ 0.235535 ┆ 2 ┆ 1 ┆ bF ┆ RO │\n", "└────────┴───────────┴─────┴─────┴─────┴─────┘" ] }, @@ -2553,7 +2553,7 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪══════════╪══════════╪═══════════════════╪══════════╡\n", - "│ 0.503697 ┆ 0.500827 ┆ 0.502258 ┆ 0.50319 ┆ 0.501299 │\n", + "│ 0.499669 ┆ 0.499599 ┆ 0.499634 ┆ 0.497488 ┆ 0.498677 │\n", "└───────────┴──────────┴──────────┴───────────────────┴──────────┘\n", "shape: (1, 5)\n", "┌───────────┬──────────┬──────────┬───────────────────┬──────────┐\n", @@ -2561,7 +2561,7 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪══════════╪══════════╪═══════════════════╪══════════╡\n", - "│ 0.503697 ┆ 0.500827 ┆ 0.502258 ┆ 0.50319 ┆ 0.501299 │\n", + "│ 0.499669 ┆ 0.499599 ┆ 0.499634 ┆ 0.497488 ┆ 0.498677 │\n", "└───────────┴──────────┴──────────┴───────────────────┴──────────┘\n", "shape: (1, 5)\n", "┌───────────┬──────────┬──────────┬───────────────────┬──────────┐\n", @@ -2569,7 +2569,7 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪══════════╪══════════╪═══════════════════╪══════════╡\n", - "│ 0.503697 ┆ 0.500827 ┆ 0.502258 ┆ 0.50319 ┆ 0.501299 │\n", + "│ 0.499669 ┆ 0.499599 ┆ 0.499634 ┆ 0.497488 ┆ 0.498677 │\n", "└───────────┴──────────┴──────────┴───────────────────┴──────────┘\n" ] } @@ -2599,7 +2599,7 @@ " 
white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 4)
<=baseline_pctactual_pctpsi_bin
f64f64f64f64
0.1944640.20.1870.000874
0.4138970.20.2070.000241
0.59990.20.190.000513
0.8040260.20.2020.00002
inf0.20.2140.000947
" + "shape: (5, 4)
<=baseline_pctactual_pctpsi_bin
f64f64f64f64
0.2113040.20.2150.001085
0.3997050.20.1950.000127
0.5946050.20.1870.000874
0.7973540.20.1890.000622
inf0.20.2140.000947
" ], "text/plain": [ "shape: (5, 4)\n", @@ -2608,10 +2608,10 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════╪══════════════╪════════════╪══════════╡\n", - "│ 0.194464 ┆ 0.2 ┆ 0.187 ┆ 0.000874 │\n", - "│ 0.413897 ┆ 0.2 ┆ 0.207 ┆ 0.000241 │\n", - "│ 0.5999 ┆ 0.2 ┆ 0.19 ┆ 0.000513 │\n", - "│ 0.804026 ┆ 0.2 ┆ 0.202 ┆ 0.00002 │\n", + "│ 0.211304 ┆ 0.2 ┆ 0.215 ┆ 0.001085 │\n", + "│ 0.399705 ┆ 0.2 ┆ 0.195 ┆ 0.000127 │\n", + "│ 0.594605 ┆ 0.2 ┆ 0.187 ┆ 0.000874 │\n", + "│ 0.797354 ┆ 0.2 ┆ 0.189 ┆ 0.000622 │\n", "│ inf ┆ 0.2 ┆ 0.214 ┆ 0.000947 │\n", "└──────────┴──────────────┴────────────┴──────────┘" ] @@ -2647,13 +2647,13 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1,)
cid_ce
f64
12.900387
" + "shape: (1,)
cid_ce
f64
13.128145
" ], "text/plain": [ "shape: (1,)\n", "Series: 'cid_ce' [f64]\n", "[\n", - "\t12.900387\n", + "\t13.128145\n", "]" ] }, @@ -2684,13 +2684,13 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (1,)
c3_stats
f64
0.123215
" + "shape: (1,)
c3_stats
f64
0.11619
" ], "text/plain": [ "shape: (1,)\n", "Series: 'c3_stats' [f64]\n", "[\n", - "\t0.123215\n", + "\t0.11619\n", "]" ] }, @@ -2722,7 +2722,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (10,)
str_leven
u32
2
2
2
2
2
2
1
2
2
2
" + "shape: (10,)
str_leven
u32
2
2
2
2
2
2
2
1
2
2
" ], "text/plain": [ "shape: (10,)\n", @@ -2734,8 +2734,8 @@ "\t2\n", "\t2\n", "\t2\n", - "\t1\n", "\t2\n", + "\t1\n", "\t2\n", "\t2\n", "]" diff --git a/examples/pipeline.ipynb b/examples/pipeline.ipynb index afaca423..86796d34 100644 --- a/examples/pipeline.ipynb +++ b/examples/pipeline.ipynb @@ -190,7 +190,7 @@ "\n", "\n", "Step 3:\n", - "col(\"city_category\").is_not_null().all_horizontal()\n", + "col(\"city_category\").is_not_null()\n", "\n", "Step 4:\n", "selector\n", @@ -237,10 +237,10 @@ "col(\"employer_category1\").replace_strict([Series[value], Series[to], null])\n", "\n", "Step 14:\n", - "dtype_columns([UInt32, UInt8, Int32, Int8, UInt64, UInt16, Int64, Int16]).shrink_dtype()\n", + "dtype_columns([Int16, UInt32, UInt8, Int32, UInt64, Int8, UInt16, Int64]).shrink_dtype()\n", "\n", "Step 15:\n", - "dtype_columns([Float32, Float64]).strict_cast(Float32)\n" + "dtype_columns([Float64, Float32]).strict_cast(Float32)\n" ] }, "execution_count": 5, @@ -461,7 +461,7 @@ "\n", "\n", "Step 3:\n", - "col(\"city_category\").is_not_null().all_horizontal()\n", + "col(\"city_category\").is_not_null()\n", "\n", "Step 4:\n", "selector\n", @@ -508,10 +508,10 @@ "col(\"employer_category1\").replace_strict([Series[value], Series[to], null])\n", "\n", "Step 14:\n", - "dtype_columns([UInt32, UInt8, Int32, Int8, UInt64, UInt16, Int64, Int16]).shrink_dtype()\n", + "dtype_columns([Int16, UInt32, UInt8, Int32, UInt64, Int8, UInt16, Int64]).shrink_dtype()\n", "\n", "Step 15:\n", - "dtype_columns([Float32, Float64]).strict_cast(Float32)\n" + "dtype_columns([Float64, Float32]).strict_cast(Float32)\n" ] }, "execution_count": 10, @@ -662,9 +662,9 @@ " '{\"Alias\":[{\"Column\":\"Var1\"},\"var1\"]}',\n", " '{\"Alias\":[{\"Column\":\"Approved\"},\"approved\"]}']},\n", " {'SQLStep': \"\\nselect\\n*\\n, 'TEST' as test_col\\nfrom df\\nwhere loan_amount is not null\\n\"},\n", - " {'FilterStep': 
['{\"Function\":{\"input\":[{\"Function\":{\"input\":[{\"Column\":\"city_category\"}],\"function\":{\"Boolean\":\"IsNotNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}],\"function\":{\"Boolean\":\"AllHorizontal\"},\"options\":{\"collect_groups\":\"GroupWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE | INPUT_WILDCARD_EXPANSION | ALLOW_EMPTY_INPUTS\"}}}']},\n", - " {'SelectStep': ['{\"Selector\":{\"Add\":[{\"Root\":{\"DtypeColumn\":[{\"Decimal\":[null,null]},\"Float32\",\"UInt32\",\"UInt8\",\"Int32\",\"Int8\",\"Float64\",\"UInt64\",\"UInt16\",\"Int64\",\"Int16\"]}},{\"Root\":{\"Columns\":[\"gender\",\"employer_category1\",\"city_category\",\"test_col\"]}}]}}']},\n", - " {'WithColumnsStep': ['{\"Alias\":[{\"Ternary\":{\"predicate\":{\"Function\":{\"input\":[{\"Column\":\"loan_period\"}],\"function\":{\"Boolean\":\"IsNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"truthy\":{\"Function\":{\"input\":[{\"BinaryExpr\":{\"left\":{\"Column\":\"var1\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":0.5098100117596632}}}},{\"BinaryExpr\":{\"left\":{\"Column\":\"existing_emi\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":-7.604079653752158e-6}}}}],\"function\":\"SumHorizontal\",\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE | INPUT_WILDCARD_EXPANSION\"}}},\"falsy\":{\"Cast\":{\"expr\":{\"Column\":\"loan_period\"},\"dtype\":\"Float64\",\"options\":\"Strict\"}}}},\"loan_period\"]}']},\n", + " {'FilterStep': ['{\"Function\":{\"input\":[{\"Column\":\"city_category\"}],\"function\":{\"Boolean\":\"IsNotNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", + " {'SelectStep': 
['{\"Selector\":{\"Add\":[{\"Root\":{\"DtypeColumn\":[\"Int16\",\"UInt32\",\"UInt8\",\"Float64\",\"Int32\",\"UInt64\",\"Int8\",\"UInt16\",\"Int64\",{\"Decimal\":[null,null]},\"Float32\"]}},{\"Root\":{\"Columns\":[\"gender\",\"employer_category1\",\"city_category\",\"test_col\"]}}]}}']},\n", + " {'WithColumnsStep': ['{\"Alias\":[{\"Ternary\":{\"predicate\":{\"Function\":{\"input\":[{\"Column\":\"loan_period\"}],\"function\":{\"Boolean\":\"IsNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"truthy\":{\"Function\":{\"input\":[{\"BinaryExpr\":{\"left\":{\"Column\":\"var1\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":0.5098100117596667}}}},{\"BinaryExpr\":{\"left\":{\"Column\":\"existing_emi\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":-7.6040796537530525e-6}}}}],\"function\":{\"SumHorizontal\":{\"ignore_nulls\":true}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE | INPUT_WILDCARD_EXPANSION\"}}},\"falsy\":{\"Cast\":{\"expr\":{\"Column\":\"loan_period\"},\"dtype\":\"Float64\",\"options\":\"Strict\"}}}},\"loan_period\"]}']},\n", " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"existing_emi\"},{\"Literal\":{\"Float\":0.0}}],\"function\":\"FillNull\",\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", " {'WithColumnsStep': ['{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"existing_emi\"}],\"function\":\"Log1p\",\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"existing_emi_log1p\"]}',\n", " 
'{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"loan_amount\"}],\"function\":\"Log1p\",\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"loan_amount_log1p\"]}',\n", @@ -685,10 +685,10 @@ " {'WithColumnsStep': ['{\"Alias\":[{\"Cast\":{\"expr\":{\"Function\":{\"input\":[{\"Column\":\"employer_category1\"}],\"function\":{\"Boolean\":\"IsNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"dtype\":\"UInt8\",\"options\":\"Strict\"}},\"employer_category1_is_missing\"]}']},\n", " {'WithColumnsStep': ['{\"Alias\":[{\"Cast\":{\"expr\":{\"BinaryExpr\":{\"left\":{\"Column\":\"gender\"},\"op\":\"EqValidity\",\"right\":{\"Literal\":{\"String\":\"Male\"}}}},\"dtype\":\"UInt8\",\"options\":\"Strict\"}},\"gender_Male\"]}']},\n", " {'SelectStep': ['{\"Exclude\":[\"Wildcard\",[{\"Name\":\"gender\"}]]}']},\n", - " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"city_category\"},{\"Literal\":{\"Series\":{\"name\":\"value\",\"datatype\":\"String\",\"bit_settings\":\"\",\"values\":[\"B\",\"A\",\"C\"]}}},{\"Literal\":{\"Series\":{\"name\":\"woe\",\"datatype\":\"Float64\",\"bit_settings\":\"\",\"values\":[-0.04647519483535344,0.0809586180645928,-0.47955283435510176]}}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", - " {'WithColumnsStep': 
['{\"Function\":{\"input\":[{\"Column\":\"employer_category1\"},{\"Literal\":{\"Series\":{\"name\":\"value\",\"datatype\":\"String\",\"bit_settings\":\"\",\"values\":[\"A\",\"C\",\"B\"]}}},{\"Literal\":{\"Series\":{\"name\":\"to\",\"datatype\":\"Float64\",\"bit_settings\":\"\",\"values\":[0.014736842105263158,0.02660307366189719,0.024335548172757474]}}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", - " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"DtypeColumn\":[\"UInt32\",\"UInt8\",\"Int32\",\"Int8\",\"UInt64\",\"UInt16\",\"Int64\",\"Int16\"]}],\"function\":\"ShrinkType\",\"options\":{\"collect_groups\":\"GroupWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", - " {'WithColumnsStep': ['{\"Cast\":{\"expr\":{\"DtypeColumn\":[\"Float32\",\"Float64\"]},\"dtype\":\"Float32\",\"options\":\"Strict\"}}']}],\n", + " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"city_category\"},{\"Literal\":{\"Series\":{\"name\":\"value\",\"datatype\":\"String\",\"bit_settings\":\"\",\"values\":[\"A\",\"C\",\"B\"]}}},{\"Literal\":{\"Series\":{\"name\":\"woe\",\"datatype\":\"Float64\",\"bit_settings\":\"\",\"values\":[0.0809586180645928,-0.47955283435510176,-0.04647519483535344]}}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", + " {'WithColumnsStep': 
['{\"Function\":{\"input\":[{\"Column\":\"employer_category1\"},{\"Literal\":{\"Series\":{\"name\":\"value\",\"datatype\":\"String\",\"bit_settings\":\"\",\"values\":[\"B\",\"C\",\"A\"]}}},{\"Literal\":{\"Series\":{\"name\":\"to\",\"datatype\":\"Float64\",\"bit_settings\":\"\",\"values\":[0.024335548172757474,0.02660307366189719,0.014736842105263158]}}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", + " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"DtypeColumn\":[\"Int16\",\"UInt32\",\"UInt8\",\"Int32\",\"UInt64\",\"Int8\",\"UInt16\",\"Int64\"]}],\"function\":\"ShrinkType\",\"options\":{\"collect_groups\":\"GroupWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n", + " {'WithColumnsStep': ['{\"Cast\":{\"expr\":{\"DtypeColumn\":[\"Float64\",\"Float32\"]},\"dtype\":\"Float32\",\"options\":\"Strict\"}}']}],\n", " 'ensure_features_in': False,\n", " 'ensure_features_out': True}" ] diff --git a/examples/sample_and_split.ipynb b/examples/sample_and_split.ipynb index 1e8d32db..8eb12e1c 100644 --- a/examples/sample_and_split.ipynb +++ b/examples/sample_and_split.ipynb @@ -42,7 +42,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (5, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
03.1699780.1830960.848878-0.988939369.761952"A"
18.8107680.5696720.048483-0.44255258.0126620"A"
23.2740630.6327720.4474680.255512-1284.3898791"A"
310.8476720.890060.7720620.735149-0.3629830"A"
411.664820.9071671.3939292.285448-2031.3216220"A"
" + "shape: (5, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
05.3554620.2275850.8754131.255306-1534.2960750"A"
13.1437420.6517112.12331-0.27767544.7987710"A"
29.5851380.7201471.048850.019822388.7244410"A"
311.730430.0596023.624234-1.177224442.3975180"A"
41.3104150.7838363.703261.501242189.0644922"A"
" ], "text/plain": [ "shape: (5, 8)\n", @@ -51,11 +51,11 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n", - "│ 0 ┆ 3.169978 ┆ 0.183096 ┆ 0.848878 ┆ -0.988939 ┆ 369.76195 ┆ 2 ┆ A │\n", - "│ 1 ┆ 8.810768 ┆ 0.569672 ┆ 0.048483 ┆ -0.44255 ┆ 258.012662 ┆ 0 ┆ A │\n", - "│ 2 ┆ 3.274063 ┆ 0.632772 ┆ 0.447468 ┆ 0.255512 ┆ -1284.389879 ┆ 1 ┆ A │\n", - "│ 3 ┆ 10.847672 ┆ 0.89006 ┆ 0.772062 ┆ 0.735149 ┆ -0.362983 ┆ 0 ┆ A │\n", - "│ 4 ┆ 11.66482 ┆ 0.907167 ┆ 1.393929 ┆ 2.285448 ┆ -2031.321622 ┆ 0 ┆ A │\n", + "│ 0 ┆ 5.355462 ┆ 0.227585 ┆ 0.875413 ┆ 1.255306 ┆ -1534.296075 ┆ 0 ┆ A │\n", + "│ 1 ┆ 3.143742 ┆ 0.651711 ┆ 2.12331 ┆ -0.27767 ┆ 544.798771 ┆ 0 ┆ A │\n", + "│ 2 ┆ 9.585138 ┆ 0.720147 ┆ 1.04885 ┆ 0.01982 ┆ 2388.724441 ┆ 0 ┆ A │\n", + "│ 3 ┆ 11.73043 ┆ 0.059602 ┆ 3.624234 ┆ -1.177224 ┆ 442.397518 ┆ 0 ┆ A │\n", + "│ 4 ┆ 1.310415 ┆ 0.783836 ┆ 3.70326 ┆ 1.501242 ┆ 189.064492 ┆ 2 ┆ A │\n", "└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘" ] }, @@ -85,7 +85,7 @@ { "data": { "text/plain": [ - "['row_num', 'normal', 'flags']" + "['row_num', 'uniform_2', 'exp']" ] }, "execution_count": 3, @@ -94,7 +94,7 @@ } ], "source": [ - "sa.random_cols(df, 2, keep = [\"row_num\"])" + "sa.random_cols(df.columns, 2, keep = [\"row_num\"])" ] }, { @@ -112,7 +112,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (60_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
18.8107680.5696720.048483-0.44255258.0126620"A"
23.2740630.6327720.4474680.255512-1284.3898791"A"
411.664820.9071671.3939292.285448-2031.3216220"A"
61.5222470.6263310.460844-0.0607391487.4443431"A"
73.935480.3632292.002222-0.613627-335.2031830"A"
999913.8085940.6939146.727779-0.781093-868.3070312"C"
999946.2463620.995973.468162-0.699768-145.4718141"C"
999960.5204350.7581790.6805180.788875-3203.568962"C"
999976.2509580.7623930.086911.79754696.8593271"C"
999984.4910910.3969690.0125852.024051-2468.8598152"C"
" + "shape: (60_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
13.1437420.6517112.12331-0.27767544.7987710"A"
29.5851380.7201471.048850.019822388.7244410"A"
60.1896620.06511.316939-0.244435748.9951790"A"
70.6613460.8740924.8430380.31243-383.6591350"A"
80.0538010.9833420.4523620.312257-386.6897190"A"
999947.5361220.114142.847801-0.916853-1340.1115132"C"
9999610.0305770.9395680.9877190.701578-768.0626550"C"
999975.1185980.5523952.390273-2.57956-1076.6100990"C"
999985.7014280.5215721.290974-1.3617795.2780611"C"
999997.9460390.2251552.5649990.367505-1021.4799371"C"
" ], "text/plain": [ "shape: (60_000, 8)\n", @@ -121,17 +121,17 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n", - "│ 1 ┆ 8.810768 ┆ 0.569672 ┆ 0.048483 ┆ -0.44255 ┆ 258.012662 ┆ 0 ┆ A │\n", - "│ 2 ┆ 3.274063 ┆ 0.632772 ┆ 0.447468 ┆ 0.255512 ┆ -1284.389879 ┆ 1 ┆ A │\n", - "│ 4 ┆ 11.66482 ┆ 0.907167 ┆ 1.393929 ┆ 2.285448 ┆ -2031.321622 ┆ 0 ┆ A │\n", - "│ 6 ┆ 1.522247 ┆ 0.626331 ┆ 0.460844 ┆ -0.060739 ┆ 1487.444343 ┆ 1 ┆ A │\n", - "│ 7 ┆ 3.93548 ┆ 0.363229 ┆ 2.002222 ┆ -0.613627 ┆ -335.203183 ┆ 0 ┆ A │\n", + "│ 1 ┆ 3.143742 ┆ 0.651711 ┆ 2.12331 ┆ -0.27767 ┆ 544.798771 ┆ 0 ┆ A │\n", + "│ 2 ┆ 9.585138 ┆ 0.720147 ┆ 1.04885 ┆ 0.01982 ┆ 2388.724441 ┆ 0 ┆ A │\n", + "│ 6 ┆ 0.189662 ┆ 0.0651 ┆ 1.316939 ┆ -0.244435 ┆ 748.995179 ┆ 0 ┆ A │\n", + "│ 7 ┆ 0.661346 ┆ 0.874092 ┆ 4.843038 ┆ 0.31243 ┆ -383.659135 ┆ 0 ┆ A │\n", + "│ 8 ┆ 0.053801 ┆ 0.983342 ┆ 0.452362 ┆ 0.312257 ┆ -386.689719 ┆ 0 ┆ A │\n", "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 99991 ┆ 3.808594 ┆ 0.693914 ┆ 6.727779 ┆ -0.781093 ┆ -868.307031 ┆ 2 ┆ C │\n", - "│ 99994 ┆ 6.246362 ┆ 0.99597 ┆ 3.468162 ┆ -0.699768 ┆ -145.471814 ┆ 1 ┆ C │\n", - "│ 99996 ┆ 0.520435 ┆ 0.758179 ┆ 0.680518 ┆ 0.788875 ┆ -3203.56896 ┆ 2 ┆ C │\n", - "│ 99997 ┆ 6.250958 ┆ 0.762393 ┆ 0.08691 ┆ 1.79754 ┆ 696.859327 ┆ 1 ┆ C │\n", - "│ 99998 ┆ 4.491091 ┆ 0.396969 ┆ 0.012585 ┆ 2.024051 ┆ -2468.859815 ┆ 2 ┆ C │\n", + "│ 99994 ┆ 7.536122 ┆ 0.11414 ┆ 2.847801 ┆ -0.916853 ┆ -1340.111513 ┆ 2 ┆ C │\n", + "│ 99996 ┆ 10.030577 ┆ 0.939568 ┆ 0.987719 ┆ 0.701578 ┆ -768.062655 ┆ 0 ┆ C │\n", + "│ 99997 ┆ 5.118598 ┆ 0.552395 ┆ 2.390273 ┆ -2.57956 ┆ -1076.610099 ┆ 0 ┆ C │\n", + "│ 99998 ┆ 5.701428 ┆ 0.521572 ┆ 1.290974 ┆ -1.361779 ┆ 5.278061 ┆ 1 ┆ C │\n", + "│ 99999 ┆ 7.946039 ┆ 0.225155 ┆ 2.564999 ┆ 0.367505 ┆ -1021.479937 ┆ 1 ┆ C │\n", 
"└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘" ] }, @@ -160,27 +160,27 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (30_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
03.1699780.1830960.848878-0.988939369.761952"A"
122.7187840.2363270.6563412.042461992.1066460"A"
135.6882420.2381281.989903-1.89097596.6090981"A"
1610.6301570.6854172.040244-0.411343-80.4406542"A"
186.1333180.8685813.786928-0.853489-824.3728641"A"
999767.6862650.0371789.8724010.0027091013.8234432"C"
999819.0405850.2725630.423536-0.365252-718.1514621"C"
999858.9403850.8562152.3550230.609717-34.9440960"C"
999866.5013580.6762971.185671-0.284971583.3654431"C"
999960.5204350.7581790.6805180.788875-3203.568962"C"
" + "shape: (30_000, 8)
row_numuniform_1uniform_2expnormalfat_normalflagscategory
i64f64f64f64f64f64i32str
109.7816230.5638684.4885530.1231011628.8184961"A"
114.5083280.5946973.8777570.849688-1242.376971"A"
141.7023380.7763051.3469870.481826-403.302142"A"
1911.8972340.550351.7914770.861923641.5327762"A"
224.0775150.7377171.0932351.0484441269.1830712"A"
999895.260120.4790690.748342-0.224175-84.2662241"C"
999947.5361220.114142.847801-0.916853-1340.1115132"C"
9999510.4906820.6116920.384882-0.474915157.0110962"C"
9999610.0305770.9395680.9877190.701578-768.0626550"C"
999985.7014280.5215721.290974-1.3617795.2780611"C"
" ], "text/plain": [ "shape: (30_000, 8)\n", - "┌─────────┬───────────┬───────────┬──────────┬───────────┬─────────────┬───────┬──────────┐\n", - "│ row_num ┆ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ flags ┆ category │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", - "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪═════════════╪═══════╪══════════╡\n", - "│ 0 ┆ 3.169978 ┆ 0.183096 ┆ 0.848878 ┆ -0.988939 ┆ 369.76195 ┆ 2 ┆ A │\n", - "│ 12 ┆ 2.718784 ┆ 0.236327 ┆ 0.656341 ┆ 2.042461 ┆ 992.106646 ┆ 0 ┆ A │\n", - "│ 13 ┆ 5.688242 ┆ 0.238128 ┆ 1.989903 ┆ -1.890975 ┆ 96.609098 ┆ 1 ┆ A │\n", - "│ 16 ┆ 10.630157 ┆ 0.685417 ┆ 2.040244 ┆ -0.411343 ┆ -80.440654 ┆ 2 ┆ A │\n", - "│ 18 ┆ 6.133318 ┆ 0.868581 ┆ 3.786928 ┆ -0.853489 ┆ -824.372864 ┆ 1 ┆ A │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 99976 ┆ 7.686265 ┆ 0.037178 ┆ 9.872401 ┆ 0.002709 ┆ 1013.823443 ┆ 2 ┆ C │\n", - "│ 99981 ┆ 9.040585 ┆ 0.272563 ┆ 0.423536 ┆ -0.365252 ┆ -718.151462 ┆ 1 ┆ C │\n", - "│ 99985 ┆ 8.940385 ┆ 0.856215 ┆ 2.355023 ┆ 0.609717 ┆ -34.944096 ┆ 0 ┆ C │\n", - "│ 99986 ┆ 6.501358 ┆ 0.676297 ┆ 1.185671 ┆ -0.284971 ┆ 583.365443 ┆ 1 ┆ C │\n", - "│ 99996 ┆ 0.520435 ┆ 0.758179 ┆ 0.680518 ┆ 0.788875 ┆ -3203.56896 ┆ 2 ┆ C │\n", - "└─────────┴───────────┴───────────┴──────────┴───────────┴─────────────┴───────┴──────────┘" + "┌─────────┬───────────┬───────────┬──────────┬───────────┬──────────────┬───────┬──────────┐\n", + "│ row_num ┆ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ flags ┆ category │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n", + "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n", + "│ 10 ┆ 9.781623 ┆ 0.563868 ┆ 4.488553 ┆ 0.123101 ┆ 1628.818496 ┆ 1 ┆ A │\n", + "│ 11 ┆ 4.508328 ┆ 0.594697 ┆ 3.877757 ┆ 0.849688 ┆ -1242.37697 ┆ 1 ┆ A │\n", + "│ 14 ┆ 1.702338 ┆ 0.776305 ┆ 
1.346987 ┆ 0.481826 ┆ -403.30214 ┆ 2 ┆ A │\n", + "│ 19 ┆ 11.897234 ┆ 0.55035 ┆ 1.791477 ┆ 0.861923 ┆ 641.532776 ┆ 2 ┆ A │\n", + "│ 22 ┆ 4.077515 ┆ 0.737717 ┆ 1.093235 ┆ 1.048444 ┆ 1269.183071 ┆ 2 ┆ A │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 99989 ┆ 5.26012 ┆ 0.479069 ┆ 0.748342 ┆ -0.224175 ┆ -84.266224 ┆ 1 ┆ C │\n", + "│ 99994 ┆ 7.536122 ┆ 0.11414 ┆ 2.847801 ┆ -0.916853 ┆ -1340.111513 ┆ 2 ┆ C │\n", + "│ 99995 ┆ 10.490682 ┆ 0.611692 ┆ 0.384882 ┆ -0.474915 ┆ 157.011096 ┆ 2 ┆ C │\n", + "│ 99996 ┆ 10.030577 ┆ 0.939568 ┆ 0.987719 ┆ 0.701578 ┆ -768.062655 ┆ 0 ┆ C │\n", + "│ 99998 ┆ 5.701428 ┆ 0.521572 ┆ 1.290974 ┆ -1.361779 ┆ 5.278061 ┆ 1 ┆ C │\n", + "└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘" ] }, "execution_count": 5, @@ -207,7 +207,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
flagslen
i32u32
033465
133331
233204
" + "shape: (3, 2)
flagslen
i32u32
033381
133169
233450
" ], "text/plain": [ "shape: (3, 2)\n", @@ -216,9 +216,9 @@ "│ --- ┆ --- │\n", "│ i32 ┆ u32 │\n", "╞═══════╪═══════╡\n", - "│ 0 ┆ 33465 │\n", - "│ 1 ┆ 33331 │\n", - "│ 2 ┆ 33204 │\n", + "│ 0 ┆ 33381 │\n", + "│ 1 ┆ 33169 │\n", + "│ 2 ┆ 33450 │\n", "└───────┴───────┘" ] }, @@ -246,7 +246,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
flagslen
i32u32
016732
133331
233204
" + "shape: (3, 2)
flagslen
i32u32
016690
133169
233450
" ], "text/plain": [ "shape: (3, 2)\n", @@ -255,9 +255,9 @@ "│ --- ┆ --- │\n", "│ i32 ┆ u32 │\n", "╞═══════╪═══════╡\n", - "│ 0 ┆ 16732 │\n", - "│ 1 ┆ 33331 │\n", - "│ 2 ┆ 33204 │\n", + "│ 0 ┆ 16690 │\n", + "│ 1 ┆ 33169 │\n", + "│ 2 ┆ 33450 │\n", "└───────┴───────┘" ] }, @@ -290,7 +290,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
flagslen
i32u32
016732
19999
213281
" + "shape: (3, 2)
flagslen
i32u32
016690
19950
213380
" ], "text/plain": [ "shape: (3, 2)\n", @@ -299,9 +299,9 @@ "│ --- ┆ --- │\n", "│ i32 ┆ u32 │\n", "╞═══════╪═══════╡\n", - "│ 0 ┆ 16732 │\n", - "│ 1 ┆ 9999 │\n", - "│ 2 ┆ 13281 │\n", + "│ 0 ┆ 16690 │\n", + "│ 1 ┆ 9950 │\n", + "│ 2 ┆ 13380 │\n", "└───────┴───────┘" ] }, @@ -464,7 +464,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (3, 2)
categorylen
stru32
"A"10000
"B"4285
"C"5715
" + "shape: (3, 2)
categorylen
stru32
"A"10000
"B"4220
"C"5780
" ], "text/plain": [ "shape: (3, 2)\n", @@ -474,8 +474,8 @@ "│ str ┆ u32 │\n", "╞══════════╪═══════╡\n", "│ A ┆ 10000 │\n", - "│ B ┆ 4285 │\n", - "│ C ┆ 5715 │\n", + "│ B ┆ 4220 │\n", + "│ C ┆ 5780 │\n", "└──────────┴───────┘" ] }, @@ -509,7 +509,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (9, 3)
categoryflagslen
stri32u32
"A"09960
"A"19960
"A"29960
"B"09962
"B"19962
"B"29962
"C"013223
"C"113223
"C"213223
" + "shape: (9, 3)
categoryflagslen
stri32u32
"A"09917
"A"19917
"A"29917
"B"09848
"B"19848
"B"29848
"C"013262
"C"113262
"C"213262
" ], "text/plain": [ "shape: (9, 3)\n", @@ -518,15 +518,15 @@ "│ --- ┆ --- ┆ --- │\n", "│ str ┆ i32 ┆ u32 │\n", "╞══════════╪═══════╪═══════╡\n", - "│ A ┆ 0 ┆ 9960 │\n", - "│ A ┆ 1 ┆ 9960 │\n", - "│ A ┆ 2 ┆ 9960 │\n", - "│ B ┆ 0 ┆ 9962 │\n", - "│ B ┆ 1 ┆ 9962 │\n", - "│ B ┆ 2 ┆ 9962 │\n", - "│ C ┆ 0 ┆ 13223 │\n", - "│ C ┆ 1 ┆ 13223 │\n", - "│ C ┆ 2 ┆ 13223 │\n", + "│ A ┆ 0 ┆ 9917 │\n", + "│ A ┆ 1 ┆ 9917 │\n", + "│ A ┆ 2 ┆ 9917 │\n", + "│ B ┆ 0 ┆ 9848 │\n", + "│ B ┆ 1 ┆ 9848 │\n", + "│ B ┆ 2 ┆ 9848 │\n", + "│ C ┆ 0 ┆ 13262 │\n", + "│ C ┆ 1 ┆ 13262 │\n", + "│ C ┆ 2 ┆ 13262 │\n", "└──────────┴───────┴───────┘" ] }, @@ -561,7 +561,7 @@ " white-space: pre-wrap;\n", "}\n", "\n", - "shape: (9, 3)
categoryflagslen
stri32u32
"A"09960
"A"19960
"A"29960
"B"09962
"B"19962
"B"29962
"C"010000
"C"110000
"C"210000
" + "shape: (9, 3)
categoryflagslen
stri32u32
"A"09917
"A"19917
"A"29917
"B"09848
"B"19848
"B"29848
"C"010000
"C"110000
"C"210000
" ], "text/plain": [ "shape: (9, 3)\n", @@ -570,12 +570,12 @@ "│ --- ┆ --- ┆ --- │\n", "│ str ┆ i32 ┆ u32 │\n", "╞══════════╪═══════╪═══════╡\n", - "│ A ┆ 0 ┆ 9960 │\n", - "│ A ┆ 1 ┆ 9960 │\n", - "│ A ┆ 2 ┆ 9960 │\n", - "│ B ┆ 0 ┆ 9962 │\n", - "│ B ┆ 1 ┆ 9962 │\n", - "│ B ┆ 2 ┆ 9962 │\n", + "│ A ┆ 0 ┆ 9917 │\n", + "│ A ┆ 1 ┆ 9917 │\n", + "│ A ┆ 2 ┆ 9917 │\n", + "│ B ┆ 0 ┆ 9848 │\n", + "│ B ┆ 1 ┆ 9848 │\n", + "│ B ┆ 2 ┆ 9848 │\n", "│ C ┆ 0 ┆ 10000 │\n", "│ C ┆ 1 ┆ 10000 │\n", "│ C ┆ 2 ┆ 10000 │\n", diff --git a/pyproject.toml b/pyproject.toml index 3eea4807..28986d28 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "maturin" [project] name = "polars_ds" requires-python = ">=3.9" -version = "0.6.3" +version = "0.7.0" license = { file = "LICENSE.txt" } classifiers = [ diff --git a/python/polars_ds/__init__.py b/python/polars_ds/__init__.py index e3f2a89f..b1b19c80 100644 --- a/python/polars_ds/__init__.py +++ b/python/polars_ds/__init__.py @@ -10,7 +10,7 @@ from polars_ds.expr_knn import * # noqa: F403 from polars_ds.expr_linear import * # noqa: F403 -__version__ = "0.6.3" +__version__ = "0.7.0" def frame(size: int = 2_000, index_name: str = "row_num") -> pl.DataFrame: """ diff --git a/python/polars_ds/pipeline.py b/python/polars_ds/pipeline.py index 2267847b..9a4708cb 100644 --- a/python/polars_ds/pipeline.py +++ b/python/polars_ds/pipeline.py @@ -67,24 +67,27 @@ def __iter__(self): @dataclass class FitStep: # Not a FittedStep func: FitTransformFunc - cols: IntoExprColumn + cols: IntoExprColumn | None exclude: List[str] # Here we allow IntoExprColumn as input so that users can use selectors, or other polars expressions # to specify input columns, which adds flexibility. # We still need real column names so that the functions in transforms.py will work. 
def fit(self, df: PolarsFrame) -> ExprTransform: - if _IS_POLARS_V1: - real_cols: List[str] = [ - x - for x in df.lazy().select(self.cols).collect_schema().names() - if x not in self.exclude - ] + if self.cols is None: + return self.func(df) else: - real_cols: List[str] = [ - x for x in df.select(self.cols).columns if x not in self.exclude - ] - return self.func(df, real_cols) + if _IS_POLARS_V1: + real_cols: List[str] = [ + x + for x in df.lazy().select(self.cols).collect_schema().names() + if x not in self.exclude + ] + else: + real_cols: List[str] = [ + x for x in df.select(self.cols).columns if x not in self.exclude + ] + return self.func(df, real_cols) Step: TypeAlias = Union[FitStep, SelectStep, WithColumnsStep, FilterStep, SQLStep] @@ -480,7 +483,7 @@ def _get_target(self, target: str | pl.Expr | None = None) -> str | pl.Expr: # self._steps = [deepcopy(s) for s in self._steps] # return self - def filter(self, *by: str | pl.Expr, all_: bool = True) -> Self: + def filter(self, by: str | pl.Expr) -> Self: """ Filters on the dataframe using native polars expressions or SQL boolean expressions. @@ -488,14 +491,8 @@ def filter(self, *by: str | pl.Expr, all_: bool = True) -> Self: ---------- by Native polars boolean expression or SQL strings - all_ - Whether all conditions should be met by all or any (all = False). """ - exprs = [s if isinstance(s, pl.Expr) else pl.sql_expr(s) for s in by] - if all_: - self._steps.append(FilterStep(pl.all_horizontal(exprs))) - else: - self._steps.append(FilterStep(pl.any_horizontal(exprs))) + self._steps.append(FilterStep(by if isinstance(by, pl.Expr) else pl.sql_expr(by))) return self def sql_transform(self, sql: str) -> Self: @@ -514,6 +511,13 @@ def sql_transform(self, sql: str) -> Self: self._steps.append(SQLStep(sql)) return self + def cast_bools(self, to: pl.DataType = pl.UInt8) -> Self: + """ + Cast all boolean columns in the dataframe to the given type. 
+ """ + self._steps.append(WithColumnsStep(cs.boolean().cast(to))) + return self + def impute(self, cols: IntoExprColumn, method: SimpleImputeMethod = "mean") -> Self: """ Imputes null values in the given columns. Note: this doesn't fill NaN. If filling for NaN is needed, @@ -530,6 +534,27 @@ def impute(self, cols: IntoExprColumn, method: SimpleImputeMethod = "mean") -> S self._steps.append(FitStep(partial(t.impute, method=method), cols, self.exclude)) return self + def conditional_impute( + self, + rules_dict: Dict[str, str | pl.Expr], + method:SimpleImputeMethod = "mean" + ) -> Self: + """ + Conditionally imputes values in the given columns. This transform will collect if input is lazy. + + Parameters + ---------- + rules_dict + Dictionary where keys are column names (must be string), and values are SQL/Polars Conditions + that when true, those values in the column will be imputed, + and the value to impute will be learned on the data where the condition is false. + method + One of `mean`, `median`, `mode`. If `mode`, a random value will be chosen if there is + a tie. + """ + self._steps.append(FitStep(partial(t.conditional_impute, rules_dict=rules_dict, method=method), None, self.exclude)) + return self + def nan_to_null(self) -> Self: """ Maps NaN values in all columns to null. diff --git a/python/polars_ds/sample_and_split.py b/python/polars_ds/sample_and_split.py index f06182be..795376ec 100644 --- a/python/polars_ds/sample_and_split.py +++ b/python/polars_ds/sample_and_split.py @@ -3,6 +3,7 @@ import polars as pl import random import math +from ._utils import _IS_POLARS_V1 from .typing import PolarsFrame from typing import List, Tuple from itertools import combinations, islice @@ -153,19 +154,19 @@ def downsample( def random_cols( - df: PolarsFrame, + all_columns:List[str], k: int, keep: List[str] | None = None, seed: int | None = None, ) -> List[str]: """ - Selects random columns in the dataframe. Returns the selected columns in a list. 
Note, it is - impossible for this to randomly select both ["x", "y"] and ["y", "x"]. + Selects random columns from the given pool of columns. Returns the selected columns in a list. + Note, it is impossible for this to randomly select both ["x", "y"] and ["y", "x"]. Parameters ---------- - df - Either a lazy or eager Polars dataframe + all_columns + All column names k Select k random columns from all columns outside of `keep`. keep @@ -175,12 +176,12 @@ def random_cols( """ if keep is None: out = [] - to_sample = combinations(df.columns, k) + to_sample = combinations(all_columns, k) else: out = keep - to_sample = combinations((c for c in df.columns if c not in keep), k) + to_sample = combinations((c for c in all_columns if c not in keep), k) - pool_size = len(df.columns) - len(out) + pool_size = len(all_columns) - len(out) if pool_size < k: raise ValueError("Not enough columns to select from.") @@ -229,26 +230,29 @@ def split_by_ratio( train = frames[(True,)].select(pl.col("*").exclude(["__id", "__tt"])) test = frames[(False,)].select(pl.col("*").exclude(["__id", "__tt"])) return [train, test] - else: - if sum(split_ratio) != 1: - raise ValueError("Sum of the ratios is not 1.") - - df_eager = ( - df.with_row_index(name="__id") - .with_columns(pl.col("__id").shuffle(seed=seed).alias("__tt")) - .sort("__tt") - .lazy() - .collect() - ) + else: # Should work with iterable (with a length), not just list + if len(split_ratio) == 1: + return split_by_ratio(df, split_ratio[0], seed) + else: + if sum(split_ratio) != 1: + raise ValueError("Sum of the ratios is not 1.") - n = len(df_eager) - start = 0 - dfs = [] - for v in split_ratio: - length = int(n * v) - dfs.append( - df_eager.slice(start, length=length).select(pl.col("*").exclude(["__id", "__tt"])) + df_eager = ( + df.with_row_index(name="__id") + .with_columns(pl.col("__id").shuffle(seed=seed).alias("__tt")) + .sort("__tt") + .lazy() + .collect() ) - start += length - return dfs + n = len(df_eager) + start = 0 + dfs = 
[] + for v in split_ratio: + length = int(n * v) + dfs.append( + df_eager.slice(start, length=length).select(pl.col("*").exclude(["__id", "__tt"])) + ) + start += length + + return dfs diff --git a/python/polars_ds/transforms.py b/python/polars_ds/transforms.py index ed8a6ea8..fa6e8647 100644 --- a/python/polars_ds/transforms.py +++ b/python/polars_ds/transforms.py @@ -46,8 +46,47 @@ def impute(df: PolarsFrame, cols: List[str], method: SimpleImputeMethod = "mean" temp = df.lazy().select(pl.col(cols).mode().list.first()).collect().row(0) return [pl.col(c).fill_null(m) for c, m in zip(cols, temp)] else: - raise ValueError(f"Unknown input method: {method}") + raise ValueError(f"Unknown impute method: `{method}`") + +def conditional_impute( + df: PolarsFrame, + rules_dict: Dict[str, str | pl.Expr], + method: SimpleImputeMethod = "mean" +) -> ExprTransform: + """ + Conditionally imputes values in the given columns. This transform will collect if input is lazy. + Parameters + ---------- + df + Either a lazy or an eager dataframe + rules_dict + Dictionary where keys are column names (must be string), and values are SQL/Polars Conditions + that when true, those values in the column will be imputed, + and the value to impute will be learned on the data where the condition is false. + method + One of `mean`, `median`, `mode`. If `mode`, a random value will be chosen if there is + a tie. 
+ """ + rules_dict = { + c: (r if isinstance(r, pl.Expr) else pl.sql_expr(r)) + for c, r in rules_dict.items() + } + cols = list(rules_dict.keys()) + # Learn on the data where the condition is false + if method == "mean": + temp = df.lazy().select( + *(pl.col(c).filter(rules_dict[c].not_()).mean() for c in rules_dict.keys()) + ).collect().row(0) + return [pl.when(rules_dict[c]).then(m).otherwise(pl.col(c)).alias(c) for c, m in zip(cols, temp)] + elif method == "median": + temp = df.lazy().select(*(pl.col(c).filter(rules_dict[c].not_()).median() for c in rules_dict.keys())).collect().row(0) + return [pl.when(rules_dict[c]).then(m).otherwise(pl.col(c)).alias(c) for c, m in zip(cols, temp)] + elif method == "mode": + temp = df.lazy().select(*(pl.col(c).filter(rules_dict[c].not_()).mode().list.first() for c in rules_dict.keys())).collect().row(0) + return [pl.when(rules_dict[c]).then(m).otherwise(pl.col(c)).alias(c) for c, m in zip(cols, temp)] + else: + raise ValueError(f"Unknown impute method: `{method}`") def linear_impute( df: PolarsFrame, features: List[str], target: str | pl.Expr, add_bias: bool = False diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 449ccf2a..bc293424 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -22,6 +22,31 @@ def test_linear_impute(): assert_frame_equal(imputed_c, correct_c) +def test_conditional_impute(): + df = pl.DataFrame({ + "a": [float('nan'), None, float("inf"), 9999, 100, 100, 100, 800], + }) + + res = df.with_columns( + t.conditional_impute( + df, + {"a": ((pl.col("a").is_finite().not_()) | pl.col("a").is_null() | (pl.col("a") > 899))}, + method = "mean" + )[0].alias("result") + )["result"] + + assert list(res)[:4] == [275.0, 275.0, 275.0, 275.0] + + res = df.with_columns( + t.conditional_impute( + df, + {"a": ((pl.col("a").is_finite().not_()) | pl.col("a").is_null() | (pl.col("a") > 899))}, + method = "median" + )[0].alias("result") + )["result"] + + assert list(res)[:4] == [100.0, 
100.0, 100.0, 100.0] + def test_winsorize(): df = pds.frame(size=1000).select(