diff --git a/Cargo.lock b/Cargo.lock
index 3abb6a12..032a6c4c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2457,7 +2457,7 @@ dependencies = [
[[package]]
name = "polars_ds"
-version = "0.6.3"
+version = "0.7.0"
dependencies = [
"ahash",
"approx",
diff --git a/Cargo.toml b/Cargo.toml
index 03c816ad..3a9a39a5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "polars_ds"
-version = "0.6.3"
+version = "0.7.0"
edition = "2021"
[lib]
diff --git a/README.md b/README.md
index 077412aa..0af79fa9 100644
--- a/README.md
+++ b/README.md
@@ -216,7 +216,7 @@ Generally speaking, the more expressions you want to evaluate simultaneously, th
Why does speed matter?
-If your code already executes under 1s, then maybe it doesn't. But as your data grow, having a 5s run vs. a 1s run will make a lot of difference in your iterations for your project. Speed of execution becomes a bigger issues if you are building reports on demand, or if you need to pay extra for additional compute.
+If your code already executes under 1s and you only use your code in non-production, ad-hoc environments, then maybe it doesn't. Even so, as your data grow, having a 5s run vs. a 1s run will make a lot of difference in your iterations for your project. Speed of execution becomes a bigger issue if you are building reports on demand, if you need to pay extra for additional compute, or if you have a production pipeline that has to deliver the data under a time constraint.
## HELP WANTED!
diff --git a/benchmarks/benchmarks.ipynb b/benchmarks/benchmarks.ipynb
index 383f4891..f7d92f7e 100644
--- a/benchmarks/benchmarks.ipynb
+++ b/benchmarks/benchmarks.ipynb
@@ -31,7 +31,7 @@
"\n",
"Comparison: \n",
"\n",
- "Polars + PDS vs Pandas + Sklearn"
+ "Polars + PDS vs. Pandas + Sklearn"
]
},
{
@@ -86,32 +86,32 @@
"
\n",
" \n",
" 0 | \n",
- " 0.413767 | \n",
- " 1 | \n",
+ " 0.621657 | \n",
+ " 0 | \n",
" 2020-01-01 | \n",
"
\n",
" \n",
" 1 | \n",
- " 0.125783 | \n",
+ " 0.502729 | \n",
" 1 | \n",
" 2020-01-02 | \n",
"
\n",
" \n",
" 2 | \n",
- " 0.382943 | \n",
- " 1 | \n",
+ " 0.084236 | \n",
+ " 0 | \n",
" 2020-01-03 | \n",
"
\n",
" \n",
" 3 | \n",
- " 0.690455 | \n",
- " 0 | \n",
+ " 0.818261 | \n",
+ " 1 | \n",
" 2020-01-04 | \n",
"
\n",
" \n",
" 4 | \n",
- " 0.492488 | \n",
- " 0 | \n",
+ " 0.742475 | \n",
+ " 1 | \n",
" 2020-01-05 | \n",
"
\n",
" \n",
@@ -122,31 +122,31 @@
"
\n",
" \n",
" 1731 | \n",
- " 0.365318 | \n",
- " 1 | \n",
+ " 0.225007 | \n",
+ " 0 | \n",
" 2024-09-27 | \n",
"
\n",
" \n",
" 1732 | \n",
- " 0.635105 | \n",
- " 1 | \n",
+ " 0.550625 | \n",
+ " 0 | \n",
" 2024-09-28 | \n",
"
\n",
" \n",
" 1733 | \n",
- " 0.156054 | \n",
+ " 0.351283 | \n",
" 1 | \n",
" 2024-09-29 | \n",
"
\n",
" \n",
" 1734 | \n",
- " 0.736704 | \n",
+ " 0.430682 | \n",
" 1 | \n",
" 2024-09-30 | \n",
"
\n",
" \n",
" 1735 | \n",
- " 0.660525 | \n",
+ " 0.683423 | \n",
" 1 | \n",
" 2024-10-01 | \n",
"
\n",
@@ -157,17 +157,17 @@
],
"text/plain": [
" predicted actual_target dates\n",
- "0 0.413767 1 2020-01-01\n",
- "1 0.125783 1 2020-01-02\n",
- "2 0.382943 1 2020-01-03\n",
- "3 0.690455 0 2020-01-04\n",
- "4 0.492488 0 2020-01-05\n",
+ "0 0.621657 0 2020-01-01\n",
+ "1 0.502729 1 2020-01-02\n",
+ "2 0.084236 0 2020-01-03\n",
+ "3 0.818261 1 2020-01-04\n",
+ "4 0.742475 1 2020-01-05\n",
"... ... ... ...\n",
- "1731 0.365318 1 2024-09-27\n",
- "1732 0.635105 1 2024-09-28\n",
- "1733 0.156054 1 2024-09-29\n",
- "1734 0.736704 1 2024-09-30\n",
- "1735 0.660525 1 2024-10-01\n",
+ "1731 0.225007 0 2024-09-27\n",
+ "1732 0.550625 0 2024-09-28\n",
+ "1733 0.351283 1 2024-09-29\n",
+ "1734 0.430682 1 2024-09-30\n",
+ "1735 0.683423 1 2024-10-01\n",
"\n",
"[1736 rows x 3 columns]"
]
@@ -200,7 +200,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "10.3 ms ± 83 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+ "5.8 ms ± 15.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
@@ -219,14 +219,14 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "2.13 ms ± 67.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+ "1.32 ms ± 1.72 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
@@ -237,43 +237,491 @@
" roc_auc = pds.query_roc_auc(\"actual_target\", \"predicted\"),\n",
" log_loss = pds.query_log_loss(\"actual_target\", \"predicted\")\n",
").sort(\"dates\")\n",
- "# 1/5 of the time, less lines of code + easier to understand syntax"
+ "# 1/4 of the time, fewer lines of code + easier to understand syntax"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Common Traditional ML Pipelines\n",
+ "\n",
+ "Use cases:\n",
+ "\n",
+ "1. Data Transformation before model training\n",
+ "2. Feature Engineering pipelines, etc.\n",
+ "\n",
+ "Comparison: \n",
+ "\n",
+ "Polars + PDS vs. Pandas + Sklearn vs. Polars + Sklearn"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "# A random Dataframe with 50k records\n",
+ "size = 50_000\n",
+ "df_pl = pds.frame(size=size).select(\n",
+ " pds.random(0.0, 1.0).alias(\"x1\"),\n",
+ " pds.random(0.0, 1.0).alias(\"x2\"),\n",
+ " pds.random(0.0, 1.0).alias(\"x3\"),\n",
+ ").with_columns(\n",
+ " x4 = pl.when(pl.col(\"x3\") > 0.3).then(None).otherwise(pl.col(\"x3\")),\n",
+ " x5 = pl.when(pl.col(\"x2\") > 0.5).then(None).otherwise(pl.col(\"x2\")),\n",
+ ")\n",
+ "df_pd = df_pl.to_pandas()"
+ ]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (10, 5)x1 | x2 | x3 | x4 | x5 |
---|
f64 | f64 | f64 | f64 | f64 |
0.57686 | 0.796951 | 0.479145 | null | null |
0.703758 | 0.815689 | 0.970173 | null | null |
0.330415 | 0.952443 | 0.30547 | null | null |
0.419666 | 0.402172 | 0.65559 | null | 0.402172 |
0.099082 | 0.565292 | 0.715153 | null | null |
0.691535 | 0.297778 | 0.752498 | null | 0.297778 |
0.923842 | 0.509301 | 0.976943 | null | null |
0.70676 | 0.895296 | 0.773036 | null | null |
0.151706 | 0.345859 | 0.892369 | null | 0.345859 |
0.201388 | 0.746721 | 0.885525 | null | null |
"
+ ],
+ "text/plain": [
+ "shape: (10, 5)\n",
+ "┌──────────┬──────────┬──────────┬──────┬──────────┐\n",
+ "│ x1 ┆ x2 ┆ x3 ┆ x4 ┆ x5 │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
+ "╞══════════╪══════════╪══════════╪══════╪══════════╡\n",
+ "│ 0.57686 ┆ 0.796951 ┆ 0.479145 ┆ null ┆ null │\n",
+ "│ 0.703758 ┆ 0.815689 ┆ 0.970173 ┆ null ┆ null │\n",
+ "│ 0.330415 ┆ 0.952443 ┆ 0.30547 ┆ null ┆ null │\n",
+ "│ 0.419666 ┆ 0.402172 ┆ 0.65559 ┆ null ┆ 0.402172 │\n",
+ "│ 0.099082 ┆ 0.565292 ┆ 0.715153 ┆ null ┆ null │\n",
+ "│ 0.691535 ┆ 0.297778 ┆ 0.752498 ┆ null ┆ 0.297778 │\n",
+ "│ 0.923842 ┆ 0.509301 ┆ 0.976943 ┆ null ┆ null │\n",
+ "│ 0.70676 ┆ 0.895296 ┆ 0.773036 ┆ null ┆ null │\n",
+ "│ 0.151706 ┆ 0.345859 ┆ 0.892369 ┆ null ┆ 0.345859 │\n",
+ "│ 0.201388 ┆ 0.746721 ┆ 0.885525 ┆ null ┆ null │\n",
+ "└──────────┴──────────┴──────────┴──────┴──────────┘"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_pl.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Pandas + Sklearn"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.compose import ColumnTransformer"
+ ]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "from sklearn import set_config\n",
+ "set_config(transform_output=\"pandas\")"
+ ]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "impute_step = ColumnTransformer(\n",
+ " [(\"MedianImputer1\", SimpleImputer(strategy=\"median\"), [3]),\n",
+ " (\"MedianImputer2\", SimpleImputer(strategy=\"median\"), [4])],\n",
+ " remainder = \"passthrough\",\n",
+ " verbose_feature_names_out = False,\n",
+ ")\n",
+ "\n",
+ "pipe = Pipeline(steps = [\n",
+ " (\"Imputer\", impute_step), # impute only column 3 and 4\n",
+ " (\"StandardScaler\", StandardScaler()), # Scale all columns\n",
+ "])"
+ ]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " x1 | \n",
+ " x2 | \n",
+ " x3 | \n",
+ " x4 | \n",
+ " x5 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.258282 | \n",
+ " 1.036672 | \n",
+ " -0.064459 | \n",
+ " -0.003164 | \n",
+ " -0.005449 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.698742 | \n",
+ " 1.101685 | \n",
+ " 1.633356 | \n",
+ " -0.003164 | \n",
+ " -0.005449 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " -0.597123 | \n",
+ " 1.576182 | \n",
+ " -0.664973 | \n",
+ " -0.003164 | \n",
+ " -0.005449 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " -0.287334 | \n",
+ " -0.333094 | \n",
+ " 0.545629 | \n",
+ " -0.003164 | \n",
+ " 1.495760 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " -1.400074 | \n",
+ " 0.232884 | \n",
+ " 0.751579 | \n",
+ " -0.003164 | \n",
+ " -0.005449 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 0.656316 | \n",
+ " -0.695313 | \n",
+ " 0.880706 | \n",
+ " -0.003164 | \n",
+ " 0.476784 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 1.462646 | \n",
+ " 0.038612 | \n",
+ " 1.656764 | \n",
+ " -0.003164 | \n",
+ " -0.005449 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 0.709161 | \n",
+ " 1.377900 | \n",
+ " 0.951719 | \n",
+ " -0.003164 | \n",
+ " -0.005449 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " -1.217417 | \n",
+ " -0.528484 | \n",
+ " 1.364335 | \n",
+ " -0.003164 | \n",
+ " 0.946099 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " -1.044971 | \n",
+ " 0.862388 | \n",
+ " 1.340669 | \n",
+ " -0.003164 | \n",
+ " -0.005449 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " x1 x2 x3 x4 x5\n",
+ "0 0.258282 1.036672 -0.064459 -0.003164 -0.005449\n",
+ "1 0.698742 1.101685 1.633356 -0.003164 -0.005449\n",
+ "2 -0.597123 1.576182 -0.664973 -0.003164 -0.005449\n",
+ "3 -0.287334 -0.333094 0.545629 -0.003164 1.495760\n",
+ "4 -1.400074 0.232884 0.751579 -0.003164 -0.005449\n",
+ "5 0.656316 -0.695313 0.880706 -0.003164 0.476784\n",
+ "6 1.462646 0.038612 1.656764 -0.003164 -0.005449\n",
+ "7 0.709161 1.377900 0.951719 -0.003164 -0.005449\n",
+ "8 -1.217417 -0.528484 1.364335 -0.003164 0.946099\n",
+ "9 -1.044971 0.862388 1.340669 -0.003164 -0.005449"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipe.fit_transform(df_pd)[[\"x1\", \"x2\", \"x3\", \"x4\", \"x5\"]].head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "8.66 ms ± 20.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%timeit\n",
+ "pipe.fit_transform(df_pd)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Polars + Sklearn"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "from sklearn import set_config\n",
+ "set_config(transform_output=\"polars\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (10, 5)x1 | x2 | x3 | x4 | x5 |
---|
f64 | f64 | f64 | f64 | f64 |
0.258282 | 1.036672 | -0.064459 | -0.003164 | -0.005449 |
0.698742 | 1.101685 | 1.633356 | -0.003164 | -0.005449 |
-0.597123 | 1.576182 | -0.664973 | -0.003164 | -0.005449 |
-0.287334 | -0.333094 | 0.545629 | -0.003164 | 1.49576 |
-1.400074 | 0.232884 | 0.751579 | -0.003164 | -0.005449 |
0.656316 | -0.695313 | 0.880706 | -0.003164 | 0.476784 |
1.462646 | 0.038612 | 1.656764 | -0.003164 | -0.005449 |
0.709161 | 1.3779 | 0.951719 | -0.003164 | -0.005449 |
-1.217417 | -0.528484 | 1.364335 | -0.003164 | 0.946099 |
-1.044971 | 0.862388 | 1.340669 | -0.003164 | -0.005449 |
"
+ ],
+ "text/plain": [
+ "shape: (10, 5)\n",
+ "┌───────────┬───────────┬───────────┬───────────┬───────────┐\n",
+ "│ x1 ┆ x2 ┆ x3 ┆ x4 ┆ x5 │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
+ "╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡\n",
+ "│ 0.258282 ┆ 1.036672 ┆ -0.064459 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ 0.698742 ┆ 1.101685 ┆ 1.633356 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ -0.597123 ┆ 1.576182 ┆ -0.664973 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ -0.287334 ┆ -0.333094 ┆ 0.545629 ┆ -0.003164 ┆ 1.49576 │\n",
+ "│ -1.400074 ┆ 0.232884 ┆ 0.751579 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ 0.656316 ┆ -0.695313 ┆ 0.880706 ┆ -0.003164 ┆ 0.476784 │\n",
+ "│ 1.462646 ┆ 0.038612 ┆ 1.656764 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ 0.709161 ┆ 1.3779 ┆ 0.951719 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ -1.217417 ┆ -0.528484 ┆ 1.364335 ┆ -0.003164 ┆ 0.946099 │\n",
+ "│ -1.044971 ┆ 0.862388 ┆ 1.340669 ┆ -0.003164 ┆ -0.005449 │\n",
+ "└───────────┴───────────┴───────────┴───────────┴───────────┘"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pipe.fit_transform(df_pl).select([\"x1\", \"x2\", \"x3\", \"x4\", \"x5\"]).head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "7.35 ms ± 51.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%timeit\n",
+ "pipe.fit_transform(df_pl)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# If you use sklearn, there is not a lot of time difference because the underlying engine\n",
+ "# is not parallel (there are options but they don't work properly on Linux, which is basically\n",
+ "# all cloud compute nowadays.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Polars + Polars DS "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from polars_ds.pipeline import Pipeline, Blueprint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (10, 5)x1 | x2 | x3 | x4 | x5 |
---|
f64 | f64 | f64 | f64 | f64 |
0.258282 | 1.036672 | -0.064459 | -0.003164 | -0.005449 |
0.698742 | 1.101685 | 1.633356 | -0.003164 | -0.005449 |
-0.597123 | 1.576182 | -0.664973 | -0.003164 | -0.005449 |
-0.287334 | -0.333094 | 0.545629 | -0.003164 | 1.49576 |
-1.400074 | 0.232884 | 0.751579 | -0.003164 | -0.005449 |
0.656316 | -0.695313 | 0.880706 | -0.003164 | 0.476784 |
1.462646 | 0.038612 | 1.656764 | -0.003164 | -0.005449 |
0.709161 | 1.3779 | 0.951719 | -0.003164 | -0.005449 |
-1.217417 | -0.528484 | 1.364335 | -0.003164 | 0.946099 |
-1.044971 | 0.862388 | 1.340669 | -0.003164 | -0.005449 |
"
+ ],
+ "text/plain": [
+ "shape: (10, 5)\n",
+ "┌───────────┬───────────┬───────────┬───────────┬───────────┐\n",
+ "│ x1 ┆ x2 ┆ x3 ┆ x4 ┆ x5 │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
+ "╞═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡\n",
+ "│ 0.258282 ┆ 1.036672 ┆ -0.064459 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ 0.698742 ┆ 1.101685 ┆ 1.633356 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ -0.597123 ┆ 1.576182 ┆ -0.664973 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ -0.287334 ┆ -0.333094 ┆ 0.545629 ┆ -0.003164 ┆ 1.49576 │\n",
+ "│ -1.400074 ┆ 0.232884 ┆ 0.751579 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ 0.656316 ┆ -0.695313 ┆ 0.880706 ┆ -0.003164 ┆ 0.476784 │\n",
+ "│ 1.462646 ┆ 0.038612 ┆ 1.656764 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ 0.709161 ┆ 1.3779 ┆ 0.951719 ┆ -0.003164 ┆ -0.005449 │\n",
+ "│ -1.217417 ┆ -0.528484 ┆ 1.364335 ┆ -0.003164 ┆ 0.946099 │\n",
+ "│ -1.044971 ┆ 0.862388 ┆ 1.340669 ┆ -0.003164 ┆ -0.005449 │\n",
+ "└───────────┴───────────┴───────────┴───────────┴───────────┘"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "bp = (\n",
+ " Blueprint(df_pl, name = \"example_pipeline\") \n",
+ " .impute([\"x4\", \"x5\"], method = \"median\")\n",
+ " .scale(pl.all(), method = \"standard\")\n",
+ ")\n",
+ "\n",
+ "pipe = bp.materialize() # bp.fit() also works\n",
+ "pipe.transform(df_pl).head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "750 μs ± 1.31 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%timeit\n",
+ "pipe = bp.materialize() # bp.fit() also works\n",
+ "pipe.transform(df_pl)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The reason for this incredible speedup is:\n",
+ "# 1. PDS runs natively in Polars, which means free parallelization\n",
+ "# 2. Impute, despite being a very common data transformation, is very slow in Sklearn\n",
+ "# but is extremely fast in Polars. (This is because SimpleImputer uses NumPy Array to run imputation,\n",
+ "# while Polars uses ChunkedArray which has tiny overhead when it comes to finding and filling nulls.)"
+ ]
},
{
"cell_type": "code",
diff --git a/benchmarks/linear_regression.ipynb b/benchmarks/linear_regression.ipynb
index 7aa62f58..32db65d8 100644
--- a/benchmarks/linear_regression.ipynb
+++ b/benchmarks/linear_regression.ipynb
@@ -2,9 +2,17 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.7.0\n"
+ ]
+ }
+ ],
"source": [
"import polars as pl\n",
"import pandas as pd\n",
@@ -16,12 +24,44 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 8)x1 | x2 | x3 | x4 | x5 | code | id | y |
---|
f64 | f64 | f64 | f64 | f64 | i32 | i64 | f64 |
0.023153 | 0.218893 | 0.165474 | 0.065297 | 0.437636 | 1 | 0 | -0.002283 |
0.212167 | 0.821121 | 0.726689 | 0.484775 | 0.97551 | 3 | 1 | 0.172509 |
0.587599 | 0.432226 | 0.825491 | 0.14475 | 0.80575 | 1 | 2 | 0.202238 |
0.278052 | 0.547404 | 0.544241 | 0.78111 | 0.119928 | 3 | 3 | 0.334958 |
0.65751 | 0.111454 | 0.767859 | 0.661847 | 0.278934 | 2 | 4 | 0.337549 |
"
+ ],
+ "text/plain": [
+ "shape: (5, 8)\n",
+ "┌──────────┬──────────┬──────────┬──────────┬──────────┬──────┬─────┬───────────┐\n",
+ "│ x1 ┆ x2 ┆ x3 ┆ x4 ┆ x5 ┆ code ┆ id ┆ y │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ i64 ┆ f64 │\n",
+ "╞══════════╪══════════╪══════════╪══════════╪══════════╪══════╪═════╪═══════════╡\n",
+ "│ 0.023153 ┆ 0.218893 ┆ 0.165474 ┆ 0.065297 ┆ 0.437636 ┆ 1 ┆ 0 ┆ -0.002283 │\n",
+ "│ 0.212167 ┆ 0.821121 ┆ 0.726689 ┆ 0.484775 ┆ 0.97551 ┆ 3 ┆ 1 ┆ 0.172509 │\n",
+ "│ 0.587599 ┆ 0.432226 ┆ 0.825491 ┆ 0.14475 ┆ 0.80575 ┆ 1 ┆ 2 ┆ 0.202238 │\n",
+ "│ 0.278052 ┆ 0.547404 ┆ 0.544241 ┆ 0.78111 ┆ 0.119928 ┆ 3 ┆ 3 ┆ 0.334958 │\n",
+ "│ 0.65751 ┆ 0.111454 ┆ 0.767859 ┆ 0.661847 ┆ 0.278934 ┆ 2 ┆ 4 ┆ 0.337549 │\n",
+ "└──────────┴──────────┴──────────┴──────────┴──────────┴──────┴─────┴───────────┘"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"size = 50_000\n",
- "df = pds.random_data(size=size, n_cols=0).select(\n",
+ "df = pds.frame(size=size).select(\n",
" pds.random(0.0, 1.0).alias(\"x1\"),\n",
" pds.random(0.0, 1.0).alias(\"x2\"),\n",
" pds.random(0.0, 1.0).alias(\"x3\"),\n",
@@ -37,7 +77,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -56,16 +96,35 @@
"\n",
"I did not invent any of the algorithms that solves the linear regression problem. Not did I make any improvement to existing algorithms. I only rewrote them in Rust, using Faer, and brought the algorithms alive with Polars.\n",
"\n",
- "1. Polars DS In-DataFrame Linear Regression vs. Polars DS + NumPy LinearRegression vs. Scikit learn + NumPy LinearRegression\n",
- "2. Polars DS In-DataFrame Ridge Regression vs. Polars DS + NumPy LinearRegression vs. Scikit learn + NumPy Ridge\n",
- "3. Polars DS In-DataFrame Lasso Regression vs. Polars DS + NumPy LinearRegression vs. Scikit learn + NumPy Lasso"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "1. Polars DS In-DataFrame Linear Regression vs. Polars DS + NumPy LinearRegression vs. Scikit learn + NumPy LinearRegression"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Polars DS: shape: (5,)\n",
+ "Series: '' [f64]\n",
+ "[\n",
+ "\t0.500019\n",
+ "\t0.250019\n",
+ "\t-0.149981\n",
+ "\t0.200018\n",
+ "\t-0.129981\n",
+ "]\n",
+ "PDS LR: \n",
+ "Sklearn: [[ 0.50001865 0.25001948 -0.14998127 0.20001819 -0.12998134]]\n"
+ ]
+ }
+ ],
"source": [
"# Polars DS way\n",
"print(\n",
@@ -74,7 +133,6 @@
" pds.lin_reg(\n",
" \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
" target = \"y\",\n",
- " method = \"normal\",\n",
" )\n",
" ).item(0, 0)\n",
")\n",
@@ -82,8 +140,9 @@
"# Fit is done implicitly because X and y are passed at initialization\n",
"# You can also don't put X and y here and do a lr.fit(X,y) later.\n",
"lr = pds_linear.LR(\n",
- " X=X, y=y, add_bias=False, method=\"normal\"\n",
+ " fit_bias=False\n",
") \n",
+ "lr.fit(X, y)\n",
"print(\"PDS LR: \", lr.coeffs)\n",
"\n",
"# Sklearn\n",
@@ -94,236 +153,67 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "787 μs ± 10.3 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
+ ]
+ }
+ ],
"source": [
"%%timeit \n",
"df.select(\n",
" pds.lin_reg(\n",
" \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
" target = \"y\",\n",
- " method = \"normal\",\n",
" )\n",
")"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "631 μs ± 1.89 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
+ ]
+ }
+ ],
"source": [
"%%timeit\n",
"lr = pds_linear.LR(\n",
- " add_bias=False, method=\"normal\"\n",
+ " fit_bias=False,\n",
")\n",
"lr.fit(X, y)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1.42 ms ± 2.87 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
+ ]
+ }
+ ],
"source": [
"%%timeit\n",
"reg = LinearRegression(fit_intercept=False, copy_X=False)\n",
"reg.fit(X, y)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Polars DS way\n",
- "print(\n",
- " \"Polars DS: \",\n",
- " df.select(\n",
- " pds.lin_reg(\n",
- " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
- " target = \"y\",\n",
- " method = \"l1\",\n",
- " l1_reg = 0.1\n",
- " )\n",
- " ).item(0, 0)\n",
- ")\n",
- "\n",
- "# Fit is done implicitly because X and y are passed at initialization\n",
- "# You can also don't put X and y here and do a lr.fit(X,y) later.\n",
- "lr = pds_linear.LR(\n",
- " X=X, y=y, add_bias=False, method=\"l1\", lambda_ = 0.1,\n",
- ") \n",
- "print(\"PDS LR: \", lr.coeffs)\n",
- "\n",
- "# Sklearn\n",
- "reg = Lasso(alpha = 0.1, fit_intercept=False)\n",
- "reg.fit(X, y)\n",
- "print(\"Sklearn: \", reg.coef_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%timeit\n",
- "df.select(\n",
- " pds.lin_reg(\n",
- " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
- " target = \"y\",\n",
- " method = \"l1\",\n",
- " l1_reg = 0.1\n",
- " )\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%timeit\n",
- "lr = pds_linear.LR(\n",
- " add_bias=False, method=\"l1\", lambda_=0.1\n",
- ") \n",
- "# This is faster than the in-dataframe ver because this uses NumPy data directly, which skips a copy.\n",
- "# This is faster than sklearn because the underlying linalg library is faster. The convergence criterion is also simpler, though \n",
- "# less rigourous, than sklearn's. However, you can set tol = 1e-7 and still be faster.\n",
- "lr.fit(X, y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%timeit\n",
- "reg = Lasso(alpha = 0.1, fit_intercept=False, copy_X=False)\n",
- "reg.fit(X, y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Polars DS way\n",
- "print(\n",
- " \"Polars DS: \",\n",
- " df.select(\n",
- " pds.lin_reg(\n",
- " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
- " target = \"y\",\n",
- " method = \"l2\",\n",
- " l2_reg = 0.1\n",
- " )\n",
- " ).item(0, 0)\n",
- ")\n",
- "\n",
- "# Fit is done implicitly because X and y are passed at initialization\n",
- "# You can also don't put X and y here and do a lr.fit(X,y) later.\n",
- "lr = pds_linear.LR(\n",
- " X=X, y=y, add_bias=False, method=\"l2\", lambda_ = 0.1,\n",
- ") \n",
- "print(\"PDS LR: \", lr.coeffs)\n",
- "\n",
- "# Sklearn\n",
- "reg = Ridge(alpha = 0.1, fit_intercept=False)\n",
- "reg.fit(X, y)\n",
- "print(\"Sklearn: \", reg.coef_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%timeit\n",
- "df.select(\n",
- " pds.lin_reg(\n",
- " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
- " target = \"y\",\n",
- " method = \"l2\",\n",
- " l2_reg = 0.1\n",
- " )\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%timeit\n",
- "lr = pds_linear.LR(\n",
- " add_bias=False, method=\"l2\", lambda_=0.1\n",
- ") \n",
- "lr.fit(X, y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%timeit\n",
- "reg = Ridge(alpha = 0.1, fit_intercept=False, copy_X=False)\n",
- "reg.fit(X, y)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# What you can do with Polars DS but will be hard for Scikit-learn"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Train a linear regression model on each category. And return the predictions\n",
- "df.select(\n",
- " pl.col(\"id\"),\n",
- " pds.lin_reg(\n",
- " \"x1\", \"x2\", \"x3\", \"x4\", \"x5\",\n",
- " target = \"y\",\n",
- " method = \"l2\",\n",
- " l2_reg = 0.1,\n",
- " return_pred = True\n",
- " ).over(\"code\").alias(\"predictions\")\n",
- ").unnest(\"predictions\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Train a linear regression model on each category. And return only the coefficients\n",
- "df.group_by(\"code\").agg(\n",
- " pds.lin_reg(\n",
- " \"x1\", \"x2\", \"x3\",\n",
- " target = \"y\",\n",
- " method = \"l2\",\n",
- " l2_reg = 0.1,\n",
- " )\n",
- ").sort(\"code\")"
- ]
- },
{
"cell_type": "code",
"execution_count": null,
@@ -339,8 +229,16 @@
"name": "python3"
},
"language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
"name": "python",
- "version": "3.11.8"
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
}
},
"nbformat": 4,
diff --git a/docs/index.md b/docs/index.md
index a879b90a..de3bd728 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -205,7 +205,7 @@ Generally speaking, the more expressions you want to evaluate simultaneously, th
Why does speed matter?
-If your code already executes under 1s, then maybe it doesn't. But as your data grow, having a 5s run vs. a 1s run will make a lot of difference in your iterations for your project. Speed of execution becomes a bigger issues if you are building reports on demand, or if you need to pay extra for additional compute.
+If your code already executes under 1s and you only use your code in non-production, ad-hoc environments, then maybe it doesn't. Even so, as your data grow, having a 5s run vs. a 1s run will make a lot of difference in your iterations for your project. Speed of execution becomes a bigger issue if you are building reports on demand, if you need to pay extra for additional compute, or if you have a production pipeline that has to deliver the data under a time constraint.
## HELP WANTED!
diff --git a/examples/basics.ipynb b/examples/basics.ipynb
index 50efabcc..00ba148c 100644
--- a/examples/basics.ipynb
+++ b/examples/basics.ipynb
@@ -46,21 +46,21 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 13)f | time_idx | dummy | actual | predicted | dummy_groups | x1 | x2 | x3 | a | b | y | y2 |
---|
f64 | i64 | str | i32 | f64 | str | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
0.0 | 0 | "a" | 0 | 0.091583 | "a" | 0.027516 | 0.025068 | 0.073583 | 0.610364 | 0.579474 | -0.098684 | 0.007555 |
0.841471 | 1 | "a" | 1 | 0.585465 | "a" | 0.947079 | 0.917548 | 0.539384 | 0.248219 | 0.909225 | -0.391686 | 0.482099 |
0.909297 | 2 | "a" | 1 | 0.098363 | "a" | 0.198841 | 0.113598 | 0.866751 | 0.52313 | 0.392237 | -1.236188 | -0.009659 |
0.14112 | 3 | "a" | 1 | 0.03237 | "a" | 0.443498 | 0.208141 | 0.137899 | 0.351743 | 0.354237 | -0.077879 | 0.137624 |
-0.756802 | 4 | "a" | 1 | 0.324095 | "a" | 0.804234 | 0.103371 | 0.885442 | 0.413473 | 0.870453 | -1.176478 | 0.062609 |
"
+ "shape: (5, 13)f | time_idx | dummy | actual | predicted | dummy_groups | x1 | x2 | x3 | a | b | y | y2 |
---|
f64 | i64 | str | i32 | f64 | str | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
0.0 | 0 | "a" | 1 | 0.836972 | "a" | 0.181184 | 0.833164 | 0.242837 | 0.963357 | 0.645792 | -0.087106 | 0.374231 |
0.841471 | 1 | "a" | 0 | 0.160224 | "a" | 0.716097 | 0.7239 | 0.006144 | 0.523703 | 0.308267 | 0.315413 | 0.418257 |
0.909297 | 2 | "a" | 0 | 0.289834 | "a" | 0.803562 | 0.138619 | 0.399191 | 0.480838 | 0.035734 | -0.436603 | 0.127021 |
0.14112 | 3 | "a" | 0 | 0.192884 | "a" | 0.924235 | 0.08284 | 0.071727 | 0.854051 | 0.943042 | 0.055943 | 0.150295 |
-0.756802 | 4 | "a" | 0 | 0.370113 | "a" | 0.460823 | 0.085475 | 0.967126 | 0.965046 | 0.556006 | -1.355857 | 0.00166 |
"
],
"text/plain": [
"shape: (5, 13)\n",
- "┌───────────┬──────────┬───────┬────────┬───┬──────────┬──────────┬───────────┬───────────┐\n",
- "│ f ┆ time_idx ┆ dummy ┆ actual ┆ … ┆ a ┆ b ┆ y ┆ y2 │\n",
- "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
- "│ f64 ┆ i64 ┆ str ┆ i32 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
- "╞═══════════╪══════════╪═══════╪════════╪═══╪══════════╪══════════╪═══════════╪═══════════╡\n",
- "│ 0.0 ┆ 0 ┆ a ┆ 0 ┆ … ┆ 0.610364 ┆ 0.579474 ┆ -0.098684 ┆ 0.007555 │\n",
- "│ 0.841471 ┆ 1 ┆ a ┆ 1 ┆ … ┆ 0.248219 ┆ 0.909225 ┆ -0.391686 ┆ 0.482099 │\n",
- "│ 0.909297 ┆ 2 ┆ a ┆ 1 ┆ … ┆ 0.52313 ┆ 0.392237 ┆ -1.236188 ┆ -0.009659 │\n",
- "│ 0.14112 ┆ 3 ┆ a ┆ 1 ┆ … ┆ 0.351743 ┆ 0.354237 ┆ -0.077879 ┆ 0.137624 │\n",
- "│ -0.756802 ┆ 4 ┆ a ┆ 1 ┆ … ┆ 0.413473 ┆ 0.870453 ┆ -1.176478 ┆ 0.062609 │\n",
- "└───────────┴──────────┴───────┴────────┴───┴──────────┴──────────┴───────────┴───────────┘"
+ "┌───────────┬──────────┬───────┬────────┬───┬──────────┬──────────┬───────────┬──────────┐\n",
+ "│ f ┆ time_idx ┆ dummy ┆ actual ┆ … ┆ a ┆ b ┆ y ┆ y2 │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ f64 ┆ i64 ┆ str ┆ i32 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
+ "╞═══════════╪══════════╪═══════╪════════╪═══╪══════════╪══════════╪═══════════╪══════════╡\n",
+ "│ 0.0 ┆ 0 ┆ a ┆ 1 ┆ … ┆ 0.963357 ┆ 0.645792 ┆ -0.087106 ┆ 0.374231 │\n",
+ "│ 0.841471 ┆ 1 ┆ a ┆ 0 ┆ … ┆ 0.523703 ┆ 0.308267 ┆ 0.315413 ┆ 0.418257 │\n",
+ "│ 0.909297 ┆ 2 ┆ a ┆ 0 ┆ … ┆ 0.480838 ┆ 0.035734 ┆ -0.436603 ┆ 0.127021 │\n",
+ "│ 0.14112 ┆ 3 ┆ a ┆ 0 ┆ … ┆ 0.854051 ┆ 0.943042 ┆ 0.055943 ┆ 0.150295 │\n",
+ "│ -0.756802 ┆ 4 ┆ a ┆ 0 ┆ … ┆ 0.965046 ┆ 0.556006 ┆ -1.355857 ┆ 0.00166 │\n",
+ "└───────────┴──────────┴───────┴────────┴───┴──────────┴──────────┴───────────┴──────────┘"
]
},
"execution_count": 2,
@@ -218,7 +218,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 3)f | a | b |
---|
f64 | f64 | f64 |
1.3944e-15 | -0.610364 | -0.579474 |
-0.841471 | -0.248219 | -0.909225 |
-0.909297 | -0.52313 | -0.392237 |
-0.14112 | -0.351743 | -0.354237 |
0.756802 | 0.196891 | -0.290979 |
"
+ "shape: (5, 3)f | a | b |
---|
f64 | f64 | f64 |
1.3944e-15 | -0.963357 | -0.645792 |
-0.841471 | -0.523703 | -0.308267 |
-0.909297 | -0.480838 | -0.035734 |
-0.14112 | -0.854051 | -0.943042 |
0.756802 | -0.001688 | 0.089786 |
"
],
"text/plain": [
"shape: (5, 3)\n",
@@ -227,11 +227,11 @@
"│ --- ┆ --- ┆ --- │\n",
"│ f64 ┆ f64 ┆ f64 │\n",
"╞════════════╪═══════════╪═══════════╡\n",
- "│ 1.3944e-15 ┆ -0.610364 ┆ -0.579474 │\n",
- "│ -0.841471 ┆ -0.248219 ┆ -0.909225 │\n",
- "│ -0.909297 ┆ -0.52313 ┆ -0.392237 │\n",
- "│ -0.14112 ┆ -0.351743 ┆ -0.354237 │\n",
- "│ 0.756802 ┆ 0.196891 ┆ -0.290979 │\n",
+ "│ 1.3944e-15 ┆ -0.963357 ┆ -0.645792 │\n",
+ "│ -0.841471 ┆ -0.523703 ┆ -0.308267 │\n",
+ "│ -0.909297 ┆ -0.480838 ┆ -0.035734 │\n",
+ "│ -0.14112 ┆ -0.854051 ┆ -0.943042 │\n",
+ "│ 0.756802 ┆ -0.001688 ┆ 0.089786 │\n",
"└────────────┴───────────┴───────────┘"
]
},
@@ -268,17 +268,17 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (1, 1)coeffs |
---|
list[f64] |
[-0.498886, -0.35376] |
"
+ "shape: (1, 1)coeffs |
---|
list[f64] |
[-0.500734, -0.338584] |
"
],
"text/plain": [
"shape: (1, 1)\n",
- "┌───────────────────────┐\n",
- "│ coeffs │\n",
- "│ --- │\n",
- "│ list[f64] │\n",
- "╞═══════════════════════╡\n",
- "│ [-0.498886, -0.35376] │\n",
- "└───────────────────────┘"
+ "┌────────────────────────┐\n",
+ "│ coeffs │\n",
+ "│ --- │\n",
+ "│ list[f64] │\n",
+ "╞════════════════════════╡\n",
+ "│ [-0.500734, -0.338584] │\n",
+ "└────────────────────────┘"
]
},
"execution_count": 7,
@@ -313,17 +313,17 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (1, 2)target_0 | target_1 |
---|
list[f64] | list[f64] |
[-0.498886, -0.35376] | [0.086782, 0.406454] |
"
+ "shape: (1, 2)target_0 | target_1 |
---|
list[f64] | list[f64] |
[-0.500734, -0.338584] | [0.086658, 0.407468] |
"
],
"text/plain": [
"shape: (1, 2)\n",
- "┌───────────────────────┬──────────────────────┐\n",
- "│ target_0 ┆ target_1 │\n",
- "│ --- ┆ --- │\n",
- "│ list[f64] ┆ list[f64] │\n",
- "╞═══════════════════════╪══════════════════════╡\n",
- "│ [-0.498886, -0.35376] ┆ [0.086782, 0.406454] │\n",
- "└───────────────────────┴──────────────────────┘"
+ "┌────────────────────────┬──────────────────────┐\n",
+ "│ target_0 ┆ target_1 │\n",
+ "│ --- ┆ --- │\n",
+ "│ list[f64] ┆ list[f64] │\n",
+ "╞════════════════════════╪══════════════════════╡\n",
+ "│ [-0.500734, -0.338584] ┆ [0.086658, 0.407468] │\n",
+ "└────────────────────────┴──────────────────────┘"
]
},
"execution_count": 8,
@@ -358,7 +358,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (4, 7)features | beta | std_err | t | p>|t| | 0.025 | 0.975 |
---|
str | f64 | f64 | f64 | f64 | f64 | f64 |
"ln(x1+1)" | 0.219035 | 0.001699 | 128.887987 | 0.0 | 0.215704 | 0.222366 |
"exp(x2)" | 0.173641 | 0.000686 | 253.137788 | 0.0 | 0.172296 | 0.174986 |
"sin(x3)" | -1.743404 | 0.001351 | -1290.252947 | 0.0 | -1.746052 | -1.740755 |
"__bias__" | -0.106227 | 0.001517 | -70.020282 | 0.0 | -0.109201 | -0.103253 |
"
+ "shape: (4, 7)features | beta | std_err | t | p>|t| | 0.025 | 0.975 |
---|
str | f64 | f64 | f64 | f64 | f64 | f64 |
"ln(x1+1)" | 0.220087 | 0.001678 | 131.140737 | 0.0 | 0.216797 | 0.223376 |
"exp(x2)" | 0.174449 | 0.000676 | 258.179775 | 0.0 | 0.173125 | 0.175774 |
"sin(x3)" | -1.745781 | 0.001346 | -1297.083954 | 0.0 | -1.748419 | -1.743142 |
"__bias__" | -0.106951 | 0.0015 | -71.292813 | 0.0 | -0.109891 | -0.10401 |
"
],
"text/plain": [
"shape: (4, 7)\n",
@@ -367,10 +367,10 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞══════════╪═══════════╪══════════╪══════════════╪═══════╪═══════════╪═══════════╡\n",
- "│ ln(x1+1) ┆ 0.219035 ┆ 0.001699 ┆ 128.887987 ┆ 0.0 ┆ 0.215704 ┆ 0.222366 │\n",
- "│ exp(x2) ┆ 0.173641 ┆ 0.000686 ┆ 253.137788 ┆ 0.0 ┆ 0.172296 ┆ 0.174986 │\n",
- "│ sin(x3) ┆ -1.743404 ┆ 0.001351 ┆ -1290.252947 ┆ 0.0 ┆ -1.746052 ┆ -1.740755 │\n",
- "│ __bias__ ┆ -0.106227 ┆ 0.001517 ┆ -70.020282 ┆ 0.0 ┆ -0.109201 ┆ -0.103253 │\n",
+ "│ ln(x1+1) ┆ 0.220087 ┆ 0.001678 ┆ 131.140737 ┆ 0.0 ┆ 0.216797 ┆ 0.223376 │\n",
+ "│ exp(x2) ┆ 0.174449 ┆ 0.000676 ┆ 258.179775 ┆ 0.0 ┆ 0.173125 ┆ 0.175774 │\n",
+ "│ sin(x3) ┆ -1.745781 ┆ 0.001346 ┆ -1297.083954 ┆ 0.0 ┆ -1.748419 ┆ -1.743142 │\n",
+ "│ __bias__ ┆ -0.106951 ┆ 0.0015 ┆ -71.292813 ┆ 0.0 ┆ -0.109891 ┆ -0.10401 │\n",
"└──────────┴───────────┴──────────┴──────────────┴───────┴───────────┴───────────┘"
]
},
@@ -407,7 +407,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (10_000, 2)dummy | coeffs |
---|
str | list[f64] |
"a" | [-0.508478, -0.340153] |
"a" | [-0.508478, -0.340153] |
"a" | [-0.508478, -0.340153] |
"a" | [-0.508478, -0.340153] |
"a" | [-0.508478, -0.340153] |
… | … |
"b" | [-0.489381, -0.36711] |
"b" | [-0.489381, -0.36711] |
"b" | [-0.489381, -0.36711] |
"b" | [-0.489381, -0.36711] |
"b" | [-0.489381, -0.36711] |
"
+ "shape: (10_000, 2)dummy | coeffs |
---|
str | list[f64] |
"a" | [-0.479674, -0.344547] |
"a" | [-0.479674, -0.344547] |
"a" | [-0.479674, -0.344547] |
"a" | [-0.479674, -0.344547] |
"a" | [-0.479674, -0.344547] |
… | … |
"b" | [-0.5218, -0.33279] |
"b" | [-0.5218, -0.33279] |
"b" | [-0.5218, -0.33279] |
"b" | [-0.5218, -0.33279] |
"b" | [-0.5218, -0.33279] |
"
],
"text/plain": [
"shape: (10_000, 2)\n",
@@ -416,17 +416,17 @@
"│ --- ┆ --- │\n",
"│ str ┆ list[f64] │\n",
"╞═══════╪════════════════════════╡\n",
- "│ a ┆ [-0.508478, -0.340153] │\n",
- "│ a ┆ [-0.508478, -0.340153] │\n",
- "│ a ┆ [-0.508478, -0.340153] │\n",
- "│ a ┆ [-0.508478, -0.340153] │\n",
- "│ a ┆ [-0.508478, -0.340153] │\n",
+ "│ a ┆ [-0.479674, -0.344547] │\n",
+ "│ a ┆ [-0.479674, -0.344547] │\n",
+ "│ a ┆ [-0.479674, -0.344547] │\n",
+ "│ a ┆ [-0.479674, -0.344547] │\n",
+ "│ a ┆ [-0.479674, -0.344547] │\n",
"│ … ┆ … │\n",
- "│ b ┆ [-0.489381, -0.36711] │\n",
- "│ b ┆ [-0.489381, -0.36711] │\n",
- "│ b ┆ [-0.489381, -0.36711] │\n",
- "│ b ┆ [-0.489381, -0.36711] │\n",
- "│ b ┆ [-0.489381, -0.36711] │\n",
+ "│ b ┆ [-0.5218, -0.33279] │\n",
+ "│ b ┆ [-0.5218, -0.33279] │\n",
+ "│ b ┆ [-0.5218, -0.33279] │\n",
+ "│ b ┆ [-0.5218, -0.33279] │\n",
+ "│ b ┆ [-0.5218, -0.33279] │\n",
"└───────┴────────────────────────┘"
]
},
@@ -462,7 +462,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 5)x1 | x2 | y | pred | resid |
---|
f64 | f64 | f64 | f64 | f64 |
0.027516 | 0.025068 | -0.098684 | -0.022595 | -0.076089 |
0.947079 | 0.917548 | -0.391686 | -0.797076 | 0.405389 |
0.198841 | 0.113598 | -1.236188 | -0.139386 | -1.096802 |
0.443498 | 0.208141 | -0.077879 | -0.294887 | 0.217008 |
0.804234 | 0.103371 | -1.176478 | -0.437789 | -0.738689 |
"
+ "shape: (5, 5)x1 | x2 | y | pred | resid |
---|
f64 | f64 | f64 | f64 | f64 |
0.181184 | 0.833164 | -0.087106 | -0.372821 | 0.285715 |
0.716097 | 0.7239 | 0.315413 | -0.603674 | 0.919088 |
0.803562 | 0.138619 | -0.436603 | -0.449304 | 0.012702 |
0.924235 | 0.08284 | 0.055943 | -0.490844 | 0.546787 |
0.460823 | 0.085475 | -1.355857 | -0.25969 | -1.096167 |
"
],
"text/plain": [
"shape: (5, 5)\n",
@@ -471,11 +471,11 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞══════════╪══════════╪═══════════╪═══════════╪═══════════╡\n",
- "│ 0.027516 ┆ 0.025068 ┆ -0.098684 ┆ -0.022595 ┆ -0.076089 │\n",
- "│ 0.947079 ┆ 0.917548 ┆ -0.391686 ┆ -0.797076 ┆ 0.405389 │\n",
- "│ 0.198841 ┆ 0.113598 ┆ -1.236188 ┆ -0.139386 ┆ -1.096802 │\n",
- "│ 0.443498 ┆ 0.208141 ┆ -0.077879 ┆ -0.294887 ┆ 0.217008 │\n",
- "│ 0.804234 ┆ 0.103371 ┆ -1.176478 ┆ -0.437789 ┆ -0.738689 │\n",
+ "│ 0.181184 ┆ 0.833164 ┆ -0.087106 ┆ -0.372821 ┆ 0.285715 │\n",
+ "│ 0.716097 ┆ 0.7239 ┆ 0.315413 ┆ -0.603674 ┆ 0.919088 │\n",
+ "│ 0.803562 ┆ 0.138619 ┆ -0.436603 ┆ -0.449304 ┆ 0.012702 │\n",
+ "│ 0.924235 ┆ 0.08284 ┆ 0.055943 ┆ -0.490844 ┆ 0.546787 │\n",
+ "│ 0.460823 ┆ 0.085475 ┆ -1.355857 ┆ -0.25969 ┆ -1.096167 │\n",
"└──────────┴──────────┴───────────┴───────────┴───────────┘"
]
},
@@ -515,7 +515,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (2, 2)dummy | coeffs |
---|
str | list[f64] |
"b" | [-0.489381, -0.36711] |
"a" | [-0.508478, -0.340153] |
"
+ "shape: (2, 2)dummy | coeffs |
---|
str | list[f64] |
"a" | [-0.479674, -0.344547] |
"b" | [-0.5218, -0.33279] |
"
],
"text/plain": [
"shape: (2, 2)\n",
@@ -524,8 +524,8 @@
"│ --- ┆ --- │\n",
"│ str ┆ list[f64] │\n",
"╞═══════╪════════════════════════╡\n",
- "│ b ┆ [-0.489381, -0.36711] │\n",
- "│ a ┆ [-0.508478, -0.340153] │\n",
+ "│ a ┆ [-0.479674, -0.344547] │\n",
+ "│ b ┆ [-0.5218, -0.33279] │\n",
"└───────┴────────────────────────┘"
]
},
@@ -560,7 +560,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (2, 2)dummy | coeffs |
---|
str | list[f64] |
"a" | [-0.343272, -0.157735] |
"b" | [-0.315854, -0.193592] |
"
+ "shape: (2, 2)dummy | coeffs |
---|
str | list[f64] |
"a" | [-0.299928, -0.187761] |
"b" | [-0.347887, -0.161111] |
"
],
"text/plain": [
"shape: (2, 2)\n",
@@ -569,8 +569,8 @@
"│ --- ┆ --- │\n",
"│ str ┆ list[f64] │\n",
"╞═══════╪════════════════════════╡\n",
- "│ a ┆ [-0.343272, -0.157735] │\n",
- "│ b ┆ [-0.315854, -0.193592] │\n",
+ "│ a ┆ [-0.299928, -0.187761] │\n",
+ "│ b ┆ [-0.347887, -0.161111] │\n",
"└───────┴────────────────────────┘"
]
},
@@ -607,7 +607,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (2, 2)dummy | lasso_r2 |
---|
str | f64 |
"a" | -0.54074 |
"b" | -0.548295 |
"
+ "shape: (2, 2)dummy | lasso_r2 |
---|
str | f64 |
"a" | -0.533955 |
"b" | -0.547336 |
"
],
"text/plain": [
"shape: (2, 2)\n",
@@ -616,8 +616,8 @@
"│ --- ┆ --- │\n",
"│ str ┆ f64 │\n",
"╞═══════╪═══════════╡\n",
- "│ a ┆ -0.54074 │\n",
- "│ b ┆ -0.548295 │\n",
+ "│ a ┆ -0.533955 │\n",
+ "│ b ┆ -0.547336 │\n",
"└───────┴───────────┘"
]
},
@@ -658,7 +658,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (10_000, 5)y | x1 | x2 | coeffs | pred |
---|
f64 | f64 | f64 | list[f64] | f64 |
-0.098684 | 0.027516 | 0.025068 | null | null |
-0.391686 | 0.947079 | 0.917548 | null | null |
-1.236188 | 0.198841 | 0.113598 | null | null |
-0.077879 | 0.443498 | 0.208141 | null | null |
-1.176478 | 0.804234 | 0.103371 | [-1.609748, 1.186046] | -1.172012 |
… | … | … | … | … |
0.237618 | 0.71738 | 0.729978 | [-0.418405, -0.473687] | -0.645937 |
-0.879749 | 0.388987 | 0.291635 | [-0.813367, -0.190164] | -0.371848 |
-0.302075 | 0.00809 | 0.953496 | [-0.891931, -0.00105] | -0.008217 |
-1.037887 | 0.229935 | 0.373374 | [-1.01028, -0.033456] | -0.244791 |
0.163498 | 0.6866 | 0.724015 | [0.116038, -0.35731] | -0.179026 |
"
+ "shape: (10_000, 5)y | x1 | x2 | coeffs | pred |
---|
f64 | f64 | f64 | list[f64] | f64 |
-0.087106 | 0.181184 | 0.833164 | null | null |
0.315413 | 0.716097 | 0.7239 | null | null |
-0.436603 | 0.803562 | 0.138619 | null | null |
0.055943 | 0.924235 | 0.08284 | null | null |
-1.355857 | 0.460823 | 0.085475 | [-0.434778, 0.298689] | -0.174825 |
… | … | … | … | … |
-0.785811 | 0.297807 | 0.209961 | [-1.72997, -0.058576] | -0.527496 |
-0.391738 | 0.26132 | 0.040594 | [-1.856937, 0.204615] | -0.476948 |
-0.472705 | 0.881679 | 0.572239 | [0.021692, -1.521368] | -0.851461 |
-0.41373 | 0.933442 | 0.189696 | [-0.181844, -1.202663] | -0.397881 |
-0.058646 | 0.153629 | 0.836968 | [-0.610511, -0.035761] | -0.123723 |
"
],
"text/plain": [
"shape: (10_000, 5)\n",
@@ -667,17 +667,17 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ f64 ┆ f64 ┆ f64 ┆ list[f64] ┆ f64 │\n",
"╞═══════════╪══════════╪══════════╪════════════════════════╪═══════════╡\n",
- "│ -0.098684 ┆ 0.027516 ┆ 0.025068 ┆ null ┆ null │\n",
- "│ -0.391686 ┆ 0.947079 ┆ 0.917548 ┆ null ┆ null │\n",
- "│ -1.236188 ┆ 0.198841 ┆ 0.113598 ┆ null ┆ null │\n",
- "│ -0.077879 ┆ 0.443498 ┆ 0.208141 ┆ null ┆ null │\n",
- "│ -1.176478 ┆ 0.804234 ┆ 0.103371 ┆ [-1.609748, 1.186046] ┆ -1.172012 │\n",
+ "│ -0.087106 ┆ 0.181184 ┆ 0.833164 ┆ null ┆ null │\n",
+ "│ 0.315413 ┆ 0.716097 ┆ 0.7239 ┆ null ┆ null │\n",
+ "│ -0.436603 ┆ 0.803562 ┆ 0.138619 ┆ null ┆ null │\n",
+ "│ 0.055943 ┆ 0.924235 ┆ 0.08284 ┆ null ┆ null │\n",
+ "│ -1.355857 ┆ 0.460823 ┆ 0.085475 ┆ [-0.434778, 0.298689] ┆ -0.174825 │\n",
"│ … ┆ … ┆ … ┆ … ┆ … │\n",
- "│ 0.237618 ┆ 0.71738 ┆ 0.729978 ┆ [-0.418405, -0.473687] ┆ -0.645937 │\n",
- "│ -0.879749 ┆ 0.388987 ┆ 0.291635 ┆ [-0.813367, -0.190164] ┆ -0.371848 │\n",
- "│ -0.302075 ┆ 0.00809 ┆ 0.953496 ┆ [-0.891931, -0.00105] ┆ -0.008217 │\n",
- "│ -1.037887 ┆ 0.229935 ┆ 0.373374 ┆ [-1.01028, -0.033456] ┆ -0.244791 │\n",
- "│ 0.163498 ┆ 0.6866 ┆ 0.724015 ┆ [0.116038, -0.35731] ┆ -0.179026 │\n",
+ "│ -0.785811 ┆ 0.297807 ┆ 0.209961 ┆ [-1.72997, -0.058576] ┆ -0.527496 │\n",
+ "│ -0.391738 ┆ 0.26132 ┆ 0.040594 ┆ [-1.856937, 0.204615] ┆ -0.476948 │\n",
+ "│ -0.472705 ┆ 0.881679 ┆ 0.572239 ┆ [0.021692, -1.521368] ┆ -0.851461 │\n",
+ "│ -0.41373 ┆ 0.933442 ┆ 0.189696 ┆ [-0.181844, -1.202663] ┆ -0.397881 │\n",
+ "│ -0.058646 ┆ 0.153629 ┆ 0.836968 ┆ [-0.610511, -0.035761] ┆ -0.123723 │\n",
"└───────────┴──────────┴──────────┴────────────────────────┴───────────┘"
]
},
@@ -758,7 +758,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (1, 1)a |
---|
list[f64] |
[28.850744, 28.801703, 28.618474] |
"
+ "shape: (1, 1)a |
---|
list[f64] |
[29.073839, 28.893157, 28.404245] |
"
],
"text/plain": [
"shape: (1, 1)\n",
@@ -767,7 +767,7 @@
"│ --- │\n",
"│ list[f64] │\n",
"╞═════════════════════════════════╡\n",
- "│ [28.850744, 28.801703, 28.6184… │\n",
+ "│ [29.073839, 28.893157, 28.4042… │\n",
"└─────────────────────────────────┘"
]
},
@@ -799,7 +799,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (2, 2)singular_value | weight_vector |
---|
f64 | list[f64] |
28.820497 | [0.995171, 0.098156] |
28.76974 | [-0.098156, 0.995171] |
"
+ "shape: (2, 2)singular_value | weight_vector |
---|
f64 | list[f64] |
29.015447 | [0.568763, 0.822502] |
28.458258 | [0.822502, -0.568763] |
"
],
"text/plain": [
"shape: (2, 2)\n",
@@ -808,8 +808,8 @@
"│ --- ┆ --- │\n",
"│ f64 ┆ list[f64] │\n",
"╞════════════════╪═══════════════════════╡\n",
- "│ 28.820497 ┆ [0.995171, 0.098156] │\n",
- "│ 28.76974 ┆ [-0.098156, 0.995171] │\n",
+ "│ 29.015447 ┆ [0.568763, 0.822502] │\n",
+ "│ 28.458258 ┆ [0.822502, -0.568763] │\n",
"└────────────────┴───────────────────────┘"
]
},
@@ -841,7 +841,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 1)pc1 |
---|
f64 |
0.11709 |
-0.210939 |
0.011899 |
-0.162391 |
-0.050289 |
"
+ "shape: (5, 1)pc1 |
---|
f64 |
0.380626 |
-0.147048 |
-0.395586 |
0.562945 |
0.307737 |
"
],
"text/plain": [
"shape: (5, 1)\n",
@@ -850,11 +850,11 @@
"│ --- │\n",
"│ f64 │\n",
"╞═══════════╡\n",
- "│ 0.11709 │\n",
- "│ -0.210939 │\n",
- "│ 0.011899 │\n",
- "│ -0.162391 │\n",
- "│ -0.050289 │\n",
+ "│ 0.380626 │\n",
+ "│ -0.147048 │\n",
+ "│ -0.395586 │\n",
+ "│ 0.562945 │\n",
+ "│ 0.307737 │\n",
"└───────────┘"
]
},
@@ -894,7 +894,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (2, 8)dummy_groups | l2 | log loss | precision | recall | f | average_precision | roc_auc |
---|
str | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
"b" | 0.334676 | 1.002754 | 0.490268 | 0.484181 | 0.487205 | 0.503104 | 0.500475 |
"a" | 0.328401 | 0.985268 | 0.508709 | 0.512565 | 0.51063 | 0.504645 | 0.507109 |
"
+ "shape: (2, 8)dummy_groups | l2 | log loss | precision | recall | f | average_precision | roc_auc |
---|
str | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
"a" | 0.33544 | 0.997945 | 0.496479 | 0.501581 | 0.499017 | 0.498435 | 0.493508 |
"b" | 0.332956 | 1.001033 | 0.518898 | 0.509274 | 0.514041 | 0.512584 | 0.500236 |
"
],
"text/plain": [
"shape: (2, 8)\n",
@@ -904,8 +904,8 @@
"│ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ --- ┆ f64 │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ f64 ┆ │\n",
"╞══════════════╪══════════╪══════════╪═══════════╪══════════╪══════════╪════════════════╪══════════╡\n",
- "│ b ┆ 0.334676 ┆ 1.002754 ┆ 0.490268 ┆ 0.484181 ┆ 0.487205 ┆ 0.503104 ┆ 0.500475 │\n",
- "│ a ┆ 0.328401 ┆ 0.985268 ┆ 0.508709 ┆ 0.512565 ┆ 0.51063 ┆ 0.504645 ┆ 0.507109 │\n",
+ "│ a ┆ 0.33544 ┆ 0.997945 ┆ 0.496479 ┆ 0.501581 ┆ 0.499017 ┆ 0.498435 ┆ 0.493508 │\n",
+ "│ b ┆ 0.332956 ┆ 1.001033 ┆ 0.518898 ┆ 0.509274 ┆ 0.514041 ┆ 0.512584 ┆ 0.500236 │\n",
"└──────────────┴──────────┴──────────┴───────────┴──────────┴──────────┴────────────────┴──────────┘"
]
},
@@ -993,7 +993,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 1)sen |
---|
str |
"hello" |
"going" |
"world" |
"church" |
"to" |
"
+ "shape: (5, 1)sen |
---|
str |
"world" |
"church" |
"hello" |
"going" |
"to" |
"
],
"text/plain": [
"shape: (5, 1)\n",
@@ -1002,10 +1002,10 @@
"│ --- │\n",
"│ str │\n",
"╞════════╡\n",
- "│ hello │\n",
- "│ going │\n",
"│ world │\n",
"│ church │\n",
+ "│ hello │\n",
+ "│ going │\n",
"│ to │\n",
"└────────┘"
]
@@ -1038,7 +1038,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 1)sen |
---|
str |
"go" |
"hello" |
"world" |
"church" |
"" |
"
+ "shape: (5, 1)sen |
---|
str |
"hello" |
"" |
"world" |
"go" |
"church" |
"
],
"text/plain": [
"shape: (5, 1)\n",
@@ -1047,11 +1047,11 @@
"│ --- │\n",
"│ str │\n",
"╞════════╡\n",
- "│ go │\n",
"│ hello │\n",
+ "│ │\n",
"│ world │\n",
+ "│ go │\n",
"│ church │\n",
- "│ │\n",
"└────────┘"
]
},
@@ -1421,21 +1421,21 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 1)a |
---|
f64 |
null |
null |
0.166248 |
1.339555 |
1.29705 |
"
+ "shape: (5, 1)a |
---|
f64 |
null |
null |
-0.667205 |
-0.004369 |
-1.539039 |
"
],
"text/plain": [
"shape: (5, 1)\n",
- "┌──────────┐\n",
- "│ a │\n",
- "│ --- │\n",
- "│ f64 │\n",
- "╞══════════╡\n",
- "│ null │\n",
- "│ null │\n",
- "│ 0.166248 │\n",
- "│ 1.339555 │\n",
- "│ 1.29705 │\n",
- "└──────────┘"
+ "┌───────────┐\n",
+ "│ a │\n",
+ "│ --- │\n",
+ "│ f64 │\n",
+ "╞═══════════╡\n",
+ "│ null │\n",
+ "│ null │\n",
+ "│ -0.667205 │\n",
+ "│ -0.004369 │\n",
+ "│ -1.539039 │\n",
+ "└───────────┘"
]
},
"execution_count": 32,
@@ -1468,21 +1468,21 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 3)a | random_normal | random_normal_that_respects_null_of_a |
---|
f64 | f64 | f64 |
null | -0.819894 | null |
null | -0.106444 | null |
0.166248 | 0.311177 | 0.46741 |
1.339555 | 1.507627 | 1.895496 |
1.29705 | -0.40433 | 0.528693 |
"
+ "shape: (5, 3)a | random_normal | random_normal_that_respects_null_of_a |
---|
f64 | f64 | f64 |
null | 0.766622 | null |
null | 0.626792 | null |
-0.667205 | 1.568425 | -1.028465 |
-0.004369 | -0.846736 | 1.131894 |
-1.539039 | -0.994406 | 1.053838 |
"
],
"text/plain": [
"shape: (5, 3)\n",
- "┌──────────┬───────────────┬─────────────────────────────────┐\n",
- "│ a ┆ random_normal ┆ random_normal_that_respects_nu… │\n",
- "│ --- ┆ --- ┆ --- │\n",
- "│ f64 ┆ f64 ┆ f64 │\n",
- "╞══════════╪═══════════════╪═════════════════════════════════╡\n",
- "│ null ┆ -0.819894 ┆ null │\n",
- "│ null ┆ -0.106444 ┆ null │\n",
- "│ 0.166248 ┆ 0.311177 ┆ 0.46741 │\n",
- "│ 1.339555 ┆ 1.507627 ┆ 1.895496 │\n",
- "│ 1.29705 ┆ -0.40433 ┆ 0.528693 │\n",
- "└──────────┴───────────────┴─────────────────────────────────┘"
+ "┌───────────┬───────────────┬─────────────────────────────────┐\n",
+ "│ a ┆ random_normal ┆ random_normal_that_respects_nu… │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ f64 ┆ f64 ┆ f64 │\n",
+ "╞═══════════╪═══════════════╪═════════════════════════════════╡\n",
+ "│ null ┆ 0.766622 ┆ null │\n",
+ "│ null ┆ 0.626792 ┆ null │\n",
+ "│ -0.667205 ┆ 1.568425 ┆ -1.028465 │\n",
+ "│ -0.004369 ┆ -0.846736 ┆ 1.131894 │\n",
+ "│ -1.539039 ┆ -0.994406 ┆ 1.053838 │\n",
+ "└───────────┴───────────────┴─────────────────────────────────┘"
]
},
"execution_count": 33,
@@ -1516,21 +1516,21 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 3)a | random_str | random_str_that_respects_null_of_a |
---|
f64 | str | str |
null | "cC" | null |
null | "HxBp" | null |
0.166248 | "qjl" | "RMFn" |
1.339555 | "3N" | "n6Al" |
1.29705 | "cHD" | "MF" |
"
+ "shape: (5, 3)a | random_str | random_str_that_respects_null_of_a |
---|
f64 | str | str |
null | "FL" | null |
null | "ftnwd" | null |
-0.667205 | "7YV" | "o1" |
-0.004369 | "G7" | "Ys" |
-1.539039 | "Jd4" | "3umWr" |
"
],
"text/plain": [
"shape: (5, 3)\n",
- "┌──────────┬────────────┬─────────────────────────────────┐\n",
- "│ a ┆ random_str ┆ random_str_that_respects_null_… │\n",
- "│ --- ┆ --- ┆ --- │\n",
- "│ f64 ┆ str ┆ str │\n",
- "╞══════════╪════════════╪═════════════════════════════════╡\n",
- "│ null ┆ cC ┆ null │\n",
- "│ null ┆ HxBp ┆ null │\n",
- "│ 0.166248 ┆ qjl ┆ RMFn │\n",
- "│ 1.339555 ┆ 3N ┆ n6Al │\n",
- "│ 1.29705 ┆ cHD ┆ MF │\n",
- "└──────────┴────────────┴─────────────────────────────────┘"
+ "┌───────────┬────────────┬─────────────────────────────────┐\n",
+ "│ a ┆ random_str ┆ random_str_that_respects_null_… │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ f64 ┆ str ┆ str │\n",
+ "╞═══════════╪════════════╪═════════════════════════════════╡\n",
+ "│ null ┆ FL ┆ null │\n",
+ "│ null ┆ ftnwd ┆ null │\n",
+ "│ -0.667205 ┆ 7YV ┆ o1 │\n",
+ "│ -0.004369 ┆ G7 ┆ Ys │\n",
+ "│ -1.539039 ┆ Jd4 ┆ 3umWr │\n",
+ "└───────────┴────────────┴─────────────────────────────────┘"
]
},
"execution_count": 34,
@@ -1564,21 +1564,21 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 2)a | random_str |
---|
f64 | str |
null | null |
null | null |
0.166248 | "uhpES" |
1.339555 | "1AzJe" |
1.29705 | "EpZWF" |
"
+ "shape: (5, 2)a | random_str |
---|
f64 | str |
null | null |
null | null |
-0.667205 | "hIQx3" |
-0.004369 | "OZsZn" |
-1.539039 | "OXelh" |
"
],
"text/plain": [
"shape: (5, 2)\n",
- "┌──────────┬────────────┐\n",
- "│ a ┆ random_str │\n",
- "│ --- ┆ --- │\n",
- "│ f64 ┆ str │\n",
- "╞══════════╪════════════╡\n",
- "│ null ┆ null │\n",
- "│ null ┆ null │\n",
- "│ 0.166248 ┆ uhpES │\n",
- "│ 1.339555 ┆ 1AzJe │\n",
- "│ 1.29705 ┆ EpZWF │\n",
- "└──────────┴────────────┘"
+ "┌───────────┬────────────┐\n",
+ "│ a ┆ random_str │\n",
+ "│ --- ┆ --- │\n",
+ "│ f64 ┆ str │\n",
+ "╞═══════════╪════════════╡\n",
+ "│ null ┆ null │\n",
+ "│ null ┆ null │\n",
+ "│ -0.667205 ┆ hIQx3 │\n",
+ "│ -0.004369 ┆ OZsZn │\n",
+ "│ -1.539039 ┆ OXelh │\n",
+ "└───────────┴────────────┘"
]
},
"execution_count": 35,
@@ -1611,21 +1611,21 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 4)a | test1 | literal | test1_perturbed |
---|
f64 | f64 | f64 | f64 |
null | 0.175841 | null | 0.176231 |
null | -0.816172 | null | -0.815861 |
0.166248 | 1.955628 | 1.995267 | 1.955387 |
1.339555 | 0.576981 | 3.011182 | 0.57688 |
1.29705 | -1.083462 | 1.005481 | -1.083934 |
"
+ "shape: (5, 4)a | test1 | literal | test1_perturbed |
---|
f64 | f64 | f64 | f64 |
null | -0.647906 | null | -0.648101 |
null | 0.721174 | null | 0.721425 |
-0.667205 | 0.610471 | 1.476693 | 0.610372 |
-0.004369 | -0.054558 | 0.705794 | -0.054194 |
-1.539039 | 0.266183 | 0.060374 | 0.266557 |
"
],
"text/plain": [
"shape: (5, 4)\n",
- "┌──────────┬───────────┬──────────┬─────────────────┐\n",
- "│ a ┆ test1 ┆ literal ┆ test1_perturbed │\n",
- "│ --- ┆ --- ┆ --- ┆ --- │\n",
- "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
- "╞══════════╪═══════════╪══════════╪═════════════════╡\n",
- "│ null ┆ 0.175841 ┆ null ┆ 0.176231 │\n",
- "│ null ┆ -0.816172 ┆ null ┆ -0.815861 │\n",
- "│ 0.166248 ┆ 1.955628 ┆ 1.995267 ┆ 1.955387 │\n",
- "│ 1.339555 ┆ 0.576981 ┆ 3.011182 ┆ 0.57688 │\n",
- "│ 1.29705 ┆ -1.083462 ┆ 1.005481 ┆ -1.083934 │\n",
- "└──────────┴───────────┴──────────┴─────────────────┘"
+ "┌───────────┬───────────┬──────────┬─────────────────┐\n",
+ "│ a ┆ test1 ┆ literal ┆ test1_perturbed │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
+ "╞═══════════╪═══════════╪══════════╪═════════════════╡\n",
+ "│ null ┆ -0.647906 ┆ null ┆ -0.648101 │\n",
+ "│ null ┆ 0.721174 ┆ null ┆ 0.721425 │\n",
+ "│ -0.667205 ┆ 0.610471 ┆ 1.476693 ┆ 0.610372 │\n",
+ "│ -0.004369 ┆ -0.054558 ┆ 0.705794 ┆ -0.054194 │\n",
+ "│ -1.539039 ┆ 0.266183 ┆ 0.060374 ┆ 0.266557 │\n",
+ "└───────────┴───────────┴──────────┴─────────────────┘"
]
},
"execution_count": 36,
@@ -1663,21 +1663,21 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 4)a | [0, 1) | Normal | Int from [0, 10) |
---|
f64 | f64 | f64 | i32 |
null | 0.714802 | -0.54607 | 2 |
null | 0.355513 | 0.827599 | 9 |
0.166248 | 0.002282 | 0.570296 | 0 |
1.339555 | 0.786958 | -0.031061 | 3 |
1.29705 | 0.320615 | -0.158668 | 8 |
"
+ "shape: (5, 4)a | [0, 1) | Normal | Int from [0, 10) |
---|
f64 | f64 | f64 | i32 |
null | 0.685005 | 0.366216 | 8 |
null | 0.24877 | -1.189034 | 4 |
-0.667205 | 0.89082 | 0.946259 | 4 |
-0.004369 | 0.951503 | 0.606895 | 9 |
-1.539039 | 0.707259 | -0.907175 | 7 |
"
],
"text/plain": [
"shape: (5, 4)\n",
- "┌──────────┬──────────┬───────────┬──────────────────┐\n",
- "│ a ┆ [0, 1) ┆ Normal ┆ Int from [0, 10) │\n",
- "│ --- ┆ --- ┆ --- ┆ --- │\n",
- "│ f64 ┆ f64 ┆ f64 ┆ i32 │\n",
- "╞══════════╪══════════╪═══════════╪══════════════════╡\n",
- "│ null ┆ 0.714802 ┆ -0.54607 ┆ 2 │\n",
- "│ null ┆ 0.355513 ┆ 0.827599 ┆ 9 │\n",
- "│ 0.166248 ┆ 0.002282 ┆ 0.570296 ┆ 0 │\n",
- "│ 1.339555 ┆ 0.786958 ┆ -0.031061 ┆ 3 │\n",
- "│ 1.29705 ┆ 0.320615 ┆ -0.158668 ┆ 8 │\n",
- "└──────────┴──────────┴───────────┴──────────────────┘"
+ "┌───────────┬──────────┬───────────┬──────────────────┐\n",
+ "│ a ┆ [0, 1) ┆ Normal ┆ Int from [0, 10) │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ f64 ┆ f64 ┆ f64 ┆ i32 │\n",
+ "╞═══════════╪══════════╪═══════════╪══════════════════╡\n",
+ "│ null ┆ 0.685005 ┆ 0.366216 ┆ 8 │\n",
+ "│ null ┆ 0.24877 ┆ -1.189034 ┆ 4 │\n",
+ "│ -0.667205 ┆ 0.89082 ┆ 0.946259 ┆ 4 │\n",
+ "│ -0.004369 ┆ 0.951503 ┆ 0.606895 ┆ 9 │\n",
+ "│ -1.539039 ┆ 0.707259 ┆ -0.907175 ┆ 7 │\n",
+ "└───────────┴──────────┴───────────┴──────────────────┘"
]
},
"execution_count": 37,
@@ -1711,7 +1711,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (1, 4)t-tests: statistics | t-tests: pvalue | normality_test: statistics | normality_test: pvalue |
---|
f64 | f64 | f64 | f64 |
-0.425243 | 0.670722 | 0.782871 | 0.676086 |
"
+ "shape: (1, 4)t-tests: statistics | t-tests: pvalue | normality_test: statistics | normality_test: pvalue |
---|
f64 | f64 | f64 | f64 |
-0.941026 | 0.346844 | 1.420034 | 0.491636 |
"
],
"text/plain": [
"shape: (1, 4)\n",
@@ -1720,7 +1720,7 @@
"│ --- ┆ --- ┆ --- ┆ --- │\n",
"│ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞═════════════════════╪═════════════════╪════════════════════════════╪════════════════════════╡\n",
- "│ -0.425243 ┆ 0.670722 ┆ 0.782871 ┆ 0.676086 │\n",
+ "│ -0.941026 ┆ 0.346844 ┆ 1.420034 ┆ 0.491636 │\n",
"└─────────────────────┴─────────────────┴────────────────────────────┴────────────────────────┘"
]
},
@@ -1764,7 +1764,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 5)market_id | var1 | var2 | category_1 | category_2 |
---|
i64 | f64 | f64 | i32 | i32 |
0 | 0.598321 | 0.075415 | 4 | 5 |
1 | 0.073296 | 0.789893 | 2 | 3 |
2 | 0.818023 | 0.504974 | 2 | 4 |
0 | 0.985104 | 0.153053 | 0 | 6 |
1 | 0.440852 | 0.862906 | 2 | 1 |
"
+ "shape: (5, 5)market_id | var1 | var2 | category_1 | category_2 |
---|
i64 | f64 | f64 | i32 | i32 |
0 | 0.842972 | 0.450364 | 3 | 4 |
1 | 0.625663 | 0.095083 | 0 | 6 |
2 | 0.029255 | 0.515528 | 3 | 4 |
0 | 0.782569 | 0.664478 | 1 | 4 |
1 | 0.487103 | 0.935361 | 0 | 7 |
"
],
"text/plain": [
"shape: (5, 5)\n",
@@ -1773,11 +1773,11 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ f64 ┆ f64 ┆ i32 ┆ i32 │\n",
"╞═══════════╪══════════╪══════════╪════════════╪════════════╡\n",
- "│ 0 ┆ 0.598321 ┆ 0.075415 ┆ 4 ┆ 5 │\n",
- "│ 1 ┆ 0.073296 ┆ 0.789893 ┆ 2 ┆ 3 │\n",
- "│ 2 ┆ 0.818023 ┆ 0.504974 ┆ 2 ┆ 4 │\n",
- "│ 0 ┆ 0.985104 ┆ 0.153053 ┆ 0 ┆ 6 │\n",
- "│ 1 ┆ 0.440852 ┆ 0.862906 ┆ 2 ┆ 1 │\n",
+ "│ 0 ┆ 0.842972 ┆ 0.450364 ┆ 3 ┆ 4 │\n",
+ "│ 1 ┆ 0.625663 ┆ 0.095083 ┆ 0 ┆ 6 │\n",
+ "│ 2 ┆ 0.029255 ┆ 0.515528 ┆ 3 ┆ 4 │\n",
+ "│ 0 ┆ 0.782569 ┆ 0.664478 ┆ 1 ┆ 4 │\n",
+ "│ 1 ┆ 0.487103 ┆ 0.935361 ┆ 0 ┆ 7 │\n",
"└───────────┴──────────┴──────────┴────────────┴────────────┘"
]
},
@@ -1817,17 +1817,17 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (1, 3)t-test | chi2-test | f-test |
---|
struct[2] | struct[2] | struct[2] |
{0.356596,0.721402} | {31.810658,0.668157} | {0.744296,0.5617} |
"
+ "shape: (1, 3)t-test | chi2-test | f-test |
---|
struct[2] | struct[2] | struct[2] |
{1.550123,0.121144} | {22.889401,0.955922} | {1.746959,0.136718} |
"
],
"text/plain": [
"shape: (1, 3)\n",
- "┌─────────────────────┬──────────────────────┬───────────────────┐\n",
- "│ t-test ┆ chi2-test ┆ f-test │\n",
- "│ --- ┆ --- ┆ --- │\n",
- "│ struct[2] ┆ struct[2] ┆ struct[2] │\n",
- "╞═════════════════════╪══════════════════════╪═══════════════════╡\n",
- "│ {0.356596,0.721402} ┆ {31.810658,0.668157} ┆ {0.744296,0.5617} │\n",
- "└─────────────────────┴──────────────────────┴───────────────────┘"
+ "┌─────────────────────┬──────────────────────┬─────────────────────┐\n",
+ "│ t-test ┆ chi2-test ┆ f-test │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ struct[2] ┆ struct[2] ┆ struct[2] │\n",
+ "╞═════════════════════╪══════════════════════╪═════════════════════╡\n",
+ "│ {1.550123,0.121144} ┆ {22.889401,0.955922} ┆ {1.746959,0.136718} │\n",
+ "└─────────────────────┴──────────────────────┴─────────────────────┘"
]
},
"execution_count": 40,
@@ -1860,9 +1860,9 @@
"│ --- ┆ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ struct[2] ┆ struct[2] ┆ struct[2] │\n",
"╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡\n",
- "│ 0 ┆ {0.782406,0.434031} ┆ {32.80012,0.621581} ┆ {2.027486,0.088156} │\n",
- "│ 1 ┆ {-1.168306,0.242767} ┆ {34.251982,0.551894} ┆ {0.414089,0.798598} │\n",
- "│ 2 ┆ {0.988312,0.323072} ┆ {35.722092,0.481702} ┆ {1.335438,0.254489} │\n",
+ "│ 0 ┆ {2.182824,0.029118} ┆ {38.555846,0.35473} ┆ {1.597169,0.172493} │\n",
+ "│ 1 ┆ {0.972503,0.330871} ┆ {26.35273,0.880507} ┆ {0.864623,0.484451} │\n",
+ "│ 2 ┆ {-0.471616,0.637232} ┆ {34.267769,0.551134} ┆ {0.783499,0.535832} │\n",
"└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘\n"
]
}
@@ -1894,7 +1894,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (9, 2)first_digit_cnt | first_digit_distribution |
---|
u32 | f64 |
537 | 0.1074 |
534 | 0.1068 |
577 | 0.1154 |
605 | 0.121 |
546 | 0.1092 |
572 | 0.1144 |
505 | 0.101 |
561 | 0.1122 |
563 | 0.1126 |
"
+ "shape: (9, 2)first_digit_cnt | first_digit_distribution |
---|
u32 | f64 |
556 | 0.1112 |
566 | 0.1132 |
556 | 0.1112 |
525 | 0.105 |
592 | 0.1184 |
520 | 0.104 |
566 | 0.1132 |
552 | 0.1104 |
567 | 0.1134 |
"
],
"text/plain": [
"shape: (9, 2)\n",
@@ -1903,15 +1903,15 @@
"│ --- ┆ --- │\n",
"│ u32 ┆ f64 │\n",
"╞═════════════════╪══════════════════════════╡\n",
- "│ 537 ┆ 0.1074 │\n",
- "│ 534 ┆ 0.1068 │\n",
- "│ 577 ┆ 0.1154 │\n",
- "│ 605 ┆ 0.121 │\n",
- "│ 546 ┆ 0.1092 │\n",
- "│ 572 ┆ 0.1144 │\n",
- "│ 505 ┆ 0.101 │\n",
- "│ 561 ┆ 0.1122 │\n",
- "│ 563 ┆ 0.1126 │\n",
+ "│ 556 ┆ 0.1112 │\n",
+ "│ 566 ┆ 0.1132 │\n",
+ "│ 556 ┆ 0.1112 │\n",
+ "│ 525 ┆ 0.105 │\n",
+ "│ 592 ┆ 0.1184 │\n",
+ "│ 520 ┆ 0.104 │\n",
+ "│ 566 ┆ 0.1132 │\n",
+ "│ 552 ┆ 0.1104 │\n",
+ "│ 567 ┆ 0.1134 │\n",
"└─────────────────┴──────────────────────────┘"
]
},
@@ -1977,7 +1977,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 7)id | var1 | var2 | var3 | r | rh | nb_l_inf_cnt |
---|
u32 | f64 | f64 | f64 | f64 | f64 | u32 |
0 | 0.720043 | 0.762057 | 0.0802 | 0.853373 | 1.184888 | 16 |
1 | 0.746859 | 0.774783 | 0.969885 | 0.027992 | 6.011372 | 15 |
2 | 0.21097 | 0.029106 | 0.522927 | 0.317476 | 9.596375 | 14 |
3 | 0.701792 | 0.527346 | 0.352297 | 0.912383 | 4.874474 | 16 |
4 | 0.723815 | 0.544753 | 0.311694 | 0.210474 | 5.696281 | 17 |
"
+ "shape: (5, 7)id | var1 | var2 | var3 | r | rh | nb_l_inf_cnt |
---|
u32 | f64 | f64 | f64 | f64 | f64 | u32 |
0 | 0.347784 | 0.13026 | 0.334019 | 0.698491 | 7.909586 | 15 |
1 | 0.48221 | 0.050991 | 0.736185 | 0.892089 | 7.823451 | 8 |
2 | 0.786648 | 0.639778 | 0.774721 | 0.134284 | 3.51514 | 20 |
3 | 0.944763 | 0.129409 | 0.460358 | 0.715857 | 8.133778 | 16 |
4 | 0.597698 | 0.747696 | 0.885392 | 0.670841 | 2.392687 | 19 |
"
],
"text/plain": [
"shape: (5, 7)\n",
@@ -1986,11 +1986,11 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ u32 │\n",
"╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════════╡\n",
- "│ 0 ┆ 0.720043 ┆ 0.762057 ┆ 0.0802 ┆ 0.853373 ┆ 1.184888 ┆ 16 │\n",
- "│ 1 ┆ 0.746859 ┆ 0.774783 ┆ 0.969885 ┆ 0.027992 ┆ 6.011372 ┆ 15 │\n",
- "│ 2 ┆ 0.21097 ┆ 0.029106 ┆ 0.522927 ┆ 0.317476 ┆ 9.596375 ┆ 14 │\n",
- "│ 3 ┆ 0.701792 ┆ 0.527346 ┆ 0.352297 ┆ 0.912383 ┆ 4.874474 ┆ 16 │\n",
- "│ 4 ┆ 0.723815 ┆ 0.544753 ┆ 0.311694 ┆ 0.210474 ┆ 5.696281 ┆ 17 │\n",
+ "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ 15 │\n",
+ "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ 8 │\n",
+ "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ 20 │\n",
+ "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ 16 │\n",
+ "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ 19 │\n",
"└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────────┘"
]
},
@@ -2027,7 +2027,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 7)id | var1 | var2 | var3 | r | rh | nb_l1_r_cnt |
---|
u32 | f64 | f64 | f64 | f64 | f64 | u32 |
0 | 0.720043 | 0.762057 | 0.0802 | 0.853373 | 1.184888 | 690 |
1 | 0.746859 | 0.774783 | 0.969885 | 0.027992 | 6.011372 | 1 |
2 | 0.21097 | 0.029106 | 0.522927 | 0.317476 | 9.596375 | 56 |
3 | 0.701792 | 0.527346 | 0.352297 | 0.912383 | 4.874474 | 1289 |
4 | 0.723815 | 0.544753 | 0.311694 | 0.210474 | 5.696281 | 23 |
"
+ "shape: (5, 7)id | var1 | var2 | var3 | r | rh | nb_l1_r_cnt |
---|
u32 | f64 | f64 | f64 | f64 | f64 | u32 |
0 | 0.347784 | 0.13026 | 0.334019 | 0.698491 | 7.909586 | 538 |
1 | 0.48221 | 0.050991 | 0.736185 | 0.892089 | 7.823451 | 783 |
2 | 0.786648 | 0.639778 | 0.774721 | 0.134284 | 3.51514 | 10 |
3 | 0.944763 | 0.129409 | 0.460358 | 0.715857 | 8.133778 | 389 |
4 | 0.597698 | 0.747696 | 0.885392 | 0.670841 | 2.392687 | 483 |
"
],
"text/plain": [
"shape: (5, 7)\n",
@@ -2036,11 +2036,11 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ u32 │\n",
"╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪═════════════╡\n",
- "│ 0 ┆ 0.720043 ┆ 0.762057 ┆ 0.0802 ┆ 0.853373 ┆ 1.184888 ┆ 690 │\n",
- "│ 1 ┆ 0.746859 ┆ 0.774783 ┆ 0.969885 ┆ 0.027992 ┆ 6.011372 ┆ 1 │\n",
- "│ 2 ┆ 0.21097 ┆ 0.029106 ┆ 0.522927 ┆ 0.317476 ┆ 9.596375 ┆ 56 │\n",
- "│ 3 ┆ 0.701792 ┆ 0.527346 ┆ 0.352297 ┆ 0.912383 ┆ 4.874474 ┆ 1289 │\n",
- "│ 4 ┆ 0.723815 ┆ 0.544753 ┆ 0.311694 ┆ 0.210474 ┆ 5.696281 ┆ 23 │\n",
+ "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ 538 │\n",
+ "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ 783 │\n",
+ "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ 10 │\n",
+ "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ 389 │\n",
+ "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ 483 │\n",
"└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴─────────────┘"
]
},
@@ -2076,7 +2076,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 7)id | var1 | var2 | var3 | r | rh | best friends |
---|
u32 | f64 | f64 | f64 | f64 | f64 | list[u32] |
0 | 0.720043 | 0.762057 | 0.0802 | 0.853373 | 1.184888 | [0, 1171, … 1754] |
1 | 0.746859 | 0.774783 | 0.969885 | 0.027992 | 6.011372 | [1, 906, … 1751] |
2 | 0.21097 | 0.029106 | 0.522927 | 0.317476 | 9.596375 | [2, 50, … 853] |
3 | 0.701792 | 0.527346 | 0.352297 | 0.912383 | 4.874474 | [3, 1558, … 921] |
4 | 0.723815 | 0.544753 | 0.311694 | 0.210474 | 5.696281 | [4, 3, … 485] |
"
+ "shape: (5, 7)id | var1 | var2 | var3 | r | rh | best friends |
---|
u32 | f64 | f64 | f64 | f64 | f64 | list[u32] |
0 | 0.347784 | 0.13026 | 0.334019 | 0.698491 | 7.909586 | [0, 502, … 115] |
1 | 0.48221 | 0.050991 | 0.736185 | 0.892089 | 7.823451 | [1, 1527, … 400] |
2 | 0.786648 | 0.639778 | 0.774721 | 0.134284 | 3.51514 | [2, 1430, … 1451] |
3 | 0.944763 | 0.129409 | 0.460358 | 0.715857 | 8.133778 | [3, 598, … 711] |
4 | 0.597698 | 0.747696 | 0.885392 | 0.670841 | 2.392687 | [4, 650, … 213] |
"
],
"text/plain": [
"shape: (5, 7)\n",
@@ -2085,11 +2085,11 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ list[u32] │\n",
"╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪═══════════════════╡\n",
- "│ 0 ┆ 0.720043 ┆ 0.762057 ┆ 0.0802 ┆ 0.853373 ┆ 1.184888 ┆ [0, 1171, … 1754] │\n",
- "│ 1 ┆ 0.746859 ┆ 0.774783 ┆ 0.969885 ┆ 0.027992 ┆ 6.011372 ┆ [1, 906, … 1751] │\n",
- "│ 2 ┆ 0.21097 ┆ 0.029106 ┆ 0.522927 ┆ 0.317476 ┆ 9.596375 ┆ [2, 50, … 853] │\n",
- "│ 3 ┆ 0.701792 ┆ 0.527346 ┆ 0.352297 ┆ 0.912383 ┆ 4.874474 ┆ [3, 1558, … 921] │\n",
- "│ 4 ┆ 0.723815 ┆ 0.544753 ┆ 0.311694 ┆ 0.210474 ┆ 5.696281 ┆ [4, 3, … 485] │\n",
+ "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ [0, 502, … 115] │\n",
+ "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ [1, 1527, … 400] │\n",
+ "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ [2, 1430, … 1451] │\n",
+ "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ [3, 598, … 711] │\n",
+ "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ [4, 650, … 213] │\n",
"└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴───────────────────┘"
]
},
@@ -2128,11 +2128,11 @@
"│ --- ┆ --- ┆ --- │\n",
"│ u32 ┆ list[u32] ┆ u32 │\n",
"╞═════╪══════════════════╪════════════════════╡\n",
- "│ 0 ┆ [0, 1171, … 912] ┆ 9 │\n",
- "│ 1 ┆ [1, 906, … 831] ┆ 5 │\n",
- "│ 2 ┆ [2, 50, … 1682] ┆ 8 │\n",
- "│ 3 ┆ [3, 1558, … 66] ┆ 7 │\n",
- "│ 4 ┆ [4, 3, … 1370] ┆ 6 │\n",
+ "│ 0 ┆ [0, 502, … 875] ┆ 10 │\n",
+ "│ 1 ┆ [1, 1527, … 400] ┆ 3 │\n",
+ "│ 2 ┆ [2, 1430, … 549] ┆ 10 │\n",
+ "│ 3 ┆ [3, 598, … 1768] ┆ 9 │\n",
+ "│ 4 ┆ [4, 650, … 803] ┆ 6 │\n",
"└─────┴──────────────────┴────────────────────┘\n"
]
}
@@ -2173,7 +2173,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 8)id | var1 | var2 | var3 | r | rh | idx | dist |
---|
u32 | f64 | f64 | f64 | f64 | f64 | list[u32] | list[f64] |
0 | 0.720043 | 0.762057 | 0.0802 | 0.853373 | 1.184888 | [0, 1171, … 1754] | [0.0, 0.054683, … 0.077248] |
1 | 0.746859 | 0.774783 | 0.969885 | 0.027992 | 6.011372 | [1, 906, … 1751] | [0.0, 0.042337, … 0.053288] |
2 | 0.21097 | 0.029106 | 0.522927 | 0.317476 | 9.596375 | [2, 50, … 853] | [0.0, 0.059335, … 0.06505] |
3 | 0.701792 | 0.527346 | 0.352297 | 0.912383 | 4.874474 | [3, 1558, … 921] | [0.0, 0.015422, … 0.067852] |
4 | 0.723815 | 0.544753 | 0.311694 | 0.210474 | 5.696281 | [4, 3, … 485] | [0.0, 0.049363, … 0.060237] |
"
+ "shape: (5, 8)id | var1 | var2 | var3 | r | rh | idx | dist |
---|
u32 | f64 | f64 | f64 | f64 | f64 | list[u32] | list[f64] |
0 | 0.347784 | 0.13026 | 0.334019 | 0.698491 | 7.909586 | [0, 502, … 115] | [0.0, 0.066443, … 0.072798] |
1 | 0.48221 | 0.050991 | 0.736185 | 0.892089 | 7.823451 | [1, 1527, … 400] | [0.0, 0.049926, … 0.063975] |
2 | 0.786648 | 0.639778 | 0.774721 | 0.134284 | 3.51514 | [2, 1430, … 1451] | [0.0, 0.02861, … 0.057878] |
3 | 0.944763 | 0.129409 | 0.460358 | 0.715857 | 8.133778 | [3, 598, … 711] | [0.0, 0.032508, … 0.046937] |
4 | 0.597698 | 0.747696 | 0.885392 | 0.670841 | 2.392687 | [4, 650, … 213] | [0.0, 0.068048, … 0.076969] |
"
],
"text/plain": [
"shape: (5, 8)\n",
@@ -2182,16 +2182,16 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ list[u32] ┆ list[f64] │\n",
"╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╪══════════════════╪══════════════════╡\n",
- "│ 0 ┆ 0.720043 ┆ 0.762057 ┆ 0.0802 ┆ 0.853373 ┆ 1.184888 ┆ [0, 1171, … ┆ [0.0, 0.054683, │\n",
- "│ ┆ ┆ ┆ ┆ ┆ ┆ 1754] ┆ … 0.077248] │\n",
- "│ 1 ┆ 0.746859 ┆ 0.774783 ┆ 0.969885 ┆ 0.027992 ┆ 6.011372 ┆ [1, 906, … 1751] ┆ [0.0, 0.042337, │\n",
- "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.053288] │\n",
- "│ 2 ┆ 0.21097 ┆ 0.029106 ┆ 0.522927 ┆ 0.317476 ┆ 9.596375 ┆ [2, 50, … 853] ┆ [0.0, 0.059335, │\n",
- "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.06505] │\n",
- "│ 3 ┆ 0.701792 ┆ 0.527346 ┆ 0.352297 ┆ 0.912383 ┆ 4.874474 ┆ [3, 1558, … 921] ┆ [0.0, 0.015422, │\n",
- "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.067852] │\n",
- "│ 4 ┆ 0.723815 ┆ 0.544753 ┆ 0.311694 ┆ 0.210474 ┆ 5.696281 ┆ [4, 3, … 485] ┆ [0.0, 0.049363, │\n",
- "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.060237] │\n",
+ "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 ┆ [0, 502, … 115] ┆ [0.0, 0.066443, │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.072798] │\n",
+ "│ 1 ┆ 0.48221 ┆ 0.050991 ┆ 0.736185 ┆ 0.892089 ┆ 7.823451 ┆ [1, 1527, … 400] ┆ [0.0, 0.049926, │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.063975] │\n",
+ "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 ┆ [2, 1430, … ┆ [0.0, 0.02861, … │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ 1451] ┆ 0.057878] │\n",
+ "│ 3 ┆ 0.944763 ┆ 0.129409 ┆ 0.460358 ┆ 0.715857 ┆ 8.133778 ┆ [3, 598, … 711] ┆ [0.0, 0.032508, │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.046937] │\n",
+ "│ 4 ┆ 0.597698 ┆ 0.747696 ┆ 0.885392 ┆ 0.670841 ┆ 2.392687 ┆ [4, 650, … 213] ┆ [0.0, 0.068048, │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … 0.076969] │\n",
"└─────┴──────────┴──────────┴──────────┴──────────┴──────────┴──────────────────┴──────────────────┘"
]
},
@@ -2231,7 +2231,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 6)id | var1 | var2 | var3 | r | rh |
---|
u32 | f64 | f64 | f64 | f64 | f64 |
3 | 0.701792 | 0.527346 | 0.352297 | 0.912383 | 4.874474 |
4 | 0.723815 | 0.544753 | 0.311694 | 0.210474 | 5.696281 |
6 | 0.736724 | 0.776174 | 0.693574 | 0.532166 | 3.944928 |
7 | 0.617642 | 0.788939 | 0.488318 | 0.27519 | 2.900536 |
8 | 0.327207 | 0.456177 | 0.469014 | 0.128877 | 9.917232 |
"
+ "shape: (5, 6)id | var1 | var2 | var3 | r | rh |
---|
u32 | f64 | f64 | f64 | f64 | f64 |
0 | 0.347784 | 0.13026 | 0.334019 | 0.698491 | 7.909586 |
2 | 0.786648 | 0.639778 | 0.774721 | 0.134284 | 3.51514 |
5 | 0.712633 | 0.28485 | 0.329133 | 0.543338 | 6.065003 |
7 | 0.405769 | 0.443343 | 0.892205 | 0.731708 | 9.658069 |
10 | 0.836991 | 0.428517 | 0.404204 | 0.440019 | 4.264234 |
"
],
"text/plain": [
"shape: (5, 6)\n",
@@ -2240,11 +2240,11 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n",
- "│ 3 ┆ 0.701792 ┆ 0.527346 ┆ 0.352297 ┆ 0.912383 ┆ 4.874474 │\n",
- "│ 4 ┆ 0.723815 ┆ 0.544753 ┆ 0.311694 ┆ 0.210474 ┆ 5.696281 │\n",
- "│ 6 ┆ 0.736724 ┆ 0.776174 ┆ 0.693574 ┆ 0.532166 ┆ 3.944928 │\n",
- "│ 7 ┆ 0.617642 ┆ 0.788939 ┆ 0.488318 ┆ 0.27519 ┆ 2.900536 │\n",
- "│ 8 ┆ 0.327207 ┆ 0.456177 ┆ 0.469014 ┆ 0.128877 ┆ 9.917232 │\n",
+ "│ 0 ┆ 0.347784 ┆ 0.13026 ┆ 0.334019 ┆ 0.698491 ┆ 7.909586 │\n",
+ "│ 2 ┆ 0.786648 ┆ 0.639778 ┆ 0.774721 ┆ 0.134284 ┆ 3.51514 │\n",
+ "│ 5 ┆ 0.712633 ┆ 0.28485 ┆ 0.329133 ┆ 0.543338 ┆ 6.065003 │\n",
+ "│ 7 ┆ 0.405769 ┆ 0.443343 ┆ 0.892205 ┆ 0.731708 ┆ 9.658069 │\n",
+ "│ 10 ┆ 0.836991 ┆ 0.428517 ┆ 0.404204 ┆ 0.440019 ┆ 4.264234 │\n",
"└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘"
]
},
@@ -2281,7 +2281,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 6)id | var1 | var2 | var3 | r | rh |
---|
u32 | f64 | f64 | f64 | f64 | f64 |
36 | 0.568433 | 0.456969 | 0.05886 | 0.006766 | 0.753822 |
117 | 0.521121 | 0.521892 | 0.129322 | 0.377708 | 0.731702 |
138 | 0.531877 | 0.488015 | 0.040861 | 0.905918 | 1.466601 |
169 | 0.465704 | 0.501999 | 0.253489 | 0.625459 | 6.367421 |
176 | 0.473739 | 0.491308 | 0.894173 | 0.652556 | 6.084469 |
"
+ "shape: (5, 6)id | var1 | var2 | var3 | r | rh |
---|
u32 | f64 | f64 | f64 | f64 | f64 |
61 | 0.457586 | 0.507503 | 0.614296 | 0.55833 | 7.064511 |
164 | 0.550585 | 0.564562 | 0.917242 | 0.756173 | 8.670795 |
257 | 0.571268 | 0.54402 | 0.110826 | 0.16714 | 8.900079 |
334 | 0.551334 | 0.458245 | 0.40399 | 0.494347 | 1.872597 |
352 | 0.535394 | 0.459457 | 0.467295 | 0.986702 | 3.193573 |
"
],
"text/plain": [
"shape: (5, 6)\n",
@@ -2290,11 +2290,11 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n",
- "│ 36 ┆ 0.568433 ┆ 0.456969 ┆ 0.05886 ┆ 0.006766 ┆ 0.753822 │\n",
- "│ 117 ┆ 0.521121 ┆ 0.521892 ┆ 0.129322 ┆ 0.377708 ┆ 0.731702 │\n",
- "│ 138 ┆ 0.531877 ┆ 0.488015 ┆ 0.040861 ┆ 0.905918 ┆ 1.466601 │\n",
- "│ 169 ┆ 0.465704 ┆ 0.501999 ┆ 0.253489 ┆ 0.625459 ┆ 6.367421 │\n",
- "│ 176 ┆ 0.473739 ┆ 0.491308 ┆ 0.894173 ┆ 0.652556 ┆ 6.084469 │\n",
+ "│ 61 ┆ 0.457586 ┆ 0.507503 ┆ 0.614296 ┆ 0.55833 ┆ 7.064511 │\n",
+ "│ 164 ┆ 0.550585 ┆ 0.564562 ┆ 0.917242 ┆ 0.756173 ┆ 8.670795 │\n",
+ "│ 257 ┆ 0.571268 ┆ 0.54402 ┆ 0.110826 ┆ 0.16714 ┆ 8.900079 │\n",
+ "│ 334 ┆ 0.551334 ┆ 0.458245 ┆ 0.40399 ┆ 0.494347 ┆ 1.872597 │\n",
+ "│ 352 ┆ 0.535394 ┆ 0.459457 ┆ 0.467295 ┆ 0.986702 ┆ 3.193573 │\n",
"└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘"
]
},
@@ -2331,7 +2331,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 6)id | var1 | var2 | var3 | r | rh |
---|
u32 | f64 | f64 | f64 | f64 | f64 |
169 | 0.465704 | 0.501999 | 0.253489 | 0.625459 | 6.367421 |
176 | 0.473739 | 0.491308 | 0.894173 | 0.652556 | 6.084469 |
235 | 0.564589 | 0.484392 | 0.057102 | 0.975023 | 8.699902 |
367 | 0.478239 | 0.566379 | 0.620646 | 0.384922 | 9.836408 |
383 | 0.501891 | 0.46347 | 0.135889 | 0.616873 | 7.838947 |
"
+ "shape: (5, 6)id | var1 | var2 | var3 | r | rh |
---|
u32 | f64 | f64 | f64 | f64 | f64 |
61 | 0.457586 | 0.507503 | 0.614296 | 0.55833 | 7.064511 |
354 | 0.442961 | 0.519426 | 0.691972 | 0.94437 | 7.440443 |
406 | 0.520402 | 0.443565 | 0.015572 | 0.814672 | 9.903239 |
411 | 0.523037 | 0.569513 | 0.975117 | 0.163414 | 8.785701 |
488 | 0.4547 | 0.453289 | 0.388635 | 0.391255 | 9.463455 |
"
],
"text/plain": [
"shape: (5, 6)\n",
@@ -2340,11 +2340,11 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞═════╪══════════╪══════════╪══════════╪══════════╪══════════╡\n",
- "│ 169 ┆ 0.465704 ┆ 0.501999 ┆ 0.253489 ┆ 0.625459 ┆ 6.367421 │\n",
- "│ 176 ┆ 0.473739 ┆ 0.491308 ┆ 0.894173 ┆ 0.652556 ┆ 6.084469 │\n",
- "│ 235 ┆ 0.564589 ┆ 0.484392 ┆ 0.057102 ┆ 0.975023 ┆ 8.699902 │\n",
- "│ 367 ┆ 0.478239 ┆ 0.566379 ┆ 0.620646 ┆ 0.384922 ┆ 9.836408 │\n",
- "│ 383 ┆ 0.501891 ┆ 0.46347 ┆ 0.135889 ┆ 0.616873 ┆ 7.838947 │\n",
+ "│ 61 ┆ 0.457586 ┆ 0.507503 ┆ 0.614296 ┆ 0.55833 ┆ 7.064511 │\n",
+ "│ 354 ┆ 0.442961 ┆ 0.519426 ┆ 0.691972 ┆ 0.94437 ┆ 7.440443 │\n",
+ "│ 406 ┆ 0.520402 ┆ 0.443565 ┆ 0.015572 ┆ 0.814672 ┆ 9.903239 │\n",
+ "│ 411 ┆ 0.523037 ┆ 0.569513 ┆ 0.975117 ┆ 0.163414 ┆ 8.785701 │\n",
+ "│ 488 ┆ 0.4547 ┆ 0.453289 ┆ 0.388635 ┆ 0.391255 ┆ 9.463455 │\n",
"└─────┴──────────┴──────────┴──────────┴──────────┴──────────┘"
]
},
@@ -2381,7 +2381,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 3)id | friends | count |
---|
u64 | list[u32] | u32 |
0 | [0, 1345, … 304] | 4 |
1 | [1, 6, 278] | 3 |
2 | [2, 934, 853] | 3 |
3 | [3, 1584, … 159] | 5 |
4 | [4, 1939, … 392] | 5 |
"
+ "shape: (5, 3)id | friends | count |
---|
u64 | list[u32] | u32 |
0 | [0, 459, … 1058] | 6 |
1 | [1] | 1 |
2 | [2, 1077] | 2 |
3 | [3, 104] | 2 |
4 | [4, 781, … 650] | 4 |
"
],
"text/plain": [
"shape: (5, 3)\n",
@@ -2390,11 +2390,11 @@
"│ --- ┆ --- ┆ --- │\n",
"│ u64 ┆ list[u32] ┆ u32 │\n",
"╞═════╪══════════════════╪═══════╡\n",
- "│ 0 ┆ [0, 1345, … 304] ┆ 4 │\n",
- "│ 1 ┆ [1, 6, 278] ┆ 3 │\n",
- "│ 2 ┆ [2, 934, 853] ┆ 3 │\n",
- "│ 3 ┆ [3, 1584, … 159] ┆ 5 │\n",
- "│ 4 ┆ [4, 1939, … 392] ┆ 5 │\n",
+ "│ 0 ┆ [0, 459, … 1058] ┆ 6 │\n",
+ "│ 1 ┆ [1] ┆ 1 │\n",
+ "│ 2 ┆ [2, 1077] ┆ 2 │\n",
+ "│ 3 ┆ [3, 104] ┆ 2 │\n",
+ "│ 4 ┆ [4, 781, … 650] ┆ 4 │\n",
"└─────┴──────────────────┴───────┘"
]
},
@@ -2439,7 +2439,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "/tmp/ipykernel_19683/3354819425.py:3: UserWarning: The compatibility layer is considered experimental.\n",
+ "/tmp/ipykernel_28864/3354819425.py:3: UserWarning: The compatibility layer is considered experimental.\n",
" from polars_ds.compat import compat as pds2\n"
]
},
@@ -2453,7 +2453,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 6)actual | predicted | 0-2 | 0-9 | s1 | s2 |
---|
f64 | f64 | i32 | i32 | str | str |
1.0 | 0.78908 | 0 | 1 | "7J" | "k" |
1.0 | 0.503485 | 1 | 5 | "S" | "yj" |
1.0 | 0.736868 | 2 | 8 | "iB" | "p" |
1.0 | 0.904397 | 1 | 4 | "R" | "Js" |
1.0 | 0.843379 | 1 | 2 | "A" | "WR" |
"
+ "shape: (5, 6)actual | predicted | 0-2 | 0-9 | s1 | s2 |
---|
f64 | f64 | i32 | i32 | str | str |
1.0 | 0.965653 | 2 | 6 | "I0" | "nR" |
0.0 | 0.44037 | 2 | 2 | "6" | "Mz" |
1.0 | 0.931955 | 1 | 2 | "1" | "kg" |
0.0 | 0.558197 | 1 | 4 | "R6" | "m" |
1.0 | 0.235535 | 2 | 1 | "bF" | "RO" |
"
],
"text/plain": [
"shape: (5, 6)\n",
@@ -2462,11 +2462,11 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ f64 ┆ f64 ┆ i32 ┆ i32 ┆ str ┆ str │\n",
"╞════════╪═══════════╪═════╪═════╪═════╪═════╡\n",
- "│ 1.0 ┆ 0.78908 ┆ 0 ┆ 1 ┆ 7J ┆ k │\n",
- "│ 1.0 ┆ 0.503485 ┆ 1 ┆ 5 ┆ S ┆ yj │\n",
- "│ 1.0 ┆ 0.736868 ┆ 2 ┆ 8 ┆ iB ┆ p │\n",
- "│ 1.0 ┆ 0.904397 ┆ 1 ┆ 4 ┆ R ┆ Js │\n",
- "│ 1.0 ┆ 0.843379 ┆ 1 ┆ 2 ┆ A ┆ WR │\n",
+ "│ 1.0 ┆ 0.965653 ┆ 2 ┆ 6 ┆ I0 ┆ nR │\n",
+ "│ 0.0 ┆ 0.44037 ┆ 2 ┆ 2 ┆ 6 ┆ Mz │\n",
+ "│ 1.0 ┆ 0.931955 ┆ 1 ┆ 2 ┆ 1 ┆ kg │\n",
+ "│ 0.0 ┆ 0.558197 ┆ 1 ┆ 4 ┆ R6 ┆ m │\n",
+ "│ 1.0 ┆ 0.235535 ┆ 2 ┆ 1 ┆ bF ┆ RO │\n",
"└────────┴───────────┴─────┴─────┴─────┴─────┘"
]
},
@@ -2553,7 +2553,7 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞═══════════╪══════════╪══════════╪═══════════════════╪══════════╡\n",
- "│ 0.503697 ┆ 0.500827 ┆ 0.502258 ┆ 0.50319 ┆ 0.501299 │\n",
+ "│ 0.499669 ┆ 0.499599 ┆ 0.499634 ┆ 0.497488 ┆ 0.498677 │\n",
"└───────────┴──────────┴──────────┴───────────────────┴──────────┘\n",
"shape: (1, 5)\n",
"┌───────────┬──────────┬──────────┬───────────────────┬──────────┐\n",
@@ -2561,7 +2561,7 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞═══════════╪══════════╪══════════╪═══════════════════╪══════════╡\n",
- "│ 0.503697 ┆ 0.500827 ┆ 0.502258 ┆ 0.50319 ┆ 0.501299 │\n",
+ "│ 0.499669 ┆ 0.499599 ┆ 0.499634 ┆ 0.497488 ┆ 0.498677 │\n",
"└───────────┴──────────┴──────────┴───────────────────┴──────────┘\n",
"shape: (1, 5)\n",
"┌───────────┬──────────┬──────────┬───────────────────┬──────────┐\n",
@@ -2569,7 +2569,7 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞═══════════╪══════════╪══════════╪═══════════════════╪══════════╡\n",
- "│ 0.503697 ┆ 0.500827 ┆ 0.502258 ┆ 0.50319 ┆ 0.501299 │\n",
+ "│ 0.499669 ┆ 0.499599 ┆ 0.499634 ┆ 0.497488 ┆ 0.498677 │\n",
"└───────────┴──────────┴──────────┴───────────────────┴──────────┘\n"
]
}
@@ -2599,7 +2599,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 4)<= | baseline_pct | actual_pct | psi_bin |
---|
f64 | f64 | f64 | f64 |
0.194464 | 0.2 | 0.187 | 0.000874 |
0.413897 | 0.2 | 0.207 | 0.000241 |
0.5999 | 0.2 | 0.19 | 0.000513 |
0.804026 | 0.2 | 0.202 | 0.00002 |
inf | 0.2 | 0.214 | 0.000947 |
"
+ "shape: (5, 4)<= | baseline_pct | actual_pct | psi_bin |
---|
f64 | f64 | f64 | f64 |
0.211304 | 0.2 | 0.215 | 0.001085 |
0.399705 | 0.2 | 0.195 | 0.000127 |
0.594605 | 0.2 | 0.187 | 0.000874 |
0.797354 | 0.2 | 0.189 | 0.000622 |
inf | 0.2 | 0.214 | 0.000947 |
"
],
"text/plain": [
"shape: (5, 4)\n",
@@ -2608,10 +2608,10 @@
"│ --- ┆ --- ┆ --- ┆ --- │\n",
"│ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞══════════╪══════════════╪════════════╪══════════╡\n",
- "│ 0.194464 ┆ 0.2 ┆ 0.187 ┆ 0.000874 │\n",
- "│ 0.413897 ┆ 0.2 ┆ 0.207 ┆ 0.000241 │\n",
- "│ 0.5999 ┆ 0.2 ┆ 0.19 ┆ 0.000513 │\n",
- "│ 0.804026 ┆ 0.2 ┆ 0.202 ┆ 0.00002 │\n",
+ "│ 0.211304 ┆ 0.2 ┆ 0.215 ┆ 0.001085 │\n",
+ "│ 0.399705 ┆ 0.2 ┆ 0.195 ┆ 0.000127 │\n",
+ "│ 0.594605 ┆ 0.2 ┆ 0.187 ┆ 0.000874 │\n",
+ "│ 0.797354 ┆ 0.2 ┆ 0.189 ┆ 0.000622 │\n",
"│ inf ┆ 0.2 ┆ 0.214 ┆ 0.000947 │\n",
"└──────────┴──────────────┴────────────┴──────────┘"
]
@@ -2647,13 +2647,13 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (1,)"
+ "shape: (1,)"
],
"text/plain": [
"shape: (1,)\n",
"Series: 'cid_ce' [f64]\n",
"[\n",
- "\t12.900387\n",
+ "\t13.128145\n",
"]"
]
},
@@ -2684,13 +2684,13 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (1,)"
+ "shape: (1,)"
],
"text/plain": [
"shape: (1,)\n",
"Series: 'c3_stats' [f64]\n",
"[\n",
- "\t0.123215\n",
+ "\t0.11619\n",
"]"
]
},
@@ -2722,7 +2722,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (10,)"
+ "shape: (10,)"
],
"text/plain": [
"shape: (10,)\n",
@@ -2734,8 +2734,8 @@
"\t2\n",
"\t2\n",
"\t2\n",
- "\t1\n",
"\t2\n",
+ "\t1\n",
"\t2\n",
"\t2\n",
"]"
diff --git a/examples/pipeline.ipynb b/examples/pipeline.ipynb
index afaca423..86796d34 100644
--- a/examples/pipeline.ipynb
+++ b/examples/pipeline.ipynb
@@ -190,7 +190,7 @@
"\n",
"\n",
"Step 3:\n",
- "col(\"city_category\").is_not_null().all_horizontal()\n",
+ "col(\"city_category\").is_not_null()\n",
"\n",
"Step 4:\n",
"selector\n",
@@ -237,10 +237,10 @@
"col(\"employer_category1\").replace_strict([Series[value], Series[to], null])\n",
"\n",
"Step 14:\n",
- "dtype_columns([UInt32, UInt8, Int32, Int8, UInt64, UInt16, Int64, Int16]).shrink_dtype()\n",
+ "dtype_columns([Int16, UInt32, UInt8, Int32, UInt64, Int8, UInt16, Int64]).shrink_dtype()\n",
"\n",
"Step 15:\n",
- "dtype_columns([Float32, Float64]).strict_cast(Float32)\n"
+ "dtype_columns([Float64, Float32]).strict_cast(Float32)\n"
]
},
"execution_count": 5,
@@ -461,7 +461,7 @@
"\n",
"\n",
"Step 3:\n",
- "col(\"city_category\").is_not_null().all_horizontal()\n",
+ "col(\"city_category\").is_not_null()\n",
"\n",
"Step 4:\n",
"selector\n",
@@ -508,10 +508,10 @@
"col(\"employer_category1\").replace_strict([Series[value], Series[to], null])\n",
"\n",
"Step 14:\n",
- "dtype_columns([UInt32, UInt8, Int32, Int8, UInt64, UInt16, Int64, Int16]).shrink_dtype()\n",
+ "dtype_columns([Int16, UInt32, UInt8, Int32, UInt64, Int8, UInt16, Int64]).shrink_dtype()\n",
"\n",
"Step 15:\n",
- "dtype_columns([Float32, Float64]).strict_cast(Float32)\n"
+ "dtype_columns([Float64, Float32]).strict_cast(Float32)\n"
]
},
"execution_count": 10,
@@ -662,9 +662,9 @@
" '{\"Alias\":[{\"Column\":\"Var1\"},\"var1\"]}',\n",
" '{\"Alias\":[{\"Column\":\"Approved\"},\"approved\"]}']},\n",
" {'SQLStep': \"\\nselect\\n*\\n, 'TEST' as test_col\\nfrom df\\nwhere loan_amount is not null\\n\"},\n",
- " {'FilterStep': ['{\"Function\":{\"input\":[{\"Function\":{\"input\":[{\"Column\":\"city_category\"}],\"function\":{\"Boolean\":\"IsNotNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}],\"function\":{\"Boolean\":\"AllHorizontal\"},\"options\":{\"collect_groups\":\"GroupWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE | INPUT_WILDCARD_EXPANSION | ALLOW_EMPTY_INPUTS\"}}}']},\n",
- " {'SelectStep': ['{\"Selector\":{\"Add\":[{\"Root\":{\"DtypeColumn\":[{\"Decimal\":[null,null]},\"Float32\",\"UInt32\",\"UInt8\",\"Int32\",\"Int8\",\"Float64\",\"UInt64\",\"UInt16\",\"Int64\",\"Int16\"]}},{\"Root\":{\"Columns\":[\"gender\",\"employer_category1\",\"city_category\",\"test_col\"]}}]}}']},\n",
- " {'WithColumnsStep': ['{\"Alias\":[{\"Ternary\":{\"predicate\":{\"Function\":{\"input\":[{\"Column\":\"loan_period\"}],\"function\":{\"Boolean\":\"IsNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"truthy\":{\"Function\":{\"input\":[{\"BinaryExpr\":{\"left\":{\"Column\":\"var1\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":0.5098100117596632}}}},{\"BinaryExpr\":{\"left\":{\"Column\":\"existing_emi\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":-7.604079653752158e-6}}}}],\"function\":\"SumHorizontal\",\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE | INPUT_WILDCARD_EXPANSION\"}}},\"falsy\":{\"Cast\":{\"expr\":{\"Column\":\"loan_period\"},\"dtype\":\"Float64\",\"options\":\"Strict\"}}}},\"loan_period\"]}']},\n",
+ " {'FilterStep': ['{\"Function\":{\"input\":[{\"Column\":\"city_category\"}],\"function\":{\"Boolean\":\"IsNotNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n",
+ " {'SelectStep': ['{\"Selector\":{\"Add\":[{\"Root\":{\"DtypeColumn\":[\"Int16\",\"UInt32\",\"UInt8\",\"Float64\",\"Int32\",\"UInt64\",\"Int8\",\"UInt16\",\"Int64\",{\"Decimal\":[null,null]},\"Float32\"]}},{\"Root\":{\"Columns\":[\"gender\",\"employer_category1\",\"city_category\",\"test_col\"]}}]}}']},\n",
+ " {'WithColumnsStep': ['{\"Alias\":[{\"Ternary\":{\"predicate\":{\"Function\":{\"input\":[{\"Column\":\"loan_period\"}],\"function\":{\"Boolean\":\"IsNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"truthy\":{\"Function\":{\"input\":[{\"BinaryExpr\":{\"left\":{\"Column\":\"var1\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":0.5098100117596667}}}},{\"BinaryExpr\":{\"left\":{\"Column\":\"existing_emi\"},\"op\":\"Multiply\",\"right\":{\"Literal\":{\"Float\":-7.6040796537530525e-6}}}}],\"function\":{\"SumHorizontal\":{\"ignore_nulls\":true}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE | INPUT_WILDCARD_EXPANSION\"}}},\"falsy\":{\"Cast\":{\"expr\":{\"Column\":\"loan_period\"},\"dtype\":\"Float64\",\"options\":\"Strict\"}}}},\"loan_period\"]}']},\n",
" {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"existing_emi\"},{\"Literal\":{\"Float\":0.0}}],\"function\":\"FillNull\",\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n",
" {'WithColumnsStep': ['{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"existing_emi\"}],\"function\":\"Log1p\",\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"existing_emi_log1p\"]}',\n",
" '{\"Alias\":[{\"Function\":{\"input\":[{\"Column\":\"loan_amount\"}],\"function\":\"Log1p\",\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"loan_amount_log1p\"]}',\n",
@@ -685,10 +685,10 @@
" {'WithColumnsStep': ['{\"Alias\":[{\"Cast\":{\"expr\":{\"Function\":{\"input\":[{\"Column\":\"employer_category1\"}],\"function\":{\"Boolean\":\"IsNull\"},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}},\"dtype\":\"UInt8\",\"options\":\"Strict\"}},\"employer_category1_is_missing\"]}']},\n",
" {'WithColumnsStep': ['{\"Alias\":[{\"Cast\":{\"expr\":{\"BinaryExpr\":{\"left\":{\"Column\":\"gender\"},\"op\":\"EqValidity\",\"right\":{\"Literal\":{\"String\":\"Male\"}}}},\"dtype\":\"UInt8\",\"options\":\"Strict\"}},\"gender_Male\"]}']},\n",
" {'SelectStep': ['{\"Exclude\":[\"Wildcard\",[{\"Name\":\"gender\"}]]}']},\n",
- " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"city_category\"},{\"Literal\":{\"Series\":{\"name\":\"value\",\"datatype\":\"String\",\"bit_settings\":\"\",\"values\":[\"B\",\"A\",\"C\"]}}},{\"Literal\":{\"Series\":{\"name\":\"woe\",\"datatype\":\"Float64\",\"bit_settings\":\"\",\"values\":[-0.04647519483535344,0.0809586180645928,-0.47955283435510176]}}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n",
- " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"employer_category1\"},{\"Literal\":{\"Series\":{\"name\":\"value\",\"datatype\":\"String\",\"bit_settings\":\"\",\"values\":[\"A\",\"C\",\"B\"]}}},{\"Literal\":{\"Series\":{\"name\":\"to\",\"datatype\":\"Float64\",\"bit_settings\":\"\",\"values\":[0.014736842105263158,0.02660307366189719,0.024335548172757474]}}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n",
- " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"DtypeColumn\":[\"UInt32\",\"UInt8\",\"Int32\",\"Int8\",\"UInt64\",\"UInt16\",\"Int64\",\"Int16\"]}],\"function\":\"ShrinkType\",\"options\":{\"collect_groups\":\"GroupWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n",
- " {'WithColumnsStep': ['{\"Cast\":{\"expr\":{\"DtypeColumn\":[\"Float32\",\"Float64\"]},\"dtype\":\"Float32\",\"options\":\"Strict\"}}']}],\n",
+ " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"city_category\"},{\"Literal\":{\"Series\":{\"name\":\"value\",\"datatype\":\"String\",\"bit_settings\":\"\",\"values\":[\"A\",\"C\",\"B\"]}}},{\"Literal\":{\"Series\":{\"name\":\"woe\",\"datatype\":\"Float64\",\"bit_settings\":\"\",\"values\":[0.0809586180645928,-0.47955283435510176,-0.04647519483535344]}}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n",
+ " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"Column\":\"employer_category1\"},{\"Literal\":{\"Series\":{\"name\":\"value\",\"datatype\":\"String\",\"bit_settings\":\"\",\"values\":[\"B\",\"C\",\"A\"]}}},{\"Literal\":{\"Series\":{\"name\":\"to\",\"datatype\":\"Float64\",\"bit_settings\":\"\",\"values\":[0.024335548172757474,0.02660307366189719,0.014736842105263158]}}},{\"Literal\":\"Null\"}],\"function\":{\"ReplaceStrict\":{\"return_dtype\":null}},\"options\":{\"collect_groups\":\"ElementWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n",
+ " {'WithColumnsStep': ['{\"Function\":{\"input\":[{\"DtypeColumn\":[\"Int16\",\"UInt32\",\"UInt8\",\"Int32\",\"UInt64\",\"Int8\",\"UInt16\",\"Int64\"]}],\"function\":\"ShrinkType\",\"options\":{\"collect_groups\":\"GroupWise\",\"fmt_str\":\"\",\"check_lengths\":true,\"flags\":\"ALLOW_GROUP_AWARE\"}}}']},\n",
+ " {'WithColumnsStep': ['{\"Cast\":{\"expr\":{\"DtypeColumn\":[\"Float64\",\"Float32\"]},\"dtype\":\"Float32\",\"options\":\"Strict\"}}']}],\n",
" 'ensure_features_in': False,\n",
" 'ensure_features_out': True}"
]
diff --git a/examples/sample_and_split.ipynb b/examples/sample_and_split.ipynb
index 1e8d32db..8eb12e1c 100644
--- a/examples/sample_and_split.ipynb
+++ b/examples/sample_and_split.ipynb
@@ -42,7 +42,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (5, 8)row_num | uniform_1 | uniform_2 | exp | normal | fat_normal | flags | category |
---|
i64 | f64 | f64 | f64 | f64 | f64 | i32 | str |
0 | 3.169978 | 0.183096 | 0.848878 | -0.988939 | 369.76195 | 2 | "A" |
1 | 8.810768 | 0.569672 | 0.048483 | -0.44255 | 258.012662 | 0 | "A" |
2 | 3.274063 | 0.632772 | 0.447468 | 0.255512 | -1284.389879 | 1 | "A" |
3 | 10.847672 | 0.89006 | 0.772062 | 0.735149 | -0.362983 | 0 | "A" |
4 | 11.66482 | 0.907167 | 1.393929 | 2.285448 | -2031.321622 | 0 | "A" |
"
+ "shape: (5, 8)row_num | uniform_1 | uniform_2 | exp | normal | fat_normal | flags | category |
---|
i64 | f64 | f64 | f64 | f64 | f64 | i32 | str |
0 | 5.355462 | 0.227585 | 0.875413 | 1.255306 | -1534.296075 | 0 | "A" |
1 | 3.143742 | 0.651711 | 2.12331 | -0.27767 | 544.798771 | 0 | "A" |
2 | 9.585138 | 0.720147 | 1.04885 | 0.01982 | 2388.724441 | 0 | "A" |
3 | 11.73043 | 0.059602 | 3.624234 | -1.177224 | 442.397518 | 0 | "A" |
4 | 1.310415 | 0.783836 | 3.70326 | 1.501242 | 189.064492 | 2 | "A" |
"
],
"text/plain": [
"shape: (5, 8)\n",
@@ -51,11 +51,11 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n",
"╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n",
- "│ 0 ┆ 3.169978 ┆ 0.183096 ┆ 0.848878 ┆ -0.988939 ┆ 369.76195 ┆ 2 ┆ A │\n",
- "│ 1 ┆ 8.810768 ┆ 0.569672 ┆ 0.048483 ┆ -0.44255 ┆ 258.012662 ┆ 0 ┆ A │\n",
- "│ 2 ┆ 3.274063 ┆ 0.632772 ┆ 0.447468 ┆ 0.255512 ┆ -1284.389879 ┆ 1 ┆ A │\n",
- "│ 3 ┆ 10.847672 ┆ 0.89006 ┆ 0.772062 ┆ 0.735149 ┆ -0.362983 ┆ 0 ┆ A │\n",
- "│ 4 ┆ 11.66482 ┆ 0.907167 ┆ 1.393929 ┆ 2.285448 ┆ -2031.321622 ┆ 0 ┆ A │\n",
+ "│ 0 ┆ 5.355462 ┆ 0.227585 ┆ 0.875413 ┆ 1.255306 ┆ -1534.296075 ┆ 0 ┆ A │\n",
+ "│ 1 ┆ 3.143742 ┆ 0.651711 ┆ 2.12331 ┆ -0.27767 ┆ 544.798771 ┆ 0 ┆ A │\n",
+ "│ 2 ┆ 9.585138 ┆ 0.720147 ┆ 1.04885 ┆ 0.01982 ┆ 2388.724441 ┆ 0 ┆ A │\n",
+ "│ 3 ┆ 11.73043 ┆ 0.059602 ┆ 3.624234 ┆ -1.177224 ┆ 442.397518 ┆ 0 ┆ A │\n",
+ "│ 4 ┆ 1.310415 ┆ 0.783836 ┆ 3.70326 ┆ 1.501242 ┆ 189.064492 ┆ 2 ┆ A │\n",
"└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘"
]
},
@@ -85,7 +85,7 @@
{
"data": {
"text/plain": [
- "['row_num', 'normal', 'flags']"
+ "['row_num', 'uniform_2', 'exp']"
]
},
"execution_count": 3,
@@ -94,7 +94,7 @@
}
],
"source": [
- "sa.random_cols(df, 2, keep = [\"row_num\"])"
+ "sa.random_cols(df.columns, 2, keep = [\"row_num\"])"
]
},
{
@@ -112,7 +112,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (60_000, 8)row_num | uniform_1 | uniform_2 | exp | normal | fat_normal | flags | category |
---|
i64 | f64 | f64 | f64 | f64 | f64 | i32 | str |
1 | 8.810768 | 0.569672 | 0.048483 | -0.44255 | 258.012662 | 0 | "A" |
2 | 3.274063 | 0.632772 | 0.447468 | 0.255512 | -1284.389879 | 1 | "A" |
4 | 11.66482 | 0.907167 | 1.393929 | 2.285448 | -2031.321622 | 0 | "A" |
6 | 1.522247 | 0.626331 | 0.460844 | -0.060739 | 1487.444343 | 1 | "A" |
7 | 3.93548 | 0.363229 | 2.002222 | -0.613627 | -335.203183 | 0 | "A" |
… | … | … | … | … | … | … | … |
99991 | 3.808594 | 0.693914 | 6.727779 | -0.781093 | -868.307031 | 2 | "C" |
99994 | 6.246362 | 0.99597 | 3.468162 | -0.699768 | -145.471814 | 1 | "C" |
99996 | 0.520435 | 0.758179 | 0.680518 | 0.788875 | -3203.56896 | 2 | "C" |
99997 | 6.250958 | 0.762393 | 0.08691 | 1.79754 | 696.859327 | 1 | "C" |
99998 | 4.491091 | 0.396969 | 0.012585 | 2.024051 | -2468.859815 | 2 | "C" |
"
+ "shape: (60_000, 8)row_num | uniform_1 | uniform_2 | exp | normal | fat_normal | flags | category |
---|
i64 | f64 | f64 | f64 | f64 | f64 | i32 | str |
1 | 3.143742 | 0.651711 | 2.12331 | -0.27767 | 544.798771 | 0 | "A" |
2 | 9.585138 | 0.720147 | 1.04885 | 0.01982 | 2388.724441 | 0 | "A" |
6 | 0.189662 | 0.0651 | 1.316939 | -0.244435 | 748.995179 | 0 | "A" |
7 | 0.661346 | 0.874092 | 4.843038 | 0.31243 | -383.659135 | 0 | "A" |
8 | 0.053801 | 0.983342 | 0.452362 | 0.312257 | -386.689719 | 0 | "A" |
… | … | … | … | … | … | … | … |
99994 | 7.536122 | 0.11414 | 2.847801 | -0.916853 | -1340.111513 | 2 | "C" |
99996 | 10.030577 | 0.939568 | 0.987719 | 0.701578 | -768.062655 | 0 | "C" |
99997 | 5.118598 | 0.552395 | 2.390273 | -2.57956 | -1076.610099 | 0 | "C" |
99998 | 5.701428 | 0.521572 | 1.290974 | -1.361779 | 5.278061 | 1 | "C" |
99999 | 7.946039 | 0.225155 | 2.564999 | 0.367505 | -1021.479937 | 1 | "C" |
"
],
"text/plain": [
"shape: (60_000, 8)\n",
@@ -121,17 +121,17 @@
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n",
"╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n",
- "│ 1 ┆ 8.810768 ┆ 0.569672 ┆ 0.048483 ┆ -0.44255 ┆ 258.012662 ┆ 0 ┆ A │\n",
- "│ 2 ┆ 3.274063 ┆ 0.632772 ┆ 0.447468 ┆ 0.255512 ┆ -1284.389879 ┆ 1 ┆ A │\n",
- "│ 4 ┆ 11.66482 ┆ 0.907167 ┆ 1.393929 ┆ 2.285448 ┆ -2031.321622 ┆ 0 ┆ A │\n",
- "│ 6 ┆ 1.522247 ┆ 0.626331 ┆ 0.460844 ┆ -0.060739 ┆ 1487.444343 ┆ 1 ┆ A │\n",
- "│ 7 ┆ 3.93548 ┆ 0.363229 ┆ 2.002222 ┆ -0.613627 ┆ -335.203183 ┆ 0 ┆ A │\n",
+ "│ 1 ┆ 3.143742 ┆ 0.651711 ┆ 2.12331 ┆ -0.27767 ┆ 544.798771 ┆ 0 ┆ A │\n",
+ "│ 2 ┆ 9.585138 ┆ 0.720147 ┆ 1.04885 ┆ 0.01982 ┆ 2388.724441 ┆ 0 ┆ A │\n",
+ "│ 6 ┆ 0.189662 ┆ 0.0651 ┆ 1.316939 ┆ -0.244435 ┆ 748.995179 ┆ 0 ┆ A │\n",
+ "│ 7 ┆ 0.661346 ┆ 0.874092 ┆ 4.843038 ┆ 0.31243 ┆ -383.659135 ┆ 0 ┆ A │\n",
+ "│ 8 ┆ 0.053801 ┆ 0.983342 ┆ 0.452362 ┆ 0.312257 ┆ -386.689719 ┆ 0 ┆ A │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
- "│ 99991 ┆ 3.808594 ┆ 0.693914 ┆ 6.727779 ┆ -0.781093 ┆ -868.307031 ┆ 2 ┆ C │\n",
- "│ 99994 ┆ 6.246362 ┆ 0.99597 ┆ 3.468162 ┆ -0.699768 ┆ -145.471814 ┆ 1 ┆ C │\n",
- "│ 99996 ┆ 0.520435 ┆ 0.758179 ┆ 0.680518 ┆ 0.788875 ┆ -3203.56896 ┆ 2 ┆ C │\n",
- "│ 99997 ┆ 6.250958 ┆ 0.762393 ┆ 0.08691 ┆ 1.79754 ┆ 696.859327 ┆ 1 ┆ C │\n",
- "│ 99998 ┆ 4.491091 ┆ 0.396969 ┆ 0.012585 ┆ 2.024051 ┆ -2468.859815 ┆ 2 ┆ C │\n",
+ "│ 99994 ┆ 7.536122 ┆ 0.11414 ┆ 2.847801 ┆ -0.916853 ┆ -1340.111513 ┆ 2 ┆ C │\n",
+ "│ 99996 ┆ 10.030577 ┆ 0.939568 ┆ 0.987719 ┆ 0.701578 ┆ -768.062655 ┆ 0 ┆ C │\n",
+ "│ 99997 ┆ 5.118598 ┆ 0.552395 ┆ 2.390273 ┆ -2.57956 ┆ -1076.610099 ┆ 0 ┆ C │\n",
+ "│ 99998 ┆ 5.701428 ┆ 0.521572 ┆ 1.290974 ┆ -1.361779 ┆ 5.278061 ┆ 1 ┆ C │\n",
+ "│ 99999 ┆ 7.946039 ┆ 0.225155 ┆ 2.564999 ┆ 0.367505 ┆ -1021.479937 ┆ 1 ┆ C │\n",
"└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘"
]
},
@@ -160,27 +160,27 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (30_000, 8)row_num | uniform_1 | uniform_2 | exp | normal | fat_normal | flags | category |
---|
i64 | f64 | f64 | f64 | f64 | f64 | i32 | str |
0 | 3.169978 | 0.183096 | 0.848878 | -0.988939 | 369.76195 | 2 | "A" |
12 | 2.718784 | 0.236327 | 0.656341 | 2.042461 | 992.106646 | 0 | "A" |
13 | 5.688242 | 0.238128 | 1.989903 | -1.890975 | 96.609098 | 1 | "A" |
16 | 10.630157 | 0.685417 | 2.040244 | -0.411343 | -80.440654 | 2 | "A" |
18 | 6.133318 | 0.868581 | 3.786928 | -0.853489 | -824.372864 | 1 | "A" |
… | … | … | … | … | … | … | … |
99976 | 7.686265 | 0.037178 | 9.872401 | 0.002709 | 1013.823443 | 2 | "C" |
99981 | 9.040585 | 0.272563 | 0.423536 | -0.365252 | -718.151462 | 1 | "C" |
99985 | 8.940385 | 0.856215 | 2.355023 | 0.609717 | -34.944096 | 0 | "C" |
99986 | 6.501358 | 0.676297 | 1.185671 | -0.284971 | 583.365443 | 1 | "C" |
99996 | 0.520435 | 0.758179 | 0.680518 | 0.788875 | -3203.56896 | 2 | "C" |
"
+ "shape: (30_000, 8)row_num | uniform_1 | uniform_2 | exp | normal | fat_normal | flags | category |
---|
i64 | f64 | f64 | f64 | f64 | f64 | i32 | str |
10 | 9.781623 | 0.563868 | 4.488553 | 0.123101 | 1628.818496 | 1 | "A" |
11 | 4.508328 | 0.594697 | 3.877757 | 0.849688 | -1242.37697 | 1 | "A" |
14 | 1.702338 | 0.776305 | 1.346987 | 0.481826 | -403.30214 | 2 | "A" |
19 | 11.897234 | 0.55035 | 1.791477 | 0.861923 | 641.532776 | 2 | "A" |
22 | 4.077515 | 0.737717 | 1.093235 | 1.048444 | 1269.183071 | 2 | "A" |
… | … | … | … | … | … | … | … |
99989 | 5.26012 | 0.479069 | 0.748342 | -0.224175 | -84.266224 | 1 | "C" |
99994 | 7.536122 | 0.11414 | 2.847801 | -0.916853 | -1340.111513 | 2 | "C" |
99995 | 10.490682 | 0.611692 | 0.384882 | -0.474915 | 157.011096 | 2 | "C" |
99996 | 10.030577 | 0.939568 | 0.987719 | 0.701578 | -768.062655 | 0 | "C" |
99998 | 5.701428 | 0.521572 | 1.290974 | -1.361779 | 5.278061 | 1 | "C" |
"
],
"text/plain": [
"shape: (30_000, 8)\n",
- "┌─────────┬───────────┬───────────┬──────────┬───────────┬─────────────┬───────┬──────────┐\n",
- "│ row_num ┆ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ flags ┆ category │\n",
- "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
- "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n",
- "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪═════════════╪═══════╪══════════╡\n",
- "│ 0 ┆ 3.169978 ┆ 0.183096 ┆ 0.848878 ┆ -0.988939 ┆ 369.76195 ┆ 2 ┆ A │\n",
- "│ 12 ┆ 2.718784 ┆ 0.236327 ┆ 0.656341 ┆ 2.042461 ┆ 992.106646 ┆ 0 ┆ A │\n",
- "│ 13 ┆ 5.688242 ┆ 0.238128 ┆ 1.989903 ┆ -1.890975 ┆ 96.609098 ┆ 1 ┆ A │\n",
- "│ 16 ┆ 10.630157 ┆ 0.685417 ┆ 2.040244 ┆ -0.411343 ┆ -80.440654 ┆ 2 ┆ A │\n",
- "│ 18 ┆ 6.133318 ┆ 0.868581 ┆ 3.786928 ┆ -0.853489 ┆ -824.372864 ┆ 1 ┆ A │\n",
- "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
- "│ 99976 ┆ 7.686265 ┆ 0.037178 ┆ 9.872401 ┆ 0.002709 ┆ 1013.823443 ┆ 2 ┆ C │\n",
- "│ 99981 ┆ 9.040585 ┆ 0.272563 ┆ 0.423536 ┆ -0.365252 ┆ -718.151462 ┆ 1 ┆ C │\n",
- "│ 99985 ┆ 8.940385 ┆ 0.856215 ┆ 2.355023 ┆ 0.609717 ┆ -34.944096 ┆ 0 ┆ C │\n",
- "│ 99986 ┆ 6.501358 ┆ 0.676297 ┆ 1.185671 ┆ -0.284971 ┆ 583.365443 ┆ 1 ┆ C │\n",
- "│ 99996 ┆ 0.520435 ┆ 0.758179 ┆ 0.680518 ┆ 0.788875 ┆ -3203.56896 ┆ 2 ┆ C │\n",
- "└─────────┴───────────┴───────────┴──────────┴───────────┴─────────────┴───────┴──────────┘"
+ "┌─────────┬───────────┬───────────┬──────────┬───────────┬──────────────┬───────┬──────────┐\n",
+ "│ row_num ┆ uniform_1 ┆ uniform_2 ┆ exp ┆ normal ┆ fat_normal ┆ flags ┆ category │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ i32 ┆ str │\n",
+ "╞═════════╪═══════════╪═══════════╪══════════╪═══════════╪══════════════╪═══════╪══════════╡\n",
+ "│ 10 ┆ 9.781623 ┆ 0.563868 ┆ 4.488553 ┆ 0.123101 ┆ 1628.818496 ┆ 1 ┆ A │\n",
+ "│ 11 ┆ 4.508328 ┆ 0.594697 ┆ 3.877757 ┆ 0.849688 ┆ -1242.37697 ┆ 1 ┆ A │\n",
+ "│ 14 ┆ 1.702338 ┆ 0.776305 ┆ 1.346987 ┆ 0.481826 ┆ -403.30214 ┆ 2 ┆ A │\n",
+ "│ 19 ┆ 11.897234 ┆ 0.55035 ┆ 1.791477 ┆ 0.861923 ┆ 641.532776 ┆ 2 ┆ A │\n",
+ "│ 22 ┆ 4.077515 ┆ 0.737717 ┆ 1.093235 ┆ 1.048444 ┆ 1269.183071 ┆ 2 ┆ A │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ 99989 ┆ 5.26012 ┆ 0.479069 ┆ 0.748342 ┆ -0.224175 ┆ -84.266224 ┆ 1 ┆ C │\n",
+ "│ 99994 ┆ 7.536122 ┆ 0.11414 ┆ 2.847801 ┆ -0.916853 ┆ -1340.111513 ┆ 2 ┆ C │\n",
+ "│ 99995 ┆ 10.490682 ┆ 0.611692 ┆ 0.384882 ┆ -0.474915 ┆ 157.011096 ┆ 2 ┆ C │\n",
+ "│ 99996 ┆ 10.030577 ┆ 0.939568 ┆ 0.987719 ┆ 0.701578 ┆ -768.062655 ┆ 0 ┆ C │\n",
+ "│ 99998 ┆ 5.701428 ┆ 0.521572 ┆ 1.290974 ┆ -1.361779 ┆ 5.278061 ┆ 1 ┆ C │\n",
+ "└─────────┴───────────┴───────────┴──────────┴───────────┴──────────────┴───────┴──────────┘"
]
},
"execution_count": 5,
@@ -207,7 +207,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (3, 2)flags | len |
---|
i32 | u32 |
0 | 33465 |
1 | 33331 |
2 | 33204 |
"
+ "shape: (3, 2)flags | len |
---|
i32 | u32 |
0 | 33381 |
1 | 33169 |
2 | 33450 |
"
],
"text/plain": [
"shape: (3, 2)\n",
@@ -216,9 +216,9 @@
"│ --- ┆ --- │\n",
"│ i32 ┆ u32 │\n",
"╞═══════╪═══════╡\n",
- "│ 0 ┆ 33465 │\n",
- "│ 1 ┆ 33331 │\n",
- "│ 2 ┆ 33204 │\n",
+ "│ 0 ┆ 33381 │\n",
+ "│ 1 ┆ 33169 │\n",
+ "│ 2 ┆ 33450 │\n",
"└───────┴───────┘"
]
},
@@ -246,7 +246,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (3, 2)flags | len |
---|
i32 | u32 |
0 | 16732 |
1 | 33331 |
2 | 33204 |
"
+ "shape: (3, 2)flags | len |
---|
i32 | u32 |
0 | 16690 |
1 | 33169 |
2 | 33450 |
"
],
"text/plain": [
"shape: (3, 2)\n",
@@ -255,9 +255,9 @@
"│ --- ┆ --- │\n",
"│ i32 ┆ u32 │\n",
"╞═══════╪═══════╡\n",
- "│ 0 ┆ 16732 │\n",
- "│ 1 ┆ 33331 │\n",
- "│ 2 ┆ 33204 │\n",
+ "│ 0 ┆ 16690 │\n",
+ "│ 1 ┆ 33169 │\n",
+ "│ 2 ┆ 33450 │\n",
"└───────┴───────┘"
]
},
@@ -290,7 +290,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (3, 2)flags | len |
---|
i32 | u32 |
0 | 16732 |
1 | 9999 |
2 | 13281 |
"
+ "shape: (3, 2)flags | len |
---|
i32 | u32 |
0 | 16690 |
1 | 9950 |
2 | 13380 |
"
],
"text/plain": [
"shape: (3, 2)\n",
@@ -299,9 +299,9 @@
"│ --- ┆ --- │\n",
"│ i32 ┆ u32 │\n",
"╞═══════╪═══════╡\n",
- "│ 0 ┆ 16732 │\n",
- "│ 1 ┆ 9999 │\n",
- "│ 2 ┆ 13281 │\n",
+ "│ 0 ┆ 16690 │\n",
+ "│ 1 ┆ 9950 │\n",
+ "│ 2 ┆ 13380 │\n",
"└───────┴───────┘"
]
},
@@ -464,7 +464,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (3, 2)category | len |
---|
str | u32 |
"A" | 10000 |
"B" | 4285 |
"C" | 5715 |
"
+ "shape: (3, 2)category | len |
---|
str | u32 |
"A" | 10000 |
"B" | 4220 |
"C" | 5780 |
"
],
"text/plain": [
"shape: (3, 2)\n",
@@ -474,8 +474,8 @@
"│ str ┆ u32 │\n",
"╞══════════╪═══════╡\n",
"│ A ┆ 10000 │\n",
- "│ B ┆ 4285 │\n",
- "│ C ┆ 5715 │\n",
+ "│ B ┆ 4220 │\n",
+ "│ C ┆ 5780 │\n",
"└──────────┴───────┘"
]
},
@@ -509,7 +509,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (9, 3)category | flags | len |
---|
str | i32 | u32 |
"A" | 0 | 9960 |
"A" | 1 | 9960 |
"A" | 2 | 9960 |
"B" | 0 | 9962 |
"B" | 1 | 9962 |
"B" | 2 | 9962 |
"C" | 0 | 13223 |
"C" | 1 | 13223 |
"C" | 2 | 13223 |
"
+ "shape: (9, 3)category | flags | len |
---|
str | i32 | u32 |
"A" | 0 | 9917 |
"A" | 1 | 9917 |
"A" | 2 | 9917 |
"B" | 0 | 9848 |
"B" | 1 | 9848 |
"B" | 2 | 9848 |
"C" | 0 | 13262 |
"C" | 1 | 13262 |
"C" | 2 | 13262 |
"
],
"text/plain": [
"shape: (9, 3)\n",
@@ -518,15 +518,15 @@
"│ --- ┆ --- ┆ --- │\n",
"│ str ┆ i32 ┆ u32 │\n",
"╞══════════╪═══════╪═══════╡\n",
- "│ A ┆ 0 ┆ 9960 │\n",
- "│ A ┆ 1 ┆ 9960 │\n",
- "│ A ┆ 2 ┆ 9960 │\n",
- "│ B ┆ 0 ┆ 9962 │\n",
- "│ B ┆ 1 ┆ 9962 │\n",
- "│ B ┆ 2 ┆ 9962 │\n",
- "│ C ┆ 0 ┆ 13223 │\n",
- "│ C ┆ 1 ┆ 13223 │\n",
- "│ C ┆ 2 ┆ 13223 │\n",
+ "│ A ┆ 0 ┆ 9917 │\n",
+ "│ A ┆ 1 ┆ 9917 │\n",
+ "│ A ┆ 2 ┆ 9917 │\n",
+ "│ B ┆ 0 ┆ 9848 │\n",
+ "│ B ┆ 1 ┆ 9848 │\n",
+ "│ B ┆ 2 ┆ 9848 │\n",
+ "│ C ┆ 0 ┆ 13262 │\n",
+ "│ C ┆ 1 ┆ 13262 │\n",
+ "│ C ┆ 2 ┆ 13262 │\n",
"└──────────┴───────┴───────┘"
]
},
@@ -561,7 +561,7 @@
" white-space: pre-wrap;\n",
"}\n",
"\n",
- "shape: (9, 3)category | flags | len |
---|
str | i32 | u32 |
"A" | 0 | 9960 |
"A" | 1 | 9960 |
"A" | 2 | 9960 |
"B" | 0 | 9962 |
"B" | 1 | 9962 |
"B" | 2 | 9962 |
"C" | 0 | 10000 |
"C" | 1 | 10000 |
"C" | 2 | 10000 |
"
+ "shape: (9, 3)category | flags | len |
---|
str | i32 | u32 |
"A" | 0 | 9917 |
"A" | 1 | 9917 |
"A" | 2 | 9917 |
"B" | 0 | 9848 |
"B" | 1 | 9848 |
"B" | 2 | 9848 |
"C" | 0 | 10000 |
"C" | 1 | 10000 |
"C" | 2 | 10000 |
"
],
"text/plain": [
"shape: (9, 3)\n",
@@ -570,12 +570,12 @@
"│ --- ┆ --- ┆ --- │\n",
"│ str ┆ i32 ┆ u32 │\n",
"╞══════════╪═══════╪═══════╡\n",
- "│ A ┆ 0 ┆ 9960 │\n",
- "│ A ┆ 1 ┆ 9960 │\n",
- "│ A ┆ 2 ┆ 9960 │\n",
- "│ B ┆ 0 ┆ 9962 │\n",
- "│ B ┆ 1 ┆ 9962 │\n",
- "│ B ┆ 2 ┆ 9962 │\n",
+ "│ A ┆ 0 ┆ 9917 │\n",
+ "│ A ┆ 1 ┆ 9917 │\n",
+ "│ A ┆ 2 ┆ 9917 │\n",
+ "│ B ┆ 0 ┆ 9848 │\n",
+ "│ B ┆ 1 ┆ 9848 │\n",
+ "│ B ┆ 2 ┆ 9848 │\n",
"│ C ┆ 0 ┆ 10000 │\n",
"│ C ┆ 1 ┆ 10000 │\n",
"│ C ┆ 2 ┆ 10000 │\n",
diff --git a/pyproject.toml b/pyproject.toml
index 3eea4807..28986d28 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "maturin"
[project]
name = "polars_ds"
requires-python = ">=3.9"
-version = "0.6.3"
+version = "0.7.0"
license = { file = "LICENSE.txt" }
classifiers = [
diff --git a/python/polars_ds/__init__.py b/python/polars_ds/__init__.py
index e3f2a89f..b1b19c80 100644
--- a/python/polars_ds/__init__.py
+++ b/python/polars_ds/__init__.py
@@ -10,7 +10,7 @@
from polars_ds.expr_knn import * # noqa: F403
from polars_ds.expr_linear import * # noqa: F403
-__version__ = "0.6.3"
+__version__ = "0.7.0"
def frame(size: int = 2_000, index_name: str = "row_num") -> pl.DataFrame:
"""
diff --git a/python/polars_ds/pipeline.py b/python/polars_ds/pipeline.py
index 2267847b..9a4708cb 100644
--- a/python/polars_ds/pipeline.py
+++ b/python/polars_ds/pipeline.py
@@ -67,24 +67,27 @@ def __iter__(self):
@dataclass
class FitStep: # Not a FittedStep
func: FitTransformFunc
- cols: IntoExprColumn
+ cols: IntoExprColumn | None
exclude: List[str]
# Here we allow IntoExprColumn as input so that users can use selectors, or other polars expressions
# to specify input columns, which adds flexibility.
# We still need real column names so that the functions in transforms.py will work.
def fit(self, df: PolarsFrame) -> ExprTransform:
- if _IS_POLARS_V1:
- real_cols: List[str] = [
- x
- for x in df.lazy().select(self.cols).collect_schema().names()
- if x not in self.exclude
- ]
+ if self.cols is None:
+ return self.func(df)
else:
- real_cols: List[str] = [
- x for x in df.select(self.cols).columns if x not in self.exclude
- ]
- return self.func(df, real_cols)
+ if _IS_POLARS_V1:
+ real_cols: List[str] = [
+ x
+ for x in df.lazy().select(self.cols).collect_schema().names()
+ if x not in self.exclude
+ ]
+ else:
+ real_cols: List[str] = [
+ x for x in df.select(self.cols).columns if x not in self.exclude
+ ]
+ return self.func(df, real_cols)
Step: TypeAlias = Union[FitStep, SelectStep, WithColumnsStep, FilterStep, SQLStep]
@@ -480,7 +483,7 @@ def _get_target(self, target: str | pl.Expr | None = None) -> str | pl.Expr:
# self._steps = [deepcopy(s) for s in self._steps]
# return self
- def filter(self, *by: str | pl.Expr, all_: bool = True) -> Self:
+ def filter(self, by: str | pl.Expr) -> Self:
"""
Filters on the dataframe using native polars expressions or SQL boolean expressions.
@@ -488,14 +491,8 @@ def filter(self, *by: str | pl.Expr, all_: bool = True) -> Self:
----------
by
Native polars boolean expression or SQL strings
- all_
- Whether all conditions should be met by all or any (all = False).
"""
- exprs = [s if isinstance(s, pl.Expr) else pl.sql_expr(s) for s in by]
- if all_:
- self._steps.append(FilterStep(pl.all_horizontal(exprs)))
- else:
- self._steps.append(FilterStep(pl.any_horizontal(exprs)))
+ self._steps.append(FilterStep(by if isinstance(by, pl.Expr) else pl.sql_expr(by)))
return self
def sql_transform(self, sql: str) -> Self:
@@ -514,6 +511,13 @@ def sql_transform(self, sql: str) -> Self:
self._steps.append(SQLStep(sql))
return self
+ def cast_bools(self, to: pl.DataType = pl.UInt8) -> Self:
+ """
+ Cast all boolean columns in the dataframe to the given type.
+ """
+ self._steps.append(WithColumnsStep(cs.boolean().cast(to)))
+ return self
+
def impute(self, cols: IntoExprColumn, method: SimpleImputeMethod = "mean") -> Self:
"""
Imputes null values in the given columns. Note: this doesn't fill NaN. If filling for NaN is needed,
@@ -530,6 +534,27 @@ def impute(self, cols: IntoExprColumn, method: SimpleImputeMethod = "mean") -> S
self._steps.append(FitStep(partial(t.impute, method=method), cols, self.exclude))
return self
+ def conditional_impute(
+ self,
+ rules_dict: Dict[str, str | pl.Expr],
+ method:SimpleImputeMethod = "mean"
+ ) -> Self:
+ """
+ Conditionally imputes values in the given columns. This transform will collect if input is lazy.
+
+ Parameters
+ ----------
+ rules_dict
+ Dictionary mapping column names (must be strings) to SQL/Polars boolean conditions.
+ Where a condition is true, the value in that column will be imputed,
+ and the value to impute will be learned on the data where the condition is false.
+ method
+ One of `mean`, `median`, `mode`. With `mode`, one of the tied values is chosen
+ arbitrarily when there is a tie.
+ """
+ self._steps.append(FitStep(partial(t.conditional_impute, rules_dict=rules_dict, method=method), None, self.exclude))
+ return self
+
def nan_to_null(self) -> Self:
"""
Maps NaN values in all columns to null.
diff --git a/python/polars_ds/sample_and_split.py b/python/polars_ds/sample_and_split.py
index f06182be..795376ec 100644
--- a/python/polars_ds/sample_and_split.py
+++ b/python/polars_ds/sample_and_split.py
@@ -3,6 +3,7 @@
import polars as pl
import random
import math
+from ._utils import _IS_POLARS_V1
from .typing import PolarsFrame
from typing import List, Tuple
from itertools import combinations, islice
@@ -153,19 +154,19 @@ def downsample(
def random_cols(
- df: PolarsFrame,
+ all_columns:List[str],
k: int,
keep: List[str] | None = None,
seed: int | None = None,
) -> List[str]:
"""
- Selects random columns in the dataframe. Returns the selected columns in a list. Note, it is
- impossible for this to randomly select both ["x", "y"] and ["y", "x"].
+ Selects random columns from the given pool of columns. Returns the selected columns in a list.
+ Note: columns are drawn as combinations of the pool, so ["x", "y"] and ["y", "x"] can never both be selected.
Parameters
----------
- df
- Either a lazy or eager Polars dataframe
+ all_columns
+ All column names to choose from. Columns listed in `keep` are expected to be among them.
k
Select k random columns from all columns outside of `keep`.
keep
@@ -175,12 +176,12 @@ def random_cols(
"""
if keep is None:
out = []
- to_sample = combinations(df.columns, k)
+ to_sample = combinations(all_columns, k)
else:
out = keep
- to_sample = combinations((c for c in df.columns if c not in keep), k)
+ to_sample = combinations((c for c in all_columns if c not in keep), k)
- pool_size = len(df.columns) - len(out)
+ pool_size = len(all_columns) - len(out)
if pool_size < k:
raise ValueError("Not enough columns to select from.")
@@ -229,26 +230,29 @@ def split_by_ratio(
train = frames[(True,)].select(pl.col("*").exclude(["__id", "__tt"]))
test = frames[(False,)].select(pl.col("*").exclude(["__id", "__tt"]))
return [train, test]
- else:
- if sum(split_ratio) != 1:
- raise ValueError("Sum of the ratios is not 1.")
-
- df_eager = (
- df.with_row_index(name="__id")
- .with_columns(pl.col("__id").shuffle(seed=seed).alias("__tt"))
- .sort("__tt")
- .lazy()
- .collect()
- )
+ else: # Should work with iterable (with a length), not just list
+ if len(split_ratio) == 1:
+ return split_by_ratio(df, split_ratio[0], seed)
+ else:
+ if sum(split_ratio) != 1:
+ raise ValueError("Sum of the ratios is not 1.")
- n = len(df_eager)
- start = 0
- dfs = []
- for v in split_ratio:
- length = int(n * v)
- dfs.append(
- df_eager.slice(start, length=length).select(pl.col("*").exclude(["__id", "__tt"]))
+ df_eager = (
+ df.with_row_index(name="__id")
+ .with_columns(pl.col("__id").shuffle(seed=seed).alias("__tt"))
+ .sort("__tt")
+ .lazy()
+ .collect()
)
- start += length
- return dfs
+ n = len(df_eager)
+ start = 0
+ dfs = []
+ for v in split_ratio:
+ length = int(n * v)
+ dfs.append(
+ df_eager.slice(start, length=length).select(pl.col("*").exclude(["__id", "__tt"]))
+ )
+ start += length
+
+ return dfs
diff --git a/python/polars_ds/transforms.py b/python/polars_ds/transforms.py
index ed8a6ea8..fa6e8647 100644
--- a/python/polars_ds/transforms.py
+++ b/python/polars_ds/transforms.py
@@ -46,8 +46,47 @@ def impute(df: PolarsFrame, cols: List[str], method: SimpleImputeMethod = "mean"
temp = df.lazy().select(pl.col(cols).mode().list.first()).collect().row(0)
return [pl.col(c).fill_null(m) for c, m in zip(cols, temp)]
else:
- raise ValueError(f"Unknown input method: {method}")
+ raise ValueError(f"Unknown impute method: `{method}`")
+
+def conditional_impute(
+ df: PolarsFrame,
+ rules_dict: Dict[str, str | pl.Expr],
+ method: SimpleImputeMethod = "mean"
+) -> ExprTransform:
+ """
+ Conditionally imputes values in the given columns. This transform will collect if input is lazy.
+ Parameters
+ ----------
+ df
+ Either a lazy or an eager dataframe
+ rules_dict
+ Dictionary mapping column names (must be strings) to SQL/Polars boolean conditions.
+ Where a condition is true, the value in that column will be imputed,
+ and the value to impute will be learned on the data where the condition is false.
+ method
+ One of `mean`, `median`, `mode`. With `mode`, one of the tied values is chosen
+ arbitrarily when there is a tie.
+ """
+ rules_dict = {
+ c: (r if isinstance(r, pl.Expr) else pl.sql_expr(r))
+ for c, r in rules_dict.items()
+ }
+ cols = list(rules_dict.keys())
+ # Learn on the data where the condition is false
+ if method == "mean":
+ temp = df.lazy().select(
+ *(pl.col(c).filter(rules_dict[c].not_()).mean() for c in rules_dict.keys())
+ ).collect().row(0)
+ return [pl.when(rules_dict[c]).then(m).otherwise(pl.col(c)).alias(c) for c, m in zip(cols, temp)]
+ elif method == "median":
+ temp = df.lazy().select(*(pl.col(c).filter(rules_dict[c].not_()).median() for c in rules_dict.keys())).collect().row(0)
+ return [pl.when(rules_dict[c]).then(m).otherwise(pl.col(c)).alias(c) for c, m in zip(cols, temp)]
+ elif method == "mode":
+ temp = df.lazy().select(*(pl.col(c).filter(rules_dict[c].not_()).mode().list.first() for c in rules_dict.keys())).collect().row(0)
+ return [pl.when(rules_dict[c]).then(m).otherwise(pl.col(c)).alias(c) for c, m in zip(cols, temp)]
+ else:
+ raise ValueError(f"Unknown impute method: `{method}`")
def linear_impute(
df: PolarsFrame, features: List[str], target: str | pl.Expr, add_bias: bool = False
diff --git a/tests/test_transforms.py b/tests/test_transforms.py
index 449ccf2a..bc293424 100644
--- a/tests/test_transforms.py
+++ b/tests/test_transforms.py
@@ -22,6 +22,31 @@ def test_linear_impute():
assert_frame_equal(imputed_c, correct_c)
+def test_conditional_impute():
+ df = pl.DataFrame({
+ "a": [float('nan'), None, float("inf"), 9999, 100, 100, 100, 800],
+ })
+
+ res = df.with_columns(
+ t.conditional_impute(
+ df,
+ {"a": ((pl.col("a").is_finite().not_()) | pl.col("a").is_null() | (pl.col("a") > 899))},
+ method = "mean"
+ )[0].alias("result")
+ )["result"]
+
+ assert list(res)[:4] == [275.0, 275.0, 275.0, 275.0]
+
+ res = df.with_columns(
+ t.conditional_impute(
+ df,
+ {"a": ((pl.col("a").is_finite().not_()) | pl.col("a").is_null() | (pl.col("a") > 899))},
+ method = "median"
+ )[0].alias("result")
+ )["result"]
+
+ assert list(res)[:4] == [100.0, 100.0, 100.0, 100.0]
+
def test_winsorize():
df = pds.frame(size=1000).select(