Skip to content

Commit

Permalink
Merge pull request abstractqqq#107 from abstractqqq/add_profile
Browse files Browse the repository at this point in the history
  • Loading branch information
abstractqqq authored Mar 22, 2024
2 parents eeefcb0 + 5207806 commit 1e87168
Show file tree
Hide file tree
Showing 5 changed files with 744 additions and 6 deletions.
Binary file added examples/dependency.parquet
Binary file not shown.
680 changes: 680 additions & 0 deletions examples/diagonsis.ipynb

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,16 @@ authors = [
]
dependencies = [
"polars >= 0.20.6, !=0.20.12",
"hvplot >= 0.9.1",
"graphviz >= 0.20"
]

keywords = ["polars-extension", "scientific-computing", "data-science"]

[project.optional-dependencies]
plot = [
"great-tables >= 0.4",
"graphviz >= 0.20"
]

[tool.maturin]
python-source = "python"
features = ["pyo3/extension-module"]
Expand Down
4 changes: 2 additions & 2 deletions python/polars_ds/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,13 @@ def random_data(
"""
if null_pct is None:
rand_cols = (
pl.col("row_num").stats.sample_uniform(low=0.0, high=1.0).alias(f"feature_{i+1}")
pl.col("row_num").stats.rand_uniform(low=0.0, high=1.0).alias(f"feature_{i+1}")
for i in range(n_cols)
)
else:
rand_cols = (
pl.col("row_num")
.stats.sample_uniform(low=0.0, high=1.0)
.stats.rand_uniform(low=0.0, high=1.0)
.stats.rand_null(null_pct)
.alias(f"feature_{i+1}")
for i in range(n_cols)
Expand Down
58 changes: 56 additions & 2 deletions python/polars_ds/diagnosis.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,20 @@
from .num import NumExt # noqa: F401
from itertools import combinations
import graphviz
from great_tables import GT

logger = logging.getLogger(__name__)


# Name to be decided
class Detector:
# DIA = Data Inspection Assistant / DIAgnosis
class DIA:

"""
Data Inspection Assistant.
If you cannot import this module, please try: pip install "polars_ds[plot]"
"""

def __init__(self, df: Union[pl.DataFrame, pl.LazyFrame]):
self._frame: pl.LazyFrame = df.lazy()
self.numerics: List[str] = df.select(cs.numeric()).columns
Expand All @@ -23,6 +31,52 @@ def __init__(self, df: Union[pl.DataFrame, pl.LazyFrame]):
self.simple_types: List[str] = self.numerics + self.strs + self.bools + self.cats
self.other_types: List[str] = [c for c in self._frame.columns if c not in self.simple_types]

def numeric_profile(self, n_bins: int = 20) -> "GT":
    """
    Creates a numerical profile with a histogram plot for every numeric column.

    Each numeric column becomes one row holding its null percentage, mean, median,
    standard deviation, min, max and a bar nanoplot of binned value counts. Every
    column is min-max scaled before binning, so the histograms of different columns
    may have completely different scales on the x-axis.

    Parameters
    ----------
    n_bins
        Bins in the histogram. Must be at least 1.

    Returns
    -------
    GT
        A great_tables table with one row per numeric column.

    Raises
    ------
    ValueError
        If `n_bins` is less than 1.
    """
    # Fail early with a clear message; otherwise an empty `cuts` list would
    # surface as an opaque IndexError a few lines below.
    if n_bins < 1:
        raise ValueError(f"`n_bins` must be a positive integer, got {n_bins}.")

    # Break points on the min-max scaled axis: 0, 1/n_bins, ..., (n_bins-1)/n_bins.
    cuts = [i / n_bins for i in range(n_bins)]
    # Nudge the outermost breaks slightly outwards.
    # NOTE(review): presumably a guard against floating point error at the edges - confirm.
    cuts[0] -= 1e-5
    cuts[-1] += 1e-5

    frames = []
    for c in self.numerics:
        col = pl.col(c)
        # Min-max scale the column so the same break points work for every column.
        scaled = (col - col.min()) / (col.max() - col.min())
        # NOTE(review): value_counts presumably only reports breaks that actually
        # occur, so empty bins may be missing from the bar plot - confirm.
        histogram = pl.struct(
            scaled.filter(col.is_finite())
            .cut(breaks=cuts, left_closed=True, include_breaks=True)
            .struct.field("brk")
            .value_counts()
            .sort()
            .struct.field("count")
            .implode()
        ).alias("histogram")

        frames.append(
            self._frame.select(
                pl.lit(c).alias("column"),
                (col.null_count() / pl.len()).round(2).alias("null%"),
                col.mean().round(2).alias("mean"),
                col.median().cast(pl.Float64).round(2).alias("median"),
                col.std().round(2).alias("std"),
                histogram,
                col.min().cast(pl.Float64).round(2).alias("min"),
                col.max().cast(pl.Float64).round(2).alias("max"),
            )
        )

    # Evaluate all per-column lazy queries together, then stack them into one table.
    df_final = pl.concat(pl.collect_all(frames))
    return (
        GT(df_final, rowname_col="column")
        .tab_stubhead("column")
        .fmt_percent("null%")
        .fmt_nanoplot(columns="histogram", plot_type="bar")
    )

@lru_cache
def infer_high_null(self, threshold: float = 0.75) -> List[str]:
"""
Expand Down

0 comments on commit 1e87168

Please sign in to comment.