Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature generate html #231

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
aebbb06
wip
ronanstokes-db Mar 26, 2023
5f0ffc0
merge from origin
ronanstokes-db Mar 27, 2023
1eda552
wip
ronanstokes-db Apr 7, 2023
7de014c
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 7, 2023
c859475
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 9, 2023
3094e96
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 13, 2023
3bf6e9b
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 17, 2023
caaff18
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 18, 2023
87d5c50
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 18, 2023
4536794
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 19, 2023
eba6193
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 21, 2023
c4fdc3b
wip
ronanstokes-db May 9, 2023
8734b19
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db May 30, 2023
f063235
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Jun 28, 2023
b9fb552
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Jul 1, 2023
3eb15f4
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Jul 11, 2023
c85f915
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Jul 13, 2023
e53f8fe
changes for release
ronanstokes-db Jul 13, 2023
4259cac
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Jul 21, 2023
23760ba
wip
ronanstokes-db Aug 28, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 36 additions & 16 deletions dbldatagen/data_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
"""
This module defines the ``DataAnalyzer`` class.

This code is experimental; both the APIs and the generated code are liable to change in future versions.
.. warning::
Experimental
This code is experimental; both the APIs and the generated code are liable to change in future versions.

"""
from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
TimestampType, DateType, DecimalType, ByteType, BinaryType, StructType, ArrayType, DataType
Expand All @@ -14,6 +17,7 @@
import pyspark.sql.functions as F

from .utils import strip_margins
from .html_utils import HtmlUtils
from .spark_singleton import SparkSingleton


Expand Down Expand Up @@ -58,6 +62,11 @@
self._sparkSession = sparkSession
self._dataSummary = None

@property
def sourceDf(self):
    """Source dataframe associated with this instance.

    :return: the object stored in ``self._df``
    """
    source_df = self._df
    return source_df

def _displayRow(self, row):
"""Display details for row"""
results = []
Expand Down Expand Up @@ -116,42 +125,42 @@
'schema',
summaryExpr=f"""to_json(named_struct('column_count', {len(dtypes)}))""",
fieldExprs=[f"'{dtype[1]}' as {dtype[0]}" for dtype in dtypes],
dfData=self._df)
dfData=self.sourceDf)

# count
dfDataSummary = self._addMeasureToSummary(
'count',
summaryExpr=f"{total_count}",
fieldExprs=[f"string(count({dtype[0]})) as {dtype[0]}" for dtype in dtypes],
dfData=self._df,
dfData=self.sourceDf,
dfSummary=dfDataSummary)

dfDataSummary = self._addMeasureToSummary(
'null_probability',
fieldExprs=[f"""string( round( ({total_count} - count({dtype[0]})) /{total_count}, 2)) as {dtype[0]}"""
for dtype in dtypes],
dfData=self._df,
dfData=self.sourceDf,
dfSummary=dfDataSummary)

# distinct count
dfDataSummary = self._addMeasureToSummary(
'distinct_count',
summaryExpr="count(distinct *)",
fieldExprs=[f"string(count(distinct {dtype[0]})) as {dtype[0]}" for dtype in dtypes],
dfData=self._df,
dfData=self.sourceDf,
dfSummary=dfDataSummary)

# min
dfDataSummary = self._addMeasureToSummary(
'min',
fieldExprs=[f"string(min({dtype[0]})) as {dtype[0]}" for dtype in dtypes],
dfData=self._df,
dfData=self.sourceDf,
dfSummary=dfDataSummary)

dfDataSummary = self._addMeasureToSummary(
'max',
fieldExprs=[f"string(max({dtype[0]})) as {dtype[0]}" for dtype in dtypes],
dfData=self._df,
dfData=self.sourceDf,
dfSummary=dfDataSummary)

descriptionDf = self._df.describe().where("summary in ('mean', 'stddev')")
Expand All @@ -169,20 +178,20 @@
dfDataSummary = self._addMeasureToSummary(
measure,
fieldExprs=[f"'{values[dtype[0]]}'" for dtype in dtypes],
dfData=self._df,
dfData=self.sourceDf,
dfSummary=dfDataSummary)

# string characteristics for strings and string representation of other values
dfDataSummary = self._addMeasureToSummary(
'print_len_min',
fieldExprs=[f"min(length(string({dtype[0]}))) as {dtype[0]}" for dtype in dtypes],
dfData=self._df,
dfData=self.sourceDf,
dfSummary=dfDataSummary)

dfDataSummary = self._addMeasureToSummary(
'print_len_max',
fieldExprs=[f"max(length(string({dtype[0]}))) as {dtype[0]}" for dtype in dtypes],
dfData=self._df,
dfData=self.sourceDf,
dfSummary=dfDataSummary)

return dfDataSummary
Expand Down Expand Up @@ -359,7 +368,7 @@
return "\n".join(stmts)

@classmethod
def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None):
def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None, asHtml=False):
"""
Generate outline data generator code from an existing schema

Expand All @@ -375,14 +384,20 @@
:param schema: PySpark schema - i.e. a manually constructed StructType or the return value from `dataframe.schema`
:param suppressOutput: Suppress printing of generated code if True
:param name: Optional name for data generator
:return: String containing skeleton code
:param asHtml: If True, generates HTML suitable for the notebook ``displayHTML`` function. If True, also suppresses output
:return: String containing skeleton code (in HTML form if `asHtml` is True)

"""
return cls._scriptDataGeneratorCode(schema,
generated_code = cls._scriptDataGeneratorCode(schema,
suppressOutput=suppressOutput,
name=name)

def scriptDataGeneratorFromData(self, suppressOutput=False, name=None):
if asHtml:
generated_code = HtmlUtils.formatCodeAsHtml(generated_code)

Check warning on line 396 in dbldatagen/data_analyzer.py

View check run for this annotation

Codecov / codecov/patch

dbldatagen/data_analyzer.py#L396

Added line #L396 was not covered by tests

return generated_code

def scriptDataGeneratorFromData(self, suppressOutput=False, name=None, asHtml=False):
"""
Generate outline data generator code from an existing dataframe

Expand Down Expand Up @@ -411,8 +426,13 @@
row_key_pairs = row.asDict()
self._dataSummary[row['measure_']] = row_key_pairs

return self._scriptDataGeneratorCode(self._df.schema,
generated_code = self._scriptDataGeneratorCode(self._df.schema,
suppressOutput=suppressOutput,
name=name,
dataSummary=self._dataSummary,
sourceDf=self._df)
sourceDf=self.sourceDf)

if asHtml:
generated_code = HtmlUtils.formatCodeAsHtml(generated_code)

Check warning on line 436 in dbldatagen/data_analyzer.py

View check run for this annotation

Codecov / codecov/patch

dbldatagen/data_analyzer.py#L436

Added line #L436 was not covered by tests

return generated_code
Loading