diff --git a/README.md b/README.md index 58dadfd..ed9a70e 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,8 @@ You may need to run these additional steps ## Project Structure * `gx-*/` - * `demos/`: full, working versions of demos as notebooks - * `scripts/`: full, working python scripts + * `demos/notebooks`: full, working versions of demos as notebooks + * `demos/scripts`: full, working python scripts * `requirements.txt`: requirements files for the specific version * `scripts/`: setup scripts diff --git a/gx-1.0.0a2/demos/01-authoring_expectation_suites.ipynb b/gx-1.0.0a2/demos/notebooks/01-authoring_expectation_suites.ipynb similarity index 100% rename from gx-1.0.0a2/demos/01-authoring_expectation_suites.ipynb rename to gx-1.0.0a2/demos/notebooks/01-authoring_expectation_suites.ipynb diff --git a/gx-1.0.0a2/demos/02-validation-definitions-and-checkpoints.ipynb b/gx-1.0.0a2/demos/notebooks/02-validation-definitions-and-checkpoints.ipynb similarity index 100% rename from gx-1.0.0a2/demos/02-validation-definitions-and-checkpoints.ipynb rename to gx-1.0.0a2/demos/notebooks/02-validation-definitions-and-checkpoints.ipynb diff --git a/gx-1.0.0a2/demos/03-sql_month_and_year.ipynb b/gx-1.0.0a2/demos/notebooks/03-sql_month_and_year.ipynb similarity index 100% rename from gx-1.0.0a2/demos/03-sql_month_and_year.ipynb rename to gx-1.0.0a2/demos/notebooks/03-sql_month_and_year.ipynb diff --git a/gx-1.0.0a4/README.md b/gx-1.0.0a4/README.md index f811dd1..018ab71 100644 --- a/gx-1.0.0a4/README.md +++ b/gx-1.0.0a4/README.md @@ -4,12 +4,14 @@ Demos of expectation authoring and validation workflows for great-expectations 1 These demos use python 3.10 with [1.0.0a4](https://pypi.org/project/great-expectations/1.0.0a4/). -## Notes about these scripts -The scripts in this directory will run against 1.0.0a4, and include TODOs on future changes planning for subsequent prereleases of 1.0.0. 
+## Notes about these demos + +* The scripts in this directory will run against 1.0.0a4, and include TODOs on future changes planned for subsequent prereleases of 1.0.0. +* To run the `demos/scripts/*.py` scripts, one must have valid AWS credentials for free access to the [New York City Taxi Data Set](https://registry.opendata.aws/nyc-tlc-trip-records-pds/) since it is served from S3. See the linked data set description for more information. ## Getting started 1. Create a virtual environment: `python -m venv .venv` -1. Source the virtual environment: `source .venv/bin/activate` -1. Install requirements: `pip install -r requirements.txt` -1. Start the postgres container: `../scripts/run_dockerized_pg.sh` -1. Run the notebooks in `demos/` +2. Source the virtual environment: `source .venv/bin/activate` +3. Install requirements: `pip install -r requirements.txt` +4. Start the postgres container: `../scripts/run_dockerized_pg.sh` +5. Run the notebooks and scripts in `demos/` diff --git a/gx-1.0.0a4/demos/01-authoring_expectation_suites.ipynb b/gx-1.0.0a4/demos/notebooks/01-authoring_expectation_suites.ipynb similarity index 100% rename from gx-1.0.0a4/demos/01-authoring_expectation_suites.ipynb rename to gx-1.0.0a4/demos/notebooks/01-authoring_expectation_suites.ipynb diff --git a/gx-1.0.0a4/demos/02-validation-definitions-and-checkpoints.ipynb b/gx-1.0.0a4/demos/notebooks/02-validation-definitions-and-checkpoints.ipynb similarity index 100% rename from gx-1.0.0a4/demos/02-validation-definitions-and-checkpoints.ipynb rename to gx-1.0.0a4/demos/notebooks/02-validation-definitions-and-checkpoints.ipynb diff --git a/gx-1.0.0a4/demos/03-sql_month_and_year.ipynb b/gx-1.0.0a4/demos/notebooks/03-sql_month_and_year.ipynb similarity index 100% rename from gx-1.0.0a4/demos/03-sql_month_and_year.ipynb rename to gx-1.0.0a4/demos/notebooks/03-sql_month_and_year.ipynb diff --git a/gx-1.0.0a4/demos/constants.py b/gx-1.0.0a4/demos/notebooks/constants.py similarity index 100% rename 
from gx-1.0.0a4/demos/constants.py rename to gx-1.0.0a4/demos/notebooks/constants.py diff --git a/gx-1.0.0a4/demos/scripts/01-create-expectations-interactively.py b/gx-1.0.0a4/demos/scripts/01-create-expectations-interactively.py new file mode 100644 index 0000000..0806632 --- /dev/null +++ b/gx-1.0.0a4/demos/scripts/01-create-expectations-interactively.py @@ -0,0 +1,37 @@ +# TODO: will become from great_expectations import get_context, ExpectationSuite +import great_expectations.expectations as gxe +from great_expectations import get_context +from great_expectations.core import ExpectationSuite +from great_expectations.exceptions import DataContextError + +context = get_context(project_root_dir="./") + +try: + suite = context.suites.get("project_name") +# TODO: error will change to ResourceNotFoundError +except DataContextError: + # TODO: will change to: + # suite = context.suites.add(name="project_name") + suite = context.suites.add(ExpectationSuite(name="project_name")) + +batch = context.data_sources.pandas_default.read_parquet( + "s3://nyc-tlc/trip data/yellow_tripdata_2019-01.parquet" +) + +# TODO: column_index will not be required +expectation = gxe.ExpectColumnToExist(column="VendorID", column_index=None) +result = batch.validate(expectation) +print(result) +suite.add_expectation(expectation) + +expectation = gxe.ExpectColumnValuesToMatchRegex(column="VendorID", regex="^[123456]$") +result = batch.validate(expectation) +print(result) +suite.add_expectation(expectation) + +expectation = gxe.ExpectColumnValuesToBeUnique(column="VendorID") +result = batch.validate(expectation) +print(result) +suite.add_expectation(expectation) + +print(suite) diff --git a/gx-1.0.0a4/demos/scripts/02-connect-to-data.py b/gx-1.0.0a4/demos/scripts/02-connect-to-data.py new file mode 100644 index 0000000..4a91d51 --- /dev/null +++ b/gx-1.0.0a4/demos/scripts/02-connect-to-data.py @@ -0,0 +1,37 @@ +import great_expectations.expectations as gxe +from great_expectations import 
get_context + +context = get_context(project_root_dir="./") + +try: + data_source = context.datasources["project_name"] + # TODO: this will be updated to become + # data_source = context.data_sources.get("project_name") +# TODO: instead of KeyError will be ResourceNotFoundError +except KeyError: + data_source = context.data_sources.add_pandas_s3( + name="project_name", + bucket="nyc-tlc", + ) + +try: + asset = data_source.get_asset("my_project") +# TODO: instead of LookupError will be ResourceNotFoundError +except LookupError: + asset = data_source.add_parquet_asset("my_project", s3_prefix="trip data/") + +try: + batch_definition = asset.get_batch_definition("monthly") +except KeyError: + import re + + pattern = re.compile( + r"yellow_tripdata_(?P<year>[0-9]{4})-(?P<month>[0-9]{2}).parquet" + ) + batch_definition = asset.add_batch_definition_monthly("monthly", regex=pattern) + + +# To verify that things worked... +batch = batch_definition.get_batch(batch_parameters={"year": "2020", "month": "04"}) + +print(batch.validate(gxe.ExpectColumnToExist(column="VendorID"))) diff --git a/gx-1.0.0a4/demos/scripts/03-create-expectations-programatically.py b/gx-1.0.0a4/demos/scripts/03-create-expectations-programatically.py new file mode 100644 index 0000000..dc4a720 --- /dev/null +++ b/gx-1.0.0a4/demos/scripts/03-create-expectations-programatically.py @@ -0,0 +1,30 @@ +import great_expectations.expectations as gxe +from great_expectations import get_context + +# TODO: will become from great_expectations import get_context, ExpectationSuite +from great_expectations.core import ExpectationSuite +from great_expectations.exceptions import DataContextError + +context = get_context(project_root_dir="./") + +# Create Expectation Suite +try: + suite = context.suites.get("project_name") +# TODO: instead of DataContextError will be ResourceNotFoundError +except DataContextError: + suite = context.suites.add(ExpectationSuite(name="project_name")) + +suite.expectations = [ + 
gxe.ExpectColumnToExist(column="VendorID", column_index=None), + gxe.ExpectColumnDistinctValuesToBeInSet( + column="VendorID", value_set=[1, 2, 3, 4, 5, 6] + ), + gxe.ExpectColumnValuesToBeBetween( + column="passenger_count", min_value=1, max_value=5, mostly=0.95 + ), + gxe.ExpectColumnValuesToBeBetween( + column="passenger_count", min_value=0, max_value=10 + ), +] + +suite.save() diff --git a/gx-1.0.0a4/demos/scripts/04-create-validation-definition.py b/gx-1.0.0a4/demos/scripts/04-create-validation-definition.py new file mode 100644 index 0000000..41d481c --- /dev/null +++ b/gx-1.0.0a4/demos/scripts/04-create-validation-definition.py @@ -0,0 +1,40 @@ +from great_expectations import get_context + +# TODO: will become from great_expectations import ValidationDefinition +from great_expectations.core import ValidationDefinition +from great_expectations.exceptions import DataContextError + +context = get_context(project_root_dir="./") + +# TODO: will become +# batch_definition = context.data_sources.get("project_name").get_asset("my_project").get_batch_definition("monthly") +batch_definition = ( + context.datasources["project_name"] + .get_asset("my_project") + .get_batch_definition("monthly") +) +suite = context.suites.get("project_name") + +try: + validation_definition = context.validation_definitions.get("my_project") +# TODO: will become except ResourceNotFoundError: +except DataContextError: + # TODO: will become + # validation_definition = context.validation_definitions.add( + # name="my_project", + # data=batch_definition, + # suite=suite + # ) + validation_definition = context.validation_definitions.add( + ValidationDefinition(name="my_project", data=batch_definition, suite=suite) + ) + +### To run this in your project it is critical to provide batch_parameters +result = context.validation_definitions.get("my_project").run( + batch_parameters={"year": "2020", "month": "04"} +) + +# TODO: This should only run on the latest batch, or it should fail entirely with 
an error that +# batch parameters are missing +# result = context.validation_definitions.get("my_project").run() +print(result) diff --git a/gx-1.0.0a4/demos/scripts/05-run-validation-definition.py b/gx-1.0.0a4/demos/scripts/05-run-validation-definition.py new file mode 100644 index 0000000..6e84570 --- /dev/null +++ b/gx-1.0.0a4/demos/scripts/05-run-validation-definition.py @@ -0,0 +1,8 @@ +from great_expectations import get_context + +context = get_context(project_root_dir="./") +# NOTE: It is critical to pass the batch_parameters to the run method, otherwise the validation will stall +# by trying to read all the data. We will have a fix in place before the final release. +# TODO: Implement fix for above issue +validation_definition = context.validation_definitions.get("my_project") +result = validation_definition.run(batch_parameters={"year": "2020", "month": "04"}) diff --git a/gx-1.0.0a4/demos/scripts/06-create-checkpoint.py b/gx-1.0.0a4/demos/scripts/06-create-checkpoint.py new file mode 100644 index 0000000..cc4ee7a --- /dev/null +++ b/gx-1.0.0a4/demos/scripts/06-create-checkpoint.py @@ -0,0 +1,32 @@ +from great_expectations import get_context + +# TODO: will become from great_expectations import Checkpoint +# TODO: will become from great_expectations.actions import SlackNotificationAction +from great_expectations.checkpoint import Checkpoint, SlackNotificationAction + +# TODO: will become from great_expectations.exceptions import ResourceNotFoundError +from great_expectations.exceptions import DataContextError + +context = get_context(project_root_dir="./") + +try: + checkpoint = context.checkpoints.get("project_integration_checkpoint") +# TODO: Will become ResourceNotFoundError +except DataContextError: + checkpoint = context.checkpoints.add( + Checkpoint( + name="project_integration_checkpoint", + validation_definitions=[context.validation_definitions.get("my_project")], + actions=[ + SlackNotificationAction( + name="slack_notification", + # TODO: config 
variable substitution not working + slack_token="${SLACK_NOTIFICATION_TOKEN}", + slack_channel="#alerts-timber-test", + ), + ], + ) + ) + +result = checkpoint.run(batch_parameters={"year": "2020", "month": "04"}) +print(result) diff --git a/gx-1.0.0a4/demos/scripts/07-run-checkpoint.py b/gx-1.0.0a4/demos/scripts/07-run-checkpoint.py new file mode 100644 index 0000000..a2e87f3 --- /dev/null +++ b/gx-1.0.0a4/demos/scripts/07-run-checkpoint.py @@ -0,0 +1,6 @@ +from great_expectations import get_context + +context = get_context(project_root_dir="./") +checkpoint = context.checkpoints.get("project_integration_checkpoint") +result = checkpoint.run(batch_parameters={"year": "2020", "month": "04"}) +print(result) diff --git a/gx-1.0.0a4/requirements.txt b/gx-1.0.0a4/requirements.txt index a293297..70d54bf 100644 --- a/gx-1.0.0a4/requirements.txt +++ b/gx-1.0.0a4/requirements.txt @@ -1,4 +1,9 @@ +boto3 +fastparquet great-expectations==1.0.0a4 jupyter +pandas psycopg2 +pyarrow +s3fs sqlalchemy