Skip to content

Commit

Permalink
Merge pull request #19 from great-expectations/add-demo-scripts
Browse files Browse the repository at this point in the history
Add demo scripts for gx-1.0.0a4
  • Loading branch information
billdirks authored Jun 24, 2024
2 parents 8b27c89 + b1804cb commit 3f1a2b9
Show file tree
Hide file tree
Showing 17 changed files with 205 additions and 8 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ You may need to run these additional steps
## Project Structure

* `gx-*/`
* `demos/`: full, working versions of demos as notebooks
* `scripts/`: full, working python scripts
* `demos/notebooks`: full, working versions of demos as notebooks
* `demos/scripts`: full, working python scripts
* `requirements.txt`: requirements files for the specific version
* `scripts/`: setup scripts

Expand Down
14 changes: 8 additions & 6 deletions gx-1.0.0a4/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ Demos of expectation authoring and validation workflows for great-expectations 1

These demos use python 3.10 with [1.0.0a4](https://pypi.org/project/great-expectations/1.0.0a4/).

## Notes about these scripts
The scripts in this directory will run against 1.0.0a4, and include TODOs on future changes planning for subsequent prereleases of 1.0.0.
## Notes about these demos

* The scripts in this directory will run against 1.0.0a4, and include TODOs on future changes planned for subsequent prereleases of 1.0.0.
* To run the `demos/scripts/*.py` scripts, one must have valid AWS credentials for free access to the [New York City Taxi Data Set](https://registry.opendata.aws/nyc-tlc-trip-records-pds/) since it is served from S3. See the linked data set description for more information.

## Getting started
1. Create a virtual environment: `python -m venv .venv`
1. Source the virtual environment: `source .venv/bin/activate`
1. Install requirements: `pip install -r requirements.txt`
1. Start the postgres container: `../scripts/run_dockerized_pg.sh`
1. Run the notebooks in `demos/`
2. Source the virtual environment: `source .venv/bin/activate`
3. Install requirements: `pip install -r requirements.txt`
4. Start the postgres container: `../scripts/run_dockerized_pg.sh`
5. Run the notebooks and scripts in `demos/`
File renamed without changes.
37 changes: 37 additions & 0 deletions gx-1.0.0a4/demos/scripts/01-create-expectations-interactively.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# TODO: will become from great_expectations import get_context, ExpectationSuite
import great_expectations.expectations as gxe
from great_expectations import get_context
from great_expectations.core import ExpectationSuite
from great_expectations.exceptions import DataContextError

# Demo: author expectations one at a time, checking each against a real batch
# of data before recording it in the suite.
context = get_context(project_root_dir="./")


def validate_and_add(batch, suite, expectation):
    """Validate `expectation` on `batch`, print the result, then add it to `suite`."""
    print(batch.validate(expectation))
    suite.add_expectation(expectation)


# Reuse the suite if it already exists in this project; otherwise create it.
try:
    suite = context.suites.get("project_name")
except DataContextError:
    # TODO: error will change to ResourceNotFoundError
    # TODO: will change to:
    # suite = context.suites.add(name="project_name")
    suite = context.suites.add(ExpectationSuite(name="project_name"))

# Read a single month of taxi data through the default pandas data source.
batch = context.data_sources.pandas_default.read_parquet(
    "s3://nyc-tlc/trip data/yellow_tripdata_2019-01.parquet"
)

# TODO: column_index will not be required
validate_and_add(
    batch,
    suite,
    gxe.ExpectColumnToExist(column="VendorID", column_index=None),
)
validate_and_add(
    batch,
    suite,
    gxe.ExpectColumnValuesToMatchRegex(column="VendorID", regex="^[123456]$"),
)
validate_and_add(
    batch,
    suite,
    gxe.ExpectColumnValuesToBeUnique(column="VendorID"),
)

print(suite)
37 changes: 37 additions & 0 deletions gx-1.0.0a4/demos/scripts/02-connect-to-data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import great_expectations.expectations as gxe
from great_expectations import get_context

# Demo: register (or reuse) a data source, a parquet asset and a monthly batch
# definition for the NYC taxi data, then validate one batch as a smoke test.
context = get_context(project_root_dir="./")

# Data source: reuse the one stored in the project, or add a pandas S3 source.
try:
    # TODO: this will be updated to become
    # data_source = context.data_sources.get("project_name")
    data_source = context.datasources["project_name"]
# TODO: instead of KeyError will be ResourceNotFoundError
except KeyError:
    data_source = context.data_sources.add_pandas_s3(
        name="project_name",
        bucket="nyc-tlc",
    )

# Asset: reuse it, or add a parquet asset rooted at the "trip data/" prefix.
try:
    asset = data_source.get_asset("my_project")
# TODO: instead of LookupError will be ResourceNotFoundError
except LookupError:
    asset = data_source.add_parquet_asset("my_project", s3_prefix="trip data/")

# Batch definition: reuse it, or add one that splits the files by year/month.
try:
    batch_definition = asset.get_batch_definition("monthly")
except KeyError:
    import re

    # The named groups supply the "year" and "month" batch parameters. The dot
    # before the extension is escaped so it only matches a literal ".".
    pattern = re.compile(
        r"yellow_tripdata_(?P<year>[0-9]{4})-(?P<month>[0-9]{2})\.parquet"
    )
    batch_definition = asset.add_batch_definition_monthly("monthly", regex=pattern)


# To verify that things worked...
batch = batch_definition.get_batch(batch_parameters={"year": "2020", "month": "04"})

# TODO: column_index will not be required
print(batch.validate(gxe.ExpectColumnToExist(column="VendorID", column_index=None)))
30 changes: 30 additions & 0 deletions gx-1.0.0a4/demos/scripts/03-create-expectations-programatically.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import great_expectations.expectations as gxe
from great_expectations import get_context

# TODO: will become from great_expectations import get_context, ExpectationSuite
from great_expectations.core import ExpectationSuite
from great_expectations.exceptions import DataContextError

# Demo: define a whole expectation suite in code and persist it.
context = get_context(project_root_dir="./")

# Create Expectation Suite (or load it if this project already has one).
try:
    suite = context.suites.get("project_name")
except DataContextError:
    # TODO: instead of DataContextError will be ResourceNotFoundError
    suite = context.suites.add(ExpectationSuite(name="project_name"))

# Each expectation is named individually so its intent is easy to read.
vendor_id_exists = gxe.ExpectColumnToExist(column="VendorID", column_index=None)
vendor_id_in_known_set = gxe.ExpectColumnDistinctValuesToBeInSet(
    column="VendorID", value_set=[1, 2, 3, 4, 5, 6]
)
# At least 95% of rows must have between 1 and 5 passengers...
passenger_count_typical = gxe.ExpectColumnValuesToBeBetween(
    column="passenger_count", min_value=1, max_value=5, mostly=0.95
)
# ...and every row must have between 0 and 10.
passenger_count_bounds = gxe.ExpectColumnValuesToBeBetween(
    column="passenger_count", min_value=0, max_value=10
)

# Replace the suite's expectations wholesale, then write the suite back.
suite.expectations = [
    vendor_id_exists,
    vendor_id_in_known_set,
    passenger_count_typical,
    passenger_count_bounds,
]

suite.save()
40 changes: 40 additions & 0 deletions gx-1.0.0a4/demos/scripts/04-create-validation-definition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from great_expectations import get_context

# TODO: will become from great_expectations import ValidationDefinition
from great_expectations.core import ValidationDefinition
from great_expectations.exceptions import DataContextError

# Demo: pair a batch definition with an expectation suite in a validation
# definition, then run it against one month of data.
context = get_context(project_root_dir="./")

# Walk from data source to asset to batch definition one step at a time.
# TODO: will become
# batch_definition = context.data_sources.get("project_name").get_asset("my_project").get_batch_definition("monthly")
data_source = context.datasources["project_name"]
asset = data_source.get_asset("my_project")
batch_definition = asset.get_batch_definition("monthly")

suite = context.suites.get("project_name")

# Reuse the validation definition if present; otherwise create and store it.
try:
    validation_definition = context.validation_definitions.get("my_project")
# TODO: will become except ResourceNotFoundError:
except DataContextError:
    # TODO: will become
    # validation_definition = context.validation_definitions.add(
    #     name="my_project",
    #     data=batch_definition,
    #     suite=suite
    # )
    new_definition = ValidationDefinition(
        name="my_project", data=batch_definition, suite=suite
    )
    validation_definition = context.validation_definitions.add(new_definition)

### To run this in your project it is critical to provide batch_parameters
batch_parameters = {"year": "2020", "month": "04"}
stored_definition = context.validation_definitions.get("my_project")
result = stored_definition.run(batch_parameters=batch_parameters)

# TODO: This should only run on the latest batch, or it should fail entirely with an error that
# batch parameters are missing
# result = context.validation_definitions.get("my_project").run()
print(result)
8 changes: 8 additions & 0 deletions gx-1.0.0a4/demos/scripts/05-run-validation-definition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from great_expectations import get_context

# Demo: load a previously stored validation definition and run it.
context = get_context(project_root_dir="./")
# NOTE: It is critical to pass the batch_parameters to the run method, otherwise the validation
# will stall by trying to read all the data. We will have a fix in place before the final release.
# TODO: Implement fix for above issue
validation_definition = context.validation_definitions.get("my_project")
result = validation_definition.run(batch_parameters={"year": "2020", "month": "04"})
# Print the result so the outcome of the run is visible when the script is executed.
print(result)
32 changes: 32 additions & 0 deletions gx-1.0.0a4/demos/scripts/06-create-checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from great_expectations import get_context

# TODO: will become from great_expectations import Checkpoint
# TODO will become from great_expectations.actions import SlackNotificationAction
from great_expectations.checkpoint import Checkpoint, SlackNotificationAction

# TODO: will become from great_expectations.exceptions import ResourceNotFoundError
from great_expectations.exceptions import DataContextError

# Demo: wrap a validation definition in a checkpoint with a Slack
# notification action, then run the checkpoint.
context = get_context(project_root_dir="./")

# Reuse the checkpoint if it is already stored; otherwise build and add it.
try:
    checkpoint = context.checkpoints.get("project_integration_checkpoint")
# TODO: Will become ResourceNotFoundError
except DataContextError:
    validation_definition = context.validation_definitions.get("my_project")
    slack_action = SlackNotificationAction(
        name="slack_notification",
        # TODO: config variable substitution not working
        slack_token="${SLACK_NOTIFICATION_TOKEN}",
        slack_channel="#alerts-timber-test",
    )
    new_checkpoint = Checkpoint(
        name="project_integration_checkpoint",
        validation_definitions=[validation_definition],
        actions=[slack_action],
    )
    checkpoint = context.checkpoints.add(new_checkpoint)

batch_parameters = {"year": "2020", "month": "04"}
result = checkpoint.run(batch_parameters=batch_parameters)
print(result)
6 changes: 6 additions & 0 deletions gx-1.0.0a4/demos/scripts/07-run-checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from great_expectations import get_context

# Demo: load a previously stored checkpoint and run it for one month of data.
context = get_context(project_root_dir="./")

# The batch parameters select which year/month of data the run covers.
batch_parameters = {"year": "2020", "month": "04"}

checkpoint = context.checkpoints.get("project_integration_checkpoint")
result = checkpoint.run(batch_parameters=batch_parameters)
print(result)
5 changes: 5 additions & 0 deletions gx-1.0.0a4/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
boto3
fastparquet
great-expectations==1.0.0a4
jupyter
pandas
psycopg2
pyarrow
s3fs
sqlalchemy

0 comments on commit 3f1a2b9

Please sign in to comment.