From bfad8bf196773a516d2026f517b5240252219c9b Mon Sep 17 00:00:00 2001 From: Tarek Elqoulaq <65929345+luffyofproduct@users.noreply.github.com> Date: Mon, 29 Apr 2024 06:26:10 +0000 Subject: [PATCH 1/4] add docs for marts & macros --- macros/_macros__definitions.yml | 5 +++++ macros/_macros__docs.md | 13 +++++++++++++ models/marts/_marts__docs.md | 18 ++++++++++++++++++ models/marts/_marts__models.yml | 1 + 4 files changed, 37 insertions(+) create mode 100644 macros/_macros__definitions.yml create mode 100644 macros/_macros__docs.md create mode 100644 models/marts/_marts__docs.md diff --git a/macros/_macros__definitions.yml b/macros/_macros__definitions.yml new file mode 100644 index 0000000..94e6782 --- /dev/null +++ b/macros/_macros__definitions.yml @@ -0,0 +1,5 @@ +version: 2 + +macros: + - name: generate_schema_name + description: '{{ doc("generate_schema_name_description") }}' \ No newline at end of file diff --git a/macros/_macros__docs.md b/macros/_macros__docs.md new file mode 100644 index 0000000..6838146 --- /dev/null +++ b/macros/_macros__docs.md @@ -0,0 +1,13 @@ +{% docs generate_schema_name_description %} +This is a built-in dbt macro that changes the dataset name based on `target` and `+schema` config. + +* If `target` == `prod`, then models will deploy to separate datasets based on `+schema` config set in `dbt_project.yml`. For example: + * `staging` + * `warehouse` +* For any non-`prod` deployment, all models will deploy to the default dataset as indicated in `profiles.yml`. 
For example: + * `dev_mkahan` + * `dev_jdoe` + +More information can be found at the dbt docs site [here](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/using-custom-schemas#how-does-dbt-generate-a-models-schema-name) + +{% enddocs %} \ No newline at end of file diff --git a/models/marts/_marts__docs.md b/models/marts/_marts__docs.md new file mode 100644 index 0000000..69296ce --- /dev/null +++ b/models/marts/_marts__docs.md @@ -0,0 +1,18 @@ +{% docs nba_games_detail_description %} +A breakdown of metrics & other context related to individual `games`. + +This model is presented on a `game`-level granularity and built with reusability in mind. The expectation is that the data will be aggregated in various ways but access to the underlying data points will still be desired. Example aggregations include by date, team, conference, coach, etc. + +The metrics in this model include (but are not limited to): +* Total games played +* Home/away scores +* Point differentials +* Home vs Away wins + +Other game & team related details include (but are not limited to): +* Team Names +* Head Coaches +* General Managers +* Day Info (Date, Weekday, Month) +* Game Status (Regulation, Overtime) +{% enddocs %} \ No newline at end of file diff --git a/models/marts/_marts__models.yml b/models/marts/_marts__models.yml index 237b89e..d2cefd9 100644 --- a/models/marts/_marts__models.yml +++ b/models/marts/_marts__models.yml @@ -2,6 +2,7 @@ version: 2 models: - name: nba_games_detail + description: '{{ doc("nba_games_detail_description") }}' columns: - name: game_id tests: From 91d89d513eba3bebd9c6f367eacf203cc638110e Mon Sep 17 00:00:00 2001 From: Tarek Elqoulaq <65929345+luffyofproduct@users.noreply.github.com> Date: Mon, 29 Apr 2024 06:32:33 +0000 Subject: [PATCH 2/4] update readme --- README.md | 55 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7874ac8..6c2b9b2 100644 --- 
a/README.md +++ b/README.md @@ -1,15 +1,58 @@ -Welcome to your new dbt project! +## Our dbt Project +This repository consists of a [dbt](https://www.getdbt.com/) project that transforms raw data sources into clear, formatted models for Analytics. -### Using the starter project +To learn more about the overall architecture design & strategy, see our centralized handbook: +- [Data Architecture Handbook](https://docs.google.com/document/d/1bYec53rXu06qBMknMgoSoQPE69i645iqEcNlCoM4CcQ/edit?usp=sharing) -Try running the following commands: -- dbt run -- dbt test +### Sources: +Raw, unformatted data loaded directly from source systems using various data tools. +- `nba_data` - The primary source of NBA statistics data captured from an API & loaded via Airbyte. + - Schema: `analytics.raw_nba_data` +- `google_sheets` - Internally maintained reference sheets related to the project & loaded via Airbyte. + - Schema: `analytics.raw_google_sheets` +### Environments: +Transformed data models built via dbt with 3 distinct environments to enable a sustainable development workflow. +- **Development** + - Schema: `analytics.dev_[your-name]` + - One per developer to avoid conflicts or overriding changes during development. +- **CI** + - Schema: `analytics.ci` + - An isolated schema created specifically for testing Pull Request changes to ensure quality. +- **Production** + - Schemas: + - `analytics.staging` + - `analytics.warehouse` + - `analytics.marts` + - Separation by layer for easier navigation and permission management. + +### How to Get Started? +1. Create your local development environment + - Use a local IDE (ex. VS Code), dbt Cloud or GitHub Codespaces +2. Clone the current repo (or create a new one) + - Checkout the `main` branch and run `git pull` to sync changes +3. Create a New Branch for your new changes + - First, run `git branch your_branch_name` to create a new branch + - Then run `git checkout your_branch_name` to switch to it +4. Start developing! 
+ - Commit & Sync all changes to your branch during development + - *IMPORTANT* - All changes should follow the team [Style Guide](_project_docs/style_guide.md) +5. Create Pull Request + - When development is complete, Push your branch to GitHub & create a request + - Request peer reviews & confirm automated CI jobs succeed +6. Merge changes to the `main` branch + - Confirm automated post-merge jobs succeed +7. Get latest changes in your local environment + - Checkout the `main` branch in your local terminal + - Run "git pull" to sync the latest version of the code +8. Continue to develop & repeat the process + +### Notes +- Provide any other important call-outs of platform-specific information here. + ### Resources: - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support -- Find [dbt events](https://events.getdbt.com) near you - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices From efd7b229628f1108573f3b53fb0aa49c7a1cfe19 Mon Sep 17 00:00:00 2001 From: Tarek Elqoulaq <65929345+luffyofproduct@users.noreply.github.com> Date: Mon, 29 Apr 2024 06:39:51 +0000 Subject: [PATCH 3/4] add style guide --- _project_docs/style_guide.md | 283 +++++++++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 _project_docs/style_guide.md diff --git a/_project_docs/style_guide.md b/_project_docs/style_guide.md new file mode 100644 index 0000000..d19a2e8 --- /dev/null +++ b/_project_docs/style_guide.md @@ -0,0 +1,283 @@ +# dbt Style Guide + +## Model Naming +Our models (typically) fit into three main categories: staging, warehouse, marts. 
For more detail about aspects of this structure, check out [the dbt best practices](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). + +The file and naming structures are as follows (example): +``` +ssp_analytics +├── .github +│ ├── workflows +│ │ ├── ci.yml +│ │ ├── daily_refresh.yml +│ │ └── post_merge_deploy.yml +│ └── pull_request_template.md +├── _project_docs +│ ├── automation +│ │ │ └── profiles.yml +│ └── style_guide.md +├── analyses +├── seeds +│ └── some_data.csv +├── snapshots +├── tests +│ └── assert_some_test_scenario.sql +├── macros +│ ├── _macros__definitions.yml +│ ├── _macros__docs.md +│ └── generate_schema_name.sql +├── models +│ ├── marts +│ │ ├── _marts__docs.md +│ │ ├── _marts__models.yml +│ │ └── nba_games_detail.sql +│ ├── staging +│ │ ├── nba +│ │ │ ├── _nba__docs.md +│ │ │ ├── _nba__models.yml +│ │ │ ├── _nba__sources.yml +│ │ │ ├── stg_nba__games.sql +│ │ │ └── stg_nba__teams.sql +│ │ └── gsheets +│ │ ├── _gsheets__models.yml +│ │ ├── _gsheets__sources.yml +│ │ ├── stg_gsheets__franchise_actives.sql +│ │ ├── stg_gsheets__franchise_general_managers.sql +│ │ └── stg_gsheets__franchise_head_coaches.sql +│ ├── warehouse +│ │ ├── dimensions +│ │ │ ├── _dimensions__docs.md +│ │ │ ├── _dimensions__models.yml +│ │ │ ├── dim_calendar_dates.sql +│ │ │ ├── dim_games.sql +│ │ │ └── dim_teams.sql +│ │ └── facts +│ │ ├── _facts__docs.md +│ │ ├── _facts__models.yml +│ │ └── fct_games_played.sql +├── README.md +├── dbt_project.yml +├── packages.yml +└── requirements.txt +``` +- All objects should be plural, such as: `stg_nba__teams` +- Staging models are 1:1 with each source table and named with the following convention: `stg_[source]__[entity]s.sql` + - [Additional context on Staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging) +- Marts contain all of the useful data about a _particular entity_ at a granular level and should lean towards being wide and denormalized. 
+ - [Additional context on Marts models](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) +- Intermediate tables (if needed) should help break apart complex or lengthy logic and use the following convention: `int_[entity]s_[verb]s.sql` + - [Additional context on Intermediate models](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate) + + +## Model configuration + +- Model-specific attributes (like sort/dist keys) should be specified in the model. +- If a particular configuration applies to all models in a directory, it should be specified in the `dbt_project.yml` file. +- In-model configurations should be specified like this: + +```python +{{ + config( + materialized = 'table', + sort = 'id', + dist = 'id' + ) +}} +``` +- Marts should always be configured as tables + +## dbt conventions +* Only `stg_` models (or `base_` models if your project requires them) should select from `source`s. +* All other models should only select from other models. + +## Testing +- Every subdirectory should contain a `.yml` file, in which each model in the subdirectory is tested. For staging folders, there will be both `_sourcename__sources.yml` as well as `_sourcename__models.yml`. For other folders, the structure should be `_foldername__models.yml` (example `_finance__models.yml`). +- At a minimum, unique and not_null tests should be applied to the primary key of each model. + +## Naming and field conventions + +* Schema, table and column names should be in `snake_case`. +* Use names based on the _business_ terminology, rather than the source terminology. +* Each model should have a primary key. +* The primary key of a model should be named `[entity]_id`, e.g. `account_id` – this makes it easier to know what `id` is being referenced in downstream joined models. +* For base/staging models, fields should be ordered in categories, where identifiers are first and timestamps are at the end. +* Timestamp columns should be named `[event]_at`, e.g. 
`created_at`, and should be in UTC. If a different timezone is being used, this should be indicated with a suffix, e.g. `created_at_pt`. +* Booleans should be prefixed with `is_` or `has_`. +* Price/revenue fields should be in decimal currency (e.g. `19.99` for $19.99; many app databases store prices as integers in cents). If non-decimal currency is used, indicate this with a suffix, e.g. `price_in_cents`. +* Avoid reserved words as column names +* Consistency is key! Use the same field names across models where possible, e.g. a key to the `customers` table should be named `customer_id` rather than `user_id`. + +## CTEs + +For more information about why we use so many CTEs, check out [this discourse post](https://discourse.getdbt.com/t/why-the-fishtown-sql-style-guide-uses-so-many-ctes/1091). + +- All `{{ ref('...') }}` statements should be placed in CTEs at the top of the file +- Where performance permits, CTEs should perform a single, logical unit of work. +- CTE names should be as verbose as needed to convey what they do +- CTEs with confusing or notable logic should be commented +- CTEs that are duplicated across models should be pulled out into their own models +- Create a `final` or similar CTE that you select from as your last line of code. This makes it easier to debug code within a model (without having to comment out code!) +- CTEs should be formatted like this: + +``` sql +with + +events as ( + + ... + +), + +-- CTE comments go here +filtered_events as ( + + ... 
+ +) + +select * from filtered_events +``` + +## SQL style guide + +- Use trailing commas +- Indents should be four spaces (except for predicates, which should line up with the `where` keyword) +- Lines of SQL should be no longer than [80 characters](https://stackoverflow.com/questions/29968499/vertical-rulers-in-visual-studio-code) +- Field names and function names should all be lowercase +- The `as` keyword should be used when aliasing a field or table +- Fields should be stated before aggregates / window functions +- Aggregations should be executed as early as possible before joining to another table. +- Ordering and grouping by a number (eg. group by 1, 2) is preferred over listing the column names (see [this rant](https://blog.getdbt.com/write-better-sql-a-defense-of-group-by-1/) for why). Note that if you are grouping by more than a few columns, it may be worth revisiting your model design. +- Prefer `union all` to `union` [*](http://docs.aws.amazon.com/redshift/latest/dg/c_example_unionall_query.html) +- Avoid table aliases in join conditions (especially initialisms) – it's harder to understand what the table called "c" is compared to "customers". +- If joining two or more tables, _always_ prefix your column names with the table alias. If only selecting from one table, prefixes are not needed. +- Be explicit about your join (i.e. write `inner join` instead of `join`). `left joins` are normally the most useful, `right joins` often indicate that you should change which table you select `from` and which one you `join` to. + +- *DO NOT OPTIMIZE FOR A SMALLER NUMBER OF LINES OF CODE. 
NEWLINES ARE CHEAP, BRAIN TIME IS EXPENSIVE* + +### Example SQL +```sql +with + +my_data as ( + + select * from {{ ref('my_data') }} + +), + +some_cte as ( + + select * from {{ ref('some_cte') }} + +), + +some_cte_agg as ( + + select + id, + sum(field_4) as total_field_4, + max(field_5) as max_field_5 + + from some_cte + group by 1 + +), + +final as ( + + select [distinct] + my_data.field_1, + my_data.field_2, + my_data.field_3, + + -- use line breaks to visually separate calculations into blocks + case + when my_data.cancellation_date is null + and my_data.expiration_date is not null + then expiration_date + when my_data.cancellation_date is null + then my_data.start_date + 7 + else my_data.cancellation_date + end as cancellation_date, + + some_cte_agg.total_field_4, + some_cte_agg.max_field_5 + + from my_data + left join some_cte_agg + on my_data.id = some_cte_agg.id + where my_data.field_1 = 'abc' + and ( + my_data.field_2 = 'def' or + my_data.field_2 = 'ghi' + ) + having count(*) > 1 + +) + +select * from final + +``` + +- Your join should list the "left" table first (i.e. the table you are selecting `from`): +```sql +select + trips.*, + drivers.rating as driver_rating, + riders.rating as rider_rating + +from trips +left join users as drivers + on trips.driver_id = drivers.user_id +left join users as riders + on trips.rider_id = riders.user_id + +``` + +## YAML style guide + +* Indents should be two spaces +* List items should be indented +* Use a new line to separate list items that are dictionaries where appropriate +* Lines of YAML should be no longer than 80 characters. + +### Example YAML +```yaml +version: 2 + +models: + - name: events + columns: + - name: event_id + description: This is a unique identifier for the event + tests: + - unique + - not_null + + - name: event_time + description: "When the event occurred in UTC (eg. 
2018-01-01 12:00:00)" + tests: + - not_null + + - name: user_id + description: The ID of the user who recorded the event + tests: + - not_null + - relationships: + to: ref('users') + field: id +``` + + +## Jinja style guide + +* When using Jinja delimiters, use spaces on the inside of your delimiter, like `{{ this }}` instead of `{{this}}` +* Use newlines to visually indicate logical blocks of Jinja + + +## Helpful Reference Links +* https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview +* https://discourse.getdbt.com/t/why-the-fishtown-sql-style-guide-uses-so-many-ctes/1091 +* https://blog.getdbt.com/write-better-sql-a-defense-of-group-by-1/ +* https://docs.getdbt.com/docs/about/viewpoint +* https://github.com/dbt-labs/corp/blob/main/dbt_style_guide.md \ No newline at end of file From 4680b2bee1d6a9926d4e35923b1fa6d0331778b0 Mon Sep 17 00:00:00 2001 From: Tarek Elqoulaq <65929345+luffyofproduct@users.noreply.github.com> Date: Mon, 29 Apr 2024 06:46:08 +0000 Subject: [PATCH 4/4] add requirements.txt --- requirements.txt | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6e5253d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,49 @@ +agate==1.7.1 +annotated-types==0.6.0 +attrs==23.2.0 +Babel==2.14.0 +certifi==2024.2.2 +cffi==1.16.0 +charset-normalizer==3.3.2 +click==8.1.7 +colorama==0.4.6 +dbt-core==1.7.13 +dbt-extractor==0.5.1 +dbt-postgres==1.7.13 +dbt-semantic-interfaces==0.4.4 +idna==3.7 +importlib-metadata==6.11.0 +isodate==0.6.1 +Jinja2==3.1.3 +jsonschema==4.21.1 +jsonschema-specifications==2023.12.1 +leather==0.4.0 +Logbook==1.5.3 +MarkupSafe==2.1.5 +mashumaro==3.12 +minimal-snowplow-tracker==0.0.2 +more-itertools==10.2.0 +msgpack==1.0.8 +networkx==3.3 +packaging==24.0 +parsedatetime==2.6 +pathspec==0.11.2 +protobuf==4.25.3 +psycopg2-binary==2.9.9 +pycparser==2.22 +pydantic==2.7.1 
+pydantic_core==2.18.2 +python-dateutil==2.9.0.post0 +python-slugify==8.0.4 +pytimeparse==1.1.8 +pytz==2024.1 +PyYAML==6.0.1 +referencing==0.35.0 +requests==2.31.0 +rpds-py==0.18.0 +six==1.16.0 +sqlparse==0.5.0 +text-unidecode==1.3 +typing_extensions==4.11.0 +urllib3==1.26.18 +zipp==3.18.1