Skip to content

Commit

Permalink
storage billing model ddl optimization script (#456)
Browse files Browse the repository at this point in the history
  • Loading branch information
afleisc authored Oct 1, 2024
1 parent 4d8480c commit e351a46
Show file tree
Hide file tree
Showing 2 changed files with 164 additions and 0 deletions.
24 changes: 24 additions & 0 deletions scripts/optimization/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,30 @@ contains views with JOINs where the JOIN condition is potentially non-optimal.

</details>

<details><summary><b>&#128269; Storage Billing Model Savings DDL </b></summary>

## Storage Billing Model Savings DDL

The [storage_billing_model_savings_ddl.sql](storage_billing_model_savings_ddl.sql) script
creates a table named, `storage_billing_model_savings_ddl` which forecasts which datasets will benefit from
either the physical or logical storage billing model, and provides the DDL to adjust them across your organization.

Instructions: Search for marker 'REMEMBER' to tune the queries at your will. You must adjust the region that you're interested in
and the pricing for each region found [here](https://cloud.google.com/bigquery/pricing#storage).
### Examples of querying script results
* Top 10 datasets with their respective recommended storage billing model, ordered by forecasted savings
```sql
SELECT project_name, dataset_name, currently_on, better_on, monthly_savings, ddl
FROM optimization_workshop.storage_billing_model_savings_ddl
ORDER BY monthly_savings DESC
LIMIT 10
```
</details>
# Query Analysis
<details><summary><b>&#128269; Queries grouped by hash</b></summary>
Expand Down
140 changes: 140 additions & 0 deletions scripts/optimization/storage_billing_model_savings_ddl.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
-- WARNING #########################################################################################################################################
-- WARNING If you change the storage billing model of a dataset then you have to wait 14 days before changing it again.
-- WARNING See https://cloud.google.com/bigquery/docs/datasets-intro#dataset_limitations
-- WARNING #########################################################################################################################################


-- Instructions
-- Search for marker 'REMEMBER' to tune the queries at your will
-- Run the query to obtain the results including ALTER SCHEMA DDL


-- Permissions needed
-- * To run this query https://cloud.google.com/bigquery/docs/information-schema-table-storage-by-organization#required_permissions
-- * To run the ALTER SCHEMA DDL https://cloud.google.com/bigquery/docs/updating-datasets#required_permissions


-- REMEMBER: Put here the prices of the region of interest, current values are for the US
-- See https://cloud.google.com/bigquery/pricing#storage
DECLARE active_logical_gib_price FLOAT64 DEFAULT 0.02;
DECLARE long_term_logical_gib_price FLOAT64 DEFAULT 0.01;
DECLARE active_physical_gib_price FLOAT64 DEFAULT 0.04;
DECLARE long_term_physical_gib_price FLOAT64 DEFAULT 0.02;


-- REMEMBER: (optional) Change this if you plan to change the time travel window (e.g. if you currently use the default 7 days and want to reduce it to 2 days put 2.0/7.0)
-- See https://cloud.google.com/bigquery/docs/time-travel#configuring_the_time_travel_window
DECLARE time_travel_rescale FLOAT64 DEFAULT 1.0;
-- REMEMBER: (optional) Change this if you want that the generated DDL statements include time travel window settings (e.g. 2.0*24.0)
DECLARE time_travel_hours FLOAT64 DEFAULT NULL;
-- REMEMBER: (optional) Change this to filter based on savings absolute value
DECLARE min_monthly_saving FLOAT64 DEFAULT 0.0;
-- REMEMBER: (optional) Change this to filter based on savings % value (e.g. 1% is 0.01)
DECLARE min_monthly_saving_pct FLOAT64 DEFAULT 0.00;

CREATE SCHEMA IF NOT EXISTS optimization_workshop;
CREATE OR REPLACE TABLE optimization_workshop.storage_billing_model_savings_ddl AS
WITH
storage_sizes AS (
SELECT
project_id AS project_name,
table_schema AS dataset_name,
-- Logical
SUM(active_logical_bytes) / power(1024, 3) AS active_logical_gib,
SUM(long_term_logical_bytes) / power(1024, 3) AS long_term_logical_gib,
-- Physical
SUM(active_physical_bytes - time_travel_physical_bytes) / power(1024, 3) AS active_no_tt_no_fs_physical_gib,
SUM(time_travel_physical_bytes) / power(1024, 3) * time_travel_rescale AS time_travel_physical_gib,
SUM(fail_safe_physical_bytes) / power(1024, 3) AS fail_safe_physical_gib,
SUM(long_term_physical_bytes) / power(1024, 3) AS long_term_physical_gib,
FROM
-- REMEMBER: Change here to the region of interest
`region-us`.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION
-- See https://cloud.google.com/bigquery/docs/information-schema-table-storage-by-organization#schema
WHERE TRUE
AND total_physical_bytes > 0
-- REMEMBER: (optional) You may want to enable this filter to exclude ML models, materialized views, etc...
-- AND table_type = 'BASE TABLE'
GROUP BY 1,2
),
storage_prices AS (
SELECT
project_name,
dataset_name,
-- Logical
active_logical_gib AS active_logical_gib,
long_term_logical_gib AS long_term_logical_gib,
-- Physical
active_no_tt_no_fs_physical_gib AS active_no_tt_no_fs_physical_gib,
time_travel_physical_gib AS time_travel_physical_gib,
fail_safe_physical_gib AS fail_safe_physical_gib,
long_term_physical_gib AS long_term_physical_gib,
-- Compression ratio
SAFE_DIVIDE(active_logical_gib, active_no_tt_no_fs_physical_gib) AS active_compression_ratio,
SAFE_DIVIDE(long_term_logical_gib, long_term_physical_gib) AS long_term_compression_ratio,
-- Forecast costs logical
active_logical_gib * active_logical_gib_price AS forecast_active_logical_cost,
long_term_logical_gib * long_term_logical_gib_price AS forecast_long_term_logical_cost,
-- Forecast costs physical
active_no_tt_no_fs_physical_gib * active_physical_gib_price AS forecast_active_no_tt_no_fs_physical_cost,
time_travel_physical_gib * active_physical_gib_price AS forecast_travel_physical_cost,
fail_safe_physical_gib * active_physical_gib_price AS forecast_failsafe_physical_cost,
long_term_physical_gib * long_term_physical_gib_price AS forecast_long_term_physical_cost,
FROM
storage_sizes
),
storage_prices_total AS (
SELECT
project_name,
dataset_name,
(forecast_active_logical_cost+forecast_long_term_logical_cost) AS forecast_logical,
(forecast_active_no_tt_no_fs_physical_cost+forecast_travel_physical_cost+forecast_failsafe_physical_cost+forecast_long_term_physical_cost) AS forecast_physical,
FROM storage_prices
),
current_model AS (
SELECT
catalog_name AS project_name,
option_name AS dataset_name,
LOWER(option_value) AS current_model
FROM
-- REMEMBER: Change here to the region of interest
`region-us`.INFORMATION_SCHEMA.SCHEMATA_OPTIONS
WHERE option_name = "storage_billing_model"
),
storage_prices_compare AS (
SELECT
SPT.project_name,
SPT.dataset_name,
SPT.forecast_logical,
SPT.forecast_physical,
(SPT.forecast_logical - SPT.forecast_physical) AS forecast_compare,
IF(SPT.forecast_logical > SPT.forecast_physical,"physical","logical") AS better_on,
IFNULL(CM.current_model, "logical") AS currently_on
FROM
storage_prices_total AS SPT
LEFT JOIN current_model AS CM
USING(project_name, dataset_name)
),
storage_prices_compare2 AS (
SELECT
*,
IF(currently_on = "logical", forecast_logical, forecast_physical) AS monthly_spending,
ABS(forecast_compare) AS monthly_savings,
ABS(forecast_compare)/IF(currently_on = "logical", forecast_logical, forecast_physical) AS monthly_savings_pct,
FROM storage_prices_compare
),
storage_ddl AS (
SELECT
*,
IF(time_travel_hours IS NULL,
CONCAT("ALTER SCHEMA `", project_name, ".", dataset_name, "` SET OPTIONS(storage_billing_model='", better_on, "' );" ),
CONCAT("ALTER SCHEMA `", project_name, ".", dataset_name, "` SET OPTIONS(storage_billing_model='", better_on, "', max_time_travel_hours=", time_travel_hours, ");" )
) AS ddl,
FROM storage_prices_compare2
WHERE better_on != currently_on
AND monthly_savings > min_monthly_saving
AND monthly_savings_pct > min_monthly_saving_pct
ORDER BY monthly_savings DESC
)
-- REMEMBER: (optional) Change here the name of the pseudo-table (CTE) you want to check
SELECT * FROM storage_ddl;

0 comments on commit e351a46

Please sign in to comment.