From ced15fe1611e5d505d299d6d52d36094349a67c5 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 19 Nov 2024 09:14:41 +0100 Subject: [PATCH 01/10] =?UTF-8?q?=F0=9F=93=9C=20private=20datasets?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From c0c7ec96ed4a1eef9454ee768e8b0abd5a666a4e Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 19 Nov 2024 09:25:43 +0100 Subject: [PATCH 02/10] fix #2976 --- docs/guides/private-import.md | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/docs/guides/private-import.md b/docs/guides/private-import.md index 120ce5849e2..8da97007592 100644 --- a/docs/guides/private-import.md +++ b/docs/guides/private-import.md @@ -3,11 +3,10 @@ tags: - 👷 Staff --- -While most of the data at OWID is publicly available, some datasets are added to our catalogue with some restrictions. These include datasets that are not redistributable, or that are not meant to be shared with the public. This can happen due to a strict license by the data provider, or because the data is still in a draft stage and not ready for public consumption. +While most of the data at OWID is publicly available, some datasets are added to our catalog with some restrictions. These include datasets that are not redistributable, or that are not meant to be shared with the public. This can happen due to a strict license by the data provider, or because the data is still in a draft stage and not ready for public consumption. Various privacy configurations are available: -- Skip re-publishing to GitHub. - Disable data downloading options on Grapher. - Disable public access to the original file (snapshot). - Hide the dataset from our public catalog (accessible via `owid-catalog-py`). @@ -16,6 +15,12 @@ In the following, we explain how to create private steps in the ETL pipeline and ## Create a private step + +!!! 
tip "Make your dataset completely private" + + - **Snapshot**: Set `meta.is_public` to `false` in the snapshot DVC file. + - **Meadow, Garden, Grapher**: Use `data-private://` prefix in the step name in the DAG. Set `dataset.non_redistributable` to `true` in the dataset garden metadata. + ### Snapshot To create a private snapshot step, set the `meta.is_public` property in the snapshot .dvc file to false: @@ -34,7 +39,7 @@ This will prevent the file to be publicly accessible without the appropriate cre ### Meadow, Garden, Grapher -Creating a private data step means that the data will not be listed in the public catalog, and therefore will not be accessible via `owid-catalog-py`. In addition, private datasets will not be re-published to GitHub. +Creating a private data step means that the data will not be listed in the public catalog, and therefore will not be accessible via `owid-catalog-py`. To create a private data step (meadow, garden or grapher) simply use `data-private` prefix in the step name in the DAG. For example, the step `grapher/ihme_gbd/2024-06-10/leading_causes_deaths` (this is from [health.yml](https://github.com/owid/etl/blob/master/dag/health.yml)) is private: @@ -70,8 +75,8 @@ etl run run [step-name] --private If you want to make a private step public simply follow the steps below: -- **In the DAG:** Replace `data-private/` prefix with `data/`. -- **In the snapshot DVC file**: Set `meta.is_public` to `true` (or simply remove `is_public` property). -- (Optional) **Allow for Grapher downloads**: Set `dataset.non_redistributable` to `false` in the dataset garden metadata (or simply remove the property from the metadata). +- **In the DAG:** Replace `data-private://` prefix with `data://`. +- **In the snapshot DVC file**: Set `meta.is_public` to `true` (or simply remove this property). +- (Optional) **Allow for Grapher downloads**: Set `dataset.non_redistributable` to `false` in the dataset garden metadata (or simply remove this property). 
After this, re-run the snapshot step and commit your changes. From 662fb04c35c5b5e70615e37b32dd162e7b90eda3 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 19 Nov 2024 09:30:04 +0100 Subject: [PATCH 03/10] remove status: new --- docs/architecture/workflow/index.md | 4 ---- docs/guides/auto-regular-updates.md | 1 - 2 files changed, 5 deletions(-) diff --git a/docs/architecture/workflow/index.md b/docs/architecture/workflow/index.md index 0762d14b3b1..4de1754e352 100644 --- a/docs/architecture/workflow/index.md +++ b/docs/architecture/workflow/index.md @@ -1,7 +1,3 @@ ---- -status: new ---- - Our World in Data has a whole team dedicated to data management that takes data from publicly available sources (e.g. the _UN Food and Agriculture Organisation_), and makes it available to our researchers to analyse and create visualisation for their articles. ## Five stages diff --git a/docs/guides/auto-regular-updates.md b/docs/guides/auto-regular-updates.md index 50912419fbf..f82957e6809 100644 --- a/docs/guides/auto-regular-updates.md +++ b/docs/guides/auto-regular-updates.md @@ -1,7 +1,6 @@ --- tags: - 👷 Staff -status: new --- !!! 
warning "This is a work in progress" From eb97288452aecf4fc3c61339cee8e80cc0c0e859 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 19 Nov 2024 11:26:29 +0100 Subject: [PATCH 04/10] re-structure docs on export / explorers --- docs/architecture/workflow/index.md | 242 +++------------------- docs/architecture/workflow/other-steps.md | 31 ++- docs/guides/data-work/export-data.md | 157 ++++++++++++++ docs/guides/data-work/index.md | 2 - 4 files changed, 208 insertions(+), 224 deletions(-) create mode 100644 docs/guides/data-work/export-data.md diff --git a/docs/architecture/workflow/index.md b/docs/architecture/workflow/index.md index 4de1754e352..66bfd66fe0a 100644 --- a/docs/architecture/workflow/index.md +++ b/docs/architecture/workflow/index.md @@ -5,18 +5,17 @@ Our World in Data has a whole team dedicated to data management that takes data The ETL project provides an opinionated data management workflow, which separates a data manager's work into five stages: ```mermaid -graph TB +graph LR -snapshot --> format --> harmonise --> import --> publish +snapshot --> format --> harmonize/process --> import --> publish ``` The design of the ETL involves steps that mirror the stages above, which help us to meet several design goals of the project: -1. [Snapshot step](#snapshot): Take a **snapshot** of the upstream data product and store it on our end. - -- [Meadow step](#meadow): Bring the data into a **common format**. -- [Garden step](#garden): **Harmonise** the names of countries, genders and any other columns we may want to join on. Also do the necessary data processing to make the dataset usable for our needs. -- [Grapher step](#grapher): **Import** the data to our internal MySQL database. +1. [Snapshot step](#snapshot): Take a **snapshot** of the upstream data product and store it. +2. [Meadow step](#meadow): Bring the data into a **common format**. +3. 
[Garden step](#garden): **Harmonize** the names of countries, genders and any other columns we may want to join on. Also do the necessary **data processing** to make the dataset usable for our needs. +4. [Grapher step](#grapher): **Import** the data to our internal MySQL database. A data manager must implement all these steps to make something chartable on the Our World in Data site. @@ -28,9 +27,7 @@ A data manager must implement all these steps to make something chartable on the ## Snapshot -The initial step in importing data from an upstream source involves **transferring an external file directly into our platform**. This process is essential to ensure both reliable and secure access to the file. - -It's important to recognize that an external source might remove the file at any time. Furthermore, this method supports the reproducibility of all Extract, Transform, Load (ETL) processes. This is crucial because the content of the file at the source may undergo changes, such as the removal or addition of datapoints, or alterations in field names. +The initial step consists of **transferring an external file from an upstream provider into our platform**. This ensures that the source data is always accessible. This is because the upstream provider might remove the file at any time, or change it. The accompanying diagram illustrates the process of importing various versions of the same dataset into our snapshot catalog, depicted over time. Imagine that the vertical axis represents time. @@ -56,9 +53,9 @@ flowchart LR The snapshot step typically consists of a DVC file and a script that downloads the upstream data and saves it to our snapshot catalog. Snapshot files are located in the [`snapshots/`](https://github.com/owid/etl/tree/master/snapshots) directory of the project. -Note that we need a DVC file per upstream data file; hence, in some instances, if the source publishes a datset using multiple files, we need multiple DVC files. 
+Note that we need a DVC file per upstream data file; hence, in some instances, if the source publishes a dataset using multiple files, we need multiple DVC files. -### Metadata +### Snapshot metadata A Snapshot is a picture of a data product (e.g. a data CSV file) provided by an upstream data provider at a particular point in time. It is the entrypoint to ETL and where we define metadata attributes of that picture. This is fundamental to ensure that the data is properly documented and that the metadata is propagated to the rest of the system. @@ -66,7 +63,7 @@ The metadata in Snapshot consists mainly of one object: `meta.origin`. !!! info "Learn more in our [metadata reference](../metadata/reference#origin)." -This metadata is captured in a DVC file (similar to a yaml file), which contains all the snapshot metadata fields as key-value pairs. + This metadata is captured in a DVC file (similar to a YAML file), which contains all the snapshot metadata fields as key-value pairs. ??? example "Example of [`snapshots/animal_welfare/2023-10-24/fur_laws.xlsx.dvc`](https://github.com/owid/etl/blob/master/snapshots/animal_welfare/2023-10-24/fur_laws.xlsx.dvc)" @@ -95,49 +92,17 @@ This metadata is captured in a DVC file (similar to a yaml file), which contains ## Meadow -The meadow step is the first Transform step of our ETL. - -In a meadow step, we load a `snapshot` and adapt it to be in a convenient format. A convenient format means creating an instance of a [`Dataset`](../../design/common-format/#datasets-owidcatalogdataset), with the appropriate data as a table (or tables). +The meadow step is the first Transform step of our ETL. In it, we load a [`Snapshot`](../../architecture/design/phases/#snapshot) and adapt it to be in a convenient format. A convenient format means creating an instance of a [`Dataset`](../../architecture/design/phases/#datasets), with the appropriate data as a `Table` (or tables). In this step, you can add and define metadata, but we rarely do this. 
Instead, we propagate the metadata defined in the Snapshot step and leave it to the Garden step to enhance the metadata. Meadow steps should only have `snapshot` (or `walden`) dependencies and ー by definition ー should not depend on `garden` steps. -A typical flow up to the Meadow step could look like: - -```mermaid -flowchart LR - - upstream1((____)):::node -.->|copy| snapshot1((____)):::node - snapshot1((____)):::node -->|format| meadow1((____)):::node - - subgraph id0 [Upstream] - upstream1 - end - - subgraph id1 [Snapshot] - snapshot1 - end - - subgraph id2 [Meadow] - meadow1 - end - - - subgraph id [ETL] - id1 - id2 - end - - classDef node fill:#002147,color:#002147 - classDef node_ss fill:#002147,color:#fff -``` - ## Garden The Garden step is where most of the work falls in. This is where the data manager needs to carefully look at the data, filter outliers, harmonize labels (e.g. country names), improve the dataset metadata, etc. -Garden steps typically depend on meadow steps. For instance, the Garden step `data://garden/un/2022-07-11/un_wpp`, which generates the dataset _World Population Prospects (UN, 2022)_, depends on this same process but in Meadow (i.e. `data://meadow/un/2022-07-11/un_wpp`). After some pre-liminary work (mostly re-formating table, and some minor cleaning), we can now focus on more complex processing steps in Garden. +Garden steps typically depend on meadow steps. For instance, the Garden step `data://garden/un/2024-07-12/un_wpp`, which generates the dataset _World Population Prospects (UN, 2024)_, depends on this same process but in Meadow (i.e. `data://meadow/un/2024-07-12/un_wpp`). After some preliminary work (mostly reformatting tables and some minor cleaning), we can now focus on more complex processing steps in Garden. 
A typical flow up to the Garden step could look like: @@ -146,7 +111,7 @@ flowchart LR upstream1((____)):::node -.->|copy| snapshot1((____)):::node snapshot1((____)):::node -->|format| meadow1((____)):::node - meadow1((____)):::node -->|harmonize| garden1((____)):::node + meadow1((____)):::node -->|process| garden1((____)):::node subgraph id0 [Upstream] upstream1 @@ -174,7 +139,7 @@ flowchart LR classDef node_ss fill:#002147,color:#fff ``` -However, garden steps could also depend on other garden steps. This is often the case for datasets containing _long-run indicators_, where different `garden` datasets are combined. +However, Garden steps could also depend on other garden steps. This is often the case for datasets containing _long-run indicators_, or just in general when different `garden` datasets are combined. !!! info "Long-run indicators" @@ -186,27 +151,27 @@ However, garden steps could also depend on other garden steps. This is often the ```yaml data://garden/demography/2023-03-31/population: - - data://garden/hyde/2017/baseline - - data://garden/gapminder/2023-03-31/population - - data://garden/un/2022-07-11/un_wpp - - data://open_numbers/open_numbers/latest/gapminder__systema_globalis + - data://garden/hyde/2017/baseline + - data://garden/gapminder/2023-03-31/population + - data://garden/un/2022-07-11/un_wpp + - data://open_numbers/open_numbers/latest/gapminder__systema_globalis ``` -An important processing step in Garden is to standardise (or harmonise) the country names. You can learn more about this in our [country standardisation guide](../../guides/harmonize-countries). +An important processing step in Garden is to standardize (or harmonize) the country names. You can learn more about this in our [country harmonization guide](../../guides/harmonize-countries). -### Metadata +### Garden metadata After adapting and processing the origin's data, we have a curated dataset. 
This dataset, contains indicators (maybe not present in the origin) that we need to properly document. -The metadata in Garden consists mainly of two objects: `dataset` and `tables`. The metadata comes as a YAML file next to the processing scripts. +The metadata in Garden consists mainly of two objects: [`Dataset`](../../architecture/metadata/reference/#dataset) and [`Table`](../../architecture/metadata/reference/#table) (list). The metadata comes as a YAML file next to the processing scripts. !!! info "Learn more in our [dataset reference](../metadata/reference/#dataset), [tables reference](../metadata/reference/#table) and [indicator reference](../metadata/reference/#variable)." ## Grapher -In the `grapher` step the work should be minimal. Here, we create a `grapher` view by adapting our Garden dataset to fit the Grapher requirements. +In the Grapher step the work should be minimal. Here, we create a `Grapher` view by adapting our Garden dataset to adhere to the Grapher requirements. -Grapher views are still normal datasets, but they adapt the data to the way it must look when being inserted to MySQL. For each grapher view, there is a corresponding matching `grapher://` step automatically generated which does the actual insert to MySQL, if MySQL credentials have been configured. +Grapher views are still normal datasets, but they adapt the data to the way it must look when being inserted into MySQL. For each grapher view, there is a corresponding matching `grapher://` step automatically generated which does the actual insert to MySQL, if MySQL credentials have been configured. A typical flow up to the Grapher step could look like: @@ -254,163 +219,6 @@ flowchart LR classDef node_ss fill:#002147,color:#fff ``` -In principle, a grapher step only loads a single garden step. +In principle, a Grapher step only loads a single garden step. -Note that the diagram shows a final step outside of the ETL. 
This is when the `grapher://` step is executed, and takes data from the ETL (from the etl `garden` step) and imports it to oure database. - -!!! bug "TODO: Add an example of code" - -## Export - -Sometimes we want to perform an action instead of creating a dataset. For instance, we might want to create a TSV file for an explorer, commit a CSV to a GitHub repository, or create a config for a multi-dimensional indicator. This is where the `export` step comes in. - -Export steps are defined in `etl/steps/export` directory and have similar structure to regular steps. They are run with the `--export` flag. - -```bash -etlr export://explorers/minerals/latest/minerals --export -``` - -The `def run(dest_dir):` function doesn't save a dataset, but calls a method that performs the action. For instance `create_explorer(...)` or `gh.commit_file_to_github(...)`. Once the step is executed successfully, it won't be run again unless its code or dependencies change (it won't be "dirty"). - -### Creating explorers - -TSV files for explorers are created using the `create_explorer` function, usually from a configuration YAML file - -```python -# Create a new explorers dataset and tsv file. -ds_explorer = create_explorer(dest_dir=dest_dir, config=config, df_graphers=df_graphers) -ds_explorer.save() -``` -!!! info "Creating explorers on staging servers" - - Explorers can be created or edited on staging servers and then manually migrated to production. Each staging server creates a branch in the `owid-content` repository. Editing explorers in Admin or running the `create_explorer` function pushes changes to that branch. Once the PR is merged, the branch gets pushed to the `owid-content` repository (not to the `master` branch, but its own branch). You then need to manually create a PR from that branch and merge it into `master`. - - -### Creating multi-dimensional indicators - -Multi-dimensional indicators are powered by a configuration that is typically created from a YAML file. 
The structure of the YAML file looks like this: - -```yaml title="etl/steps/export/multidim/covid/latest/covid.deaths.yaml" -definitions: - table: {definitions.table} - -title: - title: COVID-19 deaths - titleVariant: by interval -defaultSelection: - - World - - Europe - - Asia -topicTags: - - COVID-19 - -dimensions: - - slug: interval - name: Interval - choices: - - slug: weekly - name: Weekly - description: null - - slug: biweekly - name: Biweekly - description: null - - - slug: metric - name: Metric - choices: - - slug: absolute - name: Absolute - description: null - - slug: per_capita - name: Per million people - description: null - - slug: change - name: Change from previous interval - description: null - -views: - - dimensions: - interval: weekly - metric: absolute - indicators: - y: "{definitions.table}#weekly_deaths" - - dimensions: - interval: weekly - metric: per_capita - indicators: - y: "{definitions.table}#weekly_deaths_per_million" - - dimensions: - interval: weekly - metric: change - indicators: - y: "{definitions.table}#weekly_pct_growth_deaths" - - - dimensions: - interval: biweekly - metric: absolute - indicators: - y: "{definitions.table}#biweekly_deaths" - - dimensions: - interval: biweekly - metric: per_capita - indicators: - y: "{definitions.table}#biweekly_deaths_per_million" - - dimensions: - interval: biweekly - metric: change - indicators: - y: "{definitions.table}#biweekly_pct_growth_deaths" -``` - -The `dimensions` field specifies selectors, and the `views` field defines views for the selection. Since there are numerous possible configurations, `views` are usually generated programmatically. However, it's a good idea to create a few of them manually to start. - -You can also combine manually defined views with generated ones. See the `etl.multidim` module for available helper functions or refer to examples from `etl/steps/export/multidim/`. Feel free to add or modify the helper functions as needed. 
- -The export step loads the YAML file, adds `views` to the config, and then calls the function. - -```python title="etl/steps/export/multidim/covid/latest/covid.py" -def run(dest_dir: str) -> None: - engine = get_engine() - - # Load YAML file - config = paths.load_mdim_config("covid.deaths.yaml") - - multidim.upsert_multidim_data_page("mdd-energy", config, engine) -``` - -To see the multi-dimensional indicator in Admin, run - -```bash -etlr export://multidim/energy/latest/energy --export -``` - -and check out the preview at http://staging-site-my-branch/admin/grapher/mdd-name. - - -### Exporting data to GitHub - -One common use case for the `export` step is to commit a dataset to a GitHub repository. This is useful when we want to make a dataset available to the public. The pattern for this looks like this: - -```python -if os.environ.get("CO2_BRANCH"): - dry_run = False - branch = os.environ["CO2_BRANCH"] -else: - dry_run = True - branch = "master" - -gh.commit_file_to_github( - combined.to_csv(), - repo_name="co2-data", - file_path="owid-co2-data.csv", - commit_message=":bar_chart: Automated update", - branch=branch, - dry_run=dry_run, -) -``` - -This code will commit the dataset to the `co2-data` repository on GitHub if you specify the `CO2_BRANCH` environment variable, i.e. - -```bash -CO2_BRANCH=main etlr export://co2/latest/co2 --export -``` +Note that the diagram shows a final step outside of the ETL. This is when the `grapher://` step is executed, and takes data from the ETL (from the etl `garden` step) and imports it to our database. diff --git a/docs/architecture/workflow/other-steps.md b/docs/architecture/workflow/other-steps.md index e738bb318c2..bdd1bf186ea 100644 --- a/docs/architecture/workflow/other-steps.md +++ b/docs/architecture/workflow/other-steps.md @@ -1,11 +1,35 @@ +--- +status: new +--- + So far you have learned about the standard steps. These should cover most of the cases. However, there are some other steps worth mentioning. 
-## Explorers +## Export steps + +Sometimes we want to perform an action instead of creating a dataset. For instance, we might want to create a TSV file for an explorer, commit a CSV to a GitHub repository, or create a config for a multi-dimensional indicator. This is where the `Export` step comes in. + +Export steps are used to perform an action on an already created dataset. This action typically implies making the data available to other parts of the system. There are different types of export steps: + +- **Explorers**: Create a TSV file for a data explorer. +- **Multi-dimensional indicators**: Create a configuration for a multi-dimensional indicator. +- **Export to GitHub**: Commit a dataset to a GitHub repository. + +Export steps should be used after the data has been processed and is ready to be used (post-Garden). + +!!! note "Learn more about [export steps](../../guides/data-work/export-data.md)" -Data explorers are Grapher charts expanded with additional functionalities to facilitate exploration, such as dynamic entity filters or customizable menus. They are powered by CSV files generated by ETL [served from S3](https://dash.cloudflare.com/078fcdfed9955087315dd86792e71a7e/r2/default/buckets/owid-catalog). Explorers data step in ETL is responsible for generating these CSV files. It works in the same way as e.g. garden step, but the transformations made there are meant to get the data ready for the data explorer (and not be consumed by users of catalog). +### Explorers + +Data explorers are Grapher charts expanded with additional functionalities to facilitate exploration, such as dynamic entity filters or customizable menus. They are usually powered by indicators from OWID's Grapher database. !!! info "Learn more about creating Data explorers [on Notion :octicons-arrow-right-24:](https://www.notion.so/owid/Creating-Data-Explorers-cf47a5ef90f14c1fba8fc243aba79be7)." +!!! note "Legacy explorers" + + In the past Explorers were manually defined from our Admin. 
Data was sourced from CSV files generated by ETL [served from S3](https://dash.cloudflare.com/078fcdfed9955087315dd86792e71a7e/r2/default/buckets/owid-catalog), or from GitHub. + + We have slowly transitioned into a new system where explorers are generated from the ETL pipeline. This is a more scalable and maintainable solution. + ## Backport Datasets from our production grapher database can be backported to ETL catalog. @@ -42,9 +66,6 @@ flowchart LR classDef node_ss fill:#002147,color:#fff ``` -## Open Numbers - -!!! warning "TO BE DONE" ## ETag diff --git a/docs/guides/data-work/export-data.md b/docs/guides/data-work/export-data.md new file mode 100644 index 00000000000..a3ff3341601 --- /dev/null +++ b/docs/guides/data-work/export-data.md @@ -0,0 +1,157 @@ +--- +status: new +--- + +!!! warning "Export steps are a work in progress" + +Export steps are defined in the `etl/steps/export` directory and have a similar structure to regular steps. They are run with the `--export` flag: + +```bash +etlr export://explorers/minerals/latest/minerals --export +``` + +The `def run(dest_dir):` function doesn't save a dataset, but calls a method that performs the action. For instance `create_explorer(...)` or `gh.commit_file_to_github(...)`. Once the step is executed successfully, it won't be run again unless its code or dependencies change (it won't be "dirty"). + +## Creating explorers + +TSV files for explorers are created using the `create_explorer` function, usually from a configuration YAML file: + +```py +# Create a new explorers dataset and tsv file. +ds_explorer = create_explorer(dest_dir=dest_dir, config=config, df_graphers=df_graphers) +ds_explorer.save() +``` + +!!! info "Creating explorers on staging servers" + + Explorers can be created or edited on staging servers and then manually migrated to production. Each staging server creates a branch in the `owid-content` repository. Editing explorers in Admin or running the `create_explorer` function pushes changes to that branch. 
Once the PR is merged, the branch gets pushed to the `owid-content` repository (not to the `master` branch, but its own branch). You then need to manually create a PR from that branch and merge it into `master`. + + +## Creating multi-dimensional indicators + +Multi-dimensional indicators are powered by a configuration that is typically created from a YAML file. The structure of the YAML file looks like this: + +```yaml title="etl/steps/export/multidim/covid/latest/covid.deaths.yaml" +definitions: + table: {definitions.table} + +title: + title: COVID-19 deaths + titleVariant: by interval +defaultSelection: + - World + - Europe + - Asia +topicTags: + - COVID-19 + +dimensions: + - slug: interval + name: Interval + choices: + - slug: weekly + name: Weekly + description: null + - slug: biweekly + name: Biweekly + description: null + + - slug: metric + name: Metric + choices: + - slug: absolute + name: Absolute + description: null + - slug: per_capita + name: Per million people + description: null + - slug: change + name: Change from previous interval + description: null + +views: + - dimensions: + interval: weekly + metric: absolute + indicators: + y: "{definitions.table}#weekly_deaths" + - dimensions: + interval: weekly + metric: per_capita + indicators: + y: "{definitions.table}#weekly_deaths_per_million" + - dimensions: + interval: weekly + metric: change + indicators: + y: "{definitions.table}#weekly_pct_growth_deaths" + + - dimensions: + interval: biweekly + metric: absolute + indicators: + y: "{definitions.table}#biweekly_deaths" + - dimensions: + interval: biweekly + metric: per_capita + indicators: + y: "{definitions.table}#biweekly_deaths_per_million" + - dimensions: + interval: biweekly + metric: change + indicators: + y: "{definitions.table}#biweekly_pct_growth_deaths" +``` + +The `dimensions` field specifies selectors, and the `views` field defines views for the selection. 
Since there are numerous possible configurations, `views` are usually generated programmatically. However, it's a good idea to create a few of them manually to start. + +You can also combine manually defined views with generated ones. See the `etl.multidim` module for available helper functions or refer to examples from `etl/steps/export/multidim/`. Feel free to add or modify the helper functions as needed. + +The export step loads the YAML file, adds `views` to the config, and then calls the function. + +```python title="etl/steps/export/multidim/covid/latest/covid.py" +def run(dest_dir: str) -> None: + engine = get_engine() + + # Load YAML file + config = paths.load_mdim_config("covid.deaths.yaml") + + multidim.upsert_multidim_data_page("mdd-energy", config, engine) +``` + +To see the multi-dimensional indicator in Admin, run + +```bash +etlr export://multidim/energy/latest/energy --export +``` + +and check out the preview at http://staging-site-my-branch/admin/grapher/mdd-name. + + +## Exporting data to GitHub + +One common use case for the `export` step is to commit a dataset to a GitHub repository. This is useful when we want to make a dataset available to the public. The pattern for this looks like this: + +```python +if os.environ.get("CO2_BRANCH"): + dry_run = False + branch = os.environ["CO2_BRANCH"] +else: + dry_run = True + branch = "master" + +gh.commit_file_to_github( + combined.to_csv(), + repo_name="co2-data", + file_path="owid-co2-data.csv", + commit_message=":bar_chart: Automated update", + branch=branch, + dry_run=dry_run, +) +``` + +This code will commit the dataset to the `co2-data` repository on GitHub if you specify the `CO2_BRANCH` environment variable, i.e. 
+ +```bash +CO2_BRANCH=main etlr export://co2/latest/co2 --export +``` diff --git a/docs/guides/data-work/index.md b/docs/guides/data-work/index.md index a4730cde44b..21ea8aab625 100644 --- a/docs/guides/data-work/index.md +++ b/docs/guides/data-work/index.md @@ -3,8 +3,6 @@ tags: - 👷 Staff --- -# Data work - Adding and updating datasets in ETL is part of our routinary work. To this end, we've simplified the process as much as possible. Find below the list of the steps involved in the workflow. Click on each step to learn more about it. ```mermaid From c025a70aa656dc16e4ec46cd51aa55382049bbee Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 19 Nov 2024 11:26:34 +0100 Subject: [PATCH 05/10] fix link --- docs/ignore/generate_dynamic_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ignore/generate_dynamic_docs.py b/docs/ignore/generate_dynamic_docs.py index 5a6a96c0355..7cdd20532f4 100644 --- a/docs/ignore/generate_dynamic_docs.py +++ b/docs/ignore/generate_dynamic_docs.py @@ -15,7 +15,7 @@ - __[Indicator](#variable)__ (variable) - __[Origin](#origin)__ -- __[Table](#tables)__ +- __[Table](#table)__ - __[Dataset](#dataset)__ From 73a0f90634fd36a673290b7ea78815e81252d676 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 19 Nov 2024 11:26:45 +0100 Subject: [PATCH 06/10] re-organize toc --- mkdocs.yml | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 8ff65ffa179..1d57a246e37 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -205,23 +205,29 @@ nav: - Contributing: "contributing.md" - Guides: - "guides/index.md" - - Data work: + - Adding data: - "guides/data-work/index.md" - - Adding data: "guides/data-work/add-data.md" + - New data: "guides/data-work/add-data.md" - Updating data: "guides/data-work/update-data.md" - Update charts: "guides/data-work/update-charts.md" - - Wizard: "guides/wizard.md" - - CLI: "guides/etl-cli.md" - - Harmonize country names: 
"guides/harmonize-countries.md" - - Using different environments: "guides/environment.md" - - Staging servers: "guides/staging-servers.md" - - Private dataset import to ETL: "guides/private-import.md" - - Automate regular updates: "guides/auto-regular-updates.md" - - Backport a dataset to ETL: "guides/backport.md" - - Metadata in data pages: "guides/metadata-play.md" - - Edit the documentation: "dev/docs.md" - - OpenAI setup: "guides/openai.md" - - Sharing with external people: "guides/sharing-external.md" + - Export data: "guides/data-work/export-data.md" + - Main tools: + - Wizard: "guides/wizard.md" + - CLI: "guides/etl-cli.md" + - Harmonize country names: "guides/harmonize-countries.md" + - Backport from database: "guides/backport.md" + - Regular updates: "guides/auto-regular-updates.md" + - Servers & settings: + - Environments: "guides/environment.md" + - Staging servers: "guides/staging-servers.md" + - Public servers: "guides/sharing-external.md" + - Private datasets: "guides/private-import.md" + - OpenAI setup: "guides/openai.md" + + - Others: + - Edit the documentation: "dev/docs.md" + - Metadata in data pages: "guides/metadata-play.md" + - Design principles: - Design principles & workflow: architecture/index.md From 1de80e11b85d29be8f13312927436287115b7bd6 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 19 Nov 2024 11:45:03 +0100 Subject: [PATCH 07/10] test authorship --- docs/overrides/main.html | 13 +++++++++++++ mkdocs.yml | 7 ++++++- 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 docs/overrides/main.html diff --git a/docs/overrides/main.html b/docs/overrides/main.html new file mode 100644 index 00000000000..e70aa10c879 --- /dev/null +++ b/docs/overrides/main.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} + +{% block content %} +{{ super() }} + +{% if git_page_authors %} +
+ + Authors: {{ git_page_authors | default('enable mkdocs-git-authors-plugin') }} + +
+{% endif %} +{% endblock %} diff --git a/mkdocs.yml b/mkdocs.yml index 1d57a246e37..5be0377516e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -93,6 +93,8 @@ extra: link: https://ourworldindata.org - icon: fontawesome/brands/instagram link: https://instagram.com/ourworldindata + - icon: fontawesome/brands/bluesky + link: https://bsky.app/profile/ourworldindata.org - icon: fontawesome/brands/x-twitter link: https://twitter.com/ourworldindata @@ -149,9 +151,12 @@ plugins: - git-authors: show_email_address: false # authorship_threshold_percent: 1 - # show_contribution: true + show_contribution: true # show_line_count: true # count_empty_lines: true + ignore_authors: + - owidbot + sort_authors_by: contribution - git-revision-date-localized - tags: tags_file: tags.md From 67ed80e81af87304a5d2a28e9030b3b7b2baccf6 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 19 Nov 2024 11:47:55 +0100 Subject: [PATCH 08/10] avoid duplicity --- docs/overrides/{main.html => main_aux.html} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/overrides/{main.html => main_aux.html} (100%) diff --git a/docs/overrides/main.html b/docs/overrides/main_aux.html similarity index 100% rename from docs/overrides/main.html rename to docs/overrides/main_aux.html From 9f654628e6cd511a6b1ef0d9e315dd73428221eb Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 19 Nov 2024 11:48:12 +0100 Subject: [PATCH 09/10] update mkdocs --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6599d40d745..952751da92f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,10 +92,11 @@ dev-dependencies = [ "boto3-stubs[s3]>=1.34.154", "gspread>=5.12.4", "jsonref>=1.1.0", + "mkdocs-material>=9.5.34", "mkdocs-jupyter>=0.24.8", "mkdocs-exclude>=1.0.2", "mkdocs-gen-files>=0.5.0", - "mkdocs-git-authors-plugin>=0.7.2", + "mkdocs-git-authors-plugin>=0.9.2", "mkdocs-git-revision-date-localized-plugin>=1.2.6", "mkdocs-click>=0.8.1", 
"mkdocs-glightbox>=0.3.7", From dcc4516839eeafb26df059a5df1440cefe182ad9 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 19 Nov 2024 11:57:34 +0100 Subject: [PATCH 10/10] lock dependencies --- uv.lock | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/uv.lock b/uv.lock index f332d1e4d37..540928b8db5 100644 --- a/uv.lock +++ b/uv.lock @@ -875,6 +875,7 @@ dev = [ { name = "mkdocs-git-revision-date-localized-plugin" }, { name = "mkdocs-glightbox" }, { name = "mkdocs-jupyter" }, + { name = "mkdocs-material" }, { name = "pandas-stubs" }, { name = "plotly" }, { name = "pyright" }, @@ -979,10 +980,11 @@ dev = [ { name = "mkdocs-click", specifier = ">=0.8.1" }, { name = "mkdocs-exclude", specifier = ">=1.0.2" }, { name = "mkdocs-gen-files", specifier = ">=0.5.0" }, - { name = "mkdocs-git-authors-plugin", specifier = ">=0.7.2" }, + { name = "mkdocs-git-authors-plugin", specifier = ">=0.9.2" }, { name = "mkdocs-git-revision-date-localized-plugin", specifier = ">=1.2.6" }, { name = "mkdocs-glightbox", specifier = ">=0.3.7" }, { name = "mkdocs-jupyter", specifier = ">=0.24.8" }, + { name = "mkdocs-material", specifier = ">=9.5.34" }, { name = "pandas-stubs", specifier = "==1.2.0.62" }, { name = "plotly", specifier = ">=5.23.0" }, { name = "pyright", specifier = "==1.1.373" }, @@ -2636,14 +2638,14 @@ wheels = [ [[package]] name = "mkdocs-git-authors-plugin" -version = "0.7.2" +version = "0.9.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "mkdocs" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/43/ff/a759124da74b0874b8db4988af5f60f594917316ce82baaef36abae94073/mkdocs-git-authors-plugin-0.7.2.tar.gz", hash = "sha256:f541730e4cabdafa0ac758c94d28ba5e8ddca4c859e5de4c89f1226cb6ccd0ad", size = 15785 } +sdist = { url = "https://files.pythonhosted.org/packages/80/ef/09ab7178d580e342cb3ba279c48eaf3abf55795a2ae6e5426fe2c725143c/mkdocs_git_authors_plugin-0.9.2.tar.gz", hash = 
"sha256:77f97c321e08a8757beb866293eb257070b11cd5a080976bc6696b249cbade4f", size = 21403 } wheels = [ - { url = "https://files.pythonhosted.org/packages/55/7c/c4b6d71921dd0cf33f87bfd69d7c72774bf4ece57b6aa23221d1ac31d9fb/mkdocs_git_authors_plugin-0.7.2-py3-none-any.whl", hash = "sha256:c8a2784a867db79ad3b477a96ee96875d17b09192b6d3be71f08df25afff76c4", size = 18860 }, + { url = "https://files.pythonhosted.org/packages/48/08/57d0fea1cc30096fcc94ec9cd4ccdee625be89fd710626f78d90fc13738e/mkdocs_git_authors_plugin-0.9.2-py3-none-any.whl", hash = "sha256:f6cefc4dc832865d26f7f9f944c0a8c7dc852742d79320f3800e0d97814e2a84", size = 20332 }, ] [[package]]