From 7b504acec7a10d4aa5c9e5a29df46f9249603657 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucas=20Rod=C3=A9s-Guirao?= Date: Tue, 19 Nov 2024 12:08:11 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=9C=20private=20datasets=20(#3561)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 📜 private datasets * fix #2976 * remove status: new * re-structure docs on export / explorers * fix link * re-organize toc * test authorship * avoid duplicity * update mkdocs * lock dependencies --- docs/architecture/workflow/index.md | 246 +++------------------- docs/architecture/workflow/other-steps.md | 31 ++- docs/guides/auto-regular-updates.md | 1 - docs/guides/data-work/export-data.md | 157 ++++++++++++++ docs/guides/data-work/index.md | 2 - docs/guides/private-import.md | 17 +- docs/ignore/generate_dynamic_docs.py | 2 +- docs/overrides/main_aux.html | 13 ++ mkdocs.yml | 41 ++-- pyproject.toml | 3 +- uv.lock | 10 +- 11 files changed, 267 insertions(+), 256 deletions(-) create mode 100644 docs/guides/data-work/export-data.md create mode 100644 docs/overrides/main_aux.html diff --git a/docs/architecture/workflow/index.md b/docs/architecture/workflow/index.md index 0762d14b3b1..66bfd66fe0a 100644 --- a/docs/architecture/workflow/index.md +++ b/docs/architecture/workflow/index.md @@ -1,7 +1,3 @@ ---- -status: new ---- - Our World in Data has a whole team dedicated to data management that takes data from publicly available sources (e.g. the _UN Food and Agriculture Organisation_), and makes it available to our researchers to analyse and create visualisation for their articles. ## Five stages @@ -9,18 +5,17 @@ Our World in Data has a whole team dedicated to data management that takes data The ETL project provides an opinionated data management workflow, which separates a data manager's work into five stages: ```mermaid -graph TB +graph LR -snapshot --> format --> harmonise --> import --> publish +snapshot --> format --> harmonize/process --> import --> publish ``` The design of the ETL involves steps that mirror the stages above, which help us to meet several design goals of the project: -1. [Snapshot step](#snapshot): Take a **snapshot** of the upstream data product and store it on our end. - -- [Meadow step](#meadow): Bring the data into a **common format**. -- [Garden step](#garden): **Harmonise** the names of countries, genders and any other columns we may want to join on. Also do the necessary data processing to make the dataset usable for our needs. -- [Grapher step](#grapher): **Import** the data to our internal MySQL database. +1. [Snapshot step](#snapshot): Take a **snapshot** of the upstream data product and store it. +2. [Meadow step](#meadow): Bring the data into a **common format**. +3. [Garden step](#garden): **Harmonise** the names of countries, genders and any other columns we may want to join on. Also do the necessary **data processing** to make the dataset usable for our needs. +4. [Grapher step](#grapher): **Import** the data to our internal MySQL database. A data manager must implement all these steps to make something chartable on the Our World in Data site. @@ -32,9 +27,7 @@ A data manager must implement all these steps to make something chartable on the ## Snapshot -The initial step in importing data from an upstream source involves **transferring an external file directly into our platform**. This process is essential to ensure both reliable and secure access to the file. 
-
-It's important to recognize that an external source might remove the file at any time. Furthermore, this method supports the reproducibility of all Extract, Transform, Load (ETL) processes. This is crucial because the content of the file at the source may undergo changes, such as the removal or addition of datapoints, or alterations in field names.
+The initial step consists of **transferring an external file from an upstream provider into our platform**. This ensures that the source data remains accessible, since the upstream provider might remove or change the file at any time.

The accompanying diagram illustrates the process of importing various versions of the same dataset into our snapshot catalog, depicted over time. Imagine that the vertical axis represents time.

@@ -60,9 +53,9 @@ flowchart LR

The snapshot step typically consists of a DVC file and a script that downloads the upstream data and saves it to our snapshot catalog. Snapshot files are located in the [`snapshots/`](https://github.com/owid/etl/tree/master/snapshots) directory of the project.

-Note that we need a DVC file per upstream data file; hence, in some instances, if the source publishes a datset using multiple files, we need multiple DVC files.
+Note that we need a DVC file per upstream data file; hence, in some instances, if the source publishes a dataset using multiple files, we need multiple DVC files.

-### Metadata
+### Snapshot metadata

A Snapshot is a picture of a data product (e.g. a data CSV file) provided by an upstream data provider at a particular point in time. It is the entrypoint to ETL and where we define metadata attributes of that picture. This is fundamental to ensure that the data is properly documented and that the metadata is propagated to the rest of the system.

The metadata in Snapshot consists mainly of one object: `meta.origin`.

!!! info "Learn more in our [metadata reference](../metadata/reference#origin)."

-This metadata is captured in a DVC file (similar to a yaml file), which contains all the snapshot metadata fields as key-value pairs.
+    This metadata is captured in a DVC file (similar to a YAML file), which contains all the snapshot metadata fields as key-value pairs.

??? example "Example of [`snapshots/animal_welfare/2023-10-24/fur_laws.xlsx.dvc`](https://github.com/owid/etl/blob/master/snapshots/animal_welfare/2023-10-24/fur_laws.xlsx.dvc)"

@@ -99,49 +92,17 @@ This metadata is captured in a DVC file (similar to a yaml file), which contains

## Meadow

-The meadow step is the first Transform step of our ETL.
-
-In a meadow step, we load a `snapshot` and adapt it to be in a convenient format. A convenient format means creating an instance of a [`Dataset`](../../design/common-format/#datasets-owidcatalogdataset), with the appropriate data as a table (or tables).
+The meadow step is the first Transform step of our ETL. In it, we load a [`Snapshot`](../../architecture/design/phases/#snapshot) and adapt it to be in a convenient format. A convenient format means creating an instance of a [`Dataset`](../../architecture/design/phases/#datasets), with the appropriate data as a `Table` (or tables).

In this step, you can add and define metadata, but we rarely do this. Instead, we propagate the metadata defined in the Snapshot step and leave it to the Garden step to enhance the metadata.

Meadow steps should only have `snapshot` (or `walden`) dependencies and ー by definition ー should not depend on `garden` steps.
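+
+For illustration, a minimal meadow step script could look like the sketch below. Treat it as a sketch rather than a reference: the `PathFinder` and `create_dataset` helpers mirror the conventions of this repo's step templates, and the file name and column names are hypothetical.
+
+```python
+from etl.helpers import PathFinder, create_dataset
+
+# Helper that resolves this step's dependencies as declared in the DAG.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    # Load the snapshot this step depends on.
+    snap = paths.load_snapshot("fur_laws.xlsx")
+
+    # Read the raw file into a Table (a DataFrame that carries metadata).
+    tb = snap.read()
+
+    # Bring it into the common format: set a (country, year) index and sort rows.
+    tb = tb.format(["country", "year"])
+
+    # Create the meadow dataset, propagating the snapshot's metadata, and save it.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
+    ds_meadow.save()
+```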
-A typical flow up to the Meadow step could look like:
-
-```mermaid
-flowchart LR
-
-    upstream1((____)):::node -.->|copy| snapshot1((____)):::node
-    snapshot1((____)):::node -->|format| meadow1((____)):::node
-
-    subgraph id0 [Upstream]
-        upstream1
-    end
-
-    subgraph id1 [Snapshot]
-        snapshot1
-    end
-
-    subgraph id2 [Meadow]
-        meadow1
-    end
-
-
-    subgraph id [ETL]
-        id1
-        id2
-    end
-
-    classDef node fill:#002147,color:#002147
-    classDef node_ss fill:#002147,color:#fff
-```
-
## Garden

The Garden step is where most of the work falls in. This is where the data manager needs to carefully look at the data, filter outliers, harmonize labels (e.g. country names), improve the dataset metadata, etc.

-Garden steps typically depend on meadow steps. For instance, the Garden step `data://garden/un/2022-07-11/un_wpp`, which generates the dataset _World Population Prospects (UN, 2022)_, depends on this same process but in Meadow (i.e. `data://meadow/un/2022-07-11/un_wpp`). After some pre-liminary work (mostly re-formating table, and some minor cleaning), we can now focus on more complex processing steps in Garden.
+Garden steps typically depend on meadow steps. For instance, the Garden step `data://garden/un/2024-07-12/un_wpp`, which generates the dataset _World Population Prospects (UN, 2024)_, depends on this same process but in Meadow (i.e. `data://meadow/un/2024-07-12/un_wpp`). After some preliminary work (mostly re-formatting tables, and some minor cleaning), we can then focus on more complex processing steps in Garden.

A typical flow up to the Garden step could look like:

@@ -150,7 +111,7 @@ flowchart LR

    upstream1((____)):::node -.->|copy| snapshot1((____)):::node
    snapshot1((____)):::node -->|format| meadow1((____)):::node
-    meadow1((____)):::node -->|harmonize| garden1((____)):::node
+    meadow1((____)):::node -->|process| garden1((____)):::node

    subgraph id0 [Upstream]
        upstream1
@@ -178,7 +139,7 @@ flowchart LR
    classDef node_ss fill:#002147,color:#fff
```

-However, garden steps could also depend on other garden steps. This is often the case for datasets containing _long-run indicators_, where different `garden` datasets are combined.
+However, Garden steps could also depend on other garden steps. This is often the case for datasets containing _long-run indicators_, or, more generally, when different `garden` datasets are combined.

!!! info "Long-run indicators"

    ```yaml
    data://garden/demography/2023-03-31/population:
-    - data://garden/hyde/2017/baseline
-    - data://garden/gapminder/2023-03-31/population
-    - data://garden/un/2022-07-11/un_wpp
-    - data://open_numbers/open_numbers/latest/gapminder__systema_globalis
+      - data://garden/hyde/2017/baseline
+      - data://garden/gapminder/2023-03-31/population
+      - data://garden/un/2022-07-11/un_wpp
+      - data://open_numbers/open_numbers/latest/gapminder__systema_globalis
    ```

-An important processing step in Garden is to standardise (or harmonise) the country names. You can learn more about this in our [country standardisation guide](../../guides/harmonize-countries).
+An important processing step in Garden is to standardize (or harmonize) the country names. You can learn more about this in our [country harmonization guide](../../guides/harmonize-countries).
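+
+In code, the harmonization call in a garden step typically looks like the sketch below (again a sketch: the `geo.harmonize_countries` helper and the `paths.country_mapping_path` attribute follow this repo's template conventions, but exact names and signatures are assumptions here).
+
+```python
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    # Load the meadow dataset this garden step depends on.
+    ds_meadow = paths.load_dataset("un_wpp")
+    tb = ds_meadow["un_wpp"].reset_index()
+
+    # Map the provider's country names to OWID's canonical names, using the
+    # country mapping file that lives next to this script.
+    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
+
+    # Further processing (filtering outliers, deriving indicators, ...) goes here.
+
+    # Restore the standard index and save the curated garden dataset.
+    tb = tb.format(["country", "year"])
+    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
+    ds_garden.save()
+```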
-### Metadata
+### Garden metadata

After adapting and processing the origin's data, we have a curated dataset. This dataset contains indicators (maybe not present in the origin) that we need to properly document.

-The metadata in Garden consists mainly of two objects: `dataset` and `tables`. The metadata comes as a YAML file next to the processing scripts.
+The metadata in Garden consists mainly of two objects: [`Dataset`](../../architecture/metadata/reference/#dataset) and a list of [`Table`](../../architecture/metadata/reference/#table) objects. The metadata comes as a YAML file next to the processing scripts.

!!! info "Learn more in our [dataset reference](../metadata/reference/#dataset), [tables reference](../metadata/reference/#table) and [indicator reference](../metadata/reference/#variable)."

## Grapher

-In the `grapher` step the work should be minimal. Here, we create a `grapher` view by adapting our Garden dataset to fit the Grapher requirements.
+In the Grapher step, the work should be minimal. Here, we create a `Grapher` view by adapting our Garden dataset to adhere to the Grapher requirements.

-Grapher views are still normal datasets, but they adapt the data to the way it must look when being inserted to MySQL. For each grapher view, there is a corresponding matching `grapher://` step automatically generated which does the actual insert to MySQL, if MySQL credentials have been configured.
+Grapher views are still normal datasets, but they adapt the data to the way it must look when being inserted into MySQL. For each grapher view, there is a corresponding, automatically generated `grapher://` step, which does the actual insert to MySQL, if MySQL credentials have been configured.

A typical flow up to the Grapher step could look like:

@@ -258,163 +219,6 @@ flowchart LR
    classDef node_ss fill:#002147,color:#fff
```

-In principle, a grapher step only loads a single garden step.
-
-Note that the diagram shows a final step outside of the ETL. This is when the `grapher://` step is executed, and takes data from the ETL (from the etl `garden` step) and imports it to oure database.
-
-!!! bug "TODO: Add an example of code"
-
-## Export
-
-Sometimes we want to perform an action instead of creating a dataset. For instance, we might want to create a TSV file for an explorer, commit a CSV to a GitHub repository, or create a config for a multi-dimensional indicator. This is where the `export` step comes in.
-
-Export steps are defined in `etl/steps/export` directory and have similar structure to regular steps. They are run with the `--export` flag.
-
-```bash
-etlr export://explorers/minerals/latest/minerals --export
-```
-
-The `def run(dest_dir):` function doesn't save a dataset, but calls a method that performs the action. For instance `create_explorer(...)` or `gh.commit_file_to_github(...)`. Once the step is executed successfully, it won't be run again unless its code or dependencies change (it won't be "dirty").
-
-### Creating explorers
-
-TSV files for explorers are created using the `create_explorer` function, usually from a configuration YAML file
-
-```python
-# Create a new explorers dataset and tsv file.
-ds_explorer = create_explorer(dest_dir=dest_dir, config=config, df_graphers=df_graphers)
-ds_explorer.save()
-```
-!!! info "Creating explorers on staging servers"
-
-    Explorers can be created or edited on staging servers and then manually migrated to production. Each staging server creates a branch in the `owid-content` repository. Editing explorers in Admin or running the `create_explorer` function pushes changes to that branch.
Once the PR is merged, the branch gets pushed to the `owid-content` repository (not to the `master` branch, but its own branch). You then need to manually create a PR from that branch and merge it into `master`. - - -### Creating multi-dimensional indicators - -Multi-dimensional indicators are powered by a configuration that is typically created from a YAML file. The structure of the YAML file looks like this: - -```yaml title="etl/steps/export/multidim/covid/latest/covid.deaths.yaml" -definitions: - table: {definitions.table} - -title: - title: COVID-19 deaths - titleVariant: by interval -defaultSelection: - - World - - Europe - - Asia -topicTags: - - COVID-19 - -dimensions: - - slug: interval - name: Interval - choices: - - slug: weekly - name: Weekly - description: null - - slug: biweekly - name: Biweekly - description: null - - - slug: metric - name: Metric - choices: - - slug: absolute - name: Absolute - description: null - - slug: per_capita - name: Per million people - description: null - - slug: change - name: Change from previous interval - description: null - -views: - - dimensions: - interval: weekly - metric: absolute - indicators: - y: "{definitions.table}#weekly_deaths" - - dimensions: - interval: weekly - metric: per_capita - indicators: - y: "{definitions.table}#weekly_deaths_per_million" - - dimensions: - interval: weekly - metric: change - indicators: - y: "{definitions.table}#weekly_pct_growth_deaths" - - - dimensions: - interval: biweekly - metric: absolute - indicators: - y: "{definitions.table}#biweekly_deaths" - - dimensions: - interval: biweekly - metric: per_capita - indicators: - y: "{definitions.table}#biweekly_deaths_per_million" - - dimensions: - interval: biweekly - metric: change - indicators: - y: "{definitions.table}#biweekly_pct_growth_deaths" -``` - -The `dimensions` field specifies selectors, and the `views` field defines views for the selection. Since there are numerous possible configurations, `views` are usually generated programmatically. However, it's a good idea to create a few of them manually to start. - -You can also combine manually defined views with generated ones. See the `etl.multidim` module for available helper functions or refer to examples from `etl/steps/export/multidim/`. Feel free to add or modify the helper functions as needed. - -The export step loads the YAML file, adds `views` to the config, and then calls the function. +In principle, a Grapher step only loads a single garden step. -```python title="etl/steps/export/multidim/covid/latest/covid.py" -def run(dest_dir: str) -> None: - engine = get_engine() - - # Load YAML file - config = paths.load_mdim_config("covid.deaths.yaml") - - multidim.upsert_multidim_data_page("mdd-energy", config, engine) -``` - -To see the multi-dimensional indicator in Admin, run - -```bash -etlr export://multidim/energy/latest/energy --export -``` - -and check out the preview at http://staging-site-my-branch/admin/grapher/mdd-name. - - -### Exporting data to GitHub - -One common use case for the `export` step is to commit a dataset to a GitHub repository. This is useful when we want to make a dataset available to the public. 
The pattern for this looks like this:
-
-```python
-if os.environ.get("CO2_BRANCH"):
-    dry_run = False
-    branch = os.environ["CO2_BRANCH"]
-else:
-    dry_run = True
-    branch = "master"
-
-gh.commit_file_to_github(
-    combined.to_csv(),
-    repo_name="co2-data",
-    file_path="owid-co2-data.csv",
-    commit_message=":bar_chart: Automated update",
-    branch=branch,
-    dry_run=dry_run,
-)
-```
-
-This code will commit the dataset to the `co2-data` repository on GitHub if you specify the `CO2_BRANCH` environment variable, i.e.
-
-```bash
-CO2_BRANCH=main etlr export://co2/latest/co2 --export
-```
+In principle, a Grapher step only loads a single garden step.
+
+Note that the diagram shows a final step outside of the ETL. This happens when the `grapher://` step is executed: it takes data from the ETL `garden` step and imports it into our database.
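+
+For illustration, a minimal grapher step could look like the sketch below. As with the meadow and garden sketches above, the helper names (in particular `grapher_checks`) mirror this repo's step templates and should be treated as assumptions.
+
+```python
+from etl.helpers import PathFinder, create_dataset, grapher_checks
+
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    # Load the garden dataset this grapher step depends on.
+    ds_garden = paths.load_dataset("un_wpp")
+
+    # Create a grapher view, reusing the garden table and metadata unchanged.
+    ds_grapher = create_dataset(dest_dir, tables=[ds_garden["un_wpp"]], default_metadata=ds_garden.metadata)
+
+    # Run the sanity checks that Grapher expects (e.g. valid years and entities).
+    grapher_checks(ds_grapher)
+    ds_grapher.save()
+```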
diff --git a/docs/architecture/workflow/other-steps.md b/docs/architecture/workflow/other-steps.md
index e738bb318c2..bdd1bf186ea 100644
--- a/docs/architecture/workflow/other-steps.md
+++ b/docs/architecture/workflow/other-steps.md
@@ -1,11 +1,35 @@
+---
+status: new
+---
+
So far you have learned about the standard steps. These should cover most of the cases. However, there are some other steps worth mentioning.

-## Explorers
+## Export steps
+
+Sometimes we want to perform an action instead of creating a dataset. For instance, we might want to create a TSV file for an explorer, commit a CSV to a GitHub repository, or create a config for a multi-dimensional indicator. This is where the `Export` step comes in.
+
+Export steps are used to perform an action on an already created dataset. This action typically implies making the data available to other parts of the system. There are different types of export steps:
+
+- **Explorers**: Create a TSV file for a data explorer.
+- **Multi-dimensional indicators**: Create a configuration for a multi-dimensional indicator.
+- **Export to GitHub**: Commit a dataset to a GitHub repository.
+
+Export steps should be used after the data has been processed and is ready to be used (post-Garden).
+
+!!! note "Learn more about [export steps](../../guides/data-work/export-data.md)"

-Data explorers are Grapher charts expanded with additional functionalities to facilitate exploration, such as dynamic entity filters or customizable menus. They are powered by CSV files generated by ETL [served from S3](https://dash.cloudflare.com/078fcdfed9955087315dd86792e71a7e/r2/default/buckets/owid-catalog). Explorers data step in ETL is responsible for generating these CSV files. It works in the same way as e.g. garden step, but the transformations made there are meant to get the data ready for the data explorer (and not be consumed by users of catalog).
+### Explorers
+
+Data explorers are Grapher charts expanded with additional functionalities to facilitate exploration, such as dynamic entity filters or customizable menus. They are usually powered by indicators from OWID's Grapher database.

!!! info "Learn more about creating Data explorers [on Notion :octicons-arrow-right-24:](https://www.notion.so/owid/Creating-Data-Explorers-cf47a5ef90f14c1fba8fc243aba79be7)."

+!!! note "Legacy explorers"
+
+    In the past, explorers were manually defined in our Admin, and their data was sourced from CSV files generated by ETL ([served from S3](https://dash.cloudflare.com/078fcdfed9955087315dd86792e71a7e/r2/default/buckets/owid-catalog)) or from GitHub.
+
+    We have slowly transitioned to a new system where explorers are generated from the ETL pipeline. This is a more scalable and maintainable solution.
+

## Backport

Datasets from our production grapher database can be backported to ETL catalog.

@@ -42,9 +66,6 @@ flowchart LR
    classDef node_ss fill:#002147,color:#fff
```

-## Open Numbers
-
-!!! warning "TO BE DONE"

## ETag
diff --git a/docs/guides/auto-regular-updates.md b/docs/guides/auto-regular-updates.md
index 50912419fbf..f82957e6809 100644
--- a/docs/guides/auto-regular-updates.md
+++ b/docs/guides/auto-regular-updates.md
@@ -1,7 +1,6 @@
---
tags:
  - 👷 Staff
-status: new
---

!!! warning "This is a work in progress"
diff --git a/docs/guides/data-work/export-data.md b/docs/guides/data-work/export-data.md
new file mode 100644
index 00000000000..a3ff3341601
--- /dev/null
+++ b/docs/guides/data-work/export-data.md
@@ -0,0 +1,157 @@
+---
+status: new
+---
+
+!!! warning "Export steps are a work in progress"
+
+Export steps are defined in the `etl/steps/export` directory and have a similar structure to regular steps. They are run with the `--export` flag:
+
+```bash
+etlr export://explorers/minerals/latest/minerals --export
+```
+
+The `def run(dest_dir):` function doesn't save a dataset, but calls a method that performs the action, for instance `create_explorer(...)` or `gh.commit_file_to_github(...)`. Once the step is executed successfully, it won't be run again unless its code or dependencies change (it won't be "dirty").
+
+## Creating explorers
+
+TSV files for explorers are created using the `create_explorer` function, usually from a configuration YAML file:
+
+```python
+# Create a new explorers dataset and tsv file.
+ds_explorer = create_explorer(dest_dir=dest_dir, config=config, df_graphers=df_graphers)
+ds_explorer.save()
+```
+
+!!! info "Creating explorers on staging servers"
+
+    Explorers can be created or edited on staging servers and then manually migrated to production. Each staging server creates a branch in the `owid-content` repository. Editing explorers in Admin or running the `create_explorer` function pushes changes to that branch. Once the PR is merged, the branch gets pushed to the `owid-content` repository (not to the `master` branch, but its own branch). You then need to manually create a PR from that branch and merge it into `master`.
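+
+For a sense of what the `df_graphers` argument used above contains: each row defines one view of the explorer (one line of its graphers table). The columns below are purely hypothetical; the actual columns depend on the explorer's dropdowns and on the indicators it exposes.
+
+```python
+import pandas as pd
+
+# Toy graphers table for a hypothetical minerals explorer with two views.
+df_graphers = pd.DataFrame(
+    {
+        "title": ["Mineral production", "Mineral imports"],
+        "ySlugs": ["production", "imports"],
+        "Metric Dropdown": ["Production", "Imports"],
+        "hasMapTab": [True, True],
+    }
+)
+```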
+
+
+## Creating multi-dimensional indicators
+
+Multi-dimensional indicators are powered by a configuration that is typically created from a YAML file. The structure of the YAML file looks like this:
+
+```yaml title="etl/steps/export/multidim/covid/latest/covid.deaths.yaml"
+definitions:
+  table: {definitions.table}
+
+title:
+  title: COVID-19 deaths
+  titleVariant: by interval
+defaultSelection:
+  - World
+  - Europe
+  - Asia
+topicTags:
+  - COVID-19
+
+dimensions:
+  - slug: interval
+    name: Interval
+    choices:
+      - slug: weekly
+        name: Weekly
+        description: null
+      - slug: biweekly
+        name: Biweekly
+        description: null
+
+  - slug: metric
+    name: Metric
+    choices:
+      - slug: absolute
+        name: Absolute
+        description: null
+      - slug: per_capita
+        name: Per million people
+        description: null
+      - slug: change
+        name: Change from previous interval
+        description: null
+
+views:
+  - dimensions:
+      interval: weekly
+      metric: absolute
+    indicators:
+      y: "{definitions.table}#weekly_deaths"
+  - dimensions:
+      interval: weekly
+      metric: per_capita
+    indicators:
+      y: "{definitions.table}#weekly_deaths_per_million"
+  - dimensions:
+      interval: weekly
+      metric: change
+    indicators:
+      y: "{definitions.table}#weekly_pct_growth_deaths"
+
+  - dimensions:
+      interval: biweekly
+      metric: absolute
+    indicators:
+      y: "{definitions.table}#biweekly_deaths"
+  - dimensions:
+      interval: biweekly
+      metric: per_capita
+    indicators:
+      y: "{definitions.table}#biweekly_deaths_per_million"
+  - dimensions:
+      interval: biweekly
+      metric: change
+    indicators:
+      y: "{definitions.table}#biweekly_pct_growth_deaths"
+```
+
+The `dimensions` field specifies the selectors, and the `views` field defines a view for each combination of dimension choices. Since there are numerous possible configurations, `views` are usually generated programmatically. However, it's a good idea to create a few of them manually to start.
+
+You can also combine manually defined views with generated ones. See the `etl.multidim` module for available helper functions or refer to examples from `etl/steps/export/multidim/`. Feel free to add or modify the helper functions as needed.
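+
+Since `views` are usually generated programmatically, here is a sketch of what that generation could look like, using plain dictionaries that mirror the YAML above (the real helpers in `etl.multidim` may offer higher-level ways to do this):
+
+```python
+# Indicator name templates, keyed by the metric choice.
+metrics = {
+    "absolute": "{interval}_deaths",
+    "per_capita": "{interval}_deaths_per_million",
+    "change": "{interval}_pct_growth_deaths",
+}
+
+# One view per (interval, metric) combination, as in the YAML above.
+views = [
+    {
+        "dimensions": {"interval": interval, "metric": metric},
+        "indicators": {"y": "{definitions.table}#" + template.format(interval=interval)},
+    }
+    for interval in ["weekly", "biweekly"]
+    for metric, template in metrics.items()
+]
+```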
+
+The export step loads the YAML file, adds the `views` to the config, and then calls `multidim.upsert_multidim_data_page`:
+
+```python title="etl/steps/export/multidim/covid/latest/covid.py"
+def run(dest_dir: str) -> None:
+    engine = get_engine()
+
+    # Load YAML file
+    config = paths.load_mdim_config("covid.deaths.yaml")
+
+    multidim.upsert_multidim_data_page("mdd-energy", config, engine)
+```
+
+To see the multi-dimensional indicator in Admin, run
+
+```bash
+etlr export://multidim/energy/latest/energy --export
+```
+
+and check out the preview at http://staging-site-my-branch/admin/grapher/mdd-name.
+
+
+## Exporting data to GitHub
+
+One common use case for the `export` step is to commit a dataset to a GitHub repository. This is useful when we want to make a dataset available to the public. The pattern looks like this:
+
+```python
+if os.environ.get("CO2_BRANCH"):
+    dry_run = False
+    branch = os.environ["CO2_BRANCH"]
+else:
+    dry_run = True
+    branch = "master"
+
+gh.commit_file_to_github(
+    combined.to_csv(),
+    repo_name="co2-data",
+    file_path="owid-co2-data.csv",
+    commit_message=":bar_chart: Automated update",
+    branch=branch,
+    dry_run=dry_run,
+)
+```
+
+This code will commit the dataset to the `co2-data` repository on GitHub if you specify the `CO2_BRANCH` environment variable, e.g.
+
+```bash
+CO2_BRANCH=main etlr export://co2/latest/co2 --export
+```
diff --git a/docs/guides/data-work/index.md b/docs/guides/data-work/index.md
index a4730cde44b..21ea8aab625 100644
--- a/docs/guides/data-work/index.md
+++ b/docs/guides/data-work/index.md
@@ -3,8 +3,6 @@ tags:
  - 👷 Staff
---

-# Data work
-
Adding and updating datasets in ETL is part of our routine work. To this end, we've simplified the process as much as possible. Find below the list of the steps involved in the workflow. Click on each step to learn more about it.

```mermaid
diff --git a/docs/guides/private-import.md b/docs/guides/private-import.md
index 120ce5849e2..8da97007592 100644
--- a/docs/guides/private-import.md
+++ b/docs/guides/private-import.md
@@ -3,11 +3,10 @@ tags:
  - 👷 Staff
---

-While most of the data at OWID is publicly available, some datasets are added to our catalogue with some restrictions. These include datasets that are not redistributable, or that are not meant to be shared with the public. This can happen due to a strict license by the data provider, or because the data is still in a draft stage and not ready for public consumption.
+While most of the data at OWID is publicly available, some datasets are added to our catalog with some restrictions. These include datasets that are not redistributable, or that are not meant to be shared with the public. This can happen due to a strict license by the data provider, or because the data is still in a draft stage and not ready for public consumption.

Various privacy configurations are available:

-- Skip re-publishing to GitHub.
- Disable data downloading options on Grapher.
- Disable public access to the original file (snapshot).
- Hide the dataset from our public catalog (accessible via `owid-catalog-py`).

@@ -16,6 +15,12 @@ In the following, we explain how to create private steps in the ETL pipeline and

## Create a private step

+
+!!! tip "Make your dataset completely private"
+
+    - **Snapshot**: Set `meta.is_public` to `false` in the snapshot DVC file.
+    - **Meadow, Garden, Grapher**: Use the `data-private://` prefix in the step name in the DAG. Set `dataset.non_redistributable` to `true` in the dataset garden metadata.
+
### Snapshot

To create a private snapshot step, set the `meta.is_public` property in the snapshot .dvc file to false:

@@ -34,7 +39,7 @@ This will prevent the file to be publicly accessible without the appropriate cre

### Meadow, Garden, Grapher

-Creating a private data step means that the data will not be listed in the public catalog, and therefore will not be accessible via `owid-catalog-py`. In addition, private datasets will not be re-published to GitHub.
+Creating a private data step means that the data will not be listed in the public catalog, and therefore will not be accessible via `owid-catalog-py`.

To create a private data step (meadow, garden or grapher), simply use the `data-private://` prefix in the step name in the DAG. For example, the step `grapher/ihme_gbd/2024-06-10/leading_causes_deaths` (this is from [health.yml](https://github.com/owid/etl/blob/master/dag/health.yml)) is private:

@@ -70,8 +75,8 @@ etl run run [step-name] --private

If you want to make a private step public, simply follow the steps below:

-- **In the DAG:** Replace `data-private/` prefix with `data/`.
-- **In the snapshot DVC file**: Set `meta.is_public` to `true` (or simply remove `is_public` property).
-- (Optional) **Allow for Grapher downloads**: Set `dataset.non_redistributable` to `false` in the dataset garden metadata (or simply remove the property from the metadata). +- **In the DAG:** Replace `data-private://` prefix with `data://`. +- **In the snapshot DVC file**: Set `meta.is_public` to `true` (or simply remove this property). +- (Optional) **Allow for Grapher downloads**: Set `dataset.non_redistributable` to `false` in the dataset garden metadata (or simply remove this property). After this, re-run the snapshot step and commit your changes. diff --git a/docs/ignore/generate_dynamic_docs.py b/docs/ignore/generate_dynamic_docs.py index 5a6a96c0355..7cdd20532f4 100644 --- a/docs/ignore/generate_dynamic_docs.py +++ b/docs/ignore/generate_dynamic_docs.py @@ -15,7 +15,7 @@ - __[Indicator](#variable)__ (variable) - __[Origin](#origin)__ -- __[Table](#tables)__ +- __[Table](#table)__ - __[Dataset](#dataset)__ diff --git a/docs/overrides/main_aux.html b/docs/overrides/main_aux.html new file mode 100644 index 00000000000..e70aa10c879 --- /dev/null +++ b/docs/overrides/main_aux.html @@ -0,0 +1,13 @@ +{% extends "base.html" %} + +{% block content %} +{{ super() }} + +{% if git_page_authors %} +
+ + Authors: {{ git_page_authors | default('enable mkdocs-git-authors-plugin') }} + +
+{% endif %} +{% endblock %} diff --git a/mkdocs.yml b/mkdocs.yml index 8ff65ffa179..5be0377516e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -93,6 +93,8 @@ extra: link: https://ourworldindata.org - icon: fontawesome/brands/instagram link: https://instagram.com/ourworldindata + - icon: fontawesome/brands/bluesky + link: https://bsky.app/profile/ourworldindata.org - icon: fontawesome/brands/x-twitter link: https://twitter.com/ourworldindata @@ -149,9 +151,12 @@ plugins: - git-authors: show_email_address: false # authorship_threshold_percent: 1 - # show_contribution: true + show_contribution: true # show_line_count: true # count_empty_lines: true + ignore_authors: + - owidbot + sort_authors_by: contribution - git-revision-date-localized - tags: tags_file: tags.md @@ -205,23 +210,29 @@ nav: - Contributing: "contributing.md" - Guides: - "guides/index.md" - - Data work: + - Adding data: - "guides/data-work/index.md" - - Adding data: "guides/data-work/add-data.md" + - New data: "guides/data-work/add-data.md" - Updating data: "guides/data-work/update-data.md" - Update charts: "guides/data-work/update-charts.md" - - Wizard: "guides/wizard.md" - - CLI: "guides/etl-cli.md" - - Harmonize country names: "guides/harmonize-countries.md" - - Using different environments: "guides/environment.md" - - Staging servers: "guides/staging-servers.md" - - Private dataset import to ETL: "guides/private-import.md" - - Automate regular updates: "guides/auto-regular-updates.md" - - Backport a dataset to ETL: "guides/backport.md" - - Metadata in data pages: "guides/metadata-play.md" - - Edit the documentation: "dev/docs.md" - - OpenAI setup: "guides/openai.md" - - Sharing with external people: "guides/sharing-external.md" + - Export data: "guides/data-work/export-data.md" + - Main tools: + - Wizard: "guides/wizard.md" + - CLI: "guides/etl-cli.md" + - Harmonize country names: "guides/harmonize-countries.md" + - Backport from database: "guides/backport.md" + - Regular updates: "guides/auto-regular-updates.md" + - Servers & settings: + - Environments: "guides/environment.md" + - Staging servers: "guides/staging-servers.md" + - Public servers: "guides/sharing-external.md" + - Private datasets: "guides/private-import.md" + - OpenAI setup: "guides/openai.md" + + - Others: + - Edit the documentation: "dev/docs.md" + - Metadata in data pages: "guides/metadata-play.md" + - Design principles: - Design principles & workflow: architecture/index.md diff --git a/pyproject.toml b/pyproject.toml index 6599d40d745..952751da92f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,10 +92,11 @@ dev-dependencies = [ "boto3-stubs[s3]>=1.34.154", "gspread>=5.12.4", "jsonref>=1.1.0", + "mkdocs-material>=9.5.34", "mkdocs-jupyter>=0.24.8", "mkdocs-exclude>=1.0.2", "mkdocs-gen-files>=0.5.0", - "mkdocs-git-authors-plugin>=0.7.2", + "mkdocs-git-authors-plugin>=0.9.2", "mkdocs-git-revision-date-localized-plugin>=1.2.6", "mkdocs-click>=0.8.1", "mkdocs-glightbox>=0.3.7", diff --git a/uv.lock b/uv.lock index f332d1e4d37..540928b8db5 100644 --- a/uv.lock +++ b/uv.lock @@ -875,6 +875,7 @@ dev = [ { name = "mkdocs-git-revision-date-localized-plugin" }, { name = "mkdocs-glightbox" }, { name = "mkdocs-jupyter" }, + { name = "mkdocs-material" }, { name = "pandas-stubs" }, { name = "plotly" }, { name = "pyright" }, @@ -979,10 +980,11 @@ dev = [ { name = "mkdocs-click", specifier = ">=0.8.1" }, { name = "mkdocs-exclude", specifier = ">=1.0.2" }, { name = "mkdocs-gen-files", specifier = ">=0.5.0" }, - { name = "mkdocs-git-authors-plugin", specifier = 
">=0.7.2" }, + { name = "mkdocs-git-authors-plugin", specifier = ">=0.9.2" }, { name = "mkdocs-git-revision-date-localized-plugin", specifier = ">=1.2.6" }, { name = "mkdocs-glightbox", specifier = ">=0.3.7" }, { name = "mkdocs-jupyter", specifier = ">=0.24.8" }, + { name = "mkdocs-material", specifier = ">=9.5.34" }, { name = "pandas-stubs", specifier = "==1.2.0.62" }, { name = "plotly", specifier = ">=5.23.0" }, { name = "pyright", specifier = "==1.1.373" }, @@ -2636,14 +2638,14 @@ wheels = [ [[package]] name = "mkdocs-git-authors-plugin" -version = "0.7.2" +version = "0.9.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "mkdocs" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/43/ff/a759124da74b0874b8db4988af5f60f594917316ce82baaef36abae94073/mkdocs-git-authors-plugin-0.7.2.tar.gz", hash = "sha256:f541730e4cabdafa0ac758c94d28ba5e8ddca4c859e5de4c89f1226cb6ccd0ad", size = 15785 } +sdist = { url = "https://files.pythonhosted.org/packages/80/ef/09ab7178d580e342cb3ba279c48eaf3abf55795a2ae6e5426fe2c725143c/mkdocs_git_authors_plugin-0.9.2.tar.gz", hash = "sha256:77f97c321e08a8757beb866293eb257070b11cd5a080976bc6696b249cbade4f", size = 21403 } wheels = [ - { url = "https://files.pythonhosted.org/packages/55/7c/c4b6d71921dd0cf33f87bfd69d7c72774bf4ece57b6aa23221d1ac31d9fb/mkdocs_git_authors_plugin-0.7.2-py3-none-any.whl", hash = "sha256:c8a2784a867db79ad3b477a96ee96875d17b09192b6d3be71f08df25afff76c4", size = 18860 }, + { url = "https://files.pythonhosted.org/packages/48/08/57d0fea1cc30096fcc94ec9cd4ccdee625be89fd710626f78d90fc13738e/mkdocs_git_authors_plugin-0.9.2-py3-none-any.whl", hash = "sha256:f6cefc4dc832865d26f7f9f944c0a8c7dc852742d79320f3800e0d97814e2a84", size = 20332 }, ] [[package]]