From 96e0b2384b5b51f30fe246623a30386f1d90cb12 Mon Sep 17 00:00:00 2001
From: Pablo Arriagada <paarriagadap@gmail.com>
Date: Tue, 26 Nov 2024 19:12:53 +0000
Subject: [PATCH] :sparkles: all steps

---
 dag/archive/war.yml                           |  12 ++
 dag/war.yml                                   |  12 +-
 .../global_military_spending_dataset.meta.yml | 184 +++++++++++++++++
 .../global_military_spending_dataset.py       | 189 ++++++++++++++++++
 .../global_military_spending_dataset.py       |  28 +++
 .../global_military_spending_dataset.py       |  43 ++++
 ...global_military_spending_dataset_burden.py |  39 ++++
 ...l_military_spending_dataset_burden.rds.dvc |  32 +++
 8 files changed, 533 insertions(+), 6 deletions(-)
 create mode 100644 etl/steps/data/garden/harvard/2024-11-26/global_military_spending_dataset.meta.yml
 create mode 100644 etl/steps/data/garden/harvard/2024-11-26/global_military_spending_dataset.py
 create mode 100644 etl/steps/data/grapher/harvard/2024-11-26/global_military_spending_dataset.py
 create mode 100644 etl/steps/data/meadow/harvard/2024-11-26/global_military_spending_dataset.py
 create mode 100644 snapshots/harvard/2024-11-26/global_military_spending_dataset_burden.py
 create mode 100644 snapshots/harvard/2024-11-26/global_military_spending_dataset_burden.rds.dvc

diff --git a/dag/archive/war.yml b/dag/archive/war.yml
index 13d2a970e69..27301a7df9f 100644
--- a/dag/archive/war.yml
+++ b/dag/archive/war.yml
@@ -128,3 +128,15 @@ steps:
     - data://garden/geography/2023-11-28/nat_earth_110
   data://grapher/war/2024-10-02/ucdp_monthly:
     - data://garden/war/2024-10-02/ucdp_monthly
+
+  # # Global Military Spending Dataset (only one snapshot was modified)
+  # data://meadow/harvard/2024-07-22/global_military_spending_dataset:
+  #   - snapshot://harvard/2024-07-22/global_military_spending_dataset.rds
+  #   - snapshot://harvard/2024-07-22/global_military_spending_dataset_burden.rds
+  # data://garden/harvard/2024-07-22/global_military_spending_dataset:
+  #   - data://meadow/harvard/2024-07-22/global_military_spending_dataset
+  #   - data://garden/demography/2024-07-15/population
+  #   - data://garden/countries/2024-08-27/gleditsch
+  #   - data://garden/cow/2024-07-26/national_material_capabilities
+  # data://grapher/harvard/2024-07-22/global_military_spending_dataset:
+  #   - data://garden/harvard/2024-07-22/global_military_spending_dataset
diff --git a/dag/war.yml b/dag/war.yml
index ba73f455f24..8c0250937dd 100644
--- a/dag/war.yml
+++ b/dag/war.yml
@@ -221,16 +221,16 @@ steps:
     - data://garden/sipri/2024-07-08/military_expenditure
 
   # Global Military Spending Dataset
-  data://meadow/harvard/2024-07-22/global_military_spending_dataset:
+  data://meadow/harvard/2024-11-26/global_military_spending_dataset:
     - snapshot://harvard/2024-07-22/global_military_spending_dataset.rds
-    - snapshot://harvard/2024-07-22/global_military_spending_dataset_burden.rds
-  data://garden/harvard/2024-07-22/global_military_spending_dataset:
-    - data://meadow/harvard/2024-07-22/global_military_spending_dataset
+    - snapshot://harvard/2024-11-26/global_military_spending_dataset_burden.rds
+  data://garden/harvard/2024-11-26/global_military_spending_dataset:
+    - data://meadow/harvard/2024-11-26/global_military_spending_dataset
     - data://garden/demography/2024-07-15/population
     - data://garden/countries/2024-08-27/gleditsch
     - data://garden/cow/2024-07-26/national_material_capabilities
-  data://grapher/harvard/2024-07-22/global_military_spending_dataset:
-    - data://garden/harvard/2024-07-22/global_military_spending_dataset
+  data://grapher/harvard/2024-11-26/global_military_spending_dataset:
+    - data://garden/harvard/2024-11-26/global_military_spending_dataset
 
     # Correlates of War - National Material Capabilities
   data://meadow/cow/2024-07-26/national_material_capabilities:
diff --git a/etl/steps/data/garden/harvard/2024-11-26/global_military_spending_dataset.meta.yml b/etl/steps/data/garden/harvard/2024-11-26/global_military_spending_dataset.meta.yml
new file mode 100644
index 00000000000..bf3fa6ea4db
--- /dev/null
+++ b/etl/steps/data/garden/harvard/2024-11-26/global_military_spending_dataset.meta.yml
@@ -0,0 +1,184 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    processing_level: minor
+    presentation:
+      topic_tags:
+        - Military Personnel & Spending
+    display: &common-display
+      tolerance: 5
+
+  description_key_methodology: This data is calculated by using nine different military expenditure data sources and combining them using a model. The model links the country-year data together and estimates a mean with a prediction interval for each observation. For more information about the methodology, see [the original article](https://journals.sagepub.com/doi/10.1177/00220027241232964).
+
+  description_key_percentage_gdp: The military expenditure data is divided by gross domestic product (GDP) estimates obtained from a similar latent variable model, explained by the same authors in [a different article](https://journals.sagepub.com/doi/10.1177/00220027211054432).
+
+  description_key_percentage_sdp_1: The military expenditure data is divided by surplus domestic product (SDP), defined as the difference between gross domestic product (GDP) and the economic resources that the population consumes to survive, such that for each state i in year t, SDP(it) = GDP(it) − ((365 ∗ τ) ∗ Population(it)). τ is the subsistence threshold
+
+  description_key_percentage_sdp_2: The GDP estimates used to calculate SDP are obtained from a latent variable model that is similar to the one employed to obtain military expenditure data. For more information about the methodology, see [the original article](https://journals.sagepub.com/doi/10.1177/00220027211054432).
+
+  description_from_producer_methodology: |-
+    _Latent variable model_
+
+    In [the main manuscript](https://journals.sagepub.com/doi/10.1177/00220027241232964), we present, estimate, and describe a latent variable model that links together observed dataset values from across many sources of military expenditure data.
+
+    What we are interested in estimating is country-year military spending. Using military expenditure data presents several challenges because the datasets are incomplete, cover short periods of time, and are presented in many different monetary units-of-measurement. To overcome these challenges, we specify a dynamic latent variable measurement model that links all of the available information across different contemporary and historical sources of arms spending data. We essentially want to estimate the country-year distribution or simply the average of military spending across all the available observed dataset values so that we generate the best estimate of military spending for each of the country-year units.
+
+    The observed dataset values are linked together through the estimation of a country-year parameter or latent trait. However, the latent trait parameter itself is not directly of interest for inference because it does not have a direct monetary interpretation. This is because it is scaled by the item-specific intercept parameter which transforms the latent trait into the unit-of-measurement of any one of the originally observed military expenditure variables. The measurement model provides predictive intervals for each of the original observed variables on the original scales of these variables. Notationally, we represent the observed country-year dataset values as yitj where i indexes countries, t indexes years of time, and j indexes the dataset. The model then produces posterior predictive distributions of yitj, which we denote as ỹitj. These are normally distributed values (on the natural log scale). We can therefore take the average of ỹitj as E(ỹitj) or the standard deviation of ỹitj as sd(ỹitj).
+
+    For the applications in the main manuscript and in this appendix, ỹitj is the key quantity we care about. It is the estimated value of yitj, conditional on all the other observed information about military spending for a given country-year unit, which is captured by the latent trait θcur[it] and then scaled by the item-specific intercept parameter αj. Note that, as described in the main manuscript, we also account for the relationship between current and constant monetary values through inflation by this year scaling relationship: θcon[it] = βt ∗ θcur[it]
+
+    We approximate the posterior distributions of ỹitj by taking repeated draws from a Bayesian simulation model. Specifically, the measurement models are estimated with four MCMC chains to run for 2,000 iterations each using the Stan software (Stan Development Team, 2021). The first 1,000 iterations are thrown away as a burn-in or warmup period. The 4,000 remaining samples were thinned by a factor of 2 and are used to generate the posterior prediction intervals for the original observed variables. Diagnostics (i.e. trace plots, effective sample size, and R-hats) all suggest convergence (Gelman and Hill, 2007).
+
+    So in the end, we have a normally distributed, posterior prediction interval: ỹitj for every country-year dataset. We can then compare the observed dataset values to these prediction intervals to see how well the model is doing at approximating these observed dataset values. We learn a lot from these descriptive comparisons as we demonstrate in the main manuscript and in additional detail in the rest of this appendix. Ultimately, these comparisons help us validate the resulting estimates relative to other estimates. Even the original data represents historic and government estimates, so such validation efforts are essential, especially when comparing long term historical trends and making predictions about the future.
+
+  description_from_producer_burden: |-
+    _Military burdens_
+
+    Military burdens—the ratio of states' spending on arming to available monetary resources—are an important area of research for international relations scholars (Anders, Fariss and Markowitz, 2020; Cappella Zielinski, 2016; Fearon, 2018; Lind, 2011; Norloff and Wohlforth, 2019). Here we consider the military burdens of several countries and regions over time, building on results published by Anders, Fariss and Markowitz (2020).
+
+    Anders, Fariss, and Markowitz demonstrate that surplus domestic product (SDP) is a better conceptual representation of the economic resources available to states to invest in arming than gross domestic product (GDP), previously the default measure (see e.g., Fearon, 2018; Khanna, Sandler and Shimizu, 1998; Rasler and Thompson, 1985). Thus, we measure military burdens in two ways: as ratios of spending to SDP and to GDP. To compute SDP for each state i in year t, we subtract from GDP the economic resources that the population consumes to survive, such that SDPit = GDPit − ((365 ∗ τ) ∗ Populationit), where τ is the subsistence threshold (SDP is truncated to 0 if the resources needed for subsistence exceed GDP). Anders, Fariss, and Markowitz (2020) primarily use a subsistence threshold of $3 per day per person (and thresholds at $2, $1, and $0). In order to facilitate comparisons with previous results, we show military burdens at the $3 threshold. However, we also show results using a $2 per day subsistence threshold, as we are particularly interested in analyzing arming levels and military burdens in earlier historical time periods (facilitated by our new estimates of arming expenditures). Consistent with Anders, Fariss, and Markowitz (2020), we show here that when scaling military expenditures by SDP, the military burdens of poor states are much higher than the conventional measure (scaled by GDP).
+
+    We make two notable improvements to the calculation of military burdens in this paper. First, by including our new estimates of arms spending, we are able to incorporate uncertainty about expenditure values into the estimate of military burdens. Second, we include updated estimates of GDP from a recent article by Fariss et al. (2022), which also include uncertainty estimates, and recalculate SDP based on those estimates. In sum, we are able to bring together the most up-to-date estimates of military burdens component measures, and showcase key patterns for important states and regions over time.
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+
+
+tables:
+  global_military_spending_dataset:
+    variables:
+      milex_estimate:
+        title: Military expenditure (constant US$)
+        unit: "constant 2021 US$"
+        short_unit: "$"
+        description_short: "This data is expressed in US dollars. It is adjusted for inflation but does not account for differences in the cost of living between countries."
+        description_key:
+          - "{definitions.description_key_methodology}"
+        description_from_producer: |-
+          {definitions.description_from_producer_methodology}
+        presentation:
+          title_public: Military expenditure
+        display:
+          name: Military expenditure
+          numDecimalPlaces: 0
+          <<: *common-display
+
+      milex_estimate_per_capita:
+        title: Military expenditure per capita (constant US$)
+        unit: "constant 2021 US$"
+        short_unit: "$"
+        description_short: "This data is expressed in US dollars. It is adjusted for inflation but does not account for differences in the cost of living between countries."
+        description_key:
+          - "{definitions.description_key_methodology}"
+        description_from_producer: |-
+          {definitions.description_from_producer_methodology}
+        processing_level: major
+        description_processing: |-
+          We calculated this indicator by dividing the total military expenditure by a population estimate for each year. The population estimates come from a long-run dataset [maintained by Our World in Data](https://ourworldindata.org/population-sources).
+        presentation:
+          title_public: Military expenditure per capita
+        display:
+          name: Military expenditure per capita
+          numDecimalPlaces: 0
+          <<: *common-display
+
+      milexgdp:
+        title: Military expenditure (% of GDP)
+        unit: "% of GDP"
+        short_unit: "%"
+        description_short: ""
+        description_key:
+          - "{definitions.description_key_methodology}"
+          - "{definitions.description_key_percentage_gdp}"
+        description_from_producer: |-
+          {definitions.description_from_producer_methodology}
+
+          {definitions.description_from_producer_burden}
+        presentation:
+          title_public: Military expenditure (% of GDP)
+        display:
+          name: Military expenditure (% of GDP)
+          numDecimalPlaces: 1
+          <<: *common-display
+
+      milexsurplus1095:
+        title: "Military expenditure (% of SDP) - Subsistence level: $3 per day"
+        unit: "% of SDP"
+        short_unit: "%"
+        description_short: ""
+        description_key:
+          - "{definitions.description_key_methodology}"
+          - "{definitions.description_key_percentage_sdp_1}, set as $3 per day per person for this indicator."
+          - "{definitions.description_key_percentage_sdp_2}"
+        description_from_producer: |-
+          {definitions.description_from_producer_methodology}
+
+          {definitions.description_from_producer_burden}
+        presentation:
+          title_public: Military expenditure (% of SDP)
+        display:
+          name: Military expenditure (% of SDP)
+          numDecimalPlaces: 1
+          <<: *common-display
+
+      milexsurplus365:
+        title: "Military expenditure (% of SDP) - Subsistence level: $1 per day"
+        unit: "% of SDP"
+        short_unit: "%"
+        description_short: ""
+        description_key:
+          - "{definitions.description_key_methodology}"
+          - "{definitions.description_key_percentage_sdp_1}, set as $1 per day per person for this indicator."
+          - "{definitions.description_key_percentage_sdp_2}"
+        description_from_producer: |-
+          {definitions.description_from_producer_methodology}
+
+          {definitions.description_from_producer_burden}
+        presentation:
+          title_public: Military expenditure (% of SDP)
+        display:
+          name: Military expenditure (% of SDP)
+          numDecimalPlaces: 1
+          <<: *common-display
+
+      milexsurplus730:
+        title: "Military expenditure (% of SDP) - Subsistence level: $2 per day"
+        unit: "% of SDP"
+        short_unit: "%"
+        description_short: ""
+        description_key:
+          - "{definitions.description_key_methodology}"
+          - "{definitions.description_key_percentage_sdp_1}, set as $2 per day per person for this indicator."
+          - "{definitions.description_key_percentage_sdp_2}"
+        description_from_producer: |-
+          {definitions.description_from_producer_methodology}
+
+          {definitions.description_from_producer_burden}
+        presentation:
+          title_public: Military expenditure (% of SDP)
+        display:
+          name: Military expenditure (% of SDP)
+          numDecimalPlaces: 1
+          <<: *common-display
+
+      milex_per_military_personnel:
+        title: Military expenditure per military personnel (constant US$)
+        unit: "constant 2021 US$"
+        short_unit: "$"
+        description_short: "This data is expressed in US dollars. It is adjusted for inflation but does not account for differences in the cost of living between countries."
+        description_key:
+          - "{definitions.description_key_methodology}"
+          - Military personnel are troops under the command of the national government, intended for use against foreign adversaries, and held ready for combat as of January 1 of the given year.
+        processing_level: major
+        description_processing: |-
+          We calculated this indicator by dividing the military expenditure by the [military personnel](https://ourworldindata.org/grapher/military-personnel) estimated by the Correlates of War's National Material Capabilities dataset.
+        description_from_producer: |-
+          {definitions.description_from_producer_methodology}
+        presentation:
+          title_public: Military expenditure per military personnel
+        display:
+          name: Military expenditure per military personnel
+          numDecimalPlaces: 0
+          <<: *common-display
diff --git a/etl/steps/data/garden/harvard/2024-11-26/global_military_spending_dataset.py b/etl/steps/data/garden/harvard/2024-11-26/global_military_spending_dataset.py
new file mode 100644
index 00000000000..44b7fba11c8
--- /dev/null
+++ b/etl/steps/data/garden/harvard/2024-11-26/global_military_spending_dataset.py
@@ -0,0 +1,189 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+import owid.catalog.processing as pr
+from owid.catalog import Dataset, Table
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+BURDEN_INDICATORS = ["milexgdp", "milexsurplus1095", "milexsurplus365", "milexsurplus730"]
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load the meadow dataset plus the auxiliary datasets (population, Gleditsch codes, NMC).
+    ds_meadow = paths.load_dataset("global_military_spending_dataset")
+    ds_population = paths.load_dataset("population")
+    ds_gleditsch = paths.load_dataset("gleditsch")
+    ds_nmc = paths.load_dataset("national_material_capabilities")
+
+    # Read the two tables of the meadow dataset: spending estimates and military burden.
+    tb = ds_meadow["global_military_spending_dataset"].reset_index()
+    tb_burden = ds_meadow["global_military_spending_dataset_burden"].reset_index()
+
+    # Read Gleditsch country codes
+    tb_gleditsch = ds_gleditsch["gleditsch_countries"].reset_index()
+
+    # Read National Material Capabilities
+    tb_nmc = ds_nmc["national_material_capabilities"].reset_index()
+
+    #
+    # Process data.
+    #
+    # For tb_burden, select gwno, year, and the columns in BURDEN_INDICATORS
+    tb_burden = tb_burden[["gwno", "year"] + BURDEN_INDICATORS]
+
+    # Multiply the burden indicators by 100 so they are expressed as percentages
+    tb_burden[BURDEN_INDICATORS] = tb_burden[BURDEN_INDICATORS] * 100
+
+    tb = pick_gmsd_estimates(tb)
+
+    # Merge estimates and burden on code and year; the outer join keeps rows present in only one table
+    tb = pr.merge(tb, tb_burden, on=["gwno", "year"], how="outer")
+
+    tb = harmonize_country_names(tb=tb, tb_gw=tb_gleditsch)
+
+    tb = calculate_milex_per_capita(tb=tb, ds_population=ds_population)
+
+    tb = calculate_milex_per_military_personnel(tb=tb, tb_nmc=tb_nmc)
+
+    tb = tb.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+
+def get_code_to_country(tb_gw):
+    """
+    Get code to country table.
+    From Lucas' code on `population_fariss`
+    """
+    # Sanity check: no duplicate country codes
+    ## We expect only two codes to have multiple country names assigned: 260 and 580.
+    x = tb_gw.groupby("id")["country"].nunique()
+    codes = set(x[x > 1].index)
+    assert codes == {260, 580}, "Unexpected duplicate country codes!"
+
+    # Make country string
+    tb_gw["country"] = tb_gw["country"].astype("string")
+
+    # Fix: Although there were different namings in the past for countries with codes 260 and 580, we want these to have the modern name.
+    tb_gw["country"] = tb_gw["country"].replace(
+        {
+            "Madagascar (Malagasy)": "Madagascar",
+            "West Germany": "Germany",
+        }
+    )
+
+    # Simplify table
+    tb_gw = tb_gw[["id", "country"]].drop_duplicates().set_index("id", verify_integrity=True).reset_index()
+
+    return tb_gw
+
+
+def pick_gmsd_estimates(tb: Table) -> Table:
+    """
+    Keep the mean estimate of the `milex_con_sipri` indicator and return it as `milex_estimate`, scaled by 1e6.
+    """
+
+    tb = tb[tb["indicator"] == "milex_con_sipri"].reset_index(drop=True).copy()
+
+    # Keep only the country code (gwno), year and mean columns
+    tb = tb[["gwno", "year", "mean"]]
+
+    # Multiply by 1e6 to get expenditure in US$
+    tb["mean"] = tb["mean"] * 1e6
+
+    # Rename the mean column to the indicator name used downstream
+    tb = tb.rename(columns={"mean": "milex_estimate"})
+
+    return tb
+
+
+def make_burden_table_wide(tb: Table) -> Table:
+    """
+    Make the military burden table wide.
+    """
+
+    tb = tb[["ccode", "year", "indicator", "value"]]
+
+    # Multiply value by 100 to get percentage
+    tb.loc[:, "value"] = tb["value"] * 100
+
+    tb = tb.pivot(index=["ccode", "year"], columns="indicator", values="value").reset_index()
+
+    # Rename columns
+    tb = tb.rename(columns={"ccode": "gwno"})
+
+    return tb
+
+
+def harmonize_country_names(tb: Table, tb_gw: Table) -> Table:
+    """
+    Replace the `gwno` code column with a `country` name column, using the Gleditsch code-to-country table.
+    """
+
+    # Get code to country table
+    tb_gw = get_code_to_country(tb_gw)
+
+    # Attach country names; the left join keeps every row of tb even when its code has no match
+    tb = pr.merge(tb, tb_gw, left_on=["gwno"], right_on=["id"], how="left")
+
+    # Fail loudly (listing the offending codes) if any row ended up without a country name
+    assert tb["country"].notna().all(), f"Missing country names! {list(tb.loc[tb['country'].isna(), 'gwno'].unique())}"
+
+    # Drop the two code columns now that `country` identifies each row
+    tb = tb.drop(columns=["gwno", "id"])
+
+    return tb
+
+
+def calculate_milex_per_capita(tb: Table, ds_population: Dataset) -> Table:
+    """
+    Add `milex_estimate_per_capita`: `milex_estimate` divided by the population added from ds_population.
+    """
+
+    tb = geo.add_population_to_table(tb=tb, ds_population=ds_population, warn_on_missing_countries=False)
+
+    # Calculate military spending per capita
+    tb["milex_estimate_per_capita"] = tb["milex_estimate"] / tb["population"]
+
+    # Drop the helper population column so it is not part of the returned table
+    tb = tb.drop(columns=["population"])
+
+    return tb
+
+
+def calculate_milex_per_military_personnel(tb: Table, tb_nmc: Table) -> Table:
+    """
+    Add `milex_per_military_personnel`: `milex_estimate` divided by `milper` from the NMC table.
+    """
+
+    # Left-join `milper` by country and year. NOTE(review): assumes `milper` is a headcount — confirm units.
+    tb = pr.merge(tb, tb_nmc[["country", "year", "milper"]], on=["country", "year"], how="left")
+
+    # Calculate military spending per military personnel
+    tb["milex_per_military_personnel"] = tb["milex_estimate"] / tb["milper"]
+
+    # Replace infinite values (e.g. from dividing by a zero `milper`) with NaN
+    tb["milex_per_military_personnel"] = tb["milex_per_military_personnel"].replace(
+        [float("inf"), float("-inf")], float("nan")
+    )
+
+    # Drop the helper milper column so it is not part of the returned table
+    tb = tb.drop(columns=["milper"])
+
+    return tb
diff --git a/etl/steps/data/grapher/harvard/2024-11-26/global_military_spending_dataset.py b/etl/steps/data/grapher/harvard/2024-11-26/global_military_spending_dataset.py
new file mode 100644
index 00000000000..ff4a0123b95
--- /dev/null
+++ b/etl/steps/data/grapher/harvard/2024-11-26/global_military_spending_dataset.py
@@ -0,0 +1,28 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("global_military_spending_dataset")
+
+    # Read table from garden dataset (taken as-is, without resetting its index).
+    tb = ds_garden["global_military_spending_dataset"]
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
+    )
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/harvard/2024-11-26/global_military_spending_dataset.py b/etl/steps/data/meadow/harvard/2024-11-26/global_military_spending_dataset.py
new file mode 100644
index 00000000000..1960f9f4251
--- /dev/null
+++ b/etl/steps/data/meadow/harvard/2024-11-26/global_military_spending_dataset.py
@@ -0,0 +1,43 @@
+"""Load a snapshot and create a meadow dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve the two snapshots: the main estimates file and the military burden file.
+    snap_constant = paths.load_snapshot("global_military_spending_dataset.rds")
+    snap_burden = paths.load_snapshot("global_military_spending_dataset_burden.rds")
+
+    # Load data from both snapshots.
+    tb_constant = snap_constant.read()
+    tb_burden = snap_burden.read()
+
+    #
+    # Process data.
+    #
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    tb_constant = tb_constant.format(["gwno", "year", "indicator"])
+    tb_burden = tb_burden.format(["gwno", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with both tables; default metadata comes from the main snapshot.
+    ds_meadow = create_dataset(
+        dest_dir,
+        tables=[
+            tb_constant,
+            tb_burden,
+        ],
+        check_variables_metadata=True,
+        default_metadata=snap_constant.metadata,
+    )
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/snapshots/harvard/2024-11-26/global_military_spending_dataset_burden.py b/snapshots/harvard/2024-11-26/global_military_spending_dataset_burden.py
new file mode 100644
index 00000000000..6a1eada2f96
--- /dev/null
+++ b/snapshots/harvard/2024-11-26/global_military_spending_dataset_burden.py
@@ -0,0 +1,39 @@
+"""
+Script to create a snapshot of dataset.
+
+The file is manually uploaded, because it is a small file from a 1.5 GB zip file.
+
+STEPS TO UPDATE THIS SNAPSHOT
+
+    1. Go to https://dataverse.harvard.edu/file.xhtml?fileId=8144788
+    2. Download the file by selecting "Access File" and then "ZIP Archive".
+    3. Unzip the file and copy the file in "data/milburden_all_xxxxx.rds" to this directory. xxxxx is the date of the latest version of the data.
+    4. Run this script with the path to the file as an argument.
+        python snapshots/harvard/{version}/global_military_spending_dataset_burden.py --path-to-file milburden_all_xxxxx.rds
+
+
+"""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a snapshot object; its version is this script's parent directory name (SNAPSHOT_VERSION).
+    snap = Snapshot(f"harvard/{SNAPSHOT_VERSION}/global_military_spending_dataset_burden.rds")
+
+    # Copy the local data file into the snapshot; `upload` is forwarded from --upload/--skip-upload.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/harvard/2024-11-26/global_military_spending_dataset_burden.rds.dvc b/snapshots/harvard/2024-11-26/global_military_spending_dataset_burden.rds.dvc
new file mode 100644
index 00000000000..1dd032b4686
--- /dev/null
+++ b/snapshots/harvard/2024-11-26/global_military_spending_dataset_burden.rds.dvc
@@ -0,0 +1,32 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Global Military Spending Dataset
+    description: |-
+      Military spending data measure key international relations concepts such as balancing, arms races, the distribution of power, and the severity of military burdens. Unfortunately, missing values and measurement error threaten the validity of existing findings. Addressing this challenge, we introduce the Global Military Spending Dataset (GMSD). GMSD collates new and existing expenditure variables from a comprehensive collection of sources, expands data coverage, and employs a latent variable model to estimate missing values and quantify measurement error.
+    date_published: "2024-03-07"
+    version_producer: Version 1
+    title_snapshot: Global Military Spending Dataset - Military Burden
+
+    # Citation
+    producer: Barnum et al.
+    citation_full: |-
+      - Miriam Barnum; Christopher Fariss; Jonathan Markowitz; Gaea Morales (2024). Measuring Arms: Introducing the Global Military Spending Dataset. Journal of Conflict Resolution, 0(0). https://doi.org/10.1177/00220027241232964
+      - Miriam Barnum; Christopher J. Fariss; Jonathan N. Markowitz; Gaea Morales (2024). "Replication Data for: Measuring Arms: Introducing the Global Military Spending Dataset", https://doi.org/10.7910/DVN/RKJAKJ, Harvard Dataverse, V1; Replication-Arming.zip [fileName]
+    attribution: Barnum et al. - Global Military Spending Dataset (2024)
+    attribution_short: GMSD
+
+    # Files
+    url_main: https://journals.sagepub.com/doi/10.1177/00220027241232964
+    date_accessed: 2024-11-26
+
+    # License
+    license:
+      name: CC0
+      url: https://doi.org/10.7910/DVN/DHMZOW
+outs:
+  - md5: 71475dcdb57cd8f3f6a7417ca3cbf8fc
+    size: 3582544
+    path: global_military_spending_dataset_burden.rds