Index hydro units by their location #1346

Open · wants to merge 2 commits into main
2 changes: 2 additions & 0 deletions doc/release_notes.rst
@@ -25,6 +25,8 @@ This part of documentation collects descriptive release notes to capture the mai
 
 * Update workflow to geopandas >= 1.0 `PR #1276 <https://github.com/pypsa-meets-earth/pypsa-earth/pull/1276>`__
 
+* Index hydro units by their location and drop use of alternative clustering option for hydro `PR #1331 <https://github.com/pypsa-meets-earth/pypsa-earth/pull/1331>`__
+
 **Minor Changes and bug-fixing**
 
 * Align structure of the components with consistency checks in the updated PyPSA version `PR #1315 <https://github.com/pypsa-meets-earth/pypsa-earth/pull/1315>`__
50 changes: 11 additions & 39 deletions scripts/add_electricity.py
@@ -243,7 +243,7 @@ def load_powerplants(ppl_fn):
     null_ppls = ppl[ppl.p_nom <= 0]
     if not null_ppls.empty:
         logger.warning(f"Drop powerplants with null capacity: {list(null_ppls.name)}.")
-        ppl = ppl.drop(null_ppls.index).reset_index(drop=True)
+        ppl = ppl.drop(null_ppls.index)
     return ppl
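Dropping the `reset_index(drop=True)` call is deliberate: the original `powerplants.csv` index now survives the filtering, and `attach_hydro` later stores it as `ppl_id` to match units against the `plant` index of the inflow time series. A minimal sketch with made-up data illustrates the difference:

```python
import pandas as pd

# Hypothetical data: after dropping the zero-capacity plant, the surviving
# rows keep their original labels (0 and 2) instead of being renumbered,
# so they can still be matched against per-plant inflow series later on.
ppl = pd.DataFrame({"name": ["A", "B", "C"], "p_nom": [100.0, 0.0, 50.0]})
null_ppls = ppl[ppl.p_nom <= 0]

ppl = ppl.drop(null_ppls.index)
print(ppl.index.tolist())  # [0, 2] -- with reset_index(drop=True) it would be [0, 1]
```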


@@ -472,6 +472,7 @@ def attach_hydro(n, costs, ppl):
 
     ppl = (
         ppl.query('carrier == "hydro"')
+        .assign(ppl_id=lambda df: df.index)
         .reset_index(drop=True)
         .rename(index=lambda s: str(s) + " hydro")
     )
@@ -488,65 +489,36 @@ def attach_hydro(n, costs, ppl):
     ror = ppl.query('technology == "Run-Of-River"')
     phs = ppl.query('technology == "Pumped Storage"')
     hydro = ppl.query('technology == "Reservoir"')
-    if snakemake.params.alternative_clustering:
-        bus_id = ppl["region_id"]
-    else:
-        bus_id = ppl["bus"]
 
     inflow_idx = ror.index.union(hydro.index)
     if not inflow_idx.empty:
         with xr.open_dataarray(snakemake.input.profile_hydro) as inflow:
-            inflow_buses = bus_id[inflow_idx]
-            missing_plants = pd.Index(inflow_buses.unique()).difference(
-                inflow.indexes["plant"]
-            )
-            # map power plants index (regions_onshore) into power plants ids (powerplants.csv)
-            # plants_to_keep correspond to "plant" index of regions_onshore
-            # plants_to_keep.index corresponds to bus_id of PyPSA network
-            plants_with_data = inflow_buses[inflow_buses.isin(inflow.indexes["plant"])]
-            plants_to_keep = plants_with_data.to_numpy()
+            found_plants = ppl.ppl_id[ppl.ppl_id.isin(inflow.indexes["plant"])]
+            missing_plants_idxs = ppl.index.difference(found_plants.index)
 
             # if missing time series are found, notify the user and exclude missing hydro plants
-            if not missing_plants.empty:
+            if not missing_plants_idxs.empty:
                 # original total p_nom
                 total_p_nom = ror.p_nom.sum() + hydro.p_nom.sum()
-                # update plants_with_data to ensure proper match between "plant" index and bus_id
-                plants_with_data = inflow_buses[inflow_buses.isin(plants_to_keep)]
-                network_buses_to_keep = plants_with_data.index
-                plants_to_keep = plants_with_data.to_numpy()
-
-                ror = ror.loc[ror.index.intersection(network_buses_to_keep)]
-                hydro = hydro.loc[hydro.index.intersection(network_buses_to_keep)]
+                ror = ror.loc[ror.index.intersection(found_plants.index)]
+                hydro = hydro.loc[hydro.index.intersection(found_plants.index)]
                 # loss of p_nom
                 loss_p_nom = ror.p_nom.sum() + hydro.p_nom.sum() - total_p_nom
 
                 logger.warning(
-                    f"'{snakemake.input.profile_hydro}' is missing inflow time-series for at least one bus: {', '.join(missing_plants)}."
+                    f"'{snakemake.input.profile_hydro}' is missing inflow time-series for at least one bus: {', '.join(missing_plants_idxs)}."
                    f"Corresponding hydro plants are dropped, corresponding to a total loss of {loss_p_nom:.2f}MW out of {total_p_nom:.2f}MW."
                )
 
-            # if there are any plants for which runoff data are available
-            if not plants_with_data.empty:
-                network_buses_to_keep = plants_with_data.index
-                plants_to_keep = plants_with_data.to_numpy()
-
             # hydro_inflow_factor is used to divide the inflow between the various units of each power plant
-            if not snakemake.params.alternative_clustering:
-                hydro_inflow_factor = hydro["p_nom"] / hydro.groupby("bus")[
-                    "p_nom"
-                ].transform("sum")
-            else:
-                hydro_inflow_factor = hydro["p_nom"] / hydro.groupby("region_id")[
-                    "p_nom"
-                ].transform("sum")
+            hydro_inflow_factor = hydro["p_nom"] / hydro.groupby("bus")[
+                "p_nom"
+            ].transform("sum")
 
+            if not found_plants.empty:
                 inflow_t = (
-                    inflow.sel(plant=plants_to_keep)
+                    inflow.sel(plant=found_plants.values)
+                    .assign_coords(plant=found_plants.index)
                     .rename({"plant": "name"})
-                    .assign_coords(name=network_buses_to_keep)
                     .transpose("time", "name")
                     .to_pandas()
                     * hydro_inflow_factor
                 )
 
     if "ror" in carriers and not ror.empty:
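Taken together, the new `attach_hydro` logic selects inflow series by powerplant id rather than by bus or region, then splits each bus's inflow across its units in proportion to `p_nom`. A minimal sketch with toy numbers (two units on one bus; the real data come from powerplants.csv and the hydro profile file) of how the selection and scaling compose:

```python
import pandas as pd
import xarray as xr

# Toy stand-ins for ppl and inflow; unit names and values are made up.
hydro = pd.DataFrame(
    {"bus": ["bus1", "bus1"], "p_nom": [300.0, 100.0], "ppl_id": [7, 8]},
    index=["7 hydro", "8 hydro"],
)
# each unit gets a share of its bus's inflow proportional to its p_nom
hydro_inflow_factor = hydro["p_nom"] / hydro.groupby("bus")["p_nom"].transform("sum")

inflow = xr.DataArray(
    [[1.0, 2.0], [3.0, 4.0]],
    dims=("plant", "time"),
    coords={"plant": [7, 8], "time": [0, 1]},
)
inflow_t = (
    inflow.sel(plant=hydro.ppl_id.values)  # pick series by powerplant id
    .assign_coords(plant=hydro.index)      # relabel to network unit names
    .rename({"plant": "name"})
    .transpose("time", "name")
    .to_pandas()
    * hydro_inflow_factor                  # shares: 0.75 and 0.25
)
print(inflow_t)
```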
106 changes: 27 additions & 79 deletions scripts/build_renewable_profiles.py
@@ -215,6 +215,7 @@
 
 COPERNICUS_CRS = "EPSG:4326"
 GEBCO_CRS = "EPSG:4326"
+PPL_CRS = "EPSG:4326"
 
 
 def check_cutout_match(cutout, geodf):
@@ -356,9 +357,6 @@ def rescale_hydro(plants, runoff, normalize_using_yearly, normalization_year):
         logger.info("No bus has installed hydro plants, ignoring normalization.")
         return runoff
 
-    if snakemake.params.alternative_clustering:
-        plants = plants.set_index("shape_id")
-
     years_statistics = normalize_using_yearly.index
     if isinstance(years_statistics, pd.DatetimeIndex):
         years_statistics = years_statistics.year
@@ -488,7 +486,7 @@ def create_scaling_factor(
     if "snakemake" not in globals():
         from _helpers import mock_snakemake
 
-        snakemake = mock_snakemake("build_renewable_profiles", technology="solar")
+        snakemake = mock_snakemake("build_renewable_profiles", technology="hydro")
     configure_logging(snakemake)
 
     pgb.streams.wrap_stderr()
@@ -533,24 +531,6 @@ def create_scaling_factor(
     # the region should be restricted for non-hydro technologies, as the hydro potential is calculated across hydrobasins which may span beyond the region of the country
     cutout = filter_cutout_region(cutout, regions)
 
-    if snakemake.params.alternative_clustering:
-        regions = gpd.GeoDataFrame(
-            regions.reset_index()
-            .groupby("shape_id")
-            .agg(
-                {
-                    "x": "mean",
-                    "y": "mean",
-                    "country": "first",
-                    "geometry": "first",
-                    "bus": "first",
-                }
-            )
-            .reset_index()
-            .set_index("bus"),
-            crs=regions.crs,
-        )
-
     buses = regions.index
 
     func = getattr(cutout, resource.pop("method"))
@@ -564,58 +544,33 @@
        hydrobasins = gpd.read_file(hydrobasins_path)
        ppls = load_powerplants(snakemake.input.powerplants)
 
-       hydro_ppls = ppls[ppls.carrier == "hydro"]
-
        # commented out as rivers may span across multiple countries
        # hydrobasins = hydrobasins[
        #     [
        #         any(country_shapes.geometry.intersects(geom))
        #         for geom in hydrobasins.geometry
        #     ]
        # ]  # exclude hydrobasins shapes that do not intersect the countries of interest
 
-       # select busbar whose location (p) belongs to at least one hydrobasin geometry
-       # if extendable option is true, all buses are included
-       # otherwise only where hydro powerplants are available are considered
-       if snakemake.params.alternative_clustering:
-           filter_bus_to_consider = regions.index.map(
-               lambda bus_id: config.get("extendable", False)
-               | (bus_id in hydro_ppls.region_id.values)
-           )
-       ### TODO: quickfix. above case and the below case should by unified
-       if snakemake.params.alternative_clustering == False:
-           filter_bus_to_consider = regions.index.map(
-               lambda bus_id: config.get("extendable", False)
-               | (bus_id in hydro_ppls.bus.values)
-           )
-       bus_to_consider = regions.index[filter_bus_to_consider]
+       all_hydro_ppls = ppls[ppls.carrier == "hydro"]
 
+       # select hydro units within hydrobasins
+       hgdf = gpd.GeoDataFrame(
+           all_hydro_ppls,
+           index=all_hydro_ppls.index,
+           geometry=gpd.points_from_xy(all_hydro_ppls.lon, all_hydro_ppls.lat),
+           crs=PPL_CRS,
+       ).to_crs(hydrobasins.crs)
+       temp_gdf = gpd.sjoin(hgdf, hydrobasins, predicate="within", how="left")
 
-       # identify subset of buses within the hydrobasins
-       filter_bus_in_hydrobasins = regions[filter_bus_to_consider].apply(
-           lambda row: any(hydrobasins.geometry.contains(Point(row["x"], row["y"]))),
-           axis=1,
-       )
-       bus_in_hydrobasins = filter_bus_in_hydrobasins[filter_bus_in_hydrobasins].index
+       hydro_ppls = pd.DataFrame(
+           hgdf.loc[temp_gdf.index_right.dropna().index].drop(columns="geometry")
+       )
 
        bus_notin_hydrobasins = list(
-           set(bus_to_consider).difference(set(bus_in_hydrobasins))
+           set(all_hydro_ppls.index).difference(set(hydro_ppls.index))
        )
 
-       resource["plants"] = regions.rename(
-           columns={"x": "lon", "y": "lat", "country": "countries"}
-       ).loc[bus_in_hydrobasins, ["lon", "lat", "countries", "shape_id"]]
+       resource["plants"] = hydro_ppls.rename(columns={"country": "countries"})[
+           ["lon", "lat", "countries"]
+       ]
 
-       # TODO: these cases shall be fixed by restructuring the alternative clustering procedure
-       if snakemake.params.alternative_clustering == False:
-           resource["plants"]["installed_hydro"] = [
-               True if (bus_id in hydro_ppls.bus.values) else False
-               for bus_id in resource["plants"].index
-           ]
-       else:
-           resource["plants"]["installed_hydro"] = [
-               True if (bus_id in hydro_ppls.region_id.values) else False
-               for bus_id in resource["plants"].shape_id.values
-           ]
+       # TODO: possibly revise to account for non-existent hydro powerplants
+       resource["plants"]["installed_hydro"] = [
+           True for bus_id in resource["plants"].index
+       ]

        # get normalization before executing runoff
        normalization = None
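The spatial join above is the core of the new location-based indexing: each hydro unit becomes a point geometry, is reprojected to the hydrobasins CRS, and is kept only if it falls within some basin polygon. A self-contained sketch with toy geometries (not the real hydrobasins data):

```python
import geopandas as gpd
from shapely.geometry import Point, box

# One toy "basin" polygon and two plants; in the workflow the polygons come
# from the hydrobasins file and the points from powerplants.csv lon/lat.
basins = gpd.GeoDataFrame(
    {"basin": ["b1"]}, geometry=[box(0, 0, 1, 1)], crs="EPSG:4326"
)
plants = gpd.GeoDataFrame(
    {"name": ["inside", "outside"]},
    geometry=[Point(0.5, 0.5), Point(2.0, 2.0)],
    crs="EPSG:4326",
)

# a left join keeps every plant; index_right is NaN for points in no basin
joined = gpd.sjoin(plants, basins, predicate="within", how="left")
in_basins = plants[joined.index_right.notna()]
not_in_basins = plants[joined.index_right.isna()]
print(list(in_basins.name), list(not_in_basins.name))  # ['inside'] ['outside']
```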
@@ -631,8 +586,6 @@
        else:
            # otherwise perform the calculations
            inflow = correction_factor * func(capacity_factor=True, **resource)
-           if snakemake.params.alternative_clustering:
-               inflow["plant"] = regions.shape_id.loc[inflow["plant"]].values
 
        if "clip_min_inflow" in config:
            inflow = inflow.where(inflow >= config["clip_min_inflow"], 0)
@@ -647,14 +600,13 @@
                get_hydro_capacities_annual_hydro_generation(
                    path_hydro_capacities, countries, norm_year
                )
+               * config.get("multiplier", 1.0)
            )
 
        elif method == "eia":
            path_eia_stats = snakemake.input.eia_hydro_generation
            normalize_using_yearly = get_eia_annual_hydro_generation(
                path_eia_stats, countries
-           ) * config.get("multiplier", 1.0)
+           )
 
        inflow = rescale_hydro(
            resource["plants"], inflow, normalize_using_yearly, norm_year
@@ -669,8 +621,8 @@

        # add zero values for out of hydrobasins elements
        if len(bus_notin_hydrobasins) > 0:
-           regions_notin = regions.loc[
-               bus_notin_hydrobasins, ["x", "y", "country", "shape_id"]
+           regions_notin = all_hydro_ppls.loc[
+               bus_notin_hydrobasins, ["lon", "lat", "country"]
            ]
            logger.warning(
                f"Buses {bus_notin_hydrobasins} are not contained into hydrobasins."
@@ -684,11 +636,7 @@
                ),
                dims=("plant", "time"),
                coords=dict(
-                   plant=(
-                       bus_notin_hydrobasins
-                       if not snakemake.params.alternative_clustering
-                       else regions_notin["shape_id"].values
-                   ),
+                   plant=bus_notin_hydrobasins,
                    time=inflow.coords["time"],
                ),
            )
@@ -816,7 +764,7 @@
 
     if snakemake.wildcards.technology.startswith("offwind"):
         logger.info("Calculate underwater fraction of connections.")
-        offshore_shape = gpd.read_file(paths["offshore_shapes"]).unary_union
+        offshore_shape = gpd.read_file(paths["offshore_shapes"]).union_all()
         underwater_fraction = []
         for bus in buses:
             p = centre_of_mass.sel(bus=bus).data
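This last hunk is a follow-up to the geopandas >= 1.0 migration noted in the release notes: the `unary_union` property is deprecated there in favour of the `union_all()` method. A quick sketch of the replacement call:

```python
import geopandas as gpd
from shapely.geometry import box

# Two touching squares merge into a single geometry spanning both.
shapes = gpd.GeoSeries([box(0, 0, 1, 1), box(1, 0, 2, 1)])
merged = shapes.union_all()  # replaces the deprecated shapes.unary_union
print(merged.bounds)  # (0.0, 0.0, 2.0, 1.0)
```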