From b156b4521876ef4eb497cfebd9c3f7193e5c0bdf Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Mon, 6 Jun 2022 16:40:46 -0500 Subject: [PATCH 01/29] Merge Split algorithm added This file is a post processing step to the tobac tracking step. This is a first iteration in addressing split/merged cells. --- tobac/merge_split.py | 345 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 tobac/merge_split.py diff --git a/tobac/merge_split.py b/tobac/merge_split.py new file mode 100644 index 00000000..0cbbc7e5 --- /dev/null +++ b/tobac/merge_split.py @@ -0,0 +1,345 @@ +#Tobac merge and split v0.1 + +from geopy.distance import geodesic +from networkx import * +import numpy as np +from pandas.core.common import flatten +import xarray as xr + +def compress_all(nc_grids, min_dims=2): + for var in nc_grids: + if len(nc_grids[var].dims) >= min_dims: + # print("Compressing ", var) + nc_grids[var].encoding["zlib"] = True + nc_grids[var].encoding["complevel"] = 4 + nc_grids[var].encoding["contiguous"] = False + return nc_grids + + +def standardize_track_dataset(TrackedFeatures, Mask, Projection): + """ Combine a feature mask with the feature data table into a common dataset. + Also renames th + + returned by tobac.themes.tobac_v1.segmentation + with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. + + Also rename the variables to be more desciptive and comply with cf-tree. + + Convert the default cell parent ID to an integer table. + + Add a cell dimension to reflect + + Projection is an xarray DataArray + + TODO: Add metadata attributes to + + """ + feature_standard_names = { + # new variable name, and long description for the NetCDF attribute + 'frame':('feature_time_index', + 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), + 'hdim_1':('feature_hdim1_coordinate', + 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' 
+ 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'hdim_2':('feature_hdim2_coordinate', + 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' + 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'idx':('feature_id_this_frame',), + 'num':('feature_grid_cell_count', + 'Number of grid points that are within the threshold of this feature'), + 'threshold_value':('feature_threshold_max', + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), + 'feature':('feature_id', + "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), + 'time':('feature_time','time of the feature, consistent with feature_time_index'), + 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), + 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), + 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), + 'lat':('feature_latitude','latitude of the feature'), + 'lon':('feature_longitude','longitude of the feature'), + 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), + 'areas':('feature_area',), + 'isolated':('feature_isolation_flag',), + 'num_objects':('number_of_feature_neighbors',), + 'cell':('feature_parent_cell_id',), + 'time_cell':('feature_parent_cell_elapsed_time',), + 'segmentation_mask':('2d segmentation mask',) + } + new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys()} + + TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) + # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of + # the 'index' variable and call the dimension 'feature' + ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), + Mask]) + + # Add the projection data back in + ds['ProjectionCoordinateSystem']=Projection + + # Convert the cell ID variable from float to integer + if 'int' not in str(TrackedFeatures.cell.dtype): + # The raw output from the tracking is actually an object array + # array([nan, 2, 3], dtype=object) + # (and is cast to a float array when saved as NetCDF, I think). + # Cast to float. 
+ int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + + cell_id_data = TrackedFeatures.cell.astype('float64') + valid_cell = np.isfinite(cell_id_data) + valid_cell_ids = cell_id_data[valid_cell] + if not (np.unique(valid_cell_ids) > 0).all(): + raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') + int_cell[valid_cell] = valid_cell_ids.astype('int64') + #ds['feature_parent_cell_id'] = int_cell + return ds + + + +def merge_split(TRACK,distance = 25000,frame_len = 5): + ''' + function to postprocess tobac track data for merge/split cells + Input: + TRACK: xarray dataset of tobac Track information + + distance: float, optional distance threshold prior to adding a pair of points + into the minimum spanning tree. Default is 25000 meters. + + frame_len: float, optional threshold for the spanning length within which two points + can be separated. Default is five (5) frames. + + + Output: + d: xarray dataset of + feature position along 1st horizontal dimension + hdim2_index: float + feature position along 2nd horizontal dimension + + Example: + d = merge_split(Track) + ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) + # both_ds = xarray.combine_by_coords((ds,d), compat='override') + both_ds = xr.merge([ds, d],compat ='override') + both_ds = compress_all(both_ds) + both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) + + ''' + track_groups = TRACK.groupby('cell') + cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + id_data = np.fromiter(cell_ids.keys(), dtype=int) + count_data = np.fromiter(cell_ids.values(), dtype=int) + all_frames = np.sort(np.unique(TRACK.frame)) + a_points = list() + b_points = list() + a_names = list() + b_names = list() + dist = list() + + + for i in id_data: + #print(i) + a_pointx = track_groups[i].projection_x_coordinate[-1].values + a_pointy = track_groups[i].projection_y_coordinate[-1].values + for j in id_data: + b_pointx = track_groups[j].projection_x_coordinate[0].values + b_pointy = track_groups[j].projection_y_coordinate[0].values + d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + + + + + +# for i in id_data: +# a_pointx = track_groups[i].grid_longitude[-1].values +# a_pointy = track_groups[i].grid_latitude[-1].values +# for j in id_data: +# b_pointx = track_groups[j].grid_longitude[0].values +# b_pointy = track_groups[j].grid_latitude[0].values +# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km +# if d <= distance: +# a_points.append([a_pointx,a_pointy]) +# b_points.append([b_pointx, b_pointy]) +# dist.append(d) +# a_names.append(i) +# b_names.append(j) + + id = [] + for i in range(len(dist)-1, -1, -1): + if a_names[i] == b_names[i]: + id.append(i) + a_points.pop(i) + b_points.pop(i) + dist.pop(i) + a_names.pop(i) + b_names.pop(i) + else: + continue + + g = Graph() + for i in np.arange(len(dist)): + g.add_edge(a_names[i], b_names[i],weight=dist[i]) + + tree = minimum_spanning_edges(g) + tree_list = list(minimum_spanning_edges(g)) + + new_tree = [] + for i,j in enumerate(tree_list): + frame_a = np.nanmax(track_groups[j[0]].frame.values) + frame_b = np.nanmin(track_groups[j[1]].frame.values) + if np.abs(frame_a - frame_b) <= frame_len: + new_tree.append(tree_list[i][0:2]) + new_tree_arr = np.array(new_tree) + + 
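The block above is the core of the algorithm: each cell id becomes a graph node, an edge connects the end point of one cell's track to the start point of another's whenever the two lie within distance of each other, and only the minimum-spanning-tree edges whose frame gap is at most frame_len are kept as merge/split links. A minimal runnable sketch of the same idea on toy data, where the cells dictionary and its fields are hypothetical stand-ins for the tobac track groups:

    import networkx as nx
    import numpy as np

    # toy cells: start/end positions plus first/last frame of each track
    cells = {
        1: {"start": (0.0, 0.0), "end": (10.0, 0.0), "first": 0, "last": 4},
        2: {"start": (11.0, 0.5), "end": (20.0, 1.0), "first": 5, "last": 9},
        3: {"start": (50.0, 50.0), "end": (60.0, 50.0), "first": 0, "last": 3},
    }
    distance = 5.0  # max end-to-start separation for a candidate link
    frame_len = 5   # max frame gap between linked cells

    g = nx.Graph()
    for i, a in cells.items():
        for j, b in cells.items():
            if i == j:
                continue  # drop self-pairs, as the pop() loop above does
            d = np.linalg.norm(np.array(a["end"]) - np.array(b["start"]))
            if d <= distance:
                g.add_edge(i, j, weight=d)

    # keep only the spanning-tree edges whose frame gap is small enough
    links = [
        (u, v)
        for u, v in nx.minimum_spanning_edges(g, data=False)
        if abs(cells[u]["last"] - cells[v]["first"]) <= frame_len
    ]
    print(links)  # [(1, 2)]: cell 2 continues cell 1; cell 3 stays unlinked

Note that minimum_spanning_edges returns undirected edges, so which endpoint is treated as the earlier cell depends on edge orientation; the same caveat applies to the frame_a/frame_b comparison in the code above.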
TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) + cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) + track_id = dict() #same size as number of total merged tracks + + arr = np.array([0]) + for p in cell_id: + j = np.where(arr == int(p)) + if len(j[0]) > 0: + continue + else: + k = np.where(new_tree_arr == p) + if len(k[0]) == 0: + track_id[p] = [p] + arr = np.append(arr,p) + else: + temp1 = list(np.unique(new_tree_arr[k[0]])) + temp = list(np.unique(new_tree_arr[k[0]])) + + for l in range(15): + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + + if len(temp1) == len(temp): + break + temp1 = np.array(temp) + + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + arr = np.append(arr,np.unique(temp)) + + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + + + storm_id = [0] #default because we don't track larger storm systems *yet* + print('found storm id') + + + track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* + print('found track parent storm ids') + + track_ids = np.array(list(track_id.keys())) + print('found track ids') + + + cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) + print('found cell ids') + + cell_parent_track_id = [] + + for i, id in enumerate(track_id): + + if len(track_id[int(id)]) == 1: + cell_parent_track_id.append(int(id)) + + else: + cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) + + + cell_parent_track_id = list(flatten(cell_parent_track_id)) + print('found cell parent track ids') + + feature_parent_cell_id = list(TRACK.cell.values.astype(float)) + + print('found feature parent cell ids') + + #This version includes all the feature regardless of if they are used in cells or not. 
+ feature_id = list(TRACK.feature.values.astype(int)) + print('found feature ids') + + feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm + print('found feature parent storm ids') + + feature_parent_track_id = [] + feature_parent_track_id = np.zeros(len(feature_id)) + for i, id in enumerate(feature_id): + cellid = feature_parent_cell_id[i] + if np.isnan(cellid): + feature_parent_track_id[i] = -1 + else: + j = np.where(cell_id == cellid) + j = np.squeeze(j) + trackid = cell_parent_track_id[j] + feature_parent_track_id[i] = trackid + + print('found feature parent track ids') + + + storm_child_track_count = [len(track_id)] + print('found storm child track count') + + track_child_cell_count = [] + for i,id in enumerate(track_id): + track_child_cell_count.append(len(track_id[int(id)])) + print('found track child cell count') + + + cell_child_feature_count = [] + for i,id in enumerate(cell_id): + cell_child_feature_count.append(len(track_groups[id].feature.values)) + print('found cell child feature count') + + storm_child_cell_count = [len(cell_id)] + storm_child_feature_count = [len(feature_id)] + + storm_dim = 'nstorms' + track_dim = 'ntracks' + cell_dim = 'ncells' + feature_dim = 'nfeatures' + + d = xr.Dataset({ + 'storm_id': (storm_dim, storm_id), + 'track_id': (track_dim, track_ids), + 'track_parent_storm_id': (track_dim, track_parent_storm_id), + 'cell_id': (cell_dim, cell_id), + 'cell_parent_track_id': (cell_dim, cell_parent_track_id), + 'feature_id': (feature_dim, feature_id), + 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), + 'feature_parent_track_id': (feature_dim, feature_parent_track_id), + 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), + 'storm_child_track_count': (storm_dim, storm_child_track_count), + 'storm_child_cell_count': (storm_dim, storm_child_cell_count), + 'storm_child_feature_count': (storm_dim, storm_child_feature_count), + 'track_child_cell_count': (track_dim, track_child_cell_count), + 'cell_child_feature_count': (cell_dim, cell_child_feature_count), + }) + d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + assert len(track_id) == len(track_parent_storm_id) + assert len(cell_id) == len(cell_parent_track_id) + assert len(feature_id) == len(feature_parent_cell_id) + assert sum(storm_child_track_count) == len(track_id) + assert sum(storm_child_cell_count) == len(cell_id) + assert sum(storm_child_feature_count) == len(feature_id) + assert sum(track_child_cell_count) == len(cell_id) + assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) + + return d \ No newline at end of file From b550e3b713b3caaab0be1f5debd6a977ae889b20 Mon Sep 17 00:00:00 2001 From: Juli Date: Thu, 16 Jun 2022 14:22:43 +0200 Subject: [PATCH 02/29] black formatting --- tobac/merge_split.py | 322 ++++++++++++++++++++++++------------------- 1 file changed, 181 insertions(+), 141 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 0cbbc7e5..4183a3f0 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -1,4 +1,4 @@ -#Tobac merge and split v0.1 +# Tobac merge and split v0.1 from geopy.distance import geodesic from networkx import * @@ -6,6 +6,7 @@ from pandas.core.common import flatten import xarray as xr + def compress_all(nc_grids, min_dims=2): for var in nc_grids: if len(nc_grids[var].dims) >= min_dims: @@ -14,7 +15,7 @@ def compress_all(nc_grids, min_dims=2): nc_grids[var].encoding["complevel"] = 4 
nc_grids[var].encoding["contiguous"] = False return nc_grids - + def standardize_track_dataset(TrackedFeatures, Mask, Projection): """ Combine a feature mask with the feature data table into a common dataset. @@ -36,70 +37,107 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection): """ feature_standard_names = { # new variable name, and long description for the NetCDF attribute - 'frame':('feature_time_index', - 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), - 'hdim_1':('feature_hdim1_coordinate', - 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'hdim_2':('feature_hdim2_coordinate', - 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'idx':('feature_id_this_frame',), - 'num':('feature_grid_cell_count', - 'Number of grid points that are within the threshold of this feature'), - 'threshold_value':('feature_threshold_max', - "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), - 'feature':('feature_id', - "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), - 'time':('feature_time','time of the feature, consistent with feature_time_index'), - 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), - 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), - 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), - 'lat':('feature_latitude','latitude of the feature'), - 'lon':('feature_longitude','longitude of the feature'), - 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), - 'areas':('feature_area',), - 'isolated':('feature_isolation_flag',), - 'num_objects':('number_of_feature_neighbors',), - 'cell':('feature_parent_cell_id',), - 'time_cell':('feature_parent_cell_elapsed_time',), - 'segmentation_mask':('2d segmentation mask',) + "frame": ( + "feature_time_index", + "positional index of the feature along the time dimension of the mask, from 0 to N-1", + ), + "hdim_1": ( + "feature_hdim1_coordinate", + "position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x)." 
+ "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "hdim_2": ( + "feature_hdim2_coordinate", + "position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)" + "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "idx": ("feature_id_this_frame",), + "num": ( + "feature_grid_cell_count", + "Number of grid points that are within the threshold of this feature", + ), + "threshold_value": ( + "feature_threshold_max", + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", + ), + "feature": ( + "feature_id", + "Unique number of the feature; starts from 1 and increments by 1 to the number of features", + ), + "time": ( + "feature_time", + "time of the feature, consistent with feature_time_index", + ), + "timestr": ( + "feature_time_str", + "String representation of the feature time, YYYY-MM-DD HH:MM:SS", + ), + "projection_y_coordinate": ( + "feature_projection_y_coordinate", + "y position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "projection_x_coordinate": ( + "feature_projection_x_coordinate", + "x position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "lat": ("feature_latitude", "latitude of the feature"), + "lon": ("feature_longitude", "longitude of the feature"), + "ncells": ( + "feature_ncells", + "number of grid cells for this feature (meaning uncertain)", + ), + "areas": ("feature_area",), + "isolated": ("feature_isolation_flag",), + "num_objects": ("number_of_feature_neighbors",), + "cell": ("feature_parent_cell_id",), + "time_cell": ("feature_parent_cell_elapsed_time",), + "segmentation_mask": ("2d segmentation mask",), + } + new_feature_var_names = { + k: feature_standard_names[k][0] + for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys() } - new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() - if k in TrackedFeatures.variables.keys()} - TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) + TrackedFeatures = TrackedFeatures.drop(["cell_parent_track_id"]) # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of # the 'index' variable and call the dimension 'feature' - ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), - Mask]) + ds = xr.merge( + [ + TrackedFeatures.swap_dims({"index": "feature"}) + .drop("index") + .rename_vars(new_feature_var_names), + Mask, + ] + ) # Add the projection data back in - ds['ProjectionCoordinateSystem']=Projection + ds["ProjectionCoordinateSystem"] = Projection # Convert the cell ID variable from float to integer - if 'int' not in str(TrackedFeatures.cell.dtype): + if "int" not in str(TrackedFeatures.cell.dtype): # The raw output from the tracking is actually an object array # array([nan, 2, 3], dtype=object) # (and is cast to a float array when saved as NetCDF, I think). # Cast to float. 
- int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + int_cell = xr.zeros_like(TrackedFeatures.cell, dtype="int64") - cell_id_data = TrackedFeatures.cell.astype('float64') + cell_id_data = TrackedFeatures.cell.astype("float64") valid_cell = np.isfinite(cell_id_data) valid_cell_ids = cell_id_data[valid_cell] if not (np.unique(valid_cell_ids) > 0).all(): - raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') - int_cell[valid_cell] = valid_cell_ids.astype('int64') - #ds['feature_parent_cell_id'] = int_cell + raise AssertionError( + "Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell" + ) + int_cell[valid_cell] = valid_cell_ids.astype("int64") + # ds['feature_parent_cell_id'] = int_cell return ds - -def merge_split(TRACK,distance = 25000,frame_len = 5): - ''' +def merge_split(TRACK, distance=25000, frame_len=5): + """ function to postprocess tobac track data for merge/split cells Input: TRACK: xarray dataset of tobac Track information @@ -125,9 +163,9 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): both_ds = compress_all(both_ds) both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - ''' - track_groups = TRACK.groupby('cell') - cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + """ + track_groups = TRACK.groupby("cell") + cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} id_data = np.fromiter(cell_ids.keys(), dtype=int) count_data = np.fromiter(cell_ids.values(), dtype=int) all_frames = np.sort(np.unique(TRACK.frame)) @@ -136,43 +174,40 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): a_names = list() b_names = list() dist = list() - - + for i in id_data: - #print(i) + # print(i) a_pointx = track_groups[i].projection_x_coordinate[-1].values a_pointy = track_groups[i].projection_y_coordinate[-1].values for j in id_data: b_pointx = track_groups[j].projection_x_coordinate[0].values b_pointy = track_groups[j].projection_y_coordinate[0].values - d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + d = np.linalg.norm( + np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) + ) if d <= distance: - a_points.append([a_pointx,a_pointy]) + a_points.append([a_pointx, a_pointy]) b_points.append([b_pointx, b_pointy]) dist.append(d) a_names.append(i) b_names.append(j) - - - - -# for i in id_data: -# a_pointx = track_groups[i].grid_longitude[-1].values -# a_pointy = track_groups[i].grid_latitude[-1].values -# for j in id_data: -# b_pointx = track_groups[j].grid_longitude[0].values -# b_pointy = track_groups[j].grid_latitude[0].values -# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km -# if d <= distance: -# a_points.append([a_pointx,a_pointy]) -# b_points.append([b_pointx, b_pointy]) -# dist.append(d) -# a_names.append(i) -# b_names.append(j) + # for i in id_data: + # a_pointx = track_groups[i].grid_longitude[-1].values + # a_pointy = track_groups[i].grid_latitude[-1].values + # for j in id_data: + # b_pointx = track_groups[j].grid_longitude[0].values + # b_pointy = track_groups[j].grid_latitude[0].values + # d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km + # if d <= distance: + # a_points.append([a_pointx,a_pointy]) + # b_points.append([b_pointx, b_pointy]) + # dist.append(d) + # a_names.append(i) + # b_names.append(j) id = [] - for i in range(len(dist)-1, -1, -1): + for i in range(len(dist) - 1, -1, -1): if a_names[i] == b_names[i]: id.append(i) 
a_points.pop(i) @@ -182,25 +217,27 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): b_names.pop(i) else: continue - + g = Graph() for i in np.arange(len(dist)): - g.add_edge(a_names[i], b_names[i],weight=dist[i]) + g.add_edge(a_names[i], b_names[i], weight=dist[i]) tree = minimum_spanning_edges(g) tree_list = list(minimum_spanning_edges(g)) new_tree = [] - for i,j in enumerate(tree_list): + for i, j in enumerate(tree_list): frame_a = np.nanmax(track_groups[j[0]].frame.values) frame_b = np.nanmin(track_groups[j[1]].frame.values) if np.abs(frame_a - frame_b) <= frame_len: new_tree.append(tree_list[i][0:2]) new_tree_arr = np.array(new_tree) - TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) - cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) - track_id = dict() #same size as number of total merged tracks + TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) + cell_id = np.unique( + TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + ) + track_id = dict() # same size as number of total merged tracks arr = np.array([0]) for p in cell_id: @@ -211,7 +248,7 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): k = np.where(new_tree_arr == p) if len(k[0]) == 0: track_id[p] = [p] - arr = np.append(arr,p) + arr = np.append(arr, p) else: temp1 = list(np.unique(new_tree_arr[k[0]])) temp = list(np.unique(new_tree_arr[k[0]])) @@ -226,112 +263,113 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): if len(temp1) == len(temp): break temp1 = np.array(temp) - + for i in temp1: k2 = np.where(new_tree_arr == i) temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) temp = list(flatten(temp)) temp = list(np.unique(temp)) - arr = np.append(arr,np.unique(temp)) - - track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - - + arr = np.append(arr, np.unique(temp)) - storm_id = [0] #default because we don't track larger storm systems *yet* - print('found storm id') + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + storm_id = [0] # default because we don't track larger storm systems *yet* + print("found storm id") - track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* - print('found track parent storm ids') + track_parent_storm_id = np.repeat( + 0, len(track_id) + ) # This will always be zero when we don't track larger storm systems *yet* + print("found track parent storm ids") track_ids = np.array(list(track_id.keys())) - print('found track ids') + print("found track ids") - - cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) - print('found cell ids') + cell_id = list( + np.unique( + TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + ) + ) + print("found cell ids") cell_parent_track_id = [] for i, id in enumerate(track_id): - - if len(track_id[int(id)]) == 1: + + if len(track_id[int(id)]) == 1: cell_parent_track_id.append(int(id)) else: - cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) - + cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) cell_parent_track_id = list(flatten(cell_parent_track_id)) - print('found cell parent track ids') + print("found cell parent track ids") feature_parent_cell_id = list(TRACK.cell.values.astype(float)) - print('found feature parent cell ids') + print("found feature parent cell ids") - #This version includes all the feature regardless of if 
they are used in cells or not. + # This version includes all the feature regardless of if they are used in cells or not. feature_id = list(TRACK.feature.values.astype(int)) - print('found feature ids') + print("found feature ids") - feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm - print('found feature parent storm ids') + feature_parent_storm_id = np.repeat(0, len(feature_id)) # we don't do storms atm + print("found feature parent storm ids") - feature_parent_track_id = [] + feature_parent_track_id = [] feature_parent_track_id = np.zeros(len(feature_id)) for i, id in enumerate(feature_id): cellid = feature_parent_cell_id[i] if np.isnan(cellid): - feature_parent_track_id[i] = -1 + feature_parent_track_id[i] = -1 else: j = np.where(cell_id == cellid) j = np.squeeze(j) trackid = cell_parent_track_id[j] feature_parent_track_id[i] = trackid - - print('found feature parent track ids') + print("found feature parent track ids") storm_child_track_count = [len(track_id)] - print('found storm child track count') + print("found storm child track count") track_child_cell_count = [] - for i,id in enumerate(track_id): + for i, id in enumerate(track_id): track_child_cell_count.append(len(track_id[int(id)])) - print('found track child cell count') - + print("found track child cell count") cell_child_feature_count = [] - for i,id in enumerate(cell_id): + for i, id in enumerate(cell_id): cell_child_feature_count.append(len(track_groups[id].feature.values)) - print('found cell child feature count') + print("found cell child feature count") - storm_child_cell_count = [len(cell_id)] + storm_child_cell_count = [len(cell_id)] storm_child_feature_count = [len(feature_id)] - - storm_dim = 'nstorms' - track_dim = 'ntracks' - cell_dim = 'ncells' - feature_dim = 'nfeatures' - - d = xr.Dataset({ - 'storm_id': (storm_dim, storm_id), - 'track_id': (track_dim, track_ids), - 'track_parent_storm_id': (track_dim, track_parent_storm_id), - 'cell_id': (cell_dim, cell_id), - 'cell_parent_track_id': (cell_dim, cell_parent_track_id), - 'feature_id': (feature_dim, feature_id), - 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), - 'feature_parent_track_id': (feature_dim, feature_parent_track_id), - 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), - 'storm_child_track_count': (storm_dim, storm_child_track_count), - 'storm_child_cell_count': (storm_dim, storm_child_cell_count), - 'storm_child_feature_count': (storm_dim, storm_child_feature_count), - 'track_child_cell_count': (track_dim, track_child_cell_count), - 'cell_child_feature_count': (cell_dim, cell_child_feature_count), - }) - d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + storm_dim = "nstorms" + track_dim = "ntracks" + cell_dim = "ncells" + feature_dim = "nfeatures" + + d = xr.Dataset( + { + "storm_id": (storm_dim, storm_id), + "track_id": (track_dim, track_ids), + "track_parent_storm_id": (track_dim, track_parent_storm_id), + "cell_id": (cell_dim, cell_id), + "cell_parent_track_id": (cell_dim, cell_parent_track_id), + "feature_id": (feature_dim, feature_id), + "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), + "feature_parent_track_id": (feature_dim, feature_parent_track_id), + "feature_parent_storm_id": (feature_dim, feature_parent_storm_id), + "storm_child_track_count": (storm_dim, storm_child_track_count), + "storm_child_cell_count": (storm_dim, storm_child_cell_count), + "storm_child_feature_count": (storm_dim, storm_child_feature_count), + "track_child_cell_count": 
(track_dim, track_child_cell_count), + "cell_child_feature_count": (cell_dim, cell_child_feature_count), + } + ) + d = d.set_coords(["feature_id", "cell_id", "track_id", "storm_id"]) assert len(track_id) == len(track_parent_storm_id) assert len(cell_id) == len(cell_parent_track_id) @@ -340,6 +378,8 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): assert sum(storm_child_cell_count) == len(cell_id) assert sum(storm_child_feature_count) == len(feature_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) - - return d \ No newline at end of file + assert sum( + [sum(cell_child_feature_count), (len(np.where(feature_parent_track_id < 0)[0]))] + ) == len(feature_id) + + return d From edc5f25bdf60021d2bfa98dbf115fe16a87eefcf Mon Sep 17 00:00:00 2001 From: Juli Date: Thu, 16 Jun 2022 14:39:40 +0200 Subject: [PATCH 03/29] black formatting --- tobac/merge_split.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 4183a3f0..2f4915a7 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -18,7 +18,7 @@ def compress_all(nc_grids, min_dims=2): def standardize_track_dataset(TrackedFeatures, Mask, Projection): - """ Combine a feature mask with the feature data table into a common dataset. + """Combine a feature mask with the feature data table into a common dataset. Also renames th returned by tobac.themes.tobac_v1.segmentation @@ -34,7 +34,7 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection): TODO: Add metadata attributes to - """ + """ feature_standard_names = { # new variable name, and long description for the NetCDF attribute "frame": ( @@ -141,20 +141,20 @@ def merge_split(TRACK, distance=25000, frame_len=5): function to postprocess tobac track data for merge/split cells Input: TRACK: xarray dataset of tobac Track information - + distance: float, optional distance threshold prior to adding a pair of points into the minimum spanning tree. Default is 25000 meters. - + frame_len: float, optional threshold for the spanning length within which two points can be separated. Default is five (5) frames. Output: - d: xarray dataset of + d: xarray dataset of feature position along 1st horizontal dimension hdim2_index: float feature position along 2nd horizontal dimension - + Example: d = merge_split(Track) ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) @@ -162,7 +162,7 @@ def merge_split(TRACK, distance=25000, frame_len=5): both_ds = xr.merge([ds, d],compat ='override') both_ds = compress_all(both_ds) both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - + """ track_groups = TRACK.groupby("cell") cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} From d1f23b48effa041fb7e8636417b6745354f5f7b0 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sat, 2 Jul 2022 21:36:49 -0600 Subject: [PATCH 04/29] Update merge_split.py Updated changes to merge/split based on suggestions for lat/lon convection, and documentation. 
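For reference, the workflow documented in the Example section of the merge_split docstring corresponds to the following end-to-end sketch. It assumes that Track, refl_mask, and data come from earlier tobac feature detection, segmentation, and tracking steps, that the module is importable as tobac.merge_split, and that savedir is an existing output directory:

    import os
    import xarray as xr
    from tobac.merge_split import compress_all, merge_split, standardize_track_dataset

    d = merge_split(Track, distance=25000, frame_len=5)
    ds = standardize_track_dataset(Track, refl_mask, data["ProjectionCoordinateSystem"])

    # compat="override" keeps the coordinates of the first dataset where the two overlap
    both_ds = xr.merge([ds, d], compat="override")
    both_ds = compress_all(both_ds)  # set zlib encoding on >=2-D variables
    both_ds.to_netcdf(os.path.join(savedir, "Track_features_merges.nc"))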
--- tobac/merge_split.py | 467 +++++++++++++++++++++++-------------------- 1 file changed, 249 insertions(+), 218 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 2f4915a7..60e929be 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -1,4 +1,13 @@ -# Tobac merge and split v0.1 +''' + Tobac merge and split v0.1 + This submodule is a post processing step to address tracked cells which merge/split. + The first iteration of this module is to combine the cells which are merging but receive + a new cell id once merged. In general this submodule will label these three cell-ids using + the largest cell number of those within the merged/split cell ids. + + +''' + from geopy.distance import geodesic from networkx import * @@ -6,8 +15,30 @@ from pandas.core.common import flatten import xarray as xr - def compress_all(nc_grids, min_dims=2): + ''' + The purpose of this subroutine is to compress the netcdf variables as they are saved + this does not change the data, but sets netcdf encoding parameters. + We allocate a minimum number of dimensions as variables with dimensions + under the minimum value do not benefit from tangibly from this encoding. + + Parameters + ---------- + nc_grids : xarray.core.dataset.Dataset + Xarray dataset that is intended to be exported as netcdf + + min_dims : integer + The minimum number of dimesnions, in integer value, a variable must have in order + set the netcdf compression encoding. + + Returns + ------- + nc_grids : xarray.core.dataset.Dataset + Xarray dataset with netcdf compression encoding for variables with two (2) or more dimensions + + ''' + + for var in nc_grids: if len(nc_grids[var].dims) >= min_dims: # print("Compressing ", var) @@ -15,11 +46,11 @@ def compress_all(nc_grids, min_dims=2): nc_grids[var].encoding["complevel"] = 4 nc_grids[var].encoding["contiguous"] = False return nc_grids + - -def standardize_track_dataset(TrackedFeatures, Mask, Projection): - """Combine a feature mask with the feature data table into a common dataset. - Also renames th +def standardize_track_dataset(TrackedFeatures, Mask, Projection = None): + ''' + Combine a feature mask with the feature data table into a common dataset. returned by tobac.themes.tobac_v1.segmentation with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. @@ -32,140 +63,143 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection): Projection is an xarray DataArray - TODO: Add metadata attributes to - - """ + TODO: Add metadata attributes + + Parameters + ---------- + TrackedFeatures : xarray.core.dataset.Dataset + xarray dataset of tobac Track information, the xarray dataset returned by tobac.tracking.linking_trackpy + + Mask: xarray.core.dataset.Dataset + xarray dataset of tobac segmentation mask information, the xarray dataset returned + by tobac.segmentation.segmentation + + + Projection : xarray.core.dataarray.DataArray, default = None + array.DataArray of the original input dataset (gridded nexrad data for example). 
+ If using gridded nexrad data, this can be input as: data['ProjectionCoordinateSystem'] + An example of the type of information in the dataarray includes the following attributes: + latitude_of_projection_origin :29.471900939941406 + longitude_of_projection_origin :-95.0787353515625 + _CoordinateTransformType :Projection + _CoordinateAxes :x y z time + _CoordinateAxesTypes :GeoX GeoY Height Time + grid_mapping_name :azimuthal_equidistant + semi_major_axis :6370997.0 + inverse_flattening :298.25 + longitude_of_prime_meridian :0.0 + false_easting :0.0 + false_northing :0.0 + + Returns + ------- + + ds : xarray.core.dataset.Dataset + xarray dataset of merged Track and Segmentation Mask datasets with renamed variables. + + ''' feature_standard_names = { # new variable name, and long description for the NetCDF attribute - "frame": ( - "feature_time_index", - "positional index of the feature along the time dimension of the mask, from 0 to N-1", - ), - "hdim_1": ( - "feature_hdim1_coordinate", - "position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x)." - "The numbering is consistent with positional indexing of the coordinate, but can be" - "fractional, to account for a centroid not aligned to the grid.", - ), - "hdim_2": ( - "feature_hdim2_coordinate", - "position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)" - "The numbering is consistent with positional indexing of the coordinate, but can be" - "fractional, to account for a centroid not aligned to the grid.", - ), - "idx": ("feature_id_this_frame",), - "num": ( - "feature_grid_cell_count", - "Number of grid points that are within the threshold of this feature", - ), - "threshold_value": ( - "feature_threshold_max", - "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", - ), - "feature": ( - "feature_id", - "Unique number of the feature; starts from 1 and increments by 1 to the number of features", - ), - "time": ( - "feature_time", - "time of the feature, consistent with feature_time_index", - ), - "timestr": ( - "feature_time_str", - "String representation of the feature time, YYYY-MM-DD HH:MM:SS", - ), - "projection_y_coordinate": ( - "feature_projection_y_coordinate", - "y position of the feature in the projection given by ProjectionCoordinateSystem", - ), - "projection_x_coordinate": ( - "feature_projection_x_coordinate", - "x position of the feature in the projection given by ProjectionCoordinateSystem", - ), - "lat": ("feature_latitude", "latitude of the feature"), - "lon": ("feature_longitude", "longitude of the feature"), - "ncells": ( - "feature_ncells", - "number of grid cells for this feature (meaning uncertain)", - ), - "areas": ("feature_area",), - "isolated": ("feature_isolation_flag",), - "num_objects": ("number_of_feature_neighbors",), - "cell": ("feature_parent_cell_id",), - "time_cell": ("feature_parent_cell_elapsed_time",), - "segmentation_mask": ("2d segmentation mask",), - } - new_feature_var_names = { - k: feature_standard_names[k][0] - for k in feature_standard_names.keys() - if k in TrackedFeatures.variables.keys() + 'frame':('feature_time_index', + 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), + 'hdim_1':('feature_hdim1_coordinate', + 'position of the feature along the first horizontal dimension in grid point space; a 
north-south coordinate for dim order (time, y, x).' + 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'hdim_2':('feature_hdim2_coordinate', + 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' + 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'idx':('feature_id_this_frame',), + 'num':('feature_grid_cell_count', + 'Number of grid points that are within the threshold of this feature'), + 'threshold_value':('feature_threshold_max', + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), + 'feature':('feature_id', + "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), + 'time':('feature_time','time of the feature, consistent with feature_time_index'), + 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), + 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), + 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), + 'lat':('feature_latitude','latitude of the feature'), + 'lon':('feature_longitude','longitude of the feature'), + 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), + 'areas':('feature_area',), + 'isolated':('feature_isolation_flag',), + 'num_objects':('number_of_feature_neighbors',), + 'cell':('feature_parent_cell_id',), + 'time_cell':('feature_parent_cell_elapsed_time',), + 'segmentation_mask':('2d segmentation mask',) } + new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys()} - TrackedFeatures = TrackedFeatures.drop(["cell_parent_track_id"]) + TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of # the 'index' variable and call the dimension 'feature' - ds = xr.merge( - [ - TrackedFeatures.swap_dims({"index": "feature"}) - .drop("index") - .rename_vars(new_feature_var_names), - Mask, - ] - ) + ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), + Mask]) # Add the projection data back in - ds["ProjectionCoordinateSystem"] = Projection + if not None in Projection: + ds['ProjectionCoordinateSystem']=Projection # Convert the cell ID variable from float to integer - if "int" not in str(TrackedFeatures.cell.dtype): + if 'int' not in str(TrackedFeatures.cell.dtype): # The raw output from the tracking is actually an object array # array([nan, 2, 3], dtype=object) # (and is cast to a float array when saved as NetCDF, I think). # Cast to float. 
- int_cell = xr.zeros_like(TrackedFeatures.cell, dtype="int64") + int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') - cell_id_data = TrackedFeatures.cell.astype("float64") + cell_id_data = TrackedFeatures.cell.astype('float64') valid_cell = np.isfinite(cell_id_data) valid_cell_ids = cell_id_data[valid_cell] if not (np.unique(valid_cell_ids) > 0).all(): - raise AssertionError( - "Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell" - ) - int_cell[valid_cell] = valid_cell_ids.astype("int64") - # ds['feature_parent_cell_id'] = int_cell + raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') + int_cell[valid_cell] = valid_cell_ids.astype('int64') + #ds['feature_parent_cell_id'] = int_cell return ds -def merge_split(TRACK, distance=25000, frame_len=5): - """ - function to postprocess tobac track data for merge/split cells - Input: - TRACK: xarray dataset of tobac Track information - distance: float, optional distance threshold prior to adding a pair of points - into the minimum spanning tree. Default is 25000 meters. - - frame_len: float, optional threshold for the spanning length within which two points - can be separated. Default is five (5) frames. - - - Output: - d: xarray dataset of - feature position along 1st horizontal dimension - hdim2_index: float - feature position along 2nd horizontal dimension - - Example: +def merge_split(TRACK,distance = 25000,frame_len = 5): + ''' + function to postprocess tobac track data for merge/split cells + + + Parameters + ---------- + TRACK : xarray.core.dataset.Dataset + xarray dataset of tobac Track information + + distance : float, optional + Distance threshold prior to adding a pair of points into the minimum spanning tree. + Default is 25000 meters. + + frame_len : float, optional + Threshold for the spanning length within which two points can be separated. + Default is five (5) frames. + + Returns + ------- + + d : xarray.core.dataset.Dataset + xarray dataset of tobac merge/split cells with parent and child designations. 
+ + + Example usage: d = merge_split(Track) - ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) - # both_ds = xarray.combine_by_coords((ds,d), compat='override') + ds = standardize_track_dataset(Track, refl_mask) both_ds = xr.merge([ds, d],compat ='override') both_ds = compress_all(both_ds) both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - - """ - track_groups = TRACK.groupby("cell") - cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} + + ''' + + print('this is an update 3') + track_groups = TRACK.groupby('cell') + cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} id_data = np.fromiter(cell_ids.keys(), dtype=int) count_data = np.fromiter(cell_ids.values(), dtype=int) all_frames = np.sort(np.unique(TRACK.frame)) @@ -174,40 +208,42 @@ def merge_split(TRACK, distance=25000, frame_len=5): a_names = list() b_names = list() dist = list() + + if hasattr(TRACK, 'grid_longitude'): + print('is in lat/lonx') + for i in id_data: + a_pointx = track_groups[i].grid_longitude[-1].values + a_pointy = track_groups[i].grid_latitude[-1].values + for j in id_data: + b_pointx = track_groups[j].grid_longitude[0].values + b_pointy = track_groups[j].grid_latitude[0].values + d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).m + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + else: + + for i in id_data: + #print(i) + a_pointx = track_groups[i].projection_x_coordinate[-1].values + a_pointy = track_groups[i].projection_y_coordinate[-1].values + for j in id_data: + b_pointx = track_groups[j].projection_x_coordinate[0].values + b_pointy = track_groups[j].projection_y_coordinate[0].values + d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) - for i in id_data: - # print(i) - a_pointx = track_groups[i].projection_x_coordinate[-1].values - a_pointy = track_groups[i].projection_y_coordinate[-1].values - for j in id_data: - b_pointx = track_groups[j].projection_x_coordinate[0].values - b_pointy = track_groups[j].projection_y_coordinate[0].values - d = np.linalg.norm( - np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) - ) - if d <= distance: - a_points.append([a_pointx, a_pointy]) - b_points.append([b_pointx, b_pointy]) - dist.append(d) - a_names.append(i) - b_names.append(j) - - # for i in id_data: - # a_pointx = track_groups[i].grid_longitude[-1].values - # a_pointy = track_groups[i].grid_latitude[-1].values - # for j in id_data: - # b_pointx = track_groups[j].grid_longitude[0].values - # b_pointy = track_groups[j].grid_latitude[0].values - # d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km - # if d <= distance: - # a_points.append([a_pointx,a_pointy]) - # b_points.append([b_pointx, b_pointy]) - # dist.append(d) - # a_names.append(i) - # b_names.append(j) id = [] - for i in range(len(dist) - 1, -1, -1): + for i in range(len(dist)-1, -1, -1): if a_names[i] == b_names[i]: id.append(i) a_points.pop(i) @@ -217,27 +253,25 @@ def merge_split(TRACK, distance=25000, frame_len=5): b_names.pop(i) else: continue - + g = Graph() for i in np.arange(len(dist)): - g.add_edge(a_names[i], b_names[i], weight=dist[i]) + g.add_edge(a_names[i], b_names[i],weight=dist[i]) tree = minimum_spanning_edges(g) tree_list = 
list(minimum_spanning_edges(g)) new_tree = [] - for i, j in enumerate(tree_list): + for i,j in enumerate(tree_list): frame_a = np.nanmax(track_groups[j[0]].frame.values) frame_b = np.nanmin(track_groups[j[1]].frame.values) if np.abs(frame_a - frame_b) <= frame_len: new_tree.append(tree_list[i][0:2]) new_tree_arr = np.array(new_tree) - TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) - cell_id = np.unique( - TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] - ) - track_id = dict() # same size as number of total merged tracks + TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) + cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) + track_id = dict() #same size as number of total merged tracks arr = np.array([0]) for p in cell_id: @@ -248,7 +282,7 @@ def merge_split(TRACK, distance=25000, frame_len=5): k = np.where(new_tree_arr == p) if len(k[0]) == 0: track_id[p] = [p] - arr = np.append(arr, p) + arr = np.append(arr,p) else: temp1 = list(np.unique(new_tree_arr[k[0]])) temp = list(np.unique(new_tree_arr[k[0]])) @@ -263,113 +297,112 @@ def merge_split(TRACK, distance=25000, frame_len=5): if len(temp1) == len(temp): break temp1 = np.array(temp) - + for i in temp1: k2 = np.where(new_tree_arr == i) temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) temp = list(flatten(temp)) temp = list(np.unique(temp)) - arr = np.append(arr, np.unique(temp)) - + arr = np.append(arr,np.unique(temp)) + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + + + storm_id = [0] #default because we don't track larger storm systems *yet* + print('found storm id') - storm_id = [0] # default because we don't track larger storm systems *yet* - print("found storm id") - track_parent_storm_id = np.repeat( - 0, len(track_id) - ) # This will always be zero when we don't track larger storm systems *yet* - print("found track parent storm ids") + track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* + print('found track parent storm ids') track_ids = np.array(list(track_id.keys())) - print("found track ids") + print('found track ids') + - cell_id = list( - np.unique( - TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] - ) - ) - print("found cell ids") + cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) + print('found cell ids') cell_parent_track_id = [] for i, id in enumerate(track_id): - - if len(track_id[int(id)]) == 1: + + if len(track_id[int(id)]) == 1: cell_parent_track_id.append(int(id)) else: - cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) + cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) + cell_parent_track_id = list(flatten(cell_parent_track_id)) - print("found cell parent track ids") + print('found cell parent track ids') feature_parent_cell_id = list(TRACK.cell.values.astype(float)) - print("found feature parent cell ids") + print('found feature parent cell ids') - # This version includes all the feature regardless of if they are used in cells or not. + #This version includes all the feature regardless of if they are used in cells or not. 
feature_id = list(TRACK.feature.values.astype(int)) - print("found feature ids") + print('found feature ids') - feature_parent_storm_id = np.repeat(0, len(feature_id)) # we don't do storms atm - print("found feature parent storm ids") + feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm + print('found feature parent storm ids') - feature_parent_track_id = [] + feature_parent_track_id = [] feature_parent_track_id = np.zeros(len(feature_id)) for i, id in enumerate(feature_id): cellid = feature_parent_cell_id[i] if np.isnan(cellid): - feature_parent_track_id[i] = -1 + feature_parent_track_id[i] = -1 else: j = np.where(cell_id == cellid) j = np.squeeze(j) trackid = cell_parent_track_id[j] feature_parent_track_id[i] = trackid + + print('found feature parent track ids') - print("found feature parent track ids") storm_child_track_count = [len(track_id)] - print("found storm child track count") + print('found storm child track count') track_child_cell_count = [] - for i, id in enumerate(track_id): + for i,id in enumerate(track_id): track_child_cell_count.append(len(track_id[int(id)])) - print("found track child cell count") + print('found track child cell count') + cell_child_feature_count = [] - for i, id in enumerate(cell_id): + for i,id in enumerate(cell_id): cell_child_feature_count.append(len(track_groups[id].feature.values)) - print("found cell child feature count") + print('found cell child feature count') - storm_child_cell_count = [len(cell_id)] + storm_child_cell_count = [len(cell_id)] storm_child_feature_count = [len(feature_id)] - - storm_dim = "nstorms" - track_dim = "ntracks" - cell_dim = "ncells" - feature_dim = "nfeatures" - - d = xr.Dataset( - { - "storm_id": (storm_dim, storm_id), - "track_id": (track_dim, track_ids), - "track_parent_storm_id": (track_dim, track_parent_storm_id), - "cell_id": (cell_dim, cell_id), - "cell_parent_track_id": (cell_dim, cell_parent_track_id), - "feature_id": (feature_dim, feature_id), - "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), - "feature_parent_track_id": (feature_dim, feature_parent_track_id), - "feature_parent_storm_id": (feature_dim, feature_parent_storm_id), - "storm_child_track_count": (storm_dim, storm_child_track_count), - "storm_child_cell_count": (storm_dim, storm_child_cell_count), - "storm_child_feature_count": (storm_dim, storm_child_feature_count), - "track_child_cell_count": (track_dim, track_child_cell_count), - "cell_child_feature_count": (cell_dim, cell_child_feature_count), - } - ) - d = d.set_coords(["feature_id", "cell_id", "track_id", "storm_id"]) + + storm_dim = 'nstorms' + track_dim = 'ntracks' + cell_dim = 'ncells' + feature_dim = 'nfeatures' + + d = xr.Dataset({ + 'storm_id': (storm_dim, storm_id), + 'track_id': (track_dim, track_ids), + 'track_parent_storm_id': (track_dim, track_parent_storm_id), + 'cell_id': (cell_dim, cell_id), + 'cell_parent_track_id': (cell_dim, cell_parent_track_id), + 'feature_id': (feature_dim, feature_id), + 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), + 'feature_parent_track_id': (feature_dim, feature_parent_track_id), + 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), + 'storm_child_track_count': (storm_dim, storm_child_track_count), + 'storm_child_cell_count': (storm_dim, storm_child_cell_count), + 'storm_child_feature_count': (storm_dim, storm_child_feature_count), + 'track_child_cell_count': (track_dim, track_child_cell_count), + 'cell_child_feature_count': (cell_dim, cell_child_feature_count), + }) + d = 
d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) assert len(track_id) == len(track_parent_storm_id) assert len(cell_id) == len(cell_parent_track_id) @@ -378,8 +411,6 @@ def merge_split(TRACK, distance=25000, frame_len=5): assert sum(storm_child_cell_count) == len(cell_id) assert sum(storm_child_feature_count) == len(feature_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum( - [sum(cell_child_feature_count), (len(np.where(feature_parent_track_id < 0)[0]))] - ) == len(feature_id) - - return d + assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) + + return d \ No newline at end of file From d3aa8ea7322fef19452fe9f3831c26b5f2b56df0 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Wed, 6 Jul 2022 19:10:02 -0600 Subject: [PATCH 05/29] Ran Black formatting on file. Formatted using the Black python package. --- tobac/merge_split.py | 367 ++++++++++++++++++++++++------------------- 1 file changed, 205 insertions(+), 162 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 60e929be..c58be5aa 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -1,4 +1,4 @@ -''' +""" Tobac merge and split v0.1 This submodule is a post processing step to address tracked cells which merge/split. The first iteration of this module is to combine the cells which are merging but receive @@ -6,7 +6,7 @@ the largest cell number of those within the merged/split cell ids. -''' +""" from geopy.distance import geodesic @@ -15,8 +15,9 @@ from pandas.core.common import flatten import xarray as xr + def compress_all(nc_grids, min_dims=2): - ''' + """ The purpose of this subroutine is to compress the netcdf variables as they are saved this does not change the data, but sets netcdf encoding parameters. We allocate a minimum number of dimensions as variables with dimensions @@ -26,19 +27,18 @@ def compress_all(nc_grids, min_dims=2): ---------- nc_grids : xarray.core.dataset.Dataset Xarray dataset that is intended to be exported as netcdf - + min_dims : integer - The minimum number of dimesnions, in integer value, a variable must have in order + The minimum number of dimesnions, in integer value, a variable must have in order set the netcdf compression encoding. - + Returns ------- nc_grids : xarray.core.dataset.Dataset Xarray dataset with netcdf compression encoding for variables with two (2) or more dimensions - - ''' - + """ + for var in nc_grids: if len(nc_grids[var].dims) >= min_dims: # print("Compressing ", var) @@ -46,10 +46,10 @@ def compress_all(nc_grids, min_dims=2): nc_grids[var].encoding["complevel"] = 4 nc_grids[var].encoding["contiguous"] = False return nc_grids - -def standardize_track_dataset(TrackedFeatures, Mask, Projection = None): - ''' + +def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): + """ Combine a feature mask with the feature data table into a common dataset. 
returned by tobac.themes.tobac_v1.segmentation @@ -64,19 +64,19 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection = None): Projection is an xarray DataArray TODO: Add metadata attributes - + Parameters ---------- TrackedFeatures : xarray.core.dataset.Dataset xarray dataset of tobac Track information, the xarray dataset returned by tobac.tracking.linking_trackpy - + Mask: xarray.core.dataset.Dataset xarray dataset of tobac segmentation mask information, the xarray dataset returned by tobac.segmentation.segmentation - - + + Projection : xarray.core.dataarray.DataArray, default = None - array.DataArray of the original input dataset (gridded nexrad data for example). + array.DataArray of the original input dataset (gridded nexrad data for example). If using gridded nexrad data, this can be input as: data['ProjectionCoordinateSystem'] An example of the type of information in the dataarray includes the following attributes: latitude_of_projection_origin :29.471900939941406 @@ -90,116 +90,153 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection = None): longitude_of_prime_meridian :0.0 false_easting :0.0 false_northing :0.0 - + Returns - ------- - + ------- + ds : xarray.core.dataset.Dataset xarray dataset of merged Track and Segmentation Mask datasets with renamed variables. - - ''' + + """ feature_standard_names = { # new variable name, and long description for the NetCDF attribute - 'frame':('feature_time_index', - 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), - 'hdim_1':('feature_hdim1_coordinate', - 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'hdim_2':('feature_hdim2_coordinate', - 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'idx':('feature_id_this_frame',), - 'num':('feature_grid_cell_count', - 'Number of grid points that are within the threshold of this feature'), - 'threshold_value':('feature_threshold_max', - "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), - 'feature':('feature_id', - "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), - 'time':('feature_time','time of the feature, consistent with feature_time_index'), - 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), - 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), - 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), - 'lat':('feature_latitude','latitude of the feature'), - 'lon':('feature_longitude','longitude of the feature'), - 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), - 'areas':('feature_area',), - 'isolated':('feature_isolation_flag',), - 'num_objects':('number_of_feature_neighbors',), - 'cell':('feature_parent_cell_id',), - 
'time_cell':('feature_parent_cell_elapsed_time',), - 'segmentation_mask':('2d segmentation mask',) + "frame": ( + "feature_time_index", + "positional index of the feature along the time dimension of the mask, from 0 to N-1", + ), + "hdim_1": ( + "feature_hdim1_coordinate", + "position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x)." + "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "hdim_2": ( + "feature_hdim2_coordinate", + "position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)" + "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "idx": ("feature_id_this_frame",), + "num": ( + "feature_grid_cell_count", + "Number of grid points that are within the threshold of this feature", + ), + "threshold_value": ( + "feature_threshold_max", + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", + ), + "feature": ( + "feature_id", + "Unique number of the feature; starts from 1 and increments by 1 to the number of features", + ), + "time": ( + "feature_time", + "time of the feature, consistent with feature_time_index", + ), + "timestr": ( + "feature_time_str", + "String representation of the feature time, YYYY-MM-DD HH:MM:SS", + ), + "projection_y_coordinate": ( + "feature_projection_y_coordinate", + "y position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "projection_x_coordinate": ( + "feature_projection_x_coordinate", + "x position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "lat": ("feature_latitude", "latitude of the feature"), + "lon": ("feature_longitude", "longitude of the feature"), + "ncells": ( + "feature_ncells", + "number of grid cells for this feature (meaning uncertain)", + ), + "areas": ("feature_area",), + "isolated": ("feature_isolation_flag",), + "num_objects": ("number_of_feature_neighbors",), + "cell": ("feature_parent_cell_id",), + "time_cell": ("feature_parent_cell_elapsed_time",), + "segmentation_mask": ("2d segmentation mask",), + } + new_feature_var_names = { + k: feature_standard_names[k][0] + for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys() } - new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() - if k in TrackedFeatures.variables.keys()} - TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) + TrackedFeatures = TrackedFeatures.drop(["cell_parent_track_id"]) # Combine Track and Mask variables. 
Use the 'feature' variable as the coordinate variable instead of # the 'index' variable and call the dimension 'feature' - ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), - Mask]) + ds = xr.merge( + [ + TrackedFeatures.swap_dims({"index": "feature"}) + .drop("index") + .rename_vars(new_feature_var_names), + Mask, + ] + ) # Add the projection data back in if not None in Projection: - ds['ProjectionCoordinateSystem']=Projection + ds["ProjectionCoordinateSystem"] = Projection # Convert the cell ID variable from float to integer - if 'int' not in str(TrackedFeatures.cell.dtype): + if "int" not in str(TrackedFeatures.cell.dtype): # The raw output from the tracking is actually an object array # array([nan, 2, 3], dtype=object) # (and is cast to a float array when saved as NetCDF, I think). # Cast to float. - int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + int_cell = xr.zeros_like(TrackedFeatures.cell, dtype="int64") - cell_id_data = TrackedFeatures.cell.astype('float64') + cell_id_data = TrackedFeatures.cell.astype("float64") valid_cell = np.isfinite(cell_id_data) valid_cell_ids = cell_id_data[valid_cell] if not (np.unique(valid_cell_ids) > 0).all(): - raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') - int_cell[valid_cell] = valid_cell_ids.astype('int64') - #ds['feature_parent_cell_id'] = int_cell + raise AssertionError( + "Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell" + ) + int_cell[valid_cell] = valid_cell_ids.astype("int64") + # ds['feature_parent_cell_id'] = int_cell return ds - -def merge_split(TRACK,distance = 25000,frame_len = 5): - ''' +def merge_split(TRACK, distance=25000, frame_len=5): + """ function to postprocess tobac track data for merge/split cells - - + + Parameters ---------- TRACK : xarray.core.dataset.Dataset xarray dataset of tobac Track information - - distance : float, optional - Distance threshold prior to adding a pair of points into the minimum spanning tree. + + distance : float, optional + Distance threshold prior to adding a pair of points into the minimum spanning tree. Default is 25000 meters. - + frame_len : float, optional - Threshold for the spanning length within which two points can be separated. + Threshold for the spanning length within which two points can be separated. Default is five (5) frames. Returns - ------- + ------- d : xarray.core.dataset.Dataset xarray dataset of tobac merge/split cells with parent and child designations. 
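For orientation, the distance-threshold and spanning-tree step that merge_split performs can be sketched on toy data: connect each cell's last position to every other cell's first position within `distance`, then reduce the graph to a minimum spanning tree. The ids and coordinates below are hypothetical, and the real function additionally discards tree edges whose cells are more than frame_len frames apart.

import networkx as nx
import numpy as np

# Hypothetical track endpoints in meters: cell 1 ends near where
# cell 3 begins, so the two should be linked into one track.
ends = {1: (0.0, 0.0), 2: (50000.0, 50000.0), 3: (1000.0, 500.0)}
starts = {1: (0.0, 100.0), 2: (50000.0, 49000.0), 3: (800.0, 400.0)}
distance = 25000.0  # same role as the distance threshold above

g = nx.Graph()
for i, (ax, ay) in ends.items():
    for j, (bx, by) in starts.items():
        if i == j:
            continue  # a track must not match its own tail
        d = np.hypot(ax - bx, ay - by)
        if d <= distance:
            g.add_edge(i, j, weight=d)

print([edge[:2] for edge in nx.minimum_spanning_edges(g)])  # e.g. [(1, 3)]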
- - + + Example usage: d = merge_split(Track) ds = standardize_track_dataset(Track, refl_mask) both_ds = xr.merge([ds, d],compat ='override') both_ds = compress_all(both_ds) both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - - ''' - - print('this is an update 3') - track_groups = TRACK.groupby('cell') - cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + + """ + + print("this is an update 3") + track_groups = TRACK.groupby("cell") + cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} id_data = np.fromiter(cell_ids.keys(), dtype=int) count_data = np.fromiter(cell_ids.values(), dtype=int) all_frames = np.sort(np.unique(TRACK.frame)) @@ -208,42 +245,43 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): a_names = list() b_names = list() dist = list() - - if hasattr(TRACK, 'grid_longitude'): - print('is in lat/lonx') + + if hasattr(TRACK, "longitude"): + print("is in lat/lon") for i in id_data: a_pointx = track_groups[i].grid_longitude[-1].values a_pointy = track_groups[i].grid_latitude[-1].values for j in id_data: b_pointx = track_groups[j].grid_longitude[0].values b_pointy = track_groups[j].grid_latitude[0].values - d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).m + d = geodesic((a_pointy, a_pointx), (b_pointy, b_pointx)).m if d <= distance: - a_points.append([a_pointx,a_pointy]) + a_points.append([a_pointx, a_pointy]) b_points.append([b_pointx, b_pointy]) dist.append(d) a_names.append(i) b_names.append(j) else: - + for i in id_data: - #print(i) + # print(i) a_pointx = track_groups[i].projection_x_coordinate[-1].values a_pointy = track_groups[i].projection_y_coordinate[-1].values for j in id_data: b_pointx = track_groups[j].projection_x_coordinate[0].values b_pointy = track_groups[j].projection_y_coordinate[0].values - d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + d = np.linalg.norm( + np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) + ) if d <= distance: - a_points.append([a_pointx,a_pointy]) + a_points.append([a_pointx, a_pointy]) b_points.append([b_pointx, b_pointy]) dist.append(d) a_names.append(i) b_names.append(j) - id = [] - for i in range(len(dist)-1, -1, -1): + for i in range(len(dist) - 1, -1, -1): if a_names[i] == b_names[i]: id.append(i) a_points.pop(i) @@ -253,25 +291,27 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): b_names.pop(i) else: continue - + g = Graph() for i in np.arange(len(dist)): - g.add_edge(a_names[i], b_names[i],weight=dist[i]) + g.add_edge(a_names[i], b_names[i], weight=dist[i]) tree = minimum_spanning_edges(g) tree_list = list(minimum_spanning_edges(g)) new_tree = [] - for i,j in enumerate(tree_list): + for i, j in enumerate(tree_list): frame_a = np.nanmax(track_groups[j[0]].frame.values) frame_b = np.nanmin(track_groups[j[1]].frame.values) if np.abs(frame_a - frame_b) <= frame_len: new_tree.append(tree_list[i][0:2]) new_tree_arr = np.array(new_tree) - TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) - cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) - track_id = dict() #same size as number of total merged tracks + TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) + cell_id = np.unique( + TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + ) + track_id = dict() # same size as number of total merged tracks arr = np.array([0]) for p in cell_id: @@ -282,7 +322,7 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): k = 
np.where(new_tree_arr == p) if len(k[0]) == 0: track_id[p] = [p] - arr = np.append(arr,p) + arr = np.append(arr, p) else: temp1 = list(np.unique(new_tree_arr[k[0]])) temp = list(np.unique(new_tree_arr[k[0]])) @@ -297,112 +337,113 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): if len(temp1) == len(temp): break temp1 = np.array(temp) - + for i in temp1: k2 = np.where(new_tree_arr == i) temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) temp = list(flatten(temp)) temp = list(np.unique(temp)) - arr = np.append(arr,np.unique(temp)) - - track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - - + arr = np.append(arr, np.unique(temp)) - storm_id = [0] #default because we don't track larger storm systems *yet* - print('found storm id') + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + storm_id = [0] # default because we don't track larger storm systems *yet* + print("found storm id") - track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* - print('found track parent storm ids') + track_parent_storm_id = np.repeat( + 0, len(track_id) + ) # This will always be zero when we don't track larger storm systems *yet* + print("found track parent storm ids") track_ids = np.array(list(track_id.keys())) - print('found track ids') - + print("found track ids") - cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) - print('found cell ids') + cell_id = list( + np.unique( + TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + ) + ) + print("found cell ids") cell_parent_track_id = [] for i, id in enumerate(track_id): - - if len(track_id[int(id)]) == 1: + + if len(track_id[int(id)]) == 1: cell_parent_track_id.append(int(id)) else: - cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) - + cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) cell_parent_track_id = list(flatten(cell_parent_track_id)) - print('found cell parent track ids') + print("found cell parent track ids") feature_parent_cell_id = list(TRACK.cell.values.astype(float)) - print('found feature parent cell ids') + print("found feature parent cell ids") - #This version includes all the feature regardless of if they are used in cells or not. + # This version includes all the feature regardless of if they are used in cells or not. 
feature_id = list(TRACK.feature.values.astype(int)) - print('found feature ids') + print("found feature ids") - feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm - print('found feature parent storm ids') + feature_parent_storm_id = np.repeat(0, len(feature_id)) # we don't do storms atm + print("found feature parent storm ids") - feature_parent_track_id = [] + feature_parent_track_id = [] feature_parent_track_id = np.zeros(len(feature_id)) for i, id in enumerate(feature_id): cellid = feature_parent_cell_id[i] if np.isnan(cellid): - feature_parent_track_id[i] = -1 + feature_parent_track_id[i] = -1 else: j = np.where(cell_id == cellid) j = np.squeeze(j) trackid = cell_parent_track_id[j] feature_parent_track_id[i] = trackid - - print('found feature parent track ids') + print("found feature parent track ids") storm_child_track_count = [len(track_id)] - print('found storm child track count') + print("found storm child track count") track_child_cell_count = [] - for i,id in enumerate(track_id): + for i, id in enumerate(track_id): track_child_cell_count.append(len(track_id[int(id)])) - print('found track child cell count') - + print("found track child cell count") cell_child_feature_count = [] - for i,id in enumerate(cell_id): + for i, id in enumerate(cell_id): cell_child_feature_count.append(len(track_groups[id].feature.values)) - print('found cell child feature count') + print("found cell child feature count") - storm_child_cell_count = [len(cell_id)] + storm_child_cell_count = [len(cell_id)] storm_child_feature_count = [len(feature_id)] - - storm_dim = 'nstorms' - track_dim = 'ntracks' - cell_dim = 'ncells' - feature_dim = 'nfeatures' - - d = xr.Dataset({ - 'storm_id': (storm_dim, storm_id), - 'track_id': (track_dim, track_ids), - 'track_parent_storm_id': (track_dim, track_parent_storm_id), - 'cell_id': (cell_dim, cell_id), - 'cell_parent_track_id': (cell_dim, cell_parent_track_id), - 'feature_id': (feature_dim, feature_id), - 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), - 'feature_parent_track_id': (feature_dim, feature_parent_track_id), - 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), - 'storm_child_track_count': (storm_dim, storm_child_track_count), - 'storm_child_cell_count': (storm_dim, storm_child_cell_count), - 'storm_child_feature_count': (storm_dim, storm_child_feature_count), - 'track_child_cell_count': (track_dim, track_child_cell_count), - 'cell_child_feature_count': (cell_dim, cell_child_feature_count), - }) - d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + storm_dim = "nstorms" + track_dim = "ntracks" + cell_dim = "ncells" + feature_dim = "nfeatures" + + d = xr.Dataset( + { + "storm_id": (storm_dim, storm_id), + "track_id": (track_dim, track_ids), + "track_parent_storm_id": (track_dim, track_parent_storm_id), + "cell_id": (cell_dim, cell_id), + "cell_parent_track_id": (cell_dim, cell_parent_track_id), + "feature_id": (feature_dim, feature_id), + "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), + "feature_parent_track_id": (feature_dim, feature_parent_track_id), + "feature_parent_storm_id": (feature_dim, feature_parent_storm_id), + "storm_child_track_count": (storm_dim, storm_child_track_count), + "storm_child_cell_count": (storm_dim, storm_child_cell_count), + "storm_child_feature_count": (storm_dim, storm_child_feature_count), + "track_child_cell_count": (track_dim, track_child_cell_count), + "cell_child_feature_count": (cell_dim, cell_child_feature_count), + } + ) + d = 
d.set_coords(["feature_id", "cell_id", "track_id", "storm_id"]) assert len(track_id) == len(track_parent_storm_id) assert len(cell_id) == len(cell_parent_track_id) @@ -411,6 +452,8 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): assert sum(storm_child_cell_count) == len(cell_id) assert sum(storm_child_feature_count) == len(feature_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) - - return d \ No newline at end of file + assert sum( + [sum(cell_child_feature_count), (len(np.where(feature_parent_track_id < 0)[0]))] + ) == len(feature_id) + + return d From cbc80f8faa88d1301b995968769d48d5a9fb11df Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Wed, 6 Jul 2022 19:35:35 -0600 Subject: [PATCH 06/29] Allowed optional import of additional packages: geopy, and networkx. Allowed optional import of additional packages: geopy, and networkx. Now a message will print if the package is not available to import. --- tobac/merge_split.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index c58be5aa..f157852c 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -8,9 +8,27 @@ """ - -from geopy.distance import geodesic -from networkx import * +try: + import geopy +except ImportError: + geopy = None + +if geopy: + from geopy.distance import geodesic +else: + print("You could be merge/splitting in lat/lons if you had geopy.") + +try: + import networkx +except ImportError: + networkx = None + +if networkx: + from networkx import * +else: + print("Cannot Merge/Split. Please install networkx.") +# from geopy.distance import geodesic +# from networkx import * import numpy as np from pandas.core.common import flatten import xarray as xr From 6ba5d41ae516bc9f376971d08f25e4d8f427c19c Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Thu, 18 Aug 2022 16:03:21 -0500 Subject: [PATCH 07/29] Updated with suggestions from code reviewers, and tobac v1.3. Updated to tobac v1.3.2, and made suggested changes. Merge_split.py now ingests a pandas dataframe, ancillary functions have been moved to utils.py. --- tobac/merge_split.py | 346 ++++++++++--------------------------------- tobac/utils.py | 169 +++++++++++++++++++++ 2 files changed, 245 insertions(+), 270 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index f157852c..c64b30bc 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -8,225 +8,16 @@ """ -try: - import geopy -except ImportError: - geopy = None - -if geopy: - from geopy.distance import geodesic -else: - print("You could be merge/splitting in lat/lons if you had geopy.") - -try: - import networkx -except ImportError: - networkx = None - -if networkx: - from networkx import * -else: - print("Cannot Merge/Split. Please install networkx.") -# from geopy.distance import geodesic -# from networkx import * -import numpy as np -from pandas.core.common import flatten -import xarray as xr - - -def compress_all(nc_grids, min_dims=2): - """ - The purpose of this subroutine is to compress the netcdf variables as they are saved - this does not change the data, but sets netcdf encoding parameters. - We allocate a minimum number of dimensions as variables with dimensions - under the minimum value do not benefit from tangibly from this encoding. 
- - Parameters - ---------- - nc_grids : xarray.core.dataset.Dataset - Xarray dataset that is intended to be exported as netcdf - - min_dims : integer - The minimum number of dimesnions, in integer value, a variable must have in order - set the netcdf compression encoding. - - Returns - ------- - nc_grids : xarray.core.dataset.Dataset - Xarray dataset with netcdf compression encoding for variables with two (2) or more dimensions - - """ - - for var in nc_grids: - if len(nc_grids[var].dims) >= min_dims: - # print("Compressing ", var) - nc_grids[var].encoding["zlib"] = True - nc_grids[var].encoding["complevel"] = 4 - nc_grids[var].encoding["contiguous"] = False - return nc_grids - - -def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): - """ - Combine a feature mask with the feature data table into a common dataset. - - returned by tobac.themes.tobac_v1.segmentation - with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. - - Also rename the variables to be more desciptive and comply with cf-tree. - - Convert the default cell parent ID to an integer table. - - Add a cell dimension to reflect - - Projection is an xarray DataArray - - TODO: Add metadata attributes - - Parameters - ---------- - TrackedFeatures : xarray.core.dataset.Dataset - xarray dataset of tobac Track information, the xarray dataset returned by tobac.tracking.linking_trackpy - - Mask: xarray.core.dataset.Dataset - xarray dataset of tobac segmentation mask information, the xarray dataset returned - by tobac.segmentation.segmentation - - - Projection : xarray.core.dataarray.DataArray, default = None - array.DataArray of the original input dataset (gridded nexrad data for example). - If using gridded nexrad data, this can be input as: data['ProjectionCoordinateSystem'] - An example of the type of information in the dataarray includes the following attributes: - latitude_of_projection_origin :29.471900939941406 - longitude_of_projection_origin :-95.0787353515625 - _CoordinateTransformType :Projection - _CoordinateAxes :x y z time - _CoordinateAxesTypes :GeoX GeoY Height Time - grid_mapping_name :azimuthal_equidistant - semi_major_axis :6370997.0 - inverse_flattening :298.25 - longitude_of_prime_meridian :0.0 - false_easting :0.0 - false_northing :0.0 - - Returns - ------- - - ds : xarray.core.dataset.Dataset - xarray dataset of merged Track and Segmentation Mask datasets with renamed variables. - """ - feature_standard_names = { - # new variable name, and long description for the NetCDF attribute - "frame": ( - "feature_time_index", - "positional index of the feature along the time dimension of the mask, from 0 to N-1", - ), - "hdim_1": ( - "feature_hdim1_coordinate", - "position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x)." 
- "The numbering is consistent with positional indexing of the coordinate, but can be" - "fractional, to account for a centroid not aligned to the grid.", - ), - "hdim_2": ( - "feature_hdim2_coordinate", - "position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)" - "The numbering is consistent with positional indexing of the coordinate, but can be" - "fractional, to account for a centroid not aligned to the grid.", - ), - "idx": ("feature_id_this_frame",), - "num": ( - "feature_grid_cell_count", - "Number of grid points that are within the threshold of this feature", - ), - "threshold_value": ( - "feature_threshold_max", - "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", - ), - "feature": ( - "feature_id", - "Unique number of the feature; starts from 1 and increments by 1 to the number of features", - ), - "time": ( - "feature_time", - "time of the feature, consistent with feature_time_index", - ), - "timestr": ( - "feature_time_str", - "String representation of the feature time, YYYY-MM-DD HH:MM:SS", - ), - "projection_y_coordinate": ( - "feature_projection_y_coordinate", - "y position of the feature in the projection given by ProjectionCoordinateSystem", - ), - "projection_x_coordinate": ( - "feature_projection_x_coordinate", - "x position of the feature in the projection given by ProjectionCoordinateSystem", - ), - "lat": ("feature_latitude", "latitude of the feature"), - "lon": ("feature_longitude", "longitude of the feature"), - "ncells": ( - "feature_ncells", - "number of grid cells for this feature (meaning uncertain)", - ), - "areas": ("feature_area",), - "isolated": ("feature_isolation_flag",), - "num_objects": ("number_of_feature_neighbors",), - "cell": ("feature_parent_cell_id",), - "time_cell": ("feature_parent_cell_elapsed_time",), - "segmentation_mask": ("2d segmentation mask",), - } - new_feature_var_names = { - k: feature_standard_names[k][0] - for k in feature_standard_names.keys() - if k in TrackedFeatures.variables.keys() - } - - TrackedFeatures = TrackedFeatures.drop(["cell_parent_track_id"]) - # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of - # the 'index' variable and call the dimension 'feature' - ds = xr.merge( - [ - TrackedFeatures.swap_dims({"index": "feature"}) - .drop("index") - .rename_vars(new_feature_var_names), - Mask, - ] - ) - - # Add the projection data back in - if not None in Projection: - ds["ProjectionCoordinateSystem"] = Projection - - # Convert the cell ID variable from float to integer - if "int" not in str(TrackedFeatures.cell.dtype): - # The raw output from the tracking is actually an object array - # array([nan, 2, 3], dtype=object) - # (and is cast to a float array when saved as NetCDF, I think). - # Cast to float. 
- int_cell = xr.zeros_like(TrackedFeatures.cell, dtype="int64") - - cell_id_data = TrackedFeatures.cell.astype("float64") - valid_cell = np.isfinite(cell_id_data) - valid_cell_ids = cell_id_data[valid_cell] - if not (np.unique(valid_cell_ids) > 0).all(): - raise AssertionError( - "Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell" - ) - int_cell[valid_cell] = valid_cell_ids.astype("int64") - # ds['feature_parent_cell_id'] = int_cell - return ds - - -def merge_split(TRACK, distance=25000, frame_len=5): +def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): """ function to postprocess tobac track data for merge/split cells Parameters ---------- - TRACK : xarray.core.dataset.Dataset - xarray dataset of tobac Track information + TRACK : pandas.core.frame.DataFrame + Pandas dataframe of tobac Track information distance : float, optional Distance threshold prior to adding a pair of points into the minimum spanning tree. @@ -236,6 +27,10 @@ def merge_split(TRACK, distance=25000, frame_len=5): Threshold for the spanning length within which two points can be separated. Default is five (5) frames. + dxy : float, optional + The x/y/ grid spacing of the data. + Default is 500m. + Returns ------- @@ -252,7 +47,38 @@ def merge_split(TRACK, distance=25000, frame_len=5): """ - print("this is an update 3") + try: + import geopy + except ImportError: + geopy = None + + if geopy: + from geopy.distance import geodesic + else: + print( + "Geopy not available, Merge/Split will proceed in tobac general coordinates." + ) + + try: + import networkx as nx + except ImportError: + networkx = None + + # if networkx: + # import networkx as nx + # else: + # print("Cannot Merge/Split. Please install networkx.") + import logging + import numpy as np + from pandas.core.common import flatten + import xarray as xr + + # Check if dxy is in meters of in kilometers. It should be in meters + if dxy <= 5: + dxy *= 1000 + + # Immediately convert pandas dataframe of track information to xarray: + TRACK = TRACK.to_xarray() track_groups = TRACK.groupby("cell") cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} id_data = np.fromiter(cell_ids.keys(), dtype=int) @@ -283,11 +109,11 @@ def merge_split(TRACK, distance=25000, frame_len=5): for i in id_data: # print(i) - a_pointx = track_groups[i].projection_x_coordinate[-1].values - a_pointy = track_groups[i].projection_y_coordinate[-1].values + a_pointx = track_groups[i].hdim_2[-1].values * dxy + a_pointy = track_groups[i].hdim_1[-1].values * dxy for j in id_data: - b_pointx = track_groups[j].projection_x_coordinate[0].values - b_pointy = track_groups[j].projection_y_coordinate[0].values + b_pointx = track_groups[j].hdim_2[0].values * dxy + b_pointy = track_groups[j].hdim_1[0].values * dxy d = np.linalg.norm( np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) ) @@ -299,6 +125,8 @@ def merge_split(TRACK, distance=25000, frame_len=5): b_names.append(j) id = [] + # This is removing any tracks that have matched to themselves - e.g., + # a beginning of a track has found its tail. 
for i in range(len(dist) - 1, -1, -1): if a_names[i] == b_names[i]: id.append(i) @@ -310,12 +138,12 @@ def merge_split(TRACK, distance=25000, frame_len=5): else: continue - g = Graph() + g = nx.Graph() for i in np.arange(len(dist)): g.add_edge(a_names[i], b_names[i], weight=dist[i]) - tree = minimum_spanning_edges(g) - tree_list = list(minimum_spanning_edges(g)) + tree = nx.minimum_spanning_edges(g) + tree_list = list(tree) new_tree = [] for i, j in enumerate(tree_list): @@ -327,7 +155,7 @@ def merge_split(TRACK, distance=25000, frame_len=5): TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) cell_id = np.unique( - TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] ) track_id = dict() # same size as number of total merged tracks @@ -366,23 +194,15 @@ def merge_split(TRACK, distance=25000, frame_len=5): track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - storm_id = [0] # default because we don't track larger storm systems *yet* - print("found storm id") - - track_parent_storm_id = np.repeat( - 0, len(track_id) - ) # This will always be zero when we don't track larger storm systems *yet* - print("found track parent storm ids") - track_ids = np.array(list(track_id.keys())) - print("found track ids") + logging.debug("found track ids") cell_id = list( np.unique( - TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] ) ) - print("found cell ids") + logging.debug("found cell ids") cell_parent_track_id = [] @@ -395,18 +215,14 @@ def merge_split(TRACK, distance=25000, frame_len=5): cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) cell_parent_track_id = list(flatten(cell_parent_track_id)) - print("found cell parent track ids") - - feature_parent_cell_id = list(TRACK.cell.values.astype(float)) + logging.debug("found cell parent track ids") - print("found feature parent cell ids") + feature_parent_cell_id = list(TRACK.cell.values.astype(int)) + logging.debug("found feature parent cell ids") # This version includes all the feature regardless of if they are used in cells or not. 
feature_id = list(TRACK.feature.values.astype(int)) - print("found feature ids") - - feature_parent_storm_id = np.repeat(0, len(feature_id)) # we don't do storms atm - print("found feature parent storm ids") + logging.debug("found feature ids") feature_parent_track_id = [] feature_parent_track_id = np.zeros(len(feature_id)) @@ -420,58 +236,48 @@ def merge_split(TRACK, distance=25000, frame_len=5): trackid = cell_parent_track_id[j] feature_parent_track_id[i] = trackid - print("found feature parent track ids") - - storm_child_track_count = [len(track_id)] - print("found storm child track count") + logging.debug("found feature parent track ids") track_child_cell_count = [] for i, id in enumerate(track_id): track_child_cell_count.append(len(track_id[int(id)])) - print("found track child cell count") + logging.debug("found track child cell count") cell_child_feature_count = [] for i, id in enumerate(cell_id): cell_child_feature_count.append(len(track_groups[id].feature.values)) - print("found cell child feature count") - - storm_child_cell_count = [len(cell_id)] - storm_child_feature_count = [len(feature_id)] + logging.debug("found cell child feature count") - storm_dim = "nstorms" - track_dim = "ntracks" - cell_dim = "ncells" - feature_dim = "nfeatures" + track_dim = "tracks" + cell_dim = "cells" + feature_dim = "features" d = xr.Dataset( { - "storm_id": (storm_dim, storm_id), - "track_id": (track_dim, track_ids), - "track_parent_storm_id": (track_dim, track_parent_storm_id), - "cell_id": (cell_dim, cell_id), + "track": (track_dim, track_ids), + "cell": (cell_dim, cell_id), "cell_parent_track_id": (cell_dim, cell_parent_track_id), - "feature_id": (feature_dim, feature_id), + "feature": (feature_dim, feature_id), "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), "feature_parent_track_id": (feature_dim, feature_parent_track_id), - "feature_parent_storm_id": (feature_dim, feature_parent_storm_id), - "storm_child_track_count": (storm_dim, storm_child_track_count), - "storm_child_cell_count": (storm_dim, storm_child_cell_count), - "storm_child_feature_count": (storm_dim, storm_child_feature_count), "track_child_cell_count": (track_dim, track_child_cell_count), "cell_child_feature_count": (cell_dim, cell_child_feature_count), } ) - d = d.set_coords(["feature_id", "cell_id", "track_id", "storm_id"]) - assert len(track_id) == len(track_parent_storm_id) + d = d.set_coords(["feature", "cell", "track"]) + assert len(cell_id) == len(cell_parent_track_id) assert len(feature_id) == len(feature_parent_cell_id) - assert sum(storm_child_track_count) == len(track_id) - assert sum(storm_child_cell_count) == len(cell_id) - assert sum(storm_child_feature_count) == len(feature_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum( - [sum(cell_child_feature_count), (len(np.where(feature_parent_track_id < 0)[0]))] - ) == len(feature_id) + assert ( + sum( + [ + sum(cell_child_feature_count[1:]), + (len(np.where(feature_parent_track_id < 0)[0])), + ] + ) + == len(feature_id) + ) return d diff --git a/tobac/utils.py b/tobac/utils.py index 2509b97f..ca44e02c 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -578,3 +578,172 @@ def get_indices_of_labels_from_reg_prop_dict(region_property_dict): curr_loc_indices[index] = len(curr_y_ixs) return (curr_loc_indices, y_indices, x_indices) + + +def compress_all(nc_grids, min_dims=2, comp_level=4): + """ + The purpose of this subroutine is to compress the netcdf variables as they are saved. 
+    This does not change the data, but sets netcdf encoding parameters.
+    We allocate a minimum number of dimensions as variables with dimensions
+    under the minimum value do not benefit tangibly from this encoding.
+
+    Parameters
+    ----------
+    nc_grids : xarray.core.dataset.Dataset
+        Xarray dataset that is intended to be exported as netcdf
+
+    min_dims : integer
+        The minimum number of dimensions, in integer value, a variable must have in order
+        to set the netcdf compression encoding.
+    comp_level : integer
+        The level of compression. Default value is 4.
+
+    Returns
+    -------
+    nc_grids : xarray.core.dataset.Dataset
+        Xarray dataset with netcdf compression encoding for variables with two (2) or more dimensions
+
+    """
+
+    for var in nc_grids:
+        if len(nc_grids[var].dims) >= min_dims:
+            # print("Compressing ", var)
+            nc_grids[var].encoding["zlib"] = True
+            nc_grids[var].encoding["complevel"] = comp_level
+            nc_grids[var].encoding["contiguous"] = False
+    return nc_grids
+
+
+def standardize_track_dataset(TrackedFeatures, Mask, Projection=None):
+    """
+    Combine a feature mask with the feature data table into a common dataset.
+
+    Merges the segmentation mask returned by tobac.segmentation
+    with the TrackedFeatures dataset returned by tobac.linking_trackpy.
+
+    Also rename the variables to be more descriptive and comply with cf-tree.
+
+    Convert the default cell parent ID to an integer table.
+
+    Add a cell dimension to reflect
+
+    Projection is an xarray DataArray
+
+    TODO: Add metadata attributes
+
+    Parameters
+    ----------
+    TrackedFeatures : xarray.core.dataset.Dataset
+        xarray dataset of tobac Track information, the xarray dataset returned by tobac.tracking.linking_trackpy
+
+    Mask: xarray.core.dataset.Dataset
+        xarray dataset of tobac segmentation mask information, the xarray dataset returned
+        by tobac.segmentation.segmentation
+
+
+    Projection : xarray.core.dataarray.DataArray, default = None
+        xarray.DataArray of the original input dataset (gridded nexrad data for example).
+        If using gridded nexrad data, this can be input as: data['ProjectionCoordinateSystem']
+        An example of the type of information in the dataarray includes the following attributes:
+        latitude_of_projection_origin :29.471900939941406
+        longitude_of_projection_origin :-95.0787353515625
+        _CoordinateTransformType :Projection
+        _CoordinateAxes :x y z time
+        _CoordinateAxesTypes :GeoX GeoY Height Time
+        grid_mapping_name :azimuthal_equidistant
+        semi_major_axis :6370997.0
+        inverse_flattening :298.25
+        longitude_of_prime_meridian :0.0
+        false_easting :0.0
+        false_northing :0.0
+
+    Returns
+    -------
+
+    ds : xarray.core.dataset.Dataset
+        xarray dataset of merged Track and Segmentation Mask datasets with renamed variables.
+
+    """
+    feature_standard_names = {
+        # new variable name, and long description for the NetCDF attribute
+        "frame": (
+            "feature_time_index",
+            "positional index of the feature along the time dimension of the mask, from 0 to N-1",
+        ),
+        "hdim_1": (
+            "feature_hdim1_coordinate",
+            "position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x). "
+ "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "hdim_2": ( + "feature_hdim2_coordinate", + "position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)" + "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "idx": ("feature_id_this_frame",), + "num": ( + "feature_grid_cell_count", + "Number of grid points that are within the threshold of this feature", + ), + "threshold_value": ( + "feature_threshold_max", + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", + ), + "feature": ( + "feature_id", + "Unique number of the feature; starts from 1 and increments by 1 to the number of features", + ), + "time": ( + "feature_time", + "time of the feature, consistent with feature_time_index", + ), + "timestr": ( + "feature_time_str", + "String representation of the feature time, YYYY-MM-DD HH:MM:SS", + ), + "projection_y_coordinate": ( + "feature_projection_y_coordinate", + "y position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "projection_x_coordinate": ( + "feature_projection_x_coordinate", + "x position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "lat": ("feature_latitude", "latitude of the feature"), + "lon": ("feature_longitude", "longitude of the feature"), + "ncells": ( + "feature_ncells", + "number of grid cells for this feature (meaning uncertain)", + ), + "areas": ("feature_area",), + "isolated": ("feature_isolation_flag",), + "num_objects": ("number_of_feature_neighbors",), + "cell": ("feature_parent_cell_id",), + "time_cell": ("feature_parent_cell_elapsed_time",), + "segmentation_mask": ("2d segmentation mask",), + } + new_feature_var_names = { + k: feature_standard_names[k][0] + for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys() + } + + # TrackedFeatures = TrackedFeatures.drop(["cell_parent_track_id"]) + # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of + # the 'index' variable and call the dimension 'feature' + ds = xr.merge( + [ + TrackedFeatures.swap_dims({"index": "feature"}) + .drop("index") + .rename_vars(new_feature_var_names), + Mask, + ] + ) + + # Add the projection data back in + if not None in Projection: + ds["ProjectionCoordinateSystem"] = Projection + + return ds From 9d058577cd029ded3af7f9dfe3afe7425b34ed06 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Wed, 7 Sep 2022 12:19:56 -0500 Subject: [PATCH 08/29] Black formatting --- tobac/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tobac/utils.py b/tobac/utils.py index ca44e02c..73683c4c 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -664,6 +664,8 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): xarray dataset of merged Track and Segmentation Mask datasets with renamed variables. 
""" + import xarray as xr + feature_standard_names = { # new variable name, and long description for the NetCDF attribute "frame": ( From a25f9842edc3b4f3a5509dc0ebaa613b6cf55ed3 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Wed, 7 Sep 2022 12:21:57 -0500 Subject: [PATCH 09/29] Black formatting --- tobac/merge_split.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index c64b30bc..b0f482b0 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -270,14 +270,11 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): assert len(cell_id) == len(cell_parent_track_id) assert len(feature_id) == len(feature_parent_cell_id) assert sum(track_child_cell_count) == len(cell_id) - assert ( - sum( - [ - sum(cell_child_feature_count[1:]), - (len(np.where(feature_parent_track_id < 0)[0])), - ] - ) - == len(feature_id) - ) + assert sum( + [ + sum(cell_child_feature_count[1:]), + (len(np.where(feature_parent_track_id < 0)[0])), + ] + ) == len(feature_id) return d From c25f588d6f27adf157684229ebe6aaa53561ae21 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 18 Sep 2022 11:33:33 -0500 Subject: [PATCH 10/29] Updates to documentation and merge_split Removed the lat/lon in merge_split, and added documentation. --- doc/merge_split.rst | 19 +++++++++ tobac/merge_split.py | 96 ++++++++++++++++++-------------------------- 2 files changed, 57 insertions(+), 58 deletions(-) create mode 100644 doc/merge_split.rst diff --git a/doc/merge_split.rst b/doc/merge_split.rst new file mode 100644 index 00000000..ca02f291 --- /dev/null +++ b/doc/merge_split.rst @@ -0,0 +1,19 @@ +*Merge and Split +====================== + + +This submodule is a post processing step to address tracked cells which merge/split. +The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. +This submodule will label merged/split cells with a TRACK number in addition to its CELL number. + +Features, cells, and tracks are combined using parent/child nomenclature. +(quick note on terms; “feature” is a detected object at a single time step. “cell” is a series of features linked together over multiple timesteps. "track" may be an individual cell or series of cells which have merged and/or split.) + + +Overview of the output dataframe from merge_split + - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. + - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. + - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. + - track_child_cell_count: The total number of features belonging to all child cells of a given track id. + - cell_child_feature_count: The total number of features for each cell. + diff --git a/tobac/merge_split.py b/tobac/merge_split.py index b0f482b0..fd61d93d 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -1,10 +1,9 @@ """ Tobac merge and split v0.1 This submodule is a post processing step to address tracked cells which merge/split. - The first iteration of this module is to combine the cells which are merging but receive - a new cell id once merged. 
In general this submodule will label these three cell-ids using - the largest cell number of those within the merged/split cell ids. - + The first iteration of this module is to combine the cells which are merging but have received + a new cell id (and are considered a new cell) once merged. In general this submodule will label merged/split cells + with a TRACK number in addition to its CELL number. """ @@ -37,28 +36,23 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): d : xarray.core.dataset.Dataset xarray dataset of tobac merge/split cells with parent and child designations. + Parent/child variables include: + - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. + - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. + - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. + - track_child_cell_count: The total number of features belonging to all child cells of a given track id. + - cell_child_feature_count: The total number of features for each cell. + Example usage: d = merge_split(Track) - ds = standardize_track_dataset(Track, refl_mask) + ds = tobac.utils.standardize_track_dataset(Track, refl_mask) both_ds = xr.merge([ds, d],compat ='override') - both_ds = compress_all(both_ds) + both_ds = tobac.utils.compress_all(both_ds) both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) """ - try: - import geopy - except ImportError: - geopy = None - - if geopy: - from geopy.distance import geodesic - else: - print( - "Geopy not available, Merge/Split will proceed in tobac general coordinates." 
- ) - try: import networkx as nx except ImportError: @@ -90,39 +84,22 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): b_names = list() dist = list() - if hasattr(TRACK, "longitude"): - print("is in lat/lon") - for i in id_data: - a_pointx = track_groups[i].grid_longitude[-1].values - a_pointy = track_groups[i].grid_latitude[-1].values - for j in id_data: - b_pointx = track_groups[j].grid_longitude[0].values - b_pointy = track_groups[j].grid_latitude[0].values - d = geodesic((a_pointy, a_pointx), (b_pointy, b_pointx)).m - if d <= distance: - a_points.append([a_pointx, a_pointy]) - b_points.append([b_pointx, b_pointy]) - dist.append(d) - a_names.append(i) - b_names.append(j) - else: - - for i in id_data: - # print(i) - a_pointx = track_groups[i].hdim_2[-1].values * dxy - a_pointy = track_groups[i].hdim_1[-1].values * dxy - for j in id_data: - b_pointx = track_groups[j].hdim_2[0].values * dxy - b_pointy = track_groups[j].hdim_1[0].values * dxy - d = np.linalg.norm( - np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) - ) - if d <= distance: - a_points.append([a_pointx, a_pointy]) - b_points.append([b_pointx, b_pointy]) - dist.append(d) - a_names.append(i) - b_names.append(j) + for i in id_data: + # print(i) + a_pointx = track_groups[i].hdim_2[-1].values * dxy + a_pointy = track_groups[i].hdim_1[-1].values * dxy + for j in id_data: + b_pointx = track_groups[j].hdim_2[0].values * dxy + b_pointy = track_groups[j].hdim_1[0].values * dxy + d = np.linalg.norm( + np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) + ) + if d <= distance: + a_points.append([a_pointx, a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) id = [] # This is removing any tracks that have matched to themselves - e.g., @@ -137,7 +114,7 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): b_names.pop(i) else: continue - + # This is inputting data to the object with will perform the spanning tree. g = nx.Graph() for i in np.arange(len(dist)): g.add_edge(a_names[i], b_names[i], weight=dist[i]) @@ -270,11 +247,14 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): assert len(cell_id) == len(cell_parent_track_id) assert len(feature_id) == len(feature_parent_cell_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum( - [ - sum(cell_child_feature_count[1:]), - (len(np.where(feature_parent_track_id < 0)[0])), - ] - ) == len(feature_id) + assert ( + sum( + [ + sum(cell_child_feature_count[1:]), + (len(np.where(feature_parent_track_id < 0)[0])), + ] + ) + == len(feature_id) + ) return d From 6327c8dd5dbc6942f0a64f6b65727c52915ed613 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 18 Sep 2022 11:38:50 -0500 Subject: [PATCH 11/29] Formatting --- build/lib/tobac/merge_split.py | 259 +++++++++++++++++++++++++++++++++ tobac/merge_split.py | 23 ++- 2 files changed, 269 insertions(+), 13 deletions(-) create mode 100644 build/lib/tobac/merge_split.py diff --git a/build/lib/tobac/merge_split.py b/build/lib/tobac/merge_split.py new file mode 100644 index 00000000..609ae47f --- /dev/null +++ b/build/lib/tobac/merge_split.py @@ -0,0 +1,259 @@ +""" + Tobac merge and split v0.1 + This submodule is a post processing step to address tracked cells which merge/split. + The first iteration of this module is to combine the cells which are merging but have received + a new cell id (and are considered a new cell) once merged. 
In general this submodule will label merged/split cells + with a TRACK number in addition to its CELL number. + +""" + + +def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): + """ + function to postprocess tobac track data for merge/split cells + + + Parameters + ---------- + TRACK : pandas.core.frame.DataFrame + Pandas dataframe of tobac Track information + + distance : float, optional + Distance threshold prior to adding a pair of points into the minimum spanning tree. + Default is 25000 meters. + + frame_len : float, optional + Threshold for the spanning length within which two points can be separated. + Default is five (5) frames. + + dxy : float, optional + The x/y/ grid spacing of the data. + Default is 500m. + + Returns + ------- + + d : xarray.core.dataset.Dataset + xarray dataset of tobac merge/split cells with parent and child designations. + + Parent/child variables include: + - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. + - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. + - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. + - track_child_cell_count: The total number of features belonging to all child cells of a given track id. + - cell_child_feature_count: The total number of features for each cell. + + + Example usage: + d = merge_split(Track) + ds = tobac.utils.standardize_track_dataset(Track, refl_mask) + both_ds = xr.merge([ds, d],compat ='override') + both_ds = tobac.utils.compress_all(both_ds) + both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) + + """ + + + try: + import networkx as nx + except ImportError: + networkx = None + + # if networkx: + # import networkx as nx + # else: + # print("Cannot Merge/Split. Please install networkx.") + import logging + import numpy as np + from pandas.core.common import flatten + import xarray as xr + + # Check if dxy is in meters of in kilometers. It should be in meters + if dxy <= 5: + dxy *= 1000 + + # Immediately convert pandas dataframe of track information to xarray: + TRACK = TRACK.to_xarray() + track_groups = TRACK.groupby("cell") + cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} + id_data = np.fromiter(cell_ids.keys(), dtype=int) + count_data = np.fromiter(cell_ids.values(), dtype=int) + all_frames = np.sort(np.unique(TRACK.frame)) + a_points = list() + b_points = list() + a_names = list() + b_names = list() + dist = list() + + + for i in id_data: + # print(i) + a_pointx = track_groups[i].hdim_2[-1].values * dxy + a_pointy = track_groups[i].hdim_1[-1].values * dxy + for j in id_data: + b_pointx = track_groups[j].hdim_2[0].values * dxy + b_pointy = track_groups[j].hdim_1[0].values * dxy + d = np.linalg.norm( + np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) + ) + if d <= distance: + a_points.append([a_pointx, a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + + id = [] + # This is removing any tracks that have matched to themselves - e.g., + # a beginning of a track has found its tail. 
+ for i in range(len(dist) - 1, -1, -1): + if a_names[i] == b_names[i]: + id.append(i) + a_points.pop(i) + b_points.pop(i) + dist.pop(i) + a_names.pop(i) + b_names.pop(i) + else: + continue + # This is inputting data to the object with will perform the spanning tree. + g = nx.Graph() + for i in np.arange(len(dist)): + g.add_edge(a_names[i], b_names[i], weight=dist[i]) + + tree = nx.minimum_spanning_edges(g) + tree_list = list(tree) + + new_tree = [] + for i, j in enumerate(tree_list): + frame_a = np.nanmax(track_groups[j[0]].frame.values) + frame_b = np.nanmin(track_groups[j[1]].frame.values) + if np.abs(frame_a - frame_b) <= frame_len: + new_tree.append(tree_list[i][0:2]) + new_tree_arr = np.array(new_tree) + + TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) + cell_id = np.unique( + TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] + ) + track_id = dict() # same size as number of total merged tracks + + arr = np.array([0]) + for p in cell_id: + j = np.where(arr == int(p)) + if len(j[0]) > 0: + continue + else: + k = np.where(new_tree_arr == p) + if len(k[0]) == 0: + track_id[p] = [p] + arr = np.append(arr, p) + else: + temp1 = list(np.unique(new_tree_arr[k[0]])) + temp = list(np.unique(new_tree_arr[k[0]])) + + for l in range(15): + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + + if len(temp1) == len(temp): + break + temp1 = np.array(temp) + + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + arr = np.append(arr, np.unique(temp)) + + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + track_ids = np.array(list(track_id.keys())) + logging.debug("found track ids") + + cell_id = list( + np.unique( + TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] + ) + ) + logging.debug("found cell ids") + + cell_parent_track_id = [] + + for i, id in enumerate(track_id): + + if len(track_id[int(id)]) == 1: + cell_parent_track_id.append(int(id)) + + else: + cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) + + cell_parent_track_id = list(flatten(cell_parent_track_id)) + logging.debug("found cell parent track ids") + + feature_parent_cell_id = list(TRACK.cell.values.astype(int)) + logging.debug("found feature parent cell ids") + + # This version includes all the feature regardless of if they are used in cells or not. 
+ feature_id = list(TRACK.feature.values.astype(int)) + logging.debug("found feature ids") + + feature_parent_track_id = [] + feature_parent_track_id = np.zeros(len(feature_id)) + for i, id in enumerate(feature_id): + cellid = feature_parent_cell_id[i] + if np.isnan(cellid): + feature_parent_track_id[i] = -1 + else: + j = np.where(cell_id == cellid) + j = np.squeeze(j) + trackid = cell_parent_track_id[j] + feature_parent_track_id[i] = trackid + + logging.debug("found feature parent track ids") + + track_child_cell_count = [] + for i, id in enumerate(track_id): + track_child_cell_count.append(len(track_id[int(id)])) + logging.debug("found track child cell count") + + cell_child_feature_count = [] + for i, id in enumerate(cell_id): + cell_child_feature_count.append(len(track_groups[id].feature.values)) + logging.debug("found cell child feature count") + + track_dim = "tracks" + cell_dim = "cells" + feature_dim = "features" + + d = xr.Dataset( + { + "track": (track_dim, track_ids), + "cell": (cell_dim, cell_id), + "cell_parent_track_id": (cell_dim, cell_parent_track_id), + "feature": (feature_dim, feature_id), + "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), + "feature_parent_track_id": (feature_dim, feature_parent_track_id), + "track_child_cell_count": (track_dim, track_child_cell_count), + "cell_child_feature_count": (cell_dim, cell_child_feature_count), + } + ) + + d = d.set_coords(["feature", "cell", "track"]) + + assert len(cell_id) == len(cell_parent_track_id) + assert len(feature_id) == len(feature_parent_cell_id) + assert sum(track_child_cell_count) == len(cell_id) + assert sum( + [ + sum(cell_child_feature_count[1:]), + (len(np.where(feature_parent_track_id < 0)[0])), + ] + ) == len(feature_id) + + return d diff --git a/tobac/merge_split.py b/tobac/merge_split.py index fd61d93d..51dc1286 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -38,10 +38,10 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): Parent/child variables include: - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. - - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. - - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. - - track_child_cell_count: The total number of features belonging to all child cells of a given track id. - - cell_child_feature_count: The total number of features for each cell. + - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. + - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. + - track_child_cell_count: The total number of features belonging to all child cells of a given track id. + - cell_child_feature_count: The total number of features for each cell. 
Example usage: @@ -247,14 +247,11 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): assert len(cell_id) == len(cell_parent_track_id) assert len(feature_id) == len(feature_parent_cell_id) assert sum(track_child_cell_count) == len(cell_id) - assert ( - sum( - [ - sum(cell_child_feature_count[1:]), - (len(np.where(feature_parent_track_id < 0)[0])), - ] - ) - == len(feature_id) - ) + assert sum( + [ + sum(cell_child_feature_count[1:]), + (len(np.where(feature_parent_track_id < 0)[0])), + ] + ) == len(feature_id) return d From 99cb1790b5b3820e1ae7bf486033e314cecc632a Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 18 Sep 2022 20:52:44 -0500 Subject: [PATCH 12/29] Updated utils.py with changes made for 1.4 Included the spectral_filtering function in V1.4 --- tobac/utils.py | 576 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 446 insertions(+), 130 deletions(-) diff --git a/tobac/utils.py b/tobac/utils.py index 73683c4c..85c8fdb9 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -2,18 +2,27 @@ def column_mask_from2D(mask_2D, cube, z_coord="model_level_number"): - """function to turn 2D watershedding mask into a 3D mask of selected columns - Input: - cube: iris.cube.Cube - data cube - mask_2D: iris.cube.Cube - 2D cube containing mask (int id for tacked volumes 0 everywhere else) - z_coord: str - name of the vertical coordinate in the cube - Output: - mask_2D: iris.cube.Cube - 3D cube containing columns of 2D mask (int id for tacked volumes 0 everywhere else) + """Turn 2D watershedding mask into a 3D mask of selected columns. + + Parameters + ---------- + cube : iris.cube.Cube + Data cube. + + mask_2D : iris.cube.Cube + 2D cube containing mask (int id for tacked volumes 0 + everywhere else). + + z_coord : str + Name of the vertical coordinate in the cube. + + Returns + ------- + mask_2D : iris.cube.Cube + 3D cube containing columns of 2D mask (int id for tracked + volumes, 0 everywhere else). """ + from copy import deepcopy mask_3D = deepcopy(cube) @@ -28,18 +37,29 @@ def column_mask_from2D(mask_2D, cube, z_coord="model_level_number"): def mask_cube_cell(variable_cube, mask, cell, track): - """Mask cube for tracked volume of an individual cell - Input: - variable_cube: iris.cube.Cube - unmasked data cube - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - cell: int - interger id of cell to create masked cube for - Output: - variable_cube_out: iris.cube.Cube - Masked cube with data for respective cell + """Mask cube for tracked volume of an individual cell. + + Parameters + ---------- + variable_cube : iris.cube.Cube + Unmasked data cube. + + mask : iris.cube.Cube + Cube containing mask (int id for tracked volumes, 0 everywhere + else). + + cell : int + Integer id of cell to create masked cube for. + + track : pandas.DataFrame + Output of the linking. + + Returns + ------- + variable_cube_out : iris.cube.Cube + Masked cube with data for respective cell. """ + from copy import deepcopy variable_cube_out = deepcopy(variable_cube) @@ -49,16 +69,23 @@ def mask_cube_cell(variable_cube, mask, cell, track): def mask_cube_all(variable_cube, mask): - """Mask cube for untracked volume - Input: - variable_cube: iris.cube.Cube - unmasked data cube - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: iris.cube.Cube - Masked cube for untracked volume + """Mask cube (iris.cube) for tracked volume. 
+ + Parameters + ---------- + variable_cube : iris.cube.Cube + Unmasked data cube. + + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + Returns + ------- + variable_cube_out : iris.cube.Cube + Masked cube for untracked volume. """ + from dask.array import ma from copy import deepcopy @@ -70,16 +97,23 @@ def mask_cube_all(variable_cube, mask): def mask_cube_untracked(variable_cube, mask): - """Mask cube for untracked volume - Input: - variable_cube: iris.cube.Cube - unmasked data cube - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: iris.cube.Cube - Masked cube for untracked volume + """Mask cube (iris.cube) for untracked volume. + + Parameters + ---------- + variable_cube : iris.cube.Cube + Unmasked data cube. + + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + Returns + ------- + variable_cube_out : iris.cube.Cube + Masked cube for untracked volume. """ + from dask.array import ma from copy import deepcopy @@ -91,47 +125,91 @@ def mask_cube_untracked(variable_cube, mask): def mask_cube(cube_in, mask): - """Mask cube where mask is larger than zero - Input: - cube_in: iris.cube.Cube - unmasked data cube - mask: numpy.ndarray or dask.array - mask to use for masking, >0 where cube is supposed to be masked - Output: - cube_out: iris.cube.Cube - Masked cube + """Mask cube where mask is not zero. + + Parameters + ---------- + cube_in : iris.cube.Cube + Unmasked data cube. + + mask : iris.cube.Cube + Mask to use for masking, >0 where cube is supposed to be masked. + + Returns + ------- + variable_cube_out : iris.cube.Cube + Masked cube. """ + from dask.array import ma from copy import deepcopy cube_out = deepcopy(cube_in) - cube_out.data = ma.masked_where(mask != 0, cube_in.core_data()) + cube_out.data = ma.masked_where(mask.core_data() != 0, cube_in.core_data()) return cube_out def mask_cell(mask, cell, track, masked=False): - """create mask for specific cell - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: numpy.ndarray - Masked cube for untracked volume + """Create mask for specific cell. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tracked volumes 0 everywhere + else). + + cell : int + Integer id of cell to create masked cube for. + + track : pandas.DataFrame + Output of the linking. + + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. Default is False. + + Returns + ------- + mask_i : numpy.ndarray + Mask for a specific cell. """ + feature_ids = track.loc[track["cell"] == cell, "feature"].values mask_i = mask_features(mask, feature_ids, masked=masked) return mask_i def mask_cell_surface(mask, cell, track, masked=False, z_coord="model_level_number"): - """Create surface projection of mask for individual cell - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: iris.cube.Cube - Masked cube for untracked volume + """Create surface projection of 3d-mask for individual cell by + collapsing one coordinate. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes, 0 everywhere + else). + + cell : int + Integer id of cell to create masked cube for. + + track : pandas.DataFrame + Output of the linking. 
+ + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. Default is False. + + z_coord : str, optional + Name of the coordinate to collapse. Default is 'model_level_number'. + + Returns + ------- + mask_i_surface : iris.cube.Cube + Collapsed Masked cube for the cell with the maximum value + along the collapsed coordinate. + """ + feature_ids = track.loc[track["cell"] == cell, "feature"].values mask_i_surface = mask_features_surface( mask, feature_ids, masked=masked, z_coord=z_coord @@ -140,32 +218,69 @@ def mask_cell_surface(mask, cell, track, masked=False, z_coord="model_level_numb def mask_cell_columns(mask, cell, track, masked=False, z_coord="model_level_number"): - """Create mask with entire columns for individual cell - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: iris.cube.Cube - Masked cube for untracked volume + """Create mask with entire columns for individual cell. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + cell : int + Interger id of cell to create the masked cube for. + + track : pandas.DataFrame + Output of the linking. + + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. Default is False. + + z_coord : str, optional + Default is 'model_level_number'. + + Returns + ------- + mask_i : iris.cube.Cube + Masked cube for untracked volume. + + Notes + ------- + Function is not working since mask_features_columns() + is commented out """ + + raise NotImplementedError( + "The function mask_cell_columns() is not implemented currently." + ) + feature_ids = track.loc[track["cell"] == cell].loc["feature"] mask_i = mask_features_columns(mask, feature_ids, masked=masked, z_coord=z_coord) return mask_i def mask_cube_features(variable_cube, mask, feature_ids): - """Mask cube for tracked volume of an individual cell - Input: - variable_cube: iris.cube.Cube - unmasked data cube - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - cell: int - interger id of cell to create masked cube for - Output: - variable_cube_out: iris.cube.Cube - Masked cube with data for respective cell + """Mask cube for tracked volume of one or more specific + features. + + Parameters + ---------- + variable_cube : iris.cube.Cube + Unmasked data cube. + + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes, 0 everywhere + else). + + feature_ids : int or list of ints + Integer ids of features to create masked cube for. + + Returns + ------- + variable_cube_out : iris.cube.Cube + Masked cube with data for respective features. """ + from dask.array import ma, isin from copy import deepcopy @@ -177,14 +292,27 @@ def mask_cube_features(variable_cube, mask, feature_ids): def mask_features(mask, feature_ids, masked=False): - """create mask for specific features - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: numpy.ndarray - Masked cube for untracked volume + """Create mask for specific features. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + feature_ids : int or list of ints + Integer ids of the features to create the masked cube for. + + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. 
Default is False. + + Returns + ------- + mask_i : numpy.ndarray + Masked cube for specific features. """ + from dask.array import ma, isin from copy import deepcopy @@ -199,14 +327,33 @@ def mask_features(mask, feature_ids, masked=False): def mask_features_surface( mask, feature_ids, masked=False, z_coord="model_level_number" ): - """create surface mask for individual features - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: iris.cube.Cube - Masked cube for untracked volume + """Create surface projection of 3d-mask for specific features + by collapsing one coordinate. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + feature_ids : int or list of ints + Integer ids of the features to create the masked cube for. + + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. Default is False + + z_coord : str, optional + Name of the coordinate to collapse. Default is + 'model_level_number'. + + Returns + ------- + mask_i_surface : iris.cube.Cube + Collapsed Masked cube for the features with the maximum value + along the collapsed coordinate. """ + from iris.analysis import MAX from dask.array import ma, isin from copy import deepcopy @@ -222,14 +369,30 @@ def mask_features_surface( def mask_all_surface(mask, masked=False, z_coord="model_level_number"): - """create surface mask for individual features - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - mask_i_surface: iris.cube.Cube (2D) - Mask with 1 below features and 0 everywhere else + """Create surface projection of 3d-mask for all features + by collapsing one coordinate. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. Default is False + + z_coord : str, optional + Name of the coordinate to collapse. Default is + 'model_level_number'. + + Returns + ------- + mask_i_surface : iris.cube.Cube (2D) + Collapsed Masked cube for the features with the maximum value + along the collapsed coordinate. """ + from iris.analysis import MAX from dask.array import ma, isin from copy import deepcopy @@ -237,7 +400,7 @@ def mask_all_surface(mask, masked=False, z_coord="model_level_number"): mask_i = deepcopy(mask) mask_i_surface = mask_i.collapsed(z_coord, MAX) mask_i_surface_data = mask_i_surface.core_data() - mask_i_surface[mask_i_surface_data > 0] = 1 + mask_i_surface.data[mask_i_surface_data > 0] = 1 if masked: mask_i_surface.data = ma.masked_equal(mask_i_surface.core_data(), 0) return mask_i_surface @@ -306,18 +469,27 @@ def mask_all_surface(mask, masked=False, z_coord="model_level_number"): def add_coordinates(t, variable_cube): - import numpy as np + """Add coordinates from the input cube of the feature detection + to the trajectories/features. + + Parameters + ---------- + t : pandas.DataFrame + Trajectories/features from feature detection or linking step. + + variable_cube : iris.cube.Cube + Input data used for the tracking with coordinate information + to transfer to the resulting DataFrame. Needs to contain the + coordinate 'time'. + + Returns + ------- + t : pandas.DataFrame + Trajectories with added coordinates. 
- """ Function adding coordinates from the tracking cube to the trajectories: time, longitude&latitude, x&y dimensions - Input: - t: pandas DataFrame - trajectories/features - variable_cube: iris.cube.Cube - Cube containing the dimensions 'time','longitude','latitude','x_projection_coordinate','y_projection_coordinate', usually cube that the tracking is performed on - Output: - t: pandas DataFrame - trajectories with added coordinated """ + + import numpy as np from scipy.interpolate import interp2d, interp1d logging.debug("start adding coordinates from cube") @@ -440,11 +612,29 @@ def add_coordinates(t, variable_cube): def get_bounding_box(x, buffer=1): - from numpy import delete, arange, diff, nonzero, array - - """ Calculates the bounding box of a ndarray + """Finds the bounding box of a ndarray, i.e. the smallest + bounding rectangle for nonzero values as explained here: https://stackoverflow.com/questions/31400769/bounding-box-of-numpy-array + + Parameters + ---------- + x : numpy.ndarray + Array for which the bounding box is to be determined. + + buffer : int, optional + Number to set a buffer between the nonzero values and + the edges of the box. Default is 1. + + Returns + ------- + bbox : list + Dimensionwise list of the indices representing the edges + of the bounding box. + """ + + from numpy import delete, arange, diff, nonzero, array + mask = x == 0 bbox = [] @@ -472,6 +662,38 @@ def get_bounding_box(x, buffer=1): def get_spacings(field_in, grid_spacing=None, time_spacing=None): + """Determine spatial and temporal grid spacing of the + input data. + + Parameters + ---------- + field_in : iris.cube.Cube + Input field where to get spacings. + + grid_spacing : float, optional + Manually sets the grid spacing if specified. + Default is None. + + time_spacing : float, optional + Manually sets the time spacing if specified. + Default is None. + + Returns + ------- + dxy : float + Grid spacing in metres. + + dt : float + Time resolution in seconds. + + Raises + ------ + ValueError + If input_cube does not contain projection_x_coord and + projection_y_coord or keyword argument grid_spacing. + + """ + import numpy as np from copy import deepcopy @@ -516,14 +738,16 @@ def get_label_props_in_dict(labels): Parameters ---------- - labels: 2D array-like - comes from the `skimage.measure.label` function + labels : 2D array-like + Output of the `skimage.measure.label` function. Returns ------- - dict - output from skimage.measure.regionprops in dictionary format, where they key is the label number + region_properties_dict: dict + Output from skimage.measure.regionprops in dictionary + format, where they key is the label number. """ + import skimage.measure region_properties_raw = skimage.measure.regionprops(labels) @@ -540,24 +764,27 @@ def get_indices_of_labels_from_reg_prop_dict(region_property_dict): Parameters ---------- - region_property_dict: dict of region_property objects + region_property_dict : dict of region_property objects This dict should come from the get_label_props_in_dict function. Returns ------- - dict (key: label number, int) - The number of points in the label number - dict (key: label number, int) - the y indices in the label number - dict (key: label number, int) - the x indices in the label number + curr_loc_indices : dict + The number of points in the label number (key: label number). + + y_indices : dict + The y indices in the label number (key: label number). + + x_indices : dict + The x indices in the label number (key: label number). 
Raises ------ ValueError - a ValueError is raised if there are no regions in the region property dict - + A ValueError is raised if there are no regions in the region + property dict. """ + import numpy as np if len(region_property_dict) == 0: @@ -580,6 +807,95 @@ def get_indices_of_labels_from_reg_prop_dict(region_property_dict): return (curr_loc_indices, y_indices, x_indices) +def spectral_filtering( + dxy, field_in, lambda_min, lambda_max, return_transfer_function=False +): + """This function creates and applies a 2D transfer function that + can be used as a bandpass filter to remove certain wavelengths + of an atmospheric input field (e.g. vorticity, IVT, etc). + + Parameters: + ----------- + dxy : float + Grid spacing in m. + + field_in: numpy.array + 2D field with input data. + + lambda_min: float + Minimum wavelength in m. + + lambda_max: float + Maximum wavelength in m. + + return_transfer_function: boolean, optional + default: False. If set to True, then the 2D transfer function and + the corresponding wavelengths are returned. + + Returns: + -------- + filtered_field: numpy.array + Spectrally filtered 2D field of data (with same shape as input data). + + transfer_function: tuple + Two 2D fields, where the first one corresponds to the wavelengths + in the spectral space of the domain and the second one to the 2D + transfer function of the bandpass filter. Only returned, if + return_transfer_function is True. + """ + + import numpy as np + from scipy import signal + from scipy import fft + + # check if valid value for dxy is given + if dxy <= 0: + raise ValueError( + "Invalid value for dxy. Please provide the grid spacing in meter." + ) + + # get number of grid cells in x and y direction + Ni = field_in.shape[-2] + Nj = field_in.shape[-1] + # wavenumber space + m, n = np.meshgrid(np.arange(Ni), np.arange(Nj), indexing="ij") + + # if domain is squared: + if Ni == Nj: + wavenumber = np.sqrt(m**2 + n**2) + lambda_mn = (2 * Ni * (dxy)) / wavenumber + else: + # if domain is a rectangle: + # alpha is the normalized wavenumber in wavenumber space + alpha = np.sqrt(m**2 / Ni**2 + n**2 / Nj**2) + # compute wavelengths for target grid in m + lambda_mn = 2 * dxy / alpha + + ############### create a 2D bandpass filter (butterworth) ####################### + b, a = signal.iirfilter( + 2, + [1 / lambda_max, 1 / lambda_min], + btype="band", + ftype="butter", + fs=1 / dxy, + output="ba", + ) + w, h = signal.freqz(b, a, 1 / lambda_mn.flatten(), fs=1 / dxy) + transfer_function = np.reshape(abs(h), lambda_mn.shape) + + # 2-dimensional discrete cosine transformation to convert data to spectral space + spectral = fft.dctn(field_in.data) + # multiplication of spectral coefficients with transfer function + filtered = spectral * transfer_function + # inverse discrete cosine transformation to go back from spectral to original space + filtered_field = fft.idctn(filtered) + + if return_transfer_function is True: + return (lambda_mn, transfer_function), filtered_field + else: + return filtered_field + + def compress_all(nc_grids, min_dims=2, comp_level=4): """ The purpose of this subroutine is to compress the netcdf variables as they are saved. From 6391513009014a59227c7454fac06654b4177550 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sat, 24 Sep 2022 22:01:15 -0500 Subject: [PATCH 13/29] Updating Cell numbering. Changed cell_parent_track_id to begin new count rather than use max cell ID. 
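For intuition, the diff below stops keying each merged track to the largest
cell id in its group and instead numbers tracks compactly in enumeration
order. A toy sketch of the intended effect (the cell ids and groupings here
are made up for illustration, not tobac output):

    # track_id maps a representative cell id -> every cell id in that merged track
    track_id = {3: [1, 2, 3], 4: [4], 9: [7, 9]}

    cell_parent_track_id = []
    for i, key in enumerate(track_id):
        # all cells of one group share the compact track number i
        cell_parent_track_id.extend([i] * len(track_id[key]))

    print(cell_parent_track_id)  # [0, 0, 0, 1, 2, 2]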
--- tobac/merge_split.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 51dc1286..6b598375 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -171,8 +171,8 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - track_ids = np.array(list(track_id.keys())) - logging.debug("found track ids") + # track_ids = np.array(list(track_id.keys())) + # logging.debug("found track ids") cell_id = list( np.unique( @@ -186,14 +186,17 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): for i, id in enumerate(track_id): if len(track_id[int(id)]) == 1: - cell_parent_track_id.append(int(id)) + cell_parent_track_id.append(int(i)) else: - cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) + cell_parent_track_id.append(np.repeat(int(i), len(track_id[int(id)]))) cell_parent_track_id = list(flatten(cell_parent_track_id)) logging.debug("found cell parent track ids") + track_ids = np.array(np.unique(cell_parent_track_id)) + logging.debug("found track ids") + feature_parent_cell_id = list(TRACK.cell.values.astype(int)) logging.debug("found feature parent cell ids") @@ -205,7 +208,7 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): feature_parent_track_id = np.zeros(len(feature_id)) for i, id in enumerate(feature_id): cellid = feature_parent_cell_id[i] - if np.isnan(cellid): + if cellid < 0: feature_parent_track_id[i] = -1 else: j = np.where(cell_id == cellid) From b9c4d262f08aff2fc12bff9060a43cecac22fa06 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 25 Sep 2022 09:01:36 -0500 Subject: [PATCH 14/29] None type error in Projection option Fixed None type error for importing Projection keyword. --- tobac/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tobac/utils.py b/tobac/utils.py index 85c8fdb9..854bf178 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -1061,7 +1061,7 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): ) # Add the projection data back in - if not None in Projection: + if Projection is not None: ds["ProjectionCoordinateSystem"] = Projection return ds From fb184139eb55692fa28c5133e7def5d3b8801b77 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 2 Oct 2022 20:52:22 -0400 Subject: [PATCH 15/29] Deleting the build/lib file that is not needed for the PR --- build/lib/tobac/merge_split.py | 259 --------------------------------- 1 file changed, 259 deletions(-) delete mode 100644 build/lib/tobac/merge_split.py diff --git a/build/lib/tobac/merge_split.py b/build/lib/tobac/merge_split.py deleted file mode 100644 index 609ae47f..00000000 --- a/build/lib/tobac/merge_split.py +++ /dev/null @@ -1,259 +0,0 @@ -""" - Tobac merge and split v0.1 - This submodule is a post processing step to address tracked cells which merge/split. - The first iteration of this module is to combine the cells which are merging but have received - a new cell id (and are considered a new cell) once merged. In general this submodule will label merged/split cells - with a TRACK number in addition to its CELL number. 
- -""" - - -def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): - """ - function to postprocess tobac track data for merge/split cells - - - Parameters - ---------- - TRACK : pandas.core.frame.DataFrame - Pandas dataframe of tobac Track information - - distance : float, optional - Distance threshold prior to adding a pair of points into the minimum spanning tree. - Default is 25000 meters. - - frame_len : float, optional - Threshold for the spanning length within which two points can be separated. - Default is five (5) frames. - - dxy : float, optional - The x/y/ grid spacing of the data. - Default is 500m. - - Returns - ------- - - d : xarray.core.dataset.Dataset - xarray dataset of tobac merge/split cells with parent and child designations. - - Parent/child variables include: - - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. - - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. - - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. - - track_child_cell_count: The total number of features belonging to all child cells of a given track id. - - cell_child_feature_count: The total number of features for each cell. - - - Example usage: - d = merge_split(Track) - ds = tobac.utils.standardize_track_dataset(Track, refl_mask) - both_ds = xr.merge([ds, d],compat ='override') - both_ds = tobac.utils.compress_all(both_ds) - both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - - """ - - - try: - import networkx as nx - except ImportError: - networkx = None - - # if networkx: - # import networkx as nx - # else: - # print("Cannot Merge/Split. Please install networkx.") - import logging - import numpy as np - from pandas.core.common import flatten - import xarray as xr - - # Check if dxy is in meters of in kilometers. It should be in meters - if dxy <= 5: - dxy *= 1000 - - # Immediately convert pandas dataframe of track information to xarray: - TRACK = TRACK.to_xarray() - track_groups = TRACK.groupby("cell") - cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} - id_data = np.fromiter(cell_ids.keys(), dtype=int) - count_data = np.fromiter(cell_ids.values(), dtype=int) - all_frames = np.sort(np.unique(TRACK.frame)) - a_points = list() - b_points = list() - a_names = list() - b_names = list() - dist = list() - - - for i in id_data: - # print(i) - a_pointx = track_groups[i].hdim_2[-1].values * dxy - a_pointy = track_groups[i].hdim_1[-1].values * dxy - for j in id_data: - b_pointx = track_groups[j].hdim_2[0].values * dxy - b_pointy = track_groups[j].hdim_1[0].values * dxy - d = np.linalg.norm( - np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) - ) - if d <= distance: - a_points.append([a_pointx, a_pointy]) - b_points.append([b_pointx, b_pointy]) - dist.append(d) - a_names.append(i) - b_names.append(j) - - id = [] - # This is removing any tracks that have matched to themselves - e.g., - # a beginning of a track has found its tail. - for i in range(len(dist) - 1, -1, -1): - if a_names[i] == b_names[i]: - id.append(i) - a_points.pop(i) - b_points.pop(i) - dist.pop(i) - a_names.pop(i) - b_names.pop(i) - else: - continue - # This is inputting data to the object with will perform the spanning tree. 
- g = nx.Graph() - for i in np.arange(len(dist)): - g.add_edge(a_names[i], b_names[i], weight=dist[i]) - - tree = nx.minimum_spanning_edges(g) - tree_list = list(tree) - - new_tree = [] - for i, j in enumerate(tree_list): - frame_a = np.nanmax(track_groups[j[0]].frame.values) - frame_b = np.nanmin(track_groups[j[1]].frame.values) - if np.abs(frame_a - frame_b) <= frame_len: - new_tree.append(tree_list[i][0:2]) - new_tree_arr = np.array(new_tree) - - TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) - cell_id = np.unique( - TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] - ) - track_id = dict() # same size as number of total merged tracks - - arr = np.array([0]) - for p in cell_id: - j = np.where(arr == int(p)) - if len(j[0]) > 0: - continue - else: - k = np.where(new_tree_arr == p) - if len(k[0]) == 0: - track_id[p] = [p] - arr = np.append(arr, p) - else: - temp1 = list(np.unique(new_tree_arr[k[0]])) - temp = list(np.unique(new_tree_arr[k[0]])) - - for l in range(15): - for i in temp1: - k2 = np.where(new_tree_arr == i) - temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) - temp = list(flatten(temp)) - temp = list(np.unique(temp)) - - if len(temp1) == len(temp): - break - temp1 = np.array(temp) - - for i in temp1: - k2 = np.where(new_tree_arr == i) - temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) - - temp = list(flatten(temp)) - temp = list(np.unique(temp)) - arr = np.append(arr, np.unique(temp)) - - track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - - track_ids = np.array(list(track_id.keys())) - logging.debug("found track ids") - - cell_id = list( - np.unique( - TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] - ) - ) - logging.debug("found cell ids") - - cell_parent_track_id = [] - - for i, id in enumerate(track_id): - - if len(track_id[int(id)]) == 1: - cell_parent_track_id.append(int(id)) - - else: - cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) - - cell_parent_track_id = list(flatten(cell_parent_track_id)) - logging.debug("found cell parent track ids") - - feature_parent_cell_id = list(TRACK.cell.values.astype(int)) - logging.debug("found feature parent cell ids") - - # This version includes all the feature regardless of if they are used in cells or not. 
- feature_id = list(TRACK.feature.values.astype(int)) - logging.debug("found feature ids") - - feature_parent_track_id = [] - feature_parent_track_id = np.zeros(len(feature_id)) - for i, id in enumerate(feature_id): - cellid = feature_parent_cell_id[i] - if np.isnan(cellid): - feature_parent_track_id[i] = -1 - else: - j = np.where(cell_id == cellid) - j = np.squeeze(j) - trackid = cell_parent_track_id[j] - feature_parent_track_id[i] = trackid - - logging.debug("found feature parent track ids") - - track_child_cell_count = [] - for i, id in enumerate(track_id): - track_child_cell_count.append(len(track_id[int(id)])) - logging.debug("found track child cell count") - - cell_child_feature_count = [] - for i, id in enumerate(cell_id): - cell_child_feature_count.append(len(track_groups[id].feature.values)) - logging.debug("found cell child feature count") - - track_dim = "tracks" - cell_dim = "cells" - feature_dim = "features" - - d = xr.Dataset( - { - "track": (track_dim, track_ids), - "cell": (cell_dim, cell_id), - "cell_parent_track_id": (cell_dim, cell_parent_track_id), - "feature": (feature_dim, feature_id), - "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), - "feature_parent_track_id": (feature_dim, feature_parent_track_id), - "track_child_cell_count": (track_dim, track_child_cell_count), - "cell_child_feature_count": (cell_dim, cell_child_feature_count), - } - ) - - d = d.set_coords(["feature", "cell", "track"]) - - assert len(cell_id) == len(cell_parent_track_id) - assert len(feature_id) == len(feature_parent_cell_id) - assert sum(track_child_cell_count) == len(cell_id) - assert sum( - [ - sum(cell_child_feature_count[1:]), - (len(np.where(feature_parent_track_id < 0)[0])), - ] - ) == len(feature_id) - - return d From 2a9255c24b617fc1b6c49bc9e7a2682cf000dfd7 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Mon, 3 Oct 2022 22:01:09 -0400 Subject: [PATCH 16/29] Updating documentation, and merge/split methods. Updated merge_split methods to increase speed. Added output documentation table to docs, and updated the index to reflect the changes. --- doc/index.rst | 7 +++ doc/merge_split.rst | 56 ++++++++++++++++-- doc/merge_split_out_vars.csv | 9 +++ doc/merge_split_output_vars.rst | 9 +++ tobac/merge_split.py | 102 ++++++++++++-------------------- tobac/utils.py | 4 +- 6 files changed, 118 insertions(+), 69 deletions(-) create mode 100644 doc/merge_split_out_vars.csv create mode 100644 doc/merge_split_output_vars.rst diff --git a/doc/index.rst b/doc/index.rst index 96b589cf..b0ee6567 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -49,6 +49,13 @@ The project is currently being extended by several contributors to include addit linking tracking_output + +.. toctree:: + :caption: Merge/Split + :maxdepth: 2 + + merge_split + merge_split_out_vars .. toctree:: :caption: API Reference diff --git a/doc/merge_split.rst b/doc/merge_split.rst index ca02f291..b01168c1 100644 --- a/doc/merge_split.rst +++ b/doc/merge_split.rst @@ -2,6 +2,16 @@ ====================== +This submodule is a post processing step to address tracked cells which merge/split. +The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. +This submodule will label merged/split cells with a TRACK number in addition to its CELL number. + +Features, cells, and tracks are combined using parent/child nomenclature. 
+(quick note on terms; “feature” is a detected object at a single time step. “cell” is a series of features linked together over multiple timesteps. "track" may be an individual cell or series of cells which have merged and/or split.) +*Merge and Split +====================== + + This submodule is a post processing step to address tracked cells which merge/split. The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. This submodule will label merged/split cells with a TRACK number in addition to its CELL number. @@ -11,9 +21,45 @@ Features, cells, and tracks are combined using parent/child nomenclature. Overview of the output dataframe from merge_split - - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. - - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. - - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. - - track_child_cell_count: The total number of features belonging to all child cells of a given track id. - - cell_child_feature_count: The total number of features for each cell. +d : xarray.core.dataset.Dataset + xarray dataset of tobac merge/split cells with parent and child designations. + +Parent/child variables include: +- cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. +- feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. +- feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. +- track_child_cell_count: The total number of features belonging to all child cells of a given track id. +- cell_child_feature_count: The total number of features for each cell. + + +Example usage: + +d = merge_split(Track) + + +Overview of the output dataframe from merge_split +d : xarray.core.dataset.Dataset + xarray dataset of tobac merge/split cells with parent and child designations. + +Parent/child variables include: +- cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. +- feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. +- feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. +- track_child_cell_count: The total number of features belonging to all child cells of a given track id. +- cell_child_feature_count: The total number of features for each cell. + + +Example usage: + +d = merge_split(Track) + + +merge_split outputs an `xarray` dataset with several variables. The variables, (with column names listed in the `Variable Name` column), are described below with units. Coordinates and dataset dimensions are Feature, Cell, and Track. + +Variables that are common to all feature detection files: + +.. 
csv-table:: tobac Merge_Split Track Output Variables
   :file: ./merge_split_out_vars.csv
   :widths: 3, 35, 3, 3
   :header-rows: 1

diff --git a/doc/merge_split_out_vars.csv b/doc/merge_split_out_vars.csv
new file mode 100644
index 00000000..38b86679
--- /dev/null
+++ b/doc/merge_split_out_vars.csv
@@ -0,0 +1,9 @@
+Variable Name,Description,Units,Type
+feature,Unique number of the feature; starts from 1 and increments by 1 to the number of features identified in all frames,n/a,int64
+Cell,Tracked cell number; generally starts from 1. Untracked cell value is -1.,n/a,int64
+track,Unique number of the track; starts from 0 and increments by 1 to the number of tracks identified. Untracked cells and features have a track id of -1.,n/a,int64
+cell_parent_track_id,"The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id.",n/a,int64
+feature_parent_cell_id,The associated parent cell id for each feature. All features in a given cell will have the same cell id.,n/a,int64
+feature_parent_track_id,The associated parent track id for each feature. This is not the same as the cell id number.,n/a,int64
+track_child_cell_count,The number of features belonging to all child cells of a given track id.,n/a,int64
+cell_child_feature_count,The number of features for each cell.,n/a,int64
\ No newline at end of file
diff --git a/doc/merge_split_output_vars.rst b/doc/merge_split_output_vars.rst
new file mode 100644
index 00000000..a17ae327
--- /dev/null
+++ b/doc/merge_split_output_vars.rst
@@ -0,0 +1,9 @@
+merge_split outputs an `xarray` dataset with several variables. The variables, (with column names listed in the `Variable Name` column), are described below with units. Coordinates and dataset dimensions are Feature, Cell, and Track.
+
+Variables that are common to all feature detection files:
+
+.. csv-table:: tobac Merge_Split Track Output Variables
+   :file: ./merge_split_out_vars.csv
+   :widths: 3, 35, 3, 3
+   :header-rows: 1
+
diff --git a/tobac/merge_split.py b/tobac/merge_split.py
index 6b598375..837e8185 100644
--- a/tobac/merge_split.py
+++ b/tobac/merge_split.py
@@ -1,5 +1,5 @@
 """
-    Tobac merge and split v0.1
+    Tobac merge and split
     This submodule is a post processing step to address tracked cells which merge/split.
     The first iteration of this module is to combine the cells which are merging but have received
     a new cell id (and are considered a new cell) once merged. In general this submodule will label merged/split cells
@@ -8,7 +8,7 @@
 """


-def merge_split(TRACK, distance=25000, frame_len=5, dxy=500):
+def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5):
     """
     function to postprocess tobac track data for merge/split cells

@@ -18,18 +18,19 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5):
     TRACK : pandas.core.frame.DataFrame
         Pandas dataframe of tobac Track information

+    dxy : float, mandatory
+        The x/y grid spacing of the data.
+        Should be in meters.
+
+
     distance : float, optional
-        Distance threshold prior to adding a pair of points into the minimum spanning tree.
+        Distance threshold determining how close two features must be in order to consider merge/splitting.
         Default is 25000 meters.

     frame_len : float, optional
-        Threshold for the spanning length within which two points can be separated.
+        Threshold for the maximum number of frames that can separate the end of a cell and the start of a related cell.
         Default is five (5) frames.
-    dxy : float, optional
-        The x/y grid spacing of the data.
-        Default is 500m.
-
     Returns
     -------

@@ -58,63 +59,43 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5):
     except ImportError:
         raise ImportError("Cannot merge/split; please install the networkx package.")

-    # if networkx:
-    #     import networkx as nx
-    # else:
-    #     print("Cannot Merge/Split. Please install networkx.")
     import logging
     import numpy as np
     from pandas.core.common import flatten
     import xarray as xr
-
-    # Check if dxy is in meters or in kilometers. It should be in meters.
-    if dxy <= 5:
-        dxy *= 1000
+    from scipy.spatial.distance import cdist

     # Immediately convert pandas dataframe of track information to xarray:
     TRACK = TRACK.to_xarray()
     track_groups = TRACK.groupby("cell")
-    cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()}
-    id_data = np.fromiter(cell_ids.keys(), dtype=int)
-    count_data = np.fromiter(cell_ids.values(), dtype=int)
-    all_frames = np.sort(np.unique(TRACK.frame))
-    a_points = list()
-    b_points = list()
+    first = track_groups.first()
+    last = track_groups.last()
+
     a_names = list()
     b_names = list()
     dist = list()

-    for i in id_data:
-        # print(i)
-        a_pointx = track_groups[i].hdim_2[-1].values * dxy
-        a_pointy = track_groups[i].hdim_1[-1].values * dxy
-        for j in id_data:
-            b_pointx = track_groups[j].hdim_2[0].values * dxy
-            b_pointy = track_groups[j].hdim_1[0].values * dxy
-            d = np.linalg.norm(
-                np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))
-            )
-            if d <= distance:
-                a_points.append([a_pointx, a_pointy])
-                b_points.append([b_pointx, b_pointy])
-                dist.append(d)
-                a_names.append(i)
-                b_names.append(j)
-
-    id = []
-    # This is removing any tracks that have matched to themselves - e.g.,
-    # a beginning of a track has found its tail.
-    for i in range(len(dist) - 1, -1, -1):
-        if a_names[i] == b_names[i]:
-            id.append(i)
-            a_points.pop(i)
-            b_points.pop(i)
-            dist.pop(i)
-            a_names.pop(i)
-            b_names.pop(i)
-        else:
-            continue
+    # write all sets of points (a and b) as Nx2 arrays
+    l = len(last["hdim_2"].values)
+    cells = first["cell"].values
+    a_xy = np.zeros((l, 2))
+    a_xy[:, 0] = last["hdim_2"].values * dxy
+    a_xy[:, 1] = last["hdim_1"].values * dxy
+    b_xy = np.zeros((l, 2))
+    b_xy[:, 0] = first["hdim_2"].values * dxy
+    b_xy[:, 1] = first["hdim_1"].values * dxy
+
+    # Use cdist to find distance matrix
+    out = cdist(a_xy, b_xy)
+    # Find all cells under the distance threshold
+    j = np.where(out <= distance)
+
+    # Compile cells meeting the criteria to an array of both the distance and cell ids
+    a_names = cells[j[0]]
+    b_names = cells[j[1]]
+    dist = out[j]
+
     # This is inputting data to the object which will perform the spanning tree.
     g = nx.Graph()
     for i in np.arange(len(dist)):
         g.add_edge(a_names[i], b_names[i], weight=dist[i])
@@ -123,6 +104,8 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5):
     tree_list = list(tree)

     new_tree = []
+
+    #Pruning the tree for time limits.
     for i, j in enumerate(tree_list):
         frame_a = np.nanmax(track_groups[j[0]].frame.values)
         frame_b = np.nanmin(track_groups[j[1]].frame.values)
@@ -136,6 +119,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5):
     )
     track_id = dict()  # same size as number of total merged tracks

+    #Cleaning up tracks, combining tracks which contain the same cells.
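+    # (The loop below grows each group of connected cell ids until its
+    # membership stops changing, so cells linked through any chain of
+    # merges/splits end up in a single track.)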
arr = np.array([0]) for p in cell_id: j = np.where(arr == int(p)) @@ -171,20 +155,12 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - # track_ids = np.array(list(track_id.keys())) - # logging.debug("found track ids") - - cell_id = list( - np.unique( - TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] - ) - ) + cell_id = list(np.unique(TRACK.cell.values.astype(int))) logging.debug("found cell ids") cell_parent_track_id = [] - for i, id in enumerate(track_id): - + for i, id in enumerate(track_id, start=-1): if len(track_id[int(id)]) == 1: cell_parent_track_id.append(int(i)) diff --git a/tobac/utils.py b/tobac/utils.py index 854bf178..d665d55c 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -932,12 +932,14 @@ def compress_all(nc_grids, min_dims=2, comp_level=4): def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): """ + CAUTION: this function is experimental. No data structures output are guaranteed to be supported in future versions of tobac. + Combine a feature mask with the feature data table into a common dataset. returned by tobac.segmentation with the TrackedFeatures dataset returned by tobac.linking_trackpy. - Also rename the variables to be more desciptive and comply with cf-tree. + Also rename the variables to be more descriptive and comply with cf-tree. Convert the default cell parent ID to an integer table. From 53cf5b35437eee2b9fd21b75ec25a6dff3247ca5 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Mon, 3 Oct 2022 22:04:19 -0400 Subject: [PATCH 17/29] Black formatting --- tobac/merge_split.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 837e8185..509b282a 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -39,7 +39,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): Parent/child variables include: - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. - - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. + - feature_parent_cell_id: The associated parent cell id for each feature. All features in a given cell will have the same cell id. This is the original TRACK cell_id. - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. - track_child_cell_count: The total number of features belonging to all child cells of a given track id. - cell_child_feature_count: The total number of features for each cell. @@ -104,8 +104,8 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): tree_list = list(tree) new_tree = [] - - #Pruning the tree for time limits. + + # Pruning the tree for time limits. for i, j in enumerate(tree_list): frame_a = np.nanmax(track_groups[j[0]].frame.values) frame_b = np.nanmin(track_groups[j[1]].frame.values) @@ -119,7 +119,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): ) track_id = dict() # same size as number of total merged tracks - #Cleaning up tracks, combining tracks which contain the same cells. + # Cleaning up tracks, combining tracks which contain the same cells. 
arr = np.array([0]) for p in cell_id: j = np.where(arr == int(p)) From f23b6b3321d46c407e765ffaf038d88f8bd238f4 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Tue, 11 Oct 2022 11:37:13 -0500 Subject: [PATCH 18/29] Testing update --- tobac/merge_split.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 509b282a..911b58cd 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -226,11 +226,6 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): assert len(cell_id) == len(cell_parent_track_id) assert len(feature_id) == len(feature_parent_cell_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum( - [ - sum(cell_child_feature_count[1:]), - (len(np.where(feature_parent_track_id < 0)[0])), - ] - ) == len(feature_id) + assert sum(cell_child_feature_count) == len(feature_id) return d From dd88c30d6d4a94c87405e84accd0cd4b48c9e5a2 Mon Sep 17 00:00:00 2001 From: Juli Date: Tue, 25 Oct 2022 09:34:16 +0200 Subject: [PATCH 19/29] removed redundant text and fixed formatting for RTD page --- doc/merge_split.rst | 43 ++++++++------------------------- doc/merge_split_output_vars.rst | 9 ------- 2 files changed, 10 insertions(+), 42 deletions(-) delete mode 100644 doc/merge_split_output_vars.rst diff --git a/doc/merge_split.rst b/doc/merge_split.rst index b01168c1..6ea3b0ce 100644 --- a/doc/merge_split.rst +++ b/doc/merge_split.rst @@ -1,58 +1,35 @@ -*Merge and Split +Merge and Split ====================== - This submodule is a post processing step to address tracked cells which merge/split. The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. This submodule will label merged/split cells with a TRACK number in addition to its CELL number. Features, cells, and tracks are combined using parent/child nomenclature. (quick note on terms; “feature” is a detected object at a single time step. “cell” is a series of features linked together over multiple timesteps. "track" may be an individual cell or series of cells which have merged and/or split.) -*Merge and Split -====================== - - -This submodule is a post processing step to address tracked cells which merge/split. -The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. -This submodule will label merged/split cells with a TRACK number in addition to its CELL number. -Features, cells, and tracks are combined using parent/child nomenclature. -(quick note on terms; “feature” is a detected object at a single time step. “cell” is a series of features linked together over multiple timesteps. "track" may be an individual cell or series of cells which have merged and/or split.) +Overview of the output dataframe from merge_split +d : `xarray.core.dataset.Dataset` -Overview of the output dataframe from merge_split -d : xarray.core.dataset.Dataset - xarray dataset of tobac merge/split cells with parent and child designations. +xarray dataset of tobac merge/split cells with parent and child designations. Parent/child variables include: -- cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. -- feature_parent_cell_id: The associated parent cell id for each feature. 
All features in a given cell will have the same cell id.

* feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number.

* track_child_cell_count: The total number of features belonging to all child cells of a given track id.

* cell_child_feature_count: The total number of features for each cell.


Example usage:

``d = merge_split_cells(Track, dxy)``

merge_split outputs an `xarray` dataset with several variables. The variables, (with column names listed in the `Variable Name` column), are described below with units. Coordinates and dataset dimensions are Feature, Cell, and Track.

Variables that are common to all feature detection files:

.. csv-table:: tobac Merge_Split Track Output Variables
   :file: ./merge_split_out_vars.csv
   :widths: 3, 35, 3, 3
   :header-rows: 1
diff --git a/doc/merge_split_output_vars.rst b/doc/merge_split_output_vars.rst
deleted file mode 100644
index a17ae327..00000000
--- a/doc/merge_split_output_vars.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-merge_split outputs an `xarray` dataset with several variables. The variables, (with column names listed in the `Variable Name` column), are described below with units. Coordinates and dataset dimensions are Feature, Cell, and Track.
-
-Variables that are common to all feature detection files:
-
-.. csv-table:: tobac Merge_Split Track Output Variables
-   :file: ./merge_split_out_vars.csv
-   :widths: 3, 35, 3, 3
-   :header-rows: 1
-

From 9c0e33ecfed057fa27881508f1fd6a3c07ae75d1 Mon Sep 17 00:00:00 2001
From: Sean Freeman
Date: Mon, 31 Oct 2022 09:59:55 -0500
Subject: [PATCH 20/29] black formatting

---
 tobac/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tobac/utils.py b/tobac/utils.py
index 32f34d4b..0ee24a8d 100644
--- a/tobac/utils.py
+++ b/tobac/utils.py
@@ -895,6 +895,7 @@ def spectral_filtering(
     else:
         return filtered_field

+
 def compress_all(nc_grids, min_dims=2, comp_level=4):
     """
     The purpose of this subroutine is to compress the netcdf variables as they are saved.
@@ -1067,6 +1068,7 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): return ds + def combine_tobac_feats(list_of_feats, preserve_old_feat_nums=None): """Function to combine a list of tobac feature detection dataframes into one combined dataframe that can be used for tracking From 7e626529fda3f357d1cad3447b404494729df22a Mon Sep 17 00:00:00 2001 From: Sean Freeman Date: Mon, 31 Oct 2022 11:01:01 -0500 Subject: [PATCH 21/29] added merge_split to init; changed start track number to 0 from -1 --- tobac/__init__.py | 1 + tobac/merge_split.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tobac/__init__.py b/tobac/__init__.py index f1aeb95e..6d7627bf 100644 --- a/tobac/__init__.py +++ b/tobac/__init__.py @@ -74,6 +74,7 @@ from .tracking import linking_trackpy from .wrapper import maketrack from .wrapper import tracking_wrapper +from . import merge_split # Set version number __version__ = "1.4.0" diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 911b58cd..02ff5804 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -160,7 +160,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): cell_parent_track_id = [] - for i, id in enumerate(track_id, start=-1): + for i, id in enumerate(track_id, start=0): if len(track_id[int(id)]) == 1: cell_parent_track_id.append(int(i)) From a40b27d754835e0cc701f2f496968eebc557d061 Mon Sep 17 00:00:00 2001 From: Sean Freeman Date: Mon, 31 Oct 2022 11:10:10 -0500 Subject: [PATCH 22/29] added basic merge test --- tobac/tests/test_merge_split.py | 96 +++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 tobac/tests/test_merge_split.py diff --git a/tobac/tests/test_merge_split.py b/tobac/tests/test_merge_split.py new file mode 100644 index 00000000..afc817ab --- /dev/null +++ b/tobac/tests/test_merge_split.py @@ -0,0 +1,96 @@ +"""Module to test tobac.merge_split +""" + +import pandas as pd +import numpy as np + +import datetime + +import tobac.testing as tbtest +import tobac.tracking as tbtrack +import tobac.merge_split as mergesplit + + +def test_merge_split_cells(): + """Tests tobac.merge_split.merge_split_cells by + generating two cells, colliding them into one another, + and merging them. 
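+
+    The synthetic features are built with tbtest.generate_single_feature and
+    linked into cells with tobac.tracking.linking_trackpy before the merge/split
+    postprocessing is applied; a third cell placed downstream of the collision
+    point provides the post-merge features, and the merge distance threshold
+    tested below is derived from the frame-1 feature separation.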
+ """ + + cell_1 = tbtest.generate_single_feature( + 1, + 1, + min_h1=0, + max_h1=100, + min_h2=0, + max_h2=100, + frame_start=0, + num_frames=2, + spd_h1=20, + spd_h2=20, + start_date=datetime.datetime(2022, 1, 1, 0), + ) + cell_1["feature"] = [1, 3] + + cell_2 = tbtest.generate_single_feature( + 1, + 100, + min_h1=0, + max_h1=100, + min_h2=0, + max_h2=100, + frame_start=0, + num_frames=2, + spd_h1=20, + spd_h2=-20, + start_date=datetime.datetime(2022, 1, 1, 0), + ) + cell_2["feature"] = [2, 4] + cell_3 = tbtest.generate_single_feature( + 30, + 50, + min_h1=0, + max_h1=100, + min_h2=0, + max_h2=100, + frame_start=2, + num_frames=2, + spd_h1=20, + spd_h2=0, + start_date=datetime.datetime(2022, 1, 1, 0, 10), + ) + cell_3["feature"] = [5, 6] + features = pd.concat([cell_1, cell_2, cell_3]) + output = tbtrack.linking_trackpy(features, None, 1, 1, v_max=40) + + dist_between = np.sqrt( + np.power( + output[output["frame"] == 1].iloc[0]["hdim_1"] + - output[output["frame"] == 1].iloc[1]["hdim_1"], + 2, + ) + + np.power( + output[output["frame"] == 1].iloc[0]["hdim_2"] + - output[output["frame"] == 1].iloc[1]["hdim_2"], + 2, + ) + ) + + # Test a successful merger + mergesplit_output_merged = mergesplit.merge_split_cells( + output, dxy=1, distance=dist_between + 50 + ) + + # These cells should have merged together. + assert len(mergesplit_output_merged["track"]) == 1 + + # Test an unsuccessful merger. + mergesplit_output_unmerged = mergesplit.merge_split_cells( + output, dxy=1, distance=dist_between - 50 + ) + + # These cells should NOT have merged together. + print(mergesplit_output_unmerged["track"]) + assert len(mergesplit_output_unmerged["track"]) == 2 + + pass From fa09e968633204dfdb652bcb5f531d6e29395bce Mon Sep 17 00:00:00 2001 From: Sean Freeman Date: Mon, 31 Oct 2022 11:27:38 -0500 Subject: [PATCH 23/29] removed final pass --- tobac/tests/test_merge_split.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tobac/tests/test_merge_split.py b/tobac/tests/test_merge_split.py index afc817ab..fbc51f88 100644 --- a/tobac/tests/test_merge_split.py +++ b/tobac/tests/test_merge_split.py @@ -92,5 +92,3 @@ def test_merge_split_cells(): # These cells should NOT have merged together. print(mergesplit_output_unmerged["track"]) assert len(mergesplit_output_unmerged["track"]) == 2 - - pass From 5354ebee79a617db71357ceb6f32fc429c82c2c6 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Fri, 4 Nov 2022 21:31:27 -0500 Subject: [PATCH 24/29] Update dimension names, correct the merged track numbering, etc. This update also includes a fix to an existing error in the merge_split algorithm that incorrectly multiplied the dimension and not values. --- tobac/merge_split.py | 28 ++++++++++++++++------------ tobac/utils.py | 2 +- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 02ff5804..23ec76f9 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -8,7 +8,7 @@ """ -def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): +def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): """ function to postprocess tobac track data for merge/split cells @@ -25,7 +25,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): distance : float, optional Distance threshold determining how close two features must be in order to consider merge/splitting. - Default is 25000 meters. + Default is 10x the x/y grid spacing of the data, given in dxy. 
frame_len : float, optional Threshold for the maximum number of frames that can separate the end of cell and the start of a related cell. @@ -53,7 +53,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) """ - + print('update') try: import networkx as nx except ImportError: @@ -71,6 +71,9 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): first = track_groups.first() last = track_groups.last() + if distance == None: + distance = dxy*25. + a_names = list() b_names = list() dist = list() @@ -84,7 +87,8 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): b_xy = np.zeros((l, 2)) b_xy[:, 0] = first["hdim_2"].values * dxy b_xy[:, 1] = first["hdim_1"].values * dxy - + a_xy = a_xy*dxy + b_xy = b_xy*dxy # Use cdist to find distance matrix out = cdist(a_xy, b_xy) # Find all cells under the distance threshold @@ -134,7 +138,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): temp1 = list(np.unique(new_tree_arr[k[0]])) temp = list(np.unique(new_tree_arr[k[0]])) - for l in range(15): + for l in range(len(cell_id)): for i in temp1: k2 = np.where(new_tree_arr == i) temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) @@ -204,9 +208,9 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): cell_child_feature_count.append(len(track_groups[id].feature.values)) logging.debug("found cell child feature count") - track_dim = "tracks" - cell_dim = "cells" - feature_dim = "features" + track_dim = "track" + cell_dim = "cell" + feature_dim = "feature" d = xr.Dataset( { @@ -223,9 +227,9 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): d = d.set_coords(["feature", "cell", "track"]) - assert len(cell_id) == len(cell_parent_track_id) - assert len(feature_id) == len(feature_parent_cell_id) - assert sum(track_child_cell_count) == len(cell_id) - assert sum(cell_child_feature_count) == len(feature_id) +# assert len(cell_id) == len(cell_parent_track_id) +# assert len(feature_id) == len(feature_parent_cell_id) +# assert sum(track_child_cell_count) == len(cell_id) +# assert sum(cell_child_feature_count) == len(feature_id) return d diff --git a/tobac/utils.py b/tobac/utils.py index 0ee24a8d..bedcde1c 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -1012,7 +1012,7 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", ), "feature": ( - "feature_id", + "feature", "Unique number of the feature; starts from 1 and increments by 1 to the number of features", ), "time": ( From 9e722f8a4d49ea5b863300d14199958eb00afda4 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Fri, 4 Nov 2022 21:34:39 -0500 Subject: [PATCH 25/29] Updated for formatting --- tobac/merge_split.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 23ec76f9..72fe2f7b 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -53,7 +53,7 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) """ - print('update') + print("update") try: import networkx as nx except ImportError: @@ -72,8 +72,8 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): last = track_groups.last() if distance == None: - distance = dxy*25. 
-
+        distance = dxy * 25.0
+
     a_names = list()
     b_names = list()
     dist = list()
@@ -87,8 +87,8 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5):
     b_xy = np.zeros((l, 2))
     b_xy[:, 0] = first["hdim_2"].values * dxy
     b_xy[:, 1] = first["hdim_1"].values * dxy
-    a_xy = a_xy*dxy
-    b_xy = b_xy*dxy
+    a_xy = a_xy * dxy
+    b_xy = b_xy * dxy
     # Use cdist to find distance matrix
     out = cdist(a_xy, b_xy)
     # Find all cells under the distance threshold
@@ -227,9 +227,9 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5):
 
     d = d.set_coords(["feature", "cell", "track"])
 
-# assert len(cell_id) == len(cell_parent_track_id)
-# assert len(feature_id) == len(feature_parent_cell_id)
-# assert sum(track_child_cell_count) == len(cell_id)
-# assert sum(cell_child_feature_count) == len(feature_id)
+    # assert len(cell_id) == len(cell_parent_track_id)
+    # assert len(feature_id) == len(feature_parent_cell_id)
+    # assert sum(track_child_cell_count) == len(cell_id)
+    # assert sum(cell_child_feature_count) == len(feature_id)
 
     return d

From 019d0f344b1a0802e40e8779e6012852a4c19802 Mon Sep 17 00:00:00 2001
From: kelcyno <88055123+kelcyno@users.noreply.github.com>
Date: Sat, 5 Nov 2022 08:45:29 -0500
Subject: [PATCH 26/29] Removed unnecessary print statement

---
 tobac/merge_split.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tobac/merge_split.py b/tobac/merge_split.py
index 72fe2f7b..2bac5647 100644
--- a/tobac/merge_split.py
+++ b/tobac/merge_split.py
@@ -53,7 +53,6 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5):
     both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc'))
 
     """
-    print("update")
     try:
         import networkx as nx
     except ImportError:

From ef48aa9c6c26fd4e14008e34281f60f6db0c8ffd Mon Sep 17 00:00:00 2001
From: Sean Freeman
Date: Mon, 7 Nov 2022 08:00:17 -0600
Subject: [PATCH 27/29] added merge_split to the API reference

---
 doc/tobac.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/tobac.rst b/doc/tobac.rst
index 6f7f4ccb..ac0d88fe 100644
--- a/doc/tobac.rst
+++ b/doc/tobac.rst
@@ -28,6 +28,14 @@ tobac.feature\_detection module
    :undoc-members:
    :show-inheritance:
 
+tobac.merge_split module
+------------------------
+
+.. automodule:: tobac.merge_split
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 tobac.plotting module
 ---------------------

From 4b1101568197b224eaba2d5a65aab0337ad2f5bb Mon Sep 17 00:00:00 2001
From: kelcyno <88055123+kelcyno@users.noreply.github.com>
Date: Mon, 7 Nov 2022 12:37:08 -0600
Subject: [PATCH 28/29] Updates with documentation and removing duplicated dxy
 lines in merge_split.

---
 doc/merge_split.rst          | 2 +-
 doc/merge_split_out_vars.csv | 2 +-
 tobac/merge_split.py         | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/merge_split.rst b/doc/merge_split.rst
index 6ea3b0ce..31811661 100644
--- a/doc/merge_split.rst
+++ b/doc/merge_split.rst
@@ -6,7 +6,7 @@ The first iteration of this module is to combine the cells which are merging but
 This submodule will label merged/split cells with a TRACK number in addition to its CELL number.
 Features, cells, and tracks are combined using parent/child nomenclature.
 
-(quick note on terms; “feature” is a detected object at a single time step. “cell” is a series of features linked together over multiple timesteps. "track" may be an individual cell or series of cells which have merged and/or split.)
+(quick note on terms; “feature” is a detected object at a single time step (see :doc:`feature_detection_overview`).
“cell” is a series of features linked together over multiple timesteps (see :doc:`linking`). "track" may be an individual cell or series of cells which have merged and/or split.) Overview of the output dataframe from merge_split diff --git a/doc/merge_split_out_vars.csv b/doc/merge_split_out_vars.csv index 38b86679..b4b6069c 100644 --- a/doc/merge_split_out_vars.csv +++ b/doc/merge_split_out_vars.csv @@ -1,6 +1,6 @@ Variable Name,Description,Units,Type feature,Unique number of the feature; starts from 1 and increments by 1 to the number of features identified in all frames,n/a,int64 -Cell,Tracked cell number; generally starts from 1. Untracked cell value is -1.,n/a,int64 +cell,Tracked cell number; generally starts from 1. Untracked cell value is -1.,n/a,int64 track,Unique number of the track; starts from 0 and increments by 1 to the number of tracks identified. Untracked cells and features have a track id of -1.,n/a,int64 cell_parent_track_id,"The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id.",n/a,int64 feature_parent_cell_id,The associated parent cell id for each feature. All feature in a given cell will have the same cell id.,n/a,int64 diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 2bac5647..1dbe2ae4 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -25,7 +25,8 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): distance : float, optional Distance threshold determining how close two features must be in order to consider merge/splitting. - Default is 10x the x/y grid spacing of the data, given in dxy. + Default is 25x the x/y grid spacing of the data, given in dxy. + The distance should be in units of meters. frame_len : float, optional Threshold for the maximum number of frames that can separate the end of cell and the start of a related cell. @@ -70,7 +71,7 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): first = track_groups.first() last = track_groups.last() - if distance == None: + if distance is None: distance = dxy * 25.0 a_names = list() @@ -86,8 +87,7 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): b_xy = np.zeros((l, 2)) b_xy[:, 0] = first["hdim_2"].values * dxy b_xy[:, 1] = first["hdim_1"].values * dxy - a_xy = a_xy * dxy - b_xy = b_xy * dxy + # Use cdist to find distance matrix out = cdist(a_xy, b_xy) # Find all cells under the distance threshold From 800fdfe6fdea261d742595762340879e749e4a0e Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Tue, 8 Nov 2022 11:23:52 -0600 Subject: [PATCH 29/29] Renamed the function using the postfix MEST, and corrected a track numbering error which incorrectly numbered tracks for merged cells. --- doc/merge_split.rst | 3 ++- tobac/merge_split.py | 35 ++++++++++++--------------------- tobac/tests/test_merge_split.py | 6 +++--- 3 files changed, 18 insertions(+), 26 deletions(-) diff --git a/doc/merge_split.rst b/doc/merge_split.rst index 31811661..b6b56cbd 100644 --- a/doc/merge_split.rst +++ b/doc/merge_split.rst @@ -3,6 +3,7 @@ Merge and Split This submodule is a post processing step to address tracked cells which merge/split. The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. 
+This module uses a minimum Euclidean spanning tree to combine merging cells, thus the postfix for the function is MEST.
 This submodule will label merged/split cells with a TRACK number in addition to its CELL number.
 Features, cells, and tracks are combined using parent/child nomenclature.
@@ -29,7 +30,7 @@ Parent/child variables include:
 
 Example usage:
 
-``d = merge_split(Track)``
+``d = merge_split_MEST(Track)``
 
 merge_split outputs an `xarray` dataset with several variables. The variables (with column names listed in the `Variable Name` column) are described below with units. Coordinates and dataset dimensions are Feature, Cell, and Track.
diff --git a/tobac/merge_split.py b/tobac/merge_split.py
index 1dbe2ae4..6e53eb97 100644
--- a/tobac/merge_split.py
+++ b/tobac/merge_split.py
@@ -8,9 +8,9 @@
 """
 
-def merge_split_cells(TRACK, dxy, distance=None, frame_len=5):
+def merge_split_MEST(TRACK, dxy, distance=None, frame_len=5):
     """
-    function to postprocess tobac track data for merge/split cells
+    function to postprocess tobac track data for merge/split cells using a minimum Euclidean spanning tree
 
     Parameters
 
@@ -47,7 +47,7 @@ def merge_split_MEST(TRACK, dxy, distance=None, frame_len=5):
 
     Example usage:
-    d = merge_split(Track)
+    d = merge_split_MEST(Track)
     ds = tobac.utils.standardize_track_dataset(Track, refl_mask)
     both_ds = xr.merge([ds, d],compat ='override')
     both_ds = tobac.utils.compress_all(both_ds)
@@ -87,7 +87,6 @@ def merge_split_MEST(TRACK, dxy, distance=None, frame_len=5):
     b_xy = np.zeros((l, 2))
     b_xy[:, 0] = first["hdim_2"].values * dxy
     b_xy[:, 1] = first["hdim_1"].values * dxy
-
     # Use cdist to find distance matrix
     out = cdist(a_xy, b_xy)
     # Find all cells under the distance threshold
@@ -161,16 +160,13 @@ def merge_split_MEST(TRACK, dxy, distance=None, frame_len=5):
     cell_id = list(np.unique(TRACK.cell.values.astype(int)))
     logging.debug("found cell ids")
 
-    cell_parent_track_id = []
+    cell_parent_track_id = np.zeros(len(cell_id))
+    cell_parent_track_id[:] = -1
 
     for i, id in enumerate(track_id, start=0):
-        if len(track_id[int(id)]) == 1:
-            cell_parent_track_id.append(int(i))
-
-        else:
-            cell_parent_track_id.append(np.repeat(int(i), len(track_id[int(id)])))
+        for j in track_id[int(id)]:
+            cell_parent_track_id[cell_id.index(j)] = int(i)
 
-    cell_parent_track_id = list(flatten(cell_parent_track_id))
     logging.debug("found cell parent track ids")
 
     track_ids = np.array(np.unique(cell_parent_track_id))
@@ -179,7 +175,7 @@ def merge_split_MEST(TRACK, dxy, distance=None, frame_len=5):
     feature_parent_cell_id = list(TRACK.cell.values.astype(int))
     logging.debug("found feature parent cell ids")
 
-    # This version includes all the features regardless of whether they are used in cells or not.
+    # # This version includes all the features regardless of whether they are used in cells or not.
feature_id = list(TRACK.feature.values.astype(int)) logging.debug("found feature ids") @@ -190,21 +186,16 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): if cellid < 0: feature_parent_track_id[i] = -1 else: - j = np.where(cell_id == cellid) - j = np.squeeze(j) - trackid = cell_parent_track_id[j] - feature_parent_track_id[i] = trackid - - logging.debug("found feature parent track ids") + feature_parent_track_id[i] = cell_parent_track_id[cell_id.index(cellid)] - track_child_cell_count = [] + track_child_cell_count = np.zeros(len(track_id)) for i, id in enumerate(track_id): - track_child_cell_count.append(len(track_id[int(id)])) + track_child_cell_count[i] = len(np.where(cell_parent_track_id == i)[0]) logging.debug("found track child cell count") - cell_child_feature_count = [] + cell_child_feature_count = np.zeros(len(cell_id)) for i, id in enumerate(cell_id): - cell_child_feature_count.append(len(track_groups[id].feature.values)) + cell_child_feature_count[i] = len(track_groups[id].feature.values) logging.debug("found cell child feature count") track_dim = "track" diff --git a/tobac/tests/test_merge_split.py b/tobac/tests/test_merge_split.py index fbc51f88..389c3dc7 100644 --- a/tobac/tests/test_merge_split.py +++ b/tobac/tests/test_merge_split.py @@ -11,7 +11,7 @@ import tobac.merge_split as mergesplit -def test_merge_split_cells(): +def test_merge_split_MEST(): """Tests tobac.merge_split.merge_split_cells by generating two cells, colliding them into one another, and merging them. @@ -77,7 +77,7 @@ def test_merge_split_cells(): ) # Test a successful merger - mergesplit_output_merged = mergesplit.merge_split_cells( + mergesplit_output_merged = mergesplit.merge_split_MEST( output, dxy=1, distance=dist_between + 50 ) @@ -85,7 +85,7 @@ def test_merge_split_cells(): assert len(mergesplit_output_merged["track"]) == 1 # Test an unsuccessful merger. - mergesplit_output_unmerged = mergesplit.merge_split_cells( + mergesplit_output_unmerged = mergesplit.merge_split_MEST( output, dxy=1, distance=dist_between - 50 )
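Taken together, the series leaves merge_split_MEST as the public entry point. A short end-to-end sketch of the final API, following the example usage in the module docstring above; here `Track` (output of tobac.tracking.linking_trackpy) and `refl_mask` (a segmentation mask) are assumed to already exist, and dxy=500.0 is an illustrative grid spacing rather than a value from these patches:

    import xarray as xr
    import tobac.merge_split as mergesplit
    import tobac.utils as tbutils

    # Label merged/split cells with parent track ids; the distance
    # threshold defaults to 25 * dxy (meters) when not given explicitly.
    d = mergesplit.merge_split_MEST(Track, dxy=500.0)

    # Combine the standardized track/mask dataset with the merge/split
    # parent-child table and write a compressed NetCDF, as in the docstring.
    ds = tbutils.standardize_track_dataset(Track, refl_mask)
    both_ds = xr.merge([ds, d], compat="override")
    both_ds = tbutils.compress_all(both_ds)
    both_ds.to_netcdf("Track_features_merges.nc")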