From b156b4521876ef4eb497cfebd9c3f7193e5c0bdf Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Mon, 6 Jun 2022 16:40:46 -0500 Subject: [PATCH 01/29] Merge Split algorithm added This file is a post processing step to the tobac tracking step. This is a first iteration in addressing split/merged cells. --- tobac/merge_split.py | 345 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 tobac/merge_split.py diff --git a/tobac/merge_split.py b/tobac/merge_split.py new file mode 100644 index 00000000..0cbbc7e5 --- /dev/null +++ b/tobac/merge_split.py @@ -0,0 +1,345 @@ +#Tobac merge and split v0.1 + +from geopy.distance import geodesic +from networkx import * +import numpy as np +from pandas.core.common import flatten +import xarray as xr + +def compress_all(nc_grids, min_dims=2): + for var in nc_grids: + if len(nc_grids[var].dims) >= min_dims: + # print("Compressing ", var) + nc_grids[var].encoding["zlib"] = True + nc_grids[var].encoding["complevel"] = 4 + nc_grids[var].encoding["contiguous"] = False + return nc_grids + + +def standardize_track_dataset(TrackedFeatures, Mask, Projection): + """ Combine a feature mask with the feature data table into a common dataset. + Also renames th + + returned by tobac.themes.tobac_v1.segmentation + with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. + + Also rename the variables to be more desciptive and comply with cf-tree. + + Convert the default cell parent ID to an integer table. + + Add a cell dimension to reflect + + Projection is an xarray DataArray + + TODO: Add metadata attributes to + + """ + feature_standard_names = { + # new variable name, and long description for the NetCDF attribute + 'frame':('feature_time_index', + 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), + 'hdim_1':('feature_hdim1_coordinate', + 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' 
+ 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'hdim_2':('feature_hdim2_coordinate', + 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' + 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'idx':('feature_id_this_frame',), + 'num':('feature_grid_cell_count', + 'Number of grid points that are within the threshold of this feature'), + 'threshold_value':('feature_threshold_max', + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), + 'feature':('feature_id', + "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), + 'time':('feature_time','time of the feature, consistent with feature_time_index'), + 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), + 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), + 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), + 'lat':('feature_latitude','latitude of the feature'), + 'lon':('feature_longitude','longitude of the feature'), + 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), + 'areas':('feature_area',), + 'isolated':('feature_isolation_flag',), + 'num_objects':('number_of_feature_neighbors',), + 'cell':('feature_parent_cell_id',), + 'time_cell':('feature_parent_cell_elapsed_time',), + 'segmentation_mask':('2d segmentation mask',) + } + new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys()} + + TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) + # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of + # the 'index' variable and call the dimension 'feature' + ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), + Mask]) + + # Add the projection data back in + ds['ProjectionCoordinateSystem']=Projection + + # Convert the cell ID variable from float to integer + if 'int' not in str(TrackedFeatures.cell.dtype): + # The raw output from the tracking is actually an object array + # array([nan, 2, 3], dtype=object) + # (and is cast to a float array when saved as NetCDF, I think). + # Cast to float. 
+ int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + + cell_id_data = TrackedFeatures.cell.astype('float64') + valid_cell = np.isfinite(cell_id_data) + valid_cell_ids = cell_id_data[valid_cell] + if not (np.unique(valid_cell_ids) > 0).all(): + raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') + int_cell[valid_cell] = valid_cell_ids.astype('int64') + #ds['feature_parent_cell_id'] = int_cell + return ds + + + +def merge_split(TRACK,distance = 25000,frame_len = 5): + ''' + function to postprocess tobac track data for merge/split cells + Input: + TRACK: xarray dataset of tobac Track information + + distance: float, optional distance threshold prior to adding a pair of points + into the minimum spanning tree. Default is 25000 meters. + + frame_len: float, optional threshold for the spanning length within which two points + can be separated. Default is five (5) frames. + + + Output: + d: xarray dataset of + feature position along 1st horizontal dimension + hdim2_index: float + feature position along 2nd horizontal dimension + + Example: + d = merge_split(Track) + ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) + # both_ds = xarray.combine_by_coords((ds,d), compat='override') + both_ds = xr.merge([ds, d],compat ='override') + both_ds = compress_all(both_ds) + both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) + + ''' + track_groups = TRACK.groupby('cell') + cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + id_data = np.fromiter(cell_ids.keys(), dtype=int) + count_data = np.fromiter(cell_ids.values(), dtype=int) + all_frames = np.sort(np.unique(TRACK.frame)) + a_points = list() + b_points = list() + a_names = list() + b_names = list() + dist = list() + + + for i in id_data: + #print(i) + a_pointx = track_groups[i].projection_x_coordinate[-1].values + a_pointy = track_groups[i].projection_y_coordinate[-1].values + for j in id_data: + b_pointx = track_groups[j].projection_x_coordinate[0].values + b_pointy = track_groups[j].projection_y_coordinate[0].values + d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + + + + + +# for i in id_data: +# a_pointx = track_groups[i].grid_longitude[-1].values +# a_pointy = track_groups[i].grid_latitude[-1].values +# for j in id_data: +# b_pointx = track_groups[j].grid_longitude[0].values +# b_pointy = track_groups[j].grid_latitude[0].values +# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km +# if d <= distance: +# a_points.append([a_pointx,a_pointy]) +# b_points.append([b_pointx, b_pointy]) +# dist.append(d) +# a_names.append(i) +# b_names.append(j) + + id = [] + for i in range(len(dist)-1, -1, -1): + if a_names[i] == b_names[i]: + id.append(i) + a_points.pop(i) + b_points.pop(i) + dist.pop(i) + a_names.pop(i) + b_names.pop(i) + else: + continue + + g = Graph() + for i in np.arange(len(dist)): + g.add_edge(a_names[i], b_names[i],weight=dist[i]) + + tree = minimum_spanning_edges(g) + tree_list = list(minimum_spanning_edges(g)) + + new_tree = [] + for i,j in enumerate(tree_list): + frame_a = np.nanmax(track_groups[j[0]].frame.values) + frame_b = np.nanmin(track_groups[j[1]].frame.values) + if np.abs(frame_a - frame_b) <= frame_len: + new_tree.append(tree_list[i][0:2]) + new_tree_arr = np.array(new_tree) + + 
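The block above is the core of the algorithm: each cell id becomes a graph node, an edge connects the end point of one cell's track to the start point of another's whenever the two lie within distance of each other, and only the minimum-spanning-tree edges whose frame gap is at most frame_len are kept as merge/split links. A minimal runnable sketch of the same idea on toy data, where the cells dictionary and its fields are hypothetical stand-ins for the tobac track groups:

    import networkx as nx
    import numpy as np

    # toy cells: start/end positions plus first/last frame of each track
    cells = {
        1: {"start": (0.0, 0.0), "end": (10.0, 0.0), "first": 0, "last": 4},
        2: {"start": (11.0, 0.5), "end": (20.0, 1.0), "first": 5, "last": 9},
        3: {"start": (50.0, 50.0), "end": (60.0, 50.0), "first": 0, "last": 3},
    }
    distance = 5.0  # max end-to-start separation for a candidate link
    frame_len = 5   # max frame gap between linked cells

    g = nx.Graph()
    for i, a in cells.items():
        for j, b in cells.items():
            if i == j:
                continue  # drop self-pairs, as the pop() loop above does
            d = np.linalg.norm(np.array(a["end"]) - np.array(b["start"]))
            if d <= distance:
                g.add_edge(i, j, weight=d)

    # keep only the spanning-tree edges whose frame gap is small enough
    links = [
        (u, v)
        for u, v in nx.minimum_spanning_edges(g, data=False)
        if abs(cells[u]["last"] - cells[v]["first"]) <= frame_len
    ]
    print(links)  # [(1, 2)]: cell 2 continues cell 1; cell 3 stays unlinked

Note that minimum_spanning_edges returns undirected edges, so which endpoint is treated as the earlier cell depends on edge orientation; the same caveat applies to the frame_a/frame_b comparison in the code above.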
TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) + cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) + track_id = dict() #same size as number of total merged tracks + + arr = np.array([0]) + for p in cell_id: + j = np.where(arr == int(p)) + if len(j[0]) > 0: + continue + else: + k = np.where(new_tree_arr == p) + if len(k[0]) == 0: + track_id[p] = [p] + arr = np.append(arr,p) + else: + temp1 = list(np.unique(new_tree_arr[k[0]])) + temp = list(np.unique(new_tree_arr[k[0]])) + + for l in range(15): + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + + if len(temp1) == len(temp): + break + temp1 = np.array(temp) + + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + arr = np.append(arr,np.unique(temp)) + + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + + + storm_id = [0] #default because we don't track larger storm systems *yet* + print('found storm id') + + + track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* + print('found track parent storm ids') + + track_ids = np.array(list(track_id.keys())) + print('found track ids') + + + cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) + print('found cell ids') + + cell_parent_track_id = [] + + for i, id in enumerate(track_id): + + if len(track_id[int(id)]) == 1: + cell_parent_track_id.append(int(id)) + + else: + cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) + + + cell_parent_track_id = list(flatten(cell_parent_track_id)) + print('found cell parent track ids') + + feature_parent_cell_id = list(TRACK.cell.values.astype(float)) + + print('found feature parent cell ids') + + #This version includes all the feature regardless of if they are used in cells or not. 
+ feature_id = list(TRACK.feature.values.astype(int)) + print('found feature ids') + + feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm + print('found feature parent storm ids') + + feature_parent_track_id = [] + feature_parent_track_id = np.zeros(len(feature_id)) + for i, id in enumerate(feature_id): + cellid = feature_parent_cell_id[i] + if np.isnan(cellid): + feature_parent_track_id[i] = -1 + else: + j = np.where(cell_id == cellid) + j = np.squeeze(j) + trackid = cell_parent_track_id[j] + feature_parent_track_id[i] = trackid + + print('found feature parent track ids') + + + storm_child_track_count = [len(track_id)] + print('found storm child track count') + + track_child_cell_count = [] + for i,id in enumerate(track_id): + track_child_cell_count.append(len(track_id[int(id)])) + print('found track child cell count') + + + cell_child_feature_count = [] + for i,id in enumerate(cell_id): + cell_child_feature_count.append(len(track_groups[id].feature.values)) + print('found cell child feature count') + + storm_child_cell_count = [len(cell_id)] + storm_child_feature_count = [len(feature_id)] + + storm_dim = 'nstorms' + track_dim = 'ntracks' + cell_dim = 'ncells' + feature_dim = 'nfeatures' + + d = xr.Dataset({ + 'storm_id': (storm_dim, storm_id), + 'track_id': (track_dim, track_ids), + 'track_parent_storm_id': (track_dim, track_parent_storm_id), + 'cell_id': (cell_dim, cell_id), + 'cell_parent_track_id': (cell_dim, cell_parent_track_id), + 'feature_id': (feature_dim, feature_id), + 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), + 'feature_parent_track_id': (feature_dim, feature_parent_track_id), + 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), + 'storm_child_track_count': (storm_dim, storm_child_track_count), + 'storm_child_cell_count': (storm_dim, storm_child_cell_count), + 'storm_child_feature_count': (storm_dim, storm_child_feature_count), + 'track_child_cell_count': (track_dim, track_child_cell_count), + 'cell_child_feature_count': (cell_dim, cell_child_feature_count), + }) + d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + assert len(track_id) == len(track_parent_storm_id) + assert len(cell_id) == len(cell_parent_track_id) + assert len(feature_id) == len(feature_parent_cell_id) + assert sum(storm_child_track_count) == len(track_id) + assert sum(storm_child_cell_count) == len(cell_id) + assert sum(storm_child_feature_count) == len(feature_id) + assert sum(track_child_cell_count) == len(cell_id) + assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) + + return d \ No newline at end of file From b550e3b713b3caaab0be1f5debd6a977ae889b20 Mon Sep 17 00:00:00 2001 From: Juli Date: Thu, 16 Jun 2022 14:22:43 +0200 Subject: [PATCH 02/29] black formatting --- tobac/merge_split.py | 322 ++++++++++++++++++++++++------------------- 1 file changed, 181 insertions(+), 141 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 0cbbc7e5..4183a3f0 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -1,4 +1,4 @@ -#Tobac merge and split v0.1 +# Tobac merge and split v0.1 from geopy.distance import geodesic from networkx import * @@ -6,6 +6,7 @@ from pandas.core.common import flatten import xarray as xr + def compress_all(nc_grids, min_dims=2): for var in nc_grids: if len(nc_grids[var].dims) >= min_dims: @@ -14,7 +15,7 @@ def compress_all(nc_grids, min_dims=2): nc_grids[var].encoding["complevel"] = 4 
nc_grids[var].encoding["contiguous"] = False return nc_grids - + def standardize_track_dataset(TrackedFeatures, Mask, Projection): """ Combine a feature mask with the feature data table into a common dataset. @@ -36,70 +37,107 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection): """ feature_standard_names = { # new variable name, and long description for the NetCDF attribute - 'frame':('feature_time_index', - 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), - 'hdim_1':('feature_hdim1_coordinate', - 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'hdim_2':('feature_hdim2_coordinate', - 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'idx':('feature_id_this_frame',), - 'num':('feature_grid_cell_count', - 'Number of grid points that are within the threshold of this feature'), - 'threshold_value':('feature_threshold_max', - "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), - 'feature':('feature_id', - "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), - 'time':('feature_time','time of the feature, consistent with feature_time_index'), - 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), - 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), - 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), - 'lat':('feature_latitude','latitude of the feature'), - 'lon':('feature_longitude','longitude of the feature'), - 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), - 'areas':('feature_area',), - 'isolated':('feature_isolation_flag',), - 'num_objects':('number_of_feature_neighbors',), - 'cell':('feature_parent_cell_id',), - 'time_cell':('feature_parent_cell_elapsed_time',), - 'segmentation_mask':('2d segmentation mask',) + "frame": ( + "feature_time_index", + "positional index of the feature along the time dimension of the mask, from 0 to N-1", + ), + "hdim_1": ( + "feature_hdim1_coordinate", + "position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x)." 
+ "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "hdim_2": ( + "feature_hdim2_coordinate", + "position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)" + "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "idx": ("feature_id_this_frame",), + "num": ( + "feature_grid_cell_count", + "Number of grid points that are within the threshold of this feature", + ), + "threshold_value": ( + "feature_threshold_max", + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", + ), + "feature": ( + "feature_id", + "Unique number of the feature; starts from 1 and increments by 1 to the number of features", + ), + "time": ( + "feature_time", + "time of the feature, consistent with feature_time_index", + ), + "timestr": ( + "feature_time_str", + "String representation of the feature time, YYYY-MM-DD HH:MM:SS", + ), + "projection_y_coordinate": ( + "feature_projection_y_coordinate", + "y position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "projection_x_coordinate": ( + "feature_projection_x_coordinate", + "x position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "lat": ("feature_latitude", "latitude of the feature"), + "lon": ("feature_longitude", "longitude of the feature"), + "ncells": ( + "feature_ncells", + "number of grid cells for this feature (meaning uncertain)", + ), + "areas": ("feature_area",), + "isolated": ("feature_isolation_flag",), + "num_objects": ("number_of_feature_neighbors",), + "cell": ("feature_parent_cell_id",), + "time_cell": ("feature_parent_cell_elapsed_time",), + "segmentation_mask": ("2d segmentation mask",), + } + new_feature_var_names = { + k: feature_standard_names[k][0] + for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys() } - new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() - if k in TrackedFeatures.variables.keys()} - TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) + TrackedFeatures = TrackedFeatures.drop(["cell_parent_track_id"]) # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of # the 'index' variable and call the dimension 'feature' - ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), - Mask]) + ds = xr.merge( + [ + TrackedFeatures.swap_dims({"index": "feature"}) + .drop("index") + .rename_vars(new_feature_var_names), + Mask, + ] + ) # Add the projection data back in - ds['ProjectionCoordinateSystem']=Projection + ds["ProjectionCoordinateSystem"] = Projection # Convert the cell ID variable from float to integer - if 'int' not in str(TrackedFeatures.cell.dtype): + if "int" not in str(TrackedFeatures.cell.dtype): # The raw output from the tracking is actually an object array # array([nan, 2, 3], dtype=object) # (and is cast to a float array when saved as NetCDF, I think). # Cast to float. 
- int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + int_cell = xr.zeros_like(TrackedFeatures.cell, dtype="int64") - cell_id_data = TrackedFeatures.cell.astype('float64') + cell_id_data = TrackedFeatures.cell.astype("float64") valid_cell = np.isfinite(cell_id_data) valid_cell_ids = cell_id_data[valid_cell] if not (np.unique(valid_cell_ids) > 0).all(): - raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') - int_cell[valid_cell] = valid_cell_ids.astype('int64') - #ds['feature_parent_cell_id'] = int_cell + raise AssertionError( + "Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell" + ) + int_cell[valid_cell] = valid_cell_ids.astype("int64") + # ds['feature_parent_cell_id'] = int_cell return ds - -def merge_split(TRACK,distance = 25000,frame_len = 5): - ''' +def merge_split(TRACK, distance=25000, frame_len=5): + """ function to postprocess tobac track data for merge/split cells Input: TRACK: xarray dataset of tobac Track information @@ -125,9 +163,9 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): both_ds = compress_all(both_ds) both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - ''' - track_groups = TRACK.groupby('cell') - cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + """ + track_groups = TRACK.groupby("cell") + cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} id_data = np.fromiter(cell_ids.keys(), dtype=int) count_data = np.fromiter(cell_ids.values(), dtype=int) all_frames = np.sort(np.unique(TRACK.frame)) @@ -136,43 +174,40 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): a_names = list() b_names = list() dist = list() - - + for i in id_data: - #print(i) + # print(i) a_pointx = track_groups[i].projection_x_coordinate[-1].values a_pointy = track_groups[i].projection_y_coordinate[-1].values for j in id_data: b_pointx = track_groups[j].projection_x_coordinate[0].values b_pointy = track_groups[j].projection_y_coordinate[0].values - d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + d = np.linalg.norm( + np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) + ) if d <= distance: - a_points.append([a_pointx,a_pointy]) + a_points.append([a_pointx, a_pointy]) b_points.append([b_pointx, b_pointy]) dist.append(d) a_names.append(i) b_names.append(j) - - - - -# for i in id_data: -# a_pointx = track_groups[i].grid_longitude[-1].values -# a_pointy = track_groups[i].grid_latitude[-1].values -# for j in id_data: -# b_pointx = track_groups[j].grid_longitude[0].values -# b_pointy = track_groups[j].grid_latitude[0].values -# d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km -# if d <= distance: -# a_points.append([a_pointx,a_pointy]) -# b_points.append([b_pointx, b_pointy]) -# dist.append(d) -# a_names.append(i) -# b_names.append(j) + # for i in id_data: + # a_pointx = track_groups[i].grid_longitude[-1].values + # a_pointy = track_groups[i].grid_latitude[-1].values + # for j in id_data: + # b_pointx = track_groups[j].grid_longitude[0].values + # b_pointy = track_groups[j].grid_latitude[0].values + # d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km + # if d <= distance: + # a_points.append([a_pointx,a_pointy]) + # b_points.append([b_pointx, b_pointy]) + # dist.append(d) + # a_names.append(i) + # b_names.append(j) id = [] - for i in range(len(dist)-1, -1, -1): + for i in range(len(dist) - 1, -1, -1): if a_names[i] == b_names[i]: id.append(i) 
a_points.pop(i) @@ -182,25 +217,27 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): b_names.pop(i) else: continue - + g = Graph() for i in np.arange(len(dist)): - g.add_edge(a_names[i], b_names[i],weight=dist[i]) + g.add_edge(a_names[i], b_names[i], weight=dist[i]) tree = minimum_spanning_edges(g) tree_list = list(minimum_spanning_edges(g)) new_tree = [] - for i,j in enumerate(tree_list): + for i, j in enumerate(tree_list): frame_a = np.nanmax(track_groups[j[0]].frame.values) frame_b = np.nanmin(track_groups[j[1]].frame.values) if np.abs(frame_a - frame_b) <= frame_len: new_tree.append(tree_list[i][0:2]) new_tree_arr = np.array(new_tree) - TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) - cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) - track_id = dict() #same size as number of total merged tracks + TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) + cell_id = np.unique( + TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + ) + track_id = dict() # same size as number of total merged tracks arr = np.array([0]) for p in cell_id: @@ -211,7 +248,7 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): k = np.where(new_tree_arr == p) if len(k[0]) == 0: track_id[p] = [p] - arr = np.append(arr,p) + arr = np.append(arr, p) else: temp1 = list(np.unique(new_tree_arr[k[0]])) temp = list(np.unique(new_tree_arr[k[0]])) @@ -226,112 +263,113 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): if len(temp1) == len(temp): break temp1 = np.array(temp) - + for i in temp1: k2 = np.where(new_tree_arr == i) temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) temp = list(flatten(temp)) temp = list(np.unique(temp)) - arr = np.append(arr,np.unique(temp)) - - track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - - + arr = np.append(arr, np.unique(temp)) - storm_id = [0] #default because we don't track larger storm systems *yet* - print('found storm id') + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + storm_id = [0] # default because we don't track larger storm systems *yet* + print("found storm id") - track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* - print('found track parent storm ids') + track_parent_storm_id = np.repeat( + 0, len(track_id) + ) # This will always be zero when we don't track larger storm systems *yet* + print("found track parent storm ids") track_ids = np.array(list(track_id.keys())) - print('found track ids') + print("found track ids") - - cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) - print('found cell ids') + cell_id = list( + np.unique( + TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + ) + ) + print("found cell ids") cell_parent_track_id = [] for i, id in enumerate(track_id): - - if len(track_id[int(id)]) == 1: + + if len(track_id[int(id)]) == 1: cell_parent_track_id.append(int(id)) else: - cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) - + cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) cell_parent_track_id = list(flatten(cell_parent_track_id)) - print('found cell parent track ids') + print("found cell parent track ids") feature_parent_cell_id = list(TRACK.cell.values.astype(float)) - print('found feature parent cell ids') + print("found feature parent cell ids") - #This version includes all the feature regardless of if 
they are used in cells or not. + # This version includes all the feature regardless of if they are used in cells or not. feature_id = list(TRACK.feature.values.astype(int)) - print('found feature ids') + print("found feature ids") - feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm - print('found feature parent storm ids') + feature_parent_storm_id = np.repeat(0, len(feature_id)) # we don't do storms atm + print("found feature parent storm ids") - feature_parent_track_id = [] + feature_parent_track_id = [] feature_parent_track_id = np.zeros(len(feature_id)) for i, id in enumerate(feature_id): cellid = feature_parent_cell_id[i] if np.isnan(cellid): - feature_parent_track_id[i] = -1 + feature_parent_track_id[i] = -1 else: j = np.where(cell_id == cellid) j = np.squeeze(j) trackid = cell_parent_track_id[j] feature_parent_track_id[i] = trackid - - print('found feature parent track ids') + print("found feature parent track ids") storm_child_track_count = [len(track_id)] - print('found storm child track count') + print("found storm child track count") track_child_cell_count = [] - for i,id in enumerate(track_id): + for i, id in enumerate(track_id): track_child_cell_count.append(len(track_id[int(id)])) - print('found track child cell count') - + print("found track child cell count") cell_child_feature_count = [] - for i,id in enumerate(cell_id): + for i, id in enumerate(cell_id): cell_child_feature_count.append(len(track_groups[id].feature.values)) - print('found cell child feature count') + print("found cell child feature count") - storm_child_cell_count = [len(cell_id)] + storm_child_cell_count = [len(cell_id)] storm_child_feature_count = [len(feature_id)] - - storm_dim = 'nstorms' - track_dim = 'ntracks' - cell_dim = 'ncells' - feature_dim = 'nfeatures' - - d = xr.Dataset({ - 'storm_id': (storm_dim, storm_id), - 'track_id': (track_dim, track_ids), - 'track_parent_storm_id': (track_dim, track_parent_storm_id), - 'cell_id': (cell_dim, cell_id), - 'cell_parent_track_id': (cell_dim, cell_parent_track_id), - 'feature_id': (feature_dim, feature_id), - 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), - 'feature_parent_track_id': (feature_dim, feature_parent_track_id), - 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), - 'storm_child_track_count': (storm_dim, storm_child_track_count), - 'storm_child_cell_count': (storm_dim, storm_child_cell_count), - 'storm_child_feature_count': (storm_dim, storm_child_feature_count), - 'track_child_cell_count': (track_dim, track_child_cell_count), - 'cell_child_feature_count': (cell_dim, cell_child_feature_count), - }) - d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + storm_dim = "nstorms" + track_dim = "ntracks" + cell_dim = "ncells" + feature_dim = "nfeatures" + + d = xr.Dataset( + { + "storm_id": (storm_dim, storm_id), + "track_id": (track_dim, track_ids), + "track_parent_storm_id": (track_dim, track_parent_storm_id), + "cell_id": (cell_dim, cell_id), + "cell_parent_track_id": (cell_dim, cell_parent_track_id), + "feature_id": (feature_dim, feature_id), + "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), + "feature_parent_track_id": (feature_dim, feature_parent_track_id), + "feature_parent_storm_id": (feature_dim, feature_parent_storm_id), + "storm_child_track_count": (storm_dim, storm_child_track_count), + "storm_child_cell_count": (storm_dim, storm_child_cell_count), + "storm_child_feature_count": (storm_dim, storm_child_feature_count), + "track_child_cell_count": 
(track_dim, track_child_cell_count), + "cell_child_feature_count": (cell_dim, cell_child_feature_count), + } + ) + d = d.set_coords(["feature_id", "cell_id", "track_id", "storm_id"]) assert len(track_id) == len(track_parent_storm_id) assert len(cell_id) == len(cell_parent_track_id) @@ -340,6 +378,8 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): assert sum(storm_child_cell_count) == len(cell_id) assert sum(storm_child_feature_count) == len(feature_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) - - return d \ No newline at end of file + assert sum( + [sum(cell_child_feature_count), (len(np.where(feature_parent_track_id < 0)[0]))] + ) == len(feature_id) + + return d From edc5f25bdf60021d2bfa98dbf115fe16a87eefcf Mon Sep 17 00:00:00 2001 From: Juli Date: Thu, 16 Jun 2022 14:39:40 +0200 Subject: [PATCH 03/29] black formatting --- tobac/merge_split.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 4183a3f0..2f4915a7 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -18,7 +18,7 @@ def compress_all(nc_grids, min_dims=2): def standardize_track_dataset(TrackedFeatures, Mask, Projection): - """ Combine a feature mask with the feature data table into a common dataset. + """Combine a feature mask with the feature data table into a common dataset. Also renames th returned by tobac.themes.tobac_v1.segmentation @@ -34,7 +34,7 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection): TODO: Add metadata attributes to - """ + """ feature_standard_names = { # new variable name, and long description for the NetCDF attribute "frame": ( @@ -141,20 +141,20 @@ def merge_split(TRACK, distance=25000, frame_len=5): function to postprocess tobac track data for merge/split cells Input: TRACK: xarray dataset of tobac Track information - + distance: float, optional distance threshold prior to adding a pair of points into the minimum spanning tree. Default is 25000 meters. - + frame_len: float, optional threshold for the spanning length within which two points can be separated. Default is five (5) frames. Output: - d: xarray dataset of + d: xarray dataset of feature position along 1st horizontal dimension hdim2_index: float feature position along 2nd horizontal dimension - + Example: d = merge_split(Track) ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) @@ -162,7 +162,7 @@ def merge_split(TRACK, distance=25000, frame_len=5): both_ds = xr.merge([ds, d],compat ='override') both_ds = compress_all(both_ds) both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - + """ track_groups = TRACK.groupby("cell") cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} From d1f23b48effa041fb7e8636417b6745354f5f7b0 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sat, 2 Jul 2022 21:36:49 -0600 Subject: [PATCH 04/29] Update merge_split.py Updated changes to merge/split based on suggestions for lat/lon convection, and documentation. 
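For reference, the workflow documented in the Example section of the merge_split docstring corresponds to the following end-to-end sketch. It assumes that Track, refl_mask, and data come from earlier tobac feature detection, segmentation, and tracking steps, that the module is importable as tobac.merge_split, and that savedir is an existing output directory:

    import os
    import xarray as xr
    from tobac.merge_split import compress_all, merge_split, standardize_track_dataset

    d = merge_split(Track, distance=25000, frame_len=5)
    ds = standardize_track_dataset(Track, refl_mask, data["ProjectionCoordinateSystem"])

    # compat="override" keeps the coordinates of the first dataset where the two overlap
    both_ds = xr.merge([ds, d], compat="override")
    both_ds = compress_all(both_ds)  # set zlib encoding on >=2-D variables
    both_ds.to_netcdf(os.path.join(savedir, "Track_features_merges.nc"))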
--- tobac/merge_split.py | 467 +++++++++++++++++++++++-------------------- 1 file changed, 249 insertions(+), 218 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 2f4915a7..60e929be 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -1,4 +1,13 @@ -# Tobac merge and split v0.1 +''' + Tobac merge and split v0.1 + This submodule is a post processing step to address tracked cells which merge/split. + The first iteration of this module is to combine the cells which are merging but receive + a new cell id once merged. In general this submodule will label these three cell-ids using + the largest cell number of those within the merged/split cell ids. + + +''' + from geopy.distance import geodesic from networkx import * @@ -6,8 +15,30 @@ from pandas.core.common import flatten import xarray as xr - def compress_all(nc_grids, min_dims=2): + ''' + The purpose of this subroutine is to compress the netcdf variables as they are saved + this does not change the data, but sets netcdf encoding parameters. + We allocate a minimum number of dimensions as variables with dimensions + under the minimum value do not benefit from tangibly from this encoding. + + Parameters + ---------- + nc_grids : xarray.core.dataset.Dataset + Xarray dataset that is intended to be exported as netcdf + + min_dims : integer + The minimum number of dimesnions, in integer value, a variable must have in order + set the netcdf compression encoding. + + Returns + ------- + nc_grids : xarray.core.dataset.Dataset + Xarray dataset with netcdf compression encoding for variables with two (2) or more dimensions + + ''' + + for var in nc_grids: if len(nc_grids[var].dims) >= min_dims: # print("Compressing ", var) @@ -15,11 +46,11 @@ def compress_all(nc_grids, min_dims=2): nc_grids[var].encoding["complevel"] = 4 nc_grids[var].encoding["contiguous"] = False return nc_grids + - -def standardize_track_dataset(TrackedFeatures, Mask, Projection): - """Combine a feature mask with the feature data table into a common dataset. - Also renames th +def standardize_track_dataset(TrackedFeatures, Mask, Projection = None): + ''' + Combine a feature mask with the feature data table into a common dataset. returned by tobac.themes.tobac_v1.segmentation with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. @@ -32,140 +63,143 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection): Projection is an xarray DataArray - TODO: Add metadata attributes to - - """ + TODO: Add metadata attributes + + Parameters + ---------- + TrackedFeatures : xarray.core.dataset.Dataset + xarray dataset of tobac Track information, the xarray dataset returned by tobac.tracking.linking_trackpy + + Mask: xarray.core.dataset.Dataset + xarray dataset of tobac segmentation mask information, the xarray dataset returned + by tobac.segmentation.segmentation + + + Projection : xarray.core.dataarray.DataArray, default = None + array.DataArray of the original input dataset (gridded nexrad data for example). 
+ If using gridded nexrad data, this can be input as: data['ProjectionCoordinateSystem'] + An example of the type of information in the dataarray includes the following attributes: + latitude_of_projection_origin :29.471900939941406 + longitude_of_projection_origin :-95.0787353515625 + _CoordinateTransformType :Projection + _CoordinateAxes :x y z time + _CoordinateAxesTypes :GeoX GeoY Height Time + grid_mapping_name :azimuthal_equidistant + semi_major_axis :6370997.0 + inverse_flattening :298.25 + longitude_of_prime_meridian :0.0 + false_easting :0.0 + false_northing :0.0 + + Returns + ------- + + ds : xarray.core.dataset.Dataset + xarray dataset of merged Track and Segmentation Mask datasets with renamed variables. + + ''' feature_standard_names = { # new variable name, and long description for the NetCDF attribute - "frame": ( - "feature_time_index", - "positional index of the feature along the time dimension of the mask, from 0 to N-1", - ), - "hdim_1": ( - "feature_hdim1_coordinate", - "position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x)." - "The numbering is consistent with positional indexing of the coordinate, but can be" - "fractional, to account for a centroid not aligned to the grid.", - ), - "hdim_2": ( - "feature_hdim2_coordinate", - "position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)" - "The numbering is consistent with positional indexing of the coordinate, but can be" - "fractional, to account for a centroid not aligned to the grid.", - ), - "idx": ("feature_id_this_frame",), - "num": ( - "feature_grid_cell_count", - "Number of grid points that are within the threshold of this feature", - ), - "threshold_value": ( - "feature_threshold_max", - "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", - ), - "feature": ( - "feature_id", - "Unique number of the feature; starts from 1 and increments by 1 to the number of features", - ), - "time": ( - "feature_time", - "time of the feature, consistent with feature_time_index", - ), - "timestr": ( - "feature_time_str", - "String representation of the feature time, YYYY-MM-DD HH:MM:SS", - ), - "projection_y_coordinate": ( - "feature_projection_y_coordinate", - "y position of the feature in the projection given by ProjectionCoordinateSystem", - ), - "projection_x_coordinate": ( - "feature_projection_x_coordinate", - "x position of the feature in the projection given by ProjectionCoordinateSystem", - ), - "lat": ("feature_latitude", "latitude of the feature"), - "lon": ("feature_longitude", "longitude of the feature"), - "ncells": ( - "feature_ncells", - "number of grid cells for this feature (meaning uncertain)", - ), - "areas": ("feature_area",), - "isolated": ("feature_isolation_flag",), - "num_objects": ("number_of_feature_neighbors",), - "cell": ("feature_parent_cell_id",), - "time_cell": ("feature_parent_cell_elapsed_time",), - "segmentation_mask": ("2d segmentation mask",), - } - new_feature_var_names = { - k: feature_standard_names[k][0] - for k in feature_standard_names.keys() - if k in TrackedFeatures.variables.keys() + 'frame':('feature_time_index', + 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), + 'hdim_1':('feature_hdim1_coordinate', + 'position of the feature along the first horizontal dimension in grid point space; a 
north-south coordinate for dim order (time, y, x).' + 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'hdim_2':('feature_hdim2_coordinate', + 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' + 'The numbering is consistent with positional indexing of the coordinate, but can be' + 'fractional, to account for a centroid not aligned to the grid.'), + 'idx':('feature_id_this_frame',), + 'num':('feature_grid_cell_count', + 'Number of grid points that are within the threshold of this feature'), + 'threshold_value':('feature_threshold_max', + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), + 'feature':('feature_id', + "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), + 'time':('feature_time','time of the feature, consistent with feature_time_index'), + 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), + 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), + 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), + 'lat':('feature_latitude','latitude of the feature'), + 'lon':('feature_longitude','longitude of the feature'), + 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), + 'areas':('feature_area',), + 'isolated':('feature_isolation_flag',), + 'num_objects':('number_of_feature_neighbors',), + 'cell':('feature_parent_cell_id',), + 'time_cell':('feature_parent_cell_elapsed_time',), + 'segmentation_mask':('2d segmentation mask',) } + new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys()} - TrackedFeatures = TrackedFeatures.drop(["cell_parent_track_id"]) + TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of # the 'index' variable and call the dimension 'feature' - ds = xr.merge( - [ - TrackedFeatures.swap_dims({"index": "feature"}) - .drop("index") - .rename_vars(new_feature_var_names), - Mask, - ] - ) + ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), + Mask]) # Add the projection data back in - ds["ProjectionCoordinateSystem"] = Projection + if not None in Projection: + ds['ProjectionCoordinateSystem']=Projection # Convert the cell ID variable from float to integer - if "int" not in str(TrackedFeatures.cell.dtype): + if 'int' not in str(TrackedFeatures.cell.dtype): # The raw output from the tracking is actually an object array # array([nan, 2, 3], dtype=object) # (and is cast to a float array when saved as NetCDF, I think). # Cast to float. 
- int_cell = xr.zeros_like(TrackedFeatures.cell, dtype="int64") + int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') - cell_id_data = TrackedFeatures.cell.astype("float64") + cell_id_data = TrackedFeatures.cell.astype('float64') valid_cell = np.isfinite(cell_id_data) valid_cell_ids = cell_id_data[valid_cell] if not (np.unique(valid_cell_ids) > 0).all(): - raise AssertionError( - "Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell" - ) - int_cell[valid_cell] = valid_cell_ids.astype("int64") - # ds['feature_parent_cell_id'] = int_cell + raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') + int_cell[valid_cell] = valid_cell_ids.astype('int64') + #ds['feature_parent_cell_id'] = int_cell return ds -def merge_split(TRACK, distance=25000, frame_len=5): - """ - function to postprocess tobac track data for merge/split cells - Input: - TRACK: xarray dataset of tobac Track information - distance: float, optional distance threshold prior to adding a pair of points - into the minimum spanning tree. Default is 25000 meters. - - frame_len: float, optional threshold for the spanning length within which two points - can be separated. Default is five (5) frames. - - - Output: - d: xarray dataset of - feature position along 1st horizontal dimension - hdim2_index: float - feature position along 2nd horizontal dimension - - Example: +def merge_split(TRACK,distance = 25000,frame_len = 5): + ''' + function to postprocess tobac track data for merge/split cells + + + Parameters + ---------- + TRACK : xarray.core.dataset.Dataset + xarray dataset of tobac Track information + + distance : float, optional + Distance threshold prior to adding a pair of points into the minimum spanning tree. + Default is 25000 meters. + + frame_len : float, optional + Threshold for the spanning length within which two points can be separated. + Default is five (5) frames. + + Returns + ------- + + d : xarray.core.dataset.Dataset + xarray dataset of tobac merge/split cells with parent and child designations. 
+ + + Example usage: d = merge_split(Track) - ds = standardize_track_dataset(Track, refl_mask, data['ProjectionCoordinateSystem']) - # both_ds = xarray.combine_by_coords((ds,d), compat='override') + ds = standardize_track_dataset(Track, refl_mask) both_ds = xr.merge([ds, d],compat ='override') both_ds = compress_all(both_ds) both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - - """ - track_groups = TRACK.groupby("cell") - cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} + + ''' + + print('this is an update 3') + track_groups = TRACK.groupby('cell') + cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} id_data = np.fromiter(cell_ids.keys(), dtype=int) count_data = np.fromiter(cell_ids.values(), dtype=int) all_frames = np.sort(np.unique(TRACK.frame)) @@ -174,40 +208,42 @@ def merge_split(TRACK, distance=25000, frame_len=5): a_names = list() b_names = list() dist = list() + + if hasattr(TRACK, 'grid_longitude'): + print('is in lat/lonx') + for i in id_data: + a_pointx = track_groups[i].grid_longitude[-1].values + a_pointy = track_groups[i].grid_latitude[-1].values + for j in id_data: + b_pointx = track_groups[j].grid_longitude[0].values + b_pointy = track_groups[j].grid_latitude[0].values + d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).m + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + else: + + for i in id_data: + #print(i) + a_pointx = track_groups[i].projection_x_coordinate[-1].values + a_pointy = track_groups[i].projection_y_coordinate[-1].values + for j in id_data: + b_pointx = track_groups[j].projection_x_coordinate[0].values + b_pointy = track_groups[j].projection_y_coordinate[0].values + d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + if d <= distance: + a_points.append([a_pointx,a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) - for i in id_data: - # print(i) - a_pointx = track_groups[i].projection_x_coordinate[-1].values - a_pointy = track_groups[i].projection_y_coordinate[-1].values - for j in id_data: - b_pointx = track_groups[j].projection_x_coordinate[0].values - b_pointy = track_groups[j].projection_y_coordinate[0].values - d = np.linalg.norm( - np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) - ) - if d <= distance: - a_points.append([a_pointx, a_pointy]) - b_points.append([b_pointx, b_pointy]) - dist.append(d) - a_names.append(i) - b_names.append(j) - - # for i in id_data: - # a_pointx = track_groups[i].grid_longitude[-1].values - # a_pointy = track_groups[i].grid_latitude[-1].values - # for j in id_data: - # b_pointx = track_groups[j].grid_longitude[0].values - # b_pointy = track_groups[j].grid_latitude[0].values - # d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).km - # if d <= distance: - # a_points.append([a_pointx,a_pointy]) - # b_points.append([b_pointx, b_pointy]) - # dist.append(d) - # a_names.append(i) - # b_names.append(j) id = [] - for i in range(len(dist) - 1, -1, -1): + for i in range(len(dist)-1, -1, -1): if a_names[i] == b_names[i]: id.append(i) a_points.pop(i) @@ -217,27 +253,25 @@ def merge_split(TRACK, distance=25000, frame_len=5): b_names.pop(i) else: continue - + g = Graph() for i in np.arange(len(dist)): - g.add_edge(a_names[i], b_names[i], weight=dist[i]) + g.add_edge(a_names[i], b_names[i],weight=dist[i]) tree = minimum_spanning_edges(g) tree_list = 
list(minimum_spanning_edges(g)) new_tree = [] - for i, j in enumerate(tree_list): + for i,j in enumerate(tree_list): frame_a = np.nanmax(track_groups[j[0]].frame.values) frame_b = np.nanmin(track_groups[j[1]].frame.values) if np.abs(frame_a - frame_b) <= frame_len: new_tree.append(tree_list[i][0:2]) new_tree_arr = np.array(new_tree) - TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) - cell_id = np.unique( - TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] - ) - track_id = dict() # same size as number of total merged tracks + TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) + cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) + track_id = dict() #same size as number of total merged tracks arr = np.array([0]) for p in cell_id: @@ -248,7 +282,7 @@ def merge_split(TRACK, distance=25000, frame_len=5): k = np.where(new_tree_arr == p) if len(k[0]) == 0: track_id[p] = [p] - arr = np.append(arr, p) + arr = np.append(arr,p) else: temp1 = list(np.unique(new_tree_arr[k[0]])) temp = list(np.unique(new_tree_arr[k[0]])) @@ -263,113 +297,112 @@ def merge_split(TRACK, distance=25000, frame_len=5): if len(temp1) == len(temp): break temp1 = np.array(temp) - + for i in temp1: k2 = np.where(new_tree_arr == i) temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) temp = list(flatten(temp)) temp = list(np.unique(temp)) - arr = np.append(arr, np.unique(temp)) - + arr = np.append(arr,np.unique(temp)) + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + + + storm_id = [0] #default because we don't track larger storm systems *yet* + print('found storm id') - storm_id = [0] # default because we don't track larger storm systems *yet* - print("found storm id") - track_parent_storm_id = np.repeat( - 0, len(track_id) - ) # This will always be zero when we don't track larger storm systems *yet* - print("found track parent storm ids") + track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* + print('found track parent storm ids') track_ids = np.array(list(track_id.keys())) - print("found track ids") + print('found track ids') + - cell_id = list( - np.unique( - TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] - ) - ) - print("found cell ids") + cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) + print('found cell ids') cell_parent_track_id = [] for i, id in enumerate(track_id): - - if len(track_id[int(id)]) == 1: + + if len(track_id[int(id)]) == 1: cell_parent_track_id.append(int(id)) else: - cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) + cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) + cell_parent_track_id = list(flatten(cell_parent_track_id)) - print("found cell parent track ids") + print('found cell parent track ids') feature_parent_cell_id = list(TRACK.cell.values.astype(float)) - print("found feature parent cell ids") + print('found feature parent cell ids') - # This version includes all the feature regardless of if they are used in cells or not. + #This version includes all the feature regardless of if they are used in cells or not. 
feature_id = list(TRACK.feature.values.astype(int)) - print("found feature ids") + print('found feature ids') - feature_parent_storm_id = np.repeat(0, len(feature_id)) # we don't do storms atm - print("found feature parent storm ids") + feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm + print('found feature parent storm ids') - feature_parent_track_id = [] + feature_parent_track_id = [] feature_parent_track_id = np.zeros(len(feature_id)) for i, id in enumerate(feature_id): cellid = feature_parent_cell_id[i] if np.isnan(cellid): - feature_parent_track_id[i] = -1 + feature_parent_track_id[i] = -1 else: j = np.where(cell_id == cellid) j = np.squeeze(j) trackid = cell_parent_track_id[j] feature_parent_track_id[i] = trackid + + print('found feature parent track ids') - print("found feature parent track ids") storm_child_track_count = [len(track_id)] - print("found storm child track count") + print('found storm child track count') track_child_cell_count = [] - for i, id in enumerate(track_id): + for i,id in enumerate(track_id): track_child_cell_count.append(len(track_id[int(id)])) - print("found track child cell count") + print('found track child cell count') + cell_child_feature_count = [] - for i, id in enumerate(cell_id): + for i,id in enumerate(cell_id): cell_child_feature_count.append(len(track_groups[id].feature.values)) - print("found cell child feature count") + print('found cell child feature count') - storm_child_cell_count = [len(cell_id)] + storm_child_cell_count = [len(cell_id)] storm_child_feature_count = [len(feature_id)] - - storm_dim = "nstorms" - track_dim = "ntracks" - cell_dim = "ncells" - feature_dim = "nfeatures" - - d = xr.Dataset( - { - "storm_id": (storm_dim, storm_id), - "track_id": (track_dim, track_ids), - "track_parent_storm_id": (track_dim, track_parent_storm_id), - "cell_id": (cell_dim, cell_id), - "cell_parent_track_id": (cell_dim, cell_parent_track_id), - "feature_id": (feature_dim, feature_id), - "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), - "feature_parent_track_id": (feature_dim, feature_parent_track_id), - "feature_parent_storm_id": (feature_dim, feature_parent_storm_id), - "storm_child_track_count": (storm_dim, storm_child_track_count), - "storm_child_cell_count": (storm_dim, storm_child_cell_count), - "storm_child_feature_count": (storm_dim, storm_child_feature_count), - "track_child_cell_count": (track_dim, track_child_cell_count), - "cell_child_feature_count": (cell_dim, cell_child_feature_count), - } - ) - d = d.set_coords(["feature_id", "cell_id", "track_id", "storm_id"]) + + storm_dim = 'nstorms' + track_dim = 'ntracks' + cell_dim = 'ncells' + feature_dim = 'nfeatures' + + d = xr.Dataset({ + 'storm_id': (storm_dim, storm_id), + 'track_id': (track_dim, track_ids), + 'track_parent_storm_id': (track_dim, track_parent_storm_id), + 'cell_id': (cell_dim, cell_id), + 'cell_parent_track_id': (cell_dim, cell_parent_track_id), + 'feature_id': (feature_dim, feature_id), + 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), + 'feature_parent_track_id': (feature_dim, feature_parent_track_id), + 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), + 'storm_child_track_count': (storm_dim, storm_child_track_count), + 'storm_child_cell_count': (storm_dim, storm_child_cell_count), + 'storm_child_feature_count': (storm_dim, storm_child_feature_count), + 'track_child_cell_count': (track_dim, track_child_cell_count), + 'cell_child_feature_count': (cell_dim, cell_child_feature_count), + }) + d = 
d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) assert len(track_id) == len(track_parent_storm_id) assert len(cell_id) == len(cell_parent_track_id) @@ -378,8 +411,6 @@ def merge_split(TRACK, distance=25000, frame_len=5): assert sum(storm_child_cell_count) == len(cell_id) assert sum(storm_child_feature_count) == len(feature_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum( - [sum(cell_child_feature_count), (len(np.where(feature_parent_track_id < 0)[0]))] - ) == len(feature_id) - - return d + assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) + + return d \ No newline at end of file From d3aa8ea7322fef19452fe9f3831c26b5f2b56df0 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Wed, 6 Jul 2022 19:10:02 -0600 Subject: [PATCH 05/29] Ran Black formatting on file. Formatted using the Black python package. --- tobac/merge_split.py | 367 ++++++++++++++++++++++++------------------- 1 file changed, 205 insertions(+), 162 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 60e929be..c58be5aa 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -1,4 +1,4 @@ -''' +""" Tobac merge and split v0.1 This submodule is a post processing step to address tracked cells which merge/split. The first iteration of this module is to combine the cells which are merging but receive @@ -6,7 +6,7 @@ the largest cell number of those within the merged/split cell ids. -''' +""" from geopy.distance import geodesic @@ -15,8 +15,9 @@ from pandas.core.common import flatten import xarray as xr + def compress_all(nc_grids, min_dims=2): - ''' + """ The purpose of this subroutine is to compress the netcdf variables as they are saved this does not change the data, but sets netcdf encoding parameters. We allocate a minimum number of dimensions as variables with dimensions @@ -26,19 +27,18 @@ def compress_all(nc_grids, min_dims=2): ---------- nc_grids : xarray.core.dataset.Dataset Xarray dataset that is intended to be exported as netcdf - + min_dims : integer - The minimum number of dimesnions, in integer value, a variable must have in order + The minimum number of dimesnions, in integer value, a variable must have in order set the netcdf compression encoding. - + Returns ------- nc_grids : xarray.core.dataset.Dataset Xarray dataset with netcdf compression encoding for variables with two (2) or more dimensions - - ''' - + """ + for var in nc_grids: if len(nc_grids[var].dims) >= min_dims: # print("Compressing ", var) @@ -46,10 +46,10 @@ def compress_all(nc_grids, min_dims=2): nc_grids[var].encoding["complevel"] = 4 nc_grids[var].encoding["contiguous"] = False return nc_grids - -def standardize_track_dataset(TrackedFeatures, Mask, Projection = None): - ''' + +def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): + """ Combine a feature mask with the feature data table into a common dataset. 
returned by tobac.themes.tobac_v1.segmentation @@ -64,19 +64,19 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection = None): Projection is an xarray DataArray TODO: Add metadata attributes - + Parameters ---------- TrackedFeatures : xarray.core.dataset.Dataset xarray dataset of tobac Track information, the xarray dataset returned by tobac.tracking.linking_trackpy - + Mask: xarray.core.dataset.Dataset xarray dataset of tobac segmentation mask information, the xarray dataset returned by tobac.segmentation.segmentation - - + + Projection : xarray.core.dataarray.DataArray, default = None - array.DataArray of the original input dataset (gridded nexrad data for example). + array.DataArray of the original input dataset (gridded nexrad data for example). If using gridded nexrad data, this can be input as: data['ProjectionCoordinateSystem'] An example of the type of information in the dataarray includes the following attributes: latitude_of_projection_origin :29.471900939941406 @@ -90,116 +90,153 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection = None): longitude_of_prime_meridian :0.0 false_easting :0.0 false_northing :0.0 - + Returns - ------- - + ------- + ds : xarray.core.dataset.Dataset xarray dataset of merged Track and Segmentation Mask datasets with renamed variables. - - ''' + + """ feature_standard_names = { # new variable name, and long description for the NetCDF attribute - 'frame':('feature_time_index', - 'positional index of the feature along the time dimension of the mask, from 0 to N-1'), - 'hdim_1':('feature_hdim1_coordinate', - 'position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x).' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'hdim_2':('feature_hdim2_coordinate', - 'position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)' - 'The numbering is consistent with positional indexing of the coordinate, but can be' - 'fractional, to account for a centroid not aligned to the grid.'), - 'idx':('feature_id_this_frame',), - 'num':('feature_grid_cell_count', - 'Number of grid points that are within the threshold of this feature'), - 'threshold_value':('feature_threshold_max', - "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments"), - 'feature':('feature_id', - "Unique number of the feature; starts from 1 and increments by 1 to the number of features"), - 'time':('feature_time','time of the feature, consistent with feature_time_index'), - 'timestr':('feature_time_str','String representation of the feature time, YYYY-MM-DD HH:MM:SS'), - 'projection_y_coordinate':('feature_projection_y_coordinate','y position of the feature in the projection given by ProjectionCoordinateSystem'), - 'projection_x_coordinate':('feature_projection_x_coordinate','x position of the feature in the projection given by ProjectionCoordinateSystem'), - 'lat':('feature_latitude','latitude of the feature'), - 'lon':('feature_longitude','longitude of the feature'), - 'ncells':('feature_ncells','number of grid cells for this feature (meaning uncertain)'), - 'areas':('feature_area',), - 'isolated':('feature_isolation_flag',), - 'num_objects':('number_of_feature_neighbors',), - 'cell':('feature_parent_cell_id',), - 
'time_cell':('feature_parent_cell_elapsed_time',), - 'segmentation_mask':('2d segmentation mask',) + "frame": ( + "feature_time_index", + "positional index of the feature along the time dimension of the mask, from 0 to N-1", + ), + "hdim_1": ( + "feature_hdim1_coordinate", + "position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x)." + "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "hdim_2": ( + "feature_hdim2_coordinate", + "position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)" + "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "idx": ("feature_id_this_frame",), + "num": ( + "feature_grid_cell_count", + "Number of grid points that are within the threshold of this feature", + ), + "threshold_value": ( + "feature_threshold_max", + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", + ), + "feature": ( + "feature_id", + "Unique number of the feature; starts from 1 and increments by 1 to the number of features", + ), + "time": ( + "feature_time", + "time of the feature, consistent with feature_time_index", + ), + "timestr": ( + "feature_time_str", + "String representation of the feature time, YYYY-MM-DD HH:MM:SS", + ), + "projection_y_coordinate": ( + "feature_projection_y_coordinate", + "y position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "projection_x_coordinate": ( + "feature_projection_x_coordinate", + "x position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "lat": ("feature_latitude", "latitude of the feature"), + "lon": ("feature_longitude", "longitude of the feature"), + "ncells": ( + "feature_ncells", + "number of grid cells for this feature (meaning uncertain)", + ), + "areas": ("feature_area",), + "isolated": ("feature_isolation_flag",), + "num_objects": ("number_of_feature_neighbors",), + "cell": ("feature_parent_cell_id",), + "time_cell": ("feature_parent_cell_elapsed_time",), + "segmentation_mask": ("2d segmentation mask",), + } + new_feature_var_names = { + k: feature_standard_names[k][0] + for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys() } - new_feature_var_names = {k:feature_standard_names[k][0] for k in feature_standard_names.keys() - if k in TrackedFeatures.variables.keys()} - TrackedFeatures = TrackedFeatures.drop(['cell_parent_track_id']) + TrackedFeatures = TrackedFeatures.drop(["cell_parent_track_id"]) # Combine Track and Mask variables. 
Use the 'feature' variable as the coordinate variable instead of # the 'index' variable and call the dimension 'feature' - ds = xr.merge([TrackedFeatures.swap_dims({'index':'feature'}).drop('index').rename_vars(new_feature_var_names), - Mask]) + ds = xr.merge( + [ + TrackedFeatures.swap_dims({"index": "feature"}) + .drop("index") + .rename_vars(new_feature_var_names), + Mask, + ] + ) # Add the projection data back in if not None in Projection: - ds['ProjectionCoordinateSystem']=Projection + ds["ProjectionCoordinateSystem"] = Projection # Convert the cell ID variable from float to integer - if 'int' not in str(TrackedFeatures.cell.dtype): + if "int" not in str(TrackedFeatures.cell.dtype): # The raw output from the tracking is actually an object array # array([nan, 2, 3], dtype=object) # (and is cast to a float array when saved as NetCDF, I think). # Cast to float. - int_cell = xr.zeros_like(TrackedFeatures.cell, dtype='int64') + int_cell = xr.zeros_like(TrackedFeatures.cell, dtype="int64") - cell_id_data = TrackedFeatures.cell.astype('float64') + cell_id_data = TrackedFeatures.cell.astype("float64") valid_cell = np.isfinite(cell_id_data) valid_cell_ids = cell_id_data[valid_cell] if not (np.unique(valid_cell_ids) > 0).all(): - raise AssertionError('Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell') - int_cell[valid_cell] = valid_cell_ids.astype('int64') - #ds['feature_parent_cell_id'] = int_cell + raise AssertionError( + "Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell" + ) + int_cell[valid_cell] = valid_cell_ids.astype("int64") + # ds['feature_parent_cell_id'] = int_cell return ds - -def merge_split(TRACK,distance = 25000,frame_len = 5): - ''' +def merge_split(TRACK, distance=25000, frame_len=5): + """ function to postprocess tobac track data for merge/split cells - - + + Parameters ---------- TRACK : xarray.core.dataset.Dataset xarray dataset of tobac Track information - - distance : float, optional - Distance threshold prior to adding a pair of points into the minimum spanning tree. + + distance : float, optional + Distance threshold prior to adding a pair of points into the minimum spanning tree. Default is 25000 meters. - + frame_len : float, optional - Threshold for the spanning length within which two points can be separated. + Threshold for the spanning length within which two points can be separated. Default is five (5) frames. Returns - ------- + ------- d : xarray.core.dataset.Dataset xarray dataset of tobac merge/split cells with parent and child designations. 
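For orientation, the distance-threshold and spanning-tree step that merge_split performs can be sketched on toy data: connect each cell's last position to every other cell's first position within `distance`, then reduce the graph to a minimum spanning tree. The ids and coordinates below are hypothetical, and the real function additionally discards tree edges whose cells are more than frame_len frames apart.

import networkx as nx
import numpy as np

# Hypothetical track endpoints in meters: cell 1 ends near where
# cell 3 begins, so the two should be linked into one track.
ends = {1: (0.0, 0.0), 2: (50000.0, 50000.0), 3: (1000.0, 500.0)}
starts = {1: (0.0, 100.0), 2: (50000.0, 49000.0), 3: (800.0, 400.0)}
distance = 25000.0  # same role as the distance threshold above

g = nx.Graph()
for i, (ax, ay) in ends.items():
    for j, (bx, by) in starts.items():
        if i == j:
            continue  # a track must not match its own tail
        d = np.hypot(ax - bx, ay - by)
        if d <= distance:
            g.add_edge(i, j, weight=d)

print([edge[:2] for edge in nx.minimum_spanning_edges(g)])  # e.g. [(1, 3)]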
- - + + Example usage: d = merge_split(Track) ds = standardize_track_dataset(Track, refl_mask) both_ds = xr.merge([ds, d],compat ='override') both_ds = compress_all(both_ds) both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - - ''' - - print('this is an update 3') - track_groups = TRACK.groupby('cell') - cell_ids = {cid:len(v) for cid, v in track_groups.groups.items()} + + """ + + print("this is an update 3") + track_groups = TRACK.groupby("cell") + cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} id_data = np.fromiter(cell_ids.keys(), dtype=int) count_data = np.fromiter(cell_ids.values(), dtype=int) all_frames = np.sort(np.unique(TRACK.frame)) @@ -208,42 +245,43 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): a_names = list() b_names = list() dist = list() - - if hasattr(TRACK, 'grid_longitude'): - print('is in lat/lonx') + + if hasattr(TRACK, "longitude"): + print("is in lat/lon") for i in id_data: a_pointx = track_groups[i].grid_longitude[-1].values a_pointy = track_groups[i].grid_latitude[-1].values for j in id_data: b_pointx = track_groups[j].grid_longitude[0].values b_pointy = track_groups[j].grid_latitude[0].values - d = geodesic((a_pointy,a_pointx),(b_pointy,b_pointx)).m + d = geodesic((a_pointy, a_pointx), (b_pointy, b_pointx)).m if d <= distance: - a_points.append([a_pointx,a_pointy]) + a_points.append([a_pointx, a_pointy]) b_points.append([b_pointx, b_pointy]) dist.append(d) a_names.append(i) b_names.append(j) else: - + for i in id_data: - #print(i) + # print(i) a_pointx = track_groups[i].projection_x_coordinate[-1].values a_pointy = track_groups[i].projection_y_coordinate[-1].values for j in id_data: b_pointx = track_groups[j].projection_x_coordinate[0].values b_pointy = track_groups[j].projection_y_coordinate[0].values - d = np.linalg.norm(np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))) + d = np.linalg.norm( + np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) + ) if d <= distance: - a_points.append([a_pointx,a_pointy]) + a_points.append([a_pointx, a_pointy]) b_points.append([b_pointx, b_pointy]) dist.append(d) a_names.append(i) b_names.append(j) - id = [] - for i in range(len(dist)-1, -1, -1): + for i in range(len(dist) - 1, -1, -1): if a_names[i] == b_names[i]: id.append(i) a_points.pop(i) @@ -253,25 +291,27 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): b_names.pop(i) else: continue - + g = Graph() for i in np.arange(len(dist)): - g.add_edge(a_names[i], b_names[i],weight=dist[i]) + g.add_edge(a_names[i], b_names[i], weight=dist[i]) tree = minimum_spanning_edges(g) tree_list = list(minimum_spanning_edges(g)) new_tree = [] - for i,j in enumerate(tree_list): + for i, j in enumerate(tree_list): frame_a = np.nanmax(track_groups[j[0]].frame.values) frame_b = np.nanmin(track_groups[j[1]].frame.values) if np.abs(frame_a - frame_b) <= frame_len: new_tree.append(tree_list[i][0:2]) new_tree_arr = np.array(new_tree) - TRACK['cell_parent_track_id'] = np.zeros(len(TRACK['cell'].values)) - cell_id = np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))]) - track_id = dict() #same size as number of total merged tracks + TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) + cell_id = np.unique( + TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + ) + track_id = dict() # same size as number of total merged tracks arr = np.array([0]) for p in cell_id: @@ -282,7 +322,7 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): k = 
np.where(new_tree_arr == p) if len(k[0]) == 0: track_id[p] = [p] - arr = np.append(arr,p) + arr = np.append(arr, p) else: temp1 = list(np.unique(new_tree_arr[k[0]])) temp = list(np.unique(new_tree_arr[k[0]])) @@ -297,112 +337,113 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): if len(temp1) == len(temp): break temp1 = np.array(temp) - + for i in temp1: k2 = np.where(new_tree_arr == i) temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) temp = list(flatten(temp)) temp = list(np.unique(temp)) - arr = np.append(arr,np.unique(temp)) - - track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - - + arr = np.append(arr, np.unique(temp)) - storm_id = [0] #default because we don't track larger storm systems *yet* - print('found storm id') + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + storm_id = [0] # default because we don't track larger storm systems *yet* + print("found storm id") - track_parent_storm_id = np.repeat(0, len(track_id)) #This will always be zero when we don't track larger storm systems *yet* - print('found track parent storm ids') + track_parent_storm_id = np.repeat( + 0, len(track_id) + ) # This will always be zero when we don't track larger storm systems *yet* + print("found track parent storm ids") track_ids = np.array(list(track_id.keys())) - print('found track ids') - + print("found track ids") - cell_id = list(np.unique(TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))])) - print('found cell ids') + cell_id = list( + np.unique( + TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + ) + ) + print("found cell ids") cell_parent_track_id = [] for i, id in enumerate(track_id): - - if len(track_id[int(id)]) == 1: + + if len(track_id[int(id)]) == 1: cell_parent_track_id.append(int(id)) else: - cell_parent_track_id.append(np.repeat(int(id),len(track_id[int(id)]))) - + cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) cell_parent_track_id = list(flatten(cell_parent_track_id)) - print('found cell parent track ids') + print("found cell parent track ids") feature_parent_cell_id = list(TRACK.cell.values.astype(float)) - print('found feature parent cell ids') + print("found feature parent cell ids") - #This version includes all the feature regardless of if they are used in cells or not. + # This version includes all the feature regardless of if they are used in cells or not. 
feature_id = list(TRACK.feature.values.astype(int)) - print('found feature ids') + print("found feature ids") - feature_parent_storm_id = np.repeat(0,len(feature_id)) #we don't do storms atm - print('found feature parent storm ids') + feature_parent_storm_id = np.repeat(0, len(feature_id)) # we don't do storms atm + print("found feature parent storm ids") - feature_parent_track_id = [] + feature_parent_track_id = [] feature_parent_track_id = np.zeros(len(feature_id)) for i, id in enumerate(feature_id): cellid = feature_parent_cell_id[i] if np.isnan(cellid): - feature_parent_track_id[i] = -1 + feature_parent_track_id[i] = -1 else: j = np.where(cell_id == cellid) j = np.squeeze(j) trackid = cell_parent_track_id[j] feature_parent_track_id[i] = trackid - - print('found feature parent track ids') + print("found feature parent track ids") storm_child_track_count = [len(track_id)] - print('found storm child track count') + print("found storm child track count") track_child_cell_count = [] - for i,id in enumerate(track_id): + for i, id in enumerate(track_id): track_child_cell_count.append(len(track_id[int(id)])) - print('found track child cell count') - + print("found track child cell count") cell_child_feature_count = [] - for i,id in enumerate(cell_id): + for i, id in enumerate(cell_id): cell_child_feature_count.append(len(track_groups[id].feature.values)) - print('found cell child feature count') + print("found cell child feature count") - storm_child_cell_count = [len(cell_id)] + storm_child_cell_count = [len(cell_id)] storm_child_feature_count = [len(feature_id)] - - storm_dim = 'nstorms' - track_dim = 'ntracks' - cell_dim = 'ncells' - feature_dim = 'nfeatures' - - d = xr.Dataset({ - 'storm_id': (storm_dim, storm_id), - 'track_id': (track_dim, track_ids), - 'track_parent_storm_id': (track_dim, track_parent_storm_id), - 'cell_id': (cell_dim, cell_id), - 'cell_parent_track_id': (cell_dim, cell_parent_track_id), - 'feature_id': (feature_dim, feature_id), - 'feature_parent_cell_id': (feature_dim, feature_parent_cell_id), - 'feature_parent_track_id': (feature_dim, feature_parent_track_id), - 'feature_parent_storm_id': (feature_dim, feature_parent_storm_id), - 'storm_child_track_count': (storm_dim, storm_child_track_count), - 'storm_child_cell_count': (storm_dim, storm_child_cell_count), - 'storm_child_feature_count': (storm_dim, storm_child_feature_count), - 'track_child_cell_count': (track_dim, track_child_cell_count), - 'cell_child_feature_count': (cell_dim, cell_child_feature_count), - }) - d = d.set_coords(['feature_id','cell_id', 'track_id', 'storm_id']) + + storm_dim = "nstorms" + track_dim = "ntracks" + cell_dim = "ncells" + feature_dim = "nfeatures" + + d = xr.Dataset( + { + "storm_id": (storm_dim, storm_id), + "track_id": (track_dim, track_ids), + "track_parent_storm_id": (track_dim, track_parent_storm_id), + "cell_id": (cell_dim, cell_id), + "cell_parent_track_id": (cell_dim, cell_parent_track_id), + "feature_id": (feature_dim, feature_id), + "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), + "feature_parent_track_id": (feature_dim, feature_parent_track_id), + "feature_parent_storm_id": (feature_dim, feature_parent_storm_id), + "storm_child_track_count": (storm_dim, storm_child_track_count), + "storm_child_cell_count": (storm_dim, storm_child_cell_count), + "storm_child_feature_count": (storm_dim, storm_child_feature_count), + "track_child_cell_count": (track_dim, track_child_cell_count), + "cell_child_feature_count": (cell_dim, cell_child_feature_count), + } + ) + d = 
d.set_coords(["feature_id", "cell_id", "track_id", "storm_id"]) assert len(track_id) == len(track_parent_storm_id) assert len(cell_id) == len(cell_parent_track_id) @@ -411,6 +452,8 @@ def merge_split(TRACK,distance = 25000,frame_len = 5): assert sum(storm_child_cell_count) == len(cell_id) assert sum(storm_child_feature_count) == len(feature_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum([sum(cell_child_feature_count),(len(np.where(feature_parent_track_id < 0)[0]))]) == len(feature_id) - - return d \ No newline at end of file + assert sum( + [sum(cell_child_feature_count), (len(np.where(feature_parent_track_id < 0)[0]))] + ) == len(feature_id) + + return d From cbc80f8faa88d1301b995968769d48d5a9fb11df Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Wed, 6 Jul 2022 19:35:35 -0600 Subject: [PATCH 06/29] Allowed optional import of additional packages: geopy, and networkx. Allowed optional import of additional packages: geopy, and networkx. Now a message will print if the package is not available to import. --- tobac/merge_split.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index c58be5aa..f157852c 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -8,9 +8,27 @@ """ - -from geopy.distance import geodesic -from networkx import * +try: + import geopy +except ImportError: + geopy = None + +if geopy: + from geopy.distance import geodesic +else: + print("You could be merge/splitting in lat/lons if you had geopy.") + +try: + import networkx +except ImportError: + networkx = None + +if networkx: + from networkx import * +else: + print("Cannot Merge/Split. Please install networkx.") +# from geopy.distance import geodesic +# from networkx import * import numpy as np from pandas.core.common import flatten import xarray as xr From 6ba5d41ae516bc9f376971d08f25e4d8f427c19c Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Thu, 18 Aug 2022 16:03:21 -0500 Subject: [PATCH 07/29] Updated with suggestions from code reviewers, and tobac v1.3. Updated to tobac v1.3.2, and made suggested changes. Merge_split.py now ingests a pandas dataframe, ancillary functions have been moved to utils.py. --- tobac/merge_split.py | 346 ++++++++++--------------------------------- tobac/utils.py | 169 +++++++++++++++++++++ 2 files changed, 245 insertions(+), 270 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index f157852c..c64b30bc 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -8,225 +8,16 @@ """ -try: - import geopy -except ImportError: - geopy = None - -if geopy: - from geopy.distance import geodesic -else: - print("You could be merge/splitting in lat/lons if you had geopy.") - -try: - import networkx -except ImportError: - networkx = None - -if networkx: - from networkx import * -else: - print("Cannot Merge/Split. Please install networkx.") -# from geopy.distance import geodesic -# from networkx import * -import numpy as np -from pandas.core.common import flatten -import xarray as xr - - -def compress_all(nc_grids, min_dims=2): - """ - The purpose of this subroutine is to compress the netcdf variables as they are saved - this does not change the data, but sets netcdf encoding parameters. - We allocate a minimum number of dimensions as variables with dimensions - under the minimum value do not benefit from tangibly from this encoding. 
- - Parameters - ---------- - nc_grids : xarray.core.dataset.Dataset - Xarray dataset that is intended to be exported as netcdf - - min_dims : integer - The minimum number of dimesnions, in integer value, a variable must have in order - set the netcdf compression encoding. - - Returns - ------- - nc_grids : xarray.core.dataset.Dataset - Xarray dataset with netcdf compression encoding for variables with two (2) or more dimensions - - """ - - for var in nc_grids: - if len(nc_grids[var].dims) >= min_dims: - # print("Compressing ", var) - nc_grids[var].encoding["zlib"] = True - nc_grids[var].encoding["complevel"] = 4 - nc_grids[var].encoding["contiguous"] = False - return nc_grids - - -def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): - """ - Combine a feature mask with the feature data table into a common dataset. - - returned by tobac.themes.tobac_v1.segmentation - with the TrackedFeatures dataset returned by tobac.themes.tobac_v1.linking_trackpy. - - Also rename the variables to be more desciptive and comply with cf-tree. - - Convert the default cell parent ID to an integer table. - - Add a cell dimension to reflect - - Projection is an xarray DataArray - - TODO: Add metadata attributes - - Parameters - ---------- - TrackedFeatures : xarray.core.dataset.Dataset - xarray dataset of tobac Track information, the xarray dataset returned by tobac.tracking.linking_trackpy - - Mask: xarray.core.dataset.Dataset - xarray dataset of tobac segmentation mask information, the xarray dataset returned - by tobac.segmentation.segmentation - - - Projection : xarray.core.dataarray.DataArray, default = None - array.DataArray of the original input dataset (gridded nexrad data for example). - If using gridded nexrad data, this can be input as: data['ProjectionCoordinateSystem'] - An example of the type of information in the dataarray includes the following attributes: - latitude_of_projection_origin :29.471900939941406 - longitude_of_projection_origin :-95.0787353515625 - _CoordinateTransformType :Projection - _CoordinateAxes :x y z time - _CoordinateAxesTypes :GeoX GeoY Height Time - grid_mapping_name :azimuthal_equidistant - semi_major_axis :6370997.0 - inverse_flattening :298.25 - longitude_of_prime_meridian :0.0 - false_easting :0.0 - false_northing :0.0 - - Returns - ------- - - ds : xarray.core.dataset.Dataset - xarray dataset of merged Track and Segmentation Mask datasets with renamed variables. - """ - feature_standard_names = { - # new variable name, and long description for the NetCDF attribute - "frame": ( - "feature_time_index", - "positional index of the feature along the time dimension of the mask, from 0 to N-1", - ), - "hdim_1": ( - "feature_hdim1_coordinate", - "position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x)." 
- "The numbering is consistent with positional indexing of the coordinate, but can be" - "fractional, to account for a centroid not aligned to the grid.", - ), - "hdim_2": ( - "feature_hdim2_coordinate", - "position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)" - "The numbering is consistent with positional indexing of the coordinate, but can be" - "fractional, to account for a centroid not aligned to the grid.", - ), - "idx": ("feature_id_this_frame",), - "num": ( - "feature_grid_cell_count", - "Number of grid points that are within the threshold of this feature", - ), - "threshold_value": ( - "feature_threshold_max", - "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", - ), - "feature": ( - "feature_id", - "Unique number of the feature; starts from 1 and increments by 1 to the number of features", - ), - "time": ( - "feature_time", - "time of the feature, consistent with feature_time_index", - ), - "timestr": ( - "feature_time_str", - "String representation of the feature time, YYYY-MM-DD HH:MM:SS", - ), - "projection_y_coordinate": ( - "feature_projection_y_coordinate", - "y position of the feature in the projection given by ProjectionCoordinateSystem", - ), - "projection_x_coordinate": ( - "feature_projection_x_coordinate", - "x position of the feature in the projection given by ProjectionCoordinateSystem", - ), - "lat": ("feature_latitude", "latitude of the feature"), - "lon": ("feature_longitude", "longitude of the feature"), - "ncells": ( - "feature_ncells", - "number of grid cells for this feature (meaning uncertain)", - ), - "areas": ("feature_area",), - "isolated": ("feature_isolation_flag",), - "num_objects": ("number_of_feature_neighbors",), - "cell": ("feature_parent_cell_id",), - "time_cell": ("feature_parent_cell_elapsed_time",), - "segmentation_mask": ("2d segmentation mask",), - } - new_feature_var_names = { - k: feature_standard_names[k][0] - for k in feature_standard_names.keys() - if k in TrackedFeatures.variables.keys() - } - - TrackedFeatures = TrackedFeatures.drop(["cell_parent_track_id"]) - # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of - # the 'index' variable and call the dimension 'feature' - ds = xr.merge( - [ - TrackedFeatures.swap_dims({"index": "feature"}) - .drop("index") - .rename_vars(new_feature_var_names), - Mask, - ] - ) - - # Add the projection data back in - if not None in Projection: - ds["ProjectionCoordinateSystem"] = Projection - - # Convert the cell ID variable from float to integer - if "int" not in str(TrackedFeatures.cell.dtype): - # The raw output from the tracking is actually an object array - # array([nan, 2, 3], dtype=object) - # (and is cast to a float array when saved as NetCDF, I think). - # Cast to float. 
- int_cell = xr.zeros_like(TrackedFeatures.cell, dtype="int64") - - cell_id_data = TrackedFeatures.cell.astype("float64") - valid_cell = np.isfinite(cell_id_data) - valid_cell_ids = cell_id_data[valid_cell] - if not (np.unique(valid_cell_ids) > 0).all(): - raise AssertionError( - "Lowest cell ID cell is less than one, conflicting with use of zero to indicate an untracked cell" - ) - int_cell[valid_cell] = valid_cell_ids.astype("int64") - # ds['feature_parent_cell_id'] = int_cell - return ds - - -def merge_split(TRACK, distance=25000, frame_len=5): +def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): """ function to postprocess tobac track data for merge/split cells Parameters ---------- - TRACK : xarray.core.dataset.Dataset - xarray dataset of tobac Track information + TRACK : pandas.core.frame.DataFrame + Pandas dataframe of tobac Track information distance : float, optional Distance threshold prior to adding a pair of points into the minimum spanning tree. @@ -236,6 +27,10 @@ def merge_split(TRACK, distance=25000, frame_len=5): Threshold for the spanning length within which two points can be separated. Default is five (5) frames. + dxy : float, optional + The x/y/ grid spacing of the data. + Default is 500m. + Returns ------- @@ -252,7 +47,38 @@ def merge_split(TRACK, distance=25000, frame_len=5): """ - print("this is an update 3") + try: + import geopy + except ImportError: + geopy = None + + if geopy: + from geopy.distance import geodesic + else: + print( + "Geopy not available, Merge/Split will proceed in tobac general coordinates." + ) + + try: + import networkx as nx + except ImportError: + networkx = None + + # if networkx: + # import networkx as nx + # else: + # print("Cannot Merge/Split. Please install networkx.") + import logging + import numpy as np + from pandas.core.common import flatten + import xarray as xr + + # Check if dxy is in meters of in kilometers. It should be in meters + if dxy <= 5: + dxy *= 1000 + + # Immediately convert pandas dataframe of track information to xarray: + TRACK = TRACK.to_xarray() track_groups = TRACK.groupby("cell") cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} id_data = np.fromiter(cell_ids.keys(), dtype=int) @@ -283,11 +109,11 @@ def merge_split(TRACK, distance=25000, frame_len=5): for i in id_data: # print(i) - a_pointx = track_groups[i].projection_x_coordinate[-1].values - a_pointy = track_groups[i].projection_y_coordinate[-1].values + a_pointx = track_groups[i].hdim_2[-1].values * dxy + a_pointy = track_groups[i].hdim_1[-1].values * dxy for j in id_data: - b_pointx = track_groups[j].projection_x_coordinate[0].values - b_pointy = track_groups[j].projection_y_coordinate[0].values + b_pointx = track_groups[j].hdim_2[0].values * dxy + b_pointy = track_groups[j].hdim_1[0].values * dxy d = np.linalg.norm( np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) ) @@ -299,6 +125,8 @@ def merge_split(TRACK, distance=25000, frame_len=5): b_names.append(j) id = [] + # This is removing any tracks that have matched to themselves - e.g., + # a beginning of a track has found its tail. 
for i in range(len(dist) - 1, -1, -1): if a_names[i] == b_names[i]: id.append(i) @@ -310,12 +138,12 @@ def merge_split(TRACK, distance=25000, frame_len=5): else: continue - g = Graph() + g = nx.Graph() for i in np.arange(len(dist)): g.add_edge(a_names[i], b_names[i], weight=dist[i]) - tree = minimum_spanning_edges(g) - tree_list = list(minimum_spanning_edges(g)) + tree = nx.minimum_spanning_edges(g) + tree_list = list(tree) new_tree = [] for i, j in enumerate(tree_list): @@ -327,7 +155,7 @@ def merge_split(TRACK, distance=25000, frame_len=5): TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) cell_id = np.unique( - TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] ) track_id = dict() # same size as number of total merged tracks @@ -366,23 +194,15 @@ def merge_split(TRACK, distance=25000, frame_len=5): track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - storm_id = [0] # default because we don't track larger storm systems *yet* - print("found storm id") - - track_parent_storm_id = np.repeat( - 0, len(track_id) - ) # This will always be zero when we don't track larger storm systems *yet* - print("found track parent storm ids") - track_ids = np.array(list(track_id.keys())) - print("found track ids") + logging.debug("found track ids") cell_id = list( np.unique( - TRACK.cell.values.astype(float)[~np.isnan(TRACK.cell.values.astype(float))] + TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] ) ) - print("found cell ids") + logging.debug("found cell ids") cell_parent_track_id = [] @@ -395,18 +215,14 @@ def merge_split(TRACK, distance=25000, frame_len=5): cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) cell_parent_track_id = list(flatten(cell_parent_track_id)) - print("found cell parent track ids") - - feature_parent_cell_id = list(TRACK.cell.values.astype(float)) + logging.debug("found cell parent track ids") - print("found feature parent cell ids") + feature_parent_cell_id = list(TRACK.cell.values.astype(int)) + logging.debug("found feature parent cell ids") # This version includes all the feature regardless of if they are used in cells or not. 
feature_id = list(TRACK.feature.values.astype(int)) - print("found feature ids") - - feature_parent_storm_id = np.repeat(0, len(feature_id)) # we don't do storms atm - print("found feature parent storm ids") + logging.debug("found feature ids") feature_parent_track_id = [] feature_parent_track_id = np.zeros(len(feature_id)) @@ -420,58 +236,48 @@ def merge_split(TRACK, distance=25000, frame_len=5): trackid = cell_parent_track_id[j] feature_parent_track_id[i] = trackid - print("found feature parent track ids") - - storm_child_track_count = [len(track_id)] - print("found storm child track count") + logging.debug("found feature parent track ids") track_child_cell_count = [] for i, id in enumerate(track_id): track_child_cell_count.append(len(track_id[int(id)])) - print("found track child cell count") + logging.debug("found track child cell count") cell_child_feature_count = [] for i, id in enumerate(cell_id): cell_child_feature_count.append(len(track_groups[id].feature.values)) - print("found cell child feature count") - - storm_child_cell_count = [len(cell_id)] - storm_child_feature_count = [len(feature_id)] + logging.debug("found cell child feature count") - storm_dim = "nstorms" - track_dim = "ntracks" - cell_dim = "ncells" - feature_dim = "nfeatures" + track_dim = "tracks" + cell_dim = "cells" + feature_dim = "features" d = xr.Dataset( { - "storm_id": (storm_dim, storm_id), - "track_id": (track_dim, track_ids), - "track_parent_storm_id": (track_dim, track_parent_storm_id), - "cell_id": (cell_dim, cell_id), + "track": (track_dim, track_ids), + "cell": (cell_dim, cell_id), "cell_parent_track_id": (cell_dim, cell_parent_track_id), - "feature_id": (feature_dim, feature_id), + "feature": (feature_dim, feature_id), "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), "feature_parent_track_id": (feature_dim, feature_parent_track_id), - "feature_parent_storm_id": (feature_dim, feature_parent_storm_id), - "storm_child_track_count": (storm_dim, storm_child_track_count), - "storm_child_cell_count": (storm_dim, storm_child_cell_count), - "storm_child_feature_count": (storm_dim, storm_child_feature_count), "track_child_cell_count": (track_dim, track_child_cell_count), "cell_child_feature_count": (cell_dim, cell_child_feature_count), } ) - d = d.set_coords(["feature_id", "cell_id", "track_id", "storm_id"]) - assert len(track_id) == len(track_parent_storm_id) + d = d.set_coords(["feature", "cell", "track"]) + assert len(cell_id) == len(cell_parent_track_id) assert len(feature_id) == len(feature_parent_cell_id) - assert sum(storm_child_track_count) == len(track_id) - assert sum(storm_child_cell_count) == len(cell_id) - assert sum(storm_child_feature_count) == len(feature_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum( - [sum(cell_child_feature_count), (len(np.where(feature_parent_track_id < 0)[0]))] - ) == len(feature_id) + assert ( + sum( + [ + sum(cell_child_feature_count[1:]), + (len(np.where(feature_parent_track_id < 0)[0])), + ] + ) + == len(feature_id) + ) return d diff --git a/tobac/utils.py b/tobac/utils.py index 2509b97f..ca44e02c 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -578,3 +578,172 @@ def get_indices_of_labels_from_reg_prop_dict(region_property_dict): curr_loc_indices[index] = len(curr_y_ixs) return (curr_loc_indices, y_indices, x_indices) + + +def compress_all(nc_grids, min_dims=2, comp_level=4): + """ + The purpose of this subroutine is to compress the netcdf variables as they are saved. 
+    This does not change the data, but sets netcdf encoding parameters.
+    We allocate a minimum number of dimensions as variables with dimensions
+    under the minimum value do not benefit tangibly from this encoding.
+
+    Parameters
+    ----------
+    nc_grids : xarray.core.dataset.Dataset
+        Xarray dataset that is intended to be exported as netcdf
+
+    min_dims : integer
+        The minimum number of dimensions, in integer value, a variable must have in order
+        to set the netcdf compression encoding.
+    comp_level : integer
+        The level of compression. Default value is 4.
+
+    Returns
+    -------
+    nc_grids : xarray.core.dataset.Dataset
+        Xarray dataset with netcdf compression encoding for variables with two (2) or more dimensions
+
+    """
+
+    for var in nc_grids:
+        if len(nc_grids[var].dims) >= min_dims:
+            # print("Compressing ", var)
+            nc_grids[var].encoding["zlib"] = True
+            nc_grids[var].encoding["complevel"] = comp_level
+            nc_grids[var].encoding["contiguous"] = False
+    return nc_grids
+
+
+def standardize_track_dataset(TrackedFeatures, Mask, Projection=None):
+    """
+    Combine a feature mask with the feature data table into a common dataset.
+
+    Merges the segmentation mask returned by tobac.segmentation
+    with the TrackedFeatures dataset returned by tobac.linking_trackpy.
+
+    Also rename the variables to be more descriptive and comply with cf-tree.
+
+    Convert the default cell parent ID to an integer table.
+
+    Add a cell dimension to reflect
+
+    Projection is an xarray DataArray
+
+    TODO: Add metadata attributes
+
+    Parameters
+    ----------
+    TrackedFeatures : xarray.core.dataset.Dataset
+        xarray dataset of tobac Track information, the xarray dataset returned by tobac.tracking.linking_trackpy
+
+    Mask: xarray.core.dataset.Dataset
+        xarray dataset of tobac segmentation mask information, the xarray dataset returned
+        by tobac.segmentation.segmentation
+
+
+    Projection : xarray.core.dataarray.DataArray, default = None
+        xarray.DataArray of the original input dataset (gridded nexrad data for example).
+        If using gridded nexrad data, this can be input as: data['ProjectionCoordinateSystem']
+        An example of the type of information in the dataarray includes the following attributes:
+        latitude_of_projection_origin :29.471900939941406
+        longitude_of_projection_origin :-95.0787353515625
+        _CoordinateTransformType :Projection
+        _CoordinateAxes :x y z time
+        _CoordinateAxesTypes :GeoX GeoY Height Time
+        grid_mapping_name :azimuthal_equidistant
+        semi_major_axis :6370997.0
+        inverse_flattening :298.25
+        longitude_of_prime_meridian :0.0
+        false_easting :0.0
+        false_northing :0.0
+
+    Returns
+    -------
+
+    ds : xarray.core.dataset.Dataset
+        xarray dataset of merged Track and Segmentation Mask datasets with renamed variables.
+
+    """
+    feature_standard_names = {
+        # new variable name, and long description for the NetCDF attribute
+        "frame": (
+            "feature_time_index",
+            "positional index of the feature along the time dimension of the mask, from 0 to N-1",
+        ),
+        "hdim_1": (
+            "feature_hdim1_coordinate",
+            "position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x). "
+ "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "hdim_2": ( + "feature_hdim2_coordinate", + "position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)" + "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "idx": ("feature_id_this_frame",), + "num": ( + "feature_grid_cell_count", + "Number of grid points that are within the threshold of this feature", + ), + "threshold_value": ( + "feature_threshold_max", + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", + ), + "feature": ( + "feature_id", + "Unique number of the feature; starts from 1 and increments by 1 to the number of features", + ), + "time": ( + "feature_time", + "time of the feature, consistent with feature_time_index", + ), + "timestr": ( + "feature_time_str", + "String representation of the feature time, YYYY-MM-DD HH:MM:SS", + ), + "projection_y_coordinate": ( + "feature_projection_y_coordinate", + "y position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "projection_x_coordinate": ( + "feature_projection_x_coordinate", + "x position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "lat": ("feature_latitude", "latitude of the feature"), + "lon": ("feature_longitude", "longitude of the feature"), + "ncells": ( + "feature_ncells", + "number of grid cells for this feature (meaning uncertain)", + ), + "areas": ("feature_area",), + "isolated": ("feature_isolation_flag",), + "num_objects": ("number_of_feature_neighbors",), + "cell": ("feature_parent_cell_id",), + "time_cell": ("feature_parent_cell_elapsed_time",), + "segmentation_mask": ("2d segmentation mask",), + } + new_feature_var_names = { + k: feature_standard_names[k][0] + for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys() + } + + # TrackedFeatures = TrackedFeatures.drop(["cell_parent_track_id"]) + # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of + # the 'index' variable and call the dimension 'feature' + ds = xr.merge( + [ + TrackedFeatures.swap_dims({"index": "feature"}) + .drop("index") + .rename_vars(new_feature_var_names), + Mask, + ] + ) + + # Add the projection data back in + if not None in Projection: + ds["ProjectionCoordinateSystem"] = Projection + + return ds From 9d058577cd029ded3af7f9dfe3afe7425b34ed06 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Wed, 7 Sep 2022 12:19:56 -0500 Subject: [PATCH 08/29] Black formatting --- tobac/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tobac/utils.py b/tobac/utils.py index ca44e02c..73683c4c 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -664,6 +664,8 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): xarray dataset of merged Track and Segmentation Mask datasets with renamed variables. 
""" + import xarray as xr + feature_standard_names = { # new variable name, and long description for the NetCDF attribute "frame": ( From a25f9842edc3b4f3a5509dc0ebaa613b6cf55ed3 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Wed, 7 Sep 2022 12:21:57 -0500 Subject: [PATCH 09/29] Black formatting --- tobac/merge_split.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index c64b30bc..b0f482b0 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -270,14 +270,11 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): assert len(cell_id) == len(cell_parent_track_id) assert len(feature_id) == len(feature_parent_cell_id) assert sum(track_child_cell_count) == len(cell_id) - assert ( - sum( - [ - sum(cell_child_feature_count[1:]), - (len(np.where(feature_parent_track_id < 0)[0])), - ] - ) - == len(feature_id) - ) + assert sum( + [ + sum(cell_child_feature_count[1:]), + (len(np.where(feature_parent_track_id < 0)[0])), + ] + ) == len(feature_id) return d From c25f588d6f27adf157684229ebe6aaa53561ae21 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 18 Sep 2022 11:33:33 -0500 Subject: [PATCH 10/29] Updates to documentation and merge_split Removed the lat/lon in merge_split, and added documentation. --- doc/merge_split.rst | 19 +++++++++ tobac/merge_split.py | 96 ++++++++++++++++++-------------------------- 2 files changed, 57 insertions(+), 58 deletions(-) create mode 100644 doc/merge_split.rst diff --git a/doc/merge_split.rst b/doc/merge_split.rst new file mode 100644 index 00000000..ca02f291 --- /dev/null +++ b/doc/merge_split.rst @@ -0,0 +1,19 @@ +*Merge and Split +====================== + + +This submodule is a post processing step to address tracked cells which merge/split. +The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. +This submodule will label merged/split cells with a TRACK number in addition to its CELL number. + +Features, cells, and tracks are combined using parent/child nomenclature. +(quick note on terms; “feature” is a detected object at a single time step. “cell” is a series of features linked together over multiple timesteps. "track" may be an individual cell or series of cells which have merged and/or split.) + + +Overview of the output dataframe from merge_split + - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. + - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. + - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. + - track_child_cell_count: The total number of features belonging to all child cells of a given track id. + - cell_child_feature_count: The total number of features for each cell. + diff --git a/tobac/merge_split.py b/tobac/merge_split.py index b0f482b0..fd61d93d 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -1,10 +1,9 @@ """ Tobac merge and split v0.1 This submodule is a post processing step to address tracked cells which merge/split. - The first iteration of this module is to combine the cells which are merging but receive - a new cell id once merged. 
In general this submodule will label these three cell-ids using - the largest cell number of those within the merged/split cell ids. - + The first iteration of this module is to combine the cells which are merging but have received + a new cell id (and are considered a new cell) once merged. In general this submodule will label merged/split cells + with a TRACK number in addition to its CELL number. """ @@ -37,28 +36,23 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): d : xarray.core.dataset.Dataset xarray dataset of tobac merge/split cells with parent and child designations. + Parent/child variables include: + - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. + - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. + - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. + - track_child_cell_count: The total number of features belonging to all child cells of a given track id. + - cell_child_feature_count: The total number of features for each cell. + Example usage: d = merge_split(Track) - ds = standardize_track_dataset(Track, refl_mask) + ds = tobac.utils.standardize_track_dataset(Track, refl_mask) both_ds = xr.merge([ds, d],compat ='override') - both_ds = compress_all(both_ds) + both_ds = tobac.utils.compress_all(both_ds) both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) """ - try: - import geopy - except ImportError: - geopy = None - - if geopy: - from geopy.distance import geodesic - else: - print( - "Geopy not available, Merge/Split will proceed in tobac general coordinates." 
- ) - try: import networkx as nx except ImportError: @@ -90,39 +84,22 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): b_names = list() dist = list() - if hasattr(TRACK, "longitude"): - print("is in lat/lon") - for i in id_data: - a_pointx = track_groups[i].grid_longitude[-1].values - a_pointy = track_groups[i].grid_latitude[-1].values - for j in id_data: - b_pointx = track_groups[j].grid_longitude[0].values - b_pointy = track_groups[j].grid_latitude[0].values - d = geodesic((a_pointy, a_pointx), (b_pointy, b_pointx)).m - if d <= distance: - a_points.append([a_pointx, a_pointy]) - b_points.append([b_pointx, b_pointy]) - dist.append(d) - a_names.append(i) - b_names.append(j) - else: - - for i in id_data: - # print(i) - a_pointx = track_groups[i].hdim_2[-1].values * dxy - a_pointy = track_groups[i].hdim_1[-1].values * dxy - for j in id_data: - b_pointx = track_groups[j].hdim_2[0].values * dxy - b_pointy = track_groups[j].hdim_1[0].values * dxy - d = np.linalg.norm( - np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) - ) - if d <= distance: - a_points.append([a_pointx, a_pointy]) - b_points.append([b_pointx, b_pointy]) - dist.append(d) - a_names.append(i) - b_names.append(j) + for i in id_data: + # print(i) + a_pointx = track_groups[i].hdim_2[-1].values * dxy + a_pointy = track_groups[i].hdim_1[-1].values * dxy + for j in id_data: + b_pointx = track_groups[j].hdim_2[0].values * dxy + b_pointy = track_groups[j].hdim_1[0].values * dxy + d = np.linalg.norm( + np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) + ) + if d <= distance: + a_points.append([a_pointx, a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) id = [] # This is removing any tracks that have matched to themselves - e.g., @@ -137,7 +114,7 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): b_names.pop(i) else: continue - + # This is inputting data to the object with will perform the spanning tree. g = nx.Graph() for i in np.arange(len(dist)): g.add_edge(a_names[i], b_names[i], weight=dist[i]) @@ -270,11 +247,14 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): assert len(cell_id) == len(cell_parent_track_id) assert len(feature_id) == len(feature_parent_cell_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum( - [ - sum(cell_child_feature_count[1:]), - (len(np.where(feature_parent_track_id < 0)[0])), - ] - ) == len(feature_id) + assert ( + sum( + [ + sum(cell_child_feature_count[1:]), + (len(np.where(feature_parent_track_id < 0)[0])), + ] + ) + == len(feature_id) + ) return d From 6327c8dd5dbc6942f0a64f6b65727c52915ed613 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 18 Sep 2022 11:38:50 -0500 Subject: [PATCH 11/29] Formatting --- build/lib/tobac/merge_split.py | 259 +++++++++++++++++++++++++++++++++ tobac/merge_split.py | 23 ++- 2 files changed, 269 insertions(+), 13 deletions(-) create mode 100644 build/lib/tobac/merge_split.py diff --git a/build/lib/tobac/merge_split.py b/build/lib/tobac/merge_split.py new file mode 100644 index 00000000..609ae47f --- /dev/null +++ b/build/lib/tobac/merge_split.py @@ -0,0 +1,259 @@ +""" + Tobac merge and split v0.1 + This submodule is a post processing step to address tracked cells which merge/split. + The first iteration of this module is to combine the cells which are merging but have received + a new cell id (and are considered a new cell) once merged. 
In general this submodule will label merged/split cells + with a TRACK number in addition to its CELL number. + +""" + + +def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): + """ + function to postprocess tobac track data for merge/split cells + + + Parameters + ---------- + TRACK : pandas.core.frame.DataFrame + Pandas dataframe of tobac Track information + + distance : float, optional + Distance threshold prior to adding a pair of points into the minimum spanning tree. + Default is 25000 meters. + + frame_len : float, optional + Threshold for the spanning length within which two points can be separated. + Default is five (5) frames. + + dxy : float, optional + The x/y/ grid spacing of the data. + Default is 500m. + + Returns + ------- + + d : xarray.core.dataset.Dataset + xarray dataset of tobac merge/split cells with parent and child designations. + + Parent/child variables include: + - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. + - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. + - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. + - track_child_cell_count: The total number of features belonging to all child cells of a given track id. + - cell_child_feature_count: The total number of features for each cell. + + + Example usage: + d = merge_split(Track) + ds = tobac.utils.standardize_track_dataset(Track, refl_mask) + both_ds = xr.merge([ds, d],compat ='override') + both_ds = tobac.utils.compress_all(both_ds) + both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) + + """ + + + try: + import networkx as nx + except ImportError: + networkx = None + + # if networkx: + # import networkx as nx + # else: + # print("Cannot Merge/Split. Please install networkx.") + import logging + import numpy as np + from pandas.core.common import flatten + import xarray as xr + + # Check if dxy is in meters of in kilometers. It should be in meters + if dxy <= 5: + dxy *= 1000 + + # Immediately convert pandas dataframe of track information to xarray: + TRACK = TRACK.to_xarray() + track_groups = TRACK.groupby("cell") + cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} + id_data = np.fromiter(cell_ids.keys(), dtype=int) + count_data = np.fromiter(cell_ids.values(), dtype=int) + all_frames = np.sort(np.unique(TRACK.frame)) + a_points = list() + b_points = list() + a_names = list() + b_names = list() + dist = list() + + + for i in id_data: + # print(i) + a_pointx = track_groups[i].hdim_2[-1].values * dxy + a_pointy = track_groups[i].hdim_1[-1].values * dxy + for j in id_data: + b_pointx = track_groups[j].hdim_2[0].values * dxy + b_pointy = track_groups[j].hdim_1[0].values * dxy + d = np.linalg.norm( + np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) + ) + if d <= distance: + a_points.append([a_pointx, a_pointy]) + b_points.append([b_pointx, b_pointy]) + dist.append(d) + a_names.append(i) + b_names.append(j) + + id = [] + # This is removing any tracks that have matched to themselves - e.g., + # a beginning of a track has found its tail. 
+ for i in range(len(dist) - 1, -1, -1): + if a_names[i] == b_names[i]: + id.append(i) + a_points.pop(i) + b_points.pop(i) + dist.pop(i) + a_names.pop(i) + b_names.pop(i) + else: + continue + # This is inputting data to the object with will perform the spanning tree. + g = nx.Graph() + for i in np.arange(len(dist)): + g.add_edge(a_names[i], b_names[i], weight=dist[i]) + + tree = nx.minimum_spanning_edges(g) + tree_list = list(tree) + + new_tree = [] + for i, j in enumerate(tree_list): + frame_a = np.nanmax(track_groups[j[0]].frame.values) + frame_b = np.nanmin(track_groups[j[1]].frame.values) + if np.abs(frame_a - frame_b) <= frame_len: + new_tree.append(tree_list[i][0:2]) + new_tree_arr = np.array(new_tree) + + TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) + cell_id = np.unique( + TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] + ) + track_id = dict() # same size as number of total merged tracks + + arr = np.array([0]) + for p in cell_id: + j = np.where(arr == int(p)) + if len(j[0]) > 0: + continue + else: + k = np.where(new_tree_arr == p) + if len(k[0]) == 0: + track_id[p] = [p] + arr = np.append(arr, p) + else: + temp1 = list(np.unique(new_tree_arr[k[0]])) + temp = list(np.unique(new_tree_arr[k[0]])) + + for l in range(15): + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + + if len(temp1) == len(temp): + break + temp1 = np.array(temp) + + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + arr = np.append(arr, np.unique(temp)) + + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + track_ids = np.array(list(track_id.keys())) + logging.debug("found track ids") + + cell_id = list( + np.unique( + TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] + ) + ) + logging.debug("found cell ids") + + cell_parent_track_id = [] + + for i, id in enumerate(track_id): + + if len(track_id[int(id)]) == 1: + cell_parent_track_id.append(int(id)) + + else: + cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) + + cell_parent_track_id = list(flatten(cell_parent_track_id)) + logging.debug("found cell parent track ids") + + feature_parent_cell_id = list(TRACK.cell.values.astype(int)) + logging.debug("found feature parent cell ids") + + # This version includes all the feature regardless of if they are used in cells or not. 
+ feature_id = list(TRACK.feature.values.astype(int)) + logging.debug("found feature ids") + + feature_parent_track_id = [] + feature_parent_track_id = np.zeros(len(feature_id)) + for i, id in enumerate(feature_id): + cellid = feature_parent_cell_id[i] + if np.isnan(cellid): + feature_parent_track_id[i] = -1 + else: + j = np.where(cell_id == cellid) + j = np.squeeze(j) + trackid = cell_parent_track_id[j] + feature_parent_track_id[i] = trackid + + logging.debug("found feature parent track ids") + + track_child_cell_count = [] + for i, id in enumerate(track_id): + track_child_cell_count.append(len(track_id[int(id)])) + logging.debug("found track child cell count") + + cell_child_feature_count = [] + for i, id in enumerate(cell_id): + cell_child_feature_count.append(len(track_groups[id].feature.values)) + logging.debug("found cell child feature count") + + track_dim = "tracks" + cell_dim = "cells" + feature_dim = "features" + + d = xr.Dataset( + { + "track": (track_dim, track_ids), + "cell": (cell_dim, cell_id), + "cell_parent_track_id": (cell_dim, cell_parent_track_id), + "feature": (feature_dim, feature_id), + "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), + "feature_parent_track_id": (feature_dim, feature_parent_track_id), + "track_child_cell_count": (track_dim, track_child_cell_count), + "cell_child_feature_count": (cell_dim, cell_child_feature_count), + } + ) + + d = d.set_coords(["feature", "cell", "track"]) + + assert len(cell_id) == len(cell_parent_track_id) + assert len(feature_id) == len(feature_parent_cell_id) + assert sum(track_child_cell_count) == len(cell_id) + assert sum( + [ + sum(cell_child_feature_count[1:]), + (len(np.where(feature_parent_track_id < 0)[0])), + ] + ) == len(feature_id) + + return d diff --git a/tobac/merge_split.py b/tobac/merge_split.py index fd61d93d..51dc1286 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -38,10 +38,10 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): Parent/child variables include: - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. - - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. - - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. - - track_child_cell_count: The total number of features belonging to all child cells of a given track id. - - cell_child_feature_count: The total number of features for each cell. + - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. + - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. + - track_child_cell_count: The total number of features belonging to all child cells of a given track id. + - cell_child_feature_count: The total number of features for each cell. 
Example usage: @@ -247,14 +247,11 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): assert len(cell_id) == len(cell_parent_track_id) assert len(feature_id) == len(feature_parent_cell_id) assert sum(track_child_cell_count) == len(cell_id) - assert ( - sum( - [ - sum(cell_child_feature_count[1:]), - (len(np.where(feature_parent_track_id < 0)[0])), - ] - ) - == len(feature_id) - ) + assert sum( + [ + sum(cell_child_feature_count[1:]), + (len(np.where(feature_parent_track_id < 0)[0])), + ] + ) == len(feature_id) return d From 99cb1790b5b3820e1ae7bf486033e314cecc632a Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 18 Sep 2022 20:52:44 -0500 Subject: [PATCH 12/29] Updated utils.py with changes made for 1.4 Included the spectral_filtering function in V1.4 --- tobac/utils.py | 576 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 446 insertions(+), 130 deletions(-) diff --git a/tobac/utils.py b/tobac/utils.py index 73683c4c..85c8fdb9 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -2,18 +2,27 @@ def column_mask_from2D(mask_2D, cube, z_coord="model_level_number"): - """function to turn 2D watershedding mask into a 3D mask of selected columns - Input: - cube: iris.cube.Cube - data cube - mask_2D: iris.cube.Cube - 2D cube containing mask (int id for tacked volumes 0 everywhere else) - z_coord: str - name of the vertical coordinate in the cube - Output: - mask_2D: iris.cube.Cube - 3D cube containing columns of 2D mask (int id for tacked volumes 0 everywhere else) + """Turn 2D watershedding mask into a 3D mask of selected columns. + + Parameters + ---------- + cube : iris.cube.Cube + Data cube. + + mask_2D : iris.cube.Cube + 2D cube containing mask (int id for tacked volumes 0 + everywhere else). + + z_coord : str + Name of the vertical coordinate in the cube. + + Returns + ------- + mask_2D : iris.cube.Cube + 3D cube containing columns of 2D mask (int id for tracked + volumes, 0 everywhere else). """ + from copy import deepcopy mask_3D = deepcopy(cube) @@ -28,18 +37,29 @@ def column_mask_from2D(mask_2D, cube, z_coord="model_level_number"): def mask_cube_cell(variable_cube, mask, cell, track): - """Mask cube for tracked volume of an individual cell - Input: - variable_cube: iris.cube.Cube - unmasked data cube - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - cell: int - interger id of cell to create masked cube for - Output: - variable_cube_out: iris.cube.Cube - Masked cube with data for respective cell + """Mask cube for tracked volume of an individual cell. + + Parameters + ---------- + variable_cube : iris.cube.Cube + Unmasked data cube. + + mask : iris.cube.Cube + Cube containing mask (int id for tracked volumes, 0 everywhere + else). + + cell : int + Integer id of cell to create masked cube for. + + track : pandas.DataFrame + Output of the linking. + + Returns + ------- + variable_cube_out : iris.cube.Cube + Masked cube with data for respective cell. """ + from copy import deepcopy variable_cube_out = deepcopy(variable_cube) @@ -49,16 +69,23 @@ def mask_cube_cell(variable_cube, mask, cell, track): def mask_cube_all(variable_cube, mask): - """Mask cube for untracked volume - Input: - variable_cube: iris.cube.Cube - unmasked data cube - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: iris.cube.Cube - Masked cube for untracked volume + """Mask cube (iris.cube) for tracked volume. 
+ + Parameters + ---------- + variable_cube : iris.cube.Cube + Unmasked data cube. + + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + Returns + ------- + variable_cube_out : iris.cube.Cube + Masked cube for untracked volume. """ + from dask.array import ma from copy import deepcopy @@ -70,16 +97,23 @@ def mask_cube_all(variable_cube, mask): def mask_cube_untracked(variable_cube, mask): - """Mask cube for untracked volume - Input: - variable_cube: iris.cube.Cube - unmasked data cube - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: iris.cube.Cube - Masked cube for untracked volume + """Mask cube (iris.cube) for untracked volume. + + Parameters + ---------- + variable_cube : iris.cube.Cube + Unmasked data cube. + + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + Returns + ------- + variable_cube_out : iris.cube.Cube + Masked cube for untracked volume. """ + from dask.array import ma from copy import deepcopy @@ -91,47 +125,91 @@ def mask_cube_untracked(variable_cube, mask): def mask_cube(cube_in, mask): - """Mask cube where mask is larger than zero - Input: - cube_in: iris.cube.Cube - unmasked data cube - mask: numpy.ndarray or dask.array - mask to use for masking, >0 where cube is supposed to be masked - Output: - cube_out: iris.cube.Cube - Masked cube + """Mask cube where mask is not zero. + + Parameters + ---------- + cube_in : iris.cube.Cube + Unmasked data cube. + + mask : iris.cube.Cube + Mask to use for masking, >0 where cube is supposed to be masked. + + Returns + ------- + variable_cube_out : iris.cube.Cube + Masked cube. """ + from dask.array import ma from copy import deepcopy cube_out = deepcopy(cube_in) - cube_out.data = ma.masked_where(mask != 0, cube_in.core_data()) + cube_out.data = ma.masked_where(mask.core_data() != 0, cube_in.core_data()) return cube_out def mask_cell(mask, cell, track, masked=False): - """create mask for specific cell - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: numpy.ndarray - Masked cube for untracked volume + """Create mask for specific cell. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tracked volumes 0 everywhere + else). + + cell : int + Integer id of cell to create masked cube for. + + track : pandas.DataFrame + Output of the linking. + + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. Default is False. + + Returns + ------- + mask_i : numpy.ndarray + Mask for a specific cell. """ + feature_ids = track.loc[track["cell"] == cell, "feature"].values mask_i = mask_features(mask, feature_ids, masked=masked) return mask_i def mask_cell_surface(mask, cell, track, masked=False, z_coord="model_level_number"): - """Create surface projection of mask for individual cell - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: iris.cube.Cube - Masked cube for untracked volume + """Create surface projection of 3d-mask for individual cell by + collapsing one coordinate. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes, 0 everywhere + else). + + cell : int + Integer id of cell to create masked cube for. + + track : pandas.DataFrame + Output of the linking. 
+ + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. Default is False. + + z_coord : str, optional + Name of the coordinate to collapse. Default is 'model_level_number'. + + Returns + ------- + mask_i_surface : iris.cube.Cube + Collapsed Masked cube for the cell with the maximum value + along the collapsed coordinate. + """ + feature_ids = track.loc[track["cell"] == cell, "feature"].values mask_i_surface = mask_features_surface( mask, feature_ids, masked=masked, z_coord=z_coord @@ -140,32 +218,69 @@ def mask_cell_surface(mask, cell, track, masked=False, z_coord="model_level_numb def mask_cell_columns(mask, cell, track, masked=False, z_coord="model_level_number"): - """Create mask with entire columns for individual cell - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: iris.cube.Cube - Masked cube for untracked volume + """Create mask with entire columns for individual cell. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + cell : int + Interger id of cell to create the masked cube for. + + track : pandas.DataFrame + Output of the linking. + + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. Default is False. + + z_coord : str, optional + Default is 'model_level_number'. + + Returns + ------- + mask_i : iris.cube.Cube + Masked cube for untracked volume. + + Notes + ------- + Function is not working since mask_features_columns() + is commented out """ + + raise NotImplementedError( + "The function mask_cell_columns() is not implemented currently." + ) + feature_ids = track.loc[track["cell"] == cell].loc["feature"] mask_i = mask_features_columns(mask, feature_ids, masked=masked, z_coord=z_coord) return mask_i def mask_cube_features(variable_cube, mask, feature_ids): - """Mask cube for tracked volume of an individual cell - Input: - variable_cube: iris.cube.Cube - unmasked data cube - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - cell: int - interger id of cell to create masked cube for - Output: - variable_cube_out: iris.cube.Cube - Masked cube with data for respective cell + """Mask cube for tracked volume of one or more specific + features. + + Parameters + ---------- + variable_cube : iris.cube.Cube + Unmasked data cube. + + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes, 0 everywhere + else). + + feature_ids : int or list of ints + Integer ids of features to create masked cube for. + + Returns + ------- + variable_cube_out : iris.cube.Cube + Masked cube with data for respective features. """ + from dask.array import ma, isin from copy import deepcopy @@ -177,14 +292,27 @@ def mask_cube_features(variable_cube, mask, feature_ids): def mask_features(mask, feature_ids, masked=False): - """create mask for specific features - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: numpy.ndarray - Masked cube for untracked volume + """Create mask for specific features. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + feature_ids : int or list of ints + Integer ids of the features to create the masked cube for. + + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. 
Default is False. + + Returns + ------- + mask_i : numpy.ndarray + Masked cube for specific features. """ + from dask.array import ma, isin from copy import deepcopy @@ -199,14 +327,33 @@ def mask_features(mask, feature_ids, masked=False): def mask_features_surface( mask, feature_ids, masked=False, z_coord="model_level_number" ): - """create surface mask for individual features - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - variable_cube_out: iris.cube.Cube - Masked cube for untracked volume + """Create surface projection of 3d-mask for specific features + by collapsing one coordinate. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + feature_ids : int or list of ints + Integer ids of the features to create the masked cube for. + + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. Default is False + + z_coord : str, optional + Name of the coordinate to collapse. Default is + 'model_level_number'. + + Returns + ------- + mask_i_surface : iris.cube.Cube + Collapsed Masked cube for the features with the maximum value + along the collapsed coordinate. """ + from iris.analysis import MAX from dask.array import ma, isin from copy import deepcopy @@ -222,14 +369,30 @@ def mask_features_surface( def mask_all_surface(mask, masked=False, z_coord="model_level_number"): - """create surface mask for individual features - Input: - mask: iris.cube.Cube - cube containing mask (int id for tacked volumes 0 everywhere else) - Output: - mask_i_surface: iris.cube.Cube (2D) - Mask with 1 below features and 0 everywhere else + """Create surface projection of 3d-mask for all features + by collapsing one coordinate. + + Parameters + ---------- + mask : iris.cube.Cube + Cube containing mask (int id for tacked volumes 0 everywhere + else). + + masked : bool, optional + Bool determining whether to mask the mask for the cell where + it is 0. Default is False + + z_coord : str, optional + Name of the coordinate to collapse. Default is + 'model_level_number'. + + Returns + ------- + mask_i_surface : iris.cube.Cube (2D) + Collapsed Masked cube for the features with the maximum value + along the collapsed coordinate. """ + from iris.analysis import MAX from dask.array import ma, isin from copy import deepcopy @@ -237,7 +400,7 @@ def mask_all_surface(mask, masked=False, z_coord="model_level_number"): mask_i = deepcopy(mask) mask_i_surface = mask_i.collapsed(z_coord, MAX) mask_i_surface_data = mask_i_surface.core_data() - mask_i_surface[mask_i_surface_data > 0] = 1 + mask_i_surface.data[mask_i_surface_data > 0] = 1 if masked: mask_i_surface.data = ma.masked_equal(mask_i_surface.core_data(), 0) return mask_i_surface @@ -306,18 +469,27 @@ def mask_all_surface(mask, masked=False, z_coord="model_level_number"): def add_coordinates(t, variable_cube): - import numpy as np + """Add coordinates from the input cube of the feature detection + to the trajectories/features. + + Parameters + ---------- + t : pandas.DataFrame + Trajectories/features from feature detection or linking step. + + variable_cube : iris.cube.Cube + Input data used for the tracking with coordinate information + to transfer to the resulting DataFrame. Needs to contain the + coordinate 'time'. + + Returns + ------- + t : pandas.DataFrame + Trajectories with added coordinates. 
- """ Function adding coordinates from the tracking cube to the trajectories: time, longitude&latitude, x&y dimensions - Input: - t: pandas DataFrame - trajectories/features - variable_cube: iris.cube.Cube - Cube containing the dimensions 'time','longitude','latitude','x_projection_coordinate','y_projection_coordinate', usually cube that the tracking is performed on - Output: - t: pandas DataFrame - trajectories with added coordinated """ + + import numpy as np from scipy.interpolate import interp2d, interp1d logging.debug("start adding coordinates from cube") @@ -440,11 +612,29 @@ def add_coordinates(t, variable_cube): def get_bounding_box(x, buffer=1): - from numpy import delete, arange, diff, nonzero, array - - """ Calculates the bounding box of a ndarray + """Finds the bounding box of a ndarray, i.e. the smallest + bounding rectangle for nonzero values as explained here: https://stackoverflow.com/questions/31400769/bounding-box-of-numpy-array + + Parameters + ---------- + x : numpy.ndarray + Array for which the bounding box is to be determined. + + buffer : int, optional + Number to set a buffer between the nonzero values and + the edges of the box. Default is 1. + + Returns + ------- + bbox : list + Dimensionwise list of the indices representing the edges + of the bounding box. + """ + + from numpy import delete, arange, diff, nonzero, array + mask = x == 0 bbox = [] @@ -472,6 +662,38 @@ def get_bounding_box(x, buffer=1): def get_spacings(field_in, grid_spacing=None, time_spacing=None): + """Determine spatial and temporal grid spacing of the + input data. + + Parameters + ---------- + field_in : iris.cube.Cube + Input field where to get spacings. + + grid_spacing : float, optional + Manually sets the grid spacing if specified. + Default is None. + + time_spacing : float, optional + Manually sets the time spacing if specified. + Default is None. + + Returns + ------- + dxy : float + Grid spacing in metres. + + dt : float + Time resolution in seconds. + + Raises + ------ + ValueError + If input_cube does not contain projection_x_coord and + projection_y_coord or keyword argument grid_spacing. + + """ + import numpy as np from copy import deepcopy @@ -516,14 +738,16 @@ def get_label_props_in_dict(labels): Parameters ---------- - labels: 2D array-like - comes from the `skimage.measure.label` function + labels : 2D array-like + Output of the `skimage.measure.label` function. Returns ------- - dict - output from skimage.measure.regionprops in dictionary format, where they key is the label number + region_properties_dict: dict + Output from skimage.measure.regionprops in dictionary + format, where they key is the label number. """ + import skimage.measure region_properties_raw = skimage.measure.regionprops(labels) @@ -540,24 +764,27 @@ def get_indices_of_labels_from_reg_prop_dict(region_property_dict): Parameters ---------- - region_property_dict: dict of region_property objects + region_property_dict : dict of region_property objects This dict should come from the get_label_props_in_dict function. Returns ------- - dict (key: label number, int) - The number of points in the label number - dict (key: label number, int) - the y indices in the label number - dict (key: label number, int) - the x indices in the label number + curr_loc_indices : dict + The number of points in the label number (key: label number). + + y_indices : dict + The y indices in the label number (key: label number). + + x_indices : dict + The x indices in the label number (key: label number). 
Raises ------ ValueError - a ValueError is raised if there are no regions in the region property dict - + A ValueError is raised if there are no regions in the region + property dict. """ + import numpy as np if len(region_property_dict) == 0: @@ -580,6 +807,95 @@ def get_indices_of_labels_from_reg_prop_dict(region_property_dict): return (curr_loc_indices, y_indices, x_indices) +def spectral_filtering( + dxy, field_in, lambda_min, lambda_max, return_transfer_function=False +): + """This function creates and applies a 2D transfer function that + can be used as a bandpass filter to remove certain wavelengths + of an atmospheric input field (e.g. vorticity, IVT, etc). + + Parameters: + ----------- + dxy : float + Grid spacing in m. + + field_in: numpy.array + 2D field with input data. + + lambda_min: float + Minimum wavelength in m. + + lambda_max: float + Maximum wavelength in m. + + return_transfer_function: boolean, optional + default: False. If set to True, then the 2D transfer function and + the corresponding wavelengths are returned. + + Returns: + -------- + filtered_field: numpy.array + Spectrally filtered 2D field of data (with same shape as input data). + + transfer_function: tuple + Two 2D fields, where the first one corresponds to the wavelengths + in the spectral space of the domain and the second one to the 2D + transfer function of the bandpass filter. Only returned, if + return_transfer_function is True. + """ + + import numpy as np + from scipy import signal + from scipy import fft + + # check if valid value for dxy is given + if dxy <= 0: + raise ValueError( + "Invalid value for dxy. Please provide the grid spacing in meter." + ) + + # get number of grid cells in x and y direction + Ni = field_in.shape[-2] + Nj = field_in.shape[-1] + # wavenumber space + m, n = np.meshgrid(np.arange(Ni), np.arange(Nj), indexing="ij") + + # if domain is squared: + if Ni == Nj: + wavenumber = np.sqrt(m**2 + n**2) + lambda_mn = (2 * Ni * (dxy)) / wavenumber + else: + # if domain is a rectangle: + # alpha is the normalized wavenumber in wavenumber space + alpha = np.sqrt(m**2 / Ni**2 + n**2 / Nj**2) + # compute wavelengths for target grid in m + lambda_mn = 2 * dxy / alpha + + ############### create a 2D bandpass filter (butterworth) ####################### + b, a = signal.iirfilter( + 2, + [1 / lambda_max, 1 / lambda_min], + btype="band", + ftype="butter", + fs=1 / dxy, + output="ba", + ) + w, h = signal.freqz(b, a, 1 / lambda_mn.flatten(), fs=1 / dxy) + transfer_function = np.reshape(abs(h), lambda_mn.shape) + + # 2-dimensional discrete cosine transformation to convert data to spectral space + spectral = fft.dctn(field_in.data) + # multiplication of spectral coefficients with transfer function + filtered = spectral * transfer_function + # inverse discrete cosine transformation to go back from spectral to original space + filtered_field = fft.idctn(filtered) + + if return_transfer_function is True: + return (lambda_mn, transfer_function), filtered_field + else: + return filtered_field + + def compress_all(nc_grids, min_dims=2, comp_level=4): """ The purpose of this subroutine is to compress the netcdf variables as they are saved. From 6391513009014a59227c7454fac06654b4177550 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sat, 24 Sep 2022 22:01:15 -0500 Subject: [PATCH 13/29] Updating Cell numbering. Changed cell_parent_track_id to begin new count rather than use max cell ID. 
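For intuition, the diff below stops keying each merged track to the largest
cell id in its group and instead numbers tracks compactly in enumeration
order. A toy sketch of the intended effect (the cell ids and groupings here
are made up for illustration, not tobac output):

    # track_id maps a representative cell id -> every cell id in that merged track
    track_id = {3: [1, 2, 3], 4: [4], 9: [7, 9]}

    cell_parent_track_id = []
    for i, key in enumerate(track_id):
        # all cells of one group share the compact track number i
        cell_parent_track_id.extend([i] * len(track_id[key]))

    print(cell_parent_track_id)  # [0, 0, 0, 1, 2, 2]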
--- tobac/merge_split.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 51dc1286..6b598375 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -171,8 +171,8 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - track_ids = np.array(list(track_id.keys())) - logging.debug("found track ids") + # track_ids = np.array(list(track_id.keys())) + # logging.debug("found track ids") cell_id = list( np.unique( @@ -186,14 +186,17 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): for i, id in enumerate(track_id): if len(track_id[int(id)]) == 1: - cell_parent_track_id.append(int(id)) + cell_parent_track_id.append(int(i)) else: - cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) + cell_parent_track_id.append(np.repeat(int(i), len(track_id[int(id)]))) cell_parent_track_id = list(flatten(cell_parent_track_id)) logging.debug("found cell parent track ids") + track_ids = np.array(np.unique(cell_parent_track_id)) + logging.debug("found track ids") + feature_parent_cell_id = list(TRACK.cell.values.astype(int)) logging.debug("found feature parent cell ids") @@ -205,7 +208,7 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): feature_parent_track_id = np.zeros(len(feature_id)) for i, id in enumerate(feature_id): cellid = feature_parent_cell_id[i] - if np.isnan(cellid): + if cellid < 0: feature_parent_track_id[i] = -1 else: j = np.where(cell_id == cellid) From b9c4d262f08aff2fc12bff9060a43cecac22fa06 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 25 Sep 2022 09:01:36 -0500 Subject: [PATCH 14/29] None type error in Projection option Fixed None type error for importing Projection keyword. --- tobac/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tobac/utils.py b/tobac/utils.py index 85c8fdb9..854bf178 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -1061,7 +1061,7 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): ) # Add the projection data back in - if not None in Projection: + if Projection is not None: ds["ProjectionCoordinateSystem"] = Projection return ds From fb184139eb55692fa28c5133e7def5d3b8801b77 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Sun, 2 Oct 2022 20:52:22 -0400 Subject: [PATCH 15/29] Deleting the build/lib file that is not needed for the PR --- build/lib/tobac/merge_split.py | 259 --------------------------------- 1 file changed, 259 deletions(-) delete mode 100644 build/lib/tobac/merge_split.py diff --git a/build/lib/tobac/merge_split.py b/build/lib/tobac/merge_split.py deleted file mode 100644 index 609ae47f..00000000 --- a/build/lib/tobac/merge_split.py +++ /dev/null @@ -1,259 +0,0 @@ -""" - Tobac merge and split v0.1 - This submodule is a post processing step to address tracked cells which merge/split. - The first iteration of this module is to combine the cells which are merging but have received - a new cell id (and are considered a new cell) once merged. In general this submodule will label merged/split cells - with a TRACK number in addition to its CELL number. 
- -""" - - -def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): - """ - function to postprocess tobac track data for merge/split cells - - - Parameters - ---------- - TRACK : pandas.core.frame.DataFrame - Pandas dataframe of tobac Track information - - distance : float, optional - Distance threshold prior to adding a pair of points into the minimum spanning tree. - Default is 25000 meters. - - frame_len : float, optional - Threshold for the spanning length within which two points can be separated. - Default is five (5) frames. - - dxy : float, optional - The x/y/ grid spacing of the data. - Default is 500m. - - Returns - ------- - - d : xarray.core.dataset.Dataset - xarray dataset of tobac merge/split cells with parent and child designations. - - Parent/child variables include: - - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. - - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. - - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. - - track_child_cell_count: The total number of features belonging to all child cells of a given track id. - - cell_child_feature_count: The total number of features for each cell. - - - Example usage: - d = merge_split(Track) - ds = tobac.utils.standardize_track_dataset(Track, refl_mask) - both_ds = xr.merge([ds, d],compat ='override') - both_ds = tobac.utils.compress_all(both_ds) - both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) - - """ - - - try: - import networkx as nx - except ImportError: - networkx = None - - # if networkx: - # import networkx as nx - # else: - # print("Cannot Merge/Split. Please install networkx.") - import logging - import numpy as np - from pandas.core.common import flatten - import xarray as xr - - # Check if dxy is in meters of in kilometers. It should be in meters - if dxy <= 5: - dxy *= 1000 - - # Immediately convert pandas dataframe of track information to xarray: - TRACK = TRACK.to_xarray() - track_groups = TRACK.groupby("cell") - cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()} - id_data = np.fromiter(cell_ids.keys(), dtype=int) - count_data = np.fromiter(cell_ids.values(), dtype=int) - all_frames = np.sort(np.unique(TRACK.frame)) - a_points = list() - b_points = list() - a_names = list() - b_names = list() - dist = list() - - - for i in id_data: - # print(i) - a_pointx = track_groups[i].hdim_2[-1].values * dxy - a_pointy = track_groups[i].hdim_1[-1].values * dxy - for j in id_data: - b_pointx = track_groups[j].hdim_2[0].values * dxy - b_pointy = track_groups[j].hdim_1[0].values * dxy - d = np.linalg.norm( - np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy)) - ) - if d <= distance: - a_points.append([a_pointx, a_pointy]) - b_points.append([b_pointx, b_pointy]) - dist.append(d) - a_names.append(i) - b_names.append(j) - - id = [] - # This is removing any tracks that have matched to themselves - e.g., - # a beginning of a track has found its tail. - for i in range(len(dist) - 1, -1, -1): - if a_names[i] == b_names[i]: - id.append(i) - a_points.pop(i) - b_points.pop(i) - dist.pop(i) - a_names.pop(i) - b_names.pop(i) - else: - continue - # This is inputting data to the object with will perform the spanning tree. 
- g = nx.Graph() - for i in np.arange(len(dist)): - g.add_edge(a_names[i], b_names[i], weight=dist[i]) - - tree = nx.minimum_spanning_edges(g) - tree_list = list(tree) - - new_tree = [] - for i, j in enumerate(tree_list): - frame_a = np.nanmax(track_groups[j[0]].frame.values) - frame_b = np.nanmin(track_groups[j[1]].frame.values) - if np.abs(frame_a - frame_b) <= frame_len: - new_tree.append(tree_list[i][0:2]) - new_tree_arr = np.array(new_tree) - - TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) - cell_id = np.unique( - TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] - ) - track_id = dict() # same size as number of total merged tracks - - arr = np.array([0]) - for p in cell_id: - j = np.where(arr == int(p)) - if len(j[0]) > 0: - continue - else: - k = np.where(new_tree_arr == p) - if len(k[0]) == 0: - track_id[p] = [p] - arr = np.append(arr, p) - else: - temp1 = list(np.unique(new_tree_arr[k[0]])) - temp = list(np.unique(new_tree_arr[k[0]])) - - for l in range(15): - for i in temp1: - k2 = np.where(new_tree_arr == i) - temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) - temp = list(flatten(temp)) - temp = list(np.unique(temp)) - - if len(temp1) == len(temp): - break - temp1 = np.array(temp) - - for i in temp1: - k2 = np.where(new_tree_arr == i) - temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) - - temp = list(flatten(temp)) - temp = list(np.unique(temp)) - arr = np.append(arr, np.unique(temp)) - - track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - - track_ids = np.array(list(track_id.keys())) - logging.debug("found track ids") - - cell_id = list( - np.unique( - TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] - ) - ) - logging.debug("found cell ids") - - cell_parent_track_id = [] - - for i, id in enumerate(track_id): - - if len(track_id[int(id)]) == 1: - cell_parent_track_id.append(int(id)) - - else: - cell_parent_track_id.append(np.repeat(int(id), len(track_id[int(id)]))) - - cell_parent_track_id = list(flatten(cell_parent_track_id)) - logging.debug("found cell parent track ids") - - feature_parent_cell_id = list(TRACK.cell.values.astype(int)) - logging.debug("found feature parent cell ids") - - # This version includes all the feature regardless of if they are used in cells or not. 
- feature_id = list(TRACK.feature.values.astype(int)) - logging.debug("found feature ids") - - feature_parent_track_id = [] - feature_parent_track_id = np.zeros(len(feature_id)) - for i, id in enumerate(feature_id): - cellid = feature_parent_cell_id[i] - if np.isnan(cellid): - feature_parent_track_id[i] = -1 - else: - j = np.where(cell_id == cellid) - j = np.squeeze(j) - trackid = cell_parent_track_id[j] - feature_parent_track_id[i] = trackid - - logging.debug("found feature parent track ids") - - track_child_cell_count = [] - for i, id in enumerate(track_id): - track_child_cell_count.append(len(track_id[int(id)])) - logging.debug("found track child cell count") - - cell_child_feature_count = [] - for i, id in enumerate(cell_id): - cell_child_feature_count.append(len(track_groups[id].feature.values)) - logging.debug("found cell child feature count") - - track_dim = "tracks" - cell_dim = "cells" - feature_dim = "features" - - d = xr.Dataset( - { - "track": (track_dim, track_ids), - "cell": (cell_dim, cell_id), - "cell_parent_track_id": (cell_dim, cell_parent_track_id), - "feature": (feature_dim, feature_id), - "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), - "feature_parent_track_id": (feature_dim, feature_parent_track_id), - "track_child_cell_count": (track_dim, track_child_cell_count), - "cell_child_feature_count": (cell_dim, cell_child_feature_count), - } - ) - - d = d.set_coords(["feature", "cell", "track"]) - - assert len(cell_id) == len(cell_parent_track_id) - assert len(feature_id) == len(feature_parent_cell_id) - assert sum(track_child_cell_count) == len(cell_id) - assert sum( - [ - sum(cell_child_feature_count[1:]), - (len(np.where(feature_parent_track_id < 0)[0])), - ] - ) == len(feature_id) - - return d From 2a9255c24b617fc1b6c49bc9e7a2682cf000dfd7 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Mon, 3 Oct 2022 22:01:09 -0400 Subject: [PATCH 16/29] Updating documentation, and merge/split methods. Updated merge_split methods to increase speed. Added output documentation table to docs, and updated the index to reflect the changes. --- doc/index.rst | 7 +++ doc/merge_split.rst | 56 ++++++++++++++++-- doc/merge_split_out_vars.csv | 9 +++ doc/merge_split_output_vars.rst | 9 +++ tobac/merge_split.py | 102 ++++++++++++-------------------- tobac/utils.py | 4 +- 6 files changed, 118 insertions(+), 69 deletions(-) create mode 100644 doc/merge_split_out_vars.csv create mode 100644 doc/merge_split_output_vars.rst diff --git a/doc/index.rst b/doc/index.rst index 96b589cf..b0ee6567 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -49,6 +49,13 @@ The project is currently being extended by several contributors to include addit linking tracking_output + +.. toctree:: + :caption: Merge/Split + :maxdepth: 2 + + merge_split + merge_split_out_vars .. toctree:: :caption: API Reference diff --git a/doc/merge_split.rst b/doc/merge_split.rst index ca02f291..b01168c1 100644 --- a/doc/merge_split.rst +++ b/doc/merge_split.rst @@ -2,6 +2,16 @@ ====================== +This submodule is a post processing step to address tracked cells which merge/split. +The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. +This submodule will label merged/split cells with a TRACK number in addition to its CELL number. + +Features, cells, and tracks are combined using parent/child nomenclature. 
+(quick note on terms; “feature” is a detected object at a single time step. “cell” is a series of features linked together over multiple timesteps. "track" may be an individual cell or series of cells which have merged and/or split.) +*Merge and Split +====================== + + This submodule is a post processing step to address tracked cells which merge/split. The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. This submodule will label merged/split cells with a TRACK number in addition to its CELL number. @@ -11,9 +21,45 @@ Features, cells, and tracks are combined using parent/child nomenclature. Overview of the output dataframe from merge_split - - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. - - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. - - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. - - track_child_cell_count: The total number of features belonging to all child cells of a given track id. - - cell_child_feature_count: The total number of features for each cell. +d : xarray.core.dataset.Dataset + xarray dataset of tobac merge/split cells with parent and child designations. + +Parent/child variables include: +- cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. +- feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. +- feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. +- track_child_cell_count: The total number of features belonging to all child cells of a given track id. +- cell_child_feature_count: The total number of features for each cell. + + +Example usage: + +d = merge_split(Track) + + +Overview of the output dataframe from merge_split +d : xarray.core.dataset.Dataset + xarray dataset of tobac merge/split cells with parent and child designations. + +Parent/child variables include: +- cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. +- feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. +- feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. +- track_child_cell_count: The total number of features belonging to all child cells of a given track id. +- cell_child_feature_count: The total number of features for each cell. + + +Example usage: + +d = merge_split(Track) + + +merge_split outputs an `xarray` dataset with several variables. The variables, (with column names listed in the `Variable Name` column), are described below with units. Coordinates and dataset dimensions are Feature, Cell, and Track. + +Variables that are common to all feature detection files: + +.. 
csv-table:: tobac Merge_Split Track Output Variables
   :file: ./merge_split_out_vars.csv
   :widths: 3, 35, 3, 3
   :header-rows: 1

diff --git a/doc/merge_split_out_vars.csv b/doc/merge_split_out_vars.csv
new file mode 100644
index 00000000..38b86679
--- /dev/null
+++ b/doc/merge_split_out_vars.csv
@@ -0,0 +1,9 @@
+Variable Name,Description,Units,Type
+feature,Unique number of the feature; starts from 1 and increments by 1 to the number of features identified in all frames,n/a,int64
+Cell,Tracked cell number; generally starts from 1. Untracked cell value is -1.,n/a,int64
+track,Unique number of the track; starts from 0 and increments by 1 to the number of tracks identified. Untracked cells and features have a track id of -1.,n/a,int64
+cell_parent_track_id,"The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id.",n/a,int64
+feature_parent_cell_id,The associated parent cell id for each feature. All features in a given cell will have the same cell id.,n/a,int64
+feature_parent_track_id,The associated parent track id for each feature. This is not the same as the cell id number.,n/a,int64
+track_child_cell_count,The number of features belonging to all child cells of a given track id.,n/a,int64
+cell_child_feature_count,The number of features for each cell.,n/a,int64
\ No newline at end of file
diff --git a/doc/merge_split_output_vars.rst b/doc/merge_split_output_vars.rst
new file mode 100644
index 00000000..a17ae327
--- /dev/null
+++ b/doc/merge_split_output_vars.rst
@@ -0,0 +1,9 @@
+merge_split outputs an `xarray` dataset with several variables. The variables, (with column names listed in the `Variable Name` column), are described below with units. Coordinates and dataset dimensions are Feature, Cell, and Track.
+
+Variables that are common to all feature detection files:
+
+.. csv-table:: tobac Merge_Split Track Output Variables
+   :file: ./merge_split_out_vars.csv
+   :widths: 3, 35, 3, 3
+   :header-rows: 1
+
diff --git a/tobac/merge_split.py b/tobac/merge_split.py
index 6b598375..837e8185 100644
--- a/tobac/merge_split.py
+++ b/tobac/merge_split.py
@@ -1,5 +1,5 @@
 """
-    Tobac merge and split v0.1
+    Tobac merge and split
     This submodule is a post processing step to address tracked cells which merge/split.
     The first iteration of this module is to combine the cells which are merging but have received
     a new cell id (and are considered a new cell) once merged. In general this submodule will label merged/split cells
@@ -8,7 +8,7 @@
 """


-def merge_split(TRACK, distance=25000, frame_len=5, dxy=500):
+def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5):
     """
     function to postprocess tobac track data for merge/split cells

@@ -18,18 +18,19 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5):
     TRACK : pandas.core.frame.DataFrame
         Pandas dataframe of tobac Track information

+    dxy : float, mandatory
+        The x/y grid spacing of the data.
+        Should be in meters.
+
+
     distance : float, optional
-        Distance threshold prior to adding a pair of points into the minimum spanning tree.
+        Distance threshold determining how close two features must be in order to consider merge/splitting.
         Default is 25000 meters.

     frame_len : float, optional
-        Threshold for the spanning length within which two points can be separated.
+        Threshold for the maximum number of frames that can separate the end of a cell and the start of a related cell.
         Default is five (5) frames.
-    dxy : float, optional
-        The x/y grid spacing of the data.
-        Default is 500m.
-
     Returns
     -------

@@ -58,63 +59,43 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5):
     except ImportError:
         raise ImportError("Cannot merge/split; please install the networkx package.")

-    # if networkx:
-    #     import networkx as nx
-    # else:
-    #     print("Cannot Merge/Split. Please install networkx.")
     import logging
     import numpy as np
     from pandas.core.common import flatten
     import xarray as xr
-
-    # Check if dxy is in meters or in kilometers. It should be in meters.
-    if dxy <= 5:
-        dxy *= 1000
+    from scipy.spatial.distance import cdist

     # Immediately convert pandas dataframe of track information to xarray:
     TRACK = TRACK.to_xarray()
     track_groups = TRACK.groupby("cell")
-    cell_ids = {cid: len(v) for cid, v in track_groups.groups.items()}
-    id_data = np.fromiter(cell_ids.keys(), dtype=int)
-    count_data = np.fromiter(cell_ids.values(), dtype=int)
-    all_frames = np.sort(np.unique(TRACK.frame))
-    a_points = list()
-    b_points = list()
+    first = track_groups.first()
+    last = track_groups.last()
+
     a_names = list()
     b_names = list()
     dist = list()

-    for i in id_data:
-        # print(i)
-        a_pointx = track_groups[i].hdim_2[-1].values * dxy
-        a_pointy = track_groups[i].hdim_1[-1].values * dxy
-        for j in id_data:
-            b_pointx = track_groups[j].hdim_2[0].values * dxy
-            b_pointy = track_groups[j].hdim_1[0].values * dxy
-            d = np.linalg.norm(
-                np.array((a_pointx, a_pointy)) - np.array((b_pointx, b_pointy))
-            )
-            if d <= distance:
-                a_points.append([a_pointx, a_pointy])
-                b_points.append([b_pointx, b_pointy])
-                dist.append(d)
-                a_names.append(i)
-                b_names.append(j)
-
-    id = []
-    # This is removing any tracks that have matched to themselves - e.g.,
-    # a beginning of a track has found its tail.
-    for i in range(len(dist) - 1, -1, -1):
-        if a_names[i] == b_names[i]:
-            id.append(i)
-            a_points.pop(i)
-            b_points.pop(i)
-            dist.pop(i)
-            a_names.pop(i)
-            b_names.pop(i)
-        else:
-            continue
+    # write all sets of points (a and b) as Nx2 arrays
+    l = len(last["hdim_2"].values)
+    cells = first["cell"].values
+    a_xy = np.zeros((l, 2))
+    a_xy[:, 0] = last["hdim_2"].values * dxy
+    a_xy[:, 1] = last["hdim_1"].values * dxy
+    b_xy = np.zeros((l, 2))
+    b_xy[:, 0] = first["hdim_2"].values * dxy
+    b_xy[:, 1] = first["hdim_1"].values * dxy
+
+    # Use cdist to find distance matrix
+    out = cdist(a_xy, b_xy)
+    # Find all cells under the distance threshold
+    j = np.where(out <= distance)
+
+    # Compile cells meeting the criteria to an array of both the distance and cell ids
+    a_names = cells[j[0]]
+    b_names = cells[j[1]]
+    dist = out[j]
+
     # This is inputting data to the object which will perform the spanning tree.
     g = nx.Graph()
     for i in np.arange(len(dist)):
         g.add_edge(a_names[i], b_names[i], weight=dist[i])
@@ -123,6 +104,8 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5):
     tree_list = list(tree)

     new_tree = []
+
+    #Pruning the tree for time limits.
     for i, j in enumerate(tree_list):
         frame_a = np.nanmax(track_groups[j[0]].frame.values)
         frame_b = np.nanmin(track_groups[j[1]].frame.values)
@@ -136,6 +119,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5):
     )
     track_id = dict()  # same size as number of total merged tracks

+    #Cleaning up tracks, combining tracks which contain the same cells.
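+    # (The loop below grows each group of connected cell ids until its
+    # membership stops changing, so cells linked through any chain of
+    # merges/splits end up in a single track.)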
arr = np.array([0]) for p in cell_id: j = np.where(arr == int(p)) @@ -171,20 +155,12 @@ def merge_split(TRACK, distance=25000, frame_len=5, dxy=500): track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) - # track_ids = np.array(list(track_id.keys())) - # logging.debug("found track ids") - - cell_id = list( - np.unique( - TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] - ) - ) + cell_id = list(np.unique(TRACK.cell.values.astype(int))) logging.debug("found cell ids") cell_parent_track_id = [] - for i, id in enumerate(track_id): - + for i, id in enumerate(track_id, start=-1): if len(track_id[int(id)]) == 1: cell_parent_track_id.append(int(i)) diff --git a/tobac/utils.py b/tobac/utils.py index 854bf178..d665d55c 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -932,12 +932,14 @@ def compress_all(nc_grids, min_dims=2, comp_level=4): def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): """ + CAUTION: this function is experimental. No data structures output are guaranteed to be supported in future versions of tobac. + Combine a feature mask with the feature data table into a common dataset. returned by tobac.segmentation with the TrackedFeatures dataset returned by tobac.linking_trackpy. - Also rename the variables to be more desciptive and comply with cf-tree. + Also rename the variables to be more descriptive and comply with cf-tree. Convert the default cell parent ID to an integer table. From 53cf5b35437eee2b9fd21b75ec25a6dff3247ca5 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Mon, 3 Oct 2022 22:04:19 -0400 Subject: [PATCH 17/29] Black formatting --- tobac/merge_split.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 837e8185..509b282a 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -39,7 +39,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): Parent/child variables include: - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. - - feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. + - feature_parent_cell_id: The associated parent cell id for each feature. All features in a given cell will have the same cell id. This is the original TRACK cell_id. - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. - track_child_cell_count: The total number of features belonging to all child cells of a given track id. - cell_child_feature_count: The total number of features for each cell. @@ -104,8 +104,8 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): tree_list = list(tree) new_tree = [] - - #Pruning the tree for time limits. + + # Pruning the tree for time limits. for i, j in enumerate(tree_list): frame_a = np.nanmax(track_groups[j[0]].frame.values) frame_b = np.nanmin(track_groups[j[1]].frame.values) @@ -119,7 +119,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): ) track_id = dict() # same size as number of total merged tracks - #Cleaning up tracks, combining tracks which contain the same cells. + # Cleaning up tracks, combining tracks which contain the same cells. 
arr = np.array([0]) for p in cell_id: j = np.where(arr == int(p)) From f23b6b3321d46c407e765ffaf038d88f8bd238f4 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Tue, 11 Oct 2022 11:37:13 -0500 Subject: [PATCH 18/29] Testing update --- tobac/merge_split.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 509b282a..911b58cd 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -226,11 +226,6 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): assert len(cell_id) == len(cell_parent_track_id) assert len(feature_id) == len(feature_parent_cell_id) assert sum(track_child_cell_count) == len(cell_id) - assert sum( - [ - sum(cell_child_feature_count[1:]), - (len(np.where(feature_parent_track_id < 0)[0])), - ] - ) == len(feature_id) + assert sum(cell_child_feature_count) == len(feature_id) return d From dd88c30d6d4a94c87405e84accd0cd4b48c9e5a2 Mon Sep 17 00:00:00 2001 From: Juli Date: Tue, 25 Oct 2022 09:34:16 +0200 Subject: [PATCH 19/29] removed redundant text and fixed formatting for RTD page --- doc/merge_split.rst | 43 ++++++++------------------------- doc/merge_split_output_vars.rst | 9 ------- 2 files changed, 10 insertions(+), 42 deletions(-) delete mode 100644 doc/merge_split_output_vars.rst diff --git a/doc/merge_split.rst b/doc/merge_split.rst index b01168c1..6ea3b0ce 100644 --- a/doc/merge_split.rst +++ b/doc/merge_split.rst @@ -1,58 +1,35 @@ -*Merge and Split +Merge and Split ====================== - This submodule is a post processing step to address tracked cells which merge/split. The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. This submodule will label merged/split cells with a TRACK number in addition to its CELL number. Features, cells, and tracks are combined using parent/child nomenclature. (quick note on terms; “feature” is a detected object at a single time step. “cell” is a series of features linked together over multiple timesteps. "track" may be an individual cell or series of cells which have merged and/or split.) -*Merge and Split -====================== - - -This submodule is a post processing step to address tracked cells which merge/split. -The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. -This submodule will label merged/split cells with a TRACK number in addition to its CELL number. -Features, cells, and tracks are combined using parent/child nomenclature. -(quick note on terms; “feature” is a detected object at a single time step. “cell” is a series of features linked together over multiple timesteps. "track" may be an individual cell or series of cells which have merged and/or split.) +Overview of the output dataframe from merge_split +d : `xarray.core.dataset.Dataset` -Overview of the output dataframe from merge_split -d : xarray.core.dataset.Dataset - xarray dataset of tobac merge/split cells with parent and child designations. +xarray dataset of tobac merge/split cells with parent and child designations. Parent/child variables include: -- cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. -- feature_parent_cell_id: The associated parent cell id for each feature. 
All features in a given cell will have the same cell id.

* feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number.

* track_child_cell_count: The total number of features belonging to all child cells of a given track id.

* cell_child_feature_count: The total number of features for each cell.


Example usage:

``d = merge_split_cells(Track, dxy)``

merge_split outputs an `xarray` dataset with several variables. The variables, (with column names listed in the `Variable Name` column), are described below with units. Coordinates and dataset dimensions are Feature, Cell, and Track.

Variables that are common to all feature detection files:

.. csv-table:: tobac Merge_Split Track Output Variables
   :file: ./merge_split_out_vars.csv
   :widths: 3, 35, 3, 3
   :header-rows: 1
diff --git a/doc/merge_split_output_vars.rst b/doc/merge_split_output_vars.rst
deleted file mode 100644
index a17ae327..00000000
--- a/doc/merge_split_output_vars.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-merge_split outputs an `xarray` dataset with several variables. The variables, (with column names listed in the `Variable Name` column), are described below with units. Coordinates and dataset dimensions are Feature, Cell, and Track.
-
-Variables that are common to all feature detection files:
-
-.. csv-table:: tobac Merge_Split Track Output Variables
-   :file: ./merge_split_out_vars.csv
-   :widths: 3, 35, 3, 3
-   :header-rows: 1
-

From 9c0e33ecfed057fa27881508f1fd6a3c07ae75d1 Mon Sep 17 00:00:00 2001
From: Sean Freeman
Date: Mon, 31 Oct 2022 09:59:55 -0500
Subject: [PATCH 20/29] black formatting

---
 tobac/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tobac/utils.py b/tobac/utils.py
index 32f34d4b..0ee24a8d 100644
--- a/tobac/utils.py
+++ b/tobac/utils.py
@@ -895,6 +895,7 @@ def spectral_filtering(
     else:
         return filtered_field

+
 def compress_all(nc_grids, min_dims=2, comp_level=4):
     """
     The purpose of this subroutine is to compress the netcdf variables as they are saved.
@@ -1067,6 +1068,7 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): return ds + def combine_tobac_feats(list_of_feats, preserve_old_feat_nums=None): """Function to combine a list of tobac feature detection dataframes into one combined dataframe that can be used for tracking From 7e626529fda3f357d1cad3447b404494729df22a Mon Sep 17 00:00:00 2001 From: Sean Freeman Date: Mon, 31 Oct 2022 11:01:01 -0500 Subject: [PATCH 21/29] added merge_split to init; changed start track number to 0 from -1 --- tobac/__init__.py | 1 + tobac/merge_split.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tobac/__init__.py b/tobac/__init__.py index f1aeb95e..6d7627bf 100644 --- a/tobac/__init__.py +++ b/tobac/__init__.py @@ -74,6 +74,7 @@ from .tracking import linking_trackpy from .wrapper import maketrack from .wrapper import tracking_wrapper +from . import merge_split # Set version number __version__ = "1.4.0" diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 911b58cd..02ff5804 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -160,7 +160,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): cell_parent_track_id = [] - for i, id in enumerate(track_id, start=-1): + for i, id in enumerate(track_id, start=0): if len(track_id[int(id)]) == 1: cell_parent_track_id.append(int(i)) From a40b27d754835e0cc701f2f496968eebc557d061 Mon Sep 17 00:00:00 2001 From: Sean Freeman Date: Mon, 31 Oct 2022 11:10:10 -0500 Subject: [PATCH 22/29] added basic merge test --- tobac/tests/test_merge_split.py | 96 +++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 tobac/tests/test_merge_split.py diff --git a/tobac/tests/test_merge_split.py b/tobac/tests/test_merge_split.py new file mode 100644 index 00000000..afc817ab --- /dev/null +++ b/tobac/tests/test_merge_split.py @@ -0,0 +1,96 @@ +"""Module to test tobac.merge_split +""" + +import pandas as pd +import numpy as np + +import datetime + +import tobac.testing as tbtest +import tobac.tracking as tbtrack +import tobac.merge_split as mergesplit + + +def test_merge_split_cells(): + """Tests tobac.merge_split.merge_split_cells by + generating two cells, colliding them into one another, + and merging them. 
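+
+    The synthetic features are built with tbtest.generate_single_feature and
+    linked into cells with tobac.tracking.linking_trackpy before the merge/split
+    postprocessing is applied; a third cell placed downstream of the collision
+    point provides the post-merge features, and the merge distance threshold
+    tested below is derived from the frame-1 feature separation.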
+ """ + + cell_1 = tbtest.generate_single_feature( + 1, + 1, + min_h1=0, + max_h1=100, + min_h2=0, + max_h2=100, + frame_start=0, + num_frames=2, + spd_h1=20, + spd_h2=20, + start_date=datetime.datetime(2022, 1, 1, 0), + ) + cell_1["feature"] = [1, 3] + + cell_2 = tbtest.generate_single_feature( + 1, + 100, + min_h1=0, + max_h1=100, + min_h2=0, + max_h2=100, + frame_start=0, + num_frames=2, + spd_h1=20, + spd_h2=-20, + start_date=datetime.datetime(2022, 1, 1, 0), + ) + cell_2["feature"] = [2, 4] + cell_3 = tbtest.generate_single_feature( + 30, + 50, + min_h1=0, + max_h1=100, + min_h2=0, + max_h2=100, + frame_start=2, + num_frames=2, + spd_h1=20, + spd_h2=0, + start_date=datetime.datetime(2022, 1, 1, 0, 10), + ) + cell_3["feature"] = [5, 6] + features = pd.concat([cell_1, cell_2, cell_3]) + output = tbtrack.linking_trackpy(features, None, 1, 1, v_max=40) + + dist_between = np.sqrt( + np.power( + output[output["frame"] == 1].iloc[0]["hdim_1"] + - output[output["frame"] == 1].iloc[1]["hdim_1"], + 2, + ) + + np.power( + output[output["frame"] == 1].iloc[0]["hdim_2"] + - output[output["frame"] == 1].iloc[1]["hdim_2"], + 2, + ) + ) + + # Test a successful merger + mergesplit_output_merged = mergesplit.merge_split_cells( + output, dxy=1, distance=dist_between + 50 + ) + + # These cells should have merged together. + assert len(mergesplit_output_merged["track"]) == 1 + + # Test an unsuccessful merger. + mergesplit_output_unmerged = mergesplit.merge_split_cells( + output, dxy=1, distance=dist_between - 50 + ) + + # These cells should NOT have merged together. + print(mergesplit_output_unmerged["track"]) + assert len(mergesplit_output_unmerged["track"]) == 2 + + pass From fa09e968633204dfdb652bcb5f531d6e29395bce Mon Sep 17 00:00:00 2001 From: Sean Freeman Date: Mon, 31 Oct 2022 11:27:38 -0500 Subject: [PATCH 23/29] removed final pass --- tobac/tests/test_merge_split.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tobac/tests/test_merge_split.py b/tobac/tests/test_merge_split.py index afc817ab..fbc51f88 100644 --- a/tobac/tests/test_merge_split.py +++ b/tobac/tests/test_merge_split.py @@ -92,5 +92,3 @@ def test_merge_split_cells(): # These cells should NOT have merged together. print(mergesplit_output_unmerged["track"]) assert len(mergesplit_output_unmerged["track"]) == 2 - - pass From 5354ebee79a617db71357ceb6f32fc429c82c2c6 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Fri, 4 Nov 2022 21:31:27 -0500 Subject: [PATCH 24/29] Update dimension names, correct the merged track numbering, etc. This update also includes a fix to an existing error in the merge_split algorithm that incorrectly multiplied the dimension and not values. --- tobac/merge_split.py | 28 ++++++++++++++++------------ tobac/utils.py | 2 +- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 02ff5804..23ec76f9 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -8,7 +8,7 @@ """ -def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): +def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): """ function to postprocess tobac track data for merge/split cells @@ -25,7 +25,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): distance : float, optional Distance threshold determining how close two features must be in order to consider merge/splitting. - Default is 25000 meters. + Default is 10x the x/y grid spacing of the data, given in dxy. 
frame_len : float, optional Threshold for the maximum number of frames that can separate the end of cell and the start of a related cell. @@ -53,7 +53,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) """ - + print('update') try: import networkx as nx except ImportError: @@ -71,6 +71,9 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): first = track_groups.first() last = track_groups.last() + if distance == None: + distance = dxy*25. + a_names = list() b_names = list() dist = list() @@ -84,7 +87,8 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): b_xy = np.zeros((l, 2)) b_xy[:, 0] = first["hdim_2"].values * dxy b_xy[:, 1] = first["hdim_1"].values * dxy - + a_xy = a_xy*dxy + b_xy = b_xy*dxy # Use cdist to find distance matrix out = cdist(a_xy, b_xy) # Find all cells under the distance threshold @@ -134,7 +138,7 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): temp1 = list(np.unique(new_tree_arr[k[0]])) temp = list(np.unique(new_tree_arr[k[0]])) - for l in range(15): + for l in range(len(cell_id)): for i in temp1: k2 = np.where(new_tree_arr == i) temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) @@ -204,9 +208,9 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): cell_child_feature_count.append(len(track_groups[id].feature.values)) logging.debug("found cell child feature count") - track_dim = "tracks" - cell_dim = "cells" - feature_dim = "features" + track_dim = "track" + cell_dim = "cell" + feature_dim = "feature" d = xr.Dataset( { @@ -223,9 +227,9 @@ def merge_split_cells(TRACK, dxy, distance=25000, frame_len=5): d = d.set_coords(["feature", "cell", "track"]) - assert len(cell_id) == len(cell_parent_track_id) - assert len(feature_id) == len(feature_parent_cell_id) - assert sum(track_child_cell_count) == len(cell_id) - assert sum(cell_child_feature_count) == len(feature_id) +# assert len(cell_id) == len(cell_parent_track_id) +# assert len(feature_id) == len(feature_parent_cell_id) +# assert sum(track_child_cell_count) == len(cell_id) +# assert sum(cell_child_feature_count) == len(feature_id) return d diff --git a/tobac/utils.py b/tobac/utils.py index 0ee24a8d..bedcde1c 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -1012,7 +1012,7 @@ def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", ), "feature": ( - "feature_id", + "feature", "Unique number of the feature; starts from 1 and increments by 1 to the number of features", ), "time": ( From 9e722f8a4d49ea5b863300d14199958eb00afda4 Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Fri, 4 Nov 2022 21:34:39 -0500 Subject: [PATCH 25/29] Updated for formatting --- tobac/merge_split.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 23ec76f9..72fe2f7b 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -53,7 +53,7 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) """ - print('update') + print("update") try: import networkx as nx except ImportError: @@ -72,8 +72,8 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): last = track_groups.last() if distance == None: - distance = dxy*25. 
-
+        distance = dxy * 25.0
+
     a_names = list()
     b_names = list()
     dist = list()
@@ -87,8 +87,8 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5):
     b_xy = np.zeros((l, 2))
     b_xy[:, 0] = first["hdim_2"].values * dxy
     b_xy[:, 1] = first["hdim_1"].values * dxy
-    a_xy = a_xy*dxy
-    b_xy = b_xy*dxy
+    a_xy = a_xy * dxy
+    b_xy = b_xy * dxy
     # Use cdist to find distance matrix
     out = cdist(a_xy, b_xy)
     # Find all cells under the distance threshold
@@ -227,9 +227,9 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5):
 
     d = d.set_coords(["feature", "cell", "track"])
 
-# assert len(cell_id) == len(cell_parent_track_id)
-# assert len(feature_id) == len(feature_parent_cell_id)
-# assert sum(track_child_cell_count) == len(cell_id)
-# assert sum(cell_child_feature_count) == len(feature_id)
+    # assert len(cell_id) == len(cell_parent_track_id)
+    # assert len(feature_id) == len(feature_parent_cell_id)
+    # assert sum(track_child_cell_count) == len(cell_id)
+    # assert sum(cell_child_feature_count) == len(feature_id)
 
     return d

From 019d0f344b1a0802e40e8779e6012852a4c19802 Mon Sep 17 00:00:00 2001
From: kelcyno <88055123+kelcyno@users.noreply.github.com>
Date: Sat, 5 Nov 2022 08:45:29 -0500
Subject: [PATCH 26/29] Removed unnecessary print statement

---
 tobac/merge_split.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tobac/merge_split.py b/tobac/merge_split.py
index 72fe2f7b..2bac5647 100644
--- a/tobac/merge_split.py
+++ b/tobac/merge_split.py
@@ -53,7 +53,6 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5):
     both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc'))
 
     """
-    print("update")
     try:
         import networkx as nx
     except ImportError:

From ef48aa9c6c26fd4e14008e34281f60f6db0c8ffd Mon Sep 17 00:00:00 2001
From: Sean Freeman
Date: Mon, 7 Nov 2022 08:00:17 -0600
Subject: [PATCH 27/29] added merge_split to the API reference

---
 doc/tobac.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/doc/tobac.rst b/doc/tobac.rst
index 6f7f4ccb..ac0d88fe 100644
--- a/doc/tobac.rst
+++ b/doc/tobac.rst
@@ -28,6 +28,14 @@ tobac.feature\_detection module
    :undoc-members:
    :show-inheritance:
 
+tobac.merge_split module
+------------------------
+
+.. automodule:: tobac.merge_split
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 tobac.plotting module
 ---------------------

From 4b1101568197b224eaba2d5a65aab0337ad2f5bb Mon Sep 17 00:00:00 2001
From: kelcyno <88055123+kelcyno@users.noreply.github.com>
Date: Mon, 7 Nov 2022 12:37:08 -0600
Subject: [PATCH 28/29] Updates with documentation and removing duplicated dxy
 lines in merge_split.

---
 doc/merge_split.rst          | 2 +-
 doc/merge_split_out_vars.csv | 2 +-
 tobac/merge_split.py         | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/merge_split.rst b/doc/merge_split.rst
index 6ea3b0ce..31811661 100644
--- a/doc/merge_split.rst
+++ b/doc/merge_split.rst
@@ -6,7 +6,7 @@ The first iteration of this module is to combine the cells which are merging but
 This submodule will label merged/split cells with a TRACK number in addition to its CELL number.
 Features, cells, and tracks are combined using parent/child nomenclature.
 
-(quick note on terms; “feature” is a detected object at a single time step. “cell” is a series of features linked together over multiple timesteps. "track" may be an individual cell or series of cells which have merged and/or split.)
+(quick note on terms; “feature” is a detected object at a single time step (see :doc:`feature_detection_overview`).
“cell” is a series of features linked together over multiple timesteps (see :doc:`linking`). "track" may be an individual cell or series of cells which have merged and/or split.) Overview of the output dataframe from merge_split diff --git a/doc/merge_split_out_vars.csv b/doc/merge_split_out_vars.csv index 38b86679..b4b6069c 100644 --- a/doc/merge_split_out_vars.csv +++ b/doc/merge_split_out_vars.csv @@ -1,6 +1,6 @@ Variable Name,Description,Units,Type feature,Unique number of the feature; starts from 1 and increments by 1 to the number of features identified in all frames,n/a,int64 -Cell,Tracked cell number; generally starts from 1. Untracked cell value is -1.,n/a,int64 +cell,Tracked cell number; generally starts from 1. Untracked cell value is -1.,n/a,int64 track,Unique number of the track; starts from 0 and increments by 1 to the number of tracks identified. Untracked cells and features have a track id of -1.,n/a,int64 cell_parent_track_id,"The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id.",n/a,int64 feature_parent_cell_id,The associated parent cell id for each feature. All feature in a given cell will have the same cell id.,n/a,int64 diff --git a/tobac/merge_split.py b/tobac/merge_split.py index 2bac5647..1dbe2ae4 100644 --- a/tobac/merge_split.py +++ b/tobac/merge_split.py @@ -25,7 +25,8 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): distance : float, optional Distance threshold determining how close two features must be in order to consider merge/splitting. - Default is 10x the x/y grid spacing of the data, given in dxy. + Default is 25x the x/y grid spacing of the data, given in dxy. + The distance should be in units of meters. frame_len : float, optional Threshold for the maximum number of frames that can separate the end of cell and the start of a related cell. @@ -70,7 +71,7 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): first = track_groups.first() last = track_groups.last() - if distance == None: + if distance is None: distance = dxy * 25.0 a_names = list() @@ -86,8 +87,7 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): b_xy = np.zeros((l, 2)) b_xy[:, 0] = first["hdim_2"].values * dxy b_xy[:, 1] = first["hdim_1"].values * dxy - a_xy = a_xy * dxy - b_xy = b_xy * dxy + # Use cdist to find distance matrix out = cdist(a_xy, b_xy) # Find all cells under the distance threshold From 800fdfe6fdea261d742595762340879e749e4a0e Mon Sep 17 00:00:00 2001 From: kelcyno <88055123+kelcyno@users.noreply.github.com> Date: Tue, 8 Nov 2022 11:23:52 -0600 Subject: [PATCH 29/29] Renamed the function using the postfix MEST, and corrected a track numbering error which incorrectly numbered tracks for merged cells. --- doc/merge_split.rst | 3 ++- tobac/merge_split.py | 35 ++++++++++++--------------------- tobac/tests/test_merge_split.py | 6 +++--- 3 files changed, 18 insertions(+), 26 deletions(-) diff --git a/doc/merge_split.rst b/doc/merge_split.rst index 31811661..b6b56cbd 100644 --- a/doc/merge_split.rst +++ b/doc/merge_split.rst @@ -3,6 +3,7 @@ Merge and Split This submodule is a post processing step to address tracked cells which merge/split. The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. 
+This module uses a minimum Euclidean spanning tree to combine merging cells, thus the postfix for the function is MEST.
 This submodule will label merged/split cells with a TRACK number in addition to its CELL number.
 Features, cells, and tracks are combined using parent/child nomenclature.
@@ -29,7 +30,7 @@ Parent/child variables include:
 
 Example usage:
 
-``d = merge_split(Track)``
+``d = merge_split_MEST(Track)``
 
 merge_split outputs an `xarray` dataset with several variables. The variables (with column names listed in the `Variable Name` column) are described below with units. Coordinates and dataset dimensions are Feature, Cell, and Track.
diff --git a/tobac/merge_split.py b/tobac/merge_split.py
index 1dbe2ae4..6e53eb97 100644
--- a/tobac/merge_split.py
+++ b/tobac/merge_split.py
@@ -8,9 +8,9 @@
 """
 
-def merge_split_cells(TRACK, dxy, distance=None, frame_len=5):
+def merge_split_MEST(TRACK, dxy, distance=None, frame_len=5):
     """
-    function to postprocess tobac track data for merge/split cells
+    function to postprocess tobac track data for merge/split cells using a minimum Euclidean spanning tree
 
     Parameters
 
@@ -47,7 +47,7 @@ def merge_split_MEST(TRACK, dxy, distance=None, frame_len=5):
 
     Example usage:
-    d = merge_split(Track)
+    d = merge_split_MEST(Track)
     ds = tobac.utils.standardize_track_dataset(Track, refl_mask)
     both_ds = xr.merge([ds, d],compat ='override')
     both_ds = tobac.utils.compress_all(both_ds)
@@ -87,7 +87,6 @@ def merge_split_MEST(TRACK, dxy, distance=None, frame_len=5):
     b_xy = np.zeros((l, 2))
     b_xy[:, 0] = first["hdim_2"].values * dxy
     b_xy[:, 1] = first["hdim_1"].values * dxy
-
     # Use cdist to find distance matrix
     out = cdist(a_xy, b_xy)
     # Find all cells under the distance threshold
@@ -161,16 +160,13 @@ def merge_split_MEST(TRACK, dxy, distance=None, frame_len=5):
     cell_id = list(np.unique(TRACK.cell.values.astype(int)))
     logging.debug("found cell ids")
 
-    cell_parent_track_id = []
+    cell_parent_track_id = np.zeros(len(cell_id))
+    cell_parent_track_id[:] = -1
 
     for i, id in enumerate(track_id, start=0):
-        if len(track_id[int(id)]) == 1:
-            cell_parent_track_id.append(int(i))
-
-        else:
-            cell_parent_track_id.append(np.repeat(int(i), len(track_id[int(id)])))
+        for j in track_id[int(id)]:
+            cell_parent_track_id[cell_id.index(j)] = int(i)
 
-    cell_parent_track_id = list(flatten(cell_parent_track_id))
     logging.debug("found cell parent track ids")
 
     track_ids = np.array(np.unique(cell_parent_track_id))
@@ -179,7 +175,7 @@ def merge_split_MEST(TRACK, dxy, distance=None, frame_len=5):
     feature_parent_cell_id = list(TRACK.cell.values.astype(int))
     logging.debug("found feature parent cell ids")
 
-    # This version includes all the features regardless of whether they are used in cells or not.
+    # # This version includes all the features regardless of whether they are used in cells or not.
feature_id = list(TRACK.feature.values.astype(int)) logging.debug("found feature ids") @@ -190,21 +186,16 @@ def merge_split_cells(TRACK, dxy, distance=None, frame_len=5): if cellid < 0: feature_parent_track_id[i] = -1 else: - j = np.where(cell_id == cellid) - j = np.squeeze(j) - trackid = cell_parent_track_id[j] - feature_parent_track_id[i] = trackid - - logging.debug("found feature parent track ids") + feature_parent_track_id[i] = cell_parent_track_id[cell_id.index(cellid)] - track_child_cell_count = [] + track_child_cell_count = np.zeros(len(track_id)) for i, id in enumerate(track_id): - track_child_cell_count.append(len(track_id[int(id)])) + track_child_cell_count[i] = len(np.where(cell_parent_track_id == i)[0]) logging.debug("found track child cell count") - cell_child_feature_count = [] + cell_child_feature_count = np.zeros(len(cell_id)) for i, id in enumerate(cell_id): - cell_child_feature_count.append(len(track_groups[id].feature.values)) + cell_child_feature_count[i] = len(track_groups[id].feature.values) logging.debug("found cell child feature count") track_dim = "track" diff --git a/tobac/tests/test_merge_split.py b/tobac/tests/test_merge_split.py index fbc51f88..389c3dc7 100644 --- a/tobac/tests/test_merge_split.py +++ b/tobac/tests/test_merge_split.py @@ -11,7 +11,7 @@ import tobac.merge_split as mergesplit -def test_merge_split_cells(): +def test_merge_split_MEST(): """Tests tobac.merge_split.merge_split_cells by generating two cells, colliding them into one another, and merging them. @@ -77,7 +77,7 @@ def test_merge_split_cells(): ) # Test a successful merger - mergesplit_output_merged = mergesplit.merge_split_cells( + mergesplit_output_merged = mergesplit.merge_split_MEST( output, dxy=1, distance=dist_between + 50 ) @@ -85,7 +85,7 @@ def test_merge_split_cells(): assert len(mergesplit_output_merged["track"]) == 1 # Test an unsuccessful merger. - mergesplit_output_unmerged = mergesplit.merge_split_cells( + mergesplit_output_unmerged = mergesplit.merge_split_MEST( output, dxy=1, distance=dist_between - 50 )
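Taken together, the series leaves merge_split_MEST as the public entry point. A short end-to-end sketch of the final API, following the example usage in the module docstring above; here `Track` (output of tobac.tracking.linking_trackpy) and `refl_mask` (a segmentation mask) are assumed to already exist, and dxy=500.0 is an illustrative grid spacing rather than a value from these patches:

    import xarray as xr
    import tobac.merge_split as mergesplit
    import tobac.utils as tbutils

    # Label merged/split cells with parent track ids; the distance
    # threshold defaults to 25 * dxy (meters) when not given explicitly.
    d = mergesplit.merge_split_MEST(Track, dxy=500.0)

    # Combine the standardized track/mask dataset with the merge/split
    # parent-child table and write a compressed NetCDF, as in the docstring.
    ds = tbutils.standardize_track_dataset(Track, refl_mask)
    both_ds = xr.merge([ds, d], compat="override")
    both_ds = tbutils.compress_all(both_ds)
    both_ds.to_netcdf("Track_features_merges.nc")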