Feature/issue 260 #261

Merged · 10 commits · May 2, 2024

Changes from all commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Added
- [issue/260](https://github.com/podaac/l2ss-py/pull/261): Add GPM cleanup function that creates a timeMidScan variable when one isn't present. The function combines the Year, Month, DayOfMonth, etc. ScanTime variables into a single time variable using the datetime library.
### Changed
- Update code to determine lat, lon, and time variables
- Update xarray version
42 changes: 41 additions & 1 deletion podaac/subsetter/gpm_cleanup.py
@@ -3,10 +3,34 @@
to nscan, nbin, nfreq by using the DimensionNames variable attribute
"""

import datetime
from netCDF4 import date2num # pylint: disable=no-name-in-module

dim_dict = {}


def change_var_dims(nc_dataset, variables=None):
def compute_new_time_data(time_group, nc_dataset):
"""
Create a time variable, timeMidScan, that is present in other
GPM collections but not the ENV collections.
"""
# set the time unit for GPM
time_unit_out = "seconds since 1980-01-06 00:00:00"
# convert to a float seconds variable
new_time_list = [date2num(datetime.datetime(
nc_dataset[time_group+'__Year'][:][i],
nc_dataset[time_group+'__Month'][:][i],
nc_dataset[time_group+'__DayOfMonth'][:][i],
hour=nc_dataset[time_group+'__Hour'][:][i],
minute=nc_dataset[time_group+'__Minute'][:][i],
second=nc_dataset[time_group+'__Second'][:][i],
microsecond=nc_dataset[time_group+'__MilliSecond'][:][i]*1000),
time_unit_out) for i in range(len(nc_dataset[time_group+'__Year'][:]))]

return new_time_list, time_unit_out


def change_var_dims(nc_dataset, variables=None, time_name="_timeMidScan"):
"""
Go through each variable and get the dimension names from attribute "DimensionNames"
If the name is unique, add it as a dimension to the netCDF4 dataset. Then change the
@@ -62,4 +86,20 @@ def change_var_dims(nc_dataset, variables=None):
# copy the data to the new variable with dimension names
new_mapped_var[var_name][:] = var[:]

if not any(time_name in var for var in var_list):
# if there isn't a timeMidScan variable, create one
scan_time_groups = ["__".join(i.split('__')[:-1]) for i in var_list if 'ScanTime' in i]
for time_group in list(set(scan_time_groups)):
# get the seconds since Jan 6, 1980
time_data, time_unit = compute_new_time_data(time_group, nc_dataset)
# make a new variable for each ScanTime group
new_time_var_name = time_group+time_name
# copy dimensions from the Year variable
var_dims = nc_dataset.variables[time_group+'__Year'].dimensions
comp_args = {"zlib": True, "complevel": 1}
nc_dataset.createVariable(new_time_var_name, 'f8', var_dims, **comp_args)
nc_dataset.variables[new_time_var_name].setncattr('unit', time_unit)
# copy the data in
nc_dataset.variables[new_time_var_name][:] = time_data

return nc_dataset
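For reference, a minimal sketch (not part of this diff) of the epoch conversion that compute_new_time_data performs: a datetime is assembled from the ScanTime component values and converted with netCDF4.date2num into float seconds since the GPM epoch. The component values below are made up for illustration, chosen to land on the integer asserted in the updated test.

```python
import datetime
from netCDF4 import date2num

# GPM epoch used throughout this PR
time_unit_out = "seconds since 1980-01-06 00:00:00"

# hypothetical ScanTime components for a single scan
year, month, day = 2021, 5, 30
hour, minute, second, millisecond = 9, 57, 0, 0

scan_time = datetime.datetime(year, month, day,
                              hour=hour, minute=minute, second=second,
                              microsecond=millisecond * 1000)

# date2num returns a float offset from the epoch named in the units string
print(date2num(scan_time, time_unit_out))  # ~1306403820 seconds
```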
3 changes: 3 additions & 0 deletions podaac/subsetter/subset.py
@@ -1226,6 +1226,9 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
except AttributeError:
pass

if hdf_type == 'GPM':
args['decode_times'] = False

with xr.open_dataset(
xr.backends.NetCDF4DataStore(nc_dataset),
**args
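A note on the subset.py change above: for GPM, decode_times is turned off, presumably so that xarray keeps the newly written float seconds values as-is rather than decoding them on open. A minimal sketch of the flag's effect, with a hypothetical file name:

```python
import xarray as xr

# With decode_times=False, time variables are left as the raw numbers stored
# in the file (here, float seconds since 1980-01-06) instead of being decoded
# into datetime64 values when the dataset is opened.
ds = xr.open_dataset("gpm_granule.nc4", decode_times=False)  # hypothetical file
```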
Binary file removed tests/data/GPM/GPM_test_file.HDF5
Binary file added tests/data/GPM/GPM_test_file_2.HDF5
13 changes: 7 additions & 6 deletions tests/test_subset.py
@@ -2288,21 +2288,22 @@ def test_get_unique_groups():
assert expected_groups_single == unique_groups_single
assert expected_diff_counts_single == diff_counts_single

def test_gpm_dimension_map(data_dir, subset_output_dir, request):
"""Test GPM files for dimension mapping and returns the expected netCDF
dataset without the phony dimensions"""

def test_gpm_compute_new_var_data(data_dir, subset_output_dir, request):
"""Test GPM files that have scantime variable to compute the time for seconds
since 1980-01-06"""

gpm_dir = join(data_dir, 'GPM')
gpm_file = 'GPM_test_file.HDF5'
bbox = np.array(((-180, 180), (-90, 90)))
gpm_file = 'GPM_test_file_2.HDF5'
shutil.copyfile(
os.path.join(gpm_dir, gpm_file),
os.path.join(subset_output_dir, gpm_file)
)

nc_dataset, has_groups, file_extension = subset.open_as_nc_dataset(join(subset_output_dir, gpm_file))

nc_dataset = gc.change_var_dims(nc_dataset)
nc_dataset_new = gc.change_var_dims(nc_dataset, variables=None, time_name='__test_time')
assert int(nc_dataset_new.variables["__FS__ScanTime__test_time"][:][0]) == 1306403820

for var_name, var in nc_dataset.variables.items():
dims = list(var.dimensions)
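As a hedged sanity check on the value asserted in the new test (not part of the test suite), netCDF4.num2date can turn the expected integer back into a calendar date for inspection:

```python
from netCDF4 import num2date

epoch_units = "seconds since 1980-01-06 00:00:00"

# The updated test asserts int(timeMidScan[0]) == 1306403820; converting back
# yields a 2021 date, consistent with a real GPM scan time rather than a default.
print(num2date(1306403820, epoch_units))
```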