From ca85179fa90f5ebcbe9ea09d9d3bc8226b9454c7 Mon Sep 17 00:00:00 2001 From: vishal Date: Mon, 28 Oct 2024 09:52:25 -0500 Subject: [PATCH 01/13] initial transformation function --- data_transformation_plugins/__init__.py | 0 .../tm54dvar_ch4flux_mask_monthgrid_v5.py | 43 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 data_transformation_plugins/__init__.py create mode 100644 data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5.py diff --git a/data_transformation_plugins/__init__.py b/data_transformation_plugins/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5.py b/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5.py new file mode 100644 index 00000000..99fcc503 --- /dev/null +++ b/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5.py @@ -0,0 +1,43 @@ +import xarray +from datetime import datetime + +def tm54dvar_ch4flux_mask_monthgrid_v5_transformation(file_obj, name, nodata): + """Tranformation function for the tm5 ch4 influx dataset + + Args: + file_obj (s3fs object): s3fs sile object for one file of the dataset + name (str): name of the file to be transformed + nodata (int): Nodata value as specified by the data provider + + Returns: + dict: Dictionary with the COG name and its corresponding data array. + """ + + var_data_netcdf = {} + xds = xarray.open_dataset(file_obj) + xds = xds.rename({"latitude": "lat", "longitude": "lon"}) + xds = xds.assign_coords(lon=(((xds.lon + 180) % 360) - 180)).sortby("lon") + variable = [var for var in xds.data_vars if "global" not in var] + + for time_increment in range(0, len(xds.months)): + filename = name.split("/")[-1] + filename_elements = re.split("[_ .]", filename) + start_time = datetime(int(filename_elements[-2]), time_increment + 1, 1) + for var in variable: + data = getattr(xds.isel(months=time_increment), var) + data = data.isel(lat=slice(None, None, -1)) + data = data.where(data == nodata, -9999) + data.rio.set_spatial_dims("lon", "lat", inplace=True) + data.rio.write_crs("epsg:4326", inplace=True) + data.rio.write_nodata(-9999, inplace=True) + + # # insert date of generated COG into filename + filename_elements.pop() + filename_elements[-1] = start_time.strftime("%Y%m") + filename_elements.insert(2, var) + cog_filename = "_".join(filename_elements) + # # add extension + cog_filename = f"{cog_filename}.tif" + var_data_netcdf[cog_filename] = data + + return var_data_netcdf \ No newline at end of file From 79e5c5b38a4b6b50b0c9dd905c9fd813bb37836e Mon Sep 17 00:00:00 2001 From: vishal Date: Mon, 28 Oct 2024 09:52:50 -0500 Subject: [PATCH 02/13] initial transformation function --- .../tm54dvar_ch4flux_mask_monthgrid_v5.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5.py b/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5.py index 99fcc503..73168450 100644 --- a/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5.py +++ b/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5.py @@ -1,5 +1,6 @@ import xarray from datetime import datetime +import re def tm54dvar_ch4flux_mask_monthgrid_v5_transformation(file_obj, name, nodata): """Tranformation function for the tm5 ch4 influx dataset From c567180b6e0801aa2a0ab0d5cded59383b71ffae Mon Sep 17 00:00:00 2001 From: vishal Date: Mon, 28 Oct 2024 11:46:10 -0500 Subject: [PATCH 03/13] added transformation plugins for different datasets --- 
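Note: a minimal sketch of how these plugin functions are meant to be driven,
assuming s3fs for reading the source file and rioxarray for writing the
returned arrays out as COGs. The bucket, key, and nodata value below are
illustrative placeholders, not the actual DAG wiring.

```python
# Hypothetical driver for one plugin; bucket, key, and nodata are placeholders.
import rioxarray  # noqa: F401  (registers the .rio accessor the plugins rely on)
import s3fs

from data_transformation_plugins.tm54dvar_ch4flux_mask_monthgrid_v5 import (
    tm54dvar_ch4flux_mask_monthgrid_v5_transformation,
)

fs = s3fs.S3FileSystem()
key = "example-bucket/raw/tm54dvar/fluxes_1998.nc"  # placeholder S3 key

with fs.open(key) as file_obj:
    # Each plugin returns {cog_filename: DataArray}, one entry per variable
    # (and per timestep when the file carries a time-like dimension).
    cog_map = tm54dvar_ch4flux_mask_monthgrid_v5_transformation(
        file_obj, name=key, nodata=-9999
    )

for cog_filename, data in cog_map.items():
    # Write each georeferenced DataArray out as a Cloud Optimized GeoTIFF.
    data.rio.to_raster(cog_filename, driver="COG")
```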
data_transformation_plugins/ecco_darwin.py | 47 ++++++++++++++++++++++ data_transformation_plugins/geos_oco2.py | 38 +++++++++++++++++ data_transformation_plugins/gpw.py | 33 +++++++++++++++ data_transformation_plugins/push_to_s3.py | 35 ++++++++++++++++ 4 files changed, 153 insertions(+) create mode 100644 data_transformation_plugins/ecco_darwin.py create mode 100644 data_transformation_plugins/geos_oco2.py create mode 100644 data_transformation_plugins/gpw.py create mode 100644 data_transformation_plugins/push_to_s3.py diff --git a/data_transformation_plugins/ecco_darwin.py b/data_transformation_plugins/ecco_darwin.py new file mode 100644 index 00000000..90ac2da9 --- /dev/null +++ b/data_transformation_plugins/ecco_darwin.py @@ -0,0 +1,47 @@ +import xarray +import re + +def ecco_darwin_transformation(file_obj, name, nodata): + """Tranformation function for the ecco darwin dataset + + Args: + file_obj (s3fs object): s3fs sile object for one file of the dataset + name (str): name of the file to be transformed + nodata (int): Nodata value as specified by the data provider + + Returns: + dict: Dictionary with the COG name and its corresponding data array. + """ + var_data_netcdf = {} + xds = xarray.open_dataset(file_obj) + xds = xds.rename({"y": "latitude", "x": "longitude"}) + xds = xds.assign_coords(longitude=((xds.longitude / 1440) * 360) - 180).sortby( + "longitude" + ) + xds = xds.assign_coords(latitude=((xds.latitude / 721) * 180) - 90).sortby( + "latitude" + ) + + variable = [var for var in xds.data_vars] + + for _ in xds.time.values: + for var in variable[2:]: + filename = name.split("/")[-1] + filename_elements = re.split("[_ .]", filename) + data = xds[var] + + data = data.reindex(latitude=list(reversed(data.latitude))) + data = data.where(data == nodata, -9999) + data.rio.set_spatial_dims("longitude", "latitude", inplace=True) + data.rio.write_crs("epsg:4326", inplace=True) + data.rio.write_nodata(-9999, inplace=True) + + filename_elements.pop() + filename_elements[-1] = filename_elements[-2] + filename_elements[-1] + filename_elements.pop(-2) + # # insert date of generated COG into filename + cog_filename = "_".join(filename_elements) + # # add extension + cog_filename = f"{cog_filename}.tif" + var_data_netcdf[cog_filename] = data + return var_data_netcdf \ No newline at end of file diff --git a/data_transformation_plugins/geos_oco2.py b/data_transformation_plugins/geos_oco2.py new file mode 100644 index 00000000..f647e33d --- /dev/null +++ b/data_transformation_plugins/geos_oco2.py @@ -0,0 +1,38 @@ +import xarray +import re + +def geos_oco2_transformation(file_obj, name, nodata): + """Tranformation function for the oco2 geos dataset + + Args: + file_obj (s3fs object): s3fs sile object for one file of the dataset + name (str): name of the file to be transformed + nodata (int): Nodata value as specified by the data provider + + Returns: + dict: Dictionary with the COG name and its corresponding data array. 
+ """ + var_data_netcdf = {} + xds = xarray.open_dataset(file_obj) + xds = xds.assign_coords(lon=(((xds.lon + 180) % 360) - 180)).sortby("lon") + variable = [var for var in xds.data_vars] + for time_increment in range(0, len(xds.time)): + for var in variable: + filename = name.split("/ ")[-1] + filename_elements = re.split("[_ .]", filename) + data = getattr(xds.isel(time=time_increment), var) + data = data.isel(lat=slice(None, None, -1)) + data = data.where(data == nodata, -9999) + data.rio.set_spatial_dims("lon", "lat", inplace=True) + data.rio.write_crs("epsg:4326", inplace=True) + data.rio.write_nodata(-9999, inplace=True) + # # insert date of generated COG into filename + filename_elements[-1] = filename_elements[-3] + filename_elements.insert(2, var) + filename_elements.pop(-3) + cog_filename = "_".join(filename_elements) + # # add extension + cog_filename = f"{cog_filename}.tif" + var_data_netcdf[cog_filename] = data + + return var_data_netcdf \ No newline at end of file diff --git a/data_transformation_plugins/gpw.py b/data_transformation_plugins/gpw.py new file mode 100644 index 00000000..6f030623 --- /dev/null +++ b/data_transformation_plugins/gpw.py @@ -0,0 +1,33 @@ +import xarray +import re + +def gpw_transformation(file_obj, name, nodata): + """Tranformation function for the gridded population dataset + + Args: + file_obj (s3fs object): s3fs sile object for one file of the dataset + name (str): name of the file to be transformed + nodata (int): Nodata value as specified by the data provider + + Returns: + dict: Dictionary with the COG name and its corresponding data array. + """ + + var_data_netcdf = {} + xds = xarray.open_dataarray(file_obj, engine="rasterio") + + filename = name.split("/")[-1] + filename_elements = re.split("[_ .]", filename) + # # insert date of generated COG into filename + filename_elements.pop() + filename_elements.append(filename_elements[-3]) + xds = xds.where(xds == nodata, -9999) + xds.rio.set_spatial_dims("x", "y", inplace=True) + xds.rio.write_crs("epsg:4326", inplace=True) + xds.rio.write_nodata(-9999, inplace=True) + + cog_filename = "_".join(filename_elements) + # # add extension + cog_filename = f"{cog_filename}.tif" + var_data_netcdf[cog_filename] = xds + return var_data_netcdf \ No newline at end of file diff --git a/data_transformation_plugins/push_to_s3.py b/data_transformation_plugins/push_to_s3.py new file mode 100644 index 00000000..18617f9a --- /dev/null +++ b/data_transformation_plugins/push_to_s3.py @@ -0,0 +1,35 @@ +import boto3 +import os + +def upload_folder_to_s3(folder_path, bucket_name, s3_folder, exclude_files): + """ + Uploads all files in a folder to a specified S3 folder, excluding specified files. + + Parameters: + - folder_path (str): Path to the local folder containing files to upload. + - bucket_name (str): Name of the S3 bucket. + - s3_folder (str): Destination folder path in the S3 bucket. + - exclude_files (list): List of files to exclude from uploading. 
+    """
+    # Initialize S3 client
+    s3 = boto3.client('s3')
+
+    # Loop through files in the local folder
+    for file_name in os.listdir(folder_path):
+        file_path = os.path.join(folder_path, file_name)
+
+        # Check if it's a file and not in the exclude list
+        if os.path.isfile(file_path) and file_name not in exclude_files:
+            s3_key = os.path.join(s3_folder, file_name)
+
+            try:
+                # Upload the file
+                s3.upload_file(file_path, bucket_name, s3_key)
+                print(f"Uploaded {file_name} to {s3_key}")
+            except Exception as e:
+                print(f"Error uploading {file_name}: {e}")
+
+# Example usage:
+# upload_folder_to_s3("path/to/local/folder", "my-s3-bucket", "my/s3/folder", ["exclude1.ext", "exclude2.ext"])
+if __name__ == "__main__":
+    upload_folder_to_s3("data_transformation_plugins", "ghgc-data-store-develop", "data_transformation_plugins", ["__init__.py", "push_to_s3.py"])
\ No newline at end of file

From 08132c8f4b6089c954eaadcaa467d482ca171377 Mon Sep 17 00:00:00 2001
From: vishal
Date: Mon, 28 Oct 2024 14:47:23 -0500
Subject: [PATCH 04/13] add readme for the folder

---
 data_transformation_plugins/README.md | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 data_transformation_plugins/README.md

diff --git a/data_transformation_plugins/README.md b/data_transformation_plugins/README.md
new file mode 100644
index 00000000..008ca8b1
--- /dev/null
+++ b/data_transformation_plugins/README.md
@@ -0,0 +1,4 @@
+The naming convention for the python files is as follows:
+### collectionname_transformation
+where collection name refers to the STAC collection name that we want to provide
+for the given dataset followed by the word `transformation`. Make sure that the collection name in the function is the same as the one passed as an argument when running the DAG.
\ No newline at end of file

From 11dac6eac2fd0d7ad560a45e0abb05b07c51e403 Mon Sep 17 00:00:00 2001
From: vishal
Date: Mon, 28 Oct 2024 14:58:21 -0500
Subject: [PATCH 05/13] updated the README file

---
 data_transformation_plugins/README.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/data_transformation_plugins/README.md b/data_transformation_plugins/README.md
index 008ca8b1..26524176 100644
--- a/data_transformation_plugins/README.md
+++ b/data_transformation_plugins/README.md
@@ -1,4 +1,9 @@
-The naming convention for the python files is as follows:
-### collectionname_transformation
-where collection name refers to the STAC collection name that we want to provide
-for the given dataset followed by the word `transformation`. Make sure that the collection name in the function is the same as the one passed as an argument when running the DAG.
\ No newline at end of file
+## Information about the folder
+This folder is part of the `automation pipeline using DAG`. It contains the functions essential for transforming a given dataset into COGs. There is a python file for each dataset, and these files are used as plugins in the pipeline.
+
+## How this folder fits in the automation pipeline
+These functions/files are created by the `developer` to transform a single data file to a COG. Once the COGs are validated, these python scripts are pushed to the `GHGC-SMCE S3`. These files are then fetched by the `SM2A DAG` to complete the transformation of the entire dataset automatically.
+
+## Naming convention for the transformation files in the folder
+- `name of python file` - `collectionname_transformation.py`
+`collectionname` refers to the STAC collection name of the dataset followed by the word `transformation`.
Make sure the `collectionname` within the filename matches with the `collectionname` passed as a `parameter` to the DAG.
\ No newline at end of file

From 40f7c369a4677643544345f9d9b0d212632fa71b Mon Sep 17 00:00:00 2001
From: vishal
Date: Tue, 29 Oct 2024 12:24:45 -0500
Subject: [PATCH 06/13] updated file names as per the convention and changed
 the logic for finding files in S3

---
 ...darwin.py => ecco_darwin_tranformation.py} |  0
 ...os_oco2.py => geos_oco2_transformation.py} |  0
 .../{gpw.py => gpw_transformation.py}         |  0
 data_transformation_plugins/push_to_s3.py     | 28 +++++++++++++------
 ...4flux_mask_monthgrid_v5_transformation.py} |  0
 5 files changed, 20 insertions(+), 8 deletions(-)
 rename data_transformation_plugins/{ecco_darwin.py => ecco_darwin_tranformation.py} (100%)
 rename data_transformation_plugins/{geos_oco2.py => geos_oco2_transformation.py} (100%)
 rename data_transformation_plugins/{gpw.py => gpw_transformation.py} (100%)
 rename data_transformation_plugins/{tm54dvar_ch4flux_mask_monthgrid_v5.py => tm54dvar_ch4flux_mask_monthgrid_v5_transformation.py} (100%)

diff --git a/data_transformation_plugins/ecco_darwin.py b/data_transformation_plugins/ecco_darwin_tranformation.py
similarity index 100%
rename from data_transformation_plugins/ecco_darwin.py
rename to data_transformation_plugins/ecco_darwin_tranformation.py
diff --git a/data_transformation_plugins/geos_oco2.py b/data_transformation_plugins/geos_oco2_transformation.py
similarity index 100%
rename from data_transformation_plugins/geos_oco2.py
rename to data_transformation_plugins/geos_oco2_transformation.py
diff --git a/data_transformation_plugins/gpw.py b/data_transformation_plugins/gpw_transformation.py
similarity index 100%
rename from data_transformation_plugins/gpw.py
rename to data_transformation_plugins/gpw_transformation.py
diff --git a/data_transformation_plugins/push_to_s3.py b/data_transformation_plugins/push_to_s3.py
index 18617f9a..85d52c22 100644
--- a/data_transformation_plugins/push_to_s3.py
+++ b/data_transformation_plugins/push_to_s3.py
@@ -1,9 +1,13 @@
 import boto3
 import os
 
+import boto3
+import os
+
 def upload_files_to_s3(folder_path, bucket_name, s3_folder, exclude_files):
     """
-    Uploads all files in a folder to a specified S3 folder, excluding specified files.
+    Uploads all files in a folder to a specified S3 folder, skipping any file that
+    already exists in S3 and excluding the specified files.
 
     Parameters:
     - folder_path (str): Path to the local folder containing files to upload.
@@ -23,13 +27,21 @@ def upload_folder_to_s3(folder_path, bucket_name, s3_folder, exclude_files): s3_key = os.path.join(s3_folder, file_name) try: - # Upload the file - s3.upload_file(file_path, bucket_name, s3_key) - print(f"Uploaded {file_name} to {s3_key}") - except Exception as e: - print(f"Error uploading {file_name}: {e}") + # Check if the file already exists in S3 + s3.head_object(Bucket=bucket_name, Key=s3_key) + print(f"Skipped {file_name} (already exists in S3)") + except s3.exceptions.ClientError as e: + # If the file does not exist, upload it + if e.response['Error']['Code'] == '404': + try: + s3.upload_file(file_path, bucket_name, s3_key) + print(f"Uploaded {file_name} to {s3_key}") + except Exception as upload_error: + print(f"Error uploading {file_name}: {upload_error}") + else: + print(f"Error checking existence of {file_name}: {e}") # Example usage: # upload_folder_to_s3("path/to/local/folder", "my-s3-bucket", "my/s3/folder", ["exclude1.ext", "exclude2.ext"]) if __name__ == "__main__": - upload_folder_to_s3("data_transformation_plugins", "ghgc-data-store-develop", "data_transformation_plugins", ["__init__.py", "push_to_s3.py"]) \ No newline at end of file + upload_files_to_s3("data_transformation_plugins", "ghgc-data-store-develop", "data_transformation_plugins", ["__init__.py", "push_to_s3.py", "README.md"]) \ No newline at end of file diff --git a/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5.py b/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5_transformation.py similarity index 100% rename from data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5.py rename to data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5_transformation.py From d80620e241656b3b42b59628de900990fd109ca7 Mon Sep 17 00:00:00 2001 From: vishal Date: Tue, 29 Oct 2024 16:45:36 -0500 Subject: [PATCH 07/13] updated the logic for no data value and a script to transform gosat ch4 --- ...ation.py => ecco_darwin_transformation.py} | 2 +- .../geos_oco2_transformation.py | 2 +- .../gosat_ch4_transformation.py | 36 +++++++++++++++++++ .../gpw_transformation.py | 2 +- ...h4flux_mask_monthgrid_v5_transformation.py | 2 +- 5 files changed, 40 insertions(+), 4 deletions(-) rename data_transformation_plugins/{ecco_darwin_tranformation.py => ecco_darwin_transformation.py} (97%) create mode 100644 data_transformation_plugins/gosat_ch4_transformation.py diff --git a/data_transformation_plugins/ecco_darwin_tranformation.py b/data_transformation_plugins/ecco_darwin_transformation.py similarity index 97% rename from data_transformation_plugins/ecco_darwin_tranformation.py rename to data_transformation_plugins/ecco_darwin_transformation.py index 90ac2da9..9696a372 100644 --- a/data_transformation_plugins/ecco_darwin_tranformation.py +++ b/data_transformation_plugins/ecco_darwin_transformation.py @@ -31,7 +31,7 @@ def ecco_darwin_transformation(file_obj, name, nodata): data = xds[var] data = data.reindex(latitude=list(reversed(data.latitude))) - data = data.where(data == nodata, -9999) + data = data.where(data != nodata, -9999) data.rio.set_spatial_dims("longitude", "latitude", inplace=True) data.rio.write_crs("epsg:4326", inplace=True) data.rio.write_nodata(-9999, inplace=True) diff --git a/data_transformation_plugins/geos_oco2_transformation.py b/data_transformation_plugins/geos_oco2_transformation.py index f647e33d..4165c91e 100644 --- a/data_transformation_plugins/geos_oco2_transformation.py +++ b/data_transformation_plugins/geos_oco2_transformation.py @@ -22,7 +22,7 @@ 
def geos_oco2_transformation(file_obj, name, nodata): filename_elements = re.split("[_ .]", filename) data = getattr(xds.isel(time=time_increment), var) data = data.isel(lat=slice(None, None, -1)) - data = data.where(data == nodata, -9999) + data = data.where(data != nodata, -9999) data.rio.set_spatial_dims("lon", "lat", inplace=True) data.rio.write_crs("epsg:4326", inplace=True) data.rio.write_nodata(-9999, inplace=True) diff --git a/data_transformation_plugins/gosat_ch4_transformation.py b/data_transformation_plugins/gosat_ch4_transformation.py new file mode 100644 index 00000000..a9adbb3e --- /dev/null +++ b/data_transformation_plugins/gosat_ch4_transformation.py @@ -0,0 +1,36 @@ +import xarray +import re + +def gosat_ch4_transformation(file_obj, name, nodata): + """Tranformation function for the ecco darwin dataset + + Args: + file_obj (s3fs object): s3fs sile object for one file of the dataset + name (str): name of the file to be transformed + nodata (int): Nodata value as specified by the data provider + + Returns: + dict: Dictionary with the COG name and its corresponding data array. + """ + var_data_netcdf = {} + ds = xarray.open_dataset(file_obj) + variable = [var for var in ds.data_vars] + + for var in variable: + filename = name.split("/")[-1] + filename_elements = re.split("[_ .]", filename) + data = ds[var] + filename_elements.pop() + filename_elements.insert(2, var) + cog_filename = "_".join(filename_elements) + # # add extension + cog_filename = f"{cog_filename}.tif" + + data = data.reindex(lat=list(reversed(data.lat))) + data = data.where(data != -9999, -9999) + data.rio.write_nodata(-9999, inplace=True) + + data.rio.set_spatial_dims("lon", "lat") + data.rio.write_crs("epsg:4326", inplace=True) + var_data_netcdf[cog_filename] = data + return var_data_netcdf \ No newline at end of file diff --git a/data_transformation_plugins/gpw_transformation.py b/data_transformation_plugins/gpw_transformation.py index 6f030623..5849f350 100644 --- a/data_transformation_plugins/gpw_transformation.py +++ b/data_transformation_plugins/gpw_transformation.py @@ -21,7 +21,7 @@ def gpw_transformation(file_obj, name, nodata): # # insert date of generated COG into filename filename_elements.pop() filename_elements.append(filename_elements[-3]) - xds = xds.where(xds == nodata, -9999) + xds = xds.where(xds != nodata, -9999) xds.rio.set_spatial_dims("x", "y", inplace=True) xds.rio.write_crs("epsg:4326", inplace=True) xds.rio.write_nodata(-9999, inplace=True) diff --git a/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5_transformation.py b/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5_transformation.py index 73168450..7565ec8a 100644 --- a/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5_transformation.py +++ b/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5_transformation.py @@ -27,7 +27,7 @@ def tm54dvar_ch4flux_mask_monthgrid_v5_transformation(file_obj, name, nodata): for var in variable: data = getattr(xds.isel(months=time_increment), var) data = data.isel(lat=slice(None, None, -1)) - data = data.where(data == nodata, -9999) + data = data.where(data != nodata, -9999) data.rio.set_spatial_dims("lon", "lat", inplace=True) data.rio.write_crs("epsg:4326", inplace=True) data.rio.write_nodata(-9999, inplace=True) From 5dcb6022793d942595f600a815f923e72698c448 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 31 Oct 2024 09:12:06 -0500 Subject: [PATCH 08/13] resolved the comments --- .../ecco_darwin_transformation.py | 38 +++++++++---------- 
.../gosat_ch4_transformation.py | 1 + .../gpw_transformation.py | 1 + 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/data_transformation_plugins/ecco_darwin_transformation.py b/data_transformation_plugins/ecco_darwin_transformation.py index 9696a372..b31c2c13 100644 --- a/data_transformation_plugins/ecco_darwin_transformation.py +++ b/data_transformation_plugins/ecco_darwin_transformation.py @@ -22,26 +22,26 @@ def ecco_darwin_transformation(file_obj, name, nodata): "latitude" ) - variable = [var for var in xds.data_vars] + variables = list(xds.data_vars)[2:] - for _ in xds.time.values: - for var in variable[2:]: - filename = name.split("/")[-1] - filename_elements = re.split("[_ .]", filename) - data = xds[var] + for var in variables: + filename = name.split("/")[-1] + filename_elements = re.split("[_ .]", filename) + data = xds[var] - data = data.reindex(latitude=list(reversed(data.latitude))) - data = data.where(data != nodata, -9999) - data.rio.set_spatial_dims("longitude", "latitude", inplace=True) - data.rio.write_crs("epsg:4326", inplace=True) - data.rio.write_nodata(-9999, inplace=True) + data = data.reindex(latitude=list(reversed(data.latitude))) + data = data.where(data != nodata, -9999) + data.rio.set_spatial_dims("longitude", "latitude", inplace=True) + data.rio.write_crs("epsg:4326", inplace=True) + data.rio.write_nodata(-9999, inplace=True) - filename_elements.pop() - filename_elements[-1] = filename_elements[-2] + filename_elements[-1] - filename_elements.pop(-2) - # # insert date of generated COG into filename - cog_filename = "_".join(filename_elements) - # # add extension - cog_filename = f"{cog_filename}.tif" - var_data_netcdf[cog_filename] = data + filename_elements.pop() + filename_elements[-1] = filename_elements[-2] + filename_elements[-1] + filename_elements.pop(-2) + # # insert date of generated COG into filename + cog_filename = "_".join(filename_elements) + # # add extension + cog_filename = f"{cog_filename}.tif" + var_data_netcdf[cog_filename] = data + return var_data_netcdf \ No newline at end of file diff --git a/data_transformation_plugins/gosat_ch4_transformation.py b/data_transformation_plugins/gosat_ch4_transformation.py index a9adbb3e..7a88b85f 100644 --- a/data_transformation_plugins/gosat_ch4_transformation.py +++ b/data_transformation_plugins/gosat_ch4_transformation.py @@ -33,4 +33,5 @@ def gosat_ch4_transformation(file_obj, name, nodata): data.rio.set_spatial_dims("lon", "lat") data.rio.write_crs("epsg:4326", inplace=True) var_data_netcdf[cog_filename] = data + return var_data_netcdf \ No newline at end of file diff --git a/data_transformation_plugins/gpw_transformation.py b/data_transformation_plugins/gpw_transformation.py index 5849f350..fde5d1f7 100644 --- a/data_transformation_plugins/gpw_transformation.py +++ b/data_transformation_plugins/gpw_transformation.py @@ -30,4 +30,5 @@ def gpw_transformation(file_obj, name, nodata): # # add extension cog_filename = f"{cog_filename}.tif" var_data_netcdf[cog_filename] = xds + return var_data_netcdf \ No newline at end of file From 0b0870d7352d5364d73bc3fff70619dc6192e95e Mon Sep 17 00:00:00 2001 From: vishal Date: Fri, 1 Nov 2024 10:12:13 -0500 Subject: [PATCH 09/13] added a sample transformation notebook along with updating the README file --- data_transformation_plugins/README.md | 8 +- data_transformation_plugins/push_to_s3.py | 2 +- .../sample_transformation.ipynb | 102 ++++++++++++++++++ 3 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 
data_transformation_plugins/sample_transformation.ipynb diff --git a/data_transformation_plugins/README.md b/data_transformation_plugins/README.md index 26524176..9160fa89 100644 --- a/data_transformation_plugins/README.md +++ b/data_transformation_plugins/README.md @@ -6,4 +6,10 @@ These functions/files are created by the `developer` to transform a single data ## Naming convention for the transformation files in the folder - `name of python file` - `collectionname_transformation.py` -`collectionname` refers to the STAC collection name of the dataset followed by the word `transformation`. Make sure the `collectionname` within the filename matches with the `collectionname` passed as a `parameter` to the DAG. \ No newline at end of file +`collectionname` refers to the STAC collection name of the dataset followed by the word `transformation`. Make sure the `collectionname` within the filename matches with the `collectionname` passed as a `parameter` to the DAG. + +## Steps for running the pipeline +- Test convert a single netCDF file for a new dataset using the `sample_transformation.ipynb` notebook. +- Create a new `data transformation plugin` python file for the new dataset using the convention mentioned above. +- `push_to_s3.py` is not yet plugged into the `CI/CD pipeline` so after creating the plugin, run the python file in the terminal. Running the python file will only push the files that are not present on the S3 folder. +- At this point, the tasks from `ghgc-docs` are completed. \ No newline at end of file diff --git a/data_transformation_plugins/push_to_s3.py b/data_transformation_plugins/push_to_s3.py index 85d52c22..7f941bc9 100644 --- a/data_transformation_plugins/push_to_s3.py +++ b/data_transformation_plugins/push_to_s3.py @@ -44,4 +44,4 @@ def upload_files_to_s3(folder_path, bucket_name, s3_folder, exclude_files): # Example usage: # upload_folder_to_s3("path/to/local/folder", "my-s3-bucket", "my/s3/folder", ["exclude1.ext", "exclude2.ext"]) if __name__ == "__main__": - upload_files_to_s3("data_transformation_plugins", "ghgc-data-store-develop", "data_transformation_plugins", ["__init__.py", "push_to_s3.py", "README.md"]) \ No newline at end of file + upload_files_to_s3("data_transformation_plugins", "ghgc-data-store-develop", "data_transformation_plugins", ["__init__.py", "push_to_s3.py", "README.md", "sample_transformation.ipynb"]) \ No newline at end of file diff --git a/data_transformation_plugins/sample_transformation.ipynb b/data_transformation_plugins/sample_transformation.ipynb new file mode 100644 index 00000000..1f98a392 --- /dev/null +++ b/data_transformation_plugins/sample_transformation.ipynb @@ -0,0 +1,102 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Instructions on how to transform a single file\n", + "Install the required libraries in an isolated environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xarray\n", + "import re\n", + "from datetime import datetime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read a file of the new dataset\n", + "name = 'test.nc'\n", + "ds = xarray.open_dataset(name) # open the file\n", + "print('The netcdf file contains information as shown below')\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Explore the netCDF file\n", + "print('Data valriables in the netCDF 
file are', ds.data_vars)\n", + "print('Dimensions mentioned in the netCDF file are', ds.dims)\n", + "#..... explore the netCDf file for more information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# once explored, look at the latitude, longitude and other values and comvert them if needed.\n", + "ds = ds.rename({\"latitude\": \"lat\", \"longitude\": \"lon\"})\n", + "ds = ds.assign_coords(lon=(((ds.lon + 180) % 360) - 180)).sortby(\"lon\")\n", + "variable = [var for var in ds.data_vars]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Parse through the time variable to extract single timestamp and create a dataarray for the given timestamp.\n", + "for time_increment in range(0, len(ds.months)):\n", + " # split the original file name to create a unique COG name\n", + " filename = name.split(\"/ \")[-1]\n", + " filename_elements = re.split(\"[_ .]\", filename)\n", + " start_time = datetime(int(filename_elements[-2]), time_increment + 1, 1)\n", + " for var in variable: # Parse through the data variables\n", + " data = getattr(ds.isel(months=time_increment), var)\n", + " data = data.isel(lat=slice(None, None, -1))\n", + " data.rio.set_spatial_dims(\"lon\", \"lat\", inplace=True)\n", + " data.rio.write_crs(\"epsg:4326\", inplace=True)\n", + "\n", + " # # insert date of generated COG into filename\n", + " filename_elements.pop()\n", + " filename_elements[-1] = start_time.strftime(\"%Y%m\")\n", + " filename_elements.insert(2, var)\n", + " cog_filename = \"_\".join(filename_elements)\n", + " # # add extension\n", + " cog_filename = f\"{cog_filename}.tif\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Validate the COG using QGIS or any other means.\n", + "Once validated, follow the steps mentioned in the README in the folder `data_transformation_plugins`" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From dcb3db1364c6e1e2ac79d337546986419bfbca50 Mon Sep 17 00:00:00 2001 From: vishal Date: Mon, 11 Nov 2024 14:37:44 -0600 Subject: [PATCH 10/13] tm5 new data transformation plugin --- ...hgrid_v5_transformation.py => tm54dvar_noaa_transformation.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename data_transformation_plugins/{tm54dvar_ch4flux_mask_monthgrid_v5_transformation.py => tm54dvar_noaa_transformation.py} (100%) diff --git a/data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5_transformation.py b/data_transformation_plugins/tm54dvar_noaa_transformation.py similarity index 100% rename from data_transformation_plugins/tm54dvar_ch4flux_mask_monthgrid_v5_transformation.py rename to data_transformation_plugins/tm54dvar_noaa_transformation.py From d94aed272bc59788b8cf59e9e79aaa57223e72fd Mon Sep 17 00:00:00 2001 From: vishal Date: Mon, 11 Nov 2024 15:18:29 -0600 Subject: [PATCH 11/13] updated the tm5 transformation script --- .../sample_transformation.ipynb | 595 +++++++++++++++++- ...> tm5_4dvar_update_noaa_transformation.py} | 4 +- 2 files changed, 590 insertions(+), 9 deletions(-) rename data_transformation_plugins/{tm54dvar_noaa_transformation.py => tm5_4dvar_update_noaa_transformation.py} (91%) diff --git a/data_transformation_plugins/sample_transformation.ipynb b/data_transformation_plugins/sample_transformation.ipynb index 1f98a392..44dd8c02 100644 --- a/data_transformation_plugins/sample_transformation.ipynb +++ 
b/data_transformation_plugins/sample_transformation.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -21,12 +21,461 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The netcdf file contains information as shown below\n" + ] + }, + { + "data": { + "text/html": [ + "
+       "&lt;xarray.Dataset&gt;\n",
+       "Dimensions:               (latitude: 180, longitude: 360, months: 12)\n",
+       "Coordinates:\n",
+       "  * latitude              (latitude) float64 -89.5 -88.5 -87.5 ... 88.5 89.5\n",
+       "  * longitude             (longitude) float64 -179.5 -178.5 ... 178.5 179.5\n",
+       "  * months                (months) int32 1 2 3 4 5 6 7 8 9 10 11 12\n",
+       "Data variables:\n",
+       "    fossil                (months, latitude, longitude) float64 ...\n",
+       "    fossil_global_sum     (months) float64 ...\n",
+       "    microbial             (months, latitude, longitude) float64 ...\n",
+       "    microbial_global_sum  (months) float64 ...\n",
+       "    pyrogenic             (months, latitude, longitude) float64 ...\n",
+       "    pyrogenic_global_sum  (months) float64 ...\n",
+       "    total                 (months, latitude, longitude) float64 ...\n",
+       "    total_global_sum      (months) float64 ...\n",
+       "    surface_area          (latitude, longitude) float64 ...\n",
+       "Attributes:\n",
+       "    email:        carbontracker.team@noaa.gov\n",
+       "    url:          http://gml.noaa.gov/ccgg/carbontracker-ch4/\n",
+       "    institution:  NOAA Global Monitoring Laboratory\n",
+       "    version:      CTCH4 v2023 1x1 monthly fluxes as of 08-Nov-2024 01:58:47 UTC
" + ], + "text/plain": [ + "\n", + "Dimensions: (latitude: 180, longitude: 360, months: 12)\n", + "Coordinates:\n", + " * latitude (latitude) float64 -89.5 -88.5 -87.5 ... 88.5 89.5\n", + " * longitude (longitude) float64 -179.5 -178.5 ... 178.5 179.5\n", + " * months (months) int32 1 2 3 4 5 6 7 8 9 10 11 12\n", + "Data variables:\n", + " fossil (months, latitude, longitude) float64 ...\n", + " fossil_global_sum (months) float64 ...\n", + " microbial (months, latitude, longitude) float64 ...\n", + " microbial_global_sum (months) float64 ...\n", + " pyrogenic (months, latitude, longitude) float64 ...\n", + " pyrogenic_global_sum (months) float64 ...\n", + " total (months, latitude, longitude) float64 ...\n", + " total_global_sum (months) float64 ...\n", + " surface_area (latitude, longitude) float64 ...\n", + "Attributes:\n", + " email: carbontracker.team@noaa.gov\n", + " url: http://gml.noaa.gov/ccgg/carbontracker-ch4/\n", + " institution: NOAA Global Monitoring Laboratory\n", + " version: CTCH4 v2023 1x1 monthly fluxes as of 08-Nov-2024 01:58:47 UTC" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Read a file of the new dataset\n", - "name = 'test.nc'\n", + "name = '/Users/vgaur/Downloads/CTCH4_methane_emis_1998.nc'\n", "ds = xarray.open_dataset(name) # open the file\n", "print('The netcdf file contains information as shown below')\n", "ds" @@ -34,9 +483,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data valriables in the netCDF file are Data variables:\n", + " fossil (months, latitude, longitude) float64 ...\n", + " fossil_global_sum (months) float64 ...\n", + " microbial (months, latitude, longitude) float64 ...\n", + " microbial_global_sum (months) float64 ...\n", + " pyrogenic (months, latitude, longitude) float64 ...\n", + " pyrogenic_global_sum (months) float64 ...\n", + " total (months, latitude, longitude) float64 ...\n", + " total_global_sum (months) float64 ...\n", + " surface_area (latitude, longitude) float64 ...\n", + "Dimensions mentioned in the netCDF file are Frozen({'latitude': 180, 'longitude': 360, 'months': 12})\n" + ] + } + ], "source": [ "# Explore the netCDF file\n", "print('Data valriables in the netCDF file are', ds.data_vars)\n", @@ -46,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -83,6 +550,104 @@ " cog_filename = f\"{cog_filename}.tif\"" ] }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CTCH4_methane_fossil_emis_199801.tif\n", + "CTCH4_methane_microbial_emis_199801.tif\n", + "CTCH4_methane_pyrogenic_emis_199801.tif\n", + "CTCH4_methane_total_emis_199801.tif\n", + "CTCH4_methane_surface_area_emis_199801.tif\n", + "CTCH4_methane_fossil_emis_199802.tif\n", + "CTCH4_methane_microbial_emis_199802.tif\n", + "CTCH4_methane_pyrogenic_emis_199802.tif\n", + "CTCH4_methane_total_emis_199802.tif\n", + "CTCH4_methane_surface_area_emis_199802.tif\n", + "CTCH4_methane_fossil_emis_199803.tif\n", + "CTCH4_methane_microbial_emis_199803.tif\n", + "CTCH4_methane_pyrogenic_emis_199803.tif\n", + "CTCH4_methane_total_emis_199803.tif\n", + "CTCH4_methane_surface_area_emis_199803.tif\n", + "CTCH4_methane_fossil_emis_199804.tif\n", + "CTCH4_methane_microbial_emis_199804.tif\n", + 
"CTCH4_methane_pyrogenic_emis_199804.tif\n", + "CTCH4_methane_total_emis_199804.tif\n", + "CTCH4_methane_surface_area_emis_199804.tif\n", + "CTCH4_methane_fossil_emis_199805.tif\n", + "CTCH4_methane_microbial_emis_199805.tif\n", + "CTCH4_methane_pyrogenic_emis_199805.tif\n", + "CTCH4_methane_total_emis_199805.tif\n", + "CTCH4_methane_surface_area_emis_199805.tif\n", + "CTCH4_methane_fossil_emis_199806.tif\n", + "CTCH4_methane_microbial_emis_199806.tif\n", + "CTCH4_methane_pyrogenic_emis_199806.tif\n", + "CTCH4_methane_total_emis_199806.tif\n", + "CTCH4_methane_surface_area_emis_199806.tif\n", + "CTCH4_methane_fossil_emis_199807.tif\n", + "CTCH4_methane_microbial_emis_199807.tif\n", + "CTCH4_methane_pyrogenic_emis_199807.tif\n", + "CTCH4_methane_total_emis_199807.tif\n", + "CTCH4_methane_surface_area_emis_199807.tif\n", + "CTCH4_methane_fossil_emis_199808.tif\n", + "CTCH4_methane_microbial_emis_199808.tif\n", + "CTCH4_methane_pyrogenic_emis_199808.tif\n", + "CTCH4_methane_total_emis_199808.tif\n", + "CTCH4_methane_surface_area_emis_199808.tif\n", + "CTCH4_methane_fossil_emis_199809.tif\n", + "CTCH4_methane_microbial_emis_199809.tif\n", + "CTCH4_methane_pyrogenic_emis_199809.tif\n", + "CTCH4_methane_total_emis_199809.tif\n", + "CTCH4_methane_surface_area_emis_199809.tif\n", + "CTCH4_methane_fossil_emis_199810.tif\n", + "CTCH4_methane_microbial_emis_199810.tif\n", + "CTCH4_methane_pyrogenic_emis_199810.tif\n", + "CTCH4_methane_total_emis_199810.tif\n", + "CTCH4_methane_surface_area_emis_199810.tif\n", + "CTCH4_methane_fossil_emis_199811.tif\n", + "CTCH4_methane_microbial_emis_199811.tif\n", + "CTCH4_methane_pyrogenic_emis_199811.tif\n", + "CTCH4_methane_total_emis_199811.tif\n", + "CTCH4_methane_surface_area_emis_199811.tif\n", + "CTCH4_methane_fossil_emis_199812.tif\n", + "CTCH4_methane_microbial_emis_199812.tif\n", + "CTCH4_methane_pyrogenic_emis_199812.tif\n", + "CTCH4_methane_total_emis_199812.tif\n", + "CTCH4_methane_surface_area_emis_199812.tif\n" + ] + } + ], + "source": [ + "variable = [var for var in ds.data_vars if \"global\" not in var]\n", + "\n", + "for time_increment in range(0, len(ds.months)):\n", + " filename = name.split(\"/\")[-1]\n", + " filename_elements = re.split(\"[_ .]\", filename)\n", + " start_time = datetime(int(filename_elements[-2]), time_increment + 1, 1)\n", + " for var in variable:\n", + " filename = name.split(\"/\")[-1]\n", + " filename_elements = re.split(\"[_ .]\", filename)\n", + " data = getattr(ds.isel(months=time_increment), var)\n", + " data = data.isel(lat=slice(None, None, -1))\n", + " data = data.where(data != -9999, -9999)\n", + " data.rio.set_spatial_dims(\"lon\", \"lat\", inplace=True)\n", + " data.rio.write_crs(\"epsg:4326\", inplace=True)\n", + " data.rio.write_nodata(-9999, inplace=True)\n", + "\n", + " # # insert date of generated COG into filename\n", + " filename_elements.pop()\n", + " filename_elements[-1] = start_time.strftime(\"%Y%m\")\n", + " filename_elements.insert(2, var)\n", + " cog_filename = \"_\".join(filename_elements)\n", + " # # add extension\n", + " cog_filename = f\"{cog_filename}.tif\"" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -93,8 +658,22 @@ } ], "metadata": { + "kernelspec": { + "display_name": "cmip6", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 
"3.9.16" } }, "nbformat": 4, diff --git a/data_transformation_plugins/tm54dvar_noaa_transformation.py b/data_transformation_plugins/tm5_4dvar_update_noaa_transformation.py similarity index 91% rename from data_transformation_plugins/tm54dvar_noaa_transformation.py rename to data_transformation_plugins/tm5_4dvar_update_noaa_transformation.py index 7565ec8a..1fd491b3 100644 --- a/data_transformation_plugins/tm54dvar_noaa_transformation.py +++ b/data_transformation_plugins/tm5_4dvar_update_noaa_transformation.py @@ -2,7 +2,7 @@ from datetime import datetime import re -def tm54dvar_ch4flux_mask_monthgrid_v5_transformation(file_obj, name, nodata): +def tm5_4dvar_update_noaa_transformation(file_obj, name, nodata): """Tranformation function for the tm5 ch4 influx dataset Args: @@ -25,6 +25,8 @@ def tm54dvar_ch4flux_mask_monthgrid_v5_transformation(file_obj, name, nodata): filename_elements = re.split("[_ .]", filename) start_time = datetime(int(filename_elements[-2]), time_increment + 1, 1) for var in variable: + filename = name.split("/")[-1] + filename_elements = re.split("[_ .]", filename) data = getattr(xds.isel(months=time_increment), var) data = data.isel(lat=slice(None, None, -1)) data = data.where(data != nodata, -9999) From e6a6ee9ae51d820b30f71da91fdd13252568ff87 Mon Sep 17 00:00:00 2001 From: vishal Date: Tue, 19 Nov 2024 11:11:45 -0600 Subject: [PATCH 12/13] reformatting files --- .../ecco_darwin_transformation.py | 14 ++++++----- .../geos_oco2_transformation.py | 6 +++-- .../gosat_ch4_transformation.py | 8 ++++--- .../gpw_transformation.py | 8 ++++--- data_transformation_plugins/push_to_s3.py | 24 +++++++++++++------ .../tm5_4dvar_update_noaa_transformation.py | 8 ++++--- 6 files changed, 44 insertions(+), 24 deletions(-) diff --git a/data_transformation_plugins/ecco_darwin_transformation.py b/data_transformation_plugins/ecco_darwin_transformation.py index b31c2c13..93be21cd 100644 --- a/data_transformation_plugins/ecco_darwin_transformation.py +++ b/data_transformation_plugins/ecco_darwin_transformation.py @@ -1,6 +1,8 @@ -import xarray import re +import xarray + + def ecco_darwin_transformation(file_obj, name, nodata): """Tranformation function for the ecco darwin dataset @@ -15,9 +17,9 @@ def ecco_darwin_transformation(file_obj, name, nodata): var_data_netcdf = {} xds = xarray.open_dataset(file_obj) xds = xds.rename({"y": "latitude", "x": "longitude"}) - xds = xds.assign_coords(longitude=((xds.longitude / 1440) * 360) - 180).sortby( - "longitude" - ) + xds = xds.assign_coords( + longitude=((xds.longitude / 1440) * 360) - 180 + ).sortby("longitude") xds = xds.assign_coords(latitude=((xds.latitude / 721) * 180) - 90).sortby( "latitude" ) @@ -43,5 +45,5 @@ def ecco_darwin_transformation(file_obj, name, nodata): # # add extension cog_filename = f"{cog_filename}.tif" var_data_netcdf[cog_filename] = data - - return var_data_netcdf \ No newline at end of file + + return var_data_netcdf diff --git a/data_transformation_plugins/geos_oco2_transformation.py b/data_transformation_plugins/geos_oco2_transformation.py index 4165c91e..61b9702a 100644 --- a/data_transformation_plugins/geos_oco2_transformation.py +++ b/data_transformation_plugins/geos_oco2_transformation.py @@ -1,6 +1,8 @@ -import xarray import re +import xarray + + def geos_oco2_transformation(file_obj, name, nodata): """Tranformation function for the oco2 geos dataset @@ -35,4 +37,4 @@ def geos_oco2_transformation(file_obj, name, nodata): cog_filename = f"{cog_filename}.tif" var_data_netcdf[cog_filename] = data - return 
var_data_netcdf \ No newline at end of file + return var_data_netcdf diff --git a/data_transformation_plugins/gosat_ch4_transformation.py b/data_transformation_plugins/gosat_ch4_transformation.py index 7a88b85f..ea5552ec 100644 --- a/data_transformation_plugins/gosat_ch4_transformation.py +++ b/data_transformation_plugins/gosat_ch4_transformation.py @@ -1,6 +1,8 @@ -import xarray import re +import xarray + + def gosat_ch4_transformation(file_obj, name, nodata): """Tranformation function for the ecco darwin dataset @@ -33,5 +35,5 @@ def gosat_ch4_transformation(file_obj, name, nodata): data.rio.set_spatial_dims("lon", "lat") data.rio.write_crs("epsg:4326", inplace=True) var_data_netcdf[cog_filename] = data - - return var_data_netcdf \ No newline at end of file + + return var_data_netcdf diff --git a/data_transformation_plugins/gpw_transformation.py b/data_transformation_plugins/gpw_transformation.py index fde5d1f7..fb1e99d8 100644 --- a/data_transformation_plugins/gpw_transformation.py +++ b/data_transformation_plugins/gpw_transformation.py @@ -1,6 +1,8 @@ -import xarray import re +import xarray + + def gpw_transformation(file_obj, name, nodata): """Tranformation function for the gridded population dataset @@ -30,5 +32,5 @@ def gpw_transformation(file_obj, name, nodata): # # add extension cog_filename = f"{cog_filename}.tif" var_data_netcdf[cog_filename] = xds - - return var_data_netcdf \ No newline at end of file + + return var_data_netcdf diff --git a/data_transformation_plugins/push_to_s3.py b/data_transformation_plugins/push_to_s3.py index 7f941bc9..03cdfaa6 100644 --- a/data_transformation_plugins/push_to_s3.py +++ b/data_transformation_plugins/push_to_s3.py @@ -1,8 +1,7 @@ -import boto3 import os import boto3 -import os + def upload_files_to_s3(folder_path, bucket_name, s3_folder, exclude_files): """ @@ -16,23 +15,23 @@ def upload_files_to_s3(folder_path, bucket_name, s3_folder, exclude_files): - exclude_files (list): List of files to exclude from uploading. 
""" # Initialize S3 client - s3 = boto3.client('s3') + s3 = boto3.client("s3") # Loop through files in the local folder for file_name in os.listdir(folder_path): file_path = os.path.join(folder_path, file_name) - + # Check if it's a file and not in the exclude list if os.path.isfile(file_path) and file_name not in exclude_files: s3_key = os.path.join(s3_folder, file_name) - + try: # Check if the file already exists in S3 s3.head_object(Bucket=bucket_name, Key=s3_key) print(f"Skipped {file_name} (already exists in S3)") except s3.exceptions.ClientError as e: # If the file does not exist, upload it - if e.response['Error']['Code'] == '404': + if e.response["Error"]["Code"] == "404": try: s3.upload_file(file_path, bucket_name, s3_key) print(f"Uploaded {file_name} to {s3_key}") @@ -41,7 +40,18 @@ def upload_files_to_s3(folder_path, bucket_name, s3_folder, exclude_files): else: print(f"Error checking existence of {file_name}: {e}") + # Example usage: # upload_folder_to_s3("path/to/local/folder", "my-s3-bucket", "my/s3/folder", ["exclude1.ext", "exclude2.ext"]) if __name__ == "__main__": - upload_files_to_s3("data_transformation_plugins", "ghgc-data-store-develop", "data_transformation_plugins", ["__init__.py", "push_to_s3.py", "README.md", "sample_transformation.ipynb"]) \ No newline at end of file + upload_files_to_s3( + "data_transformation_plugins", + "ghgc-data-store-develop", + "data_transformation_plugins", + [ + "__init__.py", + "push_to_s3.py", + "README.md", + "sample_transformation.ipynb", + ], + ) diff --git a/data_transformation_plugins/tm5_4dvar_update_noaa_transformation.py b/data_transformation_plugins/tm5_4dvar_update_noaa_transformation.py index 1fd491b3..a7f3a8ac 100644 --- a/data_transformation_plugins/tm5_4dvar_update_noaa_transformation.py +++ b/data_transformation_plugins/tm5_4dvar_update_noaa_transformation.py @@ -1,6 +1,8 @@ -import xarray -from datetime import datetime import re +from datetime import datetime + +import xarray + def tm5_4dvar_update_noaa_transformation(file_obj, name, nodata): """Tranformation function for the tm5 ch4 influx dataset @@ -43,4 +45,4 @@ def tm5_4dvar_update_noaa_transformation(file_obj, name, nodata): cog_filename = f"{cog_filename}.tif" var_data_netcdf[cog_filename] = data - return var_data_netcdf \ No newline at end of file + return var_data_netcdf From 6c85d0ca6bd56e2d6028476358942da33bb4bc00 Mon Sep 17 00:00:00 2001 From: vishal Date: Wed, 20 Nov 2024 10:01:32 -0600 Subject: [PATCH 13/13] reformatting files along with resolving the comments --- .../ecco_darwin_transformation.py | 9 +++++++-- .../geos_oco2_transformation.py | 9 +++++++-- .../gosat_ch4_transformation.py | 11 ++++++++--- data_transformation_plugins/gpw_transformation.py | 9 +++++++-- .../tm5_4dvar_update_noaa_transformation.py | 9 +++++++-- 5 files changed, 36 insertions(+), 11 deletions(-) diff --git a/data_transformation_plugins/ecco_darwin_transformation.py b/data_transformation_plugins/ecco_darwin_transformation.py index 93be21cd..9eb518b0 100644 --- a/data_transformation_plugins/ecco_darwin_transformation.py +++ b/data_transformation_plugins/ecco_darwin_transformation.py @@ -1,10 +1,15 @@ import re +from typing import Dict import xarray +from s3fs import S3File +from xarray import DataArray -def ecco_darwin_transformation(file_obj, name, nodata): - """Tranformation function for the ecco darwin dataset +def ecco_darwin_transformation( + file_obj: S3File, name: str, nodata: int +) -> Dict[str, DataArray]: + """Transformation function for the ecco darwin dataset Args: 
file_obj (s3fs object): s3fs sile object for one file of the dataset diff --git a/data_transformation_plugins/geos_oco2_transformation.py b/data_transformation_plugins/geos_oco2_transformation.py index 61b9702a..dc18dfbc 100644 --- a/data_transformation_plugins/geos_oco2_transformation.py +++ b/data_transformation_plugins/geos_oco2_transformation.py @@ -1,10 +1,15 @@ import re +from typing import Dict import xarray +from s3fs import S3File +from xarray import DataArray -def geos_oco2_transformation(file_obj, name, nodata): - """Tranformation function for the oco2 geos dataset +def geos_oco2_transformation( + file_obj: S3File, name: str, nodata: int +) -> Dict[str, DataArray]: + """Transformation function for the oco2 geos dataset Args: file_obj (s3fs object): s3fs sile object for one file of the dataset diff --git a/data_transformation_plugins/gosat_ch4_transformation.py b/data_transformation_plugins/gosat_ch4_transformation.py index ea5552ec..ac299264 100644 --- a/data_transformation_plugins/gosat_ch4_transformation.py +++ b/data_transformation_plugins/gosat_ch4_transformation.py @@ -1,10 +1,15 @@ import re +from typing import Dict import xarray +from s3fs import S3File +from xarray import DataArray -def gosat_ch4_transformation(file_obj, name, nodata): - """Tranformation function for the ecco darwin dataset +def gosat_ch4_transformation( + file_obj: S3File, name: str, nodata: int +) -> Dict[str, DataArray]: + """Transformation function for the ecco darwin dataset Args: file_obj (s3fs object): s3fs sile object for one file of the dataset @@ -29,7 +34,7 @@ def gosat_ch4_transformation(file_obj, name, nodata): cog_filename = f"{cog_filename}.tif" data = data.reindex(lat=list(reversed(data.lat))) - data = data.where(data != -9999, -9999) + data = data.where(data != nodata, -9999) data.rio.write_nodata(-9999, inplace=True) data.rio.set_spatial_dims("lon", "lat") diff --git a/data_transformation_plugins/gpw_transformation.py b/data_transformation_plugins/gpw_transformation.py index fb1e99d8..9db17e90 100644 --- a/data_transformation_plugins/gpw_transformation.py +++ b/data_transformation_plugins/gpw_transformation.py @@ -1,10 +1,15 @@ import re +from typing import Dict import xarray +from s3fs import S3File +from xarray import DataArray -def gpw_transformation(file_obj, name, nodata): - """Tranformation function for the gridded population dataset +def gpw_transformation( + file_obj: S3File, name: str, nodata: int +) -> Dict[str, DataArray]: + """Transformation function for the gridded population dataset Args: file_obj (s3fs object): s3fs sile object for one file of the dataset diff --git a/data_transformation_plugins/tm5_4dvar_update_noaa_transformation.py b/data_transformation_plugins/tm5_4dvar_update_noaa_transformation.py index a7f3a8ac..8b13fc79 100644 --- a/data_transformation_plugins/tm5_4dvar_update_noaa_transformation.py +++ b/data_transformation_plugins/tm5_4dvar_update_noaa_transformation.py @@ -1,11 +1,16 @@ import re from datetime import datetime +from typing import Dict import xarray +from s3fs import S3File +from xarray import DataArray -def tm5_4dvar_update_noaa_transformation(file_obj, name, nodata): - """Tranformation function for the tm5 ch4 influx dataset +def tm5_4dvar_update_noaa_transformation( + file_obj: S3File, name: str, nodata: int +) -> Dict[str, DataArray]: + """Transformation function for the tm5 ch4 influx dataset Args: file_obj (s3fs object): s3fs sile object for one file of the dataset
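Note: all of the plugins above follow the `collectionname_transformation`
naming convention from the README. A hypothetical sketch of how a DAG task
could resolve a plugin by collection name is shown below; the actual SM2A
lookup logic is not part of these patches and may differ.

```python
# Hypothetical plugin lookup by collection name; not the actual SM2A DAG code.
import importlib
from typing import Callable


def resolve_plugin(collection_name: str) -> Callable:
    """Import data_transformation_plugins.<collection_name>_transformation and
    return the function of the same name defined in that module."""
    module = importlib.import_module(
        f"data_transformation_plugins.{collection_name}_transformation"
    )
    return getattr(module, f"{collection_name}_transformation")


transform = resolve_plugin("gpw")
# transform(file_obj, name, nodata) -> Dict[str, DataArray]
```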