Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate ODIAC statistics #48

Merged
merged 3 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
import os
import numpy as np
import matplotlib.pyplot as plt
import rasterio
from rasterio.plot import show
from rasterio.plot import show_hist
from glob import glob
import pathlib
import boto3
import pandas as pd
import calendar
import seaborn as sns
import json
import re

from dotenv import load_dotenv

load_dotenv()

# session_veda_smce = boto3.session.Session()
session_veda_smce = boto3.Session(
aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
aws_session_token=os.environ.get("AWS_SESSION_TOKEN"),
)
s3_client_veda_smce = session_veda_smce.client("s3")
raster_io_session = rasterio.env.Env(
aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
aws_session_token=os.environ.get("AWS_SESSION_TOKEN"),
)
bucket_name = "ghgc-data-store-dev"

keys = []
resp = s3_client_veda_smce.list_objects_v2(
Bucket=bucket_name, Prefix="ODIAC_geotiffs_COGs/"
)
for obj in resp["Contents"]:
if re.search(".*2000\d\d.tif", obj["Key"]):
keys.append(obj["Key"])

# List all TIFF files in the folder
tif_files = glob("../../data/odiac_data/2000/*.tif", recursive=True)
# tif_files = glob("data/wetlands-monthly/*.nc", recursive=True)
session = rasterio.env.Env()
summary_dict_netcdf, summary_dict_cog = {}, {}
overall_stats_netcdf, overall_stats_cog = {}, {}
full_data_df_netcdf, full_data_df_cog = pd.DataFrame(), pd.DataFrame()

for key in keys:
with raster_io_session:
s3_file = s3_client_veda_smce.generate_presigned_url(
"get_object", Params={"Bucket": bucket_name, "Key": key}
)
filename_elements = re.split("[_ ? . /]", s3_file)
with rasterio.open(s3_file) as src:
for band in src.indexes:
idx = pd.MultiIndex.from_product(
[
["_".join(filename_elements[6:13])],
[filename_elements[13]],
[x for x in np.arange(1, src.height + 1)],
]
)
# Read the raster data
raster_data = src.read(band)
raster_data[raster_data == -9999] = np.nan
temp = pd.DataFrame(index=idx, data=raster_data)
full_data_df_cog = full_data_df_cog._append(temp, ignore_index=False)

# Calculate summary statistics
min_value = np.float64(temp.values.min())
max_value = np.float64(temp.values.max())
mean_value = np.float64(temp.values.mean())
std_value = np.float64(temp.values.std())

summary_dict_cog[
f'{"_".join(filename_elements[9:13])}_{filename_elements[13][:4]}_{calendar.month_name[int(filename_elements[13][4:])]}'
] = {
"min_value": min_value,
"max_value": max_value,
"mean_value": mean_value,
"std_value": std_value,
}

# Iterate over each TIFF file
for tif_file in tif_files:
file_name = pathlib.Path(tif_file).name[:-4]

# Open the TIFF file
with rasterio.open(tif_file) as src:
for band in src.indexes:
idx = pd.MultiIndex.from_product(
[
[pathlib.Path(tif_file).name[:-9]],
[pathlib.Path(tif_file).name[-8:-4]],
[x for x in np.arange(1, src.height + 1)],
]
)
# Read the raster data
raster_data = src.read(band)
raster_data[raster_data == -9999] = np.nan
temp = pd.DataFrame(index=idx, data=raster_data)
full_data_df_netcdf = full_data_df_netcdf._append(temp, ignore_index=False)

# Calculate summary statistics
min_value = np.float64(temp.values.min())
max_value = np.float64(temp.values.max())
mean_value = np.float64(temp.values.mean())
std_value = np.float64(temp.values.std())

summary_dict_netcdf[
f'{tif_file.split("/")[-1][:-9]}_{tif_file.split("/")[2]}_{calendar.month_name[int(tif_file.split("/")[-1][-6:-4])]}'
] = {
"min_value": min_value,
"max_value": max_value,
"mean_value": mean_value,
"std_value": std_value,
}

overall_stats_netcdf["min_value"] = np.float64(full_data_df_netcdf.values.min())
overall_stats_netcdf["max_value"] = np.float64(full_data_df_netcdf.values.max())
overall_stats_netcdf["mean_value"] = np.float64(full_data_df_netcdf.values.mean())
overall_stats_netcdf["std_value"] = np.float64(full_data_df_netcdf.values.std())

overall_stats_cog["min_value"] = np.float64(full_data_df_cog.values.min())
overall_stats_cog["max_value"] = np.float64(full_data_df_cog.values.max())
overall_stats_cog["mean_value"] = np.float64(full_data_df_cog.values.mean())
overall_stats_cog["std_value"] = np.float64(full_data_df_cog.values.std())


with open("monthly_stats.json", "w") as fp:
json.dump("Stats for raw netCDF files.", fp)
fp.write("\n")
json.dump(summary_dict_netcdf, fp)
fp.write("\n")
json.dump("Stats for transformed COG files.", fp)
fp.write("\n")
json.dump(summary_dict_cog, fp)

with open("overall_stats.json", "w") as fp:
json.dump("Stats for raw netCDF files.", fp)
fp.write("\n")
json.dump(overall_stats_netcdf, fp)
fp.write("\n")
json.dump("Stats for transformed COG files.", fp)
fp.write("\n")
json.dump(overall_stats_cog, fp)

fig, ax = plt.subplots(2, 2, figsize=(10, 10))
plt.Figure(figsize=(10, 10))
sns.histplot(
data=full_data_df_netcdf.to_numpy().flatten(),
kde=False,
bins=100,
legend=False,
ax=ax[0][0],
)
ax[0][0].set_title("distribution plot for overall raw data")

sns.histplot(
data=full_data_df_cog.to_numpy().flatten(),
kde=False,
bins=100,
legend=False,
ax=ax[0][1],
)
ax[0][1].set_title("distribution plot for overall cog data")

temp_df = pd.DataFrame()
for key_value in summary_dict_netcdf.keys():
temp_df = temp_df._append(summary_dict_netcdf[key_value], ignore_index=True)

sns.lineplot(
data=temp_df,
ax=ax[1][0],
)
ax[1][0].set_title("distribution plot for 2000 raw data")
ax[1][0].set_xlabel("Months")

temp_df = pd.DataFrame()
for key_value in summary_dict_cog.keys():
temp_df = temp_df._append(summary_dict_cog[key_value], ignore_index=True)
sns.lineplot(
data=temp_df,
ax=ax[1][1],
)
ax[1][1].set_title("distribution plot for 2000 cog data")
ax[1][1].set_xlabel("Months")

plt.savefig("stats_summary.png")
plt.show()
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"Stats for raw netCDF files."
{"odiac2022_1km_excl_intl_data_July": {"min_value": 0.0, "max_value": 735599.6875, "mean_value": 0.5692264437675476, "std_value": 180.981201171875}, "odiac2022_1km_excl_intl_data_December": {"min_value": 0.0, "max_value": 795665.6875, "mean_value": 0.6445742249488831, "std_value": 208.37452697753906}, "odiac2022_1km_excl_intl_data_June": {"min_value": 0.0, "max_value": 711870.6875, "mean_value": 0.5623753070831299, "std_value": 178.8464813232422}, "odiac2022_1km_excl_intl_data_October": {"min_value": 0.0, "max_value": 735599.6875, "mean_value": 0.5883747339248657, "std_value": 186.28933715820312}, "odiac2022_1km_excl_intl_data_April": {"min_value": 0.0, "max_value": 711870.6875, "mean_value": 0.5595216155052185, "std_value": 177.70298767089844}, "odiac2022_1km_excl_intl_data_May": {"min_value": 0.0, "max_value": 735599.6875, "mean_value": 0.5656134486198425, "std_value": 180.9856414794922}, "odiac2022_1km_excl_intl_data_November": {"min_value": 0.0, "max_value": 753263.9375, "mean_value": 0.5986246466636658, "std_value": 191.67637634277344}, "odiac2022_1km_excl_intl_data_January": {"min_value": 0.0, "max_value": 904096.875, "mean_value": 0.6154927015304565, "std_value": 203.31666564941406}, "odiac2022_1km_excl_intl_data_February": {"min_value": 0.0, "max_value": 864400.5625, "mean_value": 0.585207462310791, "std_value": 192.9818572998047}, "odiac2022_1km_excl_intl_data_March": {"min_value": 0.0, "max_value": 911276.75, "mean_value": 0.6101319193840027, "std_value": 196.6996307373047}, "odiac2022_1km_excl_intl_data_August": {"min_value": 0.0, "max_value": 735599.6875, "mean_value": 0.5851415395736694, "std_value": 186.6926727294922}, "odiac2022_1km_excl_intl_data_September": {"min_value": 0.0, "max_value": 711870.6875, "mean_value": 0.5668593645095825, "std_value": 180.1910400390625}}
"Stats for transformed COG files."
{"odiac2022_1km_excl_intl_2000_January": {"min_value": 0.0, "max_value": 904096.875, "mean_value": 0.6154927015304565, "std_value": 203.31666564941406}, "odiac2022_1km_excl_intl_2000_February": {"min_value": 0.0, "max_value": 864400.5625, "mean_value": 0.585207462310791, "std_value": 192.9818572998047}, "odiac2022_1km_excl_intl_2000_March": {"min_value": 0.0, "max_value": 911276.75, "mean_value": 0.6101319193840027, "std_value": 196.6996307373047}, "odiac2022_1km_excl_intl_2000_April": {"min_value": 0.0, "max_value": 711870.6875, "mean_value": 0.5595216155052185, "std_value": 177.70298767089844}, "odiac2022_1km_excl_intl_2000_May": {"min_value": 0.0, "max_value": 735599.6875, "mean_value": 0.5656134486198425, "std_value": 180.9856414794922}, "odiac2022_1km_excl_intl_2000_June": {"min_value": 0.0, "max_value": 711870.6875, "mean_value": 0.5623753070831299, "std_value": 178.8464813232422}, "odiac2022_1km_excl_intl_2000_July": {"min_value": 0.0, "max_value": 735599.6875, "mean_value": 0.5692264437675476, "std_value": 180.981201171875}, "odiac2022_1km_excl_intl_2000_August": {"min_value": 0.0, "max_value": 735599.6875, "mean_value": 0.5851415395736694, "std_value": 186.6926727294922}, "odiac2022_1km_excl_intl_2000_September": {"min_value": 0.0, "max_value": 711870.6875, "mean_value": 0.5668593645095825, "std_value": 180.1910400390625}, "odiac2022_1km_excl_intl_2000_October": {"min_value": 0.0, "max_value": 735599.6875, "mean_value": 0.5883747339248657, "std_value": 186.28933715820312}, "odiac2022_1km_excl_intl_2000_November": {"min_value": 0.0, "max_value": 753263.9375, "mean_value": 0.5986246466636658, "std_value": 191.67637634277344}, "odiac2022_1km_excl_intl_2000_December": {"min_value": 0.0, "max_value": 795665.6875, "mean_value": 0.6445742249488831, "std_value": 208.37452697753906}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"Stats for raw netCDF files."
{"min_value": 0.0, "max_value": 911276.75, "mean_value": 0.5871875286102295, "std_value": 188.92286682128906}
"Stats for transformed COG files."
{"min_value": 0.0, "max_value": 911276.75, "mean_value": 0.5871965885162354, "std_value": 188.92271423339844}
Loading