diff --git a/README.md b/README.md index 44ea45c..5c32315 100644 --- a/README.md +++ b/README.md @@ -61,34 +61,50 @@ The damages correspond to insurance claims per cell (pixel of the precipitation The damages are managed by the `Damages` class in `swafi/damages.py` and are also handled internally as a Pandas dataframe. There are two classes of damages: -- `DamagesMobiliar` from the file `swafi/damages_mobiliar.py`: handles the claims from the Swiss Mobiliar Insurance Company as GeoTIFF. - The dataset from the Mobiliar contains the following categories of claims: - - | Name in swafi | Client | Ext/Int | Object | Flood type | Original file names | - |---------------------|---------|----------|-----------|------------|-----------------------------------| - | sme_ext_cont_pluv | SME | external | content | pluvial | Ueberschwemmung_pluvial_KMU_FH | - | sme_ext_cont_fluv | SME | external | content | fluvial | Ueberschwemmung_fluvial_KMU_FH | - | sme_ext_struc_pluv | SME | external | structure | pluvial | Ueberschwemmung_pluvial_KMU_GB | - | sme_ext_struc_fluv | SME | external | structure | fluvial | Ueberschwemmung_fluvial_KMU_GB | - | sme_int_cont | SME | internal | content | | Wasser_KMU_FH | - | sme_int_struc | SME | internal | structure | | Wasser_KMU_GB | - | priv_ext_cont_pluv | Private | external | content | pluvial | Ueberschwemmung_pluvial_Privat_FH | - | priv_ext_cont_fluv | Private | external | content | fluvial | Ueberschwemmung_fluvial_Privat_FH | - | priv_ext_struc_pluv | Private | external | structure | pluvial | Ueberschwemmung_pluvial_Privat_GB | - | priv_ext_struc_fluv | Private | external | structure | fluvial | Ueberschwemmung_fluvial_Privat_GB | - | priv_int_cont | Private | internal | content | | Wasser_Privat_FH | - | priv_int_struc | Private | internal | structure | | Wasser_Privat_GB | - -- `DamagesGVZ` from the file `swafi/damages_gvz.py`: handles the claims from the GVZ (Building insurance Canton Zurich) as netCDF. - The dataset from the GVZ contains the following categories: - - | Name in swafi | Original tag | - |---------------------|---------------| - | most_likely_pluvial | A | - | likely_pluvial | A, B | - | fluvial_or_pluvial | A, B, C, D, E | - | likely_fluvial | D, E | - | most_likely_fluvial | E | + +#### DamagesMobiliar +`DamagesMobiliar` from the file `swafi/damages_mobiliar.py`: handles the claims from the Swiss Mobiliar Insurance Company as GeoTIFF. 
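+
+A minimal usage sketch (mirroring `scripts/data_analyses/analyze_damage_data.py` added in this changeset; it assumes the `DIR_EXPOSURE_MOBILIAR`, `DIR_CLAIMS_MOBILIAR` and the Mobiliar year entries are set in the config file):
+
+```python
+from swafi.config import Config
+from swafi.damages_mobiliar import DamagesMobiliar
+
+config = Config()
+damages = DamagesMobiliar(dir_exposure=config.get('DIR_EXPOSURE_MOBILIAR'),
+                          dir_claims=config.get('DIR_CLAIMS_MOBILIAR'),
+                          year_start=config.get('YEAR_START_MOBILIAR'),
+                          year_end=config.get('YEAR_END_MOBILIAR'))
+# Select the exposure categories ('external') and claim categories
+# ('external', 'pluvial'), as done in the analysis scripts
+damages.select_categories_type(['external'], ['external', 'pluvial'])
+```
+
+The `DamagesGvz` class described below is instantiated analogously with the GVZ config entries.
+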
+The dataset from the Mobiliar contains the following categories of **exposure** (contracts): + +| Name in swafi | Client | Ext/Int | Object | Original file names | +|---------------------|---------|----------|-----------|-----------------------------| +| sme_ext_cont | SME | external | content | Vertraege_KMU_ES_FH_YYYY | +| sme_ext_struc | SME | external | structure | Vertraege_KMU_ES_GB_YYYY | +| sme_int_cont | SME | internal | content | Vertraege_KMU_W_FH_YYYY | +| sme_int_struc | SME | internal | structure | Vertraege_KMU_W_GB_YYYY | +| priv_ext_cont | Private | external | content | Vertraege_Privat_ES_FH_YYYY | +| priv_ext_struc | Private | external | structure | Vertraege_Privat_ES_GB_YYYY | +| priv_int_cont | Private | internal | content | Vertraege_Privat_W_FH_YYYY | +| priv_int_struc | Private | internal | structure | Vertraege_Privat_W_GB_YYYY | + +The dataset from the Mobiliar contains the following categories of **claims**: + +| Name in swafi | Client | Ext/Int | Object | Flood type | Original file names | +|---------------------|---------|----------|-----------|------------|-----------------------------------| +| sme_ext_cont_pluv | SME | external | content | pluvial | Ueberschwemmung_pluvial_KMU_FH | +| sme_ext_cont_fluv | SME | external | content | fluvial | Ueberschwemmung_fluvial_KMU_FH | +| sme_ext_struc_pluv | SME | external | structure | pluvial | Ueberschwemmung_pluvial_KMU_GB | +| sme_ext_struc_fluv | SME | external | structure | fluvial | Ueberschwemmung_fluvial_KMU_GB | +| sme_int_cont | SME | internal | content | | Wasser_KMU_FH | +| sme_int_struc | SME | internal | structure | | Wasser_KMU_GB | +| priv_ext_cont_pluv | Private | external | content | pluvial | Ueberschwemmung_pluvial_Privat_FH | +| priv_ext_cont_fluv | Private | external | content | fluvial | Ueberschwemmung_fluvial_Privat_FH | +| priv_ext_struc_pluv | Private | external | structure | pluvial | Ueberschwemmung_pluvial_Privat_GB | +| priv_ext_struc_fluv | Private | external | structure | fluvial | Ueberschwemmung_fluvial_Privat_GB | +| priv_int_cont | Private | internal | content | | Wasser_Privat_FH | +| priv_int_struc | Private | internal | structure | | Wasser_Privat_GB | + +#### DamagesGVZ +`DamagesGVZ` from the file `swafi/damages_gvz.py`: handles the claims from the GVZ (Building insurance Canton Zurich) as netCDF. +The dataset from the GVZ contains a single category of **exposure** (contracts): `all_buildings`, and the following categories of **claims**: + +| Name in swafi | Original tag | +|---------------------|---------------| +| most_likely_pluvial | A | +| likely_pluvial | A, B | +| fluvial_or_pluvial | A, B, C, D, E | +| likely_fluvial | D, E | +| most_likely_fluvial | E | These classes are subclasses of the `Damages` class and implement the data loading according to the corresponding file format as well as their specific classification. @@ -273,7 +289,7 @@ All the hyperparameters of the model can be set as options of the script. 
The model can be trained using the following command: ```bash -train_dl_occurrence.py [-h] [--run-id RUN_ID] [--optimize-with-optuna] +train_cnn_occurrence.py [-h] [--run-id RUN_ID] [--optimize-with-optuna] [--target-type TARGET_TYPE] [--factor-neg-reduction FACTOR_NEG_REDUCTION] [--weight-denominator WEIGHT_DENOMINATOR] diff --git a/config_example.yaml b/config_example.yaml index 0e1ee31..ada8456 100644 --- a/config_example.yaml +++ b/config_example.yaml @@ -9,8 +9,8 @@ YEAR_END_MOBILIAR: 2022 YEAR_START_GVZ: 2005 YEAR_END_GVZ: 2022 -# CID (cells IDs) raster path -CID_PATH: '..\files\cids.tif' +# CID (cells IDs) raster path (not needed for Switzerland) +CID_PATH: 'path/to/cids.tif' # Contract and damage data directories DIR_EXPOSURE_MOBILIAR: '' @@ -19,7 +19,7 @@ DIR_EXPOSURE_GVZ: '' DIR_CLAIMS_GVZ: '' # Path to the events parquet file -EVENTS_PATH: '' +EVENTS_PATH: 'path/to/prec_events_2005-2023_no_smoothing.parquet' # Path to the precipitation directory DIR_PRECIP: '' diff --git a/requirements-optional.txt b/requirements-optional.txt index 2e40744..86d8ea3 100644 --- a/requirements-optional.txt +++ b/requirements-optional.txt @@ -6,3 +6,4 @@ optuna asyncpg psycopg2 psycopg2-binary +plotly diff --git a/scripts/data_analyses/analyze_damage_data.py b/scripts/data_analyses/analyze_damage_data.py new file mode 100644 index 0000000..94afec5 --- /dev/null +++ b/scripts/data_analyses/analyze_damage_data.py @@ -0,0 +1,186 @@ +""" +This script analyzes the distribution of the number of contracts and claims per cell. +""" + +from swafi.config import Config +from swafi.damages_mobiliar import DamagesMobiliar +from swafi.damages_gvz import DamagesGvz +import pandas as pd +import matplotlib.pyplot as plt + +config = Config(output_dir='analysis_damage_distribution') +output_dir = config.output_dir + +PICKLES_DIR = config.get('PICKLES_DIR') +DATASET = 'mobiliar' # 'mobiliar' or 'gvz' + +if DATASET == 'mobiliar': + EXPOSURE_CATEGORIES = ['external'] + CLAIM_CATEGORIES = ['external', 'pluvial'] +elif DATASET == 'gvz': + EXPOSURE_CATEGORIES = ['all_buildings'] + CLAIM_CATEGORIES = ['likely_pluvial'] + + +def main(): + if DATASET == 'mobiliar': + damages = DamagesMobiliar(dir_exposure=config.get('DIR_EXPOSURE_MOBILIAR'), + dir_claims=config.get('DIR_CLAIMS_MOBILIAR'), + year_start=config.get('YEAR_START_MOBILIAR'), + year_end=config.get('YEAR_END_MOBILIAR')) + elif DATASET == 'gvz': + damages = DamagesGvz(dir_exposure=config.get('DIR_EXPOSURE_GVZ'), + dir_claims=config.get('DIR_CLAIMS_GVZ'), + year_start=config.get('YEAR_START_GVZ'), + year_end=config.get('YEAR_END_GVZ')) + else: + raise ValueError(f'Dataset {DATASET} not recognized.') + + # Format the date of the claims + df_claims = damages.claims + df_claims['date_claim'] = pd.to_datetime( + df_claims['date_claim'], errors='coerce') + + # Compute the monthly sum of claims for each month per category + df_claims_month = df_claims.copy() + df_claims_month['month'] = df_claims_month['date_claim'].dt.month + df_claims_month = df_claims_month.drop( + columns=['date_claim', 'mask_index', 'cid', 'x', 'y']) + df_claims_month_sum = df_claims_month.groupby('month').sum() + + # Plot the monthly distribution of the total # of claims for different categories + for category in damages.claim_categories: + sum_claims = df_claims_month_sum[category].sum() + plt.figure(figsize=(8, 4)) + plt.title(f'Monthly distribution of the claims for category {category} ' + f'(total: {sum_claims})') + plt.xlabel('Month') + plt.ylabel('Percentage of claims [%]') + nb_annual_claims = 
df_claims_month_sum[category] / sum_claims + plt.bar(df_claims_month_sum.index, 100 * nb_annual_claims) + plt.xticks(range(1, 13)) + plt.tight_layout() + plt.savefig(output_dir / f'monthly_distribution_tot_claims_{category}.png') + plt.savefig(output_dir / f'monthly_distribution_tot_claims_{category}.pdf') + plt.close() + + # For the whole domain, aggregate by date (sum) + df_claims_date = df_claims.copy() + df_claims_date = df_claims_date.drop( + columns=['mask_index', 'cid', 'x', 'y']) + df_claims_date = df_claims_date.groupby('date_claim').sum() + df_claims_date['date_claim'] = pd.to_datetime(df_claims_date.index, errors='coerce') + df_claims_date['month'] = df_claims_date['date_claim'].dt.month + + # Plot the monthly distribution of the mean # of claims for different categories + for category in damages.claim_categories: + df_claims_date_cat = df_claims_date.copy() + df_claims_date_cat = df_claims_date_cat[df_claims_date_cat[category] > 0] + df_claims_date_cat = df_claims_date_cat.groupby('month').mean() + plt.figure(figsize=(8, 4)) + plt.title(f'Monthly distribution of the mean # of claims / event for category {category}') + plt.xlabel('Month') + plt.ylabel('Mean number of claims') + plt.bar(df_claims_date_cat.index, df_claims_date_cat[category]) + plt.xticks(range(1, 13)) + plt.tight_layout() + plt.savefig(output_dir / f'monthly_distribution_mean_claims_{category}.png') + plt.savefig(output_dir / f'monthly_distribution_mean_claims_{category}.pdf') + plt.close() + + # Select the categories of interest + damages.select_categories_type(EXPOSURE_CATEGORIES, CLAIM_CATEGORIES) + + # Analyze the occurrences of damages to the structure and/or content + if DATASET == 'mobiliar': + df_mobi = damages.claims + + # Sum priv and sme + df_mobi['ext_struc_pluv'] = ( + df_mobi['priv_ext_struc_pluv'] + df_mobi['sme_ext_struc_pluv']) + df_mobi['ext_cont_pluv'] = ( + df_mobi['priv_ext_cont_pluv'] + df_mobi['sme_ext_cont_pluv']) + df_mobi['ext_both_pluv'] = ( + df_mobi[['ext_struc_pluv', 'ext_cont_pluv']].min(axis=1)) + df_mobi['ext_struc_only_pluv'] = ( + df_mobi['ext_struc_pluv'] - df_mobi['ext_both_pluv']) + df_mobi['ext_cont_only_pluv'] = ( + df_mobi['ext_cont_pluv'] - df_mobi['ext_both_pluv']) + + nb_both = df_mobi['ext_both_pluv'].sum() + nb_struc = df_mobi['ext_struc_only_pluv'].sum() + nb_cont = df_mobi['ext_cont_only_pluv'].sum() + nb_tot = nb_both + nb_struc + nb_cont + pc_both = 100 * nb_both / nb_tot + pc_struc = 100 * nb_struc / nb_tot + pc_cont = 100 * nb_cont / nb_tot + + print(f"Number of claims with both structure and content: {pc_both:.2f}%") + print(f"Number of claims with structure only: {pc_struc:.2f}%") + print(f"Number of claims with content only: {pc_cont:.2f}%") + + # Analyze the distribution of the number of contracts and claims per cell + df_contracts = damages.exposure + df_contracts = df_contracts[['mask_index', 'selection', 'cid']] + + # Average the number of annual contracts per location + df_contracts = df_contracts.groupby('mask_index').mean() + + # Plot the histogram of the number of contracts per cell + plt.figure() + plt.title('Histogram of the number of contracts per cell') + plt.xlabel('Number of contracts') + plt.ylabel('Number of cells') + plt.hist(df_contracts['selection'], bins=100) + plt.yscale('log') + plt.xlim(0, None) + plt.tight_layout() + plt.savefig(output_dir / 'histogram_contracts.png') + plt.savefig(output_dir / 'histogram_contracts.pdf') + + df_claims = damages.claims + df_claims = df_claims[['mask_index', 'selection']] + + # Sum the number of claims 
per location and divide by the number of years + df_claims = df_claims.groupby('mask_index').sum() + n_years = damages.year_end - damages.year_start + 1 + df_claims['selection'] = df_claims['selection'] / n_years + + # Plot the histogram of the number of claims per cell + plt.figure() + plt.title('Histogram of the number of annual claims per cell') + plt.xlabel('Number of claims') + plt.ylabel('Number of cells') + plt.hist(df_claims['selection'], bins=50) + plt.yscale('log') + plt.xlim(0, None) + plt.tight_layout() + plt.savefig(output_dir / 'histogram_claims.png') + plt.savefig(output_dir / 'histogram_claims.pdf') + + # Merge the contracts and claims dataframes on the index + df_merged = df_contracts.merge(df_claims, left_index=True, + right_index=True, how='left') + + # Rename the columns + df_merged.columns = ['contracts', 'cid', 'claims'] + + # Replace nan values with 0 + df_merged.fillna(0, inplace=True) + + # Plot the relationship between the number of contracts and the number of claims + plt.figure() + plt.title('Relationship between the number of contracts and the claims') + plt.xlabel('Number of contracts (mean per cell)') + plt.ylabel('Mean number of annual claims (sum per cell)') + plt.scatter(df_merged['contracts'], df_merged['claims'], + facecolors='none', edgecolors='k') + plt.xscale('log') + plt.yscale('log') + plt.tight_layout() + plt.savefig(output_dir / 'scatter_contracts_claims.png') + plt.savefig(output_dir / 'scatter_contracts_claims.pdf') + + +if __name__ == '__main__': + main() diff --git a/scripts/data_analyses/analyze_damage_distribution.py b/scripts/data_analyses/analyze_damage_distribution.py deleted file mode 100644 index 76adc67..0000000 --- a/scripts/data_analyses/analyze_damage_distribution.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -This script analyzes the distribution of the number of contracts and claims per cell. 
-""" - -from swafi.config import Config -from swafi.damages_mobiliar import DamagesMobiliar -import matplotlib.pyplot as plt - -CONFIG = Config() - -EXPOSURE_CATEGORIES = ['external'] -CLAIM_CATEGORIES = ['external', 'pluvial'] -PICKLES_DIR = CONFIG.get('PICKLES_DIR') - -config = Config(output_dir='analysis_damage_distribution') -output_dir = config.output_dir - - -def main(): - damages = DamagesMobiliar(dir_exposure=CONFIG.get('DIR_EXPOSURE'), - dir_claims=CONFIG.get('DIR_CLAIMS')) - damages.select_categories_type(EXPOSURE_CATEGORIES, CLAIM_CATEGORIES) - - df_contracts = damages.exposure - df_contracts = df_contracts[['mask_index', 'selection', 'cid']] - - # Average the number of annual contracts per location - df_contracts = df_contracts.groupby('mask_index').mean() - - # Plot the histogram of the number of contracts per cell - plt.figure() - plt.title('Histogram of the number of contracts per cell') - plt.xlabel('Number of contracts') - plt.ylabel('Number of cells') - plt.hist(df_contracts['selection'], bins=100) - plt.yscale('log') - plt.xlim(0, None) - plt.tight_layout() - plt.savefig(output_dir / 'histogram_contracts.png') - plt.savefig(output_dir / 'histogram_contracts.pdf') - - df_claims = damages.claims - df_claims = df_claims[['mask_index', 'selection']] - - # Sum the number of claims per location and divide by the number of years - df_claims = df_claims.groupby('mask_index').sum() - n_years = damages.year_end - damages.year_start + 1 - df_claims['selection'] = df_claims['selection'] / n_years - - # Plot the histogram of the number of claims per cell - plt.figure() - plt.title('Histogram of the number of annual claims per cell') - plt.xlabel('Number of claims') - plt.ylabel('Number of cells') - plt.hist(df_claims['selection'], bins=50) - plt.yscale('log') - plt.xlim(0, None) - plt.tight_layout() - plt.savefig(output_dir / 'histogram_claims.png') - plt.savefig(output_dir / 'histogram_claims.pdf') - - # Merge the contracts and claims dataframes on the index - df_merged = df_contracts.merge(df_claims, left_index=True, - right_index=True, how='left') - - # Rename the columns - df_merged.columns = ['contracts', 'cid', 'claims'] - - # Replace nan values with 0 - df_merged.fillna(0, inplace=True) - - # Plot the relationship between the number of contracts and the number of claims - plt.figure() - plt.title('Relationship between the number of contracts and the claims') - plt.xlabel('Number of contracts (mean per cell)') - plt.ylabel('Mean number of annual claims (sum per cell)') - plt.scatter(df_merged['contracts'], df_merged['claims'], - facecolors='none', edgecolors='k') - plt.xscale('log') - plt.yscale('log') - plt.tight_layout() - plt.savefig(output_dir / 'scatter_contracts_claims.png') - plt.savefig(output_dir / 'scatter_contracts_claims.pdf') - - # Plot the number of claims for different number of contracts - contracts_ranges = [(0, 2), (0, 5), (0, 10), (10, 100), (100, 9000)] - for cont_range in contracts_ranges: - v_min = cont_range[0] - v_max = cont_range[1] - - plt.figure() - plt.title(f'Number of annual claims for {v_min} < contracts < {v_max}') - plt.xlabel('Number of claims') - plt.ylabel('Number of cells') - plt.hist(df_merged[(df_merged['contracts'] > v_min) & - (df_merged['contracts'] < v_max)]['claims'], bins=50) - plt.yscale('log') - plt.xlim(0, None) - plt.tight_layout() - plt.savefig(output_dir / f'histogram_claims_{v_min}_{v_max}.png') - plt.savefig(output_dir / f'histogram_claims_{v_min}_{v_max}.pdf') - - -if __name__ == '__main__': - main() diff --git 
a/scripts/data_analyses/analyze_precipitation_for_claims.py b/scripts/data_analyses/analyze_precipitation_for_claims.py new file mode 100644 index 0000000..5c07ecf --- /dev/null +++ b/scripts/data_analyses/analyze_precipitation_for_claims.py @@ -0,0 +1,222 @@ +""" +This script analyzes the precipitation data characteristics for each claim. +""" +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +from swafi.config import Config +from swafi.precip_combiprecip import CombiPrecip +from swafi.damages_mobiliar import DamagesMobiliar +from swafi.damages_gvz import DamagesGvz + +config = Config(output_dir='analysis_precip_claims') + +DATASET = 'mobiliar' # 'mobiliar' or 'gvz' +PRECIP_DAYS_BEFORE = 2 +PRECIP_DAYS_AFTER = 2 + +if DATASET == 'mobiliar': + EXPOSURE_CATEGORIES = ['external'] + CLAIM_CATEGORIES = ['external', 'pluvial'] +elif DATASET == 'gvz': + EXPOSURE_CATEGORIES = ['all_buildings'] + CLAIM_CATEGORIES = ['likely_pluvial'] + + +def main(): + generate_csv() + generate_plots() + + +def generate_csv(): + # Load the damage data + if DATASET == 'mobiliar': + damages = DamagesMobiliar(dir_exposure=config.get('DIR_EXPOSURE_MOBILIAR'), + dir_claims=config.get('DIR_CLAIMS_MOBILIAR'), + year_start=config.get('YEAR_START_MOBILIAR'), + year_end=config.get('YEAR_END_MOBILIAR')) + elif DATASET == 'gvz': + damages = DamagesGvz(dir_exposure=config.get('DIR_EXPOSURE_GVZ'), + dir_claims=config.get('DIR_CLAIMS_GVZ'), + year_start=config.get('YEAR_START_GVZ'), + year_end=config.get('YEAR_END_GVZ')) + else: + raise ValueError(f'Dataset {DATASET} not recognized.') + + # Select the categories of interest + damages.select_categories_type(EXPOSURE_CATEGORIES, CLAIM_CATEGORIES) + + # Compute the precipitation time series for each claim + claims = damages.claims + + # Extract CIDs with claims + cids = damages.claims['cid'].unique() + + # Load CombiPrecip files + precip = CombiPrecip() + precip.load_data(config.get('DIR_PRECIP')) + + # Add columns to the claims dataframe + claims['precip_max'] = None + claims['precip_max_q'] = None + claims['precip_dt'] = None + claims['precip_06h_max'] = None + claims['precip_12h_max'] = None + claims['precip_24h_max'] = None + + for cid in cids: + print(f'Processing CID {cid}') + precip_cid_q = precip.compute_quantiles_cid(cid) + + for idx, claim in claims[claims['cid'] == cid].iterrows(): + start = claim['date_claim'] - pd.Timedelta(days=PRECIP_DAYS_BEFORE) + end = claim['date_claim'] + pd.Timedelta(days=PRECIP_DAYS_AFTER) + precip_ts = precip.get_time_series(cid, start, end) + precip_ts = precip_ts.flatten() + precip_q_ts = precip_cid_q.sel(time=slice(start, end)) + precip_q_ts = precip_q_ts['precip'].to_numpy() + + # Compute the max precipitation + claims.loc[idx, 'precip_max'] = precip_ts.max() + claims.loc[idx, 'precip_max_q'] = precip_q_ts.max() + + # Compute centrality of the max precipitation + if precip_ts.max() > 0: + idx_max = precip_ts.argmax() + claims.loc[idx, 'precip_dt'] = idx_max - len(precip_ts) // 2 + + # Compute a precipitation sum on rolling windows + precip_ts_6h = np.convolve(precip_ts, np.ones(6), mode='valid') + claims.loc[idx, 'precip_06h_max'] = precip_ts_6h.max() + precip_ts_12h = np.convolve(precip_ts, np.ones(12), mode='valid') + claims.loc[idx, 'precip_12h_max'] = precip_ts_12h.max() + precip_ts_24h = np.convolve(precip_ts, np.ones(24), mode='valid') + claims.loc[idx, 'precip_24h_max'] = precip_ts_24h.max() + + # Save the claims dataframe + filename = f'claims_precip_{DATASET}.csv' + claims.to_csv(config.output_dir / 
filename, index=False) + + +def generate_plots(): + # Load csv + filename = f'claims_precip_{DATASET}.csv' + claims = pd.read_csv(config.output_dir / filename) + + # Copy of the claims with positive precipitation only + claims_pos = claims[claims['precip_max'] > 0].copy() + + # Plot a histogram of the max precipitation (all) + filename = f'hist_precip_max_{DATASET}_all.png' + claims['precip_max'].hist(bins=50) + plt.title('Max precipitation intensity (all)') + plt.xlabel('Precipitation intensity [mm/h]') + plt.grid(axis='x') + plt.tight_layout() + plt.savefig(config.output_dir / filename) + plt.close() + + # Plot a histogram of the max precipitation (>0) + filename = f'hist_precip_max_{DATASET}_pos.png' + claims_pos['precip_max'].hist(bins=50) + plt.title('Max precipitation intensity (>0)') + plt.xlabel('Precipitation intensity [mm/h]') + plt.grid(axis='x') + plt.tight_layout() + plt.savefig(config.output_dir / filename) + plt.close() + + # Plot a histogram of the max precipitation quantile (all) + filename = f'hist_precip_max_q_{DATASET}_all.png' + claims['precip_max_q'].hist(bins=50) + plt.title('Quantiles of max precipitation intensity (all)') + plt.grid(axis='x') + plt.tight_layout() + plt.savefig(config.output_dir / filename) + plt.close() + + # Plot a histogram of the max precipitation quantile (>0) + filename = f'hist_precip_max_q_{DATASET}_pos.png' + claims_pos['precip_max_q'].hist(bins=50) + plt.title('Quantiles of max precipitation intensity (>0)') + plt.grid(axis='x') + plt.tight_layout() + plt.savefig(config.output_dir / filename) + plt.close() + + # Plot a histogram of the time to max precipitation + filename = f'hist_precip_dt_{DATASET}.png' + nbins = int((PRECIP_DAYS_BEFORE + PRECIP_DAYS_AFTER + 1) * 24 / 2) + claims['precip_dt'].hist(bins=nbins) + plt.xlim(-(12 + PRECIP_DAYS_BEFORE * 24), 12 + PRECIP_DAYS_AFTER * 24) + plt.title('Time to max intensity') + plt.xlabel('Time to max intensity [h]') + plt.grid(axis='x') + plt.tight_layout() + plt.savefig(config.output_dir / filename) + plt.close() + + # Plot a histogram of the max precipitation in 6h (all) + filename = f'hist_precip_06h_max_{DATASET}_all.png' + claims['precip_06h_max'].hist(bins=50) + plt.title('Max 6-hrly precipitation total (all)') + plt.xlabel('Precipitation total over 6h [mm]') + plt.grid(axis='x') + plt.tight_layout() + plt.savefig(config.output_dir / filename) + plt.close() + + # Plot a histogram of the max precipitation in 6h (>0) + filename = f'hist_precip_06h_max_{DATASET}_pos.png' + claims_pos['precip_06h_max'].hist(bins=50) + plt.title('Max 6-hrly precipitation total (>0)') + plt.xlabel('Precipitation total over 6h [mm]') + plt.grid(axis='x') + plt.tight_layout() + plt.savefig(config.output_dir / filename) + plt.close() + + # Plot a histogram of the max precipitation in 12h (all) + filename = f'hist_precip_12h_max_{DATASET}_all.png' + claims['precip_12h_max'].hist(bins=50) + plt.title('Max 12-hrly precipitation total (all)') + plt.xlabel('Precipitation total over 12h [mm]') + plt.grid(axis='x') + plt.tight_layout() + plt.savefig(config.output_dir / filename) + plt.close() + + # Plot a histogram of the max precipitation in 12h (>0) + filename = f'hist_precip_12h_max_{DATASET}_pos.png' + claims_pos['precip_12h_max'].hist(bins=50) + plt.title('Max 12-hrly precipitation total (>0)') + plt.xlabel('Precipitation total over 12h [mm]') + plt.grid(axis='x') + plt.tight_layout() + plt.savefig(config.output_dir / filename) + plt.close() + + # Plot a histogram of the max precipitation in 24h (all) + filename = 
f'hist_precip_24h_max_{DATASET}_all.png' + claims['precip_24h_max'].hist(bins=50) + plt.title('Max 24-hrly precipitation total (all)') + plt.xlabel('Precipitation total over 24h [mm]') + plt.grid(axis='x') + plt.tight_layout() + plt.savefig(config.output_dir / filename) + plt.close() + + # Plot a histogram of the max precipitation in 24h (>0) + filename = f'hist_precip_24h_max_{DATASET}_pos.png' + claims_pos['precip_24h_max'].hist(bins=50) + plt.title('Max 24-hrly precipitation total (>0)') + plt.xlabel('Precipitation total over 24h [mm]') + plt.grid(axis='x') + plt.tight_layout() + plt.savefig(config.output_dir / filename) + plt.close() + + +if __name__ == '__main__': + main() diff --git a/scripts/data_preparation/extract_precipitation_events.py b/scripts/data_preparation/extract_precipitation_events.py index ad4c6b6..9e3cc45 100644 --- a/scripts/data_preparation/extract_precipitation_events.py +++ b/scripts/data_preparation/extract_precipitation_events.py @@ -14,7 +14,7 @@ config = Config() -domain = Domain(config.get('CID_PATH')) +domain = Domain() # Get quantile data to select the coordinates for the event calculation diff --git a/scripts/data_preparation/extract_precipitation_time_series.py b/scripts/data_preparation/extract_precipitation_time_series.py new file mode 100644 index 0000000..d0944c5 --- /dev/null +++ b/scripts/data_preparation/extract_precipitation_time_series.py @@ -0,0 +1,48 @@ +""" +Extracts the precipitation time series from the raw data and saves it as a netCDF files. +""" +import argparse +import numpy as np + +from swafi.config import Config +from swafi.precip_combiprecip import CombiPrecip + +config = Config() + +START_DATE = '2005-01-01' +END_DATE = '2022-12-31' + + +def main(args): + # Load CombiPrecip files + precip = CombiPrecip() + precip.load_data(config.get('DIR_PRECIP')) + + cids = np.unique(precip.domain.cids['ids_map']) + cids = cids[cids != 0] + + if args.start: + cids = cids[cids >= int(args.start)] + if args.end: + cids = cids[cids <= int(args.end)] + + if args.reverse: + cids = cids[::-1] + + for cid in cids: + print(f'Processing CID {cid}') + precip.save_nc_file_per_cid(cid, START_DATE, END_DATE) + + +if __name__ == '__main__': + # Initialize parser + parser = argparse.ArgumentParser() + parser.add_argument("-s", "--start", help="Starting CID") + parser.add_argument("-e", "--end", help="Ending CID") + parser.add_argument('--reverse', action='store_true', + help='Process in reverse order') + + # Read arguments from command line + args = parser.parse_args() + + main(args) diff --git a/scripts/data_preparation/extract_static_data_to_csv.py b/scripts/data_preparation/extract_static_data_to_csv.py index 4fe9315..0a92cb7 100644 --- a/scripts/data_preparation/extract_static_data_to_csv.py +++ b/scripts/data_preparation/extract_static_data_to_csv.py @@ -26,7 +26,7 @@ def main(): - domain = Domain(config.get('CID_PATH')) + domain = Domain() # List all .tif files in the source directory tif_files = [f for f in data_dir.glob('*.tif')] diff --git a/scripts/impact_functions/train_dl_occurrence.py b/scripts/impact_functions/train_cnn_occurrence.py similarity index 55% rename from scripts/impact_functions/train_dl_occurrence.py rename to scripts/impact_functions/train_cnn_occurrence.py index 67f7f7d..d1dc899 100644 --- a/scripts/impact_functions/train_dl_occurrence.py +++ b/scripts/impact_functions/train_cnn_occurrence.py @@ -9,36 +9,33 @@ from glob import glob from swafi.config import Config -from swafi.impact_dl import ImpactDeepLearning, 
ImpactDeepLearningOptions +from swafi.impact_cnn import ImpactCnn, ImpactCnnOptions from swafi.events import load_events_from_pickle from swafi.precip_combiprecip import CombiPrecip has_optuna = False try: import optuna + has_optuna = True - from optuna.storages import RDBStorage + from optuna.storages import RDBStorage, JournalStorage, JournalFileStorage except ImportError: pass +USE_SQLITE = False +USE_TXTFILE = True +OPTUNA_RANDOM = True DATASET = 'gvz' # 'mobiliar' or 'gvz' LABEL_EVENT_FILE = 'original_w_prior_pluvial_occurrence' +SAVE_MODEL = True config = Config() MISSING_DATES = CombiPrecip.missing -# Additional missing dates for ZH region (specific radar data) -MISSING_DATES.extend([ - ('2005-01-06', '2005-01-06'), - ('2009-05-02', '2009-05-02'), - ('2017-04-16', '2017-04-16'), - ('2022-07-04', '2022-07-04'), - ('2022-12-30', '2022-12-30') -]) def main(): - options = ImpactDeepLearningOptions() + options = ImpactCnnOptions() options.parse_args() # options.parser.print_help() options.print() @@ -66,39 +63,36 @@ def main(): dem = rxr.open_rasterio(config.get('DEM_PATH'), masked=True).squeeze() # Load CombiPrecip files - data_path = config.get('DIR_PRECIP') - files = sorted(glob(f"{data_path}/*.nc")) - precip = xr.open_mfdataset(files, parallel=False) - precip = precip.rename_vars({'CPC': 'precip'}) - precip = precip.rename({'REFERENCE_TS': 'time'}) - if options.use_dem: - precip = precip.sel(x=dem.x, y=dem.y) # Select the same domain as the DEM + precip = CombiPrecip() + precip.set_data_path(config.get('DIR_PRECIP')) if not options.optimize_with_optuna: - dl = _setup_model(options, events, precip, dem) - dl.fit(dir_plots=config.get('OUTPUT_DIR'), - tag=options.run_name) - dl.assess_model_on_all_periods() + cnn = _setup_model(options, events, precip, dem) + cnn.fit(dir_plots=config.get('OUTPUT_DIR'), + tag=options.run_name) + cnn.assess_model_on_all_periods() + if SAVE_MODEL: + cnn.save_model(dir_output=config.get('OUTPUT_DIR')) else: - dl = optimize_model_with_optuna(options, events, precip, dem, - dir_plots=config.get('OUTPUT_DIR')) - dl.assess_model_on_all_periods() + cnn = optimize_model_with_optuna(options, events, precip, dem, + dir_plots=config.get('OUTPUT_DIR')) + cnn.assess_model_on_all_periods() def _setup_model(options, events, precip, dem): - dl = ImpactDeepLearning(events, options=options) - dl.set_dem(dem) - dl.set_precipitation(precip) - if dl.options.use_simple_features: - dl.select_features(dl.options.simple_features) - dl.load_features(dl.options.simple_feature_classes) - dl.split_sample() - dl.reduce_negatives_for_training(dl.options.factor_neg_reduction) - dl.compute_balanced_class_weights() - dl.compute_corrected_class_weights( - weight_denominator=dl.options.weight_denominator) - return dl + cnn = ImpactCnn(events, options=options) + cnn.set_dem(dem) + cnn.set_precipitation(precip) + if cnn.options.use_simple_features: + cnn.select_features(cnn.options.simple_features) + cnn.load_features(cnn.options.simple_feature_classes) + cnn.split_sample() + cnn.reduce_negatives_for_training(cnn.options.factor_neg_reduction) + cnn.compute_balanced_class_weights() + cnn.compute_corrected_class_weights( + weight_denominator=cnn.options.weight_denominator) + return cnn def optimize_model_with_optuna(options, events, precip=None, dem=None, dir_plots=None): @@ -107,11 +101,11 @@ def optimize_model_with_optuna(options, events, precip=None, dem=None, dir_plots Parameters ---------- - options: ImpactDeepLearningOptions + options: ImpactCnnOptions The options. 
events: pd.DataFrame The events. - precip: xr.Dataset|None + precip: Precipitation|None The precipitation data. dem: xr.Dataset|None The DEM data. @@ -121,9 +115,17 @@ def optimize_model_with_optuna(options, events, precip=None, dem=None, dir_plots if not has_optuna: raise ValueError("Optuna is not installed") - storage = RDBStorage( - url=f'sqlite:///{options.optuna_study_name}.db' - ) + if USE_SQLITE: + storage = RDBStorage( + url=f'sqlite:///{options.optuna_study_name}.db', + engine_kwargs={"connect_args": {"timeout": 60.0}} + ) + elif USE_TXTFILE: + storage = JournalStorage( + JournalFileStorage(f"{options.optuna_study_name}.log") + ) + else: + raise ValueError("No storage specified") def optuna_objective(trial): """ @@ -141,19 +143,32 @@ def optuna_objective(trial): """ options_c = options.copy() options_c.generate_for_optuna(trial) - dl_trial = _setup_model(options_c, events, precip, dem) + cnn_trial = _setup_model(options_c, events, precip, dem) # Fit the model - dl_trial.fit(do_plot=False, silent=True) + cnn_trial.fit(do_plot=False, silent=True) # Assess the model - score = dl_trial.compute_f1_score(dl_trial.dg_val) + score = cnn_trial.compute_f1_score(cnn_trial.dg_val) return score - study = optuna.load_study( - study_name=options.optuna_study_name, storage=storage, direction='maximize' - ) + sampler = None + if OPTUNA_RANDOM: + sampler = optuna.samplers.RandomSampler() + + if USE_SQLITE or USE_TXTFILE: + study = optuna.load_study( + study_name=options.optuna_study_name, + storage=storage, + sampler=sampler + ) + else: + study = optuna.create_study( + study_name=options.optuna_study_name, + direction='maximize', + sampler=sampler + ) study.optimize(optuna_objective, n_trials=options.optuna_trials_nb) print("Number of finished trials: ", len(study.trials)) @@ -167,10 +182,10 @@ def optuna_objective(trial): # Fit the model with the best parameters options_best = options.copy() options_best.generate_for_optuna(best_trial) - dl = _setup_model(options_best, events, precip, dem) - dl.fit(dir_plots=dir_plots, tag='best_optuna_' + dl.options.run_name) + cnn = _setup_model(options_best, events, precip, dem) + cnn.fit(dir_plots=dir_plots, tag='best_optuna_' + cnn.options.run_name) - return dl + return cnn if __name__ == '__main__': diff --git a/swafi/config.py b/swafi/config.py index 1d73de8..ccb52d3 100644 --- a/swafi/config.py +++ b/swafi/config.py @@ -36,10 +36,12 @@ def create_output_directory(self, output_dir): self.output_dir = self.output_dir / output_dir self.output_dir.mkdir(parents=True, exist_ok=True) - def get(self, key, default=None): + def get(self, key, default=None, do_raise=True): if key not in self.config: if default: return default + if not do_raise: + return None raise RuntimeError(f"The entry '{key}' was not found in the config file.") return self.config[key] diff --git a/swafi/damages.py b/swafi/damages.py index b41f85b..aa050c0 100644 --- a/swafi/damages.py +++ b/swafi/damages.py @@ -404,6 +404,17 @@ def _remove_claims_with_no_event(self): self.claims = self.claims[self.claims.eid != 0] self.claims.reset_index(inplace=True, drop=True) + def _remove_data_outside_period(self): + # Ensure the 'date_claim' column is of datetime type + self.claims['date_claim'] = pd.to_datetime( + self.claims['date_claim'], errors='coerce') + self.claims = self.claims[ + (self.claims.date_claim.dt.year >= self.year_start) & + (self.claims.date_claim.dt.year <= self.year_end)] + self.exposure = self.exposure[ + (self.exposure.year >= self.year_start) & + (self.exposure.year <= 
self.year_end)] + @staticmethod def _get_best_candidate(pot_events, window_days, stats): best_matches = pot_events.loc[pot_events['match_score'] == @@ -497,7 +508,7 @@ def _compute_temporal_overlap(date_claim, pot_events, window): overlap_window_end = min(date_window_end, event['e_end']) overlap = overlap_window_end - overlap_window_start overlap_hrs = max(0.0, overlap.total_seconds() / 3600) - pot_events.at[i, 'overlap_hrs'] = overlap_hrs + pot_events.at[i, 'overlap_hrs'] = int(round(overlap_hrs)) @staticmethod def _compute_prior_to_claim(date_claim, pot_events): diff --git a/swafi/damages_gvz.py b/swafi/damages_gvz.py index 802fb62..930acb3 100644 --- a/swafi/damages_gvz.py +++ b/swafi/damages_gvz.py @@ -39,7 +39,7 @@ def __init__(self, cid_file=None, year_start=None, year_end=None, use_dump=True, dir_claims: str The path to the directory containing the claim files. pickle_file: str - The path to a pickle file to load. + The name of a pickle file to load. pickle_dir: str The path to the working directory for pickle files """ @@ -66,7 +66,7 @@ def __init__(self, cid_file=None, year_start=None, year_end=None, use_dump=True, 'B'] self._create_exposure_claims_df() - self._load_from_dump('damages_gvz') + self._load_from_dump('damages_gvz.pickle') if dir_exposure is not None: self.load_exposure(dir_exposure) @@ -77,6 +77,8 @@ def __init__(self, cid_file=None, year_start=None, year_end=None, use_dump=True, if pickle_file is not None: self.load_from_pickle(pickle_file) + self._remove_data_outside_period() + def get_claim_categories_from_type(self, types): """ Get the claim categories from types. @@ -113,6 +115,9 @@ def get_claim_categories_from_type(self, types): elif cat_type.lower() == 'most_likely_fluvial': columns = [i for i in columns if i in ['E']] continue + elif cat_type in ['A', 'B', 'C', 'D', 'E']: + columns = [i for i in columns if i in [cat_type]] + continue else: raise ValueError(f"Unknown claim type: {cat_type}") diff --git a/swafi/damages_mobiliar.py b/swafi/damages_mobiliar.py index 68567fc..247c892 100644 --- a/swafi/damages_mobiliar.py +++ b/swafi/damages_mobiliar.py @@ -38,7 +38,7 @@ def __init__(self, cid_file=None, year_start=None, year_end=None, use_dump=True, dir_claims: str The path to the directory containing the claim files. pickle_file: str - The path to a pickle file to load. + The name of a pickle file to load. pickle_dir: str The path to the working directory for pickle files """ @@ -108,7 +108,7 @@ def __init__(self, cid_file=None, year_start=None, year_end=None, use_dump=True, 'Wasser_Privat_GB'] self._create_exposure_claims_df() - self._load_from_dump('damages_mobiliar') + self._load_from_dump('damages_mobiliar.pickle') if dir_exposure is not None: self.load_exposure(dir_exposure) @@ -119,6 +119,8 @@ def __init__(self, cid_file=None, year_start=None, year_end=None, use_dump=True, if pickle_file is not None: self.load_from_pickle(pickle_file) + self._remove_data_outside_period() + def get_claim_categories_from_type(self, types): """ Get the claim categories from types. diff --git a/files/cids.tif b/swafi/data/cids_switzerland_combiprecip.tif similarity index 100% rename from files/cids.tif rename to swafi/data/cids_switzerland_combiprecip.tif diff --git a/swafi/domain.py b/swafi/domain.py index 60561a8..2d2e010 100644 --- a/swafi/domain.py +++ b/swafi/domain.py @@ -5,6 +5,7 @@ import pickle import rasterio import numpy as np +import importlib.resources as pkg_resources from pathlib import Path try: @@ -12,26 +13,45 @@ except ImportError: nc4 = None +from . 
import data from .config import Config config = Config() class Domain: - def __init__(self, cid_file=None): + def __init__(self, cid_file_path=None, + cid_file_name='cids_switzerland_combiprecip.tif', + epsg=2056): """ The Domain class. Defines the cell IDs. + + Parameters + ---------- + cid_file_path: str|Path|None + The full path to the CID file (including name). + cid_file_name: str + The name of the CID file if part of the module data. + epsg: int + The EPSG code of the projection. Default is 2056 (CH1903+ / LV95). """ - self.crs = config.get('CRS', 'EPSG:2056') + self.crs = config.get('CRS', f'EPSG:{epsg}') self.resolution = None self.cids = dict(extent=None, ids_map=np.array([]), xs=np.array([]), ys=np.array([])) - if not cid_file: - cid_file = config.get('CID_PATH') + if not cid_file_path: + cid_file_path = config.get('CID_PATH', None, False) + if not cid_file_path: + with pkg_resources.path(data, cid_file_name) as p: + cid_file_path = str(p) + + if not Path(cid_file_path).exists(): + print(f"Working directory: {Path.cwd()}") + raise FileNotFoundError(f"The CID file {cid_file_path} does not exist.") self._load_from_dump() - self._load_cid_file(cid_file) + self._load_cid_file(cid_file_path) def check_projection(self, dataset, file): """ diff --git a/swafi/impact_dl.py b/swafi/impact_cnn.py similarity index 76% rename from swafi/impact_dl.py rename to swafi/impact_cnn.py index e09cbff..a583a2c 100644 --- a/swafi/impact_dl.py +++ b/swafi/impact_cnn.py @@ -3,7 +3,7 @@ """ from .impact import Impact -from .model_dl import DeepImpact +from .model_cnn import ModelCnn from .utils.data_generator import DataGenerator from .utils.verification import compute_confusion_matrix, print_classic_scores, \ assess_roc_auc, compute_score_binary @@ -19,6 +19,7 @@ import tensorflow as tf import datetime import dask +import math has_optuna = False try: @@ -31,9 +32,9 @@ epsilon = 1e-7 # a small constant to avoid division by zero -class ImpactDeepLearningOptions: +class ImpactCnnOptions: """ - The Deep Learning Impact class options. + The CNN Deep Learning Impact class options. Attributes ---------- @@ -83,12 +84,16 @@ class ImpactDeepLearningOptions: The number of epochs. learning_rate: float The learning rate. - dropout_rate: float - The dropout rate. + dropout_rate_cnn: float + The dropout rate for the CNN. + dropout_rate_dense: float + The dropout rate for the dense layers. with_spatial_dropout: bool Whether to use spatial dropout or not. - with_batchnorm: bool - Whether to use batch normalization or not. + with_batchnorm_cnn: bool + Whether to use batch normalization or not for the CNN. + with_batchnorm_dense: bool + Whether to use batch normalization or not for the dense layers. nb_filters: int The number of filters. nb_conv_blocks: int @@ -99,12 +104,14 @@ class ImpactDeepLearningOptions: The number of dense units. nb_dense_units_decreasing: bool Whether the number of dense units should decrease or not. - inner_activation: str - The inner activation function. + inner_activation_cnn: str + The inner activation function for the CNN. + inner_activation_dense: str + The inner activation function for the dense layers. 
""" def __init__(self): - self.parser = argparse.ArgumentParser(description="SWAFI DL") + self.parser = argparse.ArgumentParser(description="SWAFI CNN") self._set_parser_arguments() # General options @@ -127,10 +134,10 @@ def __init__(self): self.precip_resolution = 0 self.precip_time_step = 0 self.precip_days_before = 0 - self.precip_days_after = 0 - self.transform_static = '' - self.transform_2d = '' - self.precip_trans_domain = '' + self.precip_days_after = 1 + self.transform_static = 'standardize' + self.transform_2d = 'normalize' + self.precip_trans_domain = 'per-pixel' self.log_transform_precip = True # Training options @@ -139,22 +146,25 @@ def __init__(self): self.learning_rate = 0 # Model options - self.dropout_rate = 0 + self.dropout_rate_cnn = 0 + self.dropout_rate_dense = 0 self.with_spatial_dropout = True - self.with_batchnorm = True + self.with_batchnorm_cnn = True + self.with_batchnorm_dense = True self.nb_filters = 0 self.nb_conv_blocks = 0 self.nb_dense_layers = 0 self.nb_dense_units = 0 - self.nb_dense_units_decreasing = True - self.inner_activation = '' + self.nb_dense_units_decreasing = False + self.inner_activation_cnn = 'relu' + self.inner_activation_dense = 'relu' def copy(self): """ Make a copy of the object. Returns ------- - ImpactDeepLearningOptions + ImpactCnnOptions The copy of the object. """ return copy.deepcopy(self) @@ -189,20 +199,23 @@ def parse_args(self): self.batch_size = args.batch_size self.epochs = args.epochs self.learning_rate = args.learning_rate - self.dropout_rate = args.dropout_rate + self.dropout_rate_cnn = args.dropout_rate_cnn + self.dropout_rate_dense = args.dropout_rate_dense self.with_spatial_dropout = not args.no_spatial_dropout - self.with_batchnorm = not args.no_batchnorm + self.with_batchnorm_cnn = not args.no_batchnorm_cnn + self.with_batchnorm_dense = not args.no_batchnorm_dense self.nb_filters = args.nb_filters self.nb_conv_blocks = args.nb_conv_blocks self.nb_dense_layers = args.nb_dense_layers self.nb_dense_units = args.nb_dense_units - self.nb_dense_units_decreasing = not args.no_dense_units_decreasing - self.inner_activation = args.inner_activation + self.nb_dense_units_decreasing = args.dense_units_decreasing + self.inner_activation_cnn = args.inner_activation_cnn + self.inner_activation_dense = args.inner_activation_dense if self.optimize_with_optuna: print("Optimizing with Optuna; some options will be ignored.") - def generate_for_optuna(self, trial): + def generate_for_optuna(self, trial, hp_to_optimize='default'): """ Generate the options for Optuna. @@ -210,10 +223,19 @@ def generate_for_optuna(self, trial): ---------- trial: optuna.Trial The trial. + hp_to_optimize: list|str + The list of hyperparameters to optimize. Can be the string 'default' + Options are: weight_denominator, precip_window_size, precip_time_step, + precip_days_before, precip_resolution, precip_days_after, transform_static, + transform_2d, precip_trans_domain, log_transform_precip, batch_size, + learning_rate, dropout_rate_dense, dropout_rate_cnn, with_spatial_dropout, + with_batchnorm_cnn, with_batchnorm_dense, nb_filters, nb_conv_blocks, + nb_dense_layers, nb_dense_units, nb_dense_units_decreasing, + inner_activation_dense, inner_activation_cnn, Returns ------- - ImpactDeepLearningOptions + ImpactCnnOptions The options. 
""" if not has_optuna: @@ -221,32 +243,101 @@ def generate_for_optuna(self, trial): assert self.optimize_with_optuna, "Optimize with Optuna is not set to True" - self.weight_denominator = trial.suggest_int('weight_denominator', 1, 100) + if isinstance(hp_to_optimize, str) and hp_to_optimize == 'default': + hp_to_optimize = ['precip_window_size', 'precip_time_step', + 'transform_static', 'transform_2d', 'precip_days_before'] + + if 'weight_denominator' in hp_to_optimize: + self.weight_denominator = trial.suggest_int( + 'weight_denominator', 1, 100) + if self.use_precip: - self.precip_window_size = trial.suggest_categorical('precip_window_size', [2, 4, 6, 8, 12]) - self.precip_resolution = trial.suggest_categorical('precip_resolution', [1]) - self.precip_time_step = trial.suggest_categorical('precip_time_step', [1, 2, 3, 4, 6, 12]) - self.precip_days_before = trial.suggest_int('precip_days_before', 1, 10) - self.precip_days_after = trial.suggest_int('precip_days_after', 1, 5) + if 'precip_window_size' in hp_to_optimize: + self.precip_window_size = trial.suggest_categorical( + 'precip_window_size', [2, 4, 6, 8, 12]) + if 'precip_resolution' in hp_to_optimize: + self.precip_resolution = trial.suggest_categorical( + 'precip_resolution', [1, 2, 4]) + if 'precip_time_step' in hp_to_optimize: + self.precip_time_step = trial.suggest_categorical( + 'precip_time_step', [1, 2, 3, 4, 6, 12, 24]) + if 'precip_days_before' in hp_to_optimize: + self.precip_days_before = trial.suggest_int( + 'precip_days_before', 1, 3) + if 'precip_days_after' in hp_to_optimize: + self.precip_days_after = trial.suggest_int( + 'precip_days_after', 1, 2) if self.use_simple_features: - self.transform_static = trial.suggest_categorical('transform_static', ['standardize', 'normalize']) + if 'transform_static' in hp_to_optimize: + self.transform_static = trial.suggest_categorical( + 'transform_static', ['standardize', 'normalize']) if self.use_precip: - self.transform_2d = trial.suggest_categorical('transform_2d', ['standardize', 'normalize']) - self.precip_trans_domain = trial.suggest_categorical('precip_trans_domain', ['domain-average', 'per-pixel']) - self.log_transform_precip = trial.suggest_categorical('log_transform_precip', [True, False]) - self.batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128]) - self.learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True) - self.dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5) + if 'transform_2d' in hp_to_optimize: + self.transform_2d = trial.suggest_categorical( + 'transform_2d', ['standardize', 'normalize']) + if 'precip_trans_domain' in hp_to_optimize: + self.precip_trans_domain = trial.suggest_categorical( + 'precip_trans_domain', ['domain-average', 'per-pixel']) + if 'log_transform_precip' in hp_to_optimize: + self.log_transform_precip = trial.suggest_categorical( + 'log_transform_precip', [True, False]) + + if 'batch_size' in hp_to_optimize: + self.batch_size = trial.suggest_categorical( + 'batch_size', [16, 32, 64, 128, 256, 512]) + if 'learning_rate' in hp_to_optimize: + self.learning_rate = trial.suggest_float( + 'learning_rate', 1e-4, 1e-2, log=True) + if 'dropout_rate_dense' in hp_to_optimize: + self.dropout_rate_dense = trial.suggest_float( + 'dropout_rate_dense', 0.0, 0.5) if self.use_precip: - self.with_spatial_dropout = trial.suggest_categorical('with_spatial_dropout', [True, False]) - self.with_batchnorm = trial.suggest_categorical('with_batchnorm', [True, False]) + if 'dropout_rate_cnn' in hp_to_optimize: + 
self.dropout_rate_cnn = trial.suggest_float( + 'dropout_rate_cnn', 0.0, 0.5) + if 'with_spatial_dropout' in hp_to_optimize: + self.with_spatial_dropout = trial.suggest_categorical( + 'with_spatial_dropout', [True, False]) + if 'with_batchnorm_cnn' in hp_to_optimize: + self.with_batchnorm_cnn = trial.suggest_categorical( + 'with_batchnorm_cnn', [True, False]) + + if 'with_batchnorm_dense' in hp_to_optimize: + self.with_batchnorm_dense = trial.suggest_categorical( + 'with_batchnorm_dense', [True, False]) if self.use_precip: - self.nb_filters = trial.suggest_categorical('nb_filters', [16, 32, 64, 128, 256]) - self.nb_conv_blocks = trial.suggest_int('nb_conv_blocks', 1, 5) - self.nb_dense_layers = trial.suggest_int('nb_dense_layers', 1, 5) - self.nb_dense_units = trial.suggest_int('nb_dense_units', 16, 512) - self.nb_dense_units_decreasing = trial.suggest_categorical('nb_dense_units_decreasing', [True, False]) - self.inner_activation = trial.suggest_categorical('inner_activation', ['relu', 'tanh', 'sigmoid']) + if 'nb_filters' in hp_to_optimize: + self.nb_filters = trial.suggest_categorical( + 'nb_filters', [16, 32, 64, 128, 256]) + if 'nb_conv_blocks' in hp_to_optimize: + self.nb_conv_blocks = trial.suggest_int( + 'nb_conv_blocks', 1, 5) + + if 'nb_dense_layers' in hp_to_optimize: + self.nb_dense_layers = trial.suggest_int( + 'nb_dense_layers', 1, 10) + if 'nb_dense_units' in hp_to_optimize: + self.nb_dense_units = trial.suggest_int( + 'nb_dense_units', 16, 512) + if 'nb_dense_units_decreasing' in hp_to_optimize: + self.nb_dense_units_decreasing = trial.suggest_categorical( + 'nb_dense_units_decreasing', [True, False]) + if 'inner_activation_dense' in hp_to_optimize: + self.inner_activation_dense = trial.suggest_categorical( + 'inner_activation_dense', ['relu', 'tanh', 'sigmoid', 'softmax', + 'elu', 'selu', 'leaky_relu', 'linear']) + if 'inner_activation_cnn' in hp_to_optimize: + self.inner_activation_cnn = trial.suggest_categorical( + 'inner_activation_cnn', ['relu', 'tanh', 'sigmoid', 'softmax', + 'elu', 'selu', 'leaky_relu', 'linear']) + + # Check the input 2D size vs nb_conv_blocks + pixels_nb = int(self.precip_window_size / self.precip_resolution) + nb_conv_blocks_max = math.floor(math.log(pixels_nb, 2)) + if self.nb_conv_blocks > nb_conv_blocks_max: + return False # Not valid + + return True def print(self): """ @@ -291,21 +382,24 @@ def print(self): print("- batch_size: ", self.batch_size) print("- epochs: ", self.epochs) print("- learning_rate: ", self.learning_rate) - print("- dropout_rate: ", self.dropout_rate) + print("- dropout_rate_dense: ", self.dropout_rate_dense) if self.use_precip: print("- with_spatial_dropout: ", self.with_spatial_dropout) + print("- dropout_rate_cnn: ", self.dropout_rate_cnn) - print("- with_batchnorm: ", self.with_batchnorm) + print("- with_batchnorm_dense: ", self.with_batchnorm_dense) if self.use_precip: + print("- with_batchnorm_cnn: ", self.with_batchnorm_cnn) print("- nb_filters: ", self.nb_filters) print("- nb_conv_blocks: ", self.nb_conv_blocks) + print("- inner_activation_cnn: ", self.inner_activation_cnn) print("- nb_dense_layers: ", self.nb_dense_layers) print("- nb_dense_units: ", self.nb_dense_units) print("- nb_dense_units_decreasing: ", self.nb_dense_units_decreasing) - print("- inner_activation: ", self.inner_activation) + print("- inner_activation_dense: ", self.inner_activation_dense) def is_ok(self): """ @@ -345,8 +439,9 @@ def _set_parser_arguments(self): '--optimize-with-optuna', action='store_true', help='Optimize the hyperparameters 
with Optuna') self.parser.add_argument( - '--optuna-study-name', type=str, default='', - help='The Optuna study name') + '--optuna-study-name', type=str, + default=datetime.datetime.now().strftime("%Y%m%d_%H%M%S"), + help='The Optuna study name (default: using the date and time'), self.parser.add_argument( '--optuna-trials-nb', type=int, default=100, help='The number of trials for Optuna') @@ -357,7 +452,7 @@ def _set_parser_arguments(self): '--factor-neg-reduction', type=int, default=10, help='The factor to reduce the number of negatives only for training') self.parser.add_argument( - '--weight-denominator', type=int, default=5, + '--weight-denominator', type=int, default=40, help='The weight denominator to reduce the negative class weights') self.parser.add_argument( '--random-state', type=int, default=42, @@ -383,25 +478,25 @@ def _set_parser_arguments(self): 'If specified, the default class features will be overridden for' 'this class only (e.g. event).') self.parser.add_argument( - '--precip-window-size', type=int, default=8, + '--precip-window-size', type=int, default=2, help='The precipitation window size [km]') self.parser.add_argument( '--precip-resolution', type=int, default=1, help='The precipitation resolution [km]') self.parser.add_argument( - '--precip-time-step', type=int, default=1, + '--precip-time-step', type=int, default=12, help='The precipitation time step [h]') self.parser.add_argument( - '--precip-days-before', type=int, default=4, + '--precip-days-before', type=int, default=1, help='The number of days before the event to use for the precipitation') self.parser.add_argument( - '--precip-days-after', type=int, default=2, + '--precip-days-after', type=int, default=1, help='The number of days after the event to use for the precipitation') self.parser.add_argument( '--transform-static', type=str, default='standardize', help='The transformation to apply to the static data') self.parser.add_argument( - '--transform-2d', type=str, default='standardize', + '--transform-2d', type=str, default='normalize', help='The transformation to apply to the 2D data') self.parser.add_argument( '--precip-trans-domain', type=str, default='per-pixel', @@ -411,7 +506,7 @@ def _set_parser_arguments(self): '--no-log-transform-precip', action='store_true', help='Do not log-transform the precipitation') self.parser.add_argument( - '--batch-size', type=int, default=32, + '--batch-size', type=int, default=64, help='The batch size') self.parser.add_argument( '--epochs', type=int, default=100, @@ -420,43 +515,52 @@ def _set_parser_arguments(self): '--learning-rate', type=float, default=0.001, help='The learning rate') self.parser.add_argument( - '--dropout-rate', type=float, default=0.2, - help='The dropout rate') + '--dropout-rate-cnn', type=float, default=0.5, + help='The dropout rate for the CNN') + self.parser.add_argument( + '--dropout-rate-dense', type=float, default=0.2, + help='The dropout rate for the dense layers') self.parser.add_argument( '--no-spatial-dropout', action='store_true', help='Do not use spatial dropout') self.parser.add_argument( - '--no-batchnorm', action='store_true', - help='Do not use batch normalization') + '--no-batchnorm-cnn', action='store_true', + help='Do not use batch normalization for the CNN') + self.parser.add_argument( + '--no-batchnorm-dense', action='store_true', + help='Do not use batch normalization for the dense layers') self.parser.add_argument( '--nb-filters', type=int, default=64, help='The number of filters') self.parser.add_argument( - 
-            '--nb-conv-blocks', type=int, default=3,
+            '--nb-conv-blocks', type=int, default=5,
             help='The number of convolutional blocks')
         self.parser.add_argument(
-            '--nb-dense-layers', type=int, default=4,
+            '--nb-dense-layers', type=int, default=5,
             help='The number of dense layers')
         self.parser.add_argument(
             '--nb-dense-units', type=int, default=256,
             help='The number of dense units')
         self.parser.add_argument(
-            '--no-dense-units-decreasing', action='store_true',
-            help='The number of dense units should not decrease')
+            '--dense-units-decreasing', action='store_true',
+            help='The number of dense units should decrease')
+        self.parser.add_argument(
+            '--inner-activation-cnn', type=str, default='elu',
+            help='The inner activation function for the CNN')
         self.parser.add_argument(
-            '--inner-activation', type=str, default='relu',
-            help='The inner activation function')
+            '--inner-activation-dense', type=str, default='leaky_relu',
+            help='The inner activation function for the dense layers')


-class ImpactDeepLearning(Impact):
+class ImpactCnn(Impact):
     """
-    The Deep Learning Impact class.
+    The CNN Deep Learning Impact class.

     Parameters
     ----------
     events: Events
         The events object.
-    options: ImpactDeepLearningOptions
+    options: ImpactCnnOptions
         The model options.
     reload_trained_models: bool
         Whether to reload the previously trained models or not.
@@ -484,12 +588,28 @@ def __init__(self, events, options, reload_trained_models=False):
         # Options that will be set later
         self.factor_neg_reduction = 1

+    def save_model(self, dir_output):
+        """
+        Save the model.
+
+        Parameters
+        ----------
+        dir_output: str
+            The directory where to save the model.
+        """
+        if self.model is None:
+            raise ValueError("Model not defined")
+
+        filename = f'{dir_output}/model_{self.options.run_name}.h5'
+        self.model.save(filename)
+        print(f"Model saved: {filename}")
+
     def copy(self):
         """
         Make a copy of the object.

         Returns
         -------
-        ImpactDeepLearning
+        ImpactCnn
             The copy of the object.
         """
         return copy.deepcopy(self)
@@ -598,7 +718,10 @@ def optimize_model_with_optuna(self, n_trials=100, n_jobs=4, dir_plots=None):
             print(f"  {key}: {value}")

         # Fit the model with the best parameters
-        self.options.generate_for_optuna(trial)
+        if not self.options.generate_for_optuna(trial):
+            print("The parameters are not valid.")
+            return float('-inf')
+
         self.compute_balanced_class_weights()
         self.compute_corrected_class_weights(
             weight_denominator=self.options.weight_denominator)
@@ -677,7 +800,7 @@ def compute_f1_score(self, dg):
         float
             The F1 score.
         """
-        n_batches = dg.get_number_of_batches_for_full_dataset()
+        n_batches = dg.__len__()

         # Predict
         all_pred = []
@@ -848,7 +971,7 @@ def _create_data_generator_valid(self):
             std_precip=self.dg_train.std_precip,
             min_static=self.dg_train.min_static,
             max_static=self.dg_train.max_static,
-            max_precip=self.dg_train.max_precip,
+            q95_precip=self.dg_train.q95_precip,
             debug=DEBUG
         )
@@ -883,7 +1006,7 @@ def _create_data_generator_test(self):
             std_precip=self.dg_train.std_precip,
             min_static=self.dg_train.min_static,
             max_static=self.dg_train.max_static,
-            max_precip=self.dg_train.max_precip,
+            q95_precip=self.dg_train.q95_precip,
             debug=DEBUG
         )
@@ -894,7 +1017,7 @@ def _define_model(self, input_2d_size, input_1d_size):
         """
         Define the model.
         """
-        self.model = DeepImpact(
+        self.model = ModelCnn(
             task=self.target_type,
             options=self.options,
             input_2d_size=input_2d_size,
@@ -944,7 +1067,7 @@ def _create_model_tmp_file_name(self):
                 pickle.dumps(self.class_weight) +
                 pickle.dumps(self.options.random_state) +
                 pickle.dumps(self.target_type))
-        model_hashed_name = f'dl_model_{hashlib.md5(tag_model).hexdigest()}.pickle'
+        model_hashed_name = f'cnn_model_{hashlib.md5(tag_model).hexdigest()}.pickle'
         tmp_filename = self.tmp_dir / model_hashed_name

         return tmp_filename
@@ -955,7 +1078,7 @@ def set_precipitation(self, precipitation):

         Parameters
         ----------
-        precipitation: xarray.Dataset|None
+        precipitation: Precipitation|None
             The precipitation data.
         """
         if precipitation is None:
@@ -965,45 +1088,17 @@ def set_precipitation(self, precipitation):
             print("Precipitation is not used and is therefore not loaded.")
             return

-        assert len(precipitation.dims) == 3, "Precipitation must be 3D"
-
-        hash_tag = self._compute_hash_precip_full_data(precipitation)
-        filename = f"precip_full_{hash_tag}.pickle"
-        tmp_filename = self.tmp_dir / filename
-
-        if tmp_filename.exists():
-            print("Precipitation already preloaded. Loading from pickle file.")
-            with open(tmp_filename, 'rb') as f:
-                precipitation = pickle.load(f)
-
-        else:
-            # Adapt the spatial resolution
-            with dask.config.set(**{'array.slicing.split_large_chunks': False}):
-                if self.options.precip_resolution != 1:
-                    precipitation = precipitation.coarsen(
-                        x=self.options.precip_resolution,
-                        y=self.options.precip_resolution,
-                        boundary='trim'
-                    ).mean()
-
-            # Aggregate the precipitation at the desired time step
-            with dask.config.set(**{'array.slicing.split_large_chunks': False}):
-                if self.options.precip_time_step != 1:
-                    precipitation = precipitation.resample(
-                        time=f'{self.options.precip_time_step}h',
-                    ).sum(dim='time')
-
-            # Save the precipitation
-            with open(tmp_filename, 'wb') as f:
-                pickle.dump(precipitation, f)
+        precipitation.load_data(resolution=self.options.precip_resolution,
+                                time_step=self.options.precip_time_step)

         # Check the shape of the precipitation and the DEM
         if self.dem is not None:
-            assert precipitation['precip'].shape[1:] == self.dem.shape, \
+            # Select the same domain as the DEM
+            precipitation.data = precipitation.data.sel(x=self.dem.x, y=self.dem.y)
+            assert precipitation.data['precip'].shape[1:] == self.dem.shape, \
                 "DEM and precipitation must have the same shape"

-        self.precipitation = precipitation
-        self.precipitation['time'] = pd.to_datetime(self.precipitation['time'])
+        self.precipitation = precipitation.data

     def set_dem(self, dem):
         """
@@ -1036,18 +1131,6 @@ def set_dem(self, dem):
                 "DEM and precipitation must have the same shape"
         self.dem = dem

-    def _compute_hash_precip_full_data(self, precipitation):
-        tag_data = (
-                pickle.dumps(self.options.precip_resolution) +
-                pickle.dumps(self.options.precip_time_step) +
-                pickle.dumps(precipitation['x']) +
-                pickle.dumps(precipitation['y']) +
-                pickle.dumps(precipitation['time'][0]) +
-                pickle.dumps(precipitation['time'][-1]) +
-                pickle.dumps(precipitation['precip'].shape))
-
-        return hashlib.md5(tag_data).hexdigest()
-
     @staticmethod
     def _plot_training_history(hist, dir_plots, show_plots, prefix=None):
         """
diff --git a/swafi/model_dl.py b/swafi/model_cnn.py
similarity index 92%
rename from swafi/model_dl.py
rename to swafi/model_cnn.py
index 20b5c23..0700efb 100644
--- a/swafi/model_dl.py
+++ b/swafi/model_cnn.py
@@ -6,15 +6,15 @@
 from keras import layers, models


-class DeepImpact(models.Model):
+class ModelCnn(models.Model):
     """
-    Model factory.
+    CNN model factory.

     Parameters
     ----------
     task: str
         The task. Options are: 'regression', 'classification'
-    options: ImpactDeepLearningOptions
+    options: ImpactCnnOptions
         The options.
     input_2d_size: ?list
         The input 2D size.
@@ -23,7 +23,7 @@ class DeepImpact(models.Model):
     """

     def __init__(self, task, options, input_2d_size, input_1d_size):
-        super(DeepImpact, self).__init__()
+        super(ModelCnn, self).__init__()
         self.model = None
         self.task = task
         self.options = options
@@ -98,14 +98,14 @@ def _build_model(self):
                 nb_units = self.options.nb_dense_units // (2 ** i)
             else:
                 nb_units = self.options.nb_dense_units
-            x = layers.Dense(nb_units, activation=self.options.inner_activation,
+            x = layers.Dense(nb_units, activation=self.options.inner_activation_dense,
                              name=f'dense_{i}')(x)

-            if self.options.with_batchnorm:
+            if self.options.with_batchnorm_dense:
                 x = layers.BatchNormalization(name=f'batchnorm_dense_{i}')(x)

-            if self.options.dropout_rate > 0:
-                x = layers.Dropout(rate=self.options.dropout_rate,
+            if self.options.dropout_rate_dense > 0:
+                x = layers.Dropout(rate=self.options.dropout_rate_dense,
                                    name=f'dropout_dense_{i}')(x)

         # Last activation
@@ -147,7 +147,7 @@ def conv2d_block(self, x, i, filters, kernel_size=3, initializer='he_normal',
             The output layer.
         """
         if activation == 'default':
-            activation = self.options.inner_activation
+            activation = self.options.inner_activation_cnn

         x = layers.Conv2D(
             filters=filters,
@@ -166,7 +166,7 @@ def conv2d_block(self, x, i, filters, kernel_size=3, initializer='he_normal',
             name=f'conv2d_{i}b',
         )(x)

-        if self.options.with_batchnorm:
+        if self.options.with_batchnorm_cnn:
             # Batch normalization should be before any dropout
             # https://stackoverflow.com/questions/59634780/correct-order-for-
             # spatialdropout2d-batchnormalization-and-activation-function
@@ -179,15 +179,15 @@ def conv2d_block(self, x, i, filters, kernel_size=3, initializer='he_normal',
             name=f'maxpool2d_cnn_{i}',
         )(x)

-        if self.options.dropout_rate > 0:
+        if self.options.dropout_rate_cnn > 0:
             if self.options.with_spatial_dropout and x.shape[1] > 1 and x.shape[2] > 1:
                 x = layers.SpatialDropout2D(
-                    rate=self.options.dropout_rate,
+                    rate=self.options.dropout_rate_cnn,
                     name=f'spatial_dropout_cnn_{i}',
                 )(x)
             else:
                 x = layers.Dropout(
-                    rate=self.options.dropout_rate,
+                    rate=self.options.dropout_rate_cnn,
                     name=f'dropout_cnn_{i}',
                 )(x)
diff --git a/swafi/precip.py b/swafi/precip.py
index 12332dc..274ee45 100644
--- a/swafi/precip.py
+++ b/swafi/precip.py
@@ -1,9 +1,12 @@
 """
 Class to handle the precipitation data.
 """
-
-from glob import glob
+import pickle
+import hashlib
+import dask
+import pandas as pd
 import xarray as xr
+from pathlib import Path

 from .config import Config
 from .domain import Domain
@@ -15,20 +18,44 @@ class Precipitation:
     def __init__(self, cid_file=None):
         """
         The generic Precipitation class. Must be netCDF files as relies on xarray.
+
+        Parameters
+        ----------
+        cid_file: str|None
+            The path to the CID file
         """
         if not cid_file:
-            cid_file = config.get('CID_PATH')
+            cid_file = config.get('CID_PATH', None, False)

+        self.dataset = None
+        self.data_path = None
         self.x_axis = 'x'
         self.y_axis = 'y'
         self.time_axis = 'time'
-        self.precip_var = 'prec'
+        self.precip_var = 'precip'
+        self.resolution = None
+        self.time_step = None
         self.data = None
         self.missing = None
         self.domain = Domain(cid_file)
+        self.tmp_dir = Path(config.get('TMP_DIR'))
+
+    def set_data_path(self, data_path):
+        """
+        Set the path to the precipitation data.
+
+        Parameters
+        ----------
+        data_path: str
+            The path to the precipitation data
+        """
+        self.data_path = data_path
+
+    def load_data(self):
+        raise NotImplementedError("This method must be implemented in the child class.")

-    def get_time_series(self, cid, start, end, size=3):
+    def get_time_series(self, cid, start, end, size=1):
         """
         Extract the precipitation time series for the given cell ID and the given
         period (between start and end).
@@ -42,7 +69,7 @@ def get_time_series(self, cid, start, end, size=3):
         end: datetime.datetime
             The end of the period to extract
         size: int
-            The number of pixels to average on (default: 3x3)
+            The number of pixels to average on (default: 1x1)

         Returns
         -------
@@ -58,5 +85,166 @@ def get_time_series(self, cid, start, end, size=3):
                             self.y_axis: slice(y + dy * dpx, y - dy * dpx),
                             self.time_axis: slice(start, end)})

+        if size == 1:
+            return dat[self.precip_var].to_numpy()
+
         return dat[self.precip_var].mean(dim=[self.x_axis, self.y_axis]).to_numpy()
+
+    def save_nc_file_per_cid(self, cid, start, end):
+        """
+        Save the precipitation time series for the given cell ID and the given
+        period (between start and end) in a netCDF file.
+
+        Parameters
+        ----------
+        cid: int
+            The cell ID
+        start: datetime.datetime|str
+            The start of the period to extract
+        end: datetime.datetime|str
+            The end of the period to extract
+        """
+        if isinstance(start, str):
+            start = pd.to_datetime(start)
+        if isinstance(end, str):
+            end = pd.to_datetime(end)
+
+        y_start = start.year
+        y_end = end.year
+
+        hash_tag = self._compute_hash_single_cid(y_start, y_end)
+        filename = f"precip_cid_{cid}_{hash_tag}.nc"
+        tmp_filename = self.tmp_dir / filename
+
+        if tmp_filename.exists():
+            return
+
+        x, y = self.domain.get_cid_coordinates(cid)
+        time_series = self.data.sel({self.x_axis: x, self.y_axis: y,
+                                     self.time_axis: slice(start, end)})
+
+        time_series.to_netcdf(tmp_filename)
+
+    def compute_quantiles_cid(self, cid):
+        """
+        Compute the quantiles of the precipitation data for each cell ID.
+
+        Parameters
+        ----------
+        cid: int
+            The cell ID for which to compute the quantiles
+
+        Returns
+        -------
+        np.array
+            The quantiles of the precipitation data
+        """
+        # Use pickles to store the data
+        hash_tag_pk = self._compute_hash_precip_full_data()
+        filename_pk = f"precip_quantiles_cid_{cid}_{hash_tag_pk}.pickle"
+        tmp_filename_pk = self.tmp_dir / filename_pk
+
+        if tmp_filename_pk.exists():
+            print(f"Loading quantiles for CID {cid} from pickle file.")
+            with open(tmp_filename_pk, 'rb') as f:
+                quantiles = pickle.load(f)
+            return quantiles
+
+        # Check if netCDF file of the time series exists
+        y_start = pd.to_datetime(self.data['time'][0].to_pandas()).year
+        y_end = pd.to_datetime(self.data['time'][-25].to_pandas()).year  # avoid Y+1
+        hash_tag_nc = self._compute_hash_single_cid(y_start, y_end)
+        filename_nc = f"precip_cid_{cid}_{hash_tag_nc}.nc"
+        tmp_filename_nc = self.tmp_dir / filename_nc
+
+        # Extract the time series
+        if tmp_filename_nc.exists():
+            print(f"Loading time series for CID {cid} from netCDF file.")
+            time_series = xr.open_dataset(tmp_filename_nc)
+        else:
+            x, y = self.domain.get_cid_coordinates(cid)
+            time_series = self.data.sel({self.x_axis: x, self.y_axis: y})
+            time_series = time_series.load()
+            time_series.to_netcdf(tmp_filename_nc)
+
+        # Compute the ranks
+        quantiles = time_series.rank(dim='time', pct=True)
+
+        # Save as pickle
+        with open(tmp_filename_pk, 'wb') as f:
+            pickle.dump(quantiles, f)
+
+        return quantiles
+
+    def _compute_hash_precip_full_data(self):
+        tag_data = (
+                pickle.dumps(self.dataset) +
+                pickle.dumps(self.resolution) +
+                pickle.dumps(self.time_step) +
+                pickle.dumps(self.data['x']) +
+                pickle.dumps(self.data['y']) +
+                pickle.dumps(self.data['time'][0]) +
+                pickle.dumps(self.data['time'][-1]) +
+                pickle.dumps(self.data['precip'].shape))
+
+        return hashlib.md5(tag_data).hexdigest()
+
+    def _compute_hash_single_cid(self, y_start, y_end):
+        tag_data = (
+                pickle.dumps(self.dataset) +
+                pickle.dumps(self.resolution) +
+                pickle.dumps(self.time_step) +
+                pickle.dumps(y_start) +
+                pickle.dumps(y_end))
+
+        return hashlib.md5(tag_data).hexdigest()
+
+    def _load_from_pickle(self):
+        hash_tag = self._compute_hash_precip_full_data()
+        filename = f"precip_full_{hash_tag}.pickle"
+        tmp_filename = self.tmp_dir / filename
+
+        if tmp_filename.exists():
+            print("Precipitation already preloaded. Loading from pickle file.")
+            with open(tmp_filename, 'rb') as f:
+                self.data = pickle.load(f)
+        else:
+            print("Loading data from original files.")
+            self._fill_missing_values()
+            self._resample()
+            self.data['time'] = pd.to_datetime(self.data['time'])
+
+            # Save the precipitation
+            with open(tmp_filename, 'wb') as f:
+                pickle.dump(self.data, f)
+
+    def _fill_missing_values(self):
+        # Create a complete time series index with hourly frequency
+        data_start = self.data.time.values[0]
+        data_end = self.data.time.values[-1]
+        complete_time_index = pd.date_range(
+            start=data_start, end=data_end, freq='h')
+
+        with dask.config.set(**{'array.slicing.split_large_chunks': True}):
+            # Reindex the data to the complete time series index
+            self.data = self.data.reindex(time=complete_time_index)
+
+            # Interpolate missing values
+            self.data = self.data.chunk({'time': -1})
+            self.data = self.data.interpolate_na(dim='time', method='linear')
+
+    def _resample(self):
+        with dask.config.set(**{'array.slicing.split_large_chunks': True}):
+            # Adapt the spatial resolution
+            if self.resolution != 1:
+                self.data = self.data.coarsen(
+                    x=self.resolution,
+                    y=self.resolution,
+                    boundary='trim'
+                ).mean()
+
+            # Aggregate the precipitation at the desired time step
+            if self.time_step != 1:
+                self.data = self.data.resample(
+                    time=f'{self.time_step}h',
+                ).sum(dim='time')
diff --git a/swafi/precip_combiprecip.py b/swafi/precip_combiprecip.py
index 45f5610..af0f68e 100644
--- a/swafi/precip_combiprecip.py
+++ b/swafi/precip_combiprecip.py
@@ -29,19 +29,45 @@ class CombiPrecip(Precipitation):
         ('2022-10-17', '2022-10-23'),
     ]

-    def __init__(self, data_path, cid_file=None):
+    def __init__(self, cid_file=None):
         """
         The Precipitation class for CombiPrecip data. Must be netCDF files.
+
+        Parameters
+        ----------
+        cid_file: str|None
+            The path to the CID file
         """
         super().__init__(cid_file)
+        self.dataset = "CombiPrecip"
+
+    def load_data(self, data_path=None, resolution=1, time_step=1):
+        """
+        Load the precipitation data from the given path.
+
+        Parameters
+        ----------
+        data_path: str|None
+            The path to the data files
+        resolution: int
+            The resolution [km] of the precipitation data (default: 1)
+        time_step: int
+            The time step [h] of the precipitation data (default: 1)
+        """
+        if data_path:
+            self.data_path = data_path
+        if not self.data_path:
+            self.data_path = config.get('DIR_PRECIP')
+        if not self.data_path:
+            raise FileNotFoundError("The data path was not provided.")
+        self.resolution = resolution
+        self.time_step = time_step

-        self.x_axis = 'x'
-        self.y_axis = 'y'
-        self.time_axis = 'REFERENCE_TS'
-        self.precip_var = 'CPC'
+        files = sorted(glob(f"{self.data_path}/*.nc"))
+        self.data = xr.open_mfdataset(files, parallel=False, chunks={'time': 1000})
+        self.data = self.data.rename_vars({'CPC': 'precip'})
+        self.data = self.data.rename({'REFERENCE_TS': 'time'})

-        self._load_data(data_path)
+        # self._load_from_pickle()

-    def _load_data(self, data_path):
-        files = sorted(glob(f"{data_path}/*.nc"))
-        self.data = xr.open_mfdataset(files, parallel=False)
+        assert len(self.data.dims) == 3, "Precipitation must be 3D"
diff --git a/swafi/utils/data_generator.py b/swafi/utils/data_generator.py
index bd7b8f4..8156b3c 100644
--- a/swafi/utils/data_generator.py
+++ b/swafi/utils/data_generator.py
@@ -13,14 +13,13 @@ class DataGenerator(keras.utils.Sequence):

     def __init__(self, event_props, x_static, x_precip, x_dem, y, batch_size=32,
                  shuffle=True, use_pickle_events_precip_data=False,
-                 use_pickle_full_precip_data=True, precip_window_size=12,
-                 precip_resolution=1, precip_time_step=1, precip_days_before=8,
-                 precip_days_after=3, tmp_dir=None, transform_static='standardize',
-                 transform_2d='standardize',
-                 precip_transformation_domain='per-pixel',
+                 use_pickle_full_precip_data=True, precip_window_size=2,
+                 precip_resolution=1, precip_time_step=12, precip_days_before=1,
+                 precip_days_after=1, tmp_dir=None, transform_static='standardize',
+                 transform_2d='normalize', precip_transformation_domain='per-pixel',
                  log_transform_precip=True, mean_static=None, std_static=None,
                  mean_precip=None, std_precip=None, min_static=None,
-                 max_static=None, max_precip=None, debug=False):
+                 max_static=None, q95_precip=None, debug=False):
         """
         Data generator class.
         Template from:
@@ -64,11 +63,11 @@ def __init__(self, event_props, x_static, x_precip, x_dem, y, batch_size=32,
         tmp_dir: Path
             The temporary directory to use.
         transform_static: str
-            The transformation to apply to the static data: 'standardize' or
-            'normalize'.
+            The transformation to apply to the static data.
+            Options: 'normalize' or 'standardize'.
         transform_2d: str
-            The transformation to apply to the 2D data: 'standardize' or
-            'normalize'.
+            The transformation to apply to the 2D data.
+            Options: 'normalize' or 'standardize'.
         precip_transformation_domain: str
             How to apply the transformation of the precipitation data:
             'domain-average', or 'per-pixel'.
@@ -86,8 +85,8 @@ def __init__(self, event_props, x_static, x_precip, x_dem, y, batch_size=32,
             The min of the static data.
         max_static: np.array
             The max of the static data.
-        max_precip: np.array
-            The max of the precipitation data.
+        q95_precip: np.array
+            The 95th percentile of the precipitation data.
         debug: bool
             Whether to run in debug mode or not (print more messages).
""" @@ -119,11 +118,15 @@ def __init__(self, event_props, x_static, x_precip, x_dem, y, batch_size=32, self.std_precip = std_precip self.min_static = min_static self.max_static = max_static - self.max_precip = max_precip + self.q95_precip = q95_precip + self.mean_dem = None self.std_dem = None self.min_dem = None self.max_dem = None + self.q25_dem = None + self.q50_dem = None + self.q75_dem = None self.X_static = x_static self.X_precip = x_precip @@ -289,7 +292,7 @@ def _normalize_static_inputs(self): def _normalize_2d_inputs(self): if self.X_precip is not None: - self.X_precip['precip'] = self.X_precip['precip'] / self.max_precip + self.X_precip['precip'] = self.X_precip['precip'] / self.q95_precip if self.X_dem is not None: self.X_dem = (self.X_dem - self.min_dem) / (self.max_dem - self.min_dem) @@ -380,24 +383,24 @@ def _compute_predictor_statistics(self): if self.X_precip is None: return - # Log transform the precipitation (log(1 + x)) + # Log transform the precipitation if self.log_transform_precip: print('Log-transforming precipitation') - self.X_precip['precip'] = np.log1p(self.X_precip['precip']) + self.X_precip['precip'] = np.log(self.X_precip['precip'] + 1e-10) # Compute the mean and standard deviation of the precipitation if (self.transform_2d == 'standardize' and self.mean_precip is not None and self.std_precip is not None): return - if self.transform_2d == 'normalize' and self.max_precip is not None: + if self.transform_2d == 'normalize' and self.q95_precip is not None: return # If pickle file exists, load it file_hash = self._compute_hash_precip_full() file_mean_precip = self.tmp_dir / f'mean_precip_{file_hash}.pickle' file_std_precip = self.tmp_dir / f'std_precip_{file_hash}.pickle' - file_max_precip = self.tmp_dir / f'max_precip_{file_hash}.pickle' + file_q95_precip = self.tmp_dir / f'q95_precip_{file_hash}.pickle' if (self.transform_2d == 'standardize' and file_mean_precip.exists() and file_std_precip.exists()): @@ -408,10 +411,10 @@ def _compute_predictor_statistics(self): self.std_precip = pickle.load(f) return - if self.transform_2d == 'normalize' and file_max_precip.exists(): + if self.transform_2d == 'normalize' and file_q95_precip.exists(): print('Loading precipitation statistics from pickle files') - with open(file_max_precip, 'rb') as f: - self.max_precip = pickle.load(f) + with open(file_q95_precip, 'rb') as f: + self.q95_precip = pickle.load(f) return # Otherwise, compute the statistics and save them to a pickle file @@ -423,8 +426,8 @@ def _compute_predictor_statistics(self): self.std_precip = self.X_precip['precip'].std('time').mean( ('x', 'y')).compute().values elif self.transform_2d == 'normalize': - self.max_precip = self.X_precip['precip'].max( - ('time', 'x', 'y')).compute().values + self.q95_precip = self.X_precip['precip'].quantile( + 0.95, dim=('time', 'x', 'y')).compute().values elif self.precip_transformation_domain == 'per-pixel': if self.transform_2d == 'standardize': self.mean_precip = self.X_precip['precip'].mean( @@ -432,8 +435,8 @@ def _compute_predictor_statistics(self): self.std_precip = self.X_precip['precip'].std( 'time').compute().values elif self.transform_2d == 'normalize': - self.max_precip = self.X_precip['precip'].max( - 'time').compute().values + self.q95_precip = self.X_precip['precip'].chunk(dict(time=-1)).quantile( + 0.95, dim='time').compute().values else: raise ValueError( f'Unknown option: {self.precip_transformation_domain}') @@ -445,8 +448,8 @@ def _compute_predictor_statistics(self): with open(file_std_precip, 'wb') as f: 
pickle.dump(self.std_precip, f) elif self.transform_2d == 'normalize': - with open(file_max_precip, 'wb') as f: - pickle.dump(self.max_precip, f) + with open(file_q95_precip, 'wb') as f: + pickle.dump(self.q95_precip, f) def _compute_hash_precip_full(self): tag_data = (
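
For context, a minimal usage sketch of the reworked precipitation API introduced by this patch. The data path and cell ID below are placeholders, and only classes, methods, and signatures that appear in the diff above are relied upon; anything else (the exact path, the CID value) is an assumption for illustration.

# Hedged sketch, not part of the patch: exercises CombiPrecip.load_data(),
# get_time_series() and compute_quantiles_cid() as changed above.
import datetime

from swafi.precip_combiprecip import CombiPrecip

precip = CombiPrecip()  # CID file resolved via the config if not passed
precip.load_data(data_path='/path/to/combiprecip',  # placeholder path
                 resolution=1, time_step=1)

# Hourly series for one cell; size now defaults to 1 (single pixel)
series = precip.get_time_series(cid=12345,  # placeholder cell ID
                                start=datetime.datetime(2021, 6, 1),
                                end=datetime.datetime(2021, 6, 30))

# Per-cell rank-based quantiles; cached as netCDF/pickle in TMP_DIR
quantiles = precip.compute_quantiles_cid(12345)

Note that the caching helpers key their temporary files on the dataset name, spatial resolution, and time step, so changing any of those options triggers a fresh extraction rather than reusing stale files.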