diff --git a/.gitignore b/.gitignore index 30a09ba..7c23438 100644 --- a/.gitignore +++ b/.gitignore @@ -172,5 +172,9 @@ selected_1000/ archive/ *.pth *.csv +*.pdf +*.svg +*.png +*.json submit_* config* \ No newline at end of file diff --git a/modelling/dino/evaluate.py b/modelling/dino/evaluate.py index 76f99b6..9a9a765 100644 --- a/modelling/dino/evaluate.py +++ b/modelling/dino/evaluate.py @@ -33,8 +33,9 @@ def evaluate(fold, model_name, target = "", use_checkpoint = False, model_not_na checkpoint = f'{model_par_dir}{model_name}_{fold}_one_country_best_{imagery_source}{named_target}_.pth' else: raise Exception(mode) + - print(f"Evaluating {model_name} on fold {fold} with target {target} using checkpoint {use_checkpoint}") + print(f"Evaluating {model_name} on fold {fold} with target {target} using checkpoint {checkpoint}") if target == '': eval_target = 'deprived_sev' @@ -45,6 +46,7 @@ def evaluate(fold, model_name, target = "", use_checkpoint = False, model_not_na target_size = 1 else: target_size = 99 + if imagery_source == 'L': normalization = 30000. @@ -55,14 +57,14 @@ def evaluate(fold, model_name, target = "", use_checkpoint = False, model_not_na data_folder = r'survey_processing/processed_data/' if mode == 'spatial': - train_df = pd.read_csv(f'{data_folder}train_fold_{fold}.csv', index_col=0) - test_df = pd.read_csv(f'{data_folder}test_fold_{fold}.csv', index_col=0) + train_df = pd.read_csv(f'{data_folder}train_fold_{fold}.csv') + test_df = pd.read_csv(f'{data_folder}test_fold_{fold}.csv') elif mode == 'temporal': - train_df = pd.read_csv(f'{data_folder}before_2020.csv', index_col=0) - test_df = pd.read_csv(f'{data_folder}after_2020.csv', index_col=0) + train_df = pd.read_csv(f'{data_folder}before_2020.csv') + test_df = pd.read_csv(f'{data_folder}after_2020.csv') elif mode == 'one_country': - train_df = pd.read_csv(f'{data_folder}train_fold_{fold}.csv', index_col=0) - test_df = pd.read_csv(f'{data_folder}test_fold_{fold}.csv', index_col=0) + train_df = pd.read_csv(f'{data_folder}train_fold_{fold}.csv') + test_df = pd.read_csv(f'{data_folder}test_fold_{fold}.csv') available_imagery = [] import os @@ -159,7 +161,7 @@ def __getitem__(self, idx): train_dataset = CustomDataset(train_df, transform) val_dataset = CustomDataset(test_df, transform) - train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=4) + train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False, num_workers=4) val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=4) model.to(device) model.eval() @@ -190,6 +192,27 @@ def __getitem__(self, idx): X_test.append(outputs.cpu()[0].numpy()) y_test.append(targets.cpu()[0].numpy()) + # Convert lists to numpy arrays + X_train = np.array(X_train) + y_train = np.array(y_train) + X_test = np.array(X_test) + y_test = np.array(y_test) + + # Convert to pandas DataFrames + df_X_train = pd.DataFrame(X_train) + df_y_train = pd.DataFrame(y_train, columns=['target']) + df_X_test = pd.DataFrame(X_test) + df_y_test = pd.DataFrame(y_test, columns=['target']) + + results_folder = f'modelling/dino/results/split_{mode}{imagery_source}_{fold}/' + if not os.path.exists(results_folder): + os.makedirs(results_folder) + # Save to CSV files + df_X_train.to_csv(results_folder+'X_train.csv', index=False) + df_y_train.to_csv(results_folder+'y_train.csv', index=False) + df_X_test.to_csv(results_folder+'X_test.csv', index=False) + df_y_test.to_csv(results_folder+'y_test.csv', index=False) + alphas = np.logspace(-6, 6, 20) # Define the model and pipeline ridge_pipeline = Pipeline([ diff --git a/modelling/dino/finetune_one_country.py b/modelling/dino/finetune_one_country.py index 030ac4e..8d3752d 100644 --- a/modelling/dino/finetune_one_country.py +++ b/modelling/dino/finetune_one_country.py @@ -33,8 +33,8 @@ def main(country, model_name, target, imagery_path, imagery_source, emb_size, ba data_folder = r'survey_processing/processed_data' - train_df = pd.read_csv(f'{data_folder}/train_fold_{country}.csv', index_col=0) - test_df = pd.read_csv(f'{data_folder}/test_fold_{country}.csv', index_col=0) + train_df = pd.read_csv(f'{data_folder}/train_fold_{country}.csv') + test_df = pd.read_csv(f'{data_folder}/test_fold_{country}.csv') available_imagery = [] for d in os.listdir(imagery_path): diff --git a/modelling/dino/finetune_spatial.py b/modelling/dino/finetune_spatial.py index 63bd373..5aa502b 100644 --- a/modelling/dino/finetune_spatial.py +++ b/modelling/dino/finetune_spatial.py @@ -31,8 +31,8 @@ def main(fold, model_name, target, imagery_path, imagery_source, emb_size, batch imagery_size = img_size data_folder = r'survey_processing/processed_data' - train_df = pd.read_csv(f'{data_folder}/train_fold_{fold}.csv', index_col=0) - test_df = pd.read_csv(f'{data_folder}/test_fold_{fold}.csv', index_col=0) + train_df = pd.read_csv(f'{data_folder}/train_fold_{fold}.csv') + test_df = pd.read_csv(f'{data_folder}/test_fold_{fold}.csv') available_imagery = [] for d in os.listdir(imagery_path): @@ -74,6 +74,7 @@ def filter_contains(query): train_df = train_df.dropna(subset=filtered_predict_target) predict_target = sorted(filtered_predict_target) + print(train_df.shape) def load_and_preprocess_image(path, grouped_bands=[4,3,2]): with rasterio.open(path) as src: b1 = src.read(grouped_bands[0]) @@ -104,8 +105,7 @@ def set_seed(seed): # Set your desired seed seed = 42 set_seed(seed) - - train, validation = train_test_split(train_df, test_size=0.2, random_state=42) + train, validation = train_test_split(train_df, test_size=0.2, random_state=seed) class CustomDataset(Dataset): def __init__(self, dataframe, transform): @@ -167,7 +167,7 @@ def forward(self, pixel_values): if os.path.exists(last_model): last_state_dict = torch.load(last_model) best_error = torch.load(best_model)['loss'] - epoch_ran = last_state_dict['epoch'] + epochs_ran = last_state_dict['epoch'] model.load_state_dict(last_state_dict['model_state_dict']) print('Found existing model') else: