From 37c7dfba64367733fd0cfbf6035772dbe06eefbb Mon Sep 17 00:00:00 2001
From: belisards <adrianobf@gmail.com>
Date: Mon, 17 Jun 2024 14:26:57 -0300
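Subject: [PATCH] keep datasets nested

Remove the earlier process_datasets(input_file, output_file) definition,
which shares its name with the later process_datasets(df, output_file).
Also drop the loop in the remaining function that flattened list columns
with merge_dicts and pd.json_normalize, so nested columns are left as-is.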

---
 src/get.py | 50 --------------------------------------------------
 1 file changed, 50 deletions(-)

diff --git a/src/get.py b/src/get.py
index 893726b..94954e1 100644
--- a/src/get.py
+++ b/src/get.py
@@ -13,47 +13,6 @@
 UNHCR_INPUT_FILE = UNHCR_DATA_PATH + "metadata.csv"
 UNHCR_OUTPUT_FILE = UNHCR_DATA_PATH + "datasets.csv"
 
-def process_datasets(input_file, output_file):
-    """
-    Processes the dataset by normalizing nested JSON fields, renaming columns, 
-    and removing unnecessary columns.
-    
-    Args:
-    - input_file (str): Path to the input CSV file.
-    - output_file (str): Path to save the processed CSV file.
-    
-    Returns:
-    None
-    """
-    try:
-        df = pd.read_csv(input_file)
-        
-        # Set patterns to be removed from column names
-        patterns_to_remove = ["study_desc.", "doc_desc.", "study_info.", "method."]
-        df.columns = df.columns.str.replace("|".join(patterns_to_remove), "", regex=True)
-        df.columns = df.columns.str.replace("data_collection.", "method_")
-
-        # Normalize nested JSON columns
-        for col in find_list_columns(df):
-            data = df[col].apply(merge_dicts)
-            data_normalized = pd.json_normalize(data)
-            # Add prefix of column name to the new data
-            data_normalized.columns = [f"{col}_{c}" for c in data_normalized.columns]
-            df = pd.concat([df, data_normalized], axis=1)
-            df.drop(col, axis=1, inplace=True)
-
-        # Drop columns with all NaN values and the 'schematype' column
-        df.dropna(axis=1, how='all', inplace=True)
-        if 'schematype' in df.columns:
-            df.drop('schematype', axis=1, inplace=True)
-
-        # Save the processed dataset
-        df.to_csv(output_file, index=False)
-        print(f"Flattened dataset with shape {df.shape} saved to {output_file}")
-
-    except Exception as e:
-        print(f"Error processing file {input_file}: {e}")
-
 MAX_WORKERS = 20
 
 def fetch_data(url, id):
@@ -107,15 +66,6 @@ def process_datasets(df, output_file):
         df.columns = df.columns.str.replace("|".join(patterns_to_remove), "", regex=True)
         df.columns = df.columns.str.replace("data_collection.", "method_")
 
-        # Normalize nested JSON columns
-        for col in find_list_columns(df):
-            data = df[col].apply(merge_dicts)
-            data_normalized = pd.json_normalize(data)
-            # Add prefix of column name to the new data
-            data_normalized.columns = [f"{col}_{c}" for c in data_normalized.columns]
-            df = pd.concat([df, data_normalized], axis=1)
-            df.drop(col, axis=1, inplace=True)
-
         # Drop columns with all NaN values and the 'schematype' column
         df.dropna(axis=1, how='all', inplace=True)
         if 'schematype' in df.columns: