diff --git a/docs/tutorials/forecast_switzerland/forecast_swiss.py b/docs/tutorials/forecast_switzerland/forecast_swiss.py index d2bdf58..9585c4d 100644 --- a/docs/tutorials/forecast_switzerland/forecast_swiss.py +++ b/docs/tutorials/forecast_switzerland/forecast_swiss.py @@ -33,14 +33,14 @@ def get_clusters_swiss(t=0.3, end_date=None): """ Params to get the list of clusters computed by the compute_cluster function. - + Parameters ---------- t : float Thereshold used in the clusterization. end_date : str Indicates the last day used to compute the cluster. - + Returns ------- Array @@ -76,12 +76,12 @@ def get_clusters_swiss(t=0.3, end_date=None): def get_cluster_by_canton(canton): """ Function to return the cluster that contains a specific canton. - + Parameters ---------- canton : str Name (two letters code) of the canton. - + Returns ------- List @@ -99,11 +99,11 @@ def remove_zeros(tgt): """ Function to remove the zeros of the target curve. It needs to be done to us be able to use the LogNormal dist. - + Parameters ---------- tgt : array - + """ tgt[tgt == 0] = 0.01 @@ -129,10 +129,10 @@ def train_eval_single_canton( ): """ Function to train and evaluate the model for one georegion. - + Important: * By default the function is using the clustering cantons and the - data since 2020. + data since 2020. * For the predictor hospCapacity is used as predictor the column ICU_Covid19Patients. @@ -177,9 +177,9 @@ def train_eval_single_canton( ------- pd.DataFrame The return is a pandas DataFrame. - + """ - + cluster_canton = [canton] # get_cluster_by_canton(canton) target_name = f"{target_curve_name}_{canton}" @@ -242,13 +242,13 @@ def train_eval_all_cantons( """ Function to make prediction for all the cantons. - + Important: * By default the function is using the clustering cantons and the data since 2020. * For the predictor hospCapacity is used as predictor the column ICU_Covid19Patients. - + Parameters ---------- target_curve_name : str @@ -277,7 +277,7 @@ def train_eval_all_cantons( look_back : int Number of the last days that will be used to forecast the next days. - + Returns ------- pd.DataFrame @@ -357,13 +357,13 @@ def train_single_canton( """ Function to train and evaluate the model for one georegion. - + Important: * By default the function is using the clustering cantons and the data since 2020. * For the predictor hospCapacity is used as predictor the column ICU_Covid19Patients. - + Parameters ---------- canton : str @@ -378,7 +378,7 @@ def train_single_canton( Determines the beggining of the train dataset path : str Determines where the model trained will be saved. - update_data : bool + update_data : bool Determines if the data from the Geneva hospital will be used. This params only is used when canton = GE and target_curve_name = hosp. @@ -390,7 +390,7 @@ def train_single_canton( look_back : int Number of the last days that will be used to forecast the next days. - + Returns ------- None @@ -449,17 +449,17 @@ def train_all_cantons( look_back=14, path=None, ): - + """ Function to train and evaluate the model for all the cantons in switzerland. - - Important: + + Important: * By default the function is using the clustering cantons and the - data since 2020. + data since 2020. * For the predictor hospCapacity is used as predictor the column ICU_Covid19Patients. - + Parameters ---------- target_curve_name : str @@ -480,14 +480,13 @@ def train_all_cantons( look_back : int Number of the last days that will be used to forecast the next days. - + Returns ------- pd.DataFrame Dataframe with the forecast for all the cantons. """ - clusters = get_clusters_swiss(t=0.6) for cluster in clusters: @@ -541,13 +540,13 @@ def forecast_single_canton( ): """ Function to make the forecast for one canton. - + Important: * By default the function is using the clustering cantons and the data since 2020. * For the predictor hospCapacity is used as predictor the column ICU_Covid19Patients. - + Parameters ---------- target_curve_name : str @@ -569,14 +568,13 @@ def forecast_single_canton( look_back : int Number of the last days that will be used to forecast the next days. - + Returns ------- pd.DataFrame Dataframe with the forecast for one canton. """ - cluster_canton = [canton] # get_cluster_by_canton(canton) df = get_cluster_data( @@ -609,13 +607,13 @@ def forecast_all_cantons( ): """ Function to make the forecast for all the cantons. - + Important: * By default the function is using the clustering cantons and the data since 2020. * For the predictor hospCapacity is used as predictor the column ICU_Covid19Patients. - + Parameters ---------- target_curve_name : str @@ -630,7 +628,7 @@ def forecast_all_cantons( Determines from what day the forecast will be computed. path : str Indicates where the models trained are saved. - + Returns ------- pd.DataFrame diff --git a/epigraphhub/analysis/forecast_models/metrics.py b/epigraphhub/analysis/forecast_models/metrics.py index b1b000c..461ea73 100644 --- a/epigraphhub/analysis/forecast_models/metrics.py +++ b/epigraphhub/analysis/forecast_models/metrics.py @@ -13,15 +13,15 @@ def compute_metrics(df_pred: pd.DataFrame) -> pd.DataFrame: method in the train and test sample. The predictions must be saved in a dataset with the following columns: 'median', 'target' and 'train_size'. - - This function uses the following metrics: - - explained variance score; - - mean absolute error; - - mean squared error; - - root mean squared error; - - mean squared log error; - - mean absolute percentage error. + This function uses the following metrics: + + - explained variance score; + - mean absolute error; + - mean squared error; + - root mean squared error; + - mean squared log error; + - mean absolute percentage error. To compute this metrics we use the implementations of the sklearn.metrics package. diff --git a/epigraphhub/analysis/forecast_models/ngboost_models.py b/epigraphhub/analysis/forecast_models/ngboost_models.py index 34445fe..c775790 100644 --- a/epigraphhub/analysis/forecast_models/ngboost_models.py +++ b/epigraphhub/analysis/forecast_models/ngboost_models.py @@ -136,12 +136,12 @@ def train_eval( Returns ------- pd.DataFrame - A DataFrame with four columns (and a date index): + A DataFrame with four columns (and a date index): - - target: The target values. + - target: The target values. - lower: The lower value of the confidence interval of 95%. - median: The median value of the confidence interval of - 95%. + 95%. - upper: The upper value of the confidence interval of 95%. - train_size: The number of rows of data using as training data. @@ -361,7 +361,7 @@ def forecast( - lower: The lower value of the confidence interval of 95%. - median: The median value of the confidence interval of - 95%. + 95%. - upper: The upper value of the confidence interval of 95%. """ diff --git a/epigraphhub/analysis/preprocessing.py b/epigraphhub/analysis/preprocessing.py index 3b84179..db3466f 100644 --- a/epigraphhub/analysis/preprocessing.py +++ b/epigraphhub/analysis/preprocessing.py @@ -187,9 +187,9 @@ def lstm_split_data( Returns ------- Tuple[np.array,np.array,np.array,np.array] - X_train: array of features to train the model. - y_train: array of targets to train the model. - X_test: array of features to test the model. + X_train: array of features to train the model. + y_train: array of targets to train the model. + X_test: array of features to test the model. y_test: array of targets to test the model. """ @@ -233,7 +233,7 @@ def normalize_data( Returns ------- Tuple[pd.DataFrame, pd.Series] - pd.DataFrame: normalized DataFrame. + pd.DataFrame: normalized DataFrame. pd.Series: Series of the max values used in the normalization. """ diff --git a/epigraphhub/data/_config.py b/epigraphhub/data/_config.py index 02d0ec4..04dbdf3 100644 --- a/epigraphhub/data/_config.py +++ b/epigraphhub/data/_config.py @@ -4,6 +4,8 @@ commonly used in data collection modules """ +from pathlib import Path + # Colombia COVID data config: from sodapy import Socrata @@ -27,3 +29,6 @@ # SINAN data config: SINAN_LOG_PATH = "/tmp/sinan_fetch.log" +_sinan_data = Path().home() / "pysus" +_sinan_data.mkdir(exist_ok=True) +PYSUS_DATA_PATH = str(_sinan_data) diff --git a/epigraphhub/data/brasil/sinan/extract.py b/epigraphhub/data/brasil/sinan/extract.py index bfea8e8..ed7d66f 100644 --- a/epigraphhub/data/brasil/sinan/extract.py +++ b/epigraphhub/data/brasil/sinan/extract.py @@ -3,7 +3,7 @@ from loguru import logger from pysus.online_data import SINAN -from epigraphhub.data._config import SINAN_LOG_PATH +from epigraphhub.data._config import PYSUS_DATA_PATH, SINAN_LOG_PATH logger.add(SINAN_LOG_PATH, retention="7 days") @@ -24,6 +24,6 @@ def download(disease: str): parquets_paths_list list(PosixPath) : A list with all parquets dirs. """ - SINAN.download_all_years_in_chunks(disease) + SINAN.download_all_years_in_chunks(disease, data_dir=PYSUS_DATA_PATH) - logger.info(f"All years for {disease} downloaded at /tmp/pysus") + logger.info(f"All years for {disease} downloaded at {PYSUS_DATA_PATH}") diff --git a/epigraphhub/data/brasil/sinan/loading.py b/epigraphhub/data/brasil/sinan/loading.py index 8b9581a..0eaed3b 100644 --- a/epigraphhub/data/brasil/sinan/loading.py +++ b/epigraphhub/data/brasil/sinan/loading.py @@ -6,7 +6,7 @@ from pysus.online_data import parquets_to_dataframe as to_df from epigraphhub.connection import get_engine -from epigraphhub.data._config import SINAN_LOG_PATH +from epigraphhub.data._config import PYSUS_DATA_PATH, SINAN_LOG_PATH from epigraphhub.settings import env logger.add(SINAN_LOG_PATH, retention="7 days") @@ -17,22 +17,21 @@ def upload(): """ Connects to the EGH SQL server and load all the chunks for all - diseases found at `/tmp/pysus` into database. This method cleans + diseases found at `$PYSUS_DATA_PATH` into database. This method cleans the chunks left. """ - diseases_dir = Path("/tmp/pysus").glob("*") + diseases_dir = Path(PYSUS_DATA_PATH).glob("*") di_years_dir = [x for x in diseases_dir if x.is_dir()] for dir in di_years_dir: - if "parquet" in Path(dir).suffix: - df = to_df(str(dir), clean_after_read=True) + if "parquet" in Path(dir).suffix and any(os.listdir(dir)): + df = to_df(str(dir), clean_after_read=False) df.columns = df.columns.str.lower() df.index.name = "index" table_i = str(dir).split("/")[-1].split(".parquet")[0] - st, yr = table_i[:-4].lower(), table_i[-2:] - table = "".join([st, yr]) + table = table_i[:-4].lower() schema = "brasil" with engine.connect() as conn: @@ -53,3 +52,4 @@ def upload(): except Exception as e: logger.error(f"Not able to upsert {table} \n{e}") + raise e diff --git a/tests/test_data/test_sinan_fetch.py b/tests/test_data/test_sinan_fetch.py index 43dd93e..9c86568 100644 --- a/tests/test_data/test_sinan_fetch.py +++ b/tests/test_data/test_sinan_fetch.py @@ -16,31 +16,28 @@ def setUp(self): self.engine = engine self.disease = "Zika" self.year = 2017 - self.fpath = ["/tmp/pysus/ZIKABR17.parquet"] - self.table = "zika17" + self.data_dir = Path.home() / "pysus" + self.file = ["ZIKABR17.parquet"] + self.table = "zika" self.schema = "brasil" def test_download_data_zika(self): extract.download(self.disease) - self.assertTrue(any(os.listdir("/tmp/pysus/"))) - self.assertTrue(self.fpath[0].split("/")[-1] in os.listdir("/tmp/pysus/")) + self.assertTrue(any(os.listdir(self.data_dir))) + self.assertTrue(self.file[0] in os.listdir(self.data_dir)) def test_parquet_visualization(self): - - df = viz.parquet(self.fpath[0], clean_after_read=False) - + fpath = Path(self.data_dir) / self.file[0] + df = viz.parquet(fpath, clean_after_read=False) self.assertIsInstance(df, pd.DataFrame) self.assertEqual(df.shape, (32684, 38)) @unittest.skip("Need table to test") # TODO: need table to test def test_save_to_pgsql(self): - - loading.upload(self.fpath) + loading.upload(self.file[0]) @unittest.skip("Need table to test") # TODO: need table to test def test_table_visualization(self): - df = viz.table(self.disease, self.year) - self.assertIsInstance(df, pd.DataFrame) self.assertFalse(df.empty)