Skip to content

Commit

Permalink
fix(sinan): use home dir to download sinan data (#209)
Browse files Browse the repository at this point in the history
* fix(sinan): use home dir to download sinan data

* removing tables split by years

* Delete poetry.lock

* Resetting poetry lock

* Fix tests

* linter

* Linter (unrelated)

* SINAN_DATA_PATH to PYSUS_DATA_PATH
  • Loading branch information
luabida authored Feb 7, 2023
1 parent ec69133 commit 70fd64b
Show file tree
Hide file tree
Showing 8 changed files with 68 additions and 68 deletions.
60 changes: 29 additions & 31 deletions docs/tutorials/forecast_switzerland/forecast_swiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ def get_clusters_swiss(t=0.3, end_date=None):
"""
Params to get the list of clusters computed by the compute_cluster
function.
Parameters
----------
t : float
Threshold used in the clustering.
end_date : str
Indicates the last day used to compute the cluster.
Returns
-------
Array
Expand Down Expand Up @@ -76,12 +76,12 @@ def get_clusters_swiss(t=0.3, end_date=None):
def get_cluster_by_canton(canton):
"""
Function to return the cluster that contains a specific canton.
Parameters
----------
canton : str
Name (two letters code) of the canton.
Returns
-------
List
Expand All @@ -99,11 +99,11 @@ def remove_zeros(tgt):
"""
Function to remove the zeros of the target curve. It needs to be
done so that we are able to use the LogNormal dist.
Parameters
----------
tgt : array
"""

tgt[tgt == 0] = 0.01
Expand All @@ -129,10 +129,10 @@ def train_eval_single_canton(
):
"""
Function to train and evaluate the model for one georegion.
Important: * By default the function is using the clustering cantons
and the
data since 2020.
data since 2020.
* For the predictor hospCapacity is used as predictor the column
ICU_Covid19Patients.
Expand Down Expand Up @@ -177,9 +177,9 @@ def train_eval_single_canton(
-------
pd.DataFrame
The return is a pandas DataFrame.
"""

cluster_canton = [canton] # get_cluster_by_canton(canton)

target_name = f"{target_curve_name}_{canton}"
Expand Down Expand Up @@ -242,13 +242,13 @@ def train_eval_all_cantons(

"""
Function to make prediction for all the cantons.
Important:
* By default the function is using the clustering cantons and the
data since 2020.
* For the predictor hospCapacity is used as predictor the column
ICU_Covid19Patients.
Parameters
----------
target_curve_name : str
Expand Down Expand Up @@ -277,7 +277,7 @@ def train_eval_all_cantons(
look_back : int
Number of the last days that will be used to forecast the next
days.
Returns
-------
pd.DataFrame
Expand Down Expand Up @@ -357,13 +357,13 @@ def train_single_canton(

"""
Function to train and evaluate the model for one georegion.
Important: * By default the function is using the clustering cantons
and the
data since 2020.
* For the predictor hospCapacity is used as predictor the column
ICU_Covid19Patients.
Parameters
----------
canton : str
Expand All @@ -378,7 +378,7 @@ def train_single_canton(
Determines the beginning of the train dataset
path : str
Determines where the model trained will be saved.
update_data : bool
update_data : bool
Determines if the data from the Geneva hospital will be used.
This param is only used when canton = GE and target_curve_name
= hosp.
Expand All @@ -390,7 +390,7 @@ def train_single_canton(
look_back : int
Number of the last days that will be used to forecast the next
days.
Returns
-------
None
Expand Down Expand Up @@ -449,17 +449,17 @@ def train_all_cantons(
look_back=14,
path=None,
):

"""
Function to train and evaluate the model for all the cantons in
Switzerland.
Important:
Important:
* By default the function is using the clustering cantons and the
data since 2020.
data since 2020.
* For the predictor hospCapacity is used as predictor the column
ICU_Covid19Patients.
Parameters
----------
target_curve_name : str
Expand All @@ -480,14 +480,13 @@ def train_all_cantons(
look_back : int
Number of the last days that will be used to forecast the next
days.
Returns
-------
pd.DataFrame
Dataframe with the forecast for all the cantons.
"""


clusters = get_clusters_swiss(t=0.6)

for cluster in clusters:
Expand Down Expand Up @@ -541,13 +540,13 @@ def forecast_single_canton(
):
"""
Function to make the forecast for one canton.
Important:
* By default the function is using the clustering cantons and the
data since 2020.
* For the predictor hospCapacity is used as predictor the column
ICU_Covid19Patients.
Parameters
----------
target_curve_name : str
Expand All @@ -569,14 +568,13 @@ def forecast_single_canton(
look_back : int
Number of the last days that will be used to forecast the next
days.
Returns
-------
pd.DataFrame
Dataframe with the forecast for one canton.
"""


cluster_canton = [canton] # get_cluster_by_canton(canton)

df = get_cluster_data(
Expand Down Expand Up @@ -609,13 +607,13 @@ def forecast_all_cantons(
):
"""
Function to make the forecast for all the cantons.
Important:
* By default the function is using the clustering cantons and the
data since 2020.
* For the predictor hospCapacity is used as predictor the column
ICU_Covid19Patients.
Parameters
----------
target_curve_name : str
Expand All @@ -630,7 +628,7 @@ def forecast_all_cantons(
Determines from what day the forecast will be computed.
path : str
Indicates where the models trained are saved.
Returns
-------
pd.DataFrame
Expand Down
16 changes: 8 additions & 8 deletions epigraphhub/analysis/forecast_models/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ def compute_metrics(df_pred: pd.DataFrame) -> pd.DataFrame:
method in the train and test sample. The predictions must be saved
in a dataset with the following columns: 'median', 'target' and
'train_size'.
This function uses the following metrics:
- explained variance score;
- mean absolute error;
- mean squared error;
- root mean squared error;
- mean squared log error;
- mean absolute percentage error.
This function uses the following metrics:
- explained variance score;
- mean absolute error;
- mean squared error;
- root mean squared error;
- mean squared log error;
- mean absolute percentage error.
To compute this metrics we use the implementations of the
sklearn.metrics package.
Expand Down
8 changes: 4 additions & 4 deletions epigraphhub/analysis/forecast_models/ngboost_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,12 +136,12 @@ def train_eval(
Returns
-------
pd.DataFrame
A DataFrame with four columns (and a date index):
A DataFrame with four columns (and a date index):
- target: The target values.
- target: The target values.
- lower: The lower value of the confidence interval of 95%.
- median: The median value of the confidence interval of
95%.
95%.
- upper: The upper value of the confidence interval of 95%.
- train_size: The number of rows of data using as training
data.
Expand Down Expand Up @@ -361,7 +361,7 @@ def forecast(
- lower: The lower value of the confidence interval of 95%.
- median: The median value of the confidence interval of
95%.
95%.
- upper: The upper value of the confidence interval of 95%.
"""

Expand Down
8 changes: 4 additions & 4 deletions epigraphhub/analysis/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,9 +187,9 @@ def lstm_split_data(
Returns
-------
Tuple[np.array,np.array,np.array,np.array]
X_train: array of features to train the model.
y_train: array of targets to train the model.
X_test: array of features to test the model.
X_train: array of features to train the model.
y_train: array of targets to train the model.
X_test: array of features to test the model.
y_test: array of targets to test the model.
"""

Expand Down Expand Up @@ -233,7 +233,7 @@ def normalize_data(
Returns
-------
Tuple[pd.DataFrame, pd.Series]
pd.DataFrame: normalized DataFrame.
pd.DataFrame: normalized DataFrame.
pd.Series: Series of the max
values used in the normalization.
"""
Expand Down
5 changes: 5 additions & 0 deletions epigraphhub/data/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
commonly used in data collection modules
"""

from pathlib import Path

# Colombia COVID data config:
from sodapy import Socrata

Expand All @@ -27,3 +29,6 @@

# SINAN data config:
SINAN_LOG_PATH = "/tmp/sinan_fetch.log"
_sinan_data = Path().home() / "pysus"
_sinan_data.mkdir(exist_ok=True)
PYSUS_DATA_PATH = str(_sinan_data)
6 changes: 3 additions & 3 deletions epigraphhub/data/brasil/sinan/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from loguru import logger
from pysus.online_data import SINAN

from epigraphhub.data._config import SINAN_LOG_PATH
from epigraphhub.data._config import PYSUS_DATA_PATH, SINAN_LOG_PATH

logger.add(SINAN_LOG_PATH, retention="7 days")

Expand All @@ -24,6 +24,6 @@ def download(disease: str):
parquets_paths_list list(PosixPath) : A list with all parquets dirs.
"""

SINAN.download_all_years_in_chunks(disease)
SINAN.download_all_years_in_chunks(disease, data_dir=PYSUS_DATA_PATH)

logger.info(f"All years for {disease} downloaded at /tmp/pysus")
logger.info(f"All years for {disease} downloaded at {PYSUS_DATA_PATH}")
14 changes: 7 additions & 7 deletions epigraphhub/data/brasil/sinan/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pysus.online_data import parquets_to_dataframe as to_df

from epigraphhub.connection import get_engine
from epigraphhub.data._config import SINAN_LOG_PATH
from epigraphhub.data._config import PYSUS_DATA_PATH, SINAN_LOG_PATH
from epigraphhub.settings import env

logger.add(SINAN_LOG_PATH, retention="7 days")
Expand All @@ -17,22 +17,21 @@
def upload():
"""
Connects to the EGH SQL server and load all the chunks for all
diseases found at `/tmp/pysus` into database. This method cleans
diseases found at `$PYSUS_DATA_PATH` into database. This method cleans
the chunks left.
"""
diseases_dir = Path("/tmp/pysus").glob("*")
diseases_dir = Path(PYSUS_DATA_PATH).glob("*")
di_years_dir = [x for x in diseases_dir if x.is_dir()]

for dir in di_years_dir:
if "parquet" in Path(dir).suffix:
df = to_df(str(dir), clean_after_read=True)
if "parquet" in Path(dir).suffix and any(os.listdir(dir)):
df = to_df(str(dir), clean_after_read=False)
df.columns = df.columns.str.lower()
df.index.name = "index"

table_i = str(dir).split("/")[-1].split(".parquet")[0]
st, yr = table_i[:-4].lower(), table_i[-2:]
table = "".join([st, yr])
table = table_i[:-4].lower()
schema = "brasil"

with engine.connect() as conn:
Expand All @@ -53,3 +52,4 @@ def upload():

except Exception as e:
logger.error(f"Not able to upsert {table} \n{e}")
raise e
Loading

0 comments on commit 70fd64b

Please sign in to comment.