Skip to content

Commit

Permalink
Merge pull request #159 from trevorb1/issue-156
Browse files Browse the repository at this point in the history
Extra column in CSV file, but not in config fails to raise error
  • Loading branch information
trevorb1 authored Apr 19, 2023
2 parents a10d336 + fdf4ebb commit 34e3224
Show file tree
Hide file tree
Showing 6 changed files with 384 additions and 58 deletions.
42 changes: 42 additions & 0 deletions src/otoole/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,45 @@ def __init__(

def __str__(self):
return f"{self.resource} -> {self.message}"


class OtooleIndexError(OtooleException):
    """Raised when the indices of the data disagree with the config file

    Arguments
    ---------
    resource : str
        Name of the resource whose indices are inconsistent
    config_indices: List[str]
        Indices declared for the resource in the config file
    data_indices: List[str]
        Indices found in the input data
    """

    def __init__(self, resource, config_indices, data_indices):
        self.message = "Indices inconsistent between config and data"
        self.resource = resource
        self.config_indices = config_indices
        self.data_indices = data_indices

    def __str__(self):
        # One sentence per piece of information; adjacent f-strings are
        # concatenated into a single message.
        return (
            f"{self.resource} -> {self.message}. "
            f"Config indices are {self.config_indices}. "
            f"Data indices are {self.data_indices}."
        )


class OtooleError(OtooleException):
    """Catch-all error carrying a resource name and a free-form message

    Arguments
    ---------
    resource : str
        Name of the resource the error refers to
    message : str
        Text describing what went wrong
    """

    def __init__(self, resource, message):
        self.resource, self.message = resource, message

    def __str__(self):
        resource, message = self.resource, self.message
        return f"{resource} -> {message}"
136 changes: 115 additions & 21 deletions src/otoole/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

import pandas as pd

from otoole.exceptions import OtooleNameMismatchError
from otoole.exceptions import OtooleIndexError, OtooleNameMismatchError

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -385,32 +385,126 @@ def _check_index(
details = self.user_config[name]

if details["type"] == "param":
logger.debug("Identified {} as a parameter".format(name))
try:
df.set_index(details["indices"], inplace=True)
except KeyError:
logger.debug("Unable to set index on {}".format(name))
pass
self._check_param_index_names(name=name, config=details, df=df)
elif details["type"] == "set":
self._check_set_index_names(name=name, df=df)

logger.debug(
"Column dtypes identified: {}".format(details["index_dtypes"])
)
logger.debug(df.head())
# Drop empty rows
df = (
df.dropna(axis=0, how="all")
.reset_index()
.astype(details["index_dtypes"])
.set_index(details["indices"])
)
else:
logger.debug("Identified {} as a set".format(name))
df = df.astype(details["dtype"])
df = self._check_index_dtypes(name=name, config=details, df=df)

input_data[name] = df

return input_data

@staticmethod
def _check_param_index_names(
    name: str, config: Dict[str, Any], df: pd.DataFrame
) -> None:
    """Compares the index names of a parameter's data with the config file

    When the dataframe carries no named index, every column except the
    last one is treated as an index column.

    Arguments
    ---------
    name: str
        Name of parameter
    config: Dict[str,Any]
        Configuration file data for the parameter
    df: pd.DataFrame
        Data read in for the parameter

    Raises
    ------
    OtooleIndexError
        If the indices found in the data differ from the configured ones
    """
    found = df.index.names
    if found[0] is None:  # for ReadMemory
        logger.debug(f"No mulit-index identified for {name}")
        found = list(df.columns)[:-1]  # Drop "VALUE"
    logger.debug(f"Actual indices for {name} are {found}")

    try:
        wanted = config["indices"]
    except KeyError:
        # Nothing to compare against, so the data is accepted as is
        logger.debug(f"No expected indices identifed for {name}")
        return
    else:
        logger.debug(f"Expected indices for {name} are {wanted}")

    indices_match = found == wanted
    if not indices_match:
        raise OtooleIndexError(
            resource=name,
            config_indices=wanted,
            data_indices=found,
        )

@staticmethod
def _check_set_index_names(name: str, df: pd.DataFrame) -> None:
    """Checks for proper set index name

    A set must consist of exactly one column, named "VALUE".

    Arguments
    ---------
    name: str
        Name of set
    df: pd.DataFrame
        Data read in for the set

    Raises
    ------
    OtooleIndexError
        If the columns of the data are anything other than ["VALUE"]
    """
    # Compare plain lists. ``df.columns == ["VALUE"]`` is an element-wise
    # comparison that raises ValueError when the number of columns is not
    # exactly one, which would hide the intended OtooleIndexError for data
    # with extra (or no) columns.
    actual_columns = list(df.columns)
    if actual_columns != ["VALUE"]:
        raise OtooleIndexError(
            resource=name,
            config_indices=["VALUE"],
            data_indices=actual_columns,
        )

@staticmethod
def _check_index_dtypes(
    name: str, config: Dict[str, Any], df: pd.DataFrame
) -> pd.DataFrame:
    """Checks datatypes of input data against config file

    For a parameter, rows that are entirely empty are dropped, the index
    columns are cast to the dtypes given in ``config["index_dtypes"]`` and
    the result is indexed by ``config["indices"]``. For anything else the
    data is cast to ``config["dtype"]``.

    Arguments
    ---------
    name: str
        Name of parameter
    config: Dict[str,Any]
        Configuration file data for the parameter
    df: pd.DataFrame
        Data read in for the parameter

    Returns
    -------
    pd.DataFrame
        input_data with corrected datatypes; the dataframe passed in is
        left unmodified
    """
    if config["type"] != "param":
        logger.debug("Identified {} as a set".format(name))
        return df.astype(config["dtype"])

    logger.debug("Identified {} as a parameter".format(name))
    try:
        # Deliberately not ``inplace=True``: the caller's dataframe must not
        # be changed as a side effect of the check.
        df = df.set_index(config["indices"])
    except KeyError:
        # The index columns are not among the columns (presumably because
        # the index is already set); carry on with the data as it is.
        logger.debug("Unable to set index on {}".format(name))

    logger.debug("Column dtypes identified: {}".format(config["index_dtypes"]))
    logger.debug(df.head())
    # Drop empty rows
    return (
        df.dropna(axis=0, how="all")
        .reset_index()
        .astype(config["index_dtypes"])
        .set_index(config["indices"])
    )

def _get_missing_input_dataframes(
self, input_data: Dict[str, pd.DataFrame], config_type: str
) -> Dict[str, pd.DataFrame]:
Expand Down
43 changes: 22 additions & 21 deletions src/otoole/read_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from amply import Amply
from flatten_dict import flatten

from otoole.exceptions import OtooleDeprecationError
from otoole.exceptions import OtooleDeprecationError, OtooleError
from otoole.input import ReadStrategy
from otoole.preprocess.longify_data import check_datatypes, check_set_datatype
from otoole.utils import create_name_mappings
Expand Down Expand Up @@ -45,7 +45,7 @@ def _check_set(self, df: pd.DataFrame, config_details: Dict, name: str):

return narrow

def _check_parameter(self, df: pd.DataFrame, expected_headers: List, name: str):
def _convert_wide_2_narrow(self, df: pd.DataFrame, name: str):
"""Converts a dataframe from wide to narrow format
Arguments
Expand All @@ -54,41 +54,42 @@ def _check_parameter(self, df: pd.DataFrame, expected_headers: List, name: str):
expected_headers: List
name: str
"""
actual_headers = df.columns
logger.debug("Expected headers for %s: %s", name, expected_headers)

if "REGION" in expected_headers and "REGION" not in actual_headers:
raise ValueError("No REGION column provided for %s", name)
actual_headers = list(df.columns)

if "MODEOFOPERATION" in actual_headers:
df = df.rename(columns={"MODEOFOPERATION": "MODE_OF_OPERATION"})

if actual_headers[-1] == "VALUE":
logger.info(
"%s is already in narrow form with headers %s", name, df.columns
f"{name} is already in narrow form with headers {actual_headers}"
)
narrow = df
converted_headers = actual_headers[:-1] # remove "VALUE"
else:
try:
converted_headers = [
x for x in actual_headers if not isinstance(x, int)
]
converted_headers += ["YEAR"]
if "VALUE" in converted_headers:
raise OtooleError(
resource=name,
message="'VALUE' can not be a header in wide format data",
)
narrow = pd.melt(
df,
id_vars=expected_headers[:-1],
var_name=expected_headers[-1], # Normally 'YEAR'
id_vars=converted_headers[:-1],
var_name=converted_headers[-1], # Normally 'YEAR'
value_name="new_VALUE",
)
narrow = narrow.rename(columns={"new_VALUE": "VALUE"})
logger.info(f"{name} reshaped from wide to narrow format")
except IndexError as ex:
logger.debug("Could not reshape %s", df.columns)
logger.debug(f"Could not reshape {name}")
raise ex

all_headers = expected_headers + ["VALUE"]
for column in all_headers:
if column not in narrow.columns:
logger.warning("%s not in header of %s", column, name)

logger.debug("Final all headers for %s: %s", name, all_headers)

return narrow[all_headers].set_index(expected_headers)
all_headers = converted_headers + ["VALUE"]
return narrow[all_headers].set_index(converted_headers)

def _whitespace_converter(self, indices: List[str]) -> Dict[str, Any]:
"""Creates converter for striping whitespace in dataframe
Expand Down Expand Up @@ -139,7 +140,7 @@ def read(
entity_type = config[mod_name]["type"]

if entity_type == "param":
narrow = self._check_parameter(df, config_details["indices"], mod_name)
narrow = self._convert_wide_2_narrow(df, mod_name)
elif entity_type == "set":
narrow = self._check_set(df, config_details, mod_name)

Expand Down Expand Up @@ -182,7 +183,7 @@ def read(

if entity_type == "param":
df = self._get_input_data(filepath, parameter, details, converter)
narrow = self._check_parameter(df, details["indices"], parameter)
narrow = self._convert_wide_2_narrow(df, parameter)
if not narrow.empty:
narrow_checked = check_datatypes(
narrow, self.user_config, parameter
Expand Down
Binary file modified tests/fixtures/combined_inputs.xlsx
Binary file not shown.
Loading

0 comments on commit 34e3224

Please sign in to comment.