From 94a44487a98d39ee2b44965d720658b848b3a96a Mon Sep 17 00:00:00 2001 From: prakaa Date: Thu, 22 Aug 2024 10:54:20 +1000 Subject: [PATCH 1/5] do not include index in saved csvs --- src/isp_workbook_parser/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/isp_workbook_parser/parser.py b/src/isp_workbook_parser/parser.py index d289858..8250570 100644 --- a/src/isp_workbook_parser/parser.py +++ b/src/isp_workbook_parser/parser.py @@ -415,7 +415,7 @@ def save_tables( for table_name in tables: table = self.get_table(table_name, config_checks=config_checks) save_path = directory / Path(f"{table_name}.csv") - table.to_csv(save_path) + table.to_csv(save_path, index=False) class TableConfigError(Exception): From afe093eea4062251d06401c51f9eb3f4a27b6fe1 Mon Sep 17 00:00:00 2001 From: prakaa Date: Thu, 22 Aug 2024 13:22:29 +1000 Subject: [PATCH 2/5] add skip_rows to TableConfig --- src/isp_workbook_parser/config_model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/isp_workbook_parser/config_model.py b/src/isp_workbook_parser/config_model.py index 08ff988..1c7f32c 100644 --- a/src/isp_workbook_parser/config_model.py +++ b/src/isp_workbook_parser/config_model.py @@ -1,7 +1,7 @@ import yaml from pydantic import BaseModel -from typing import List +from typing import List, Optional from pathlib import Path @@ -36,6 +36,8 @@ class TableConfig(BaseModel): defined over multiple rows, then a list of the row numbers sorted in ascending order. end_row: the last row of table data. column_range: the columns over which the table is defined in the alphabetical format, i.e. 'B:F' + skip_rows: an `int` specifying a row to skip, or a list of `int` corresponding to + row numbers to skip. """ name: str @@ -43,6 +45,7 @@ class TableConfig(BaseModel): header_rows: int | List[int] end_row: int column_range: str + skip_rows: Optional[int | List[int]] = None def load_yaml(path: Path) -> dict[str, TableConfig]: From c65549c82f640ea19819f366af06b4d0d7a53f5b Mon Sep 17 00:00:00 2001 From: prakaa Date: Thu, 22 Aug 2024 13:22:45 +1000 Subject: [PATCH 3/5] use skip_rows from TableConfig in read_table --- src/isp_workbook_parser/read_table.py | 28 +++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/isp_workbook_parser/read_table.py b/src/isp_workbook_parser/read_table.py index 126cce2..ec40b80 100644 --- a/src/isp_workbook_parser/read_table.py +++ b/src/isp_workbook_parser/read_table.py @@ -1,5 +1,6 @@ import pandas as pd from isp_workbook_parser import TableConfig +from typing import Union, List import numpy as np @@ -57,12 +58,28 @@ def _build_cleaned_dataframe( 1. Dropping the header rows in the table 2. Applying the merged headers as the columns of the DataFrame 3. Forward fill values across columns (handles merged value cells) + 4. Reset the DataFrame index """ df_cleaned = df_initial.iloc[header_rows_in_table:, :] df_cleaned.columns = new_headers - df_cleaned = df_cleaned.ffill(axis=1) + df_cleaned = df_cleaned.ffill(axis=1).reset_index(drop=True) return df_cleaned + def _skip_rows_in_dataframe( + df: pd.DataFrame, config_skip_rows: Union[int, List[int]], last_header_row: int + ) -> pd.DataFrame: + """ + Drop rows specified by `skip_rows` by applying an offset from the header and + dropping based on index values + """ + df_reset_index = df.reset_index(drop=True) + if isinstance(config_skip_rows, int): + skip_rows = [config_skip_rows - last_header_row - 1] + else: + skip_rows = np.subtract(config_skip_rows, last_header_row + 1) + dropped = df_reset_index.drop(index=skip_rows).reset_index(drop=True) + return dropped + if type(table.header_rows) is int: df = pd.read_excel( workbook_file, @@ -71,6 +88,8 @@ def _build_cleaned_dataframe( usecols=table.column_range, nrows=(table.end_row - table.header_rows), ) + if table.skip_rows: + df = _skip_rows_in_dataframe(df, table.skip_rows, table.header_rows) return df else: df_initial = pd.read_excel( @@ -86,10 +105,11 @@ def _build_cleaned_dataframe( assert sorted(table.header_rows) == table.header_rows # check that the header_rows are adjacent assert set(np.diff(table.header_rows)) == set([1]) + # apply skip_rows before header processing + # start processing multiple header rows header_rows_in_table = table.header_rows[-1] - table.header_rows[0] initial_header = pd.Series(df_initial.columns) ffilled_initial_header = _ffill_highest_header(initial_header) - # for multiple header rows filled_headers = [] # ffill intermediate header rows for i in range(0, header_rows_in_table - 1): @@ -108,4 +128,8 @@ def _build_cleaned_dataframe( df_cleaned = _build_cleaned_dataframe( df_initial, header_rows_in_table, merged_headers ) + if table.skip_rows: + df_cleaned = _skip_rows_in_dataframe( + df_cleaned, table.skip_rows, table.header_rows[-1] + ) return df_cleaned From 4eb6f4cf4ef5e1265c76d7780eae64048879eec6 Mon Sep 17 00:00:00 2001 From: prakaa Date: Thu, 22 Aug 2024 16:37:40 +1000 Subject: [PATCH 4/5] tests for skip rows --- tests/test_read_table_functionality.py | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 tests/test_read_table_functionality.py diff --git a/tests/test_read_table_functionality.py b/tests/test_read_table_functionality.py new file mode 100644 index 0000000..43ac275 --- /dev/null +++ b/tests/test_read_table_functionality.py @@ -0,0 +1,45 @@ +from isp_workbook_parser.config_model import TableConfig + + +def test_skip_single_row_in_single_header_row_table(workbook_v6): + table_config = TableConfig( + name="build_cost_current_policies", + sheet_name="Build costs", + header_rows=15, + end_row=30, + column_range="B:AI", + skip_rows=30, + ) + df = workbook_v6.get_table_from_config(table_config) + assert len(df) == (table_config.end_row - table_config.header_rows - 1) + assert df[df.Technology.str.contains("Hydrogen")].empty + + +def test_skip_multiple_rows_in_single_header_row_table(workbook_v6): + table_config = TableConfig( + name="existing_generator_maintenance_rates", + sheet_name="Maintenance", + header_rows=7, + end_row=19, + column_range="B:D", + skip_rows=[8, 9, 19], + ) + df = workbook_v6.get_table_from_config(table_config) + assert len(df) == (table_config.end_row - table_config.header_rows - 3) + assert df[df["Generator type"].str.contains("Hydrogen")].empty + assert df[df["Generator type"].str.contains("Coal")].empty + + +def test_skip_multiple_rows_in_multiple_header_row_table(workbook_v6): + table_config = TableConfig( + name="wind_high_capacity_factors", + sheet_name="Capacity Factors ", + header_rows=[7, 8, 9], + end_row=48, + column_range="B:R", + # Victoria + skip_rows=(list(range(29, 35)) + [48]), + ) + df = workbook_v6.get_table_from_config(table_config) + assert len(df) == (table_config.end_row - table_config.header_rows[-1] - 7) + assert df[df["Wind High - REZ ID"].str.contains("V")].empty From fef1866fa398b8b271b6c68c706eb388e8979b5d Mon Sep 17 00:00:00 2001 From: Abi Prakash Date: Thu, 22 Aug 2024 16:42:42 +1000 Subject: [PATCH 5/5] Remove incorrect comment --- src/isp_workbook_parser/read_table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/isp_workbook_parser/read_table.py b/src/isp_workbook_parser/read_table.py index ec40b80..bc9aebe 100644 --- a/src/isp_workbook_parser/read_table.py +++ b/src/isp_workbook_parser/read_table.py @@ -105,7 +105,6 @@ def _skip_rows_in_dataframe( assert sorted(table.header_rows) == table.header_rows # check that the header_rows are adjacent assert set(np.diff(table.header_rows)) == set([1]) - # apply skip_rows before header processing # start processing multiple header rows header_rows_in_table = table.header_rows[-1] - table.header_rows[0] initial_header = pd.Series(df_initial.columns)