From 94a44487a98d39ee2b44965d720658b848b3a96a Mon Sep 17 00:00:00 2001
From: prakaa <abiprakash007@gmail.com>
Date: Thu, 22 Aug 2024 10:54:20 +1000
Subject: [PATCH 1/5] do not include index in saved csvs

---
 src/isp_workbook_parser/parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/isp_workbook_parser/parser.py b/src/isp_workbook_parser/parser.py
index d289858..8250570 100644
--- a/src/isp_workbook_parser/parser.py
+++ b/src/isp_workbook_parser/parser.py
@@ -415,7 +415,7 @@ def save_tables(
         for table_name in tables:
             table = self.get_table(table_name, config_checks=config_checks)
             save_path = directory / Path(f"{table_name}.csv")
-            table.to_csv(save_path)
+            table.to_csv(save_path, index=False)
 
 
 class TableConfigError(Exception):

From afe093eea4062251d06401c51f9eb3f4a27b6fe1 Mon Sep 17 00:00:00 2001
From: prakaa <abiprakash007@gmail.com>
Date: Thu, 22 Aug 2024 13:22:29 +1000
Subject: [PATCH 2/5] add skip_rows to TableConfig

---
 src/isp_workbook_parser/config_model.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/isp_workbook_parser/config_model.py b/src/isp_workbook_parser/config_model.py
index 08ff988..1c7f32c 100644
--- a/src/isp_workbook_parser/config_model.py
+++ b/src/isp_workbook_parser/config_model.py
@@ -1,7 +1,7 @@
 import yaml
 
 from pydantic import BaseModel
-from typing import List
+from typing import List, Optional
 from pathlib import Path
 
 
@@ -36,6 +36,8 @@ class TableConfig(BaseModel):
             defined over multiple rows, then a list of the row numbers sorted in ascending order.
         end_row: the last row of table data.
         column_range: the columns over which the table is defined in the alphabetical format, i.e. 'B:F'
+        skip_rows: a single row number (`int`) or a list of row numbers to drop from the
+            table data, using the same row numbering as `header_rows` and `end_row`.
     """
 
     name: str
@@ -43,6 +45,7 @@ class TableConfig(BaseModel):
     header_rows: int | List[int]
     end_row: int
     column_range: str
+    skip_rows: Optional[int | List[int]] = None
 
 
 def load_yaml(path: Path) -> dict[str, TableConfig]:

From c65549c82f640ea19819f366af06b4d0d7a53f5b Mon Sep 17 00:00:00 2001
From: prakaa <abiprakash007@gmail.com>
Date: Thu, 22 Aug 2024 13:22:45 +1000
Subject: [PATCH 3/5] use skip_rows from TableConfig in read_table

---
 src/isp_workbook_parser/read_table.py | 28 +++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/isp_workbook_parser/read_table.py b/src/isp_workbook_parser/read_table.py
index 126cce2..ec40b80 100644
--- a/src/isp_workbook_parser/read_table.py
+++ b/src/isp_workbook_parser/read_table.py
@@ -1,5 +1,6 @@
 import pandas as pd
 from isp_workbook_parser import TableConfig
+from typing import Union, List
 import numpy as np
 
 
@@ -57,12 +58,28 @@ def _build_cleaned_dataframe(
         1. Dropping the header rows in the table
         2. Applying the merged headers as the columns of the DataFrame
         3. Forward fill values across columns (handles merged value cells)
+        4. Reset the DataFrame index
         """
         df_cleaned = df_initial.iloc[header_rows_in_table:, :]
         df_cleaned.columns = new_headers
-        df_cleaned = df_cleaned.ffill(axis=1)
+        df_cleaned = df_cleaned.ffill(axis=1).reset_index(drop=True)
         return df_cleaned
 
+    def _skip_rows_in_dataframe(
+        df: pd.DataFrame, config_skip_rows: Union[int, List[int]], last_header_row: int
+    ) -> pd.DataFrame:
+        """
+        Drop the rows listed in `config_skip_rows`: each row number is converted to a
+        0-based DataFrame index (row number - `last_header_row` - 1) and then dropped
+        """
+        df_reset_index = df.reset_index(drop=True)
+        if isinstance(config_skip_rows, int):
+            skip_rows = [config_skip_rows - last_header_row - 1]
+        else:
+            skip_rows = np.subtract(config_skip_rows, last_header_row + 1)
+        dropped = df_reset_index.drop(index=skip_rows).reset_index(drop=True)
+        return dropped
+
     if type(table.header_rows) is int:
         df = pd.read_excel(
             workbook_file,
@@ -71,6 +88,8 @@ def _build_cleaned_dataframe(
             usecols=table.column_range,
             nrows=(table.end_row - table.header_rows),
         )
+        if table.skip_rows:
+            df = _skip_rows_in_dataframe(df, table.skip_rows, table.header_rows)
         return df
     else:
         df_initial = pd.read_excel(
@@ -86,10 +105,11 @@ def _build_cleaned_dataframe(
         assert sorted(table.header_rows) == table.header_rows
         # check that the header_rows are adjacent
         assert set(np.diff(table.header_rows)) == set([1])
+        # apply skip_rows before header processing
+        # start processing multiple header rows
         header_rows_in_table = table.header_rows[-1] - table.header_rows[0]
         initial_header = pd.Series(df_initial.columns)
         ffilled_initial_header = _ffill_highest_header(initial_header)
-        # for multiple header rows
         filled_headers = []
         # ffill intermediate header rows
         for i in range(0, header_rows_in_table - 1):
@@ -108,4 +128,8 @@ def _build_cleaned_dataframe(
         df_cleaned = _build_cleaned_dataframe(
             df_initial, header_rows_in_table, merged_headers
         )
+        if table.skip_rows:
+            df_cleaned = _skip_rows_in_dataframe(
+                df_cleaned, table.skip_rows, table.header_rows[-1]
+            )
         return df_cleaned

From 4eb6f4cf4ef5e1265c76d7780eae64048879eec6 Mon Sep 17 00:00:00 2001
From: prakaa <abiprakash007@gmail.com>
Date: Thu, 22 Aug 2024 16:37:40 +1000
Subject: [PATCH 4/5] tests for skip rows

---
 tests/test_read_table_functionality.py | 45 ++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 tests/test_read_table_functionality.py

diff --git a/tests/test_read_table_functionality.py b/tests/test_read_table_functionality.py
new file mode 100644
index 0000000..43ac275
--- /dev/null
+++ b/tests/test_read_table_functionality.py
@@ -0,0 +1,45 @@
+from isp_workbook_parser.config_model import TableConfig
+
+
+def test_skip_single_row_in_single_header_row_table(workbook_v6):
+    table_config = TableConfig(
+        name="build_cost_current_policies",
+        sheet_name="Build costs",
+        header_rows=15,
+        end_row=30,
+        column_range="B:AI",
+        skip_rows=30,
+    )
+    df = workbook_v6.get_table_from_config(table_config)
+    assert len(df) == (table_config.end_row - table_config.header_rows - 1)
+    assert df[df.Technology.str.contains("Hydrogen")].empty
+
+
+def test_skip_multiple_rows_in_single_header_row_table(workbook_v6):
+    table_config = TableConfig(
+        name="existing_generator_maintenance_rates",
+        sheet_name="Maintenance",
+        header_rows=7,
+        end_row=19,
+        column_range="B:D",
+        skip_rows=[8, 9, 19],
+    )
+    df = workbook_v6.get_table_from_config(table_config)
+    assert len(df) == (table_config.end_row - table_config.header_rows - 3)
+    assert df[df["Generator type"].str.contains("Hydrogen")].empty
+    assert df[df["Generator type"].str.contains("Coal")].empty
+
+
+def test_skip_multiple_rows_in_multiple_header_row_table(workbook_v6):
+    table_config = TableConfig(
+        name="wind_high_capacity_factors",
+        sheet_name="Capacity Factors ",
+        header_rows=[7, 8, 9],
+        end_row=48,
+        column_range="B:R",
+        # skip the Victorian REZ rows (29-34) and the last row of the table (48)
+        skip_rows=(list(range(29, 35)) + [48]),
+    )
+    df = workbook_v6.get_table_from_config(table_config)
+    assert len(df) == (table_config.end_row - table_config.header_rows[-1] - 7)
+    assert df[df["Wind High - REZ ID"].str.contains("V")].empty

From fef1866fa398b8b271b6c68c706eb388e8979b5d Mon Sep 17 00:00:00 2001
From: Abi Prakash <abiprakash007@gmail.com>
Date: Thu, 22 Aug 2024 16:42:42 +1000
Subject: [PATCH 5/5] Remove incorrect comment

---
 src/isp_workbook_parser/read_table.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/isp_workbook_parser/read_table.py b/src/isp_workbook_parser/read_table.py
index ec40b80..bc9aebe 100644
--- a/src/isp_workbook_parser/read_table.py
+++ b/src/isp_workbook_parser/read_table.py
@@ -105,7 +105,6 @@ def _skip_rows_in_dataframe(
         assert sorted(table.header_rows) == table.header_rows
         # check that the header_rows are adjacent
         assert set(np.diff(table.header_rows)) == set([1])
-        # apply skip_rows before header processing
         # start processing multiple header rows
         header_rows_in_table = table.header_rows[-1] - table.header_rows[0]
         initial_header = pd.Series(df_initial.columns)