Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip rows functionality #12

Merged
merged 5 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/isp_workbook_parser/config_model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import yaml

from pydantic import BaseModel
from typing import List
from typing import List, Optional
from pathlib import Path


Expand Down Expand Up @@ -36,13 +36,16 @@ class TableConfig(BaseModel):
defined over multiple rows, then a list of the row numbers sorted in ascending order.
end_row: the last row of table data.
column_range: the columns over which the table is defined in the alphabetical format, i.e. 'B:F'
skip_rows: an `int` specifying a row to skip, or a list of `int` corresponding to
row numbers to skip.
"""

name: str
sheet_name: str
header_rows: int | List[int]
end_row: int
column_range: str
skip_rows: Optional[int | List[int]] = None


def load_yaml(path: Path) -> dict[str, TableConfig]:
Expand Down
2 changes: 1 addition & 1 deletion src/isp_workbook_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@
for table_name in tables:
table = self.get_table(table_name, config_checks=config_checks)
save_path = directory / Path(f"{table_name}.csv")
table.to_csv(save_path)
table.to_csv(save_path, index=False)

Check warning on line 418 in src/isp_workbook_parser/parser.py

View check run for this annotation

Codecov / codecov/patch

src/isp_workbook_parser/parser.py#L418

Added line #L418 was not covered by tests


class TableConfigError(Exception):
Expand Down
28 changes: 26 additions & 2 deletions src/isp_workbook_parser/read_table.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
from isp_workbook_parser import TableConfig
from typing import Union, List
import numpy as np


Expand Down Expand Up @@ -57,12 +58,28 @@ def _build_cleaned_dataframe(
1. Dropping the header rows in the table
2. Applying the merged headers as the columns of the DataFrame
3. Forward fill values across columns (handles merged value cells)
4. Reset the DataFrame index
"""
df_cleaned = df_initial.iloc[header_rows_in_table:, :]
df_cleaned.columns = new_headers
df_cleaned = df_cleaned.ffill(axis=1)
df_cleaned = df_cleaned.ffill(axis=1).reset_index(drop=True)
return df_cleaned

def _skip_rows_in_dataframe(
df: pd.DataFrame, config_skip_rows: Union[int, List[int]], last_header_row: int
) -> pd.DataFrame:
"""
Drop rows specified by `skip_rows` by applying an offset from the header and
dropping based on index values
"""
df_reset_index = df.reset_index(drop=True)
if isinstance(config_skip_rows, int):
skip_rows = [config_skip_rows - last_header_row - 1]
else:
skip_rows = np.subtract(config_skip_rows, last_header_row + 1)
dropped = df_reset_index.drop(index=skip_rows).reset_index(drop=True)
return dropped

if type(table.header_rows) is int:
df = pd.read_excel(
workbook_file,
Expand All @@ -71,6 +88,8 @@ def _build_cleaned_dataframe(
usecols=table.column_range,
nrows=(table.end_row - table.header_rows),
)
if table.skip_rows:
df = _skip_rows_in_dataframe(df, table.skip_rows, table.header_rows)
return df
else:
df_initial = pd.read_excel(
Expand All @@ -86,10 +105,11 @@ def _build_cleaned_dataframe(
assert sorted(table.header_rows) == table.header_rows
# check that the header_rows are adjacent
assert set(np.diff(table.header_rows)) == set([1])
# apply skip_rows before header processing
# start processing multiple header rows
header_rows_in_table = table.header_rows[-1] - table.header_rows[0]
initial_header = pd.Series(df_initial.columns)
ffilled_initial_header = _ffill_highest_header(initial_header)
# for multiple header rows
filled_headers = []
# ffill intermediate header rows
for i in range(0, header_rows_in_table - 1):
Expand All @@ -108,4 +128,8 @@ def _build_cleaned_dataframe(
df_cleaned = _build_cleaned_dataframe(
df_initial, header_rows_in_table, merged_headers
)
if table.skip_rows:
df_cleaned = _skip_rows_in_dataframe(
df_cleaned, table.skip_rows, table.header_rows[-1]
)
return df_cleaned
45 changes: 45 additions & 0 deletions tests/test_read_table_functionality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from isp_workbook_parser.config_model import TableConfig


def test_skip_single_row_in_single_header_row_table(workbook_v6):
    """Skipping one row of a single-header-row table drops exactly that row."""
    config = TableConfig(
        name="build_cost_current_policies",
        sheet_name="Build costs",
        header_rows=15,
        end_row=30,
        column_range="B:AI",
        skip_rows=30,
    )
    table = workbook_v6.get_table_from_config(config)
    expected_length = config.end_row - config.header_rows - 1
    assert len(table) == expected_length
    assert table[table.Technology.str.contains("Hydrogen")].empty


def test_skip_multiple_rows_in_single_header_row_table(workbook_v6):
    """Skipping a list of rows in a single-header-row table drops all of them."""
    rows_to_skip = [8, 9, 19]
    config = TableConfig(
        name="existing_generator_maintenance_rates",
        sheet_name="Maintenance",
        header_rows=7,
        end_row=19,
        column_range="B:D",
        skip_rows=rows_to_skip,
    )
    table = workbook_v6.get_table_from_config(config)
    expected_length = config.end_row - config.header_rows - len(rows_to_skip)
    assert len(table) == expected_length
    assert table[table["Generator type"].str.contains("Hydrogen")].empty
    assert table[table["Generator type"].str.contains("Coal")].empty


def test_skip_multiple_rows_in_multiple_header_row_table(workbook_v6):
    """Rows skipped in a multi-header-row table are absent from the result."""
    # The Victorian REZ rows, plus the final data row.
    victoria_rows = list(range(29, 35))
    config = TableConfig(
        name="wind_high_capacity_factors",
        sheet_name="Capacity Factors ",
        header_rows=[7, 8, 9],
        end_row=48,
        column_range="B:R",
        skip_rows=victoria_rows + [48],
    )
    table = workbook_v6.get_table_from_config(config)
    rows_removed = len(victoria_rows) + 1
    assert len(table) == config.end_row - config.header_rows[-1] - rows_removed
    assert table[table["Wind High - REZ ID"].str.contains("V")].empty
Loading