Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip rows functionality #12

Merged
merged 5 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/isp_workbook_parser/config_model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import yaml

from pydantic import BaseModel
from typing import List
from typing import List, Optional
from pathlib import Path


Expand Down Expand Up @@ -36,13 +36,16 @@ class TableConfig(BaseModel):
defined over multiple rows, then a list of the row numbers sorted in ascending order.
end_row: the last row of table data.
column_range: the columns over which the table is defined in the alphabetical format, i.e. 'B:F'
skip_rows: an `int` specifying a row to skip, or a list of `int` corresponding to
row numbers to skip.
"""

name: str
sheet_name: str
header_rows: int | List[int]
end_row: int
column_range: str
skip_rows: Optional[int | List[int]] = None


def load_yaml(path: Path) -> dict[str, TableConfig]:
Expand Down
2 changes: 1 addition & 1 deletion src/isp_workbook_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@
for table_name in tables:
table = self.get_table(table_name, config_checks=config_checks)
save_path = directory / Path(f"{table_name}.csv")
table.to_csv(save_path)
table.to_csv(save_path, index=False)

Check warning on line 418 in src/isp_workbook_parser/parser.py

View check run for this annotation

Codecov / codecov/patch

src/isp_workbook_parser/parser.py#L418

Added line #L418 was not covered by tests


class TableConfigError(Exception):
Expand Down
28 changes: 26 additions & 2 deletions src/isp_workbook_parser/read_table.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
from isp_workbook_parser import TableConfig
from typing import Union, List
import numpy as np


Expand Down Expand Up @@ -57,12 +58,28 @@ def _build_cleaned_dataframe(
1. Dropping the header rows in the table
2. Applying the merged headers as the columns of the DataFrame
3. Forward fill values across columns (handles merged value cells)
4. Reset the DataFrame index
"""
df_cleaned = df_initial.iloc[header_rows_in_table:, :]
df_cleaned.columns = new_headers
df_cleaned = df_cleaned.ffill(axis=1)
df_cleaned = df_cleaned.ffill(axis=1).reset_index(drop=True)
return df_cleaned

def _skip_rows_in_dataframe(
df: pd.DataFrame, config_skip_rows: Union[int, List[int]], last_header_row: int
) -> pd.DataFrame:
"""
Drop rows specified by `skip_rows` by applying an offset from the header and
dropping based on index values
"""
df_reset_index = df.reset_index(drop=True)
if isinstance(config_skip_rows, int):
skip_rows = [config_skip_rows - last_header_row - 1]
else:
skip_rows = np.subtract(config_skip_rows, last_header_row + 1)
dropped = df_reset_index.drop(index=skip_rows).reset_index(drop=True)
return dropped

if type(table.header_rows) is int:
df = pd.read_excel(
workbook_file,
Expand All @@ -71,6 +88,8 @@ def _build_cleaned_dataframe(
usecols=table.column_range,
nrows=(table.end_row - table.header_rows),
)
if table.skip_rows:
df = _skip_rows_in_dataframe(df, table.skip_rows, table.header_rows)
return df
else:
df_initial = pd.read_excel(
Expand All @@ -86,10 +105,11 @@ def _build_cleaned_dataframe(
assert sorted(table.header_rows) == table.header_rows
# check that the header_rows are adjacent
assert set(np.diff(table.header_rows)) == set([1])
# apply skip_rows before header processing
# start processing multiple header rows
header_rows_in_table = table.header_rows[-1] - table.header_rows[0]
initial_header = pd.Series(df_initial.columns)
ffilled_initial_header = _ffill_highest_header(initial_header)
# for multiple header rows
filled_headers = []
# ffill intermediate header rows
for i in range(0, header_rows_in_table - 1):
Expand All @@ -108,4 +128,8 @@ def _build_cleaned_dataframe(
df_cleaned = _build_cleaned_dataframe(
df_initial, header_rows_in_table, merged_headers
)
if table.skip_rows:
df_cleaned = _skip_rows_in_dataframe(
df_cleaned, table.skip_rows, table.header_rows[-1]
)
return df_cleaned
45 changes: 45 additions & 0 deletions tests/test_read_table_functionality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from isp_workbook_parser.config_model import TableConfig


def test_skip_single_row_in_single_header_row_table(workbook_v6):
    """Skipping one row of a single-header-row table drops exactly that row."""
    config = TableConfig(
        name="build_cost_current_policies",
        sheet_name="Build costs",
        header_rows=15,
        end_row=30,
        column_range="B:AI",
        skip_rows=30,
    )
    table = workbook_v6.get_table_from_config(config)
    expected_length = config.end_row - config.header_rows - 1
    assert len(table) == expected_length
    assert table[table.Technology.str.contains("Hydrogen")].empty


def test_skip_multiple_rows_in_single_header_row_table(workbook_v6):
    """Skipping a list of rows in a single-header-row table drops all of them."""
    rows_to_skip = [8, 9, 19]
    config = TableConfig(
        name="existing_generator_maintenance_rates",
        sheet_name="Maintenance",
        header_rows=7,
        end_row=19,
        column_range="B:D",
        skip_rows=rows_to_skip,
    )
    table = workbook_v6.get_table_from_config(config)
    expected_length = config.end_row - config.header_rows - len(rows_to_skip)
    assert len(table) == expected_length
    assert table[table["Generator type"].str.contains("Hydrogen")].empty
    assert table[table["Generator type"].str.contains("Coal")].empty


def test_skip_multiple_rows_in_multiple_header_row_table(workbook_v6):
    """Rows skipped in a multi-header-row table are absent from the result."""
    # The Victorian REZ rows, plus the final data row.
    victoria_rows = list(range(29, 35))
    config = TableConfig(
        name="wind_high_capacity_factors",
        sheet_name="Capacity Factors ",
        header_rows=[7, 8, 9],
        end_row=48,
        column_range="B:R",
        skip_rows=victoria_rows + [48],
    )
    table = workbook_v6.get_table_from_config(config)
    rows_removed = len(victoria_rows) + 1
    assert len(table) == config.end_row - config.header_rows[-1] - rows_removed
    assert table[table["Wind High - REZ ID"].str.contains("V")].empty
Loading