From 3ff63a94a4e7593f3b49e8188e6deaa578a0ec53 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 11:13:39 -0400 Subject: [PATCH 01/15] First cut at useful functionality. --- dcicutils/sheet_utils.py | 301 ++++++++++++++++++++++++++++++++------- poetry.lock | 14 +- pyproject.toml | 1 + test/test_sheet_utils.py | 120 +++++++++++----- 4 files changed, 343 insertions(+), 93 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 8125f27d3..e2f0e1c4d 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -1,10 +1,15 @@ +import chardet import copy +import csv +import io +import openpyxl from dcicutils.common import AnyJsonData -from openpyxl import load_workbook +from dcicutils.misc_utils import ignored from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook -from typing import Any, Dict, List, Optional, Union +from tempfile import TemporaryFile +from typing import Any, Dict, Iterable, List, Union Header = str @@ -12,6 +17,40 @@ ParsedHeader = List[Union[str, int]] ParsedHeaders = List[ParsedHeader] SheetCellValue = Union[int, float, str] +SheetRow = List[SheetCellValue] +CsvReader = type(csv.reader(TemporaryFile())) + + +def prefer_number(value: SheetCellValue): + if isinstance(value, str): # the given value might be an int or float, in which case just fall through + if not value: + return None + value = value + ch0 = value[0] + if ch0 == '+' or ch0 == '-' or ch0.isdigit(): + try: + return int(value) + except Exception: + pass + try: + return float(value) + except Exception: + pass + # If we couldn't parse it as an int or float, fall through to returning the original value + pass + return value + + +def open_text_input_file_respecting_byte_order_mark(filename): + """ + Opens a file for text input, respecting a byte-order mark (BOM). 
+ """ + with io.open(filename, 'rb') as fp: + leading_bytes = fp.read(4 * 8) # 4 bytes is all we need + bom_info = chardet.detect(leading_bytes) + detected_encoding = bom_info and bom_info.get('encoding') # tread lightly + + return io.open(filename, 'r', encoding=detected_encoding) class ItemTools: @@ -90,7 +129,7 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed return parent @classmethod - def parse_value(cls, value: SheetCellValue) -> AnyJsonData: + def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData: if isinstance(value, str): lvalue = value.lower() # TODO: We could consult a schema to make this less heuristic, but this may do for now @@ -101,19 +140,9 @@ def parse_value(cls, value: SheetCellValue) -> AnyJsonData: elif lvalue == 'null' or lvalue == '': return None elif '|' in value: - return [cls.parse_value(subvalue) for subvalue in value.split('|')] + return [cls.parse_item_value(subvalue) for subvalue in value.split('|')] else: - ch0 = value[0] - if ch0 == '+' or ch0 == '-' or ch0.isdigit(): - try: - return int(value) - except Exception: - pass - try: - return float(value) - except Exception: - pass - return value + return prefer_number(value) else: # presumably a number (int or float) return value @@ -128,25 +157,122 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any cls.set_path_value(datum[key], more_path, value) -class WorkbookManager: +class AbstractTableSetManager: + """ + The TableSetManager is the spanning class of anything that wants to be able to load a table set, + regardless of what it wants to load it from. 
To do this, it must support a load_table_set method + that takes a filename and returns the file content in the form: + { + "Sheet1": [ + {...representation of row1 as some kind of dict...}, + {...representation of row2 as some kind of dict...} + ], + "Sheet2": [...], + ..., + } + Note that at this level of abstraction, we take no position on what form of representation is used + for the rows, as long as it is JSON data of some kind. It might be + {"col1": "val1", "col2", "val2", ...} + or it might be something more structured like + {"something": "val1", {"something_else": ["val2"]}} + Additionally, the values stored might be altered as well. In particular, the most likely alteration + is to turn "123" to 123 or "" to None, though the specifics of whether and how such transformations + happen is not constrained by this class. + """ @classmethod - def load_workbook(cls, filename: str): - wb = cls(filename) - return wb.load_content() + def load_table_set(cls, filename: str) -> Dict[str, List[AnyJsonData]]: + """ + Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. + For more information, see documentation of AbstractTableSetManager. + """ + raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") + + +class BasicTableSetManager(AbstractTableSetManager): + """ + A BasicTableManager provides some structure that most kinds of parsers will need. + In particular, everything will likely need some way of storing headers and some way of storing content + of each sheet. Even a csv file, which doesn't have multiple tabs can be seen as the degenerate case + of this where there's only one set of headers and only one block of content. 
+ """ + + def _create_sheet_processor_state(self, sheetname: str) -> Any: + """ + This method provides for the possibility that some parsers will want auxiliary state, + (such as parsed headers or a line count or a table of temporary names for objects to cross-link + or some other such feature) that it carries with it as it moves from line to line parsing things. + Subclasses might therefore want to make this do something more interesting. + """ + ignored(sheetname) # subclasses might need this, but we don't + return None def __init__(self, filename: str): self.filename: str = filename - self.workbook: Optional[Workbook] = None self.headers_by_sheetname: Dict[str, List[str]] = {} - self.content_by_sheetname: Dict[str, List[Any]] = {} + self.content_by_sheetname: Dict[str, List[AnyJsonData]] = {} + self.workbook: Any = self._initialize_workbook() def sheet_headers(self, sheetname: str) -> List[str]: return self.headers_by_sheetname[sheetname] - def sheet_content(self, sheetname: str) -> List[Any]: + def sheet_content(self, sheetname: str) -> List[AnyJsonData]: return self.content_by_sheetname[sheetname] + def _initialize_workbook(self) -> Any: + """This function is responsible for opening the workbook and returning a workbook object.""" + raise NotImplementedError(f"._initialize_workbook() is not implemented for {self.__class__.__name__}.") + + def load_content(self) -> Any: + raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") + + +class TableSetManager(BasicTableSetManager): + + @classmethod + def load_table_set(cls, filename: str) -> AnyJsonData: + table_set_manager: TableSetManager = cls(filename) + return table_set_manager.load_content() + + def __init__(self, filename: str): + super().__init__(filename=filename) + + @property + def sheetnames(self) -> List[str]: + raise NotImplementedError(f".sheetnames is not implemented for {self.__class__.__name__}..") + + def _raw_row_generator_for_sheetname(self, sheetname: str) 
-> Iterable[SheetRow]: + """ + Given a sheetname and a state (returned by _sheet_loader_state), return a generator for a set of row values. + What constitutes a row is just something that _sheet_col_enumerator will be happy receiving. + """ + raise NotImplementedError(f"._rows_for_sheetname(...) is not implemented for {self.__class__.__name__}.") + + def _process_row(self, sheetname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: + """ + This needs to take a state and whatever represents a row and + must return a list of objects representing column values. + What constitutes a row is just something that _sheet_col_enumerator will be happy receiving. + """ + raise NotImplementedError(f"._process_row(...) is not implemented for {self.__class__.__name__}.") + + def load_content(self) -> AnyJsonData: + for sheetname in self.sheetnames: + sheet_content = [] + state = self._create_sheet_processor_state(sheetname) + for row_data in self._raw_row_generator_for_sheetname(sheetname): + processed_row_data: AnyJsonData = self._process_row(sheetname, state, row_data) + sheet_content.append(processed_row_data) + self.content_by_sheetname[sheetname] = sheet_content + return self.content_by_sheetname + + @classmethod + def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: + return prefer_number(value) + + +class XlsxManager(TableSetManager): + @classmethod def _all_rows(cls, sheet: Worksheet): row_max = sheet.max_row @@ -159,32 +285,36 @@ def _all_cols(cls, sheet: Worksheet): for col in range(1, col_max + 1): yield col - def _load_headers(self, sheet: Worksheet): + @property + def sheetnames(self) -> List[str]: + return self.workbook.sheetnames + + def _initialize_workbook(self) -> Workbook: + return openpyxl.load_workbook(self.filename) + + def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: + sheet = self.workbook[sheetname] + return (self._get_raw_row_content_tuple(sheet, row) + for row in self._all_rows(sheet)) + + def 
_get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: + return [sheet.cell(row=row, column=col).value + for col in self._all_cols(sheet)] + + def _create_sheet_processor_state(self, sheetname: str) -> Headers: + sheet = self.workbook[sheetname] headers: List[str] = [str(sheet.cell(row=1, column=col).value) for col in self._all_cols(sheet)] self.headers_by_sheetname[sheet.title] = headers + return headers - def _load_row(self, *, sheet: Worksheet, row: int): - headers = self.sheet_headers(sheet.title) - row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value - for col in self._all_cols(sheet)} - return row_dict - - def load_content(self): - workbook: Workbook = load_workbook(self.filename) - self.workbook = workbook - for sheetname in workbook.sheetnames: - sheet: Worksheet = workbook[sheetname] - self._load_headers(sheet) - content = [] - for row in self._all_rows(sheet): - row_dict = self._load_row(sheet=sheet, row=row) - content.append(row_dict) - self.content_by_sheetname[sheetname] = content - return self.content_by_sheetname + def _process_row(self, sheetname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(sheetname) + return {headers[i]: self.parse_cell_value(row_datum) + for i, row_datum in enumerate(row_data)} -class ItemManager(ItemTools, WorkbookManager): +class ItemManagerMixin(BasicTableSetManager): def __init__(self, filename: str): super().__init__(filename=filename) @@ -197,22 +327,85 @@ def sheet_patch_prototype(self, sheetname: str) -> Dict: def sheet_parsed_headers(self, sheetname: str) -> List[List[Union[int, str]]]: return self.parsed_headers_by_sheetname[sheetname] - def _load_headers(self, sheet: Worksheet): - super()._load_headers(sheet) - self._compile_sheet_headers(sheet.title) + def _create_sheet_processor_state(self, sheetname: str) -> ParsedHeaders: + super()._create_sheet_processor_state(sheetname) + self._compile_sheet_headers(sheetname) + return 
self.sheet_parsed_headers(sheetname) def _compile_sheet_headers(self, sheetname: str): headers = self.headers_by_sheetname[sheetname] - parsed_headers = self.parse_sheet_headers(headers) + parsed_headers = ItemTools.parse_sheet_headers(headers) self.parsed_headers_by_sheetname[sheetname] = parsed_headers - prototype = self.compute_patch_prototype(parsed_headers) + prototype = ItemTools.compute_patch_prototype(parsed_headers) self.patch_prototypes_by_sheetname[sheetname] = prototype - def _load_row(self, *, sheet: Worksheet, row: int): - parsed_headers = self.sheet_parsed_headers(sheet.title) - patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet.title)) - for col in self._all_cols(sheet): - value = sheet.cell(row=row, column=col).value - parsed_value = self.parse_value(value) - self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value) + def _process_row(self, sheetname: str, parsed_headers: ParsedHeaders, row_data: SheetRow) -> AnyJsonData: + patch_item = copy.deepcopy(self.sheet_patch_prototype(sheetname)) + for i, value in enumerate(row_data): + parsed_value = self.parse_cell_value(value) + ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value) return patch_item + + @classmethod + def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: + return ItemTools.parse_item_value(value) + + +class ItemXlsxManager(ItemManagerMixin, XlsxManager): + pass + + +class CsvManager(TableSetManager): + + DEFAULT_SHEET_NAME = 'Sheet1' + + def __init__(self, filename: str, sheet_name: str = None): + super().__init__(filename=filename) + self.sheet_name = sheet_name or self.DEFAULT_SHEET_NAME + + @property + def sheetnames(self) -> List[str]: + return [self.sheet_name] + + def _initialize_workbook(self) -> CsvReader: + return self._get_csv_reader(self.filename) + + @classmethod + def _get_csv_reader(cls, filename) -> CsvReader: + return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) + + def 
_raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: + return self.workbook + + def _create_sheet_processor_state(self, sheetname: str) -> Headers: + headers: Headers = self.headers_by_sheetname.get(sheetname) + if headers is None: + self.headers_by_sheetname[sheetname] = headers = self.workbook.__next__() + return headers + + def _process_row(self, sheetname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(sheetname) + return {headers[i]: self.parse_cell_value(row_datum) + for i, row_datum in enumerate(row_data)} + + +class ItemCsvManager(ItemManagerMixin, CsvManager): + pass + + +class ItemManager(AbstractTableSetManager): + + @classmethod + def create_workbook(cls, filename: str) -> BasicTableSetManager: + if filename.endswith(".xlsx"): + workbook = ItemXlsxManager(filename) + elif filename.endswith(".csv"): + workbook = ItemCsvManager(filename) + else: + raise ValueError("Unknown workbook type: ") + return workbook + + @classmethod + def load_table_set(cls, filename: str) -> AnyJsonData: + workbook = cls.create_workbook(filename) + return workbook.load_content() diff --git a/poetry.lock b/poetry.lock index 480148ea1..95670b506 100644 --- a/poetry.lock +++ b/poetry.lock @@ -489,6 +489,18 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "chardet" +version = "5.2.0" +description = "Universal encoding detector for Python 3" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + [[package]] name = "charset-normalizer" version = "3.2.0" @@ -1621,4 +1633,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.7,<3.10" -content-hash = 
"9d01884634874c0304ebd91ae564ad7920cece54aea7de4c67955c2343e7d44b" +content-hash = "eb629a04469e24b917d9525dd06dac72f2014cc9ede879946909929f5c09b9fd" diff --git a/pyproject.toml b/pyproject.toml index 8fd8826a4..0ca37b8cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ botocore = "^1.20.39" # This value is intentionally pinned and must not be changed casually. elasticsearch = "7.13.4" aws-requests-auth = ">=0.4.2,<1" +chardet = "^5.2.0" docker = "^4.4.4" gitpython = "^3.1.2" openpyxl = "^3.1.2" diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 40286d2e3..df1ed522c 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,7 +1,7 @@ import os import pytest -from dcicutils.sheet_utils import ItemTools, WorkbookManager, ItemManager +from dcicutils.sheet_utils import ItemTools, XlsxManager, ItemXlsxManager, CsvManager, ItemCsvManager, ItemManager from .conftest_settings import TEST_DIR @@ -52,39 +52,39 @@ def test_item_tools_compute_patch_prototype_errors(headers): assert str(exc.value) == "A header cannot begin with a numeric ref: 0" -def test_item_tools_parse_value(): +def test_item_tools_parse_item_value(): for x in [37, 19.3, True, False, None, 'simple text']: - assert ItemTools.parse_value(x) == x + assert ItemTools.parse_item_value(x) == x - assert ItemTools.parse_value('3') == 3 - assert ItemTools.parse_value('+3') == 3 - assert ItemTools.parse_value('-3') == -3 + assert ItemTools.parse_item_value('3') == 3 + assert ItemTools.parse_item_value('+3') == 3 + assert ItemTools.parse_item_value('-3') == -3 - assert ItemTools.parse_value('3.5') == 3.5 - assert ItemTools.parse_value('+3.5') == 3.5 - assert ItemTools.parse_value('-3.5') == -3.5 + assert ItemTools.parse_item_value('3.5') == 3.5 + assert ItemTools.parse_item_value('+3.5') == 3.5 + assert ItemTools.parse_item_value('-3.5') == -3.5 - assert ItemTools.parse_value('3.5e1') == 35.0 - assert ItemTools.parse_value('+3.5e1') == 35.0 - assert 
ItemTools.parse_value('-3.5e1') == -35.0 + assert ItemTools.parse_item_value('3.5e1') == 35.0 + assert ItemTools.parse_item_value('+3.5e1') == 35.0 + assert ItemTools.parse_item_value('-3.5e1') == -35.0 - assert ItemTools.parse_value('') is None + assert ItemTools.parse_item_value('') is None - assert ItemTools.parse_value('null') is None - assert ItemTools.parse_value('Null') is None - assert ItemTools.parse_value('NULL') is None + assert ItemTools.parse_item_value('null') is None + assert ItemTools.parse_item_value('Null') is None + assert ItemTools.parse_item_value('NULL') is None - assert ItemTools.parse_value('true') is True - assert ItemTools.parse_value('True') is True - assert ItemTools.parse_value('TRUE') is True + assert ItemTools.parse_item_value('true') is True + assert ItemTools.parse_item_value('True') is True + assert ItemTools.parse_item_value('TRUE') is True - assert ItemTools.parse_value('false') is False - assert ItemTools.parse_value('False') is False - assert ItemTools.parse_value('FALSE') is False + assert ItemTools.parse_item_value('false') is False + assert ItemTools.parse_item_value('False') is False + assert ItemTools.parse_item_value('FALSE') is False - assert ItemTools.parse_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] - assert ItemTools.parse_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] + assert ItemTools.parse_item_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] + assert ItemTools.parse_item_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] def test_item_tools_set_path_value(): @@ -158,40 +158,84 @@ def test_item_tools_set_path_value(): SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv') -SAMPLE_CSV_FILE_RAW_CONTENT = SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2'] +SAMPLE_CSV_FILE_RAW_CONTENT = {CsvManager.DEFAULT_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} -SAMPLE_CSV_FILE_ITEM_CONTENT = 
SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2'] +SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_SHEET_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} -def test_workbook_manager_load_content(): +def test_xlsx_manager_load_content(): - wt = WorkbookManager(SAMPLE_XLSX_FILE) + wt = XlsxManager(SAMPLE_XLSX_FILE) assert wt.load_content() == SAMPLE_XLSX_FILE_RAW_CONTENT -def test_workbook_manager_load_workbook(): +def test_xlsx_manager_load_workbook(): - assert WorkbookManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT + assert XlsxManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT -def test_workbook_manager_load_csv(): +def test_xlsx_manager_load_csv(): with pytest.raises(Exception): - WorkbookManager.load_workbook(SAMPLE_CSV_FILE) + XlsxManager.load_table_set(SAMPLE_CSV_FILE) -def test_item_manager_load_content(): +def test_item_xlsx_manager_load_content(): - it = ItemManager(SAMPLE_XLSX_FILE) + it = ItemXlsxManager(SAMPLE_XLSX_FILE) assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT -def test_item_manager_load_workbook(): +def test_item_xlsx_manager_load_workbook(): + + assert ItemXlsxManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + +def test_item_xlsx_manager_load_csv(): + + with pytest.raises(Exception): + ItemXlsxManager.load_table_set(SAMPLE_CSV_FILE) + - assert ItemManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT +def test_csv_manager_load_content(): + wt = CsvManager(SAMPLE_CSV_FILE) + assert wt.load_content() == SAMPLE_CSV_FILE_RAW_CONTENT -def test_item_manager_load_csv(): + +def test_csv_manager_load_workbook(): + + assert CsvManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT + + +def test_csv_manager_load_csv(): with pytest.raises(Exception): - ItemManager.load_workbook(SAMPLE_CSV_FILE) + CsvManager.load_table_set(SAMPLE_XLSX_FILE) + + +def test_item_csv_manager_load_content(): + + it = ItemCsvManager(SAMPLE_CSV_FILE) + assert 
it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT + + +def test_item_csv_manager_load_workbook(): + + assert ItemCsvManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + + +def test_item_csv_manager_load_csv(): + + with pytest.raises(Exception): + ItemCsvManager.load_table_set(SAMPLE_XLSX_FILE) + + +def test_item_manager_load_workbook(): + + assert ItemManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + assert ItemManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + + with pytest.raises(ValueError): + ItemManager.load_table_set("something.else") From 39bd2e095898b36c819e5330f19ab94591792a6c Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 11:20:34 -0400 Subject: [PATCH 02/15] Some name changes to make things more abstract. workbook becomes reader_agent, for example --- dcicutils/sheet_utils.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index e2f0e1c4d..82647ddb3 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -211,7 +211,7 @@ def __init__(self, filename: str): self.filename: str = filename self.headers_by_sheetname: Dict[str, List[str]] = {} self.content_by_sheetname: Dict[str, List[AnyJsonData]] = {} - self.workbook: Any = self._initialize_workbook() + self.reader_agent: Any = self._get_reader_agent() def sheet_headers(self, sheetname: str) -> List[str]: return self.headers_by_sheetname[sheetname] @@ -219,9 +219,9 @@ def sheet_headers(self, sheetname: str) -> List[str]: def sheet_content(self, sheetname: str) -> List[AnyJsonData]: return self.content_by_sheetname[sheetname] - def _initialize_workbook(self) -> Any: + def _get_reader_agent(self) -> Any: """This function is responsible for opening the workbook and returning a workbook object.""" - raise NotImplementedError(f"._initialize_workbook() is not implemented for {self.__class__.__name__}.") + 
raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") def load_content(self) -> Any: raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") @@ -287,13 +287,13 @@ def _all_cols(cls, sheet: Worksheet): @property def sheetnames(self) -> List[str]: - return self.workbook.sheetnames + return self.reader_agent.sheetnames - def _initialize_workbook(self) -> Workbook: + def _get_reader_agent(self) -> Workbook: return openpyxl.load_workbook(self.filename) def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: - sheet = self.workbook[sheetname] + sheet = self.reader_agent[sheetname] return (self._get_raw_row_content_tuple(sheet, row) for row in self._all_rows(sheet)) @@ -302,7 +302,7 @@ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: for col in self._all_cols(sheet)] def _create_sheet_processor_state(self, sheetname: str) -> Headers: - sheet = self.workbook[sheetname] + sheet = self.reader_agent[sheetname] headers: List[str] = [str(sheet.cell(row=1, column=col).value) for col in self._all_cols(sheet)] self.headers_by_sheetname[sheet.title] = headers @@ -367,7 +367,7 @@ def __init__(self, filename: str, sheet_name: str = None): def sheetnames(self) -> List[str]: return [self.sheet_name] - def _initialize_workbook(self) -> CsvReader: + def _get_reader_agent(self) -> CsvReader: return self._get_csv_reader(self.filename) @classmethod @@ -375,12 +375,12 @@ def _get_csv_reader(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: - return self.workbook + return self.reader_agent def _create_sheet_processor_state(self, sheetname: str) -> Headers: headers: Headers = self.headers_by_sheetname.get(sheetname) if headers is None: - self.headers_by_sheetname[sheetname] = headers = self.workbook.__next__() + 
self.headers_by_sheetname[sheetname] = headers = self.reader_agent.__next__() return headers def _process_row(self, sheetname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: @@ -396,16 +396,16 @@ class ItemCsvManager(ItemManagerMixin, CsvManager): class ItemManager(AbstractTableSetManager): @classmethod - def create_workbook(cls, filename: str) -> BasicTableSetManager: + def create_implementation_manager(cls, filename: str) -> BasicTableSetManager: if filename.endswith(".xlsx"): - workbook = ItemXlsxManager(filename) + reader_agent = ItemXlsxManager(filename) elif filename.endswith(".csv"): - workbook = ItemCsvManager(filename) + reader_agent = ItemCsvManager(filename) else: - raise ValueError("Unknown workbook type: ") - return workbook + raise ValueError(f"Unknown file type: {filename}") + return reader_agent @classmethod def load_table_set(cls, filename: str) -> AnyJsonData: - workbook = cls.create_workbook(filename) - return workbook.load_content() + manager = cls.create_implementation_manager(filename) + return manager.load_content() From 77b72f6452b72aed0b1ead9ba9a5a81c53838122 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 11:30:52 -0400 Subject: [PATCH 03/15] Rename sheetname to tabname throughout, to be more clear that this is not the workbook level artifact. Better handling of init args. --- dcicutils/sheet_utils.py | 134 ++++++++++++++++++++------------------- test/test_sheet_utils.py | 4 +- 2 files changed, 72 insertions(+), 66 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 82647ddb3..5608e61ea 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -180,6 +180,11 @@ class AbstractTableSetManager: happen is not constrained by this class. 
""" + @classmethod + def __init__(self, **kwargs): + if kwargs: + raise ValueError(f"Unexpectd keyword arguments initializing {self.__class__.__name__}: {kwargs}") + @classmethod def load_table_set(cls, filename: str) -> Dict[str, List[AnyJsonData]]: """ @@ -197,27 +202,28 @@ class BasicTableSetManager(AbstractTableSetManager): of this where there's only one set of headers and only one block of content. """ - def _create_sheet_processor_state(self, sheetname: str) -> Any: + def _create_sheet_processor_state(self, tabname: str) -> Any: """ This method provides for the possibility that some parsers will want auxiliary state, (such as parsed headers or a line count or a table of temporary names for objects to cross-link or some other such feature) that it carries with it as it moves from line to line parsing things. Subclasses might therefore want to make this do something more interesting. """ - ignored(sheetname) # subclasses might need this, but we don't + ignored(tabname) # subclasses might need this, but we don't return None - def __init__(self, filename: str): + def __init__(self, filename: str, **kwargs): + super().__init__(**kwargs) self.filename: str = filename - self.headers_by_sheetname: Dict[str, List[str]] = {} - self.content_by_sheetname: Dict[str, List[AnyJsonData]] = {} + self.headers_by_tabname: Dict[str, List[str]] = {} + self.content_by_tabname: Dict[str, List[AnyJsonData]] = {} self.reader_agent: Any = self._get_reader_agent() - def sheet_headers(self, sheetname: str) -> List[str]: - return self.headers_by_sheetname[sheetname] + def sheet_headers(self, tabname: str) -> List[str]: + return self.headers_by_tabname[tabname] - def sheet_content(self, sheetname: str) -> List[AnyJsonData]: - return self.content_by_sheetname[sheetname] + def sheet_content(self, tabname: str) -> List[AnyJsonData]: + return self.content_by_tabname[tabname] def _get_reader_agent(self) -> Any: """This function is responsible for opening the workbook and returning a workbook 
object.""" @@ -234,21 +240,21 @@ def load_table_set(cls, filename: str) -> AnyJsonData: table_set_manager: TableSetManager = cls(filename) return table_set_manager.load_content() - def __init__(self, filename: str): - super().__init__(filename=filename) + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) @property - def sheetnames(self) -> List[str]: - raise NotImplementedError(f".sheetnames is not implemented for {self.__class__.__name__}..") + def tabnames(self) -> List[str]: + raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..") - def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: + def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: """ - Given a sheetname and a state (returned by _sheet_loader_state), return a generator for a set of row values. + Given a tabname and a state (returned by _sheet_loader_state), return a generator for a set of row values. What constitutes a row is just something that _sheet_col_enumerator will be happy receiving. """ - raise NotImplementedError(f"._rows_for_sheetname(...) is not implemented for {self.__class__.__name__}.") + raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.") - def _process_row(self, sheetname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: + def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: """ This needs to take a state and whatever represents a row and must return a list of objects representing column values. @@ -257,14 +263,14 @@ def _process_row(self, sheetname: str, state: Any, row: List[SheetCellValue]) -> raise NotImplementedError(f"._process_row(...) 
is not implemented for {self.__class__.__name__}.") def load_content(self) -> AnyJsonData: - for sheetname in self.sheetnames: + for tabname in self.tabnames: sheet_content = [] - state = self._create_sheet_processor_state(sheetname) - for row_data in self._raw_row_generator_for_sheetname(sheetname): - processed_row_data: AnyJsonData = self._process_row(sheetname, state, row_data) + state = self._create_sheet_processor_state(tabname) + for row_data in self._raw_row_generator_for_tabname(tabname): + processed_row_data: AnyJsonData = self._process_row(tabname, state, row_data) sheet_content.append(processed_row_data) - self.content_by_sheetname[sheetname] = sheet_content - return self.content_by_sheetname + self.content_by_tabname[tabname] = sheet_content + return self.content_by_tabname @classmethod def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: @@ -286,14 +292,14 @@ def _all_cols(cls, sheet: Worksheet): yield col @property - def sheetnames(self) -> List[str]: + def tabnames(self) -> List[str]: return self.reader_agent.sheetnames def _get_reader_agent(self) -> Workbook: return openpyxl.load_workbook(self.filename) - def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: - sheet = self.reader_agent[sheetname] + def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: + sheet = self.reader_agent[tabname] return (self._get_raw_row_content_tuple(sheet, row) for row in self._all_rows(sheet)) @@ -301,46 +307,46 @@ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: return [sheet.cell(row=row, column=col).value for col in self._all_cols(sheet)] - def _create_sheet_processor_state(self, sheetname: str) -> Headers: - sheet = self.reader_agent[sheetname] + def _create_sheet_processor_state(self, tabname: str) -> Headers: + sheet = self.reader_agent[tabname] headers: List[str] = [str(sheet.cell(row=1, column=col).value) for col in self._all_cols(sheet)] - 
self.headers_by_sheetname[sheet.title] = headers + self.headers_by_tabname[sheet.title] = headers return headers - def _process_row(self, sheetname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: - ignored(sheetname) + def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tabname) return {headers[i]: self.parse_cell_value(row_datum) for i, row_datum in enumerate(row_data)} class ItemManagerMixin(BasicTableSetManager): - def __init__(self, filename: str): - super().__init__(filename=filename) - self.patch_prototypes_by_sheetname: Dict[str, Dict] = {} - self.parsed_headers_by_sheetname: Dict[str, List[List[Union[int, str]]]] = {} + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) + self.patch_prototypes_by_tabname: Dict[str, Dict] = {} + self.parsed_headers_by_tabname: Dict[str, List[List[Union[int, str]]]] = {} - def sheet_patch_prototype(self, sheetname: str) -> Dict: - return self.patch_prototypes_by_sheetname[sheetname] + def sheet_patch_prototype(self, tabname: str) -> Dict: + return self.patch_prototypes_by_tabname[tabname] - def sheet_parsed_headers(self, sheetname: str) -> List[List[Union[int, str]]]: - return self.parsed_headers_by_sheetname[sheetname] + def sheet_parsed_headers(self, tabname: str) -> List[List[Union[int, str]]]: + return self.parsed_headers_by_tabname[tabname] - def _create_sheet_processor_state(self, sheetname: str) -> ParsedHeaders: - super()._create_sheet_processor_state(sheetname) - self._compile_sheet_headers(sheetname) - return self.sheet_parsed_headers(sheetname) + def _create_sheet_processor_state(self, tabname: str) -> ParsedHeaders: + super()._create_sheet_processor_state(tabname) + self._compile_sheet_headers(tabname) + return self.sheet_parsed_headers(tabname) - def _compile_sheet_headers(self, sheetname: str): - headers = self.headers_by_sheetname[sheetname] + def _compile_sheet_headers(self, tabname: str): + headers = 
self.headers_by_tabname[tabname] parsed_headers = ItemTools.parse_sheet_headers(headers) - self.parsed_headers_by_sheetname[sheetname] = parsed_headers + self.parsed_headers_by_tabname[tabname] = parsed_headers prototype = ItemTools.compute_patch_prototype(parsed_headers) - self.patch_prototypes_by_sheetname[sheetname] = prototype + self.patch_prototypes_by_tabname[tabname] = prototype - def _process_row(self, sheetname: str, parsed_headers: ParsedHeaders, row_data: SheetRow) -> AnyJsonData: - patch_item = copy.deepcopy(self.sheet_patch_prototype(sheetname)) + def _process_row(self, tabname: str, parsed_headers: ParsedHeaders, row_data: SheetRow) -> AnyJsonData: + patch_item = copy.deepcopy(self.sheet_patch_prototype(tabname)) for i, value in enumerate(row_data): parsed_value = self.parse_cell_value(value) ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value) @@ -357,15 +363,15 @@ class ItemXlsxManager(ItemManagerMixin, XlsxManager): class CsvManager(TableSetManager): - DEFAULT_SHEET_NAME = 'Sheet1' + DEFAULT_TAB_NAME = 'Sheet1' - def __init__(self, filename: str, sheet_name: str = None): - super().__init__(filename=filename) - self.sheet_name = sheet_name or self.DEFAULT_SHEET_NAME + def __init__(self, filename: str, sheet_name: str = None, **kwargs): + super().__init__(filename=filename, **kwargs) + self.tab_name = sheet_name or self.DEFAULT_TAB_NAME @property - def sheetnames(self) -> List[str]: - return [self.sheet_name] + def tabnames(self) -> List[str]: + return [self.tab_name] def _get_reader_agent(self) -> CsvReader: return self._get_csv_reader(self.filename) @@ -374,17 +380,17 @@ def _get_reader_agent(self) -> CsvReader: def _get_csv_reader(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) - def _raw_row_generator_for_sheetname(self, sheetname: str) -> Iterable[SheetRow]: + def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: return self.reader_agent - def 
_create_sheet_processor_state(self, sheetname: str) -> Headers: - headers: Headers = self.headers_by_sheetname.get(sheetname) + def _create_sheet_processor_state(self, tabname: str) -> Headers: + headers: Headers = self.headers_by_tabname.get(tabname) if headers is None: - self.headers_by_sheetname[sheetname] = headers = self.reader_agent.__next__() + self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() return headers - def _process_row(self, sheetname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: - ignored(sheetname) + def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tabname) return {headers[i]: self.parse_cell_value(row_datum) for i, row_datum in enumerate(row_data)} @@ -396,11 +402,11 @@ class ItemCsvManager(ItemManagerMixin, CsvManager): class ItemManager(AbstractTableSetManager): @classmethod - def create_implementation_manager(cls, filename: str) -> BasicTableSetManager: + def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: if filename.endswith(".xlsx"): - reader_agent = ItemXlsxManager(filename) + reader_agent = ItemXlsxManager(filename, **kwargs) elif filename.endswith(".csv"): - reader_agent = ItemCsvManager(filename) + reader_agent = ItemCsvManager(filename, **kwargs) else: raise ValueError(f"Unknown file type: {filename}") return reader_agent diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index df1ed522c..4a32e928f 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -158,9 +158,9 @@ def test_item_tools_set_path_value(): SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv') -SAMPLE_CSV_FILE_RAW_CONTENT = {CsvManager.DEFAULT_SHEET_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} +SAMPLE_CSV_FILE_RAW_CONTENT = {CsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} -SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_SHEET_NAME: 
SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} +SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} def test_xlsx_manager_load_content(): From ba8c55c922bdf967d18b9201b7fbebb3bfeb5f7b Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 12:10:23 -0400 Subject: [PATCH 04/15] Add some doc strings. Rename load_table_set to just load. Arrange for ItemManager.load to take a tab_name argument so that CSV files can perhaps infer a type name. --- dcicutils/sheet_utils.py | 91 +++++++++++++++++++++++++--------------- test/test_sheet_utils.py | 22 +++++----- 2 files changed, 68 insertions(+), 45 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 5608e61ea..fc2e4752a 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -160,7 +160,7 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any class AbstractTableSetManager: """ The TableSetManager is the spanning class of anything that wants to be able to load a table set, - regardless of what it wants to load it from. To do this, it must support a load_table_set method + regardless of what it wants to load it from. To do this, it must support a load method that takes a filename and returns the file content in the form: { "Sheet1": [ @@ -180,13 +180,12 @@ class AbstractTableSetManager: happen is not constrained by this class. """ - @classmethod def __init__(self, **kwargs): if kwargs: - raise ValueError(f"Unexpectd keyword arguments initializing {self.__class__.__name__}: {kwargs}") + raise ValueError(f"Got unexpected keywords: {kwargs}") @classmethod - def load_table_set(cls, filename: str) -> Dict[str, List[AnyJsonData]]: + def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]: """ Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. For more information, see documentation of AbstractTableSetManager. 
@@ -202,16 +201,6 @@ class BasicTableSetManager(AbstractTableSetManager): of this where there's only one set of headers and only one block of content. """ - def _create_sheet_processor_state(self, tabname: str) -> Any: - """ - This method provides for the possibility that some parsers will want auxiliary state, - (such as parsed headers or a line count or a table of temporary names for objects to cross-link - or some other such feature) that it carries with it as it moves from line to line parsing things. - Subclasses might therefore want to make this do something more interesting. - """ - ignored(tabname) # subclasses might need this, but we don't - return None - def __init__(self, filename: str, **kwargs): super().__init__(**kwargs) self.filename: str = filename @@ -219,12 +208,22 @@ def __init__(self, filename: str, **kwargs): self.content_by_tabname: Dict[str, List[AnyJsonData]] = {} self.reader_agent: Any = self._get_reader_agent() - def sheet_headers(self, tabname: str) -> List[str]: + def tab_headers(self, tabname: str) -> List[str]: return self.headers_by_tabname[tabname] - def sheet_content(self, tabname: str) -> List[AnyJsonData]: + def tab_content(self, tabname: str) -> List[AnyJsonData]: return self.content_by_tabname[tabname] + def _create_tab_processor_state(self, tabname: str) -> Any: + """ + This method provides for the possibility that some parsers will want auxiliary state, + (such as parsed headers or a line count or a table of temporary names for objects to cross-link + or some other such feature) that it carries with it as it moves from line to line parsing things. + Subclasses might therefore want to make this do something more interesting. 
+ """ + ignored(tabname) # subclasses might need this, but we don't + return None + def _get_reader_agent(self) -> Any: """This function is responsible for opening the workbook and returning a workbook object.""" raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") @@ -236,12 +235,12 @@ def load_content(self) -> Any: class TableSetManager(BasicTableSetManager): @classmethod - def load_table_set(cls, filename: str) -> AnyJsonData: + def load(cls, filename: str) -> AnyJsonData: table_set_manager: TableSetManager = cls(filename) return table_set_manager.load_content() - def __init__(self, filename: str, **kwargs): - super().__init__(filename=filename, **kwargs) + def __init__(self, filename: str): + super().__init__(filename=filename) @property def tabnames(self) -> List[str]: @@ -250,7 +249,6 @@ def tabnames(self) -> List[str]: def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: """ Given a tabname and a state (returned by _sheet_loader_state), return a generator for a set of row values. - What constitutes a row is just something that _sheet_col_enumerator will be happy receiving. """ raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.") @@ -258,14 +256,14 @@ def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> A """ This needs to take a state and whatever represents a row and must return a list of objects representing column values. - What constitutes a row is just something that _sheet_col_enumerator will be happy receiving. + What constitutes a processed up to the class, but other than that the result must be a JSON dictionary. """ raise NotImplementedError(f"._process_row(...) 
is not implemented for {self.__class__.__name__}.") def load_content(self) -> AnyJsonData: for tabname in self.tabnames: sheet_content = [] - state = self._create_sheet_processor_state(tabname) + state = self._create_tab_processor_state(tabname) for row_data in self._raw_row_generator_for_tabname(tabname): processed_row_data: AnyJsonData = self._process_row(tabname, state, row_data) sheet_content.append(processed_row_data) @@ -278,6 +276,9 @@ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: class XlsxManager(TableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheets in an XLSX file. + """ @classmethod def _all_rows(cls, sheet: Worksheet): @@ -307,7 +308,7 @@ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: return [sheet.cell(row=row, column=col).value for col in self._all_cols(sheet)] - def _create_sheet_processor_state(self, tabname: str) -> Headers: + def _create_tab_processor_state(self, tabname: str) -> Headers: sheet = self.reader_agent[tabname] headers: List[str] = [str(sheet.cell(row=1, column=col).value) for col in self._all_cols(sheet)] @@ -321,6 +322,10 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An class ItemManagerMixin(BasicTableSetManager): + """ + This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows + get handled like Items instead of just flat table rows. 
+ """ def __init__(self, filename: str, **kwargs): super().__init__(filename=filename, **kwargs) @@ -333,8 +338,10 @@ def sheet_patch_prototype(self, tabname: str) -> Dict: def sheet_parsed_headers(self, tabname: str) -> List[List[Union[int, str]]]: return self.parsed_headers_by_tabname[tabname] - def _create_sheet_processor_state(self, tabname: str) -> ParsedHeaders: - super()._create_sheet_processor_state(tabname) + def _create_tab_processor_state(self, tabname: str) -> ParsedHeaders: + super()._create_tab_processor_state(tabname) + # This will create state that allows us to efficiently assign values in the right place on each row + # by setting up a prototype we can copy and then drop values into. self._compile_sheet_headers(tabname) return self.sheet_parsed_headers(tabname) @@ -358,16 +365,23 @@ def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: class ItemXlsxManager(ItemManagerMixin, XlsxManager): + """ + This layers item-style row processing functionality on an XLSX file. + """ pass class CsvManager(TableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheet in a csv file, + returning a result that still looks like there could have been multiple tabs. 
+ """ DEFAULT_TAB_NAME = 'Sheet1' - def __init__(self, filename: str, sheet_name: str = None, **kwargs): - super().__init__(filename=filename, **kwargs) - self.tab_name = sheet_name or self.DEFAULT_TAB_NAME + def __init__(self, filename: str, tab_name=None): + super().__init__(filename=filename) + self.tab_name = tab_name or self.DEFAULT_TAB_NAME @property def tabnames(self) -> List[str]: @@ -383,7 +397,7 @@ def _get_csv_reader(cls, filename) -> CsvReader: def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: return self.reader_agent - def _create_sheet_processor_state(self, tabname: str) -> Headers: + def _create_tab_processor_state(self, tabname: str) -> Headers: headers: Headers = self.headers_by_tabname.get(tabname) if headers is None: self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() @@ -396,22 +410,31 @@ def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> An class ItemCsvManager(ItemManagerMixin, CsvManager): + """ + This layers item-style row processing functionality on a CSV file. + """ pass class ItemManager(AbstractTableSetManager): + """ + This class will open a .xlsx or .csv file and load its content in our standard format. + (See more detailed description in AbstractTableManager.) 
+ """ @classmethod - def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: + def create_implementation_manager(cls, filename: str, tab_name=None) -> BasicTableSetManager: if filename.endswith(".xlsx"): - reader_agent = ItemXlsxManager(filename, **kwargs) + if tab_name is not None: + raise ValueError(f".xlsx files don't need tab_name={tab_name!r}") + reader_agent = ItemXlsxManager(filename) elif filename.endswith(".csv"): - reader_agent = ItemCsvManager(filename, **kwargs) + reader_agent = ItemCsvManager(filename, tab_name=tab_name) else: raise ValueError(f"Unknown file type: {filename}") return reader_agent @classmethod - def load_table_set(cls, filename: str) -> AnyJsonData: - manager = cls.create_implementation_manager(filename) + def load(cls, filename: str, tab_name=None) -> AnyJsonData: + manager = cls.create_implementation_manager(filename, tab_name=tab_name) return manager.load_content() diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 4a32e928f..c2809a9f4 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -171,13 +171,13 @@ def test_xlsx_manager_load_content(): def test_xlsx_manager_load_workbook(): - assert XlsxManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT + assert XlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT def test_xlsx_manager_load_csv(): with pytest.raises(Exception): - XlsxManager.load_table_set(SAMPLE_CSV_FILE) + XlsxManager.load(SAMPLE_CSV_FILE) def test_item_xlsx_manager_load_content(): @@ -188,13 +188,13 @@ def test_item_xlsx_manager_load_content(): def test_item_xlsx_manager_load_workbook(): - assert ItemXlsxManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert ItemXlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT def test_item_xlsx_manager_load_csv(): with pytest.raises(Exception): - ItemXlsxManager.load_table_set(SAMPLE_CSV_FILE) + ItemXlsxManager.load(SAMPLE_CSV_FILE) def 
test_csv_manager_load_content(): @@ -205,13 +205,13 @@ def test_csv_manager_load_content(): def test_csv_manager_load_workbook(): - assert CsvManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT + assert CsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT def test_csv_manager_load_csv(): with pytest.raises(Exception): - CsvManager.load_table_set(SAMPLE_XLSX_FILE) + CsvManager.load(SAMPLE_XLSX_FILE) def test_item_csv_manager_load_content(): @@ -222,20 +222,20 @@ def test_item_csv_manager_load_content(): def test_item_csv_manager_load_workbook(): - assert ItemCsvManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert ItemCsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT def test_item_csv_manager_load_csv(): with pytest.raises(Exception): - ItemCsvManager.load_table_set(SAMPLE_XLSX_FILE) + ItemCsvManager.load(SAMPLE_XLSX_FILE) def test_item_manager_load_workbook(): - assert ItemManager.load_table_set(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + assert ItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT - assert ItemManager.load_table_set(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + assert ItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT with pytest.raises(ValueError): - ItemManager.load_table_set("something.else") + ItemManager.load("something.else") From 50488cb7411d29b72fa8109f6de93d1841861541 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 12:24:46 -0400 Subject: [PATCH 05/15] Add load_items function. Fix some test names. Update changelog. --- CHANGELOG.rst | 15 +++++++++++++-- dcicutils/sheet_utils.py | 3 +++ test/test_sheet_utils.py | 24 ++++++++++++++++++------ 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 61f334d68..f07b9a4c3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,9 +12,20 @@ Change Log * New module ``sheet_utils`` for loading workbooks. 
- * class ``WorkbookManager`` for loading raw data + * Important things of interest: - * class ``ItemManager`` for loading item data + * Class ``ItemManager`` for loading Item-style data + from either ``.xlsx`` or ``.csv`` files. + + * Function ``load_items`` that does the same as ``ItemManager.load``. + + * Various low-level implementation classes such as: + + * Classes ``XlsxManager`` and ``CsvManager`` for loading raw data + from ``.xlsx`` and ``.csv`` files, respectively. + + * Classes ``ItemXlsxManager`` and ``ItemCsvManager`` for loading Item-style data + from ``.xlsx`` and ``.csv`` files, respectively. 7.7.2 diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index fc2e4752a..7a6959a47 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -438,3 +438,6 @@ def create_implementation_manager(cls, filename: str, tab_name=None) -> BasicTab def load(cls, filename: str, tab_name=None) -> AnyJsonData: manager = cls.create_implementation_manager(filename, tab_name=tab_name) return manager.load_content() + + +load_items = ItemManager.load diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index c2809a9f4..b98c56fa5 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,7 +1,9 @@ import os import pytest -from dcicutils.sheet_utils import ItemTools, XlsxManager, ItemXlsxManager, CsvManager, ItemCsvManager, ItemManager +from dcicutils.sheet_utils import ( + ItemTools, XlsxManager, ItemXlsxManager, CsvManager, ItemCsvManager, ItemManager, load_items, +) from .conftest_settings import TEST_DIR @@ -169,7 +171,7 @@ def test_xlsx_manager_load_content(): assert wt.load_content() == SAMPLE_XLSX_FILE_RAW_CONTENT -def test_xlsx_manager_load_workbook(): +def test_xlsx_manager_load(): assert XlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT @@ -186,7 +188,7 @@ def test_item_xlsx_manager_load_content(): assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT -def 
test_item_xlsx_manager_load_workbook(): +def test_item_xlsx_manager_load(): assert ItemXlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT @@ -203,7 +205,7 @@ def test_csv_manager_load_content(): assert wt.load_content() == SAMPLE_CSV_FILE_RAW_CONTENT -def test_csv_manager_load_workbook(): +def test_csv_manager_load(): assert CsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT @@ -220,7 +222,7 @@ def test_item_csv_manager_load_content(): assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT -def test_item_csv_manager_load_workbook(): +def test_item_csv_manager_load(): assert ItemCsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT @@ -231,7 +233,7 @@ def test_item_csv_manager_load_csv(): ItemCsvManager.load(SAMPLE_XLSX_FILE) -def test_item_manager_load_workbook(): +def test_item_manager_load(): assert ItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT @@ -239,3 +241,13 @@ def test_item_manager_load_workbook(): with pytest.raises(ValueError): ItemManager.load("something.else") + + +def test_load_items(): + + assert load_items(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + assert load_items(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + + with pytest.raises(ValueError): + load_items("something.else") From 807e525965b11f6506b944b7775b84bd2e640082 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 12:48:41 -0400 Subject: [PATCH 06/15] Experimental bug fix from Will to hopefully make get_schema_names work. 
--- dcicutils/ff_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index e50ececf8..37a0439db 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -961,7 +961,7 @@ def get_schema_names(key=None, ff_env=None): if value.get('isAbstract') is True: continue # some test schemas in local don't have the id field - schema_filename = value.get('id') + schema_filename = value.get('$id') if schema_filename: schema_name[key] = schema_filename.split('/')[-1][:-5] return schema_name From 2a8e81a420ee629ad0e20d25e0d98282d0d27cab Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 13:02:39 -0400 Subject: [PATCH 07/15] update changelog --- CHANGELOG.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f07b9a4c3..0eda8ff0c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -27,6 +27,8 @@ Change Log * Classes ``ItemXlsxManager`` and ``ItemCsvManager`` for loading Item-style data from ``.xlsx`` and ``.csv`` files, respectively. +* Contains a fix for a bug in ``ff_utils.get_schema_names`` (`C4-1086 `_). + 7.7.2 ===== From 718054adeef8c77b737a0ae3895c61db0fccce0f Mon Sep 17 00:00:00 2001 From: Kent M Pitman Date: Thu, 17 Aug 2023 17:23:51 -0400 Subject: [PATCH 08/15] Update dcicutils/sheet_utils.py Co-authored-by: drio18 <58236592+drio18@users.noreply.github.com> --- dcicutils/sheet_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 7a6959a47..4b3dae21c 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -172,7 +172,7 @@ class AbstractTableSetManager: } Note that at this level of abstraction, we take no position on what form of representation is used for the rows, as long as it is JSON data of some kind. 
It might be - {"col1": "val1", "col2", "val2", ...} + {"col1": "val1", "col2": "val2", ...} or it might be something more structured like {"something": "val1", {"something_else": ["val2"]}} Additionally, the values stored might be altered as well. In particular, the most likely alteration From 56d1459c735c22579786c45312fa2e0ccd33466c Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 17:56:56 -0400 Subject: [PATCH 09/15] Add some comments in response to Doug's code review. --- dcicutils/sheet_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 4b3dae21c..4060e9f0d 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -157,6 +157,11 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any cls.set_path_value(datum[key], more_path, value) +# TODO: Consider whether this might want to be an abstract base class. Some change might be needed. +# Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class. +# I am less certain but open to discussion. Among other things, as implemented now, +# the __init__ method here needs to run and the documentation says that ABC's won't appear +# in the method resolution order. -kmp 17-Aug-2023 class AbstractTableSetManager: """ The TableSetManager is the spanning class of anything that wants to be able to load a table set, @@ -184,6 +189,7 @@ def __init__(self, **kwargs): if kwargs: raise ValueError(f"Got unexpected keywords: {kwargs}") + # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.) @classmethod def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]: """ From 2facf9ef9ff31fdf1a712815641e52d9c26fc7a4 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 18:16:54 -0400 Subject: [PATCH 10/15] Support TSV files. 
--- dcicutils/sheet_utils.py | 22 +++++++++++++++++++ test/test_sheet_utils.py | 46 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 4060e9f0d..072e36e21 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -158,10 +158,12 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any # TODO: Consider whether this might want to be an abstract base class. Some change might be needed. +# # Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class. # I am less certain but open to discussion. Among other things, as implemented now, # the __init__ method here needs to run and the documentation says that ABC's won't appear # in the method resolution order. -kmp 17-Aug-2023 +# See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535 class AbstractTableSetManager: """ The TableSetManager is the spanning class of anything that wants to be able to load a table set, @@ -407,6 +409,7 @@ def _create_tab_processor_state(self, tabname: str) -> Headers: headers: Headers = self.headers_by_tabname.get(tabname) if headers is None: self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() + print(f"Headers={headers}") return headers def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: @@ -422,6 +425,23 @@ class ItemCsvManager(ItemManagerMixin, CsvManager): pass +class TsvManager(CsvManager): + """ + TSV files are just CSV files with tabs instead of commas as separators. + (We do not presently handle any escaping of strange characters. May need to add handling for backslash escaping.) 
+ """ + @classmethod + def _get_csv_reader(cls, filename) -> CsvReader: + return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') + + +class ItemTsvManager(ItemManagerMixin, TsvManager): + """ + This layers item-style row processing functionality on a TSV file. + """ + pass + + class ItemManager(AbstractTableSetManager): """ This class will open a .xlsx or .csv file and load its content in our standard format. @@ -436,6 +456,8 @@ def create_implementation_manager(cls, filename: str, tab_name=None) -> BasicTab reader_agent = ItemXlsxManager(filename) elif filename.endswith(".csv"): reader_agent = ItemCsvManager(filename, tab_name=tab_name) + elif filename.endswith(".tsv"): + reader_agent = ItemTsvManager(filename, tab_name=tab_name) else: raise ValueError(f"Unknown file type: {filename}") return reader_agent diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index b98c56fa5..1915b3a71 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -2,7 +2,11 @@ import pytest from dcicutils.sheet_utils import ( - ItemTools, XlsxManager, ItemXlsxManager, CsvManager, ItemCsvManager, ItemManager, load_items, + # High-level interfaces + ItemManager, load_items, + # Low-level implementation + ItemTools, XlsxManager, ItemXlsxManager, + CsvManager, ItemCsvManager, TsvManager, ItemTsvManager, ) from .conftest_settings import TEST_DIR @@ -164,6 +168,12 @@ def test_item_tools_set_path_value(): SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} +SAMPLE_TSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.tsv') + +SAMPLE_TSV_FILE_RAW_CONTENT = {TsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} + +SAMPLE_TSV_FILE_ITEM_CONTENT = {ItemTsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} + def test_xlsx_manager_load_content(): @@ -233,6 +243,40 @@ def test_item_csv_manager_load_csv(): ItemCsvManager.load(SAMPLE_XLSX_FILE) 
+def test_tsv_manager_load_content(): + + wt = TsvManager(SAMPLE_TSV_FILE) + assert wt.load_content() == SAMPLE_TSV_FILE_RAW_CONTENT + + +def test_tsv_manager_load(): + + assert TsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_RAW_CONTENT + + +def test_tsv_manager_load_csv(): + + with pytest.raises(Exception): + TsvManager.load(SAMPLE_XLSX_FILE) + + +def test_item_tsv_manager_load_content(): + + it = ItemTsvManager(SAMPLE_TSV_FILE) + assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT + + +def test_item_tsv_manager_load(): + + assert ItemTsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + + +def test_item_tsv_manager_load_csv(): + + with pytest.raises(Exception): + ItemTsvManager.load(SAMPLE_XLSX_FILE) + + def test_item_manager_load(): assert ItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT From bcc4e636c3bea64a7cc792a949f2c4a5897066b9 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 18:19:01 -0400 Subject: [PATCH 11/15] Add changelog info about tsv files. --- CHANGELOG.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b57796c8a..2796e8def 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -15,17 +15,17 @@ Change Log * Important things of interest: * Class ``ItemManager`` for loading Item-style data - from either ``.xlsx`` or ``.csv`` files. + from any ``.xlsx``, ``.csv`` or ``.tsv`` files. * Function ``load_items`` that does the same as ``ItemManager.load``. * Various low-level implementation classes such as: - * Classes ``XlsxManager`` and ``CsvManager`` for loading raw data - from ``.xlsx`` and ``.csv`` files, respectively. + * Classes ``XlsxManager``, ``CsvManager`` and ``TsvManager`` for loading raw data + from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. - * Classes ``ItemXlsxManager`` and ``ItemCsvManager`` for loading Item-style data - from ``.xlsx`` and ``.csv`` files, respectively. 
+ * Classes ``ItemXlsxManager``, ``ItemCsvManager``, and ``ItemTsvManager`` for loading Item-style data + from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. * Contains a fix for a bug in ``ff_utils.get_schema_names`` (`C4-1086 `_). From 9de282e5e5475b66b36b2cabe3260ee521df9077 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Thu, 17 Aug 2023 18:22:11 -0400 Subject: [PATCH 12/15] Add a missing data file. --- test/data_files/sample_items_sheet2.tsv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 test/data_files/sample_items_sheet2.tsv diff --git a/test/data_files/sample_items_sheet2.tsv b/test/data_files/sample_items_sheet2.tsv new file mode 100644 index 000000000..d2c9e0e47 --- /dev/null +++ b/test/data_files/sample_items_sheet2.tsv @@ -0,0 +1,3 @@ +name age mother.name mother.age father.name father.age friends#0.name friends#0.age friends#1.name friends#1.age +bill 23 mary 58 fred 63 sam 22 arthur 19 +joe 9 estrella 35 anthony 34 anders 9 \ No newline at end of file From 60ada3fb1f8c24fec7aa89eae0ee7640fa9b555c Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 23 Aug 2023 22:32:52 -0400 Subject: [PATCH 13/15] Backport some small fixes and cosmetics from the schemas branch. 
--- dcicutils/sheet_utils.py | 50 +++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 072e36e21..f98b5f755 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -5,6 +5,7 @@ import openpyxl from dcicutils.common import AnyJsonData +from dcicutils.lang_utils import conjoined_list, maybe_pluralize from dcicutils.misc_utils import ignored from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook @@ -21,6 +22,17 @@ CsvReader = type(csv.reader(TemporaryFile())) +def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): + if kwargs: + unwanted = [f"{argname}={value!r}" if detailed else argname + for argname, value in kwargs.items() + if value is not None] + if unwanted: + does_not = "don't" if context_plural else "doesn't" + raise ValueError(f"{context} {does_not} use" + f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") + + def prefer_number(value: SheetCellValue): if isinstance(value, str): # the given value might be an int or float, in which case just fall through if not value: @@ -140,7 +152,12 @@ def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData: elif lvalue == 'null' or lvalue == '': return None elif '|' in value: - return [cls.parse_item_value(subvalue) for subvalue in value.split('|')] + if value == '|': # Use '|' for [] + return [] + else: + if value.endswith("|"): # Use 'foo|' for ['foo'] + value = value[:-1] + return [cls.parse_item_value(subvalue) for subvalue in value.split('|')] else: return prefer_number(value) else: # presumably a number (int or float) @@ -188,8 +205,7 @@ class AbstractTableSetManager: """ def __init__(self, **kwargs): - if kwargs: - raise ValueError(f"Got unexpected keywords: {kwargs}") + unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs) # TODO: Consider whether this should be an abstractmethod 
(but first see detailed design note at top of class.) @classmethod @@ -247,8 +263,8 @@ def load(cls, filename: str) -> AnyJsonData: table_set_manager: TableSetManager = cls(filename) return table_set_manager.load_content() - def __init__(self, filename: str): - super().__init__(filename=filename) + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) @property def tabnames(self) -> List[str]: @@ -338,12 +354,12 @@ class ItemManagerMixin(BasicTableSetManager): def __init__(self, filename: str, **kwargs): super().__init__(filename=filename, **kwargs) self.patch_prototypes_by_tabname: Dict[str, Dict] = {} - self.parsed_headers_by_tabname: Dict[str, List[List[Union[int, str]]]] = {} + self.parsed_headers_by_tabname: Dict[str, ParsedHeaders] = {} def sheet_patch_prototype(self, tabname: str) -> Dict: return self.patch_prototypes_by_tabname[tabname] - def sheet_parsed_headers(self, tabname: str) -> List[List[Union[int, str]]]: + def sheet_parsed_headers(self, tabname: str) -> ParsedHeaders: return self.parsed_headers_by_tabname[tabname] def _create_tab_processor_state(self, tabname: str) -> ParsedHeaders: @@ -387,8 +403,8 @@ class CsvManager(TableSetManager): DEFAULT_TAB_NAME = 'Sheet1' - def __init__(self, filename: str, tab_name=None): - super().__init__(filename=filename) + def __init__(self, filename: str, tab_name=None, **kwargs): + super().__init__(filename=filename, **kwargs) self.tab_name = tab_name or self.DEFAULT_TAB_NAME @property @@ -409,7 +425,6 @@ def _create_tab_processor_state(self, tabname: str) -> Headers: headers: Headers = self.headers_by_tabname.get(tabname) if headers is None: self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() - print(f"Headers={headers}") return headers def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: @@ -449,15 +464,18 @@ class ItemManager(AbstractTableSetManager): """ @classmethod - def create_implementation_manager(cls, 
filename: str, tab_name=None) -> BasicTableSetManager: + def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: if filename.endswith(".xlsx"): - if tab_name is not None: - raise ValueError(f".xlsx files don't need tab_name={tab_name!r}") - reader_agent = ItemXlsxManager(filename) + # unwanted_kwargs(context="ItemManager for .xlsx files", kwargs=kwargs) + reader_agent = ItemXlsxManager(filename, **kwargs) elif filename.endswith(".csv"): - reader_agent = ItemCsvManager(filename, tab_name=tab_name) + tab_name = kwargs.pop('tab_name', None) + # unwanted_kwargs(context="ItemManager for .csv files", kwargs=kwargs) + reader_agent = ItemCsvManager(filename, tab_name=tab_name, **kwargs) elif filename.endswith(".tsv"): - reader_agent = ItemTsvManager(filename, tab_name=tab_name) + tab_name = kwargs.pop('tab_name', None) + # unwanted_kwargs(context="ItemManager for .tsv files", kwargs=kwargs) + reader_agent = ItemTsvManager(filename, tab_name=tab_name, **kwargs) else: raise ValueError(f"Unknown file type: {filename}") return reader_agent From 946b9987273f2918357cc1d5c33e5451c10d9b41 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 23 Aug 2023 22:36:06 -0400 Subject: [PATCH 14/15] Add some missing newlines in data files. 
--- test/data_files/sample_items_sheet2.csv | 2 +- test/data_files/sample_items_sheet2.tsv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/data_files/sample_items_sheet2.csv b/test/data_files/sample_items_sheet2.csv index b1d3ec2da..95567c42a 100644 --- a/test/data_files/sample_items_sheet2.csv +++ b/test/data_files/sample_items_sheet2.csv @@ -1,3 +1,3 @@ name,age,mother.name,mother.age,father.name,father.age,friends#0.name,friends#0.age,friends#1.name,friends#1.age bill,23,mary,58,fred,63,sam,22,arthur,19 -joe,9,estrella,35,anthony,34,anders,9,, \ No newline at end of file +joe,9,estrella,35,anthony,34,anders,9,, diff --git a/test/data_files/sample_items_sheet2.tsv b/test/data_files/sample_items_sheet2.tsv index d2c9e0e47..e862bf36d 100644 --- a/test/data_files/sample_items_sheet2.tsv +++ b/test/data_files/sample_items_sheet2.tsv @@ -1,3 +1,3 @@ name age mother.name mother.age father.name father.age friends#0.name friends#0.age friends#1.name friends#1.age bill 23 mary 58 fred 63 sam 22 arthur 19 -joe 9 estrella 35 anthony 34 anders 9 \ No newline at end of file +joe 9 estrella 35 anthony 34 anders 9 From 36e7de064bc3988f8da946f2388de59505d3f33e Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 23 Aug 2023 23:57:55 -0400 Subject: [PATCH 15/15] Support for coping with .tsv files where trailing whitespace is 'helpfully' removed by an editor that doesn't understand such whitespace might be significant in TSVs. 
--- dcicutils/sheet_utils.py | 22 +++++++++++++------ .../sample_items_sheet_2.tsv.README.text | 4 ++++ 2 files changed, 19 insertions(+), 7 deletions(-) create mode 100644 test/data_files/sample_items_sheet_2.tsv.README.text diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index f98b5f755..fbd51194f 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -10,7 +10,7 @@ from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook from tempfile import TemporaryFile -from typing import Any, Dict, Iterable, List, Union +from typing import Any, Dict, Iterable, List, Optional, Union Header = str @@ -228,11 +228,11 @@ class BasicTableSetManager(AbstractTableSetManager): def __init__(self, filename: str, **kwargs): super().__init__(**kwargs) self.filename: str = filename - self.headers_by_tabname: Dict[str, List[str]] = {} + self.headers_by_tabname: Dict[str, Headers] = {} self.content_by_tabname: Dict[str, List[AnyJsonData]] = {} self.reader_agent: Any = self._get_reader_agent() - def tab_headers(self, tabname: str) -> List[str]: + def tab_headers(self, tabname: str) -> Headers: return self.headers_by_tabname[tabname] def tab_content(self, tabname: str) -> List[AnyJsonData]: @@ -334,8 +334,8 @@ def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: def _create_tab_processor_state(self, tabname: str) -> Headers: sheet = self.reader_agent[tabname] - headers: List[str] = [str(sheet.cell(row=1, column=col).value) - for col in self._all_cols(sheet)] + headers: Headers = [str(sheet.cell(row=1, column=col).value) + for col in self._all_cols(sheet)] self.headers_by_tabname[sheet.title] = headers return headers @@ -418,11 +418,19 @@ def _get_reader_agent(self) -> CsvReader: def _get_csv_reader(cls, filename) -> CsvReader: return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) + PAD_TRAILING_TABS = True + def _raw_row_generator_for_tabname(self, tabname: str) -> 
Iterable[SheetRow]: - return self.reader_agent + headers = self.tab_headers(tabname) + n_headers = len(headers) + for row_data in self.reader_agent: + n_cols = len(row_data) + if self.PAD_TRAILING_TABS and n_cols < n_headers: + row_data = row_data + [''] * (n_headers - n_cols) + yield row_data def _create_tab_processor_state(self, tabname: str) -> Headers: - headers: Headers = self.headers_by_tabname.get(tabname) + headers: Optional[Headers] = self.headers_by_tabname.get(tabname) if headers is None: self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() return headers diff --git a/test/data_files/sample_items_sheet_2.tsv.README.text b/test/data_files/sample_items_sheet_2.tsv.README.text new file mode 100644 index 000000000..efefaf654 --- /dev/null +++ b/test/data_files/sample_items_sheet_2.tsv.README.text @@ -0,0 +1,4 @@ +Note that one of the lines in file sample_items_sheet2.tsv has two blank fields at end of line. +PyCharm and perhaps other editors "helpfully" remove trailing whitespace from lines, +so the number of columns varies line-to-line. Instead of insisting on explicit tabs at end of line, +we pad such short lines with empty fields when reading from the file.