diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 35ab4eac2..dd292f4d5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,9 +12,22 @@ Change Log * New module ``sheet_utils`` for loading workbooks. - * class ``WorkbookManager`` for loading raw data + * Important things of interest: - * class ``ItemManager`` for loading item data + * Class ``ItemManager`` for loading Item-style data + from any ``.xlsx``, ``.csv`` or ``.tsv`` files. + + * Function ``load_items`` that does the same as ``ItemManager.load``. + + * Various low-level implementation classes such as: + + * Classes ``XlsxManager``, ``CsvManager`` and ``TsvManager`` for loading raw data + from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. + + * Classes ``ItemXlsxManager``, ``ItemCsvManager``, and ``ItemTsvManager`` for loading Item-style data + from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively. + +* Contains a fix for a bug in ``ff_utils.get_schema_names`` (`C4-1086 `_). 7.9.0 diff --git a/dcicutils/sheet_utils.py b/dcicutils/sheet_utils.py index 8125f27d3..fbd51194f 100644 --- a/dcicutils/sheet_utils.py +++ b/dcicutils/sheet_utils.py @@ -1,10 +1,16 @@ +import chardet import copy +import csv +import io +import openpyxl from dcicutils.common import AnyJsonData -from openpyxl import load_workbook +from dcicutils.lang_utils import conjoined_list, maybe_pluralize +from dcicutils.misc_utils import ignored from openpyxl.worksheet.worksheet import Worksheet from openpyxl.workbook.workbook import Workbook -from typing import Any, Dict, List, Optional, Union +from tempfile import TemporaryFile +from typing import Any, Dict, Iterable, List, Optional, Union Header = str @@ -12,6 +18,51 @@ ParsedHeader = List[Union[str, int]] ParsedHeaders = List[ParsedHeader] SheetCellValue = Union[int, float, str] +SheetRow = List[SheetCellValue] +CsvReader = type(csv.reader(TemporaryFile())) + + +def unwanted_kwargs(*, context, kwargs, context_plural=False, detailed=False): + if kwargs: + unwanted = 
[f"{argname}={value!r}" if detailed else argname + for argname, value in kwargs.items() + if value is not None] + if unwanted: + does_not = "don't" if context_plural else "doesn't" + raise ValueError(f"{context} {does_not} use" + f" {maybe_pluralize(unwanted, 'keyword argument')} {conjoined_list(unwanted)}.") + + +def prefer_number(value: SheetCellValue): + if isinstance(value, str): # the given value might be an int or float, in which case just fall through + if not value: + return None + value = value + ch0 = value[0] + if ch0 == '+' or ch0 == '-' or ch0.isdigit(): + try: + return int(value) + except Exception: + pass + try: + return float(value) + except Exception: + pass + # If we couldn't parse it as an int or float, fall through to returning the original value + pass + return value + + +def open_text_input_file_respecting_byte_order_mark(filename): + """ + Opens a file for text input, respecting a byte-order mark (BOM). + """ + with io.open(filename, 'rb') as fp: + leading_bytes = fp.read(4 * 8) # 4 bytes is all we need + bom_info = chardet.detect(leading_bytes) + detected_encoding = bom_info and bom_info.get('encoding') # tread lightly + + return io.open(filename, 'r', encoding=detected_encoding) class ItemTools: @@ -90,7 +141,7 @@ def assure_patch_prototype_shape(cls, *, parent: Union[Dict, List], keys: Parsed return parent @classmethod - def parse_value(cls, value: SheetCellValue) -> AnyJsonData: + def parse_item_value(cls, value: SheetCellValue) -> AnyJsonData: if isinstance(value, str): lvalue = value.lower() # TODO: We could consult a schema to make this less heuristic, but this may do for now @@ -101,19 +152,14 @@ def parse_value(cls, value: SheetCellValue) -> AnyJsonData: elif lvalue == 'null' or lvalue == '': return None elif '|' in value: - return [cls.parse_value(subvalue) for subvalue in value.split('|')] + if value == '|': # Use '|' for [] + return [] + else: + if value.endswith("|"): # Use 'foo|' for ['foo'] + value = value[:-1] + return 
[cls.parse_item_value(subvalue) for subvalue in value.split('|')] else: - ch0 = value[0] - if ch0 == '+' or ch0 == '-' or ch0.isdigit(): - try: - return int(value) - except Exception: - pass - try: - return float(value) - except Exception: - pass - return value + return prefer_number(value) else: # presumably a number (int or float) return value @@ -128,24 +174,135 @@ def set_path_value(cls, datum: Union[List, Dict], path: ParsedHeader, value: Any cls.set_path_value(datum[key], more_path, value) -class WorkbookManager: +# TODO: Consider whether this might want to be an abstract base class. Some change might be needed. +# +# Doug thinks we might want (metaclass=ABCMeta) here to make this an abstract base class. +# I am less certain but open to discussion. Among other things, as implemented now, +# the __init__ method here needs to run and the documentation says that ABC's won't appear +# in the method resolution order. -kmp 17-Aug-2023 +# See also discussion at https://github.com/4dn-dcic/utils/pull/276#discussion_r1297775535 +class AbstractTableSetManager: + """ + The TableSetManager is the spanning class of anything that wants to be able to load a table set, + regardless of what it wants to load it from. To do this, it must support a load method + that takes a filename and returns the file content in the form: + { + "Sheet1": [ + {...representation of row1 as some kind of dict...}, + {...representation of row2 as some kind of dict...} + ], + "Sheet2": [...], + ..., + } + Note that at this level of abstraction, we take no position on what form of representation is used + for the rows, as long as it is JSON data of some kind. It might be + {"col1": "val1", "col2": "val2", ...} + or it might be something more structured like + {"something": "val1", {"something_else": ["val2"]}} + Additionally, the values stored might be altered as well. 
In particular, the most likely alteration + is to turn "123" to 123 or "" to None, though the specifics of whether and how such transformations + happen is not constrained by this class. + """ + + def __init__(self, **kwargs): + unwanted_kwargs(context=self.__class__.__name__, kwargs=kwargs) + # TODO: Consider whether this should be an abstractmethod (but first see detailed design note at top of class.) @classmethod - def load_workbook(cls, filename: str): - wb = cls(filename) - return wb.load_content() + def load(cls, filename: str) -> Dict[str, List[AnyJsonData]]: + """ + Reads a filename and returns a dictionary that maps sheet names to rows of dictionary data. + For more information, see documentation of AbstractTableSetManager. + """ + raise NotImplementedError(f".load(...) is not implemented for {cls.__name__}.") + + +class BasicTableSetManager(AbstractTableSetManager): + """ + A BasicTableManager provides some structure that most kinds of parsers will need. + In particular, everything will likely need some way of storing headers and some way of storing content + of each sheet. Even a csv file, which doesn't have multiple tabs can be seen as the degenerate case + of this where there's only one set of headers and only one block of content. 
+ """ - def __init__(self, filename: str): + def __init__(self, filename: str, **kwargs): + super().__init__(**kwargs) self.filename: str = filename - self.workbook: Optional[Workbook] = None - self.headers_by_sheetname: Dict[str, List[str]] = {} - self.content_by_sheetname: Dict[str, List[Any]] = {} + self.headers_by_tabname: Dict[str, Headers] = {} + self.content_by_tabname: Dict[str, List[AnyJsonData]] = {} + self.reader_agent: Any = self._get_reader_agent() - def sheet_headers(self, sheetname: str) -> List[str]: - return self.headers_by_sheetname[sheetname] + def tab_headers(self, tabname: str) -> Headers: + return self.headers_by_tabname[tabname] - def sheet_content(self, sheetname: str) -> List[Any]: - return self.content_by_sheetname[sheetname] + def tab_content(self, tabname: str) -> List[AnyJsonData]: + return self.content_by_tabname[tabname] + + def _create_tab_processor_state(self, tabname: str) -> Any: + """ + This method provides for the possibility that some parsers will want auxiliary state, + (such as parsed headers or a line count or a table of temporary names for objects to cross-link + or some other such feature) that it carries with it as it moves from line to line parsing things. + Subclasses might therefore want to make this do something more interesting. 
+ """ + ignored(tabname) # subclasses might need this, but we don't + return None + + def _get_reader_agent(self) -> Any: + """This function is responsible for opening the workbook and returning a workbook object.""" + raise NotImplementedError(f"._get_reader_agent() is not implemented for {self.__class__.__name__}.") + + def load_content(self) -> Any: + raise NotImplementedError(f".load_content() is not implemented for {self.__class__.__name__}.") + + +class TableSetManager(BasicTableSetManager): + + @classmethod + def load(cls, filename: str) -> AnyJsonData: + table_set_manager: TableSetManager = cls(filename) + return table_set_manager.load_content() + + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) + + @property + def tabnames(self) -> List[str]: + raise NotImplementedError(f".tabnames is not implemented for {self.__class__.__name__}..") + + def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: + """ + Given a tabname and a state (returned by _sheet_loader_state), return a generator for a set of row values. + """ + raise NotImplementedError(f"._rows_for_tabname(...) is not implemented for {self.__class__.__name__}.") + + def _process_row(self, tabname: str, state: Any, row: List[SheetCellValue]) -> AnyJsonData: + """ + This needs to take a state and whatever represents a row and + must return a list of objects representing column values. + What constitutes a processed up to the class, but other than that the result must be a JSON dictionary. + """ + raise NotImplementedError(f"._process_row(...) 
is not implemented for {self.__class__.__name__}.") + + def load_content(self) -> AnyJsonData: + for tabname in self.tabnames: + sheet_content = [] + state = self._create_tab_processor_state(tabname) + for row_data in self._raw_row_generator_for_tabname(tabname): + processed_row_data: AnyJsonData = self._process_row(tabname, state, row_data) + sheet_content.append(processed_row_data) + self.content_by_tabname[tabname] = sheet_content + return self.content_by_tabname + + @classmethod + def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: + return prefer_number(value) + + +class XlsxManager(TableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheets in an XLSX file. + """ @classmethod def _all_rows(cls, sheet: Worksheet): @@ -159,60 +316,182 @@ def _all_cols(cls, sheet: Worksheet): for col in range(1, col_max + 1): yield col - def _load_headers(self, sheet: Worksheet): - headers: List[str] = [str(sheet.cell(row=1, column=col).value) - for col in self._all_cols(sheet)] - self.headers_by_sheetname[sheet.title] = headers - - def _load_row(self, *, sheet: Worksheet, row: int): - headers = self.sheet_headers(sheet.title) - row_dict: Dict[str, Any] = {headers[col-1]: sheet.cell(row=row, column=col).value - for col in self._all_cols(sheet)} - return row_dict - - def load_content(self): - workbook: Workbook = load_workbook(self.filename) - self.workbook = workbook - for sheetname in workbook.sheetnames: - sheet: Worksheet = workbook[sheetname] - self._load_headers(sheet) - content = [] - for row in self._all_rows(sheet): - row_dict = self._load_row(sheet=sheet, row=row) - content.append(row_dict) - self.content_by_sheetname[sheetname] = content - return self.content_by_sheetname - - -class ItemManager(ItemTools, WorkbookManager): - - def __init__(self, filename: str): - super().__init__(filename=filename) - self.patch_prototypes_by_sheetname: Dict[str, Dict] = {} - self.parsed_headers_by_sheetname: Dict[str, 
List[List[Union[int, str]]]] = {} - - def sheet_patch_prototype(self, sheetname: str) -> Dict: - return self.patch_prototypes_by_sheetname[sheetname] - - def sheet_parsed_headers(self, sheetname: str) -> List[List[Union[int, str]]]: - return self.parsed_headers_by_sheetname[sheetname] - - def _load_headers(self, sheet: Worksheet): - super()._load_headers(sheet) - self._compile_sheet_headers(sheet.title) - - def _compile_sheet_headers(self, sheetname: str): - headers = self.headers_by_sheetname[sheetname] - parsed_headers = self.parse_sheet_headers(headers) - self.parsed_headers_by_sheetname[sheetname] = parsed_headers - prototype = self.compute_patch_prototype(parsed_headers) - self.patch_prototypes_by_sheetname[sheetname] = prototype - - def _load_row(self, *, sheet: Worksheet, row: int): - parsed_headers = self.sheet_parsed_headers(sheet.title) - patch_item = copy.deepcopy(self.sheet_patch_prototype(sheet.title)) - for col in self._all_cols(sheet): - value = sheet.cell(row=row, column=col).value - parsed_value = self.parse_value(value) - self.set_path_value(patch_item, parsed_headers[col - 1], parsed_value) + @property + def tabnames(self) -> List[str]: + return self.reader_agent.sheetnames + + def _get_reader_agent(self) -> Workbook: + return openpyxl.load_workbook(self.filename) + + def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: + sheet = self.reader_agent[tabname] + return (self._get_raw_row_content_tuple(sheet, row) + for row in self._all_rows(sheet)) + + def _get_raw_row_content_tuple(self, sheet: Worksheet, row: int) -> SheetRow: + return [sheet.cell(row=row, column=col).value + for col in self._all_cols(sheet)] + + def _create_tab_processor_state(self, tabname: str) -> Headers: + sheet = self.reader_agent[tabname] + headers: Headers = [str(sheet.cell(row=1, column=col).value) + for col in self._all_cols(sheet)] + self.headers_by_tabname[sheet.title] = headers + return headers + + def _process_row(self, tabname: str, headers: 
Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tabname) + return {headers[i]: self.parse_cell_value(row_datum) + for i, row_datum in enumerate(row_data)} + + +class ItemManagerMixin(BasicTableSetManager): + """ + This can add functionality to a reader such as an XlsxManager or a CsvManager in order to make its rows + get handled like Items instead of just flat table rows. + """ + + def __init__(self, filename: str, **kwargs): + super().__init__(filename=filename, **kwargs) + self.patch_prototypes_by_tabname: Dict[str, Dict] = {} + self.parsed_headers_by_tabname: Dict[str, ParsedHeaders] = {} + + def sheet_patch_prototype(self, tabname: str) -> Dict: + return self.patch_prototypes_by_tabname[tabname] + + def sheet_parsed_headers(self, tabname: str) -> ParsedHeaders: + return self.parsed_headers_by_tabname[tabname] + + def _create_tab_processor_state(self, tabname: str) -> ParsedHeaders: + super()._create_tab_processor_state(tabname) + # This will create state that allows us to efficiently assign values in the right place on each row + # by setting up a prototype we can copy and then drop values into. 
+ self._compile_sheet_headers(tabname) + return self.sheet_parsed_headers(tabname) + + def _compile_sheet_headers(self, tabname: str): + headers = self.headers_by_tabname[tabname] + parsed_headers = ItemTools.parse_sheet_headers(headers) + self.parsed_headers_by_tabname[tabname] = parsed_headers + prototype = ItemTools.compute_patch_prototype(parsed_headers) + self.patch_prototypes_by_tabname[tabname] = prototype + + def _process_row(self, tabname: str, parsed_headers: ParsedHeaders, row_data: SheetRow) -> AnyJsonData: + patch_item = copy.deepcopy(self.sheet_patch_prototype(tabname)) + for i, value in enumerate(row_data): + parsed_value = self.parse_cell_value(value) + ItemTools.set_path_value(patch_item, parsed_headers[i], parsed_value) return patch_item + + @classmethod + def parse_cell_value(cls, value: SheetCellValue) -> AnyJsonData: + return ItemTools.parse_item_value(value) + + +class ItemXlsxManager(ItemManagerMixin, XlsxManager): + """ + This layers item-style row processing functionality on an XLSX file. + """ + pass + + +class CsvManager(TableSetManager): + """ + This implements the mechanism to get a series of rows out of the sheet in a csv file, + returning a result that still looks like there could have been multiple tabs. 
+ """ + + DEFAULT_TAB_NAME = 'Sheet1' + + def __init__(self, filename: str, tab_name=None, **kwargs): + super().__init__(filename=filename, **kwargs) + self.tab_name = tab_name or self.DEFAULT_TAB_NAME + + @property + def tabnames(self) -> List[str]: + return [self.tab_name] + + def _get_reader_agent(self) -> CsvReader: + return self._get_csv_reader(self.filename) + + @classmethod + def _get_csv_reader(cls, filename) -> CsvReader: + return csv.reader(open_text_input_file_respecting_byte_order_mark(filename)) + + PAD_TRAILING_TABS = True + + def _raw_row_generator_for_tabname(self, tabname: str) -> Iterable[SheetRow]: + headers = self.tab_headers(tabname) + n_headers = len(headers) + for row_data in self.reader_agent: + n_cols = len(row_data) + if self.PAD_TRAILING_TABS and n_cols < n_headers: + row_data = row_data + [''] * (n_headers - n_cols) + yield row_data + + def _create_tab_processor_state(self, tabname: str) -> Headers: + headers: Optional[Headers] = self.headers_by_tabname.get(tabname) + if headers is None: + self.headers_by_tabname[tabname] = headers = self.reader_agent.__next__() + return headers + + def _process_row(self, tabname: str, headers: Headers, row_data: SheetRow) -> AnyJsonData: + ignored(tabname) + return {headers[i]: self.parse_cell_value(row_datum) + for i, row_datum in enumerate(row_data)} + + +class ItemCsvManager(ItemManagerMixin, CsvManager): + """ + This layers item-style row processing functionality on a CSV file. + """ + pass + + +class TsvManager(CsvManager): + """ + TSV files are just CSV files with tabs instead of commas as separators. + (We do not presently handle any escaping of strange characters. May need to add handling for backslash escaping.) 
+ """ + @classmethod + def _get_csv_reader(cls, filename) -> CsvReader: + return csv.reader(open_text_input_file_respecting_byte_order_mark(filename), delimiter='\t') + + +class ItemTsvManager(ItemManagerMixin, TsvManager): + """ + This layers item-style row processing functionality on a TSV file. + """ + pass + + +class ItemManager(AbstractTableSetManager): + """ + This class will open a .xlsx or .csv file and load its content in our standard format. + (See more detailed description in AbstractTableManager.) + """ + + @classmethod + def create_implementation_manager(cls, filename: str, **kwargs) -> BasicTableSetManager: + if filename.endswith(".xlsx"): + # unwanted_kwargs(context="ItemManager for .xlsx files", kwargs=kwargs) + reader_agent = ItemXlsxManager(filename, **kwargs) + elif filename.endswith(".csv"): + tab_name = kwargs.pop('tab_name', None) + # unwanted_kwargs(context="ItemManager for .csv files", kwargs=kwargs) + reader_agent = ItemCsvManager(filename, tab_name=tab_name, **kwargs) + elif filename.endswith(".tsv"): + tab_name = kwargs.pop('tab_name', None) + # unwanted_kwargs(context="ItemManager for .tsv files", kwargs=kwargs) + reader_agent = ItemTsvManager(filename, tab_name=tab_name, **kwargs) + else: + raise ValueError(f"Unknown file type: {filename}") + return reader_agent + + @classmethod + def load(cls, filename: str, tab_name=None) -> AnyJsonData: + manager = cls.create_implementation_manager(filename, tab_name=tab_name) + return manager.load_content() + + +load_items = ItemManager.load diff --git a/poetry.lock b/poetry.lock index 480148ea1..95670b506 100644 --- a/poetry.lock +++ b/poetry.lock @@ -489,6 +489,18 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "chardet" +version = "5.2.0" +description = "Universal encoding detector for Python 3" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = 
"sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + [[package]] name = "charset-normalizer" version = "3.2.0" @@ -1621,4 +1633,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.7,<3.10" -content-hash = "9d01884634874c0304ebd91ae564ad7920cece54aea7de4c67955c2343e7d44b" +content-hash = "eb629a04469e24b917d9525dd06dac72f2014cc9ede879946909929f5c09b9fd" diff --git a/pyproject.toml b/pyproject.toml index ec5adce92..aaa4371f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ botocore = "^1.20.39" # This value is intentionally pinned and must not be changed casually. elasticsearch = "7.13.4" aws-requests-auth = ">=0.4.2,<1" +chardet = "^5.2.0" docker = "^4.4.4" gitpython = "^3.1.2" openpyxl = "^3.1.2" diff --git a/test/data_files/sample_items_sheet2.csv b/test/data_files/sample_items_sheet2.csv index b1d3ec2da..95567c42a 100644 --- a/test/data_files/sample_items_sheet2.csv +++ b/test/data_files/sample_items_sheet2.csv @@ -1,3 +1,3 @@ name,age,mother.name,mother.age,father.name,father.age,friends#0.name,friends#0.age,friends#1.name,friends#1.age bill,23,mary,58,fred,63,sam,22,arthur,19 -joe,9,estrella,35,anthony,34,anders,9,, \ No newline at end of file +joe,9,estrella,35,anthony,34,anders,9,, diff --git a/test/data_files/sample_items_sheet2.tsv b/test/data_files/sample_items_sheet2.tsv new file mode 100644 index 000000000..e862bf36d --- /dev/null +++ b/test/data_files/sample_items_sheet2.tsv @@ -0,0 +1,3 @@ +name age mother.name mother.age father.name father.age friends#0.name friends#0.age friends#1.name friends#1.age +bill 23 mary 58 fred 63 sam 22 arthur 19 +joe 9 estrella 35 anthony 34 anders 9 diff --git a/test/data_files/sample_items_sheet_2.tsv.README.text b/test/data_files/sample_items_sheet_2.tsv.README.text new 
file mode 100644 index 000000000..efefaf654 --- /dev/null +++ b/test/data_files/sample_items_sheet_2.tsv.README.text @@ -0,0 +1,4 @@ +Note that one of the lines in file sample_items_sheet2.tsv has two blank fields at end of line. +PyCharm and perhaps other editors "helpfully" remove trailing whitespace from lines, +so the number of columns varies line-to-line. Instead of insisting on explicit tabs at end of line, +we pad such short lines with nulls when reading from the file. diff --git a/test/test_sheet_utils.py b/test/test_sheet_utils.py index 40286d2e3..1915b3a71 100644 --- a/test/test_sheet_utils.py +++ b/test/test_sheet_utils.py @@ -1,7 +1,13 @@ import os import pytest -from dcicutils.sheet_utils import ItemTools, WorkbookManager, ItemManager +from dcicutils.sheet_utils import ( + # High-level interfaces + ItemManager, load_items, + # Low-level implementation + ItemTools, XlsxManager, ItemXlsxManager, + CsvManager, ItemCsvManager, TsvManager, ItemTsvManager, +) from .conftest_settings import TEST_DIR @@ -52,39 +58,39 @@ def test_item_tools_compute_patch_prototype_errors(headers): assert str(exc.value) == "A header cannot begin with a numeric ref: 0" -def test_item_tools_parse_value(): +def test_item_tools_parse_item_value(): for x in [37, 19.3, True, False, None, 'simple text']: - assert ItemTools.parse_value(x) == x + assert ItemTools.parse_item_value(x) == x - assert ItemTools.parse_value('3') == 3 - assert ItemTools.parse_value('+3') == 3 - assert ItemTools.parse_value('-3') == -3 + assert ItemTools.parse_item_value('3') == 3 + assert ItemTools.parse_item_value('+3') == 3 + assert ItemTools.parse_item_value('-3') == -3 - assert ItemTools.parse_value('3.5') == 3.5 - assert ItemTools.parse_value('+3.5') == 3.5 - assert ItemTools.parse_value('-3.5') == -3.5 + assert ItemTools.parse_item_value('3.5') == 3.5 + assert ItemTools.parse_item_value('+3.5') == 3.5 + assert ItemTools.parse_item_value('-3.5') == -3.5 - assert ItemTools.parse_value('3.5e1') == 35.0 - 
assert ItemTools.parse_value('+3.5e1') == 35.0 - assert ItemTools.parse_value('-3.5e1') == -35.0 + assert ItemTools.parse_item_value('3.5e1') == 35.0 + assert ItemTools.parse_item_value('+3.5e1') == 35.0 + assert ItemTools.parse_item_value('-3.5e1') == -35.0 - assert ItemTools.parse_value('') is None + assert ItemTools.parse_item_value('') is None - assert ItemTools.parse_value('null') is None - assert ItemTools.parse_value('Null') is None - assert ItemTools.parse_value('NULL') is None + assert ItemTools.parse_item_value('null') is None + assert ItemTools.parse_item_value('Null') is None + assert ItemTools.parse_item_value('NULL') is None - assert ItemTools.parse_value('true') is True - assert ItemTools.parse_value('True') is True - assert ItemTools.parse_value('TRUE') is True + assert ItemTools.parse_item_value('true') is True + assert ItemTools.parse_item_value('True') is True + assert ItemTools.parse_item_value('TRUE') is True - assert ItemTools.parse_value('false') is False - assert ItemTools.parse_value('False') is False - assert ItemTools.parse_value('FALSE') is False + assert ItemTools.parse_item_value('false') is False + assert ItemTools.parse_item_value('False') is False + assert ItemTools.parse_item_value('FALSE') is False - assert ItemTools.parse_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] - assert ItemTools.parse_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] + assert ItemTools.parse_item_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma'] + assert ItemTools.parse_item_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5] def test_item_tools_set_path_value(): @@ -158,40 +164,134 @@ def test_item_tools_set_path_value(): SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv') -SAMPLE_CSV_FILE_RAW_CONTENT = SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2'] +SAMPLE_CSV_FILE_RAW_CONTENT = {CsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} 
-SAMPLE_CSV_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2'] +SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} +SAMPLE_TSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.tsv') -def test_workbook_manager_load_content(): +SAMPLE_TSV_FILE_RAW_CONTENT = {TsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']} - wt = WorkbookManager(SAMPLE_XLSX_FILE) +SAMPLE_TSV_FILE_ITEM_CONTENT = {ItemTsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']} + + +def test_xlsx_manager_load_content(): + + wt = XlsxManager(SAMPLE_XLSX_FILE) assert wt.load_content() == SAMPLE_XLSX_FILE_RAW_CONTENT -def test_workbook_manager_load_workbook(): +def test_xlsx_manager_load(): - assert WorkbookManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT + assert XlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT -def test_workbook_manager_load_csv(): +def test_xlsx_manager_load_csv(): with pytest.raises(Exception): - WorkbookManager.load_workbook(SAMPLE_CSV_FILE) + XlsxManager.load(SAMPLE_CSV_FILE) -def test_item_manager_load_content(): +def test_item_xlsx_manager_load_content(): - it = ItemManager(SAMPLE_XLSX_FILE) + it = ItemXlsxManager(SAMPLE_XLSX_FILE) assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT -def test_item_manager_load_workbook(): +def test_item_xlsx_manager_load(): + + assert ItemXlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + +def test_item_xlsx_manager_load_csv(): + + with pytest.raises(Exception): + ItemXlsxManager.load(SAMPLE_CSV_FILE) + + +def test_csv_manager_load_content(): + + wt = CsvManager(SAMPLE_CSV_FILE) + assert wt.load_content() == SAMPLE_CSV_FILE_RAW_CONTENT - assert ItemManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT +def test_csv_manager_load(): -def test_item_manager_load_csv(): + assert CsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT + + +def 
test_csv_manager_load_csv(): with pytest.raises(Exception): - ItemManager.load_workbook(SAMPLE_CSV_FILE) + CsvManager.load(SAMPLE_XLSX_FILE) + + +def test_item_csv_manager_load_content(): + + it = ItemCsvManager(SAMPLE_CSV_FILE) + assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT + + +def test_item_csv_manager_load(): + + assert ItemCsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + + +def test_item_csv_manager_load_csv(): + + with pytest.raises(Exception): + ItemCsvManager.load(SAMPLE_XLSX_FILE) + + +def test_tsv_manager_load_content(): + + wt = TsvManager(SAMPLE_TSV_FILE) + assert wt.load_content() == SAMPLE_TSV_FILE_RAW_CONTENT + + +def test_tsv_manager_load(): + + assert TsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_RAW_CONTENT + + +def test_tsv_manager_load_csv(): + + with pytest.raises(Exception): + TsvManager.load(SAMPLE_XLSX_FILE) + + +def test_item_tsv_manager_load_content(): + + it = ItemTsvManager(SAMPLE_TSV_FILE) + assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT + + +def test_item_tsv_manager_load(): + + assert ItemTsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT + + +def test_item_tsv_manager_load_csv(): + + with pytest.raises(Exception): + ItemTsvManager.load(SAMPLE_XLSX_FILE) + + +def test_item_manager_load(): + + assert ItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + assert ItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + + with pytest.raises(ValueError): + ItemManager.load("something.else") + + +def test_load_items(): + + assert load_items(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT + + assert load_items(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT + + with pytest.raises(ValueError): + load_items("something.else")