Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sheet_utils refactor to add csv functionality (C4-1088) #276

Open
wants to merge 17 commits into
base: kmp_sheet_utils
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,22 @@ Change Log

* New module ``sheet_utils`` for loading workbooks.

* class ``WorkbookManager`` for loading raw data
* Important things of interest:

* class ``ItemManager`` for loading item data
* Class ``ItemManager`` for loading Item-style data
from any ``.xlsx``, ``.csv`` or ``.tsv`` files.

* Function ``load_items`` that does the same as ``ItemManager.load``.

* Various low-level implementation classes such as:

* Classes ``XlsxManager``, ``CsvManager``, and ``TsvManager`` for loading raw data
from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively.

* Classes ``ItemXlsxManager``, ``ItemCsvManager``, and ``ItemTsvManager`` for loading Item-style data
from ``.xlsx``, ``.csv``, and ``.tsv`` files, respectively.

* Contains a fix for a bug in ``ff_utils.get_schema_names`` (`C4-1086 <https://hms-dbmi.atlassian.net/browse/C4-1086>`_).


7.9.0
Expand Down
445 changes: 362 additions & 83 deletions dcicutils/sheet_utils.py

Large diffs are not rendered by default.

14 changes: 13 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ botocore = "^1.20.39"
# This value is intentionally pinned and must not be changed casually.
elasticsearch = "7.13.4"
aws-requests-auth = ">=0.4.2,<1"
chardet = "^5.2.0"
docker = "^4.4.4"
gitpython = "^3.1.2"
openpyxl = "^3.1.2"
Expand Down
2 changes: 1 addition & 1 deletion test/data_files/sample_items_sheet2.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
name,age,mother.name,mother.age,father.name,father.age,friends#0.name,friends#0.age,friends#1.name,friends#1.age
bill,23,mary,58,fred,63,sam,22,arthur,19
joe,9,estrella,35,anthony,34,anders,9,,
joe,9,estrella,35,anthony,34,anders,9,,
3 changes: 3 additions & 0 deletions test/data_files/sample_items_sheet2.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
name age mother.name mother.age father.name father.age friends#0.name friends#0.age friends#1.name friends#1.age
bill 23 mary 58 fred 63 sam 22 arthur 19
joe 9 estrella 35 anthony 34 anders 9
4 changes: 4 additions & 0 deletions test/data_files/sample_items_sheet_2.tsv.README.text
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Note that one of the lines in file sample_items_sheet2.tsv has two blank fields at end of line.
PyCharm and perhaps other editors "helpfully" remove trailing whitespace from lines,
so the number of columns varies line-to-line. Instead of insisting on explicit tabs at end of line,
we pad such short lines with nulls when reading from the file.
176 changes: 138 additions & 38 deletions test/test_sheet_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
import os
import pytest

from dcicutils.sheet_utils import ItemTools, WorkbookManager, ItemManager
from dcicutils.sheet_utils import (
# High-level interfaces
ItemManager, load_items,
# Low-level implementation
ItemTools, XlsxManager, ItemXlsxManager,
CsvManager, ItemCsvManager, TsvManager, ItemTsvManager,
)
from .conftest_settings import TEST_DIR


Expand Down Expand Up @@ -52,39 +58,39 @@ def test_item_tools_compute_patch_prototype_errors(headers):
assert str(exc.value) == "A header cannot begin with a numeric ref: 0"


def test_item_tools_parse_value():
def test_item_tools_parse_item_value():

for x in [37, 19.3, True, False, None, 'simple text']:
assert ItemTools.parse_value(x) == x
assert ItemTools.parse_item_value(x) == x

assert ItemTools.parse_value('3') == 3
assert ItemTools.parse_value('+3') == 3
assert ItemTools.parse_value('-3') == -3
assert ItemTools.parse_item_value('3') == 3
assert ItemTools.parse_item_value('+3') == 3
assert ItemTools.parse_item_value('-3') == -3

assert ItemTools.parse_value('3.5') == 3.5
assert ItemTools.parse_value('+3.5') == 3.5
assert ItemTools.parse_value('-3.5') == -3.5
assert ItemTools.parse_item_value('3.5') == 3.5
assert ItemTools.parse_item_value('+3.5') == 3.5
assert ItemTools.parse_item_value('-3.5') == -3.5

assert ItemTools.parse_value('3.5e1') == 35.0
assert ItemTools.parse_value('+3.5e1') == 35.0
assert ItemTools.parse_value('-3.5e1') == -35.0
assert ItemTools.parse_item_value('3.5e1') == 35.0
assert ItemTools.parse_item_value('+3.5e1') == 35.0
assert ItemTools.parse_item_value('-3.5e1') == -35.0

assert ItemTools.parse_value('') is None
assert ItemTools.parse_item_value('') is None

assert ItemTools.parse_value('null') is None
assert ItemTools.parse_value('Null') is None
assert ItemTools.parse_value('NULL') is None
assert ItemTools.parse_item_value('null') is None
assert ItemTools.parse_item_value('Null') is None
assert ItemTools.parse_item_value('NULL') is None

assert ItemTools.parse_value('true') is True
assert ItemTools.parse_value('True') is True
assert ItemTools.parse_value('TRUE') is True
assert ItemTools.parse_item_value('true') is True
assert ItemTools.parse_item_value('True') is True
assert ItemTools.parse_item_value('TRUE') is True

assert ItemTools.parse_value('false') is False
assert ItemTools.parse_value('False') is False
assert ItemTools.parse_value('FALSE') is False
assert ItemTools.parse_item_value('false') is False
assert ItemTools.parse_item_value('False') is False
assert ItemTools.parse_item_value('FALSE') is False

assert ItemTools.parse_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma']
assert ItemTools.parse_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5]
assert ItemTools.parse_item_value('alpha|beta|gamma') == ['alpha', 'beta', 'gamma']
assert ItemTools.parse_item_value('alpha|true|false|null||7|1.5') == ['alpha', True, False, None, None, 7, 1.5]


def test_item_tools_set_path_value():
Expand Down Expand Up @@ -158,40 +164,134 @@ def test_item_tools_set_path_value():

SAMPLE_CSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.csv')

SAMPLE_CSV_FILE_RAW_CONTENT = SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']
SAMPLE_CSV_FILE_RAW_CONTENT = {CsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']}

SAMPLE_CSV_FILE_ITEM_CONTENT = SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']
SAMPLE_CSV_FILE_ITEM_CONTENT = {ItemCsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']}

SAMPLE_TSV_FILE = os.path.join(TEST_DIR, 'data_files/sample_items_sheet2.tsv')

def test_workbook_manager_load_content():
SAMPLE_TSV_FILE_RAW_CONTENT = {TsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_RAW_CONTENT['Sheet2']}

wt = WorkbookManager(SAMPLE_XLSX_FILE)
SAMPLE_TSV_FILE_ITEM_CONTENT = {ItemTsvManager.DEFAULT_TAB_NAME: SAMPLE_XLSX_FILE_ITEM_CONTENT['Sheet2']}


def test_xlsx_manager_load_content():

wt = XlsxManager(SAMPLE_XLSX_FILE)
assert wt.load_content() == SAMPLE_XLSX_FILE_RAW_CONTENT


def test_workbook_manager_load_workbook():
def test_xlsx_manager_load():

assert WorkbookManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT
assert XlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_RAW_CONTENT


def test_workbook_manager_load_csv():
def test_xlsx_manager_load_csv():

with pytest.raises(Exception):
WorkbookManager.load_workbook(SAMPLE_CSV_FILE)
XlsxManager.load(SAMPLE_CSV_FILE)


def test_item_manager_load_content():
def test_item_xlsx_manager_load_content():

it = ItemManager(SAMPLE_XLSX_FILE)
it = ItemXlsxManager(SAMPLE_XLSX_FILE)
assert it.load_content() == SAMPLE_XLSX_FILE_ITEM_CONTENT


def test_item_manager_load_workbook():
def test_item_xlsx_manager_load():

assert ItemXlsxManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT


def test_item_xlsx_manager_load_csv():

with pytest.raises(Exception):
ItemXlsxManager.load(SAMPLE_CSV_FILE)


def test_csv_manager_load_content():

wt = CsvManager(SAMPLE_CSV_FILE)
assert wt.load_content() == SAMPLE_CSV_FILE_RAW_CONTENT

assert ItemManager.load_workbook(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT

def test_csv_manager_load():

def test_item_manager_load_csv():
assert CsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_RAW_CONTENT


def test_csv_manager_load_csv():

with pytest.raises(Exception):
ItemManager.load_workbook(SAMPLE_CSV_FILE)
CsvManager.load(SAMPLE_XLSX_FILE)


def test_item_csv_manager_load_content():

it = ItemCsvManager(SAMPLE_CSV_FILE)
assert it.load_content() == SAMPLE_CSV_FILE_ITEM_CONTENT


def test_item_csv_manager_load():

assert ItemCsvManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT


def test_item_csv_manager_load_csv():

with pytest.raises(Exception):
ItemCsvManager.load(SAMPLE_XLSX_FILE)


def test_tsv_manager_load_content():

wt = TsvManager(SAMPLE_TSV_FILE)
assert wt.load_content() == SAMPLE_TSV_FILE_RAW_CONTENT


def test_tsv_manager_load():

assert TsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_RAW_CONTENT


def test_tsv_manager_load_csv():

with pytest.raises(Exception):
TsvManager.load(SAMPLE_XLSX_FILE)


def test_item_tsv_manager_load_content():

it = ItemTsvManager(SAMPLE_TSV_FILE)
assert it.load_content() == SAMPLE_TSV_FILE_ITEM_CONTENT


def test_item_tsv_manager_load():

assert ItemTsvManager.load(SAMPLE_TSV_FILE) == SAMPLE_TSV_FILE_ITEM_CONTENT


def test_item_tsv_manager_load_csv():

with pytest.raises(Exception):
ItemTsvManager.load(SAMPLE_XLSX_FILE)


def test_item_manager_load():

assert ItemManager.load(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT

assert ItemManager.load(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT

with pytest.raises(ValueError):
ItemManager.load("something.else")


def test_load_items():

assert load_items(SAMPLE_XLSX_FILE) == SAMPLE_XLSX_FILE_ITEM_CONTENT

assert load_items(SAMPLE_CSV_FILE) == SAMPLE_CSV_FILE_ITEM_CONTENT

with pytest.raises(ValueError):
load_items("something.else")