Skip to content

Commit

Permalink
Proper error message when trying to use an unsupported combination of…
Browse files Browse the repository at this point in the history
… Pandas option

and archives with default values (issue #106).
  • Loading branch information
niconoe committed Jul 8, 2024
1 parent d52f285 commit 60881bb
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 4 deletions.
6 changes: 6 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
Current (unreleased)
--------------------

- Raise a proper error message when trying to use an unsupported combination of Pandas options
(chunksize or iterator=True) and archives with default values (issue #106).

v0.16.0 (2023-11-13)
--------------------

Expand Down
20 changes: 16 additions & 4 deletions dwca/read.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from typing import List, Optional, Dict, Any, IO, Tuple
from xml.etree.ElementTree import Element

from pandas.io.parsers import TextFileReader

import dwca.vendor
from dwca.descriptors import ArchiveDescriptor, DataFileDescriptor, shorten_term
from dwca.exceptions import RowNotFound, InvalidArchive, InvalidSimpleArchive, NotADataFile
Expand Down Expand Up @@ -200,15 +202,25 @@ def pd_read(self, relative_path, **kwargs):
kwargs['header'] = None
kwargs['names'] = datafile_descriptor.short_headers

df = read_csv(self.absolute_temporary_path(relative_path), **kwargs)
df_or_textreader = read_csv(self.absolute_temporary_path(relative_path), **kwargs)

# Add a column for default values, if present in the file descriptor
for field in datafile_descriptor.fields:
field_default_value = field['default']
if field_default_value is not None:
df[shorten_term(field['term'])] = field_default_value

return df
if isinstance(df_or_textreader, TextFileReader):
# I don't see how to assign default values to a TextFileReader, so
# this is currently unsupported
raise ValueError(
"Pandas read_csv() was called with a chunksize or iterator=True, "
"and therefore returns a TextFileReader instead of a DataFrame "
"which is not supported in combination with default values of "
"the archive."
)

df_or_textreader[shorten_term(field['term'])] = field_default_value

return df_or_textreader

def orphaned_extension_rows(self) -> Dict[str, Dict[str, List[int]]]:
"""Return a dict of the orphaned extension rows.
Expand Down
20 changes: 20 additions & 0 deletions dwca/test/test_dwcareader.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,26 @@ def test_pd_read_simple_case(self):
assert df["scientificName"].values.tolist() == \
["tetraodon fluviatilis", "betta splendens"]

def test_pd_read_chunked_default_value(self):
    """pd_read() must raise ValueError when chunksize meets default values.

    See: https://github.com/BelgianBiodiversityPlatform/python-dwca-reader/issues/106
    """
    archive_path = sample_data_path("dwca-test-default.zip")
    with DwCAReader(archive_path) as dwca, pytest.raises(ValueError):
        # Iterate the way a caller would; the chunk contents are irrelevant here.
        for _ in dwca.pd_read("occurrence.txt", chunksize=1):
            pass

def test_pd_read_chunked(self):
    """If no default values are available in the archive, chunksize should work.

    Every chunk yielded by pd_read() must be a DataFrame, and at least one
    chunk must be yielded.

    See: https://github.com/BelgianBiodiversityPlatform/python-dwca-reader/issues/106
    """
    with DwCAReader(sample_data_path("dwca-simple-test-archive.zip")) as dwca:
        chunk_count = 0
        for chunk in dwca.pd_read("occurrence.txt", chunksize=2):
            assert isinstance(chunk, pd.DataFrame)
            chunk_count += 1

    # Guard against a vacuous pass: if the reader yielded nothing, the
    # isinstance assertion above would never have been executed.
    assert chunk_count > 0


def test_pd_read_no_data_files(self):
with DwCAReader(sample_data_path("dwca-simple-test-archive.zip")) as dwca:
with pytest.raises(NotADataFile):
Expand Down

0 comments on commit 60881bb

Please sign in to comment.