diff --git a/CHANGES.txt b/CHANGES.txt index 5344222..13d98c2 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,9 @@ +Current (unreleased) +-------------------- + +- Proper error message when trying to use an unsupported combination of Pandas option +and archives with default values (issue #106). + v0.16.0 (2023-11-13) -------------------- diff --git a/dwca/read.py b/dwca/read.py index a571a67..9141cab 100644 --- a/dwca/read.py +++ b/dwca/read.py @@ -10,6 +10,8 @@ from typing import List, Optional, Dict, Any, IO, Tuple from xml.etree.ElementTree import Element +from pandas.io.parsers import TextFileReader + import dwca.vendor from dwca.descriptors import ArchiveDescriptor, DataFileDescriptor, shorten_term from dwca.exceptions import RowNotFound, InvalidArchive, InvalidSimpleArchive, NotADataFile @@ -200,15 +202,25 @@ def pd_read(self, relative_path, **kwargs): kwargs['header'] = None kwargs['names'] = datafile_descriptor.short_headers - df = read_csv(self.absolute_temporary_path(relative_path), **kwargs) + df_or_textreader = read_csv(self.absolute_temporary_path(relative_path), **kwargs) # Add a column for default values, if present in the file descriptor for field in datafile_descriptor.fields: field_default_value = field['default'] if field_default_value is not None: - df[shorten_term(field['term'])] = field_default_value - - return df + if isinstance(df_or_textreader, TextFileReader): + # I don't see how to assign default values to a TextFileReader, so + # this is currently unsupported + raise ValueError( + "Pandas read_csv() was called with a chunksize or iterator=True, " + "and therefore returns a TextFileReader instead of a DataFrame " + "which is not supported in combination with default values of " + "the archive." + ) + + df_or_textreader[shorten_term(field['term'])] = field_default_value + + return df_or_textreader def orphaned_extension_rows(self) -> Dict[str, Dict[str, List[int]]]: """Return a dict of the orphaned extension rows. 
diff --git a/dwca/test/test_dwcareader.py b/dwca/test/test_dwcareader.py index 22027b1..f3616ab 100644 --- a/dwca/test/test_dwcareader.py +++ b/dwca/test/test_dwcareader.py @@ -48,6 +48,26 @@ def test_pd_read_simple_case(self): assert df["scientificName"].values.tolist() == \ ["tetraodon fluviatilis", "betta splendens"] + def test_pd_read_chunked_default_value(self): + """Pandas chunksize should not be used with default values. + + See: https://github.com/BelgianBiodiversityPlatform/python-dwca-reader/issues/106 + """ + with DwCAReader(sample_data_path("dwca-test-default.zip")) as dwca: + with pytest.raises(ValueError): + for chunk in dwca.pd_read("occurrence.txt", chunksize=1): + pass + + def test_pd_read_chunked(self): + """If no default values are available in the archive, chunksize should work. + + See: https://github.com/BelgianBiodiversityPlatform/python-dwca-reader/issues/106 + """ + with DwCAReader(sample_data_path("dwca-simple-test-archive.zip")) as dwca: + for chunk in dwca.pd_read("occurrence.txt", chunksize=2): + assert isinstance(chunk, pd.DataFrame) + + def test_pd_read_no_data_files(self): with DwCAReader(sample_data_path("dwca-simple-test-archive.zip")) as dwca: with pytest.raises(NotADataFile):