From 60881bb35224c2e7efcadc9b189adec1c2a75be9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolas=20No=C3=A9?= <nicolas@niconoe.eu>
Date: Mon, 8 Jul 2024 13:40:55 +0200
Subject: [PATCH] Proper error message when trying to use an unsupported
 combination of Pandas option and archives with default values (issue #106).

---
 CHANGES.txt                  |  6 ++++++
 dwca/read.py                 | 20 ++++++++++++++++----
 dwca/test/test_dwcareader.py | 20 ++++++++++++++++++++
 3 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 5344222..13d98c2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Current (unreleased)
+--------------------
+
+- Proper error message (ValueError) when pd_read() is used with the Pandas chunksize or
+  iterator options on an archive that has default values (issue #106).
+
 v0.16.0 (2023-11-13)
 --------------------
 
diff --git a/dwca/read.py b/dwca/read.py
index a571a67..9141cab 100644
--- a/dwca/read.py
+++ b/dwca/read.py
@@ -10,6 +10,8 @@
 from typing import List, Optional, Dict, Any, IO, Tuple
 from xml.etree.ElementTree import Element
 
+from pandas.io.parsers import TextFileReader
+
 import dwca.vendor
 from dwca.descriptors import ArchiveDescriptor, DataFileDescriptor, shorten_term
 from dwca.exceptions import RowNotFound, InvalidArchive, InvalidSimpleArchive, NotADataFile
@@ -200,15 +202,25 @@ def pd_read(self, relative_path, **kwargs):
         kwargs['header'] = None
         kwargs['names'] = datafile_descriptor.short_headers
 
-        df = read_csv(self.absolute_temporary_path(relative_path), **kwargs)
+        df_or_textreader = read_csv(self.absolute_temporary_path(relative_path), **kwargs)
 
         # Add a column for default values, if present in the file descriptor
         for field in datafile_descriptor.fields:
             field_default_value = field['default']
             if field_default_value is not None:
-                df[shorten_term(field['term'])] = field_default_value
-
-        return df
+                if isinstance(df_or_textreader, TextFileReader):
+                    # I don't see how to assign default values to a TextFileReader, so
+                    # this is currently unsupported
+                    raise ValueError(
+                        "Pandas read_csv() was called with a chunksize or iterator=True, "
+                        "and therefore returns a TextFileReader instead of a DataFrame "
+                        "which is not supported in combination with default values of "
+                        "the archive."
+                    )
+
+                df_or_textreader[shorten_term(field['term'])] = field_default_value
+
+        return df_or_textreader
 
     def orphaned_extension_rows(self) -> Dict[str, Dict[str, List[int]]]:
         """Return a dict of the orphaned extension rows.
diff --git a/dwca/test/test_dwcareader.py b/dwca/test/test_dwcareader.py
index 22027b1..f3616ab 100644
--- a/dwca/test/test_dwcareader.py
+++ b/dwca/test/test_dwcareader.py
@@ -48,6 +48,26 @@ def test_pd_read_simple_case(self):
             assert df["scientificName"].values.tolist() == \
                 ["tetraodon fluviatilis", "betta splendens"]
 
+    def test_pd_read_chunked_default_value(self):
+        """Pandas chunksize should not be used with default values.
+
+        See: https://github.com/BelgianBiodiversityPlatform/python-dwca-reader/issues/106
+        """
+        with DwCAReader(sample_data_path("dwca-test-default.zip")) as dwca:
+            with pytest.raises(ValueError):
+                for chunk in dwca.pd_read("occurrence.txt", chunksize=1):
+                    pass
+
+    def test_pd_read_chunked(self):
+        """If no default values are available in the archive, chunksize should work.
+
+        See: https://github.com/BelgianBiodiversityPlatform/python-dwca-reader/issues/106
+        """
+        with DwCAReader(sample_data_path("dwca-simple-test-archive.zip")) as dwca:
+            for chunk in dwca.pd_read("occurrence.txt", chunksize=2):
+                assert isinstance(chunk, pd.DataFrame)
+
+
     def test_pd_read_no_data_files(self):
         with DwCAReader(sample_data_path("dwca-simple-test-archive.zip")) as dwca:
             with pytest.raises(NotADataFile):