Fix default int types for Pandas DataFrame and Series (scikit-bio#2034)

* Fix default int types for pd DataFrame and Series
* Remove trailing hash and fix fasta sniffer
* Use dtype= instead of astype()
* Remove return statement
* Fix tests failing on Linux
mataton · Jun 5, 2024 · 545c985
1 parent e1ede1e
commit 545c985
Show file tree

Hide file tree

Showing 6 changed files with 77 additions and 25 deletions.
diff --git a/skbio/diversity/tests/test_driver.py b/skbio/diversity/tests/test_driver.py
@@ -170,19 +170,19 @@ def test_empty(self):
         # empty vector
         actual = alpha_diversity('sobs',
                                  np.array([], dtype=np.int64))
-        expected = pd.Series([0])
+        expected = pd.Series([0], dtype=int)
         assert_series_almost_equal(actual, expected)
 
         # array of empty vector
         actual = alpha_diversity('sobs',
                                  np.array([[]], dtype=np.int64))
-        expected = pd.Series([0])
+        expected = pd.Series([0], dtype=int)
         assert_series_almost_equal(actual, expected)
 
         # array of empty vectors
         actual = alpha_diversity('sobs',
                                  np.array([[], []], dtype=np.int64))
-        expected = pd.Series([0, 0])
+        expected = pd.Series([0, 0], dtype=int)
         assert_series_almost_equal(actual, expected)
 
         # empty vector
@@ -208,12 +208,12 @@ def test_empty(self):
         # empty Table
         actual = alpha_diversity('sobs', Table(np.array([[]]), [], ['S1', ]))
         actual.index = pd.RangeIndex(len(actual))
-        expected = pd.Series([0])
+        expected = pd.Series([0], dtype=int)
         assert_series_almost_equal(actual, expected)
 
     def test_single_count_vector(self):
         actual = alpha_diversity('sobs', np.array([1, 0, 2]))
-        expected = pd.Series([2])
+        expected = pd.Series([2], dtype=int)
         assert_series_almost_equal(actual, expected)
 
         actual = alpha_diversity('faith_pd', np.array([1, 3, 0, 1, 0]),
@@ -255,14 +255,14 @@ def test_input_types(self):
 
     def test_sobs(self):
         # expected values hand-calculated
-        expected = pd.Series([3, 3, 3, 3], index=self.sids1)
+        expected = pd.Series([3, 3, 3, 3], index=self.sids1, dtype=int)
         actual = alpha_diversity('sobs', self.table1, self.sids1)
         assert_series_almost_equal(actual, expected)
         # function passed instead of string
         actual = alpha_diversity(sobs, self.table1, self.sids1)
         assert_series_almost_equal(actual, expected)
         # alt input table
-        expected = pd.Series([2, 1, 0], index=self.sids2)
+        expected = pd.Series([2, 1, 0], index=self.sids2, dtype=int)
         actual = alpha_diversity('sobs', self.table2, self.sids2)
         assert_series_almost_equal(actual, expected)
 
@@ -315,7 +315,7 @@ def test_phydiv(self):
 
     def test_no_ids(self):
         # expected values hand-calculated
-        expected = pd.Series([3, 3, 3, 3])
+        expected = pd.Series([3, 3, 3, 3], dtype=int)
         actual = alpha_diversity('sobs', self.table1)
         assert_series_almost_equal(actual, expected)
 

diff --git a/skbio/io/format/fasta.py b/skbio/io/format/fasta.py
@@ -955,7 +955,7 @@ def _parse_quality_scores(chunks):
 
     qual_str = " ".join(chunks)
     try:
-        quality = np.asarray(qual_str.split(), dtype=int)
+        quality = np.asarray(qual_str.split(), dtype=np.int64)
     except ValueError:
         raise QUALFormatError(
             "Could not convert quality scores to integers:\n%s" % str(qual_str)

diff --git a/skbio/io/format/tests/test_taxdump.py b/skbio/io/format/tests/test_taxdump.py
@@ -13,6 +13,7 @@
 import numpy as np
 
 from skbio.util import get_data_path, assert_data_frame_almost_equal
+from skbio.util._testing import _data_frame_to_default_int_type
 from skbio.io.format.taxdump import _taxdump_to_data_frame
 
 
@@ -68,6 +69,7 @@ def test_nodes_default(self):
                  'inherited_MGC_flag', 'GenBank_hidden_flag',
                  'hidden_subtree_root_flag', 'comments']).set_index('tax_id')
         exp['comments'] = exp['comments'].astype('O')
+        _data_frame_to_default_int_type(exp)
         assert_data_frame_almost_equal(obs, exp)
 
     def test_names_default(self):
@@ -130,6 +132,7 @@ def test_nodes_slim(self):
             [1038927, 562,    'no rank'],
             [2580236, 488338, 'species']],
             columns=['tax_id', 'parent_tax_id', 'rank']).set_index('tax_id')
+        _data_frame_to_default_int_type(exp)
         assert_data_frame_almost_equal(obs, exp)
 
     def test_custom_scheme(self):

diff --git a/skbio/stats/distance/tests/test_bioenv.py b/skbio/stats/distance/tests/test_bioenv.py
@@ -15,6 +15,7 @@
 from skbio.stats.distance import bioenv
 from skbio.stats.distance._bioenv import _scale
 from skbio.util import get_data_path, assert_data_frame_almost_equal
+from skbio.util._testing import _data_frame_to_default_int_type
 
 
 class BIOENVTests(TestCase):
@@ -75,16 +76,30 @@ def setUp(self):
 
         # Load expected results.
         self.exp_results = pd.read_csv(get_data_path('exp_results.txt'),
-                                       sep='\t', index_col=0)
+                                       sep='\t',
+                                       index_col=0)
+        _data_frame_to_default_int_type(self.exp_results)
+
         self.exp_results_single_column = pd.read_csv(
-            get_data_path('exp_results_single_column.txt'), sep='\t',
-            index_col=0)
+            get_data_path('exp_results_single_column.txt'),
+            sep='\t',
+            index_col=0
+        )
+        _data_frame_to_default_int_type(self.exp_results_single_column)
+
         self.exp_results_different_column_order = pd.read_csv(
-            get_data_path('exp_results_different_column_order.txt'), sep='\t',
-            index_col=0)
+            get_data_path('exp_results_different_column_order.txt'),
+            sep='\t',
+            index_col=0
+        )
+        _data_frame_to_default_int_type(self.exp_results_different_column_order)
+
         self.exp_results_vegan = pd.read_csv(
-            get_data_path('bioenv_exp_results_vegan.txt'), sep='\t',
-            index_col=0)
+            get_data_path('bioenv_exp_results_vegan.txt'),
+            sep='\t',
+            index_col=0
+        )
+        _data_frame_to_default_int_type(self.exp_results_vegan)
 
     def test_bioenv_all_columns_implicit(self):
         # Test with all columns in data frame (implicitly).
@@ -146,7 +161,11 @@ def test_bioenv_vegan_example(self):
         # same distances yields *very* similar results. Thus, the discrepancy
         # seems to stem from differences when computing ranks/ties.
         obs = bioenv(self.dm_vegan, self.df_vegan)
-        assert_data_frame_almost_equal(obs, self.exp_results_vegan, rtol=1e-3)
+        assert_data_frame_almost_equal(
+            obs,
+            self.exp_results_vegan,
+            rtol=1e-3
+            )
 
     def test_bioenv_no_distance_matrix(self):
         with self.assertRaises(TypeError):

diff --git a/skbio/stats/distance/tests/test_mantel.py b/skbio/stats/distance/tests/test_mantel.py
@@ -25,6 +25,7 @@
 from skbio.stats.distance._cutils import mantel_perm_pearsonr_cy
 from skbio.stats.distance._utils import distmat_reorder_condensed
 from skbio.util import get_data_path, assert_data_frame_almost_equal
+from skbio.util._testing import _data_frame_to_default_int_type
 
 
 class MantelTestData(TestCase):
@@ -495,25 +496,39 @@ def setUp(self):
         self.min_dms = (self.minx_dm, self.miny_dm, self.minz_dm)
 
         self.exp_results_minimal = pd.read_csv(
-            get_data_path('pwmantel_exp_results_minimal.txt'), sep='\t',
-            index_col=(0, 1))
+            get_data_path('pwmantel_exp_results_minimal.txt'),
+            sep='\t',
+            index_col=(0, 1)
+        )
+        _data_frame_to_default_int_type(self.exp_results_minimal)
 
         self.exp_results_minimal_with_labels = pd.read_csv(
             get_data_path('pwmantel_exp_results_minimal_with_labels.txt'),
-            sep='\t', index_col=(0, 1))
+            sep='\t',
+            index_col=(0, 1)
+        )
+        _data_frame_to_default_int_type(self.exp_results_minimal_with_labels)
 
         self.exp_results_duplicate_dms = pd.read_csv(
             get_data_path('pwmantel_exp_results_duplicate_dms.txt'),
-            sep='\t', index_col=(0, 1))
+            sep='\t',
+            index_col=(0, 1)
+        )
+        _data_frame_to_default_int_type(self.exp_results_duplicate_dms)
 
         self.exp_results_na_p_value = pd.read_csv(
             get_data_path('pwmantel_exp_results_na_p_value.txt'),
-            sep='\t', index_col=(0, 1))
+            sep='\t',
+            index_col=(0, 1)
+        )
+        _data_frame_to_default_int_type(self.exp_results_na_p_value)
 
         self.exp_results_reordered_distance_matrices = pd.read_csv(
-            get_data_path('pwmantel_exp_results_reordered_distance_matrices'
-                          '.txt'),
-            sep='\t', index_col=(0, 1))
+            get_data_path('pwmantel_exp_results_reordered_distance_matrices.txt'),
+            sep='\t',
+            index_col=(0, 1)
+        )
+        _data_frame_to_default_int_type(self.exp_results_reordered_distance_matrices)
 
         self.exp_results_dm_dm2 = pd.read_csv(
             get_data_path('pwmantel_exp_results_dm_dm2.txt'),

diff --git a/skbio/util/_testing.py b/skbio/util/_testing.py
@@ -349,6 +349,21 @@ def assert_data_frame_almost_equal(left, right, rtol=1e-5):
     assert_index_equal(left.index, right.index)
 
 
+def _data_frame_to_default_int_type(df):
+    """Convert integer columns in a data frame into the platform-default integer type.
+
+    Pandas defaults to int64 when reading integers into a DataFrame, rather than
+    respecting the platform default (Linux and macOS: int64, Windows: int32). This
+    breaks comparisons of observed and expected data frames on Windows. This function
+    repairs the issue by converting int64 columns of a data frame to int32 on Windows.
+
+    See: https://github.com/unionai-oss/pandera/issues/726
+
+    """
+    for col in df.select_dtypes("int").columns:
+        df[col] = df[col].astype(int)
+
+
 def assert_series_almost_equal(left, right):
     # pass all kwargs to ensure this function has consistent behavior even if
     # `assert_series_equal`'s defaults change