Skip to content

Commit

Permalink
Fix default int types for Pandas DataFrame and Series (scikit-bio#2034)
Browse files Browse the repository at this point in the history
* Fix default int types for pd DataFrame and Series

* Remove trailing hash and fix fasta sniffer

* Use dtype= instead of astype()

* Remove return statement

* Fix tests failing on linux
  • Loading branch information
mataton authored Jun 5, 2024
1 parent e1ede1e commit 545c985
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 25 deletions.
16 changes: 8 additions & 8 deletions skbio/diversity/tests/test_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,19 +170,19 @@ def test_empty(self):
# empty vector
actual = alpha_diversity('sobs',
np.array([], dtype=np.int64))
expected = pd.Series([0])
expected = pd.Series([0], dtype=int)
assert_series_almost_equal(actual, expected)

# array of empty vector
actual = alpha_diversity('sobs',
np.array([[]], dtype=np.int64))
expected = pd.Series([0])
expected = pd.Series([0], dtype=int)
assert_series_almost_equal(actual, expected)

# array of empty vectors
actual = alpha_diversity('sobs',
np.array([[], []], dtype=np.int64))
expected = pd.Series([0, 0])
expected = pd.Series([0, 0], dtype=int)
assert_series_almost_equal(actual, expected)

# empty vector
Expand All @@ -208,12 +208,12 @@ def test_empty(self):
# empty Table
actual = alpha_diversity('sobs', Table(np.array([[]]), [], ['S1', ]))
actual.index = pd.RangeIndex(len(actual))
expected = pd.Series([0])
expected = pd.Series([0], dtype=int)
assert_series_almost_equal(actual, expected)

def test_single_count_vector(self):
actual = alpha_diversity('sobs', np.array([1, 0, 2]))
expected = pd.Series([2])
expected = pd.Series([2], dtype=int)
assert_series_almost_equal(actual, expected)

actual = alpha_diversity('faith_pd', np.array([1, 3, 0, 1, 0]),
Expand Down Expand Up @@ -255,14 +255,14 @@ def test_input_types(self):

def test_sobs(self):
# expected values hand-calculated
expected = pd.Series([3, 3, 3, 3], index=self.sids1)
expected = pd.Series([3, 3, 3, 3], index=self.sids1, dtype=int)
actual = alpha_diversity('sobs', self.table1, self.sids1)
assert_series_almost_equal(actual, expected)
# function passed instead of string
actual = alpha_diversity(sobs, self.table1, self.sids1)
assert_series_almost_equal(actual, expected)
# alt input table
expected = pd.Series([2, 1, 0], index=self.sids2)
expected = pd.Series([2, 1, 0], index=self.sids2, dtype=int)
actual = alpha_diversity('sobs', self.table2, self.sids2)
assert_series_almost_equal(actual, expected)

Expand Down Expand Up @@ -315,7 +315,7 @@ def test_phydiv(self):

def test_no_ids(self):
# expected values hand-calculated
expected = pd.Series([3, 3, 3, 3])
expected = pd.Series([3, 3, 3, 3], dtype=int)
actual = alpha_diversity('sobs', self.table1)
assert_series_almost_equal(actual, expected)

Expand Down
2 changes: 1 addition & 1 deletion skbio/io/format/fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,7 +955,7 @@ def _parse_quality_scores(chunks):

qual_str = " ".join(chunks)
try:
quality = np.asarray(qual_str.split(), dtype=int)
quality = np.asarray(qual_str.split(), dtype=np.int64)
except ValueError:
raise QUALFormatError(
"Could not convert quality scores to integers:\n%s" % str(qual_str)
Expand Down
3 changes: 3 additions & 0 deletions skbio/io/format/tests/test_taxdump.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import numpy as np

from skbio.util import get_data_path, assert_data_frame_almost_equal
from skbio.util._testing import _data_frame_to_default_int_type
from skbio.io.format.taxdump import _taxdump_to_data_frame


Expand Down Expand Up @@ -68,6 +69,7 @@ def test_nodes_default(self):
'inherited_MGC_flag', 'GenBank_hidden_flag',
'hidden_subtree_root_flag', 'comments']).set_index('tax_id')
exp['comments'] = exp['comments'].astype('O')
_data_frame_to_default_int_type(exp)
assert_data_frame_almost_equal(obs, exp)

def test_names_default(self):
Expand Down Expand Up @@ -130,6 +132,7 @@ def test_nodes_slim(self):
[1038927, 562, 'no rank'],
[2580236, 488338, 'species']],
columns=['tax_id', 'parent_tax_id', 'rank']).set_index('tax_id')
_data_frame_to_default_int_type(exp)
assert_data_frame_almost_equal(obs, exp)

def test_custom_scheme(self):
Expand Down
35 changes: 27 additions & 8 deletions skbio/stats/distance/tests/test_bioenv.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from skbio.stats.distance import bioenv
from skbio.stats.distance._bioenv import _scale
from skbio.util import get_data_path, assert_data_frame_almost_equal
from skbio.util._testing import _data_frame_to_default_int_type


class BIOENVTests(TestCase):
Expand Down Expand Up @@ -75,16 +76,30 @@ def setUp(self):

# Load expected results.
self.exp_results = pd.read_csv(get_data_path('exp_results.txt'),
sep='\t', index_col=0)
sep='\t',
index_col=0)
_data_frame_to_default_int_type(self.exp_results)

self.exp_results_single_column = pd.read_csv(
get_data_path('exp_results_single_column.txt'), sep='\t',
index_col=0)
get_data_path('exp_results_single_column.txt'),
sep='\t',
index_col=0
)
_data_frame_to_default_int_type(self.exp_results_single_column)

self.exp_results_different_column_order = pd.read_csv(
get_data_path('exp_results_different_column_order.txt'), sep='\t',
index_col=0)
get_data_path('exp_results_different_column_order.txt'),
sep='\t',
index_col=0
)
_data_frame_to_default_int_type(self.exp_results_different_column_order)

self.exp_results_vegan = pd.read_csv(
get_data_path('bioenv_exp_results_vegan.txt'), sep='\t',
index_col=0)
get_data_path('bioenv_exp_results_vegan.txt'),
sep='\t',
index_col=0
)
_data_frame_to_default_int_type(self.exp_results_vegan)

def test_bioenv_all_columns_implicit(self):
# Test with all columns in data frame (implicitly).
Expand Down Expand Up @@ -146,7 +161,11 @@ def test_bioenv_vegan_example(self):
# same distances yields *very* similar results. Thus, the discrepancy
# seems to stem from differences when computing ranks/ties.
obs = bioenv(self.dm_vegan, self.df_vegan)
assert_data_frame_almost_equal(obs, self.exp_results_vegan, rtol=1e-3)
assert_data_frame_almost_equal(
obs,
self.exp_results_vegan,
rtol=1e-3
)

def test_bioenv_no_distance_matrix(self):
with self.assertRaises(TypeError):
Expand Down
31 changes: 23 additions & 8 deletions skbio/stats/distance/tests/test_mantel.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from skbio.stats.distance._cutils import mantel_perm_pearsonr_cy
from skbio.stats.distance._utils import distmat_reorder_condensed
from skbio.util import get_data_path, assert_data_frame_almost_equal
from skbio.util._testing import _data_frame_to_default_int_type


class MantelTestData(TestCase):
Expand Down Expand Up @@ -495,25 +496,39 @@ def setUp(self):
self.min_dms = (self.minx_dm, self.miny_dm, self.minz_dm)

self.exp_results_minimal = pd.read_csv(
get_data_path('pwmantel_exp_results_minimal.txt'), sep='\t',
index_col=(0, 1))
get_data_path('pwmantel_exp_results_minimal.txt'),
sep='\t',
index_col=(0, 1)
)
_data_frame_to_default_int_type(self.exp_results_minimal)

self.exp_results_minimal_with_labels = pd.read_csv(
get_data_path('pwmantel_exp_results_minimal_with_labels.txt'),
sep='\t', index_col=(0, 1))
sep='\t',
index_col=(0, 1)
)
_data_frame_to_default_int_type(self.exp_results_minimal_with_labels)

self.exp_results_duplicate_dms = pd.read_csv(
get_data_path('pwmantel_exp_results_duplicate_dms.txt'),
sep='\t', index_col=(0, 1))
sep='\t',
index_col=(0, 1)
)
_data_frame_to_default_int_type(self.exp_results_duplicate_dms)

self.exp_results_na_p_value = pd.read_csv(
get_data_path('pwmantel_exp_results_na_p_value.txt'),
sep='\t', index_col=(0, 1))
sep='\t',
index_col=(0, 1)
)
_data_frame_to_default_int_type(self.exp_results_na_p_value)

self.exp_results_reordered_distance_matrices = pd.read_csv(
get_data_path('pwmantel_exp_results_reordered_distance_matrices'
'.txt'),
sep='\t', index_col=(0, 1))
get_data_path('pwmantel_exp_results_reordered_distance_matrices.txt'),
sep='\t',
index_col=(0, 1)
)
_data_frame_to_default_int_type(self.exp_results_reordered_distance_matrices)

self.exp_results_dm_dm2 = pd.read_csv(
get_data_path('pwmantel_exp_results_dm_dm2.txt'),
Expand Down
15 changes: 15 additions & 0 deletions skbio/util/_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,21 @@ def assert_data_frame_almost_equal(left, right, rtol=1e-5):
assert_index_equal(left.index, right.index)


def _data_frame_to_default_int_type(df):
    """Cast integer columns of a data frame to the platform-default int type.

    Pandas DataFrame defaults to int64 when reading integers, rather than
    respecting the platform default (Linux and MacOS: int64, Windows: int32).
    This causes issues when comparing observed and expected data frames on
    Windows. This function repairs the issue by casting every column picked
    up by ``df.select_dtypes("int")`` with ``astype(int)``, i.e. to whatever
    integer type ``int`` resolves to on the running platform.

    Parameters
    ----------
    df : pd.DataFrame
        Data frame whose integer columns are converted. It is modified in
        place; non-integer columns are left untouched.

    Returns
    -------
    None
        The conversion happens in place, so nothing is returned.

    Notes
    -----
    See: https://github.com/unionai-oss/pandera/issues/726

    """
    # Resolve the set of integer columns once, before any column is replaced,
    # so the loop never iterates over a selection it is mutating.
    integer_columns = df.select_dtypes("int").columns
    for name in integer_columns:
        df[name] = df[name].astype(int)


def assert_series_almost_equal(left, right):
# pass all kwargs to ensure this function has consistent behavior even if
# `assert_series_equal`'s defaults change
Expand Down

0 comments on commit 545c985

Please sign in to comment.