Skip to content

Commit

Permalink
address reviews
Browse files Browse the repository at this point in the history
  • Loading branch information
galipremsagar committed Oct 15, 2024
1 parent 4a9e944 commit b8f964b
Show file tree
Hide file tree
Showing 9 changed files with 44 additions and 39 deletions.
15 changes: 7 additions & 8 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,14 @@ repos:
entry: 'pytest\.xfail'
language: pygrep
types: [python]
- id: use-only-default-rng
name: use-only-default-rng
description: 'Enforce that `default_rng` is used instead of `np.random.seed` and it must be seeded.'
- id: no-unseeded-default-rng
name: no-unseeded-default-rng
description: 'Enforce that default_rng is always called with a seed and that default_rng is used instead of np.random.seed'
entry: |
(?x)
# no unseeded default_rng
|default_rng\(\)
# no np.random.seed
|np.random.seed\(
# Check for usage of default_rng without seeding
default_rng\(\)|
# Check for usage of np.random.seed
np.random.seed\(
language: pygrep
types: [python]
- id: cmake-format
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def test_cat_series_binop_error():
@pytest.mark.parametrize("num_elements", [10, 100, 1000])
def test_categorical_unique(num_elements):
# create categorical series
rng = np.random.default_rng(seed=0)
rng = np.random.default_rng(seed=12)
pd_cat = pd.Categorical(
pd.Series(
rng.choice(
Expand Down
10 changes: 5 additions & 5 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3903,7 +3903,7 @@ def test_select_dtype_datetime_with_frequency():


def test_dataframe_describe_exclude():
rng = np.random.default_rng(seed=0)
rng = np.random.default_rng(seed=12)
data_length = 10000

df = cudf.DataFrame()
Expand All @@ -3919,7 +3919,7 @@ def test_dataframe_describe_exclude():


def test_dataframe_describe_include():
rng = np.random.default_rng(seed=0)
rng = np.random.default_rng(seed=12)
data_length = 10000

df = cudf.DataFrame()
Expand All @@ -3934,7 +3934,7 @@ def test_dataframe_describe_include():


def test_dataframe_describe_default():
rng = np.random.default_rng(seed=0)
rng = np.random.default_rng(seed=12)
data_length = 10000

df = cudf.DataFrame()
Expand All @@ -3948,7 +3948,7 @@ def test_dataframe_describe_default():


def test_series_describe_include_all():
rng = np.random.default_rng(seed=0)
rng = np.random.default_rng(seed=12)
data_length = 10000

df = cudf.DataFrame()
Expand All @@ -3971,7 +3971,7 @@ def test_series_describe_include_all():


def test_dataframe_describe_percentiles():
rng = np.random.default_rng(seed=0)
rng = np.random.default_rng(seed=12)
data_length = 10000
sample_percentiles = [0.0, 0.1, 0.33, 0.84, 0.4, 0.99]

Expand Down
32 changes: 16 additions & 16 deletions python/cudf/cudf/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@

interpolation_methods = ["linear", "lower", "higher", "midpoint", "nearest"]

rng = np.random.default_rng(seed=0)


@pytest.mark.parametrize("method", methods)
@pytest.mark.parametrize("dtype", params_dtypes)
Expand Down Expand Up @@ -86,6 +84,7 @@ def test_series_std(ddof):


def test_series_unique():
rng = np.random.default_rng(seed=0)
for size in [10**x for x in range(5)]:
arr = rng.integers(low=-1, high=10, size=size)
mask = arr != -1
Expand Down Expand Up @@ -131,6 +130,7 @@ def test_series_nunique(nan_as_null, dropna):


def test_series_scale():
rng = np.random.default_rng(seed=0)
arr = pd.Series(rng.integers(low=-10, high=10, size=100))
sr = cudf.Series(arr)

Expand Down Expand Up @@ -231,8 +231,8 @@ def test_misc_quantiles(data, q):
@pytest.mark.parametrize(
"data",
[
{"data": rng.normal(-100, 100, 1000)},
{"data": rng.integers(-50, 50, 1000)},
{"data": np.random.default_rng(seed=0).normal(-100, 100, 1000)},
{"data": np.random.default_rng(seed=0).integers(-50, 50, 1000)},
{"data": (np.zeros(100))},
{"data": np.repeat(np.nan, 100)},
{"data": np.array([1.123, 2.343, np.nan, 0.0])},
Expand Down Expand Up @@ -282,8 +282,8 @@ def test_kurt_skew_error(op):
@pytest.mark.parametrize(
"data",
[
cudf.Series(rng.normal(-100, 100, 1000)),
cudf.Series(rng.integers(-50, 50, 1000)),
cudf.Series(np.random.default_rng(seed=0).normal(-100, 100, 1000)),
cudf.Series(np.random.default_rng(seed=0).integers(-50, 50, 1000)),
cudf.Series(np.zeros(100)),
cudf.Series(np.repeat(np.nan, 100)),
cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])),
Expand Down Expand Up @@ -346,8 +346,8 @@ def test_series_median(dtype, num_na):
@pytest.mark.parametrize(
"data",
[
rng.normal(-100, 100, 1000),
rng.integers(-50, 50, 1000),
np.random.default_rng(seed=0).normal(-100, 100, 1000),
np.random.default_rng(seed=0).integers(-50, 50, 1000),
np.zeros(100),
np.array([1.123, 2.343, np.nan, 0.0]),
np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]),
Expand Down Expand Up @@ -381,8 +381,8 @@ def test_series_pct_change(data, periods, fill_method):
@pytest.mark.parametrize(
"data1",
[
rng.normal(-100, 100, 1000),
rng.integers(-50, 50, 1000),
np.random.default_rng(seed=0).normal(-100, 100, 1000),
np.random.default_rng(seed=0).integers(-50, 50, 1000),
np.zeros(100),
np.repeat(np.nan, 100),
np.array([1.123, 2.343, np.nan, 0.0]),
Expand All @@ -395,8 +395,8 @@ def test_series_pct_change(data, periods, fill_method):
@pytest.mark.parametrize(
"data2",
[
rng.normal(-100, 100, 1000),
rng.integers(-50, 50, 1000),
np.random.default_rng(seed=0).normal(-100, 100, 1000),
np.random.default_rng(seed=0).integers(-50, 50, 1000),
np.zeros(100),
np.repeat(np.nan, 100),
np.array([1.123, 2.343, np.nan, 0.0]),
Expand Down Expand Up @@ -425,8 +425,8 @@ def test_cov1d(data1, data2):
@pytest.mark.parametrize(
"data1",
[
rng.normal(-100, 100, 1000),
rng.integers(-50, 50, 1000),
np.random.default_rng(seed=0).normal(-100, 100, 1000),
np.random.default_rng(seed=0).integers(-50, 50, 1000),
np.zeros(100),
np.repeat(np.nan, 100),
np.array([1.123, 2.343, np.nan, 0.0]),
Expand All @@ -439,8 +439,8 @@ def test_cov1d(data1, data2):
@pytest.mark.parametrize(
"data2",
[
rng.normal(-100, 100, 1000),
rng.integers(-50, 50, 1000),
np.random.default_rng(seed=0).normal(-100, 100, 1000),
np.random.default_rng(seed=0).integers(-50, 50, 1000),
np.zeros(100),
np.repeat(np.nan, 100),
np.array([1.123, 2.343, np.nan, 0.0]),
Expand Down
12 changes: 9 additions & 3 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,14 @@ def test_string_get_item(ps_gs, item):
np.array([False] * 5),
cupy.asarray(np.array([True] * 5)),
cupy.asarray(np.array([False] * 5)),
rng.integers(0, 2, 5).astype("bool").tolist(),
rng.integers(0, 2, 5).astype("bool"),
cupy.asarray(rng.integers(0, 2, 5).astype("bool")),
np.random.default_rng(seed=0)
.integers(0, 2, 5)
.astype("bool")
.tolist(),
np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool"),
cupy.asarray(
np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool")
),
],
)
def test_string_bool_mask(ps_gs, item):
Expand Down Expand Up @@ -1079,6 +1084,7 @@ def test_string_set_scalar(scalar):


def test_string_index():
rng = np.random.default_rng(seed=0)
pdf = pd.DataFrame(rng.random(size=(5, 5)))
gdf = cudf.DataFrame.from_pandas(pdf)
stringIndex = ["a", "b", "c", "d", "e"]
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/utils/hash_vocab_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def _find_hash_for_internal(hash_bin, rng):
return bins, a, b


def _perfect_hash(integers, rng, max_constant):
def _perfect_hash(integers, max_constant, rng):
num_top_level_bins = len(integers) // 4

init_bins, init_a, init_b = _pick_initial_a_b(
Expand Down Expand Up @@ -264,7 +264,7 @@ def hash_vocab(
hash_table,
inner_table_coeffs,
offsets_into_ht,
) = _perfect_hash(keys, rng, 10)
) = _perfect_hash(keys, 10, rng)

_pack_keys_and_values(hash_table, hashed_vocab)
_store_func(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf_pandas_tests/test_cudf_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1142,7 +1142,7 @@ def test_private_method_result_wrapped():


def test_numpy_var():
rng = np.random.default_rng(seed=0)
rng = np.random.default_rng(seed=42)
data = rng.random(1000)
psr = pd.Series(data)
sr = xpd.Series(data)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf_pandas_tests/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
reason="function names change across versions of pandas, so making sure it only runs on latest version of pandas",
)
def test_profiler():
rng = np.random.default_rng(seed=0)
rng = np.random.default_rng(seed=42)
with Profiler() as profiler:
df = pd.DataFrame(
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ def dask_client():


def test_1d_distributed(dask_client):
rng = np.random.default_rng(seed=0)
rng = np.random.default_rng(seed=42)
ts = pd.Series(rng.random(100))
m = 10
return stumpy.stumped(dask_client, ts, m)


def test_multidimensional_distributed_timeseries(dask_client):
rng = np.random.default_rng(seed=0)
rng = np.random.default_rng(seed=42)
# Each row represents data from a different dimension while each column represents
# data from the same dimension
your_time_series = rng.random(3, 1000)
Expand Down

0 comments on commit b8f964b

Please sign in to comment.