Commit da44387

[dask][tests] move make_ranking into utils (#3827)
* move make_ranking into utils

* do not cache
StrikerRUS authored Jan 24, 2021
1 parent 7363378 commit da44387
Showing 2 changed files with 88 additions and 85 deletions.
91 changes: 6 additions & 85 deletions tests/python_package_test/test_dask.py
@@ -25,6 +25,9 @@
import lightgbm
import lightgbm.dask as dlgbm

from .utils import make_ranking


data_output = ['array', 'scipy_csr_matrix', 'dataframe']
data_centers = [[[-4, -4], [4, 4]], [[-4, -4], [4, 4], [-4, 4]]]
group_sizes = [5, 5, 5, 10, 10, 10, 20, 20, 20, 50, 50]
@@ -44,92 +47,13 @@ def listen_port():
listen_port.port = 13000


def _make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
                  group=None, random_gs=False, avg_gs=10, random_state=0):
    """Generate a learning-to-rank dataset: feature vectors grouped together with
    integer-valued graded relevance scores. Replace this with a sklearn.datasets function
    if a ranking objective becomes supported in the sklearn.datasets module.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        Total number of documents (records) in the dataset.
    n_features : int, optional (default=20)
        Total number of features in the dataset.
    n_informative : int, optional (default=5)
        Number of features that are "informative" for ranking, as they are bias + beta * y
        where bias and beta are standard normal variates. If this is greater than n_features,
        the dataset will have n_features features, all of which will be informative.
    group : array-like, optional (default=None)
        1-d array or list of group sizes. When `group` is specified, this overrides n_samples, random_gs, and
        avg_gs by simply creating groups with sizes group[0], ..., group[-1].
    gmax : int, optional (default=2)
        Maximum graded relevance value for creating the relevance/target vector. If you set this to 2, for example,
        all documents in a group will have relevance scores of either 0, 1, or 2.
    random_gs : bool, optional (default=False)
        True will make group sizes ~ Poisson(avg_gs), False will make group sizes == avg_gs.
    avg_gs : int, optional (default=10)
        Average number of documents (records) in each group.

    Returns
    -------
    X : 2-d np.ndarray of shape = [n_samples (or np.sum(group)), n_features]
        Input feature matrix for ranking objective.
    y : 1-d np.array of shape = [n_samples (or np.sum(group))]
        Integer-graded relevance scores.
    group_ids : 1-d np.array of shape = [n_samples (or np.sum(group))]
        Array of group ids, each value indicates to which group each record belongs.
    """
    rnd_generator = check_random_state(random_state)

    y_vec, group_id_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)
    gid = 0

    # build target, group ID vectors.
    relvalues = range(gmax + 1)

    # build y/target and group-id vectors with user-specified group sizes.
    if group is not None and hasattr(group, '__len__'):
        n_samples = np.sum(group)

        for i, gsize in enumerate(group):
            y_vec = np.concatenate((y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True)))
            group_id_vec = np.concatenate((group_id_vec, [i] * gsize))

    # build y/target and group-id vectors according to n_samples, avg_gs, and random_gs.
    else:
        while len(y_vec) < n_samples:
            gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)

            # skip empty groups; each group needs at least one document for the pairwise learning objective.
            if gsize < 1:
                continue

            y_vec = np.append(y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True))
            group_id_vec = np.append(group_id_vec, [gid] * gsize)
            gid += 1

        y_vec, group_id_vec = y_vec[:n_samples], group_id_vec[:n_samples]

    # build feature data, X. Transform the first few columns into informative features.
    n_informative = max(min(n_features, n_informative), 0)
    X = rnd_generator.uniform(size=(n_samples, n_features))

    for j in range(n_informative):
        bias, coef = rnd_generator.normal(size=2)
        X[:, j] = bias + coef * y_vec

    return X, y_vec, group_id_vec


def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs):
    X, y, g = _make_ranking(n_samples=n_samples, random_state=42, **kwargs)
    X, y, g = make_ranking(n_samples=n_samples, random_state=42, **kwargs)
    rnd = np.random.RandomState(42)
    w = rnd.rand(X.shape[0]) * 0.01
    g_rle = np.array([len(list(grp)) for _, grp in itertools.groupby(g)])

    if output == 'dataframe':

        # add target, weight, and group to DataFrame so that partitions abide by group boundaries.
        X_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
        X = X_df.copy()
@@ -149,9 +73,7 @@ def _create_ranking_data(n_samples=100, output='array', chunk_size=50, **kwargs)
        # encode group identifiers into run-length encoding, the format LGBMRanker is expecting
        # so that within each partition, sum(g) = n_samples.
        dg = dg.map_partitions(lambda p: p.groupby('g', sort=False).apply(lambda z: z.shape[0]))

    elif output == 'array':

        # ranking arrays: one chunk per group. Each chunk must include all columns.
        p = X.shape[1]
        dX, dy, dw, dg = [], [], [], []
@@ -166,7 +88,6 @@
        dy = da.concatenate(dy, axis=0)
        dw = da.concatenate(dw, axis=0)
        dg = da.concatenate(dg, axis=0)

    else:
        raise ValueError('Ranking data creation only supported for Dask arrays and dataframes')

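An editorial aside, since the run-length encoding above is easy to misread: itertools.groupby collapses each run of equal group ids into one entry, and taking the run lengths yields the per-group document counts that LightGBM's ranking interface expects. A minimal standalone sketch (not part of the commit; the toy g below is illustrative):

import itertools

import numpy as np

# 7 documents: 3 in group 0, 2 in group 1, 2 in group 2
g = np.array([0, 0, 0, 1, 1, 2, 2])

# run-length encode the sorted group ids into group sizes
g_rle = np.array([len(list(grp)) for _, grp in itertools.groupby(g)])

print(g_rle)                  # [3 2 2]
assert g_rle.sum() == len(g)  # the sizes cover every document exactly once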
@@ -179,7 +100,7 @@ def _create_data(objective, n_samples=100, centers=2, output='array', chunk_size
    elif objective == 'regression':
        X, y = make_regression(n_samples=n_samples, random_state=42)
    else:
        raise ValueError(objective)
        raise ValueError("Unknown objective '%s'" % objective)
    rnd = np.random.RandomState(42)
    weights = rnd.random(X.shape[0]) * 0.01

@@ -198,7 +119,7 @@ def _create_data(objective, n_samples=100, centers=2, output='array', chunk_size
        dy = da.from_array(y, chunks=chunk_size)
        dw = da.from_array(weights, chunk_size)
    else:
        raise ValueError("Unknown output type %s" % output)
        raise ValueError("Unknown output type '%s'" % output)

    return X, y, weights, dX, dy, dw

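For reference, a small sketch (mine, not from the repository) of how da.from_array chunking behaves; _create_data uses the same call above with a scalar chunks value for the 1-d targets and weights, while a tuple controls rows and columns independently. The shapes below are illustrative:

import dask.array as da
import numpy as np

X = np.random.RandomState(42).rand(100, 20)
chunk_size = 50

# chunk along rows only; every block keeps all 20 columns
dX = da.from_array(X, chunks=(chunk_size, X.shape[1]))

print(dX.numblocks)               # (2, 1): two row blocks, one column block
assert (dX.compute() == X).all()  # round-trips back to the original array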
82 changes: 82 additions & 0 deletions tests/python_package_test/utils.py
@@ -1,7 +1,9 @@
# coding: utf-8
from functools import lru_cache

import numpy as np
import sklearn.datasets
from sklearn.utils import check_random_state


@lru_cache(maxsize=None)
@@ -27,3 +29,83 @@ def load_linnerud(**kwargs):
@lru_cache(maxsize=None)
def load_linnerud(**kwargs):
    return sklearn.datasets.load_linnerud(**kwargs)


def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
                 group=None, random_gs=False, avg_gs=10, random_state=0):
    """Generate a learning-to-rank dataset: feature vectors grouped together with
    integer-valued graded relevance scores. Replace this with a sklearn.datasets function
    if a ranking objective becomes supported in the sklearn.datasets module.

    Parameters
    ----------
    n_samples : int, optional (default=100)
        Total number of documents (records) in the dataset.
    n_features : int, optional (default=20)
        Total number of features in the dataset.
    n_informative : int, optional (default=5)
        Number of features that are "informative" for ranking, as they are bias + beta * y
        where bias and beta are standard normal variates. If this is greater than n_features,
        the dataset will have n_features features, all of which will be informative.
    gmax : int, optional (default=2)
        Maximum graded relevance value for creating the relevance/target vector. If you set this to 2, for example,
        all documents in a group will have relevance scores of either 0, 1, or 2.
    group : array-like, optional (default=None)
        1-d array or list of group sizes. When `group` is specified, this overrides n_samples, random_gs, and
        avg_gs by simply creating groups with sizes group[0], ..., group[-1].
    random_gs : bool, optional (default=False)
        True will make group sizes ~ Poisson(avg_gs), False will make group sizes == avg_gs.
    avg_gs : int, optional (default=10)
        Average number of documents (records) in each group.
    random_state : int, optional (default=0)
        Random seed.

    Returns
    -------
    X : 2-d np.ndarray of shape = [n_samples (or np.sum(group)), n_features]
        Input feature matrix for ranking objective.
    y : 1-d np.array of shape = [n_samples (or np.sum(group))]
        Integer-graded relevance scores.
    group_ids : 1-d np.array of shape = [n_samples (or np.sum(group))]
        Array of group ids, each value indicates to which group each record belongs.
    """
    rnd_generator = check_random_state(random_state)

    y_vec, group_id_vec = np.empty((0,), dtype=int), np.empty((0,), dtype=int)
    gid = 0

    # build target, group ID vectors.
    relvalues = range(gmax + 1)

    # build y/target and group-id vectors with user-specified group sizes.
    if group is not None and hasattr(group, '__len__'):
        n_samples = np.sum(group)

        for i, gsize in enumerate(group):
            y_vec = np.concatenate((y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True)))
            group_id_vec = np.concatenate((group_id_vec, [i] * gsize))

    # build y/target and group-id vectors according to n_samples, avg_gs, and random_gs.
    else:
        while len(y_vec) < n_samples:
            gsize = avg_gs if not random_gs else rnd_generator.poisson(avg_gs)

            # skip empty groups; each group needs at least one document for the pairwise learning objective.
            if gsize < 1:
                continue

            y_vec = np.append(y_vec, rnd_generator.choice(relvalues, size=gsize, replace=True))
            group_id_vec = np.append(group_id_vec, [gid] * gsize)
            gid += 1

        y_vec, group_id_vec = y_vec[:n_samples], group_id_vec[:n_samples]

    # build feature data, X. Transform the first few columns into informative features.
    n_informative = max(min(n_features, n_informative), 0)
    X = rnd_generator.uniform(size=(n_samples, n_features))

    for j in range(n_informative):
        bias, coef = rnd_generator.normal(size=2)
        X[:, j] = bias + coef * y_vec

    return X, y_vec, group_id_vec
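A quick usage sketch of make_ranking (mine, not part of the commit; the import path assumes running from the repository root). It shows the returned shapes, recovers LightGBM-style group sizes from group_ids, and checks what "informative" means here: informative columns are affine in y, so they correlate with it perfectly, while the remaining columns are uniform noise.

import numpy as np

from tests.python_package_test.utils import make_ranking

X, y, group_ids = make_ranking(n_samples=100, n_features=20, n_informative=5, random_state=42)

assert X.shape == (100, 20)
assert y.shape == group_ids.shape == (100,)

# group_ids is nondecreasing, so per-id counts are the run lengths a ranker can consume
_, group_sizes = np.unique(group_ids, return_counts=True)
assert group_sizes.sum() == 100

print(abs(np.corrcoef(X[:, 0], y)[0, 1]))   # ~1.0: an informative column
print(abs(np.corrcoef(X[:, 19], y)[0, 1]))  # ~0.0: a noise column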

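One more note on the commit's "do not cache" bullet: unlike the load_* helpers, make_ranking gets no @lru_cache. A plausible reason, my reading rather than anything stated in the diff, is that caching would hand every caller the same mutable ndarrays, so one test mutating its data could silently corrupt another's. A tiny demonstration of that hazard with a hypothetical helper:

from functools import lru_cache

import numpy as np


@lru_cache(maxsize=None)
def cached_data(n=5):
    return np.zeros(n)


a = cached_data()
b = cached_data()
a[0] = 42.0

print(b[0])    # 42.0: both names point at the single cached array
assert a is b  # the cache returns the same object on every call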