Skip to content

Commit

Permalink
pylint.
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis committed Mar 31, 2023
1 parent 0fcff38 commit 26a261b
Showing 1 changed file with 69 additions and 70 deletions.
139 changes: 69 additions & 70 deletions python-package/xgboost/testing/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,77 +430,76 @@ def rlencode(x: npt.NDArray[np.int32]) -> Tuple[npt.NDArray, npt.NDArray, npt.ND
return indptr, lengths, values


def simulate_clicks(cv_data: RelDataCV) -> ClickFold:
"""Simulate click data using position biased model (PBM)."""

def init_rank_score(
X: sparse.csr_matrix,
y: npt.NDArray[np.int32],
qid: npt.NDArray[np.int32],
sample_rate: float = 0.01,
) -> npt.NDArray[np.float32]:
"""We use XGBoost to generate the initial score instead of SVMRank for
simplicity.
"""
# random sample
_rng = np.random.default_rng(1994)
n_samples = int(X.shape[0] * sample_rate)
index = np.arange(0, X.shape[0], dtype=np.uint64)
_rng.shuffle(index)
index = index[:n_samples]

X_train = X[index]
y_train = y[index]
qid_train = qid[index]

# Sort training data based on query id, required by XGBoost.
sorted_idx = np.argsort(qid_train)
X_train = X_train[sorted_idx]
y_train = y_train[sorted_idx]
qid_train = qid_train[sorted_idx]

ltr = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
ltr.fit(X_train, y_train, qid=qid_train)

# Use the original order of the data.
scores = ltr.predict(X)
return scores

def simulate_one_fold(
fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
scores_fold: npt.NDArray[np.float32],
) -> ClickFold:
"""Simulate clicks for one fold."""
X_fold, y_fold, qid_fold = fold
assert qid_fold.dtype == np.int32

qids = np.unique(qid_fold)

position = np.empty((y_fold.size,), dtype=np.int64)
clicks = np.empty((y_fold.size,), dtype=np.int32)
pbm = PBM(eta=1.0)

# Avoid grouping by qid as we want to preserve the original data partition by
# the dataset authors.
for q in qids:
qid_mask = q == qid_fold
query_scores = scores_fold[qid_mask]
# Initial rank list, scores sorted to decreasing order
query_position = np.argsort(query_scores)[::-1]
position[qid_mask] = query_position
# get labels
relevance_degrees = y_fold[qid_mask]
query_clicks = pbm.sample_clicks_for_query(
relevance_degrees, query_position
)
clicks[qid_mask] = query_clicks

assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)

return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)
def init_rank_score(
    X: sparse.csr_matrix,
    y: npt.NDArray[np.int32],
    qid: npt.NDArray[np.int32],
    sample_rate: float = 0.01,
) -> npt.NDArray[np.float32]:
    """Train a ranker on a random subset of rows and score every row of ``X``.

    XGBoost produces the initial ranking scores here instead of SVMRank, purely
    for simplicity.

    Parameters
    ----------
    X :
        Feature matrix; rows are selected by integer index.
    y :
        Relevance labels, indexed the same way as the rows of ``X``.
    qid :
        Query id for each row.
    sample_rate :
        Fraction of the rows used to fit the ranker.

    Returns
    -------
    Predictions of the fitted ranker for all rows of ``X``, in input order.
    """
    n_total = X.shape[0]
    n_picked = int(n_total * sample_rate)

    # Fixed-seed subsample: shuffle all row ids, then keep the leading ones.
    generator = np.random.default_rng(1994)
    row_ids = np.arange(0, n_total, dtype=np.uint64)
    generator.shuffle(row_ids)
    picked = row_ids[:n_picked]

    # XGBoost requires the training rows to be sorted by query id.
    picked = picked[np.argsort(qid[picked])]

    ranker = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
    ranker.fit(X[picked], y[picked], qid=qid[picked])

    # Predict on the untouched input so the scores follow the original order.
    return ranker.predict(X)


def simulate_one_fold(
    fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
    scores_fold: npt.NDArray[np.float32],
) -> ClickFold:
    """Produce simulated clicks and positions for a single fold.

    Parameters
    ----------
    fold :
        Tuple of ``(X, y, qid)`` for the fold; ``qid`` must have dtype int32.
    scores_fold :
        Initial ranking score for each row of the fold.

    Returns
    -------
    A ``ClickFold`` built from the fold data, the scores, the sampled clicks and
    the per-row positions.
    """
    X_fold, y_fold, qid_fold = fold
    assert qid_fold.dtype == np.int32

    n_rows = y_fold.size
    position = np.empty((n_rows,), dtype=np.int64)
    clicks = np.empty((n_rows,), dtype=np.int32)
    click_model = PBM(eta=1.0)

    # Rows are selected with a mask per query id rather than regrouped by qid,
    # so the original data partition from the dataset authors is preserved.
    for query in np.unique(qid_fold):
        in_query = query == qid_fold
        # Initial rank list: argsort of the query's scores, reversed to
        # decreasing order.
        ranking = np.argsort(scores_fold[in_query])[::-1]
        position[in_query] = ranking
        # Labels of this query feed the click model together with the ranking.
        labels = y_fold[in_query]
        clicks[in_query] = click_model.sample_clicks_for_query(labels, ranking)

    assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
    assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)

    return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)


def simulate_clicks(cv_data: RelDataCV) -> ClickFold: # pylint: disable=too-many-locals
"""Simulate click data using position biased model (PBM)."""
X, y, qid = list(zip(cv_data.train, cv_data.test))

indptr = np.array([0] + [v.shape[0] for v in X])
Expand Down

0 comments on commit 26a261b

Please sign in to comment.