diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py
index 0b4f5bdbfd40..e09044f54e62 100644
--- a/python-package/xgboost/testing/data.py
+++ b/python-package/xgboost/testing/data.py
@@ -430,77 +430,76 @@ def rlencode(x: npt.NDArray[np.int32]) -> Tuple[npt.NDArray, npt.NDArray, npt.ND
     return indptr, lengths, values
 
 
-def simulate_clicks(cv_data: RelDataCV) -> ClickFold:
-    """Simulate click data using position biased model (PBM)."""
-
-    def init_rank_score(
-        X: sparse.csr_matrix,
-        y: npt.NDArray[np.int32],
-        qid: npt.NDArray[np.int32],
-        sample_rate: float = 0.01,
-    ) -> npt.NDArray[np.float32]:
-        """We use XGBoost to generate the initial score instead of SVMRank for
-        simplicity.
-
-        """
-        # random sample
-        _rng = np.random.default_rng(1994)
-        n_samples = int(X.shape[0] * sample_rate)
-        index = np.arange(0, X.shape[0], dtype=np.uint64)
-        _rng.shuffle(index)
-        index = index[:n_samples]
-
-        X_train = X[index]
-        y_train = y[index]
-        qid_train = qid[index]
-
-        # Sort training data based on query id, required by XGBoost.
-        sorted_idx = np.argsort(qid_train)
-        X_train = X_train[sorted_idx]
-        y_train = y_train[sorted_idx]
-        qid_train = qid_train[sorted_idx]
-
-        ltr = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
-        ltr.fit(X_train, y_train, qid=qid_train)
-
-        # Use the original order of the data.
-        scores = ltr.predict(X)
-        return scores
-
-    def simulate_one_fold(
-        fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
-        scores_fold: npt.NDArray[np.float32],
-    ) -> ClickFold:
-        """Simulate clicks for one fold."""
-        X_fold, y_fold, qid_fold = fold
-        assert qid_fold.dtype == np.int32
-
-        qids = np.unique(qid_fold)
-
-        position = np.empty((y_fold.size,), dtype=np.int64)
-        clicks = np.empty((y_fold.size,), dtype=np.int32)
-        pbm = PBM(eta=1.0)
-
-        # Avoid grouping by qid as we want to preserve the original data partition by
-        # the dataset authors.
-        for q in qids:
-            qid_mask = q == qid_fold
-            query_scores = scores_fold[qid_mask]
-            # Initial rank list, scores sorted to decreasing order
-            query_position = np.argsort(query_scores)[::-1]
-            position[qid_mask] = query_position
-            # get labels
-            relevance_degrees = y_fold[qid_mask]
-            query_clicks = pbm.sample_clicks_for_query(
-                relevance_degrees, query_position
-            )
-            clicks[qid_mask] = query_clicks
-
-        assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
-        assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)
-
-        return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)
+def init_rank_score(
+    X: sparse.csr_matrix,
+    y: npt.NDArray[np.int32],
+    qid: npt.NDArray[np.int32],
+    sample_rate: float = 0.01,
+) -> npt.NDArray[np.float32]:
+    """We use XGBoost to generate the initial score instead of SVMRank for
+    simplicity.
+    """
+    # random sample
+    rng = np.random.default_rng(1994)
+    n_samples = int(X.shape[0] * sample_rate)
+    index = np.arange(0, X.shape[0], dtype=np.uint64)
+    rng.shuffle(index)
+    index = index[:n_samples]
+
+    X_train = X[index]
+    y_train = y[index]
+    qid_train = qid[index]
+
+    # Sort training data based on query id, required by XGBoost.
+    sorted_idx = np.argsort(qid_train)
+    X_train = X_train[sorted_idx]
+    y_train = y_train[sorted_idx]
+    qid_train = qid_train[sorted_idx]
+
+    ltr = xgboost.XGBRanker(objective="rank:ndcg", tree_method="hist")
+    ltr.fit(X_train, y_train, qid=qid_train)
+
+    # Use the original order of the data.
+    scores = ltr.predict(X)
+    return scores
+
+
+def simulate_one_fold(
+    fold: Tuple[sparse.csr_matrix, npt.NDArray[np.int32], npt.NDArray[np.int32]],
+    scores_fold: npt.NDArray[np.float32],
+) -> ClickFold:
+    """Simulate clicks for one fold."""
+    X_fold, y_fold, qid_fold = fold
+    assert qid_fold.dtype == np.int32
+
+    qids = np.unique(qid_fold)
+
+    position = np.empty((y_fold.size,), dtype=np.int64)
+    clicks = np.empty((y_fold.size,), dtype=np.int32)
+    pbm = PBM(eta=1.0)
+
+    # Avoid grouping by qid as we want to preserve the original data partition by
+    # the dataset authors.
+    for q in qids:
+        qid_mask = q == qid_fold
+        query_scores = scores_fold[qid_mask]
+        # Initial rank list, scores sorted to decreasing order
+        query_position = np.argsort(query_scores)[::-1]
+        position[qid_mask] = query_position
+        # get labels
+        relevance_degrees = y_fold[qid_mask]
+        query_clicks = pbm.sample_clicks_for_query(relevance_degrees, query_position)
+        clicks[qid_mask] = query_clicks
+
+    assert X_fold.shape[0] == qid_fold.shape[0], (X_fold.shape, qid_fold.shape)
+    assert X_fold.shape[0] == clicks.shape[0], (X_fold.shape, clicks.shape)
+
+    return ClickFold(X_fold, y_fold, qid_fold, scores_fold, clicks, position)
+
+
+def simulate_clicks(cv_data: RelDataCV) -> ClickFold:  # pylint: disable=too-many-locals
+    """Simulate click data using position biased model (PBM)."""
 
     X, y, qid = list(zip(cv_data.train, cv_data.test))
     indptr = np.array([0] + [v.shape[0] for v in X])
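
For context, a minimal sketch of how the helpers promoted to module level by this patch could be exercised directly. The import path assumes the patch is applied; the toy sparse features, relevance labels, and query ids below are illustrative only and are not part of this change.

```python
# Sketch only: exercise init_rank_score / simulate_one_fold on synthetic data.
import numpy as np
from scipy import sparse

from xgboost.testing.data import init_rank_score, simulate_one_fold

rng = np.random.default_rng(2024)
n_samples, n_features = 1000, 20

# Sparse feature matrix, graded relevance labels, and int32 query ids
# (simulate_one_fold asserts that qid is int32).
X = sparse.random(n_samples, n_features, density=0.2, format="csr", random_state=2024)
y = rng.integers(0, 5, size=n_samples, dtype=np.int32)
qid = np.sort(rng.integers(0, 50, size=n_samples, dtype=np.int32))

# Initial ranker scores from a quick XGBRanker fit on a subsample.
scores = init_rank_score(X, y, qid, sample_rate=0.2)

# Simulated clicks and positions for this single (X, y, qid) fold.
fold = simulate_one_fold((X, y, qid), scores)
print(type(fold).__name__)  # ClickFold
```

`simulate_clicks` itself applies the same per-fold simulation to the train/test splits carried by `RelDataCV`, as the context lines above show.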