Skip to content

Commit

Permalink
fix sparse multiclass local feature contributions and add test (#3382)
Browse files Browse the repository at this point in the history
  • Loading branch information
imatiach-msft authored Sep 21, 2020
1 parent 1782fcb commit eff287e
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 5 deletions.
22 changes: 17 additions & 5 deletions src/c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,7 @@ class Booster {
out_indices, out_data, data_type, &is_data_float32, num_matrices);
std::vector<int> row_sizes(num_matrices * nrow);
std::vector<int64_t> row_matrix_offsets(num_matrices * nrow);
std::vector<int64_t> matrix_offsets(num_matrices);
int64_t row_vector_cnt = 0;
for (int m = 0; m < num_matrices; ++m) {
for (int64_t i = 0; i < static_cast<int64_t>(agg.size()); ++i) {
Expand All @@ -529,6 +530,12 @@ class Booster {
}
row_vector_cnt++;
}
if (m == 0) {
matrix_offsets[m] = 0;
}
if (m + 1 < num_matrices) {
matrix_offsets[m + 1] = static_cast<int64_t>(matrix_offsets[m] + row_matrix_offsets[row_vector_cnt - 1] + row_sizes[row_vector_cnt - 1]);
}
}
// copy vector results to output for each row
int64_t indptr_index = 0;
Expand All @@ -546,7 +553,7 @@ class Booster {
OMP_LOOP_EX_BEGIN();
auto row_vector = agg[i];
int64_t row_start_index = matrix_start_index + i;
int64_t element_index = row_matrix_offsets[row_start_index];
int64_t element_index = row_matrix_offsets[row_start_index] + matrix_offsets[m];
int64_t indptr_loop_index = indptr_index + i;
for (auto it = row_vector[m].begin(); it != row_vector[m].end(); ++it) {
(*out_indices)[element_index] = it->first;
Expand Down Expand Up @@ -646,13 +653,16 @@ class Booster {
} else {
(reinterpret_cast<int64_t*>(*out_col_ptr))[col_ptr_index] = last_column_start_index + last_column_size;
}
if (m != 0) {
matrix_start_indices[m] = matrix_start_indices[m - 1] +
last_column_start_index +
last_column_size;
if (m + 1 < num_matrices) {
matrix_start_indices[m + 1] = matrix_start_indices[m] + last_column_start_index + last_column_size;
}
col_ptr_index++;
}
// Note: we parallelize across matrices instead of rows because of the column_counts[m][col_idx] increment inside the loop
OMP_INIT_EX();
#pragma omp parallel for schedule(static)
for (int m = 0; m < num_matrices; ++m) {
OMP_LOOP_EX_BEGIN();
for (int64_t i = 0; i < static_cast<int64_t>(agg.size()); ++i) {
auto row_vector = agg[i];
for (auto it = row_vector[m].begin(); it != row_vector[m].end(); ++it) {
Expand All @@ -671,7 +681,9 @@ class Booster {
}
}
}
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
out_len[0] = elements_size;
out_len[1] = col_ptr_size;
}
Expand Down
45 changes: 45 additions & 0 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1034,6 +1034,51 @@ def test_contribs_sparse(self):
# validate the values are the same
np.testing.assert_allclose(contribs_csc.toarray(), contribs_dense)

def test_contribs_sparse_multiclass(self):
    """Sparse (CSR/CSC) per-class feature contributions must equal the dense ones.

    Regression test for the multiclass local-feature-contribution path:
    predictions with ``pred_contrib=True`` on sparse input return one sparse
    matrix per class, and their stacked values must match the flat dense
    output exactly.
    """
    num_features = 20
    num_samples = 100
    num_classes = 4
    # Build a sparse (CSR) dataset; n_classes=1 with n_labels=4 plus
    # flatten() yields a single multiclass target column.
    X, y = make_multilabel_classification(n_samples=num_samples,
                                          sparse=True,
                                          n_features=num_features,
                                          n_classes=1,
                                          n_labels=num_classes)
    y = y.flatten()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    params = {
        'objective': 'multiclass',
        'num_class': num_classes,
        'verbose': -1,
    }
    booster = lgb.train(params, lgb.Dataset(X_train, y_train), num_boost_round=20)
    # CSR input -> expect a list with one CSR matrix per class.
    contribs_csr = booster.predict(X_test, pred_contrib=True)
    self.assertTrue(isinstance(contribs_csr, list))
    for class_contribs in contribs_csr:
        self.assertTrue(isspmatrix_csr(class_contribs))
    # Dense predictions are the reference values.
    contribs_dense = booster.predict(X_test.toarray(), pred_contrib=True)
    # Stack per-class matrices into (row, class, feature+bias) and flatten
    # the trailing two axes to line up with the dense layout.
    per_row_csr = np.swapaxes(np.array([m.todense() for m in contribs_csr]), 0, 1)
    n_rows, n_cls, n_cols = per_row_csr.shape
    np.testing.assert_allclose(per_row_csr.reshape((n_rows, n_cls * n_cols)), contribs_dense)
    # Contributions summed over features should reproduce the raw scores.
    contribs_dense_cube = contribs_dense.reshape(per_row_csr.shape)
    raw_scores = booster.predict(X_test, raw_score=True)
    self.assertLess(np.linalg.norm(raw_scores - np.sum(contribs_dense_cube, axis=2)), 1e-4)
    # Repeat the equivalence check with CSC input.
    contribs_csc = booster.predict(X_test.tocsc(), pred_contrib=True)
    self.assertTrue(isinstance(contribs_csc, list))
    for class_contribs in contribs_csc:
        self.assertTrue(isspmatrix_csc(class_contribs))
    per_row_csc = np.swapaxes(np.array([m.todense() for m in contribs_csc]), 0, 1)
    per_row_csc = per_row_csc.reshape((per_row_csc.shape[0],
                                       per_row_csc.shape[1] * per_row_csc.shape[2]))
    np.testing.assert_allclose(per_row_csc, contribs_dense)

@unittest.skipIf(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, 'not enough RAM')
def test_int32_max_sparse_contribs(self):
params = {
Expand Down

0 comments on commit eff287e

Please sign in to comment.