[python] replace numpy.zeros with numpy.empty for the speedup (#4410)
StrikerRUS authored Jun 27, 2021
1 parent db3915c commit 45ac271
Showing 4 changed files with 20 additions and 22 deletions.
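Why `np.empty` helps: `np.zeros` has to hand back zero-initialized memory, while `np.empty` skips that fill pass and returns uninitialized memory. That is safe whenever every element of the buffer is written before it is read — which holds for every allocation touched by this commit. A minimal timing sketch (the size is illustrative and the measured gap varies by platform and allocator; it is not a benchmark from this PR):

```python
import timeit
import numpy as np

n = 10_000_000  # illustrative size; the gap grows with the allocation

def with_zeros():
    preds = np.zeros(n, dtype=np.float64)  # pays for a zero-initialization pass
    preds[:] = 1.0                         # ...and is then fully overwritten anyway
    return preds

def with_empty():
    preds = np.empty(n, dtype=np.float64)  # uninitialized memory, no fill pass
    preds[:] = 1.0                         # every element written before any read
    return preds

print("zeros:", timeit.timeit(with_zeros, number=20))
print("empty:", timeit.timeit(with_empty, number=20))
```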
26 changes: 13 additions & 13 deletions python-package/lightgbm/basic.py
@@ -780,7 +780,7 @@ def inner_predict(mat, start_iteration, num_iteration, predict_type, preds=None)
ptr_data, type_ptr_data, _ = c_float_array(data)
n_preds = self.__get_num_preds(start_iteration, num_iteration, mat.shape[0], predict_type)
if preds is None:
- preds = np.zeros(n_preds, dtype=np.float64)
+ preds = np.empty(n_preds, dtype=np.float64)
elif len(preds.shape) != 1 or len(preds) != n_preds:
raise ValueError("Wrong length of pre-allocated predict array")
out_num_preds = ctypes.c_int64(0)
@@ -807,7 +807,7 @@ def inner_predict(mat, start_iteration, num_iteration, predict_type, preds=None)
# __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff([0] + list(sections) + [nrow])]
n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
- preds = np.zeros(sum(n_preds), dtype=np.float64)
+ preds = np.empty(sum(n_preds), dtype=np.float64)
for chunk, (start_idx_pred, end_idx_pred) in zip(np.array_split(mat, sections),
zip(n_preds_sections, n_preds_sections[1:])):
# avoid memory consumption by arrays concatenation operations
@@ -868,7 +868,7 @@ def inner_predict(csr, start_iteration, num_iteration, predict_type, preds=None)
nrow = len(csr.indptr) - 1
n_preds = self.__get_num_preds(start_iteration, num_iteration, nrow, predict_type)
if preds is None:
- preds = np.zeros(n_preds, dtype=np.float64)
+ preds = np.empty(n_preds, dtype=np.float64)
elif len(preds.shape) != 1 or len(preds) != n_preds:
raise ValueError("Wrong length of pre-allocated predict array")
out_num_preds = ctypes.c_int64(0)
@@ -913,7 +913,7 @@ def inner_predict_sparse(csr, start_iteration, num_iteration, predict_type)
out_ptr_data = ctypes.POINTER(ctypes.c_float)()
else:
out_ptr_data = ctypes.POINTER(ctypes.c_double)()
- out_shape = np.zeros(2, dtype=np.int64)
+ out_shape = np.empty(2, dtype=np.int64)
_safe_call(_LIB.LGBM_BoosterPredictSparseOutput(
self.handle,
ptr_indptr,
@@ -946,7 +946,7 @@ def inner_predict_sparse(csr, start_iteration, num_iteration, predict_type)
# __get_num_preds() cannot work with nrow > MAX_INT32, so calculate overall number of predictions piecemeal
n_preds = [self.__get_num_preds(start_iteration, num_iteration, i, predict_type) for i in np.diff(sections)]
n_preds_sections = np.array([0] + n_preds, dtype=np.intp).cumsum()
- preds = np.zeros(sum(n_preds), dtype=np.float64)
+ preds = np.empty(sum(n_preds), dtype=np.float64)
for (start_idx, end_idx), (start_idx_pred, end_idx_pred) in zip(zip(sections, sections[1:]),
zip(n_preds_sections, n_preds_sections[1:])):
# avoid memory consumption by arrays concatenation operations
@@ -971,7 +971,7 @@ def inner_predict_sparse(csc, start_iteration, num_iteration, predict_type)
out_ptr_data = ctypes.POINTER(ctypes.c_float)()
else:
out_ptr_data = ctypes.POINTER(ctypes.c_double)()
- out_shape = np.zeros(2, dtype=np.int64)
+ out_shape = np.empty(2, dtype=np.int64)
_safe_call(_LIB.LGBM_BoosterPredictSparseOutput(
self.handle,
ptr_indptr,
@@ -1002,7 +1002,7 @@ def inner_predict_sparse(csc, start_iteration, num_iteration, predict_type)
if predict_type == C_API_PREDICT_CONTRIB:
return inner_predict_sparse(csc, start_iteration, num_iteration, predict_type)
n_preds = self.__get_num_preds(start_iteration, num_iteration, nrow, predict_type)
- preds = np.zeros(n_preds, dtype=np.float64)
+ preds = np.empty(n_preds, dtype=np.float64)
out_num_preds = ctypes.c_int64(0)

ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr)
@@ -1176,15 +1176,15 @@ def _set_init_score_by_predictor(self, predictor, data, used_indices=None)
if used_indices is not None:
assert not self.need_slice
if isinstance(data, str):
- sub_init_score = np.zeros(num_data * predictor.num_class, dtype=np.float32)
+ sub_init_score = np.empty(num_data * predictor.num_class, dtype=np.float32)
assert num_data == len(used_indices)
for i in range(len(used_indices)):
for j in range(predictor.num_class):
sub_init_score[i * predictor.num_class + j] = init_score[used_indices[i] * predictor.num_class + j]
init_score = sub_init_score
if predictor.num_class > 1:
# need to regroup init_score
- new_init_score = np.zeros(init_score.size, dtype=np.float32)
+ new_init_score = np.empty(init_score.size, dtype=np.float32)
for i in range(num_data):
for j in range(predictor.num_class):
new_init_score[j * num_data + i] = init_score[i * predictor.num_class + j]
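The regrouping loop above converts per-row class scores (row-major, `old[i * num_class + j]`) into per-class blocks (`new[j * num_data + i]`), which is why every slot of the `np.empty` buffer is guaranteed to be written. A vectorized sketch of the same transform, with hypothetical sizes, for anyone reading the loop:

```python
import numpy as np

num_data, num_class = 4, 3  # hypothetical sizes
init_score = np.arange(num_data * num_class, dtype=np.float32)

# Vectorized equivalent of the loop: reshape row-major, transpose, flatten.
regrouped = init_score.reshape(num_data, num_class).T.reshape(-1)

# Matches new[j * num_data + i] = old[i * num_class + j] element for element.
check = np.empty(num_data * num_class, dtype=np.float32)
for i in range(num_data):
    for j in range(num_class):
        check[j * num_data + i] = init_score[i * num_class + j]
assert np.array_equal(regrouped, check)
```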
@@ -1320,7 +1320,7 @@ def __init_from_np2d(self, mat, params_str, ref_dataset)
def __init_from_list_np2d(self, mats, params_str, ref_dataset):
"""Initialize data from a list of 2-D numpy matrices."""
ncol = mats[0].shape[1]
- nrow = np.zeros((len(mats),), np.int32)
+ nrow = np.empty((len(mats),), np.int32)
if mats[0].dtype == np.float64:
ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))()
else:
@@ -3310,7 +3310,7 @@ def feature_importance(self, importance_type='split', iteration=None)
if iteration is None:
iteration = self.best_iteration
importance_type_int = FEATURE_IMPORTANCE_TYPE_MAPPER[importance_type]
- result = np.zeros(self.num_feature(), dtype=np.float64)
+ result = np.empty(self.num_feature(), dtype=np.float64)
_safe_call(_LIB.LGBM_BoosterFeatureImportance(
self.handle,
ctypes.c_int(iteration),
@@ -3397,7 +3397,7 @@ def __inner_eval(self, data_name, data_idx, feval=None)
self.__get_eval_info()
ret = []
if self.__num_inner_eval > 0:
- result = np.zeros(self.__num_inner_eval, dtype=np.float64)
+ result = np.empty(self.__num_inner_eval, dtype=np.float64)
tmp_out_len = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetEval(
self.handle,
@@ -3437,7 +3437,7 @@ def __inner_predict(self, data_idx)
n_preds = self.train_set.num_data() * self.__num_class
else:
n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
- self.__inner_predict_buffer[data_idx] = np.zeros(n_preds, dtype=np.float64)
+ self.__inner_predict_buffer[data_idx] = np.empty(n_preds, dtype=np.float64)
# avoid to predict many time in one iteration
if not self.__is_predicted_cur_iter[data_idx]:
tmp_out_len = ctypes.c_int64(0)
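Several hunks above share one pattern: `__get_num_preds()` cannot handle `nrow > MAX_INT32`, so prediction runs piecemeal, and the comment `# avoid memory consumption by arrays concatenation operations` marks where a single flat buffer is preallocated and filled slice by slice rather than concatenating per-chunk results (which would briefly hold roughly twice the memory). A sketch of that pattern with hypothetical names (`predict_in_chunks`, `predict_chunk`, and `preds_per_row` are not LightGBM API):

```python
import numpy as np

def predict_in_chunks(predict_chunk, chunks, preds_per_row=1):
    """Preallocate one flat output and fill it slice by slice,
    instead of concatenating per-chunk result arrays."""
    n_preds = [len(chunk) * preds_per_row for chunk in chunks]
    offsets = np.concatenate(([0], np.cumsum(n_preds)))  # slice bounds per chunk
    preds = np.empty(offsets[-1], dtype=np.float64)      # safe: fully overwritten below
    for chunk, start, end in zip(chunks, offsets[:-1], offsets[1:]):
        preds[start:end] = predict_chunk(chunk)
    return preds

# e.g. row sums computed piecemeal over 4 chunks of a matrix
mat = np.random.rand(1000, 8)
out = predict_in_chunks(lambda c: c.sum(axis=1), np.array_split(mat, 4))
assert np.allclose(out, mat.sum(axis=1))
```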
6 changes: 3 additions & 3 deletions python-package/lightgbm/engine.py
@@ -333,7 +333,7 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
else:
flatted_group = np.zeros(num_data, dtype=np.int32)
- folds = folds.split(X=np.zeros(num_data), y=full_data.get_label(), groups=flatted_group)
+ folds = folds.split(X=np.empty(num_data), y=full_data.get_label(), groups=flatted_group)
else:
if any(params.get(obj_alias, "") in {"lambdarank", "rank_xendcg", "xendcg",
"xe_ndcg", "xe_ndcg_mart", "xendcg_mart"}
@@ -344,12 +344,12 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
group_kfold = _LGBMGroupKFold(n_splits=nfold)
- folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group)
+ folds = group_kfold.split(X=np.empty(num_data), groups=flatted_group)
elif stratified:
if not SKLEARN_INSTALLED:
raise LightGBMError('scikit-learn is required for stratified cv')
skf = _LGBMStratifiedKFold(n_splits=nfold, shuffle=shuffle, random_state=seed)
- folds = skf.split(X=np.zeros(num_data), y=full_data.get_label())
+ folds = skf.split(X=np.empty(num_data), y=full_data.get_label())
else:
if shuffle:
randidx = np.random.RandomState(seed).permutation(num_data)
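The `engine.py` changes are safe because scikit-learn splitters only inspect `X` to count samples; the fold indices are derived from `y` and `groups`, so an uninitialized placeholder works as well as a zeroed one. A small illustration:

```python
import numpy as np
from sklearn.model_selection import StratifiedKFold

y = np.random.RandomState(0).choice([0, 1], size=100)

# X is only inspected for its length; the fold indices come from y.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, test_idx in skf.split(X=np.empty(100), y=y):
    print(len(train_idx), len(test_idx))  # 80 20 on every fold
```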
2 changes: 1 addition & 1 deletion tests/c_api_test/test_.py
@@ -268,7 +268,7 @@ def test_booster():
for line in inp.readlines():
data.append([float(x) for x in line.split('\t')[1:]])
mat = np.array(data, dtype=np.float64)
- preb = np.zeros(mat.shape[0], dtype=np.float64)
+ preb = np.empty(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_int64(0)
data = np.array(mat.reshape(mat.size), dtype=np.float64, copy=False)
LIB.LGBM_BoosterPredictForMat(
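In the C API test, `preb` is handed to `LGBM_BoosterPredictForMat`, which writes all `mat.shape[0]` entries, so the buffer never needs a zero fill. The same preallocated-buffer idiom in isolation, using `ctypes.memmove` as a stand-in for the native call:

```python
import ctypes
import numpy as np

# Allocate the output with np.empty and hand its pointer to native code
# that is guaranteed to write every element.
src = np.arange(100, dtype=np.float64)
dst = np.empty(100, dtype=np.float64)   # no zero fill needed: overwritten next
ctypes.memmove(dst.ctypes.data, src.ctypes.data, dst.nbytes)
assert np.array_equal(dst, src)
```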
8 changes: 3 additions & 5 deletions tests/python_package_test/test_engine.py
@@ -1441,9 +1441,8 @@ def test_max_bin_by_feature():
def test_small_max_bin():
np.random.seed(0)
y = np.random.choice([0, 1], 100)
- x = np.zeros((100, 1))
+ x = np.ones((100, 1))
x[:30, 0] = -1
- x[30:60, 0] = 1
x[60:, 0] = 2
params = {'objective': 'binary',
'seed': 0,
@@ -2259,7 +2258,7 @@ def test_node_level_subcol():


def test_forced_bins():
- x = np.zeros((100, 2))
+ x = np.empty((100, 2))
x[:, 0] = np.arange(0, 1, 0.01)
x[:, 1] = -np.arange(0, 1, 0.01)
y = np.arange(0, 1, 0.01)
@@ -2275,7 +2274,6 @@ def test_forced_bins():
est = lgb.train(params, lgb_x, num_boost_round=20)
new_x = np.zeros((3, x.shape[1]))
new_x[:, 0] = [0.31, 0.37, 0.41]
- new_x[:, 1] = [0, 0, 0]
predicted = est.predict(new_x)
assert len(np.unique(predicted)) == 3
new_x[:, 0] = [0, 0, 0]
@@ -2300,7 +2298,7 @@ def test_forced_bins():

def test_binning_same_sign():
# test that binning works properly for features with only positive or only negative values
- x = np.zeros((99, 2))
+ x = np.empty((99, 2))
x[:, 0] = np.arange(0.01, 1, 0.01)
x[:, 1] = -np.arange(0.01, 1, 0.01)
y = np.arange(0.01, 1, 0.01)
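The test changes also show the one rule that makes this commit safe: `np.empty` is a valid substitute only when every element is assigned before any read. Where a test actually relied on the zero fill, the commit initializes explicitly instead (`np.ones` in `test_small_max_bin`; both columns of `x` are assigned in `test_forced_bins` and `test_binning_same_sign`). A sketch of the safe and unsafe cases:

```python
import numpy as np

# Safe: every element is assigned before any read.
x = np.empty((100, 2))
x[:, 0] = np.arange(0, 1, 0.01)
x[:, 1] = -np.arange(0, 1, 0.01)

# Unsafe: column 1 is never written, so its contents are arbitrary garbage.
bad = np.empty((100, 2))
bad[:, 0] = np.arange(0, 1, 0.01)
# reading bad[:, 1] here would yield whatever bytes the allocator returned
```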
