diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index c2ee3280e56d..a7f03c34915d 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -104,6 +104,18 @@ if [[ $OS_NAME == "macos" ]]; then sudo installer \ -pkg $(pwd)/R.pkg \ -target / || exit 1 + + # install tidy v5.8.0 + # ref: https://groups.google.com/g/r-sig-mac/c/7u_ivEj4zhM + TIDY_URL=https://github.com/htacg/tidy-html5/releases/download/5.8.0/tidy-5.8.0-macos-x86_64+arm64.pkg + curl -sL ${TIDY_URL} -o tidy.pkg + sudo installer \ + -pkg $(pwd)/tidy.pkg \ + -target / + + # ensure that this newer version of 'tidy' is used by 'R CMD check' + # ref: https://cran.r-project.org/doc/manuals/R-exts.html#Checking-packages + export R_TIDYCMD=/usr/local/bin/tidy fi # fix for issue where CRAN was not returning {lattice} and {evaluate} when using R 3.6 @@ -263,20 +275,25 @@ fi # this check makes sure that CI builds of the package # actually use MM_PREFETCH preprocessor definition -if [[ $R_BUILD_TYPE == "cran" ]]; then - mm_prefetch_working=$( - cat $BUILD_LOG_FILE \ - | grep --count -E "checking whether MM_PREFETCH work.*yes" - ) -else - mm_prefetch_working=$( - cat $BUILD_LOG_FILE \ - | grep --count -E ".*Performing Test MM_PREFETCH - Success" - ) -fi -if [[ $mm_prefetch_working -ne 1 ]]; then - echo "MM_PREFETCH test was not passed" - exit 1 +# +# _mm_prefetch will not work on arm64 architecture +# ref: https://github.com/microsoft/LightGBM/issues/4124 +if [[ $ARCH != "arm64" ]]; then + if [[ $R_BUILD_TYPE == "cran" ]]; then + mm_prefetch_working=$( + cat $BUILD_LOG_FILE \ + | grep --count -E "checking whether MM_PREFETCH work.*yes" + ) + else + mm_prefetch_working=$( + cat $BUILD_LOG_FILE \ + | grep --count -E ".*Performing Test MM_PREFETCH - Success" + ) + fi + if [[ $mm_prefetch_working -ne 1 ]]; then + echo "MM_PREFETCH test was not passed" + exit 1 + fi fi # this check makes sure that CI builds of the package diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index 3bcbf7ea4f79..acf5f2407c2e 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -55,12 +55,6 @@ jobs: r_version: 4.3 build_type: cmake container: 'ubuntu:22.04' - - os: ubuntu-latest - task: r-package - compiler: clang - r_version: 3.6 - build_type: cmake - container: 'ubuntu:18.04' - os: ubuntu-latest task: r-package compiler: clang @@ -138,6 +132,13 @@ jobs: r_version: 4.3 build_type: cran container: null + # macos-14 = arm64 + - os: macos-14 + task: r-package + compiler: clang + r_version: 4.3 + build_type: cran + container: null steps: - name: Prevent conversion of line endings on Windows if: startsWith(matrix.os, 'windows') @@ -188,12 +189,12 @@ jobs: CTAN_MIRROR: https://ctan.math.illinois.edu/systems/win32/miktex TINYTEX_INSTALLER: TinyTeX - name: Setup and run tests on Linux and macOS - if: matrix.os == 'macos-13' || matrix.os == 'ubuntu-latest' + if: startsWith(matrix.os, 'macos') || startsWith(matrix.os, 'ubuntu') shell: bash run: | export TASK="${{ matrix.task }}" export COMPILER="${{ matrix.compiler }}" - if [[ "${{ matrix.os }}" == "macos-13" ]]; then + if [[ "${{ matrix.os }}" =~ ^macos ]]; then export OS_NAME="macos" elif [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then export OS_NAME="linux" diff --git a/CMakeLists.txt b/CMakeLists.txt index 09eaaa214261..c287b6b31039 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,6 @@ option(USE_MPI "Enable MPI-based distributed learning" OFF) option(USE_OPENMP "Enable OpenMP" ON) option(USE_GPU "Enable GPU-accelerated training" OFF) option(USE_SWIG "Enable SWIG to generate Java API" OFF) -option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) option(USE_TIMETAG "Set to ON to output time costs" OFF) option(USE_CUDA "Enable CUDA-accelerated training " OFF) option(USE_DEBUG "Set to ON for Debug mode" OFF) @@ -294,21 +293,6 @@ if(USE_CUDA) endforeach() endif() -if(USE_HDFS) - message( - DEPRECATION - "HDFS support in LightGBM is deprecated, and will be removed in a future release.\ - See https://github.com/microsoft/LightGBM/issues/6436. - " - ) - find_package(JNI REQUIRED) - find_path(HDFS_INCLUDE_DIR hdfs.h REQUIRED) - find_library(HDFS_LIB NAMES hdfs REQUIRED) - include_directories(${HDFS_INCLUDE_DIR}) - add_definitions(-DUSE_HDFS) - set(HDFS_CXX_LIBRARIES ${HDFS_LIB} ${JAVA_JVM_LIBRARY}) -endif() - include(CheckCXXSourceCompiles) check_cxx_source_compiles(" #include @@ -647,10 +631,6 @@ if(USE_CUDA) target_link_libraries(_lightgbm PRIVATE ${histograms}) endif() -if(USE_HDFS) - target_link_libraries(lightgbm_objs PUBLIC ${HDFS_CXX_LIBRARIES}) -endif() - if(WIN32) if(MINGW OR CYGWIN) target_link_libraries(lightgbm_objs PUBLIC ws2_32 iphlpapi) diff --git a/build-python.sh b/build-python.sh index 01c3cf7c3e02..afb4667acf97 100755 --- a/build-python.sh +++ b/build-python.sh @@ -40,8 +40,6 @@ # Compile CUDA version. # --gpu # Compile GPU version. -# --hdfs -# Compile HDFS version. # --integrated-opencl # Compile integrated OpenCL version. # --mingw @@ -148,9 +146,6 @@ while [ $# -gt 0 ]; do --gpu) BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_GPU=ON" ;; - --hdfs) - BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_HDFS=ON" - ;; --integrated-opencl) BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.__INTEGRATE_OPENCL=ON" ;; diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index 9b774da7f958..18fcd30021ef 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -618,39 +618,6 @@ Windows The CUDA version is not supported on Windows. Use the GPU version (``device_type=gpu``) for GPU acceleration on Windows. -Build HDFS Version -~~~~~~~~~~~~~~~~~~ - -.. warning:: - HDFS support in LightGBM is deprecated, and will be removed in a future release. - See https://github.com/microsoft/LightGBM/issues/6436. - -The HDFS version of LightGBM was tested on CDH-5.14.4 cluster. - -Linux -^^^^^ - -On Linux a HDFS version of LightGBM can be built using **CMake** and **gcc**. - -1. Install `CMake`_. - -2. Run the following commands: - - .. code:: sh - - git clone --recursive https://github.com/microsoft/LightGBM - cd LightGBM - cmake -B build -S . -DUSE_HDFS=ON - # if you have installed HDFS to a customized location, you should specify paths to HDFS headers (hdfs.h) and library (libhdfs.so) like the following: - # cmake \ - # -DUSE_HDFS=ON \ - # -DHDFS_LIB="/opt/cloudera/parcels/CDH-5.14.4-1.cdh5.14.4.p0.3/lib64/libhdfs.so" \ - # -DHDFS_INCLUDE_DIR="/opt/cloudera/parcels/CDH-5.14.4-1.cdh5.14.4.p0.3/include/" \ - # .. - cmake --build build -j4 - -**Note**: In some rare cases you may need to install OpenMP runtime library separately (use your package manager and search for ``lib[g|i]omp`` for doing this). - Build Java Wrapper ~~~~~~~~~~~~~~~~~~ diff --git a/docs/_static/js/script.js b/docs/_static/js/script.js index 89d14d14aaf0..107a6a4969a3 100644 --- a/docs/_static/js/script.js +++ b/docs/_static/js/script.js @@ -29,7 +29,6 @@ $(function() { '#build-mpi-version', '#build-gpu-version', '#build-cuda-version', - '#build-hdfs-version', '#build-java-wrapper', '#build-c-unit-tests' ]; diff --git a/python-package/README.rst b/python-package/README.rst index 463cc0247a9f..bcaab539fb0a 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -151,23 +151,6 @@ All requirements from `Build from Sources section <#build-from-sources>`__ apply To use the CUDA version within Python, pass ``{"device": "cuda"}`` respectively in parameters. -Build HDFS Version -~~~~~~~~~~~~~~~~~~ - -.. warning:: - HDFS support in LightGBM is deprecated, and will be removed in a future release. - See https://github.com/microsoft/LightGBM/issues/6436. - -.. code:: sh - - pip install lightgbm --config-settings=cmake.define.USE_HDFS=ON - -All requirements from `Build from Sources section <#build-from-sources>`__ apply for this installation option as well. - -**HDFS** library is needed: details for installation can be found in `Installation Guide `__. - -Note that the installation process of HDFS version was tested only on **Linux**. - Build with MinGW-w64 on Windows ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -253,8 +236,6 @@ Run ``sh ./build-python.sh install --gpu`` to enable GPU support. All requiremen Run ``sh ./build-python.sh install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. -Run ``sh ./build-python.sh install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well. - Run ``sh ./build-python.sh install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well. Run ``sh ./build-python.sh install --time-costs``, if you want to output time costs for different internal routines. All requirements from `Build with Time Costs Output section <#build-with-time-costs-output>`__ apply for this installation option as well. diff --git a/src/io/config.cpp b/src/io/config.cpp index 7516ddbd4ac6..c63de70fc16b 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -40,9 +40,24 @@ void GetFirstValueAsInt(const std::unordered_map>& params) { - int verbosity = Config().verbosity; - GetFirstValueAsInt(params, "verbose", &verbosity); - GetFirstValueAsInt(params, "verbosity", &verbosity); + int verbosity = 1; + + // if "verbosity" was found in params, prefer that to any other aliases + const auto verbosity_iter = params.find("verbosity"); + if (verbosity_iter != params.end()) { + GetFirstValueAsInt(params, "verbosity", &verbosity); + } else { + // if "verbose" was found in params and "verbosity" was not, use that value + const auto verbose_iter = params.find("verbose"); + if (verbose_iter != params.end()) { + GetFirstValueAsInt(params, "verbose", &verbosity); + } else { + // if "verbosity" and "verbose" were both missing from params, don't modify LightGBM's log level + return; + } + } + + // otherwise, update LightGBM's log level based on the passed-in value if (verbosity < 0) { LightGBM::Log::ResetLogLevel(LightGBM::LogLevel::Fatal); } else if (verbosity == 0) { diff --git a/src/io/file_io.cpp b/src/io/file_io.cpp index a2721e96c2dd..0dda86620e87 100644 --- a/src/io/file_io.cpp +++ b/src/io/file_io.cpp @@ -11,10 +11,6 @@ #include #include -#ifdef USE_HDFS -#include -#endif - namespace LightGBM { struct LocalFile : VirtualFileReader, VirtualFileWriter { @@ -56,142 +52,17 @@ struct LocalFile : VirtualFileReader, VirtualFileWriter { const std::string mode_; }; -const char* kHdfsProto = "hdfs://"; - -#ifdef USE_HDFS -const size_t kHdfsProtoLength = static_cast(strlen(kHdfsProto)); - -struct HDFSFile : VirtualFileReader, VirtualFileWriter { - HDFSFile(const std::string& filename, int flags) - : filename_(filename), flags_(flags) {} - ~HDFSFile() { - if (file_ != NULL) { - hdfsCloseFile(fs_, file_); - } - } - - bool Init() { - if (file_ == NULL) { - if (fs_ == NULL) { - fs_ = GetHDFSFileSystem(filename_); - } - if (fs_ != NULL && - (flags_ == O_WRONLY || 0 == hdfsExists(fs_, filename_.c_str()))) { - file_ = hdfsOpenFile(fs_, filename_.c_str(), flags_, 0, 0, 0); - } - } - return file_ != NULL; - } - - bool Exists() const { - if (fs_ == NULL) { - fs_ = GetHDFSFileSystem(filename_); - } - return fs_ != NULL && 0 == hdfsExists(fs_, filename_.c_str()); - } - - size_t Read(void* data, size_t bytes) const { - return FileOperation(data, bytes, &hdfsRead); - } - - size_t Write(const void* data, size_t bytes) const { - return FileOperation(data, bytes, &hdfsWrite); - } - - private: - template - using fileOp = tSize (*)(hdfsFS, hdfsFile, BufferType, tSize); - - template - inline size_t FileOperation(BufferType data, size_t bytes, - fileOp op) const { - char* buffer = const_cast(static_cast(data)); - size_t remain = bytes; - while (remain != 0) { - size_t nmax = static_cast(std::numeric_limits::max()); - tSize ret = op(fs_, file_, buffer, std::min(nmax, remain)); - if (ret > 0) { - size_t n = static_cast(ret); - remain -= n; - buffer += n; - } else if (ret == 0) { - break; - } else if (errno != EINTR) { - Log::Fatal("Failed HDFS file operation [%s]", strerror(errno)); - } - } - return bytes - remain; - } - - static hdfsFS GetHDFSFileSystem(const std::string& uri) { - size_t end = uri.find("/", kHdfsProtoLength); - if (uri.find(kHdfsProto) != 0 || end == std::string::npos) { - Log::Warning("Bad HDFS uri, no namenode found [%s]", uri.c_str()); - return NULL; - } - std::string hostport = uri.substr(kHdfsProtoLength, end - kHdfsProtoLength); - if (fs_cache_.count(hostport) == 0) { - fs_cache_[hostport] = MakeHDFSFileSystem(hostport); - } - return fs_cache_[hostport]; - } - - static hdfsFS MakeHDFSFileSystem(const std::string& hostport) { - std::istringstream iss(hostport); - std::string host; - tPort port = 0; - std::getline(iss, host, ':'); - iss >> port; - hdfsFS fs = iss.eof() ? hdfsConnect(host.c_str(), port) : NULL; - if (fs == NULL) { - Log::Warning("Could not connect to HDFS namenode [%s]", hostport.c_str()); - } - return fs; - } - - mutable hdfsFS fs_ = NULL; - hdfsFile file_ = NULL; - const std::string filename_; - const int flags_; - static std::unordered_map fs_cache_; -}; - -std::unordered_map HDFSFile::fs_cache_ = - std::unordered_map(); - -#define WITH_HDFS(x) x -#else -#define WITH_HDFS(x) Log::Fatal("HDFS support is not enabled") -#endif // USE_HDFS - std::unique_ptr VirtualFileReader::Make( const std::string& filename) { -#ifdef USE_HDFS - if (0 == filename.find(kHdfsProto)) { - WITH_HDFS(return std::unique_ptr( - new HDFSFile(filename, O_RDONLY))); - } -#endif return std::unique_ptr(new LocalFile(filename, "rb")); } std::unique_ptr VirtualFileWriter::Make( const std::string& filename) { -#ifdef USE_HDFS - if (0 == filename.find(kHdfsProto)) { - WITH_HDFS(return std::unique_ptr( - new HDFSFile(filename, O_WRONLY))); - } -#endif return std::unique_ptr(new LocalFile(filename, "wb")); } bool VirtualFileWriter::Exists(const std::string& filename) { -#ifdef USE_HDFS - if (0 == filename.find(kHdfsProto)) { - WITH_HDFS(HDFSFile file(filename, O_RDONLY); return file.Exists()); - } -#endif LocalFile file(filename, "rb"); return file.Exists(); } diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index f3367c59f911..0dfe3e47fa11 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -383,7 +383,7 @@ def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_feat arr_a = np.zeros((100, 1), dtype=np.float32) arr_b = rng.uniform(size=(100, 5)) - dataset_a = lgb.Dataset(arr_a).construct() + dataset_a = lgb.Dataset(arr_a, params={"verbose": 0}).construct() expected_msg = ( "[LightGBM] [Warning] There are no meaningful features which satisfy " "the provided configuration. Decreasing Dataset parameters min_data_in_bin " diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index d95e8599fc70..9ff56206ca70 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1469,6 +1469,7 @@ def test_parameters_are_loaded_from_model_file(tmp_path, capsys, rng): "metric": ["l2", "rmse"], "num_leaves": 5, "num_threads": 1, + "verbosity": 0, } model_file = tmp_path / "model.txt" orig_bst = lgb.train(params, ds, num_boost_round=1, categorical_feature=[1, 2]) @@ -4274,11 +4275,25 @@ def test_verbosity_and_verbose(capsys): "verbosity": 0, } lgb.train(params, ds, num_boost_round=1) - expected_msg = "[LightGBM] [Warning] verbosity is set=0, verbose=1 will be ignored. " "Current value: verbosity=0" + expected_msg = "[LightGBM] [Warning] verbosity is set=0, verbose=1 will be ignored. Current value: verbosity=0" stdout = capsys.readouterr().out assert expected_msg in stdout +def test_verbosity_is_respected_when_using_custom_objective(capsys): + X, y = make_synthetic_regression() + ds = lgb.Dataset(X, y) + params = { + "objective": mse_obj, + "nonsense": 123, + "num_leaves": 3, + } + lgb.train({**params, "verbosity": -1}, ds, num_boost_round=1) + assert capsys.readouterr().out == "" + lgb.train({**params, "verbosity": 0}, ds, num_boost_round=1) + assert "[LightGBM] [Warning] Unknown parameter: nonsense" in capsys.readouterr().out + + @pytest.mark.parametrize("verbosity_param", lgb.basic._ConfigAliases.get("verbosity")) @pytest.mark.parametrize("verbosity", [-1, 0]) def test_verbosity_can_suppress_alias_warnings(capsys, verbosity_param, verbosity): diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index a082d5054b2e..478b66035837 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1290,6 +1290,19 @@ def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth): assert "Provided parameters constrain tree depth" not in capsys.readouterr().out +def test_verbosity_is_respected_when_using_custom_objective(capsys): + X, y = make_synthetic_regression() + params = { + "objective": objective_ls, + "nonsense": 123, + "num_leaves": 3, + } + lgb.LGBMRegressor(**params, verbosity=-1, n_estimators=1).fit(X, y) + assert capsys.readouterr().out == "" + lgb.LGBMRegressor(**params, verbosity=0, n_estimators=1).fit(X, y) + assert "[LightGBM] [Warning] Unknown parameter: nonsense" in capsys.readouterr().out + + @pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker]) def test_getting_feature_names_in_np_input(estimator_class): # input is a numpy array, which doesn't have feature names. LightGBM adds diff --git a/tests/python_package_test/test_utilities.py b/tests/python_package_test/test_utilities.py index 3359d060e109..440a717ea1db 100644 --- a/tests/python_package_test/test_utilities.py +++ b/tests/python_package_test/test_utilities.py @@ -31,7 +31,7 @@ def dummy_metric(_, __): eval_records = {} callbacks = [lgb.record_evaluation(eval_records), lgb.log_evaluation(2), lgb.early_stopping(10)] lgb.train( - {"objective": "binary", "metric": ["auc", "binary_error"]}, + {"objective": "binary", "metric": ["auc", "binary_error"], "verbose": 1}, lgb_train, num_boost_round=10, feval=dummy_metric,