From a9603ec44c84f5936f5cb39144a5d2ab739f8363 Mon Sep 17 00:00:00 2001 From: Marcel Breyer Date: Tue, 3 May 2022 21:39:05 +0200 Subject: [PATCH 0001/1010] First version, not working. --- CMakeLists.txt | 29 +- include/plssvm/backends/OpenMP/csvm.hpp | 7 +- include/plssvm/constants.hpp | 2 + include/plssvm/core.hpp | 8 +- include/plssvm/csvm.hpp | 188 ++++++++- include/plssvm/csvm_factory.hpp | 2 +- include/plssvm/data_set.hpp | 89 ++++ include/plssvm/detail/io/arff_parsing.hpp | 38 ++ .../plssvm/detail/{ => io}/file_reader.hpp | 3 +- include/plssvm/detail/io/libsvm_parsing.hpp | 35 ++ include/plssvm/detail/string_conversion.hpp | 2 + include/plssvm/file_format_types.hpp | 28 ++ include/plssvm/parameter.hpp | 164 -------- include/plssvm/parameter_predict.hpp | 36 +- include/plssvm/parameter_train.hpp | 34 +- src/main_train.cpp | 35 +- src/plssvm/csvm.cpp | 4 +- src/plssvm/data_set.cpp | 253 +++++++++++ src/plssvm/detail/io/arff_parsing.cpp | 214 ++++++++++ src/plssvm/detail/{ => io}/file_reader.cpp | 4 +- src/plssvm/detail/io/libsvm_parsing.cpp | 259 ++++++++++++ src/plssvm/file_format_types.cpp | 37 ++ src/plssvm/parameter.cpp | 392 +----------------- src/plssvm/parameter_predict.cpp | 50 ++- src/plssvm/parameter_train.cpp | 45 +- utility_scripts/generate_data.py | 5 +- 26 files changed, 1322 insertions(+), 641 deletions(-) create mode 100644 include/plssvm/data_set.hpp create mode 100644 include/plssvm/detail/io/arff_parsing.hpp rename include/plssvm/detail/{ => io}/file_reader.hpp (98%) create mode 100644 include/plssvm/detail/io/libsvm_parsing.hpp create mode 100644 include/plssvm/file_format_types.hpp create mode 100644 src/plssvm/data_set.cpp create mode 100644 src/plssvm/detail/io/arff_parsing.cpp rename src/plssvm/detail/{ => io}/file_reader.cpp (98%) create mode 100644 src/plssvm/detail/io/libsvm_parsing.cpp create mode 100644 src/plssvm/file_format_types.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 182167dd6..e467b0b5f 100755 --- a/CMakeLists.txt 
+++ b/CMakeLists.txt @@ -28,7 +28,6 @@ configure_file( ## set base sources set(PLSSVM_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/execution_range.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/file_reader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/sha256.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/string_utility.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/exceptions/exceptions.cpp @@ -42,6 +41,12 @@ set(PLSSVM_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/target_platforms.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/kernel_invocation_type.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backends/SYCL/implementation_type.cpp + + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/data_set.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/file_format_types.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/io/file_reader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/io/libsvm_parsing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/io/arff_parsing.cpp ) ## create base library: linked against all backend libraries @@ -174,6 +179,28 @@ endif() set_property(TARGET ${PLSSVM_BASE_LIBRARY_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_libraries(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC fmt::fmt) +## try finding NamedType +find_package(NamedType QUIET) +if(NamedType_FOUND) + message(STATUS "Found package NamedType.") +else() + message(STATUS "Couldn't find package NamedType. 
Building from source ...") + # fetch strong type library NamedType + FetchContent_Declare(NamedType + GIT_REPOSITORY https://github.com/joboccara/NamedType.git + GIT_TAG master + GIT_SHALLOW TRUE + + set(ENABLE_TEST OFF CACHE BOOL "" FORCE) + ) + FetchContent_MakeAvailable(NamedType) + add_dependencies(${PLSSVM_BASE_LIBRARY_NAME} NamedType) + target_include_directories(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC + $ + $ + ) +endif() + ######################################################################################################################## ## create executables ## diff --git a/include/plssvm/backends/OpenMP/csvm.hpp b/include/plssvm/backends/OpenMP/csvm.hpp index 163290a6c..423f284c1 100644 --- a/include/plssvm/backends/OpenMP/csvm.hpp +++ b/include/plssvm/backends/OpenMP/csvm.hpp @@ -12,14 +12,15 @@ #pragma once #include "plssvm/csvm.hpp" // plssvm::csvm +#include "plssvm/parameter.hpp" // plssvm::parameter #include // std::vector namespace plssvm { -// forward declare parameter class -template -class parameter; +// forward declare parameter class TODO +//template +//class parameter; namespace openmp { diff --git a/include/plssvm/constants.hpp b/include/plssvm/constants.hpp index c1411679b..31de22b4d 100644 --- a/include/plssvm/constants.hpp +++ b/include/plssvm/constants.hpp @@ -13,6 +13,8 @@ namespace plssvm { +inline bool verbose = true; + /// Integer type used inside kernels. 
using kernel_index_type = int; diff --git a/include/plssvm/core.hpp b/include/plssvm/core.hpp index a50920b32..0bb1ff586 100644 --- a/include/plssvm/core.hpp +++ b/include/plssvm/core.hpp @@ -14,9 +14,10 @@ #include "plssvm/csvm.hpp" #include "plssvm/csvm_factory.hpp" +#include "plssvm/constants.hpp" #include "plssvm/parameter.hpp" -#include "plssvm/parameter_predict.hpp" #include "plssvm/parameter_train.hpp" +#include "plssvm/parameter_predict.hpp" #include "plssvm/backend_types.hpp" #include "plssvm/kernel_types.hpp" @@ -28,6 +29,9 @@ #include "plssvm/backends/SYCL/implementation_type.hpp" #include "plssvm/backends/SYCL/kernel_invocation_type.hpp" + +#include "plssvm/data_set.hpp" + /// The main namespace containing all public API functions. namespace plssvm {} @@ -37,6 +41,8 @@ namespace plssvm::version {} /// Namespace containing implementation details. **Should not** directly be used by users. namespace plssvm::detail {} +namespace plssvm::detail::io {} + /// Namespace containing operator overloads for [std::vector](https://en.cppreference.com/w/cpp/container/vector) and other mathematical functions on vectors. 
namespace plssvm::operators {} diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp index 267743d04..30df4a6af 100644 --- a/include/plssvm/csvm.hpp +++ b/include/plssvm/csvm.hpp @@ -13,6 +13,7 @@ #include "plssvm/kernel_types.hpp" // plssvm::kernel_type #include "plssvm/target_platforms.hpp" // plssvm::target_platform +#include "parameter.hpp" // plssvm::parameter #include // std::size_t #include // std::shared_ptr @@ -20,11 +21,22 @@ #include // std::is_same_v #include // std::vector +#include "NamedType/named_type.hpp" +#include +#include +#include +#include +#include "plssvm/backend_types.hpp" // plssvm::backend_type +#include "plssvm/backends/SYCL/implementation_type.hpp" // plssvm::sycl_generic::implementation_type +#include "plssvm/backends/SYCL/kernel_invocation_type.hpp" // plssvm::sycl_generic::kernel_invocation_type +#include "plssvm/kernel_types.hpp" // plssvm::kernel_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform + namespace plssvm { -// forward declare class -template -class parameter; +// forward declare class TODO +//template +//class parameter; /** * @brief Base class for all C-SVM backends. 
@@ -280,4 +292,174 @@ class csvm { extern template class csvm; extern template class csvm; +namespace new_ { + +inline bool print_info = true; + +enum class file_format_type { + libsvm, + arff +}; + +template +class parameter { + public: + using real_type = T; + + parameter() = default; + + kernel_type kernel = kernel_type::linear; + int degree = 3; + real_type gamma = real_type{ 0.0 }; + real_type coef0 = real_type{ 0.0 }; + real_type cost = real_type{ 1.0 }; + real_type epsilon = static_cast(0.001); + backend_type backend = backend_type::automatic; + target_platform target = target_platform::automatic; + + ::plssvm::sycl_generic::kernel_invocation_type sycl_kernel_invocation_type = ::plssvm::sycl_generic::kernel_invocation_type::automatic; + ::plssvm::sycl_generic::implementation_type sycl_implementation_type = ::plssvm::sycl_generic::implementation_type::automatic; +}; + +template +class data_set { + using data_matrix_type = std::vector>; + using label_vector_type = std::vector; + + public: + using real_type = T; + using label_type = int; + using size_type = std::size_t; + + explicit data_set(const std::string& filename); + + explicit data_set(data_matrix_type &&X) : X_ptr_{ std::make_shared(std::move(X)) } { + if (X_ptr_->empty()) { + throw std::runtime_error("empty matrix"); // TODO: correct exception + } + } + data_set(data_matrix_type &&X, label_vector_type &&y) : X_ptr_{ std::make_shared(std::move(X)) }, y_ptr_{ std::make_shared(std::move(y)) } { + // TODO: exception?? if size != size + } + // save the data set in the given format + void save_data_set(const std::string& filename, file_format_type format); + + // scale data features to be in range [lower, upper] + void scale(real_type lower, real_type upper); + + [[nodiscard]] const data_matrix_type& data() const noexcept { return *X_ptr_; } + [[nodiscard]] std::optional> labels() const noexcept { + return this->has_labels() ? 
*y_ptr_ : std::nullopt; + } + [[nodiscard]] bool has_labels() const noexcept { return y_ptr_ != nullptr; } + + [[nodiscard]] size_type num_data_points() const noexcept { return X_ptr_->size(); } + [[nodiscard]] size_type num_features() const noexcept { return X_ptr_->front().size(); } + + private: + std::shared_ptr X_ptr_{ nullptr }; + std::shared_ptr y_ptr_{ nullptr }; +}; + +template +class csvm_model { + using alpha_vector_type = std::vector; + + public: + using real_type = T; + + // read model from file + explicit csvm_model(const std::string& filename); + + // save model to file + void save_model(const std::string& filename); + + // predict labels of the data_set + std::vector::label_type> predict(const data_set &data); + // predict LS-SVM values of the data_set + std::vector predict_values(const data_set &data); + + // calculate the accuracy of the model + real_type score(); + // calculate the accuracy of the data_set + real_type score(const data_set &data); + + private: + csvm_model(parameter params, data_set data) : params_{ std::move(params) }, data_{ std::move(data) }, alphas_{ std::make_shared() } {} + + parameter params_; + data_set data_; + + std::shared_ptr alphas_; + real_type rho_{ 0.0 }; +}; + +template +class csvm { + friend csvm_model; + public: + using real_type = T; + + // create new SVM with the given parameters + explicit csvm(parameter params) : params_{ std::move(params) } {} + csvm(real_type cost, target_platform target) : params_{} { + params_.kernel = kernel_type::linear; + params_.cost = cost; + params_.backend = backend_type::openmp; + params_.target = target; + } + csvm(int degree, real_type gamma, real_type coef0, real_type cost, target_platform target) : params_{} { + params_.kernel = kernel_type::polynomial; + params_.degree = degree; + params_.gamma = gamma; + params_.coef0 = coef0; + params_.cost = cost; + params_.backend = backend_type::openmp; + params_.target = target; + } + csvm(real_type gamma, real_type cost, 
target_platform target) : params_{} { + params_.kernel = kernel_type::rbf; + params_.gamma = gamma; + params_.cost = cost; + params_.backend = backend_type::openmp; + params_.target = target; + } + + // learn model with default eps and iter values + csvm_model fit(const data_set &data) { + // TODO: implement + params_.epsilon = parameter{}.epsilon; + return csvm_model(params_, data); + } + // learn model until eps is reached + csvm_model fit(const data_set &data, real_type eps) { + // TODO: implement + params_.epsilon = eps; + return csvm_model(params_, data); + } + // learn model using exact iter CG iterations + csvm_model fit(const data_set &data, std::size_t iter) { + // TODO: implement + params_.epsilon = parameter{}.epsilon; + return csvm_model(params_, data); + } + // learn model until eps is reached OR iter CG iterations are reached + csvm_model fit(const data_set &data, real_type eps, std::size_t iter) { + // TODO: implement + params_.epsilon = eps; + return csvm_model(params_, data); + } + // learn model until accuracy for test data set is reached + csvm_model fit(const data_set &data_train, const data_set &data_test, real_type accuracy) { + // TODO: implement + params_.epsilon = parameter{}.epsilon; + return csvm_model(params_, data_train); + } + + private: + parameter params_; +}; + +} + } // namespace plssvm diff --git a/include/plssvm/csvm_factory.hpp b/include/plssvm/csvm_factory.hpp index 4a9c713eb..a160b9dc1 100644 --- a/include/plssvm/csvm_factory.hpp +++ b/include/plssvm/csvm_factory.hpp @@ -14,7 +14,7 @@ #include "plssvm/backend_types.hpp" // plssvm::backend #include "plssvm/csvm.hpp" // plssvm::csvm #include "plssvm/exceptions/exceptions.hpp" // plssvm::unsupported_backend_exception -#include "plssvm/parameter.hpp" // plssvm::parameter +#include "parameter.hpp" // plssvm::parameter #include // std::unique_ptr, std::make_unique diff --git a/include/plssvm/data_set.hpp b/include/plssvm/data_set.hpp new file mode 100644 index 000000000..908450eb8 
--- /dev/null +++ b/include/plssvm/data_set.hpp @@ -0,0 +1,89 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Implements a data set class encapsulating all data points and their optional labels. + */ + +#pragma once + +#include "plssvm/file_format_types.hpp" // plssvm::file_format_type + +#include // std::size_t +#include // std::reference_wrapper, std::cref +#include // std::shared_ptr +#include // std::optional, std::make_optional, std::nullopt +#include // std::string +#include // std::vector + +#include // std::variant + +namespace plssvm { + +template +class data_set { + using data_matrix_type = std::vector>; + using label_vector_type = std::vector; + + std::map mapping_; + + public: + using real_type = T; + using label_type = T; + using size_type = std::size_t; + + data_set(); + + explicit data_set(const std::string& filename); + data_set(const std::string& filename, file_format_type format); + + explicit data_set(data_matrix_type &&X); + data_set(data_matrix_type &&X, std::vector &&y); + + data_set(data_matrix_type &&X, std::vector &&x); + data_set(data_matrix_type &&X, std::vector &&x); + + + // save the data set in the given format + void save_data_set(const std::string& filename, file_format_type format) const; + + // scale data features to be in range [-1, +1] + void scale(); + // scale data features to be in range [lower, upper] + void scale(real_type lower, real_type upper); + + [[nodiscard]] const data_matrix_type& data() const noexcept { return *X_ptr_; } + [[nodiscard]] std::optional> labels() const noexcept { + if (this->has_labels()) { + return std::make_optional(std::cref(*y_ptr_)); + } else { + return std::nullopt; + } + } + [[nodiscard]] bool has_labels() const noexcept 
{ return y_ptr_ != nullptr; } + + [[nodiscard]] size_type num_data_points() const noexcept { return num_data_points_; } + [[nodiscard]] size_type num_features() const noexcept { return num_features_; } + + private: + void write_libsvm_file(const std::string& filename) const; + void write_arff_file(const std::string& filename) const; + void read_file(const std::string& filename, file_format_type format); + void read_libsvm_file(const std::string& filename); + void read_arff_file(const std::string& filename); + + std::shared_ptr X_ptr_{ nullptr }; + std::shared_ptr y_ptr_{ nullptr }; + + size_type num_data_points_{ 0 }; + size_type num_features_{ 0 }; +}; + +extern template class data_set; +extern template class data_set; + +} \ No newline at end of file diff --git a/include/plssvm/detail/io/arff_parsing.hpp b/include/plssvm/detail/io/arff_parsing.hpp new file mode 100644 index 000000000..9fa12f90c --- /dev/null +++ b/include/plssvm/detail/io/arff_parsing.hpp @@ -0,0 +1,38 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Implements parsing functions for the ARFF file format. 
+ */ + +#pragma once + +#include "plssvm/detail/io/file_reader.hpp" // plssvm::detail::io::file_reader + +#include "fmt/os.h" // fmt::ostream + +#include // std::size_t +#include // std::shared_ptr +#include // std::tuple +#include // std::vector + +namespace plssvm::detail::io { + +std::tuple read_arff_header(file_reader &reader); + +template +void read_arff_data(file_reader &reader, std::size_t header, std::size_t num_features, std::size_t max_size, bool has_label, std::shared_ptr>> &X_ptr, std::shared_ptr> &y_ptr); + +void write_arff_header(fmt::ostream &out, std::size_t num_features, bool has_labels); + +template +void write_arff_data(fmt::ostream &out, const std::shared_ptr>> &X_ptr, const std::shared_ptr> &y_ptr); + +template +void write_arff_data(fmt::ostream &out, const std::shared_ptr>> &X_ptr); + +} \ No newline at end of file diff --git a/include/plssvm/detail/file_reader.hpp b/include/plssvm/detail/io/file_reader.hpp similarity index 98% rename from include/plssvm/detail/file_reader.hpp rename to include/plssvm/detail/io/file_reader.hpp index 43d85e9cd..c01a9c34b 100644 --- a/include/plssvm/detail/file_reader.hpp +++ b/include/plssvm/detail/io/file_reader.hpp @@ -9,6 +9,7 @@ * @brief Implements a file reader class responsible for reading the input file and parsing it into lines. */ +// TODO: move to io #pragma once // check if memory mapping can be supported @@ -20,7 +21,7 @@ #include // std::string_view #include // std::vector -namespace plssvm::detail { +namespace plssvm::detail::io { /** * @brief The plssvm::detail::file_reader class is responsible for reading a file and splitting it into its lines. 
diff --git a/include/plssvm/detail/io/libsvm_parsing.hpp b/include/plssvm/detail/io/libsvm_parsing.hpp new file mode 100644 index 000000000..dd85f32ed --- /dev/null +++ b/include/plssvm/detail/io/libsvm_parsing.hpp @@ -0,0 +1,35 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Implements parsing functions for the LIBSVM file format. + */ + +#pragma once + +#include "plssvm/detail/io/file_reader.hpp" // plssvm::detail::io::file_reader + +#include "fmt/os.h" // fmt::ostream + +#include // std::size_t +#include // std::shared_ptr +#include // std::vector + +namespace plssvm::detail::io { + +std::size_t parse_libsvm_num_features(file_reader &reader, std::size_t num_data_points, std::size_t start); + +template +bool read_libsvm_data(file_reader &reader, std::size_t start, std::shared_ptr>> &X_ptr, std::shared_ptr> &y_ptr); + +template +void write_libsvm_data(fmt::ostream &out, const std::shared_ptr>> &X_ptr, const std::shared_ptr> &y_ptr); + +template +void write_libsvm_data(fmt::ostream &out, const std::shared_ptr>> &X_ptr); + +} \ No newline at end of file diff --git a/include/plssvm/detail/string_conversion.hpp b/include/plssvm/detail/string_conversion.hpp index 6a673ba02..798b56b0c 100644 --- a/include/plssvm/detail/string_conversion.hpp +++ b/include/plssvm/detail/string_conversion.hpp @@ -9,6 +9,8 @@ * @brief Implements a conversion function from a string to an arithmetic type. 
*/ +#pragma once + #include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/detail/string_utility.hpp" // plssvm::detail::trim_left #include "plssvm/detail/utility.hpp" // plssvm::detail::always_false_v diff --git a/include/plssvm/file_format_types.hpp b/include/plssvm/file_format_types.hpp new file mode 100644 index 000000000..106b232ec --- /dev/null +++ b/include/plssvm/file_format_types.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include // forward declare std::ostream and std::istream + +namespace plssvm { + +enum class file_format_type { + libsvm, + arff +}; + +/** + * @brief Output the @p format to the given output-stream @p out. + * @param[in,out] out the output-stream to write the format type to + * @param[in] format the file format type + * @return the output-stream + */ +std::ostream &operator<<(std::ostream &out, file_format_type format); + +/** + * @brief Use the input-stream @p in to initialize the @p format type. + * @param[in,out] in input-stream to extract the format type from + * @param[out] format the file format type to initialize + * @return the input-stream + */ +std::istream &operator>>(std::istream &in, file_format_type &format); + +} \ No newline at end of file diff --git a/include/plssvm/parameter.hpp b/include/plssvm/parameter.hpp index 542a159a3..3a8a1a16d 100644 --- a/include/plssvm/parameter.hpp +++ b/include/plssvm/parameter.hpp @@ -47,137 +47,6 @@ class parameter { */ virtual ~parameter() = default; - /** - * @brief Parse a file in the [LIBSVM sparse file format](https://www.csie.ntu.edu.tw/~cjlin/libsvm/faq.html#f303). - * @details The sparse LIBSVM file format saves each data point with its respective class as follows: - * @code - *