From 7b4206ef0fa62ee5112c33b5d03c4ce540816ba1 Mon Sep 17 00:00:00 2001 From: howsohazard <143410553+howsohazard@users.noreply.github.com> Date: Wed, 11 Dec 2024 21:46:00 -0500 Subject: [PATCH] 22401: Fixes two bugs with sparse deviation matrices, one that reduces ML quality on nominal numeric features (#325) --- src/Amalgam/Parser.cpp | 18 ++++++++++++++++++ src/Amalgam/Parser.h | 3 +++ src/Amalgam/PlatformSpecific.h | 9 +++++---- src/Amalgam/SBFDSColumnData.h | 9 ++++++--- src/Amalgam/entity/EntityQueryBuilder.h | 16 ++++------------ 5 files changed, 36 insertions(+), 19 deletions(-) diff --git a/src/Amalgam/Parser.cpp b/src/Amalgam/Parser.cpp index a48b7dc9..f1bc9538 100644 --- a/src/Amalgam/Parser.cpp +++ b/src/Amalgam/Parser.cpp @@ -174,6 +174,24 @@ EvaluableNodeReference Parser::ParseFromKeyStringId(StringInternPool::StringID c return node; } +double Parser::ParseNumberFromKeyStringId(StringInternPool::StringID code_string_id) +{ + if(code_string_id == string_intern_pool.NOT_A_STRING_ID) + return std::numeric_limits::quiet_NaN(); + + std::string &code_string = code_string_id->string; + if(code_string.size() == 0 || code_string[0] != '\0') + return std::numeric_limits::quiet_NaN(); + + std::string_view escaped_string(&code_string[1], code_string.size() - 1); + + auto [number_value, success] = Platform_StringToNumber(escaped_string); + if(!success) + return std::numeric_limits::quiet_NaN(); + + return number_value; +} + std::string Parser::UnparseToKeyString(EvaluableNode *tree) { //if just a regular string, return it diff --git a/src/Amalgam/Parser.h b/src/Amalgam/Parser.h index f1c5b0a3..57ece8d3 100644 --- a/src/Amalgam/Parser.h +++ b/src/Amalgam/Parser.h @@ -142,6 +142,9 @@ class Parser //transforms the code_string_id into evaluable nodes static EvaluableNodeReference ParseFromKeyStringId(StringInternPool::StringID code_string_id, EvaluableNodeManager *enm); + //transforms the code_string_id into a number + static double ParseNumberFromKeyStringId(StringInternPool::StringID code_string_id); + //transforms tree into a string value that will match if the evaluable node trees match static std::string UnparseToKeyString(EvaluableNode *tree); diff --git a/src/Amalgam/PlatformSpecific.h b/src/Amalgam/PlatformSpecific.h index a94a1455..6f79f849 100644 --- a/src/Amalgam/PlatformSpecific.h +++ b/src/Amalgam/PlatformSpecific.h @@ -144,11 +144,12 @@ inline std::pair Platform_OpenFileAsString(const std::string // not currently have a working implementation on any version. // note2: std::from_chars is more desirable than std::strtod because it is locale independent // TODO 15993: Reevaluate when moving to C++20 -inline std::pair Platform_StringToNumber(const std::string &s) +template +inline std::pair Platform_StringToNumber(const StringType &s) { #ifdef OS_WINDOWS - const char *first_char = s.c_str(); - const char *last_char = first_char + s.length(); + const char *first_char = s.data(); + const char *last_char = first_char + s.size(); double value = 0.0; auto [ptr, ec] = std::from_chars(first_char, last_char, value); //if there was no parse error and nothing left on string, then it's a number @@ -158,7 +159,7 @@ inline std::pair Platform_StringToNumber(const std::string &s) #else //make sure it has a zero terminator std::string stringified_s(s); - const char *start_pointer = stringified_s.c_str(); + const char *start_pointer = stringified_s.data(); char *end_pointer = nullptr; double value = strtod(start_pointer, &end_pointer); //if didn't reach the end or grabbed nothing, then it's not a number diff --git a/src/Amalgam/SBFDSColumnData.h b/src/Amalgam/SBFDSColumnData.h index 2b36904a..f447e57f 100644 --- a/src/Amalgam/SBFDSColumnData.h +++ b/src/Amalgam/SBFDSColumnData.h @@ -693,12 +693,15 @@ class SBFDSColumnData inline size_t GetNumUniqueValues(EvaluableNodeImmediateValueType value_type = ENIVT_NULL) { if(value_type == ENIVT_NUMBER) - return numberIndices.size(); + return sortedNumberValueEntries.size(); if(value_type == ENIVT_STRING_ID) - return stringIdIndices.size(); + return stringIdValueEntries.size(); - return numberIndices.size() + stringIdIndices.size() + codeIndices.size(); + //add up unique number and string values, + // and use a heuristic for judging how many unique code values there are + return sortedNumberValueEntries.size() + stringIdIndices.size() + + (valueCodeSizeToIndices.size() + codeIndices.size()) / 2; } //returns the maximum difference between value and any other value for this column diff --git a/src/Amalgam/entity/EntityQueryBuilder.h b/src/Amalgam/entity/EntityQueryBuilder.h index 023b9fee..c84a4f50 100644 --- a/src/Amalgam/entity/EntityQueryBuilder.h +++ b/src/Amalgam/entity/EntityQueryBuilder.h @@ -57,11 +57,7 @@ namespace EntityQueryBuilder { double value = std::numeric_limits::quiet_NaN(); if(cn.first != string_intern_pool.emptyStringId) - { - auto [number_value, success] = Platform_StringToNumber(string_intern_pool.GetStringFromID(cn.first)); - if(success) - value = number_value; - } + value = Parser::ParseNumberFromKeyStringId(cn.first); ndd.emplace(value, EvaluableNode::ToNumber(cn.second)); } @@ -124,11 +120,7 @@ namespace EntityQueryBuilder { double value = std::numeric_limits::quiet_NaN(); if(cn.first != string_intern_pool.emptyStringId) - { - auto [number_value, success] = Platform_StringToNumber(string_intern_pool.GetStringFromID(cn.first)); - if(success) - value = number_value; - } + value = Parser::ParseNumberFromKeyStringId(cn.first); number_sdm.emplace(value); PopulateFeatureDeviationNominalValueData(number_sdm.back().second, cn.second); @@ -164,10 +156,10 @@ namespace EntityQueryBuilder //a list indicates that it is a pair of a sparse deviation matrix followed by a default deviation //the default being for when the first value being compared is not found auto &ocn = deviation_node->GetOrderedChildNodesReference(); - if(ocn.size() > 1) + if(ocn.size() > 0) PopulateFeatureDeviationNominalValuesMatrixData(feature_attribs, ocn[0]); - if(ocn.size() > 2) + if(ocn.size() > 1) feature_attribs.deviation = EvaluableNode::ToNumber(ocn[1]); } else