diff --git a/src/Amalgam/GeneralizedDistance.h b/src/Amalgam/GeneralizedDistance.h index dcf4fcd7..eea9dcf5 100644 --- a/src/Amalgam/GeneralizedDistance.h +++ b/src/Amalgam/GeneralizedDistance.h @@ -1132,7 +1132,12 @@ class RepeatedGeneralizedDistanceEvaluator double default_deviation = deviations_for_value->second.defaultDeviation; if(FastIsNaN(default_deviation)) { - deviations.defaultDeviation + deviations_for_value->second.defaultDeviation = feature_attributes.deviation; + + feature_data.defaultNominalMatchDistanceTerm + = distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy); + + feature_data.defaultNominalNonMatchDistanceTerm = distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy); } else @@ -1144,10 +1149,15 @@ class RepeatedGeneralizedDistanceEvaluator //divide the probability among the other classes double prob_class_given_nonmatch = default_deviation / nonmatching_classes; - deviations.defaultDeviation - = distEvaluator->ComputeDistanceTermNominalNonmatchFromMatchProbabilities( + feature_data.defaultNominalMatchDistanceTerm = + distEvaluator->ComputeDistanceTermNominalMatchFromMatchProbabilities( + index, prob_class_given_match, high_accuracy); + + feature_data.defaultNominalNonMatchDistanceTerm = + distEvaluator->ComputeDistanceTermNominalNonmatchFromMatchProbabilities( index, prob_class_given_match, prob_class_given_nonmatch, high_accuracy); } + return; } } else if(feature_data.targetValue.nodeType == ENIVT_STRING_ID) @@ -1169,7 +1179,12 @@ class RepeatedGeneralizedDistanceEvaluator double default_deviation = deviations_for_sid->second.defaultDeviation; if(FastIsNaN(default_deviation)) { - deviations.defaultDeviation + deviations_for_sid->second.defaultDeviation = feature_attributes.deviation; + + feature_data.defaultNominalMatchDistanceTerm + = distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy); + + feature_data.defaultNominalNonMatchDistanceTerm = distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy); } else @@ -1181,12 +1196,24 @@ class RepeatedGeneralizedDistanceEvaluator //divide the probability among the other classes double prob_class_given_nonmatch = default_deviation / nonmatching_classes; - deviations.defaultDeviation + feature_data.defaultNominalMatchDistanceTerm = + distEvaluator->ComputeDistanceTermNominalMatchFromMatchProbabilities( + index, prob_class_given_match, high_accuracy); + + feature_data.defaultNominalNonMatchDistanceTerm = distEvaluator->ComputeDistanceTermNominalNonmatchFromMatchProbabilities( index, prob_class_given_match, prob_class_given_nonmatch, high_accuracy); } + return; } } + + //made it here, so didn't find anything in the SDM. use fallback for default nominal terms + feature_data.defaultNominalMatchDistanceTerm = + distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy); + + feature_data.defaultNominalNonMatchDistanceTerm = + distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy); } //for the feature index, computes and stores the distance terms as measured from value to each interned value @@ -1274,7 +1301,7 @@ class RepeatedGeneralizedDistanceEvaluator return dist_term_entry->second; if(other_value.number == feature_data.targetValue.GetValueAsNumber()) - return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy); + return feature_data.defaultNominalMatchDistanceTerm; } else if(other_type == ENIVT_STRING_ID) { @@ -1283,7 +1310,7 @@ class RepeatedGeneralizedDistanceEvaluator return dist_term_entry->second; if(other_value.stringID == feature_data.targetValue.GetValueAsStringIDIfExists()) - return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy); + return feature_data.defaultNominalMatchDistanceTerm; } if(EvaluableNodeImmediateValue::IsNull(other_type, other_value)) @@ -1295,7 +1322,7 @@ class RepeatedGeneralizedDistanceEvaluator } else { - return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy); + return feature_data.defaultNominalNonMatchDistanceTerm; } } else @@ -1363,7 +1390,7 @@ class RepeatedGeneralizedDistanceEvaluator //returns the smallest nonmatching distance term regardless of value __forceinline double ComputeDistanceTermNominalNonNullSmallestNonmatch(size_t index, bool high_accuracy) { - double match_dist_term = distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy); + double match_dist_term = featureData[index].defaultNominalMatchDistanceTerm; double smallest_nonmatch = ComputeDistanceTermNonNullNominalNextSmallest(match_dist_term, index, high_accuracy); //if there is no such value, return infinite @@ -1408,6 +1435,8 @@ class RepeatedGeneralizedDistanceEvaluator void Clear() { effectiveFeatureType = EFDT_CONTINUOUS_NUMERIC; + defaultNominalMatchDistanceTerm = 0.0; + defaultNominalNonMatchDistanceTerm = 0.0; precomputedRemainingIdenticalDistanceTerm = 0.0; internedDistanceTerms.clear(); nominalStringDistanceTerms.clear(); @@ -1430,6 +1459,12 @@ class RepeatedGeneralizedDistanceEvaluator //target that the distance will be computed to EvaluableNodeImmediateValueWithType targetValue; + //the default nominal matching distance term if a term is not in the distance term matrix + double defaultNominalMatchDistanceTerm; + + //the default nominal nonmatching distance term if a term is not in the distance term matrix + double defaultNominalNonMatchDistanceTerm; + //the distance term for EFDT_REMAINING_IDENTICAL_PRECOMPUTED double precomputedRemainingIdenticalDistanceTerm; diff --git a/src/Amalgam/SBFDSColumnData.h b/src/Amalgam/SBFDSColumnData.h index ef134144..67022e76 100644 --- a/src/Amalgam/SBFDSColumnData.h +++ b/src/Amalgam/SBFDSColumnData.h @@ -727,6 +727,12 @@ class SBFDSColumnData + (valueCodeSizeToIndices.size() + codeIndices.size()) / 2; } + //returns the number of valid values (exist and not null) in the column + inline size_t GetNumValidDataElements() + { + return numberIndices.size() + stringIdIndices.size() + codeIndices.size(); + } + //returns the maximum difference between value and any other value for this column //if empty, will return infinity inline double GetMaxDifferenceTerm(GeneralizedDistanceEvaluator::FeatureAttributes &feature_attribs) diff --git a/src/Amalgam/SeparableBoxFilterDataStore.h b/src/Amalgam/SeparableBoxFilterDataStore.h index ef35b746..e9fe0a2e 100644 --- a/src/Amalgam/SeparableBoxFilterDataStore.h +++ b/src/Amalgam/SeparableBoxFilterDataStore.h @@ -1013,7 +1013,7 @@ class SeparableBoxFilterDataStore if(feature_attribs.IsFeatureNominal()) { if(FastIsNaN(feature_attribs.typeAttributes.nominalCount)) - feature_attribs.typeAttributes.nominalCount = static_cast(column_data->GetNumUniqueValues()); + feature_attribs.typeAttributes.nominalCount = static_cast(column_data->GetNumValidDataElements()); } } } diff --git a/src/Amalgam/out.txt b/src/Amalgam/out.txt index bc19b901..e56e7f25 100644 --- a/src/Amalgam/out.txt +++ b/src/Amalgam/out.txt @@ -230,11 +230,11 @@ hello world: 12 and 2 (print "hello") [(null) (null) .infinity -.infinity] -{b 2 a 1 c ["alpha" "beta" "gamma"]} +{b 2 c ["alpha" "beta" "gamma"] a 1} { b 2 - a 1 c ["alpha" "beta" "gamma"] + a 1 } (apply "6") @@ -287,7 +287,7 @@ if 2 11 (null) 11 -(null) +11 11 (null) (null) @@ -632,7 +632,7 @@ abcdef 0.14384103622589045 --first-- 4 -1 +2 1 0 a @@ -651,15 +651,15 @@ a a 1 b 2 c 3 - d 4 e 5 + f 6 } -{c 3 d 4} +{a 1 f 6} { - b 2 + a 1 c 3 - d 4 e 5 + f 6 } { a 1 @@ -683,7 +683,7 @@ abcdef --last-- this -1 +2 1 0 c @@ -702,15 +702,15 @@ c a 1 b 2 c 3 - d 4 e 5 + f 6 } -{c 3 d 4} +{a 1 f 6} { - b 2 + a 1 c 3 - d 4 e 5 + f 6 } { a 1 @@ -1061,7 +1061,7 @@ abcdef [1 3] [9 5] --indices-- -[4 "a" "c" "b"] +["b" "c" 4 "a"] [ 0 1 @@ -1073,10 +1073,10 @@ abcdef 7 ] [0 1 2 3] -[1 3 2 0] -[1 3 2 0] +[0 3 1 2] +[0 3 1 2] --values-- -["d" 1 3 2] +[2 3 "d" 1] [ "a" 1 @@ -1097,7 +1097,7 @@ abcdef 4 "d" ] -["d" 1 2 3] +[2 1 3 "d"] [ 1 2 @@ -1250,9 +1250,9 @@ list assoc [ {4 4} - 4 [4] "4" + 4 ] --set-- { @@ -1337,7 +1337,7 @@ current_index: 2 rmfile "del /s /q " rwww 1 slash "\\" - start_time 1731940058.757715 + start_time 1734038239.476452 www 1 x 12 zz 10 @@ -1383,7 +1383,7 @@ current_index: 2 rmfile "del /s /q " rwww 1 slash "\\" - start_time 1731940058.757715 + start_time 1734038239.476452 www 1 x 12 zz 10 @@ -1428,7 +1428,7 @@ current_index: 2 rmfile "del /s /q " rwww 1 slash "\\" - start_time 1731940058.757715 + start_time 1734038239.476452 www 1 x 12 zz 10 @@ -1560,23 +1560,23 @@ true [15.147298145412242 2.8707229850232165 1.1842755192409848 26.133999503489054] --weighted_rand-- -b -["b" "a" "b" "b"] +a +["b" "b" "b" "b"] b ["b" @(get (target 2) 0) "a" @(get (target 2) 2)] ["b" @(get (target 2) 0) @(get (target 2) 0) "a"] -infinity test c or d: ["d" "d" "c" "d"] +infinity test c or d: ["c" "c" "d" "c"] infinity test c or d: ["c" "d" @(get (target 2) 1) @(get (target 2) 0)] -{a 34 b 43 c 23} +{a 23 b 43 c 34} {a 30 b 50 c 20} -[1 2 4] +[1 2 3] --get_rand_seed-- 0RÊíõÿ'`¦!šc”lÿ @@ -1622,8 +1622,8 @@ infinity test c or d: ["c" "d" @(get (target 2) 1) @(get (target 2) 0)] string --set_type-- (- 3 4) -["a" 4 "b" 3] -["a" 4 "b" 3] +["b" 3 "a" 4] +["b" 3 "a" 4] {a 4 b 3} 8.7 (parallel @@ -1672,17 +1672,17 @@ string {a 3 b 4} {c "c"} ] -21: [{"a":3,"b":4},{"d":null,"c":"c"}] +21: [{"b":4,"a":3},{"d":null,"c":"c"}] 22: [{"a":3,"b":4},{"c":"c","d":null}] -23: a: 1 +23: d: 4 b: 2 e: - a - b - - .inf -d: 4 c: 3 +a: 1 24: a: 1 b: 2 @@ -1695,7 +1695,7 @@ e: - .inf 25: {a 1} -current date-time in epoch: 2024-11-18-09.27.38.8260980 +current date-time in epoch: 2024-12-12-16.17.19.5279410 2020-06-07 00:22:59 1391230800 1391230800 @@ -2093,8 +2093,8 @@ decrypted: hello 1 (associate "b" 4 "a" 3 "c" 3) ] -[3 4 2] -[3 2 4 3] +[3 2 4] +[2 3 2 4] [ ;comment 1 @@ -2105,18 +2105,18 @@ decrypted: hello ;comment x 2 - 4 3 - 6 5 - 8 7 - 10 9 - 12 11 - 14 13 + 4 + 6 + 8 + 10 + 12 + 14 ] [ [1 2 3] @@ -2244,16 +2244,6 @@ decrypted: hello {_ (null)} (replace _ - ["g"] - (lambda - [ - (get - (current_value 1) - 0 - ) - 4 - ] - ) [] (lambda { @@ -2264,6 +2254,16 @@ decrypted: hello ) } ) + ["g"] + (lambda + [ + (get + (current_value 1) + 0 + ) + 4 + ] + ) ) ) (declare @@ -2861,16 +2861,18 @@ flatten restore with parallel 19.264241099357605 --intersect_entities-- (associate "b" 4) -MergeEntityChild2 -(associate "p" 3 "q" 4) MergeEntityChild1 (associate "x" 3 "y" 4) +MergeEntityChild2 +(associate "p" 3 "q" 4) _3130331116 (associate "E" 3 "F" 4) _1651806471 (associate "e" 3 "f" 4) --union_entities-- (associate "b" 4 "a" 3 "c" 3) +MergeEntityChild1 +(associate "x" 3 "y" 4 "z" 5) MergeEntityChild2 (associate "p" @@ -2884,8 +2886,6 @@ MergeEntityChild2 "w" 7 ) -MergeEntityChild1 -(associate "x" 3 "y" 4 "z" 5) _3130331116 (associate "E" @@ -2910,7 +2910,7 @@ _1651806471 ) (parallel ##p - ["_2325275497" "_2325275497" "_2973704165" "_2973704165"] + ["_2325275497" "_2973704165" "_2325275497" "_2973704165"] ) _2325275497 (associate @@ -3262,8 +3262,14 @@ _3532185687 [] (lambda { - E 3 - F 4 + E (get + (current_value 1) + "E" + ) + F (get + (current_value 1) + "F" + ) G 5 H 6 } @@ -3288,7 +3294,16 @@ _3532185687 _ [] (lambda - {e 3 f 4} + { + e (get + (current_value 1) + "e" + ) + f (get + (current_value 1) + "f" + ) + } ) ) ) @@ -3320,7 +3335,89 @@ contained_entities new_entity: ["DiffEntityChild1" "OnlyIn2" "_3626604918" "_382 difference between DiffEntity2 and new_entity: (declare {_ (null) new_entity (null)} - (clone_entities _ new_entity) + (assign + "new_entity" + (first + (create_entities + new_entity + (call + (lambda + (declare + {_ (null)} + (replace _) + ) + ) + { + _ (retrieve_entity_root _) + } + ) + ) + ) + ) + (create_entities + (append new_entity "_3626604918") + (call + (lambda + (declare + {_ (null)} + (replace + _ + [] + (lambda + { + E (null) + F (null) + G (get + (current_value 1) + "G" + ) + H (get + (current_value 1) + "H" + ) + } + ) + ) + ) + ) + { + _ (retrieve_entity_root + (append _ "_3626604918") + ) + } + ) + ) + (create_entities + (append new_entity "_3823131681") + (call + (lambda + (declare + {_ (null)} + (replace + _ + [] + (lambda + {e (null) f (null)} + ) + ) + ) + ) + { + _ (retrieve_entity_root + (append _ "_3823131681") + ) + } + ) + ) + (clone_entities + (append _ "DiffEntityChild1") + (append new_entity "DiffEntityChild1") + ) + (clone_entities + (append _ "OnlyIn2") + (append new_entity "OnlyIn2") + ) + new_entity ) (declare {_ (null) new_entity (null)} @@ -3388,8 +3485,14 @@ difference between DiffEntity2 and new_entity: [] (lambda { - E 3 - F 4 + E (get + (current_value 1) + "E" + ) + F (get + (current_value 1) + "F" + ) G 5 H 6 } @@ -3414,7 +3517,16 @@ difference between DiffEntity2 and new_entity: _ [] (lambda - {e 3 f 4} + { + e (get + (current_value 1) + "e" + ) + f (get + (current_value 1) + "f" + ) + } ) ) ) @@ -3440,23 +3552,96 @@ contained_entities new_entity: ["OnlyIn2" "_1985995361" "_2783372341" "DiffEntit difference between DiffContainer and DiffEntity2: (declare {_ (null) new_entity (null)} - (clone_entities _ new_entity) + (assign + "new_entity" + (first + (create_entities + new_entity + (call + (lambda + (declare + {_ (null)} + (replace _) + ) + ) + { + _ (retrieve_entity_root _) + } + ) + ) + ) + ) + (create_entities + (append new_entity "_1985995361") + (call + (lambda + (declare + {_ (null)} + (replace + _ + [] + (lambda + { + E (null) + F (null) + G (get + (current_value 1) + "G" + ) + H (get + (current_value 1) + "H" + ) + } + ) + ) + ) + ) + { + _ (retrieve_entity_root + (append _ "_1985995361") + ) + } + ) + ) + (create_entities + (append new_entity "_2783372341") + (call + (lambda + (declare + {_ (null)} + (replace + _ + [] + (lambda + {e (null) f (null)} + ) + ) + ) + ) + { + _ (retrieve_entity_root + (append _ "_2783372341") + ) + } + ) + ) + (clone_entities + (append _ "OnlyIn2") + (append new_entity "OnlyIn2") + ) + (clone_entities + (append _ "DiffEntityChild1") + (append new_entity "DiffEntityChild1") + ) + new_entity ) --mix_entities-- (associate "b" 4) -MergeEntityChild2 -(associate - "p" - 3 - "q" - 4 - "v" - 6 - "w" - 7 -) MergeEntityChild1 (associate "x" 3 "y" 4) +MergeEntityChild2 +(associate "p" 3 "q" 4) _3130331116 (associate "E" 3 "F" 4) _1651806471 @@ -3539,7 +3724,7 @@ deep sets --set_entity_root_permission-- RootTest -1731940058.986386 +1734038239.858111 (true) RootTest @@ -4380,14 +4565,14 @@ case conviction:{ } cyclic feature nearest neighbors: {cyclic1 1 cyclic5 0.5} cyclic test expected: 155, 200, 190 ... deg values of 0 8 and 12: -200: 0.05555555555555555 (null - ##deg 8 +190: 0.045454545454545456 (null + ##deg 12 ) 155: 0.1 (null ##deg 0 ) -190: 0.045454545454545456 (null - ##deg 12 +200: 0.05555555555555555 (null + ##deg 8 ) --contains_label-- (true) @@ -4889,4 +5074,4 @@ rmdir /s /q amlg_code\persistent_tree_test_root del /s /q amlg_code\persist_module_test\psm.mdam del /s /q amlg_code\persist_module_test.mdam --total execution time-- -1.9228589534759521 +1.6889419555664062