From d08b19939a2f26929cad1995dd73107571b35c4c Mon Sep 17 00:00:00 2001 From: howsohazard <143410553+howsohazard@users.noreply.github.com> Date: Thu, 26 Oct 2023 22:12:12 -0400 Subject: [PATCH] 17630 & 17861: Implements number interning, improves change value efficiency, fixes cyclic bug (#25) --- src/Amalgam/GeneralizedDistance.h | 174 +++-- src/Amalgam/IntegerSet.h | 5 +- src/Amalgam/SBFDSColumnData.h | 706 ++++++++++++++++---- src/Amalgam/SeparableBoxFilterDataStore.cpp | 172 +++-- src/Amalgam/SeparableBoxFilterDataStore.h | 214 ++++-- src/Amalgam/amlg_code/test.amlg | 28 +- src/Amalgam/entity/EntityQueryBuilder.h | 27 +- src/Amalgam/evaluablenode/EvaluableNode.h | 41 +- src/Amalgam/out.txt | 60 +- 9 files changed, 1059 insertions(+), 368 deletions(-) diff --git a/src/Amalgam/GeneralizedDistance.h b/src/Amalgam/GeneralizedDistance.h index 2474b4d6..0cf9213e 100644 --- a/src/Amalgam/GeneralizedDistance.h +++ b/src/Amalgam/GeneralizedDistance.h @@ -12,29 +12,45 @@ //If defined, will use the Laplace LK metric (default). Otherwise will use Gaussian. 
#define DISTANCE_USE_LAPLACE_LK_METRIC true -//general class of feature comparisons -// align at 64-bits in order to play nice with data alignment where it is used -enum FeatureDifferenceType : uint64_t -{ - FDT_NOMINAL, - //continuous, but without cycles - FDT_CONTINUOUS_NUMERIC, - //like FDT_CONTINUOUS_NUMERIC, but guarantees everything is always numeric - FDT_CONTINUOUS_UNIVERSALLY_NUMERIC, - //like FDT_CONTINUOUS_NUMERIC, but has cycles - FDT_CONTINUOUS_NUMERIC_CYCLIC, - //edit distance between strings - FDT_CONTINUOUS_STRING, - //continuous measures of the number of nodes different between two sets of code - FDT_CONTINUOUS_CODE, -}; - //base data struct for holding distance parameters and metadata //generalizes Minkowski distance, information theoretic surprisal as a distance, and LukaszykKarmowski class GeneralizedDistance { public: - //initialization functions + + //general class of feature comparisons + // align at 32-bits in order to play nice with data alignment where it is used + enum FeatureDifferenceType : uint32_t + { + FDT_NOMINAL, + //continuous without cycles, may contain nonnumeric data + FDT_CONTINUOUS_NUMERIC, + //like FDT_CONTINUOUS_NUMERIC, but has cycles + FDT_CONTINUOUS_NUMERIC_CYCLIC, + //edit distance between strings + FDT_CONTINUOUS_STRING, + //continuous measures of the number of nodes different between two sets of code + FDT_CONTINUOUS_CODE, + }; + + enum EffectiveFeatureDifferenceType : uint32_t + { + EFDT_NOMINAL, + //everything is precomputed from interned values that are looked up + EFDT_VALUES_UNIVERSALLY_PRECOMPUTED, + //continuous without cycles, but everything is always numeric + EFDT_CONTINUOUS_UNIVERSALLY_NUMERIC, + //continuous without cycles, may contain nonnumeric data + EFDT_CONTINUOUS_NUMERIC, + //like FDT_CONTINUOUS_NUMERIC, but has cycles + EFDT_CONTINUOUS_NUMERIC_CYCLIC, + //continuous precomputed (cyclic or not), may contain nonnumeric data + EFDT_CONTINUOUS_NUMERIC_PRECOMPUTED, + //edit distance between strings + 
EFDT_CONTINUOUS_STRING, + //continuous measures of the number of nodes different between two sets of code + EFDT_CONTINUOUS_CODE, + }; //dynamically precompute and cache nominal deltas and defaults everytime the pValue is set inline void SetAndConstrainParams() @@ -70,7 +86,8 @@ class GeneralizedDistance //computes and sets unknownToUnknownDistanceTerm and knownToUnknownDistanceTerm based on // unknownToUnknownDifference and knownToUnknownDifference respectively - inline void ComputeAndStoreUncertaintyDistanceTerms(size_t index) + //if target_value_is_null_equivalent is true, it will update any precomputed values as necessary + inline void ComputeAndStoreUncertaintyDistanceTerms(size_t index, bool target_value_is_null_equivalent = false) { bool compute_accurate = NeedToPrecomputeAccurate(); bool compute_approximate = NeedToPrecomputeApproximate(); @@ -98,24 +115,64 @@ class GeneralizedDistance if(feature_params.knownToUnknownDifference == feature_params.unknownToUnknownDifference) { feature_params.knownToUnknownDistanceTerm = feature_params.unknownToUnknownDistanceTerm; - return; + } + else + { + //compute knownToUnknownDistanceTerm + if(compute_accurate) + { + feature_params.knownToUnknownDistanceTerm.SetValue( + ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference, + index, ExactApproxValuePair::EXACT), + ExactApproxValuePair::EXACT); + } + + if(compute_approximate) + { + feature_params.knownToUnknownDistanceTerm.SetValue( + ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference, + index, ExactApproxValuePair::APPROX), + ExactApproxValuePair::APPROX); + } } - //compute knownToUnknownDistanceTerm - if(compute_accurate) + if(HasNumberInternValues(index)) + { + auto &precomputed_terms = feature_params.precomputedInternDistanceTerms; + + if(target_value_is_null_equivalent) + { + precomputed_terms[0] = feature_params.unknownToUnknownDistanceTerm.GetValue(defaultPrecision); + auto k_to_unk = 
feature_params.knownToUnknownDistanceTerm.GetValue(defaultPrecision); + for(size_t i = 1; i < precomputed_terms.size(); i++) + precomputed_terms[i] = k_to_unk; + } + else //just set the unknown value + { + precomputed_terms[0] = feature_params.knownToUnknownDistanceTerm.GetValue(defaultPrecision); + } + } + } + + //for the feature index, computes and stores the distance terms as measured from value to each interned value + inline void ComputeAndStoreInternedNumberValuesAndDistanceTerms(size_t index, double value, std::vector *interned_values) + { + auto &feature_params = featureParams[index]; + feature_params.internedNumberIndexToNumberValue = interned_values; + + if(interned_values == nullptr) { - feature_params.knownToUnknownDistanceTerm.SetValue( - ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference, - index, ExactApproxValuePair::EXACT), - ExactApproxValuePair::EXACT); + feature_params.precomputedInternDistanceTerms.clear(); + return; } - if(compute_approximate) + feature_params.precomputedInternDistanceTerms.resize(interned_values->size()); + //first entry is known-unknown distance + feature_params.precomputedInternDistanceTerms[0] = ComputeDistanceTermKnownToUnknown(index); + for(size_t i = 1; i < feature_params.precomputedInternDistanceTerms.size(); i++) { - feature_params.knownToUnknownDistanceTerm.SetValue( - ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference, - index, ExactApproxValuePair::APPROX), - ExactApproxValuePair::APPROX); + double difference = value - interned_values->at(i); + feature_params.precomputedInternDistanceTerms[i] = ComputeDistanceTermNonNominalNonNullRegular(difference, index); } } @@ -432,6 +489,18 @@ class GeneralizedDistance return featureParams[index].knownToUnknownDistanceTerm.GetValue(defaultPrecision); } + //returns true if the feature at index has interned number values + __forceinline bool HasNumberInternValues(size_t index) + { + return featureParams[index].internedNumberIndexToNumberValue != 
nullptr; + } + + //returns the precomputed distance term for the interned number with intern_value_index + __forceinline double ComputeDistanceTermNumberInterned(size_t intern_value_index, size_t index) + { + return featureParams[index].precomputedInternDistanceTerms[intern_value_index]; + } + //computes the inner term for a non-nominal with an exact match of values __forceinline double ComputeDistanceTermNonNominalExactMatch(size_t index) { @@ -445,8 +514,8 @@ class GeneralizedDistance return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight; } - //computes the difference between two values non-nominal (e.g., continuous) - __forceinline double ComputeDifferenceTermNonNominal(double diff, size_t index) + //computes the base of the difference between two values non-nominal (e.g., continuous) + __forceinline double ComputeDifferenceTermBaseNonNominal(double diff, size_t index) { //compute absolute value diff = std::abs(diff); @@ -462,8 +531,8 @@ class GeneralizedDistance return diff; } - //computes the difference between two values non-nominal (e.g., continuous) that isn't cyclic - __forceinline double ComputeDifferenceTermNonNominalNonCyclic(double diff, size_t index) + //computes the base of the difference between two values non-nominal (e.g., continuous) that isn't cyclic + __forceinline double ComputeDifferenceTermBaseNonNominalNonCyclic(double diff, size_t index) { //compute absolute value diff = std::abs(diff); @@ -479,7 +548,7 @@ class GeneralizedDistance // diff can be negative __forceinline double ComputeDistanceTermNonNominalNonNullRegular(double diff, size_t index) { - diff = ComputeDifferenceTermNonNominal(diff, index); + diff = ComputeDifferenceTermBaseNonNominal(diff, index); //exponentiate and return with weight return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight; @@ -489,7 +558,7 @@ class GeneralizedDistance // diff can be negative __forceinline double 
ComputeDistanceTermNonNominalOneNonNullRegular(double diff, size_t index) { - diff = ComputeDifferenceTermNonNominal(diff, index); + diff = ComputeDifferenceTermBaseNonNominal(diff, index); //exponentiate and return with weight return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight; @@ -499,7 +568,7 @@ class GeneralizedDistance // diff can be negative __forceinline double ComputeDistanceTermNonNominalNonCyclicNonNullRegular(double diff, size_t index) { - diff = ComputeDifferenceTermNonNominalNonCyclic(diff, index); + diff = ComputeDifferenceTermBaseNonNominalNonCyclic(diff, index); //exponentiate and return with weight return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight; @@ -512,7 +581,7 @@ class GeneralizedDistance if(FastIsNaN(diff)) return ComputeDistanceTermKnownToUnknown(index); - diff = ComputeDifferenceTermNonNominalNonCyclic(diff, index); + diff = ComputeDifferenceTermBaseNonNominalNonCyclic(diff, index); //exponentiate and return with weight return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight; @@ -530,7 +599,7 @@ class GeneralizedDistance if(IsFeatureNominal(index)) return (diff == 0.0) ? ComputeDistanceTermNominalExactMatch(index) : ComputeDistanceTermNominalNonMatch(index); - diff = ComputeDifferenceTermNonNominal(diff, index); + diff = ComputeDifferenceTermBaseNonNominal(diff, index); return std::pow(diff, featureParams[index].weight); } @@ -547,7 +616,7 @@ class GeneralizedDistance if(IsFeatureNominal(index)) return (diff == 0.0) ? 
ComputeDistanceTermNominalExactMatch(index) : ComputeDistanceTermNominalNonMatch(index); - diff = ComputeDifferenceTermNonNominal(diff, index); + diff = ComputeDifferenceTermBaseNonNominal(diff, index); return diff * featureParams[index].weight; } @@ -556,7 +625,7 @@ class GeneralizedDistance __forceinline double ComputeDistanceTermNonNull(double diff, size_t index, int precision) { if(!IsFeatureNominal(index)) - diff = ComputeDifferenceTermNonNominal(diff, index); + diff = ComputeDifferenceTermBaseNonNominal(diff, index); if(pValue == 0.0) return std::pow(diff, featureParams[index].weight); @@ -587,7 +656,7 @@ class GeneralizedDistance { double diff = ComputeDifference(a, b, a_type, b_type, featureParams[index].featureType); if(FastIsNaN(diff)) - return LookupNullDistanceTerm(a, b, a_type, b_type, index);; + return LookupNullDistanceTerm(a, b, a_type, b_type, index); //if nominal, don't need to compute absolute value of diff because just need to compare to 0 if(IsFeatureNominal(index)) @@ -628,7 +697,7 @@ class GeneralizedDistance __forceinline static double ComputeDifference(EvaluableNodeImmediateValue a, EvaluableNodeImmediateValue b, EvaluableNodeImmediateValueType a_type, EvaluableNodeImmediateValueType b_type, FeatureDifferenceType feature_type) { - if(feature_type == FDT_CONTINUOUS_NUMERIC || feature_type == FDT_CONTINUOUS_UNIVERSALLY_NUMERIC + if(feature_type == FDT_CONTINUOUS_NUMERIC || feature_type == FDT_CONTINUOUS_NUMERIC_CYCLIC) { if(a_type == ENIVT_NUMBER && b_type == ENIVT_NUMBER) @@ -758,7 +827,10 @@ class GeneralizedDistance { public: inline FeatureParams() - : featureType(FDT_CONTINUOUS_NUMERIC), weight(1.0), deviation(0.0), + : featureType(FDT_CONTINUOUS_NUMERIC), + effectiveFeatureType(EFDT_CONTINUOUS_NUMERIC), + weight(1.0), + internedNumberIndexToNumberValue(nullptr), deviation(0.0), unknownToUnknownDistanceTerm(std::numeric_limits::quiet_NaN()), knownToUnknownDistanceTerm(std::numeric_limits::quiet_NaN()), 
unknownToUnknownDifference(std::numeric_limits::quiet_NaN()), @@ -768,9 +840,13 @@ class GeneralizedDistance } //the type of comparison for each feature - // this type is 64-bit aligned to make sure the whole structure is aligned + // this type is 32-bit aligned to make sure the whole structure is aligned FeatureDifferenceType featureType; + //the effective comparison for the feature type, specialized for performance + // this type is 32-bit aligned to make sure the whole structure is aligned + EffectiveFeatureDifferenceType effectiveFeatureType; + //weight of the feature double weight; @@ -778,6 +854,12 @@ class GeneralizedDistance ExactApproxValuePair nominalMatchDistanceTerm; ExactApproxValuePair nominalNonMatchDistanceTerm; + //pointer to a lookup table of indices to values if the feature is an interned number + std::vector *internedNumberIndexToNumberValue; + + //precomputed distance terms for each interned value looked up by intern index + std::vector precomputedInternDistanceTerms; + //type attributes dependent on featureType union { diff --git a/src/Amalgam/IntegerSet.h b/src/Amalgam/IntegerSet.h index 1ef7cf2b..30c25c66 100644 --- a/src/Amalgam/IntegerSet.h +++ b/src/Amalgam/IntegerSet.h @@ -101,10 +101,7 @@ class SortedIntegerSet __forceinline bool contains(size_t id) { auto location = std::lower_bound(std::begin(integers), std::end(integers), id); - if(location == std::end(integers)) - return false; - - return id == *location; + return (location != std::end(integers) && id == *location); } //returns true if the id exists in the set diff --git a/src/Amalgam/SBFDSColumnData.h b/src/Amalgam/SBFDSColumnData.h index 9cc1bb79..f88a9adf 100644 --- a/src/Amalgam/SBFDSColumnData.h +++ b/src/Amalgam/SBFDSColumnData.h @@ -16,15 +16,44 @@ class SBFDSColumnData { public: + + struct ValueEntry + { + //indicates the column does not use indices + static constexpr size_t NO_INDEX = std::numeric_limits::max(); + //nan value is always the 0th index + static constexpr 
size_t NAN_INDEX = 0; + + //if empty, initialize to invalid index + ValueEntry() + : value(), indicesWithValue(), + valueInternIndex(NO_INDEX) + { } + + ValueEntry(double number_value, size_t intern_index = NO_INDEX) + : value(number_value), indicesWithValue(), + valueInternIndex(intern_index) + { } + + ValueEntry(StringInternPool::StringID sid_value, size_t intern_index = NO_INDEX) + : value(sid_value), indicesWithValue(), + valueInternIndex(intern_index) + { } + + ValueEntry(ValueEntry &ve) + : value(ve.value), indicesWithValue(ve.indicesWithValue), valueInternIndex(ve.valueInternIndex) + { } + + EvaluableNodeImmediateValue value; + SortedIntegerSet indicesWithValue; + size_t valueInternIndex; + }; + //column needs to be named when it is created inline SBFDSColumnData(StringInternPool::StringID sid) - : stringId(sid) - { - indexWithLongestString = 0; - longestStringLength = 0; - indexWithLargestCode = 0; - largestCodeSize = 0; - } + : stringId(sid), indexWithLongestString(0), longestStringLength(0), + indexWithLargestCode(0), largestCodeSize(0), numberValuesInterned(false) + { } //like InsertIndexValue, but used only for building the column data from an empty column //this function must be called on each index in ascending order; for example, index 2 must be called after index 1 @@ -101,16 +130,16 @@ class SBFDSColumnData } } - sortedNumberValueIndexPairs.reserve(num_uniques); + sortedNumberValueEntries.reserve(num_uniques); numberIndices.ReserveNumIntegers(index_values.back().reference + 1); for(auto &index_value : index_values) { //if don't have the right bucket, then need to create one - if(sortedNumberValueIndexPairs.size() == 0 || sortedNumberValueIndexPairs.back().first != index_value.distance) - sortedNumberValueIndexPairs.emplace_back(index_value.distance, std::make_unique()); + if(sortedNumberValueEntries.size() == 0 || sortedNumberValueEntries.back()->value.number != index_value.distance) + 
sortedNumberValueEntries.emplace_back(std::make_unique(index_value.distance)); - sortedNumberValueIndexPairs.back().second->InsertNewLargestInteger(index_value.reference); + sortedNumberValueEntries.back()->indicesWithValue.InsertNewLargestInteger(index_value.reference); numberIndices.insert(index_value.reference); } } @@ -119,7 +148,12 @@ class SBFDSColumnData __forceinline EvaluableNodeImmediateValueType GetIndexValueType(size_t index) { if(numberIndices.contains(index)) + { + if(numberValuesInterned) + return ENIVT_NUMBER_INDIRECTION_INDEX; return ENIVT_NUMBER; + } + if(stringIdIndices.contains(index)) return ENIVT_STRING_ID; if(nullIndices.contains(index)) @@ -129,69 +163,342 @@ class SBFDSColumnData return ENIVT_CODE; } + //returns the value type, performing any resolution for intern lookups + static __forceinline EvaluableNodeImmediateValueType GetResolvedValueType(EvaluableNodeImmediateValueType value_type) + { + if(value_type == ENIVT_NUMBER_INDIRECTION_INDEX) + return ENIVT_NUMBER; + return value_type; + } + + //returns the value type that represents the values stored in this column, performing the reverse of any resolution for intern lookups + __forceinline EvaluableNodeImmediateValueType GetUnresolvedValueType(EvaluableNodeImmediateValueType value_type) + { + if(value_type == ENIVT_NUMBER && numberValuesInterned) + return ENIVT_NUMBER_INDIRECTION_INDEX; + return value_type; + } + + //returns the value performing any intern lookup if necessary + __forceinline EvaluableNodeImmediateValue GetResolvedValue(EvaluableNodeImmediateValueType value_type, EvaluableNodeImmediateValue value) + { + if(value_type == ENIVT_NUMBER_INDIRECTION_INDEX) + return EvaluableNodeImmediateValue(internedNumberIndexToNumberValue[value.indirectionIndex]); + return value; + } + //moves index from being associated with key old_value to key new_value - void ChangeIndexValue(EvaluableNodeImmediateValue old_value, EvaluableNodeImmediateValueType new_value_type, 
EvaluableNodeImmediateValue new_value, size_t index) + //returns the value that should be used to reference the value, which may be an index + //depending on the state of the column data + EvaluableNodeImmediateValue ChangeIndexValue(EvaluableNodeImmediateValueType old_value_type, EvaluableNodeImmediateValue old_value, + EvaluableNodeImmediateValueType new_value_type, EvaluableNodeImmediateValue new_value, size_t index) { //if new one is invalid, can quickly delete or return if(new_value_type == ENIVT_NOT_EXIST) { if(!invalidIndices.contains(index)) { - DeleteIndexValue(old_value, index); + DeleteIndexValue(old_value_type, old_value, index); invalidIndices.insert(index); } - return; + + if(numberValuesInterned) + return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX); + else + return EvaluableNodeImmediateValue(); + } + + //if the types are the same, some shortcuts may apply + //note that if the values match types and match resolved values, the old_value should be returned + //because it is already in the correct storage format for the column + if(old_value_type == new_value_type) + { + if(old_value_type == ENIVT_NULL) + return old_value; + + if(old_value_type == ENIVT_NUMBER) + { + double old_number_value = GetResolvedValue(old_value_type, old_value).number; + double new_number_value = GetResolvedValue(new_value_type, new_value).number; + if(EqualIncludingNaN(old_number_value, new_number_value)) + return old_value; + + //if made it here, then at least one of the values is not a NaN + //if one value is a NaN, just insert or delete as regular since there's little to be saved + if(FastIsNaN(old_number_value)) + { + nanIndices.erase(index); + return InsertIndexValue(new_value_type, new_value, index); + } + + if(FastIsNaN(new_number_value)) + { + DeleteIndexValue(old_value_type, old_value, index); + nanIndices.insert(index); + + if(numberValuesInterned) + return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX); + else + return 
EvaluableNodeImmediateValue(std::numeric_limits::quiet_NaN()); + } + + //if the value already exists, then put the index in the list + //but return the lower bound if not found so don't have to search a second time + //need to search the old value before inserting, as FindExactIndexForValue is fragile a placeholder empty entry + auto [new_value_index, new_exact_index_found] = FindExactIndexForValue(new_number_value, true); + auto [old_value_index, old_exact_index_found] = FindExactIndexForValue(old_number_value, true); + + if(old_exact_index_found) + { + //if there are multiple entries for this number, just move the id + if(sortedNumberValueEntries[old_value_index]->indicesWithValue.size() > 1) + { + //erase with old_value_index first so don't need to update index + sortedNumberValueEntries[old_value_index]->indicesWithValue.erase(index); + + if(!new_exact_index_found) + { + sortedNumberValueEntries.emplace(sortedNumberValueEntries.begin() + new_value_index, std::make_unique(new_number_value)); + InsertFirstIndexIntoNumberValueEntry(index, new_value_index); + } + else //just insert + { + sortedNumberValueEntries[new_value_index]->indicesWithValue.insert(index); + } + } + else //it's the last old_number_entry + { + if(!new_exact_index_found) + { + //remove old value and update to new + std::unique_ptr new_value_entry = std::move(sortedNumberValueEntries[old_value_index]); + new_value_entry->value.number = new_number_value; + + //move the other values out of the way + if(old_number_value < new_number_value) + { + for(size_t i = old_value_index; i + 1 < new_value_index; i++) + sortedNumberValueEntries[i] = std::move(sortedNumberValueEntries[i + 1]); + + new_value_index--; + } + else + { + for(size_t i = old_value_index; i > new_value_index; i--) + sortedNumberValueEntries[i] = std::move(sortedNumberValueEntries[i - 1]); + } + + //move new value in to empty slot created + sortedNumberValueEntries[new_value_index] = std::move(new_value_entry); + } + else //already has 
an entry for the new value, just delete as normal + { + sortedNumberValueEntries[new_value_index]->indicesWithValue.insert(index); + DeleteNumberValueEntry(old_value_index); + } + } + } + else //shouldn't make it here, but ensure integrity just in case + { + //insert new value in correct position + sortedNumberValueEntries.emplace(sortedNumberValueEntries.begin() + new_value_index, + std::make_unique(new_number_value)); + + InsertFirstIndexIntoNumberValueEntry(index, new_value_index); + } + + if(numberValuesInterned) + return EvaluableNodeImmediateValue(new_value_index); + else + return EvaluableNodeImmediateValue(new_value); + } + + if(old_value_type == ENIVT_STRING_ID) + { + if(old_value.stringID == new_value.stringID) + return old_value; + + //try to insert the new value if not already there + auto [new_id_entry, inserted] = stringIdValueToIndices.emplace(new_value.stringID, nullptr); + + auto old_id_entry = stringIdValueToIndices.find(old_value.stringID); + if(old_id_entry != end(stringIdValueToIndices)) + { + //if there are multiple entries for this string, just move the id + if(old_id_entry->second->size() > 1) + { + if(inserted) + new_id_entry->second = std::make_unique(); + + new_id_entry->second->insert(index); + old_id_entry->second->erase(index); + } + else //it's the last old_id_entry + { + //put the SortedIntegerSet in the new value or move the container + if(inserted) + new_id_entry->second = std::move(old_id_entry->second); + else + new_id_entry->second->insert(index); + + //erase only after new_id_entry is no longer needed, as erasing may invalidate it + stringIdValueToIndices.erase(old_id_entry); + } + } + else if(inserted) //shouldn't make it here, but ensure integrity just in case + { + new_id_entry->second = std::make_unique(); + new_id_entry->second->insert(index); + } + + //update longest string as appropriate + if(index == indexWithLongestString) + RecomputeLongestString(); + else + UpdateLongestString(new_value.stringID, index); + + return new_value; 
+ } + + if(old_value_type == ENIVT_CODE) + { + //only early exit if the pointers to the code are exactly the same, + // as equivalent code may be garbage collected + if(old_value.code == new_value.code) + return old_value; + + size_t old_code_size = EvaluableNode::GetDeepSize(old_value.code); + size_t new_code_size = EvaluableNode::GetDeepSize(new_value.code); + + //only need to do insert / removal logic if sizes are different + if(old_code_size != new_code_size) + { + auto [new_size_entry, inserted] = valueCodeSizeToIndices.emplace(new_code_size, nullptr); + + auto old_size_entry = valueCodeSizeToIndices.find(old_code_size); + if(old_size_entry != end(valueCodeSizeToIndices)) + { + //if there are multiple entries for this code size, just move the id + if(old_size_entry->second->size() > 1) + { + if(inserted) + new_size_entry->second = std::make_unique(); + + new_size_entry->second->insert(index); + old_size_entry->second->erase(index); + } + else //it's the last old_size_entry + { + //put the SortedIntegerSet in the new value or move the container + if(inserted) + new_size_entry->second = std::move(old_size_entry->second); + else + new_size_entry->second->insert(index); + + //erase only after new_size_entry is no longer needed, as erasing may invalidate it + valueCodeSizeToIndices.erase(old_size_entry); + } + } + else if(inserted) //shouldn't make it here, but ensure integrity just in case + { + new_size_entry->second = std::make_unique(); + new_size_entry->second->insert(index); + } + } + + //this runs whether or not the code size changed + //see if need to update largest code + if(index == indexWithLargestCode) + RecomputeLargestCode(); + else + UpdateLargestCode(new_code_size, index); + + return new_value; + } + + if(old_value_type == ENIVT_NUMBER_INDIRECTION_INDEX) + { + if(old_value.indirectionIndex == new_value.indirectionIndex) + return old_value; + } } //delete index at old value - DeleteIndexValue(old_value, index); + DeleteIndexValue(old_value_type, old_value, index); - //add 
index at new value bucket - InsertIndexValue(new_value_type, new_value, index); + //add index at new value bucket + return InsertIndexValue(new_value_type, new_value, index); } - //deletes everything involving the value at the index - void DeleteIndexValue(EvaluableNodeImmediateValue value, size_t index) + //deletes a particular value based on the value_index + void DeleteNumberValueEntry(size_t value_index) { - if(invalidIndices.EraseAndRetrieve(index)) - return; + if(numberValuesInterned) + { + size_t value_intern_index = sortedNumberValueEntries[value_index]->valueInternIndex; + //if the last entry (off by one, including ValueEntry::NO_INDEX), can just resize + if(value_intern_index == internedNumberIndexToNumberValue.size() - 1) + { + internedNumberIndexToNumberValue.resize(value_intern_index); + } + else //need to actually erase it + { + internedNumberIndexToNumberValue[value_intern_index] = std::numeric_limits::quiet_NaN(); + unusedNumberValueIndices.emplace(value_intern_index); + } - //if value is null, just need to remove from the appropriate index - if(nullIndices.EraseAndRetrieve(index)) - return; + //clear out any unusedNumberValueIndices at the end other than the 0th entry + while(internedNumberIndexToNumberValue.size() > 1 && FastIsNaN(internedNumberIndexToNumberValue.back())) + internedNumberIndexToNumberValue.pop_back(); + } - if(numberIndices.EraseAndRetrieve(index)) + sortedNumberValueEntries.erase(sortedNumberValueEntries.begin() + value_index); + } + + //deletes everything involving the value at the index + void DeleteIndexValue(EvaluableNodeImmediateValueType value_type, EvaluableNodeImmediateValue value, size_t index) + { + switch(value_type) { + case ENIVT_NOT_EXIST: + invalidIndices.erase(index); + break; + + case ENIVT_NULL: + nullIndices.erase(index); + break; + + case ENIVT_NUMBER: + case ENIVT_NUMBER_INDIRECTION_INDEX: + numberIndices.erase(index); + //remove, and if not a nan, then need to also remove the number 
if(!nanIndices.EraseAndRetrieve(index)) { + auto resolved_value = GetResolvedValue(value_type, value); + //look up value - auto [value_index, exact_index_found] = FindExactIndexForValue(value.number); + auto [value_index, exact_index_found] = FindExactIndexForValue(resolved_value.number); if(!exact_index_found) return; //if the bucket has only one entry, we must delete the entire bucket - if(sortedNumberValueIndexPairs[value_index].second->size() == 1) - { - sortedNumberValueIndexPairs.erase(sortedNumberValueIndexPairs.begin() + value_index); - } + if(sortedNumberValueEntries[value_index]->indicesWithValue.size() == 1) + DeleteNumberValueEntry(value_index); else //else we can just remove the id from the bucket - { - sortedNumberValueIndexPairs[value_index].second->erase(index); - } + sortedNumberValueEntries[value_index]->indicesWithValue.erase(index); } + break; - return; - } - - if(stringIdIndices.EraseAndRetrieve(index)) + case ENIVT_STRING_ID: { + stringIdIndices.erase(index); auto id_entry = stringIdValueToIndices.find(value.stringID); if(id_entry != end(stringIdValueToIndices)) { auto &entities = *(id_entry->second); entities.erase(index); - + //if no more entries have the value, remove it if(entities.size() == 0) stringIdValueToIndices.erase(id_entry); @@ -199,83 +506,142 @@ class SBFDSColumnData //see if need to compute new longest string if(index == indexWithLongestString) - { - longestStringLength = 0; - //initialize to 0 in case there are no entities with strings - indexWithLongestString = 0; - for(auto &[s_id, s_entry] : stringIdValueToIndices) - UpdateLongestString(s_id, *s_entry->begin()); - } - - return; + RecomputeLongestString(); } + break; - //if made it here, then just remove from a code value type - codeIndices.erase(index); + case ENIVT_CODE: + { + codeIndices.erase(index); - //find the entities that have the correspending size - size_t num_indices = EvaluableNode::GetDeepSize(value.code); - auto id_entry = 
valueCodeSizeToIndices.find(num_indices); - if(id_entry == end(valueCodeSizeToIndices)) - return; + //find the entities that have the corresponding size + size_t num_indices = EvaluableNode::GetDeepSize(value.code); + auto id_entry = valueCodeSizeToIndices.find(num_indices); + if(id_entry == end(valueCodeSizeToIndices)) + return; + + //remove the entity + auto &entities = *(id_entry->second); + entities.erase(index); + + if(entities.size() == 0) + valueCodeSizeToIndices.erase(id_entry); + + //see if need to update largest code + if(index == indexWithLargestCode) + RecomputeLargestCode(); + break; + } - //remove the entity - auto &entities = *(id_entry->second); - entities.erase(index); + default: //shouldn't make it here + break; + } + } - if(entities.size() == 0) - valueCodeSizeToIndices.erase(id_entry); + //inserts index into the number value entry at value_index, assigning and recording an intern index if values are interned + void InsertFirstIndexIntoNumberValueEntry(size_t index, size_t value_index) + { + ValueEntry *value_entry = sortedNumberValueEntries[value_index].get(); - //see if need to update largest code - if(index == indexWithLargestCode) + value_entry->indicesWithValue.insert(index); + if(numberValuesInterned) { - largestCodeSize = 0; - //initialize to 0 in case there are no entities with code - indexWithLargestCode = 0; - for(auto &[size, entry] : valueCodeSizeToIndices) - UpdateLargestCode(size, *entry->begin()); + if(value_entry->valueInternIndex == ValueEntry::NO_INDEX) + { + //get the highest value + if(unusedNumberValueIndices.size() > 0) + { + value_entry->valueInternIndex = unusedNumberValueIndices.top(); + + //make sure the value is valid + if(value_entry->valueInternIndex < sortedNumberValueEntries.size()) + { + unusedNumberValueIndices.pop(); + } + else //not valid, clear queue + { + unusedNumberValueIndices.clear(); + //just use a new value, 0-based but leaving a spot open for NAN_INDEX + value_entry->valueInternIndex = sortedNumberValueEntries.size(); + } + } + else //just use new value of the latest 
size, 0-based but leaving a spot open for NAN_INDEX + { + value_entry->valueInternIndex = sortedNumberValueEntries.size(); + } + } + + if(value_entry->valueInternIndex >= internedNumberIndexToNumberValue.size()) + internedNumberIndexToNumberValue.resize(value_entry->valueInternIndex + 1, std::numeric_limits::quiet_NaN()); + + internedNumberIndexToNumberValue[value_entry->valueInternIndex] = value_entry->value.number; } } //inserts the value at id - void InsertIndexValue(EvaluableNodeImmediateValueType value_type, EvaluableNodeImmediateValue &value, size_t index) + //returns the value that should be used to reference the value, which may be an index + //depending on the state of the column data + EvaluableNodeImmediateValue InsertIndexValue(EvaluableNodeImmediateValueType value_type, + EvaluableNodeImmediateValue &value, size_t index) { if(value_type == ENIVT_NOT_EXIST) { invalidIndices.insert(index); - return; + + if(numberValuesInterned) + return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX); + else + return value; } if(value_type == ENIVT_NULL) { nullIndices.insert(index); - return; + + if(numberValuesInterned) + return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX); + else + return value; } - if(value_type == ENIVT_NUMBER) + if(value_type == ENIVT_NUMBER || value_type == ENIVT_NUMBER_INDIRECTION_INDEX) { numberIndices.insert(index); - if(FastIsNaN(value.number)) + double number_value = GetResolvedValue(value_type, value).number; + if(FastIsNaN(number_value)) { nanIndices.insert(index); - return; + + if(numberValuesInterned) + return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX); + else + return value; } //if the value already exists, then put the index in the list - auto [value_index, exact_index_found] = FindExactIndexForValue(value.number); + //but return the lower bound if not found so don't have to search a second time + auto [value_index, exact_index_found] = FindExactIndexForValue(number_value, true); if(exact_index_found) { - 
sortedNumberValueIndexPairs[value_index].second->insert(index); - return; + sortedNumberValueEntries[value_index]->indicesWithValue.insert(index); + + if(numberValuesInterned) + return EvaluableNodeImmediateValue(sortedNumberValueEntries[value_index]->valueInternIndex); + else + return value; } //insert new value in correct position - size_t new_value_index = FindUpperBoundIndexForValue(value.number); - auto inserted = sortedNumberValueIndexPairs.emplace(sortedNumberValueIndexPairs.begin() + new_value_index, value.number, std::make_unique()); - inserted->second->insert(index); + sortedNumberValueEntries.emplace(sortedNumberValueEntries.begin() + value_index, + std::make_unique(number_value)); - return; + InsertFirstIndexIntoNumberValueEntry(index, value_index); + + if(numberValuesInterned) + return sortedNumberValueEntries[value_index]->valueInternIndex; + else + return value; } if(value_type == ENIVT_STRING_ID) @@ -288,11 +654,10 @@ class SBFDSColumnData inserted_id_entry->second = std::make_unique(); auto &ids = *(inserted_id_entry->second); - ids.insert(index); UpdateLongestString(value.stringID, index); - return; + return value; } //value_type == ENIVT_CODE @@ -309,6 +674,8 @@ class SBFDSColumnData size_entry->second->insert(index); UpdateLargestCode(code_size, index); + + return value; } //returns the maximum difference between value and any other value for this column @@ -317,21 +684,20 @@ class SBFDSColumnData { switch(feature_params.featureType) { - case FDT_NOMINAL: + case GeneralizedDistance::FDT_NOMINAL: return 1.0; - case FDT_CONTINUOUS_NUMERIC: - case FDT_CONTINUOUS_UNIVERSALLY_NUMERIC: - if(sortedNumberValueIndexPairs.size() <= 1) + case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC: + if(sortedNumberValueEntries.size() <= 1) return 0.0; - return sortedNumberValueIndexPairs.back().first - sortedNumberValueIndexPairs[0].first; + return sortedNumberValueEntries.back()->value.number - sortedNumberValueEntries[0]->value.number; - case 
FDT_CONTINUOUS_NUMERIC_CYCLIC: + case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC_CYCLIC: //maximum is the other side of the cycle return feature_params.typeAttributes.maxCyclicDifference / 2; - case FDT_CONTINUOUS_STRING: + case GeneralizedDistance::FDT_CONTINUOUS_STRING: //the max difference is the worst case edit distance, of removing all the characters // and adding all the new ones if(value_type == ENIVT_STRING_ID) @@ -349,7 +715,7 @@ class SBFDSColumnData return static_cast(longestStringLength + 1); } - case FDT_CONTINUOUS_CODE: + case GeneralizedDistance::FDT_CONTINUOUS_CODE: if(value_type == ENIVT_CODE) return static_cast(largestCodeSize + EvaluableNode::GetDeepSize(value.code)); else if(value_type == ENIVT_NULL) @@ -370,43 +736,43 @@ class SBFDSColumnData // .second: true if exact index was found, false otherwise inline std::pair FindExactIndexForValue(double value, bool return_index_lower_bound = false) { - auto target_iter = std::lower_bound(begin(sortedNumberValueIndexPairs), end(sortedNumberValueIndexPairs), value, - [](const auto& value_index_pair, double value) + auto target_iter = std::lower_bound(begin(sortedNumberValueEntries), end(sortedNumberValueEntries), value, + [](const auto &value_entry, double value) { - return value_index_pair.first < value; + return value_entry->value.number < value; }); - if ((target_iter == end(sortedNumberValueIndexPairs)) || (target_iter->first != value)) // not exact match + if((target_iter == end(sortedNumberValueEntries)) || ((*target_iter)->value.number != value)) // not exact match { - return std::make_pair(return_index_lower_bound ? std::distance(begin(sortedNumberValueIndexPairs), target_iter) : -1 , false); + return std::make_pair(return_index_lower_bound ? 
std::distance(begin(sortedNumberValueEntries), target_iter) : -1 , false); } - return std::make_pair(std::distance(begin(sortedNumberValueIndexPairs), target_iter), true); // exact match + return std::make_pair(std::distance(begin(sortedNumberValueEntries), target_iter), true); // exact match } //returns the index of the lower bound of value inline size_t FindLowerBoundIndexForValue(double value) { - auto target_iter = std::lower_bound(begin(sortedNumberValueIndexPairs), end(sortedNumberValueIndexPairs), value, - [](const auto &value_index_pair, double value) + auto target_iter = std::lower_bound(begin(sortedNumberValueEntries), end(sortedNumberValueEntries), value, + [](const auto &value_entry, double value) { - return value_index_pair.first < value; + return value_entry->value.number < value; }); - return std::distance(begin(sortedNumberValueIndexPairs), target_iter); + return std::distance(begin(sortedNumberValueEntries), target_iter); } //returns the index of the upper bound of value inline size_t FindUpperBoundIndexForValue(double value) { - auto target_iter = std::upper_bound(begin(sortedNumberValueIndexPairs), end(sortedNumberValueIndexPairs), value, - [](double value, const auto &value_index_pair) + auto target_iter = std::upper_bound(begin(sortedNumberValueEntries), end(sortedNumberValueEntries), value, + [](double value, const auto &value_entry) { - return value < value_index_pair.first; + return value < value_entry->value.number; }); - return std::distance(begin(sortedNumberValueIndexPairs), target_iter); + return std::distance(begin(sortedNumberValueEntries), target_iter); } - //given a value, returns the index at which the value should be inserted into the sortedNumberValueIndexPairs + //given a value, returns the index at which the value should be inserted into the sortedNumberValueEntries //returns true for .second when an exact match is found, false otherwise //O(log(n)) //cycle_length will take into account whether wrapping around is closer @@ 
-416,15 +782,13 @@ class SBFDSColumnData // returns the closest index (lower_bound) if an exact match is not found auto [value_index, exact_index_found] = FindExactIndexForValue(value, true); if(exact_index_found) - { return std::make_pair(value_index, true); - } //if only have one element (or zero), short circuit code below - if(sortedNumberValueIndexPairs.size() <= 1) + if(sortedNumberValueEntries.size() <= 1) return std::make_pair(0, false); - size_t max_valid_index = sortedNumberValueIndexPairs.size() - 1; + size_t max_valid_index = sortedNumberValueEntries.size() - 1; size_t target_index = std::min(max_valid_index, value_index); //value_index is lower bound index since no exact match //if not cyclic or cyclic and not at the edge @@ -434,15 +798,15 @@ class SBFDSColumnData //need to check index again in case not cyclic // return index with the closer difference if(target_index < max_valid_index - && (std::abs(sortedNumberValueIndexPairs[target_index + 1].first - value) < std::abs(sortedNumberValueIndexPairs[target_index].first - value))) + && (std::abs(sortedNumberValueEntries[target_index + 1]->value.number - value) < std::abs(sortedNumberValueEntries[target_index]->value.number - value))) return std::make_pair(target_index + 1, false); else return std::make_pair(target_index, false); } else //cyclic { - double dist_to_max_index = std::abs(sortedNumberValueIndexPairs[max_valid_index].first - value); - double dist_to_0_index = std::abs(sortedNumberValueIndexPairs[0].first - value); + double dist_to_max_index = std::abs(sortedNumberValueEntries[max_valid_index]->value.number - value); + double dist_to_0_index = std::abs(sortedNumberValueEntries[0]->value.number - value); size_t other_closest_index; if(target_index == 0) @@ -458,7 +822,7 @@ class SBFDSColumnData other_closest_index = max_valid_index - 1; } - double dist_to_other_closest_index = std::abs(sortedNumberValueIndexPairs[other_closest_index].first - value); + double dist_to_other_closest_index = 
std::abs(sortedNumberValueEntries[other_closest_index]->value.number - value); if(dist_to_0_index <= dist_to_other_closest_index && dist_to_0_index <= dist_to_max_index) return std::make_pair(0, false); else if(dist_to_other_closest_index <= dist_to_0_index) @@ -477,7 +841,7 @@ class SBFDSColumnData if(value_type == ENIVT_NUMBER) { //there are no ids for this column, so return no results - if(sortedNumberValueIndexPairs.size() == 0) + if(sortedNumberValueEntries.size() == 0) return; //make a copy because passed by reference, and may need to change value for logic below @@ -534,19 +898,19 @@ class SBFDSColumnData if(between_values) { size_t index = value_index; - out.InsertInBatch(*sortedNumberValueIndexPairs[index].second); + out.InsertInBatch(sortedNumberValueEntries[index]->indicesWithValue); } else //if not within, populate with all indices not equal to value { //include nans nanIndices.CopyTo(out); - for(auto &[bucket_val, bucket] : sortedNumberValueIndexPairs) + for(auto &value_entry : sortedNumberValueEntries) { - if(bucket_val == low_number) + if(value_entry->value.number == low_number) continue; - out.InsertInBatch(*bucket); + out.InsertInBatch(value_entry->indicesWithValue); } } @@ -554,27 +918,27 @@ class SBFDSColumnData } size_t start_index = (low_number == -std::numeric_limits::infinity()) ? 0 : FindLowerBoundIndexForValue(low_number); - size_t end_index = (high_number == std::numeric_limits::infinity()) ? sortedNumberValueIndexPairs.size() : FindUpperBoundIndexForValue(high_number); + size_t end_index = (high_number == std::numeric_limits::infinity()) ? 
sortedNumberValueEntries.size() : FindUpperBoundIndexForValue(high_number); if(between_values) { //insert everything between the two indices for(size_t i = start_index; i < end_index; i++) - out.InsertInBatch(*sortedNumberValueIndexPairs[i].second); + out.InsertInBatch(sortedNumberValueEntries[i]->indicesWithValue); //include end_index if value matches - if(end_index < sortedNumberValueIndexPairs.size() && sortedNumberValueIndexPairs[end_index].first == high_number) - out.InsertInBatch(*sortedNumberValueIndexPairs[end_index].second); + if(end_index < sortedNumberValueEntries.size() && sortedNumberValueEntries[end_index]->value.number == high_number) + out.InsertInBatch(sortedNumberValueEntries[end_index]->indicesWithValue); } else //not between_values { //insert everything left of range for(size_t i = 0; i < start_index; i++) - out.InsertInBatch(*sortedNumberValueIndexPairs[i].second); + out.InsertInBatch(sortedNumberValueEntries[i]->indicesWithValue); //insert everything right of range - for(size_t i = end_index; i < sortedNumberValueIndexPairs.size(); i++) - out.InsertInBatch(*sortedNumberValueIndexPairs[i].second); + for(size_t i = end_index; i < sortedNumberValueEntries.size(); i++) + out.InsertInBatch(sortedNumberValueEntries[i]->indicesWithValue); } } @@ -636,7 +1000,7 @@ class SBFDSColumnData auto [value_index, exact_index_found] = FindExactIndexForValue(value.number); if(exact_index_found) - out.InsertInBatch(*sortedNumberValueIndexPairs[value_index].second); + out.InsertInBatch(sortedNumberValueEntries[value_index]->indicesWithValue); } else if(value_type == ENIVT_STRING_ID) { @@ -654,16 +1018,16 @@ class SBFDSColumnData if(value_type == ENIVT_NUMBER) { //there are no ids for this column, so return no results - if(sortedNumberValueIndexPairs.size() == 0) + if(sortedNumberValueEntries.size() == 0) return; //search left to right for max (bucket 0 is largest) or right to left for min - int64_t value_index = find_max ? 
sortedNumberValueIndexPairs.size() - 1 : 0; + int64_t value_index = find_max ? sortedNumberValueEntries.size() - 1 : 0; - while(value_index < static_cast(sortedNumberValueIndexPairs.size()) && value_index >= 0) + while(value_index < static_cast(sortedNumberValueEntries.size()) && value_index >= 0) { //add each index to the out indices and optionally output compute results - for(const auto &index : *sortedNumberValueIndexPairs[value_index].second) + for(const auto &index : sortedNumberValueEntries[value_index]->indicesWithValue) { if(indices_to_consider != nullptr && !indices_to_consider->contains(index)) continue; @@ -716,6 +1080,56 @@ class SBFDSColumnData } } + //returns true if switching to number interning would be expected to yield better results + // than number values given the current data + inline bool AreNumberInternsPreferredToValues() + { + //use heuristic of sqrt number of values compared to num unique values + // (but computed with a multiply instead of sqrt) + size_t num_unique_values = sortedNumberValueEntries.size(); + return (num_unique_values * num_unique_values <= numberIndices.size()); + } + + //returns true if switching to number values would be expected to yield better results + // than number interning given the current data + inline bool AreNumberValuesPreferredToInterns() + { + //use heuristic of sqrt number of values compared to num unique values + // (but computed with a multiply instead of sqrt) + //round up to reduce flipping back and forth + size_t num_unique_values = sortedNumberValueEntries.size(); + return (num_unique_values * num_unique_values > numberIndices.size() - num_unique_values); + } + + //clears number intern caches and changes state to not perform interning for numbers + void ConvertNumberInternsToValues() + { + if(!numberValuesInterned) + return; + + internedNumberIndexToNumberValue.clear(); + unusedNumberValueIndices.clear(); + numberValuesInterned = false; + } + + //initializes and sets up number value interning 
caches and changes state to perform interning for numbers + void ConvertNumberValuesToInterns() + { + if(numberValuesInterned) + return; + + internedNumberIndexToNumberValue.resize(sortedNumberValueEntries.size() + 1); + internedNumberIndexToNumberValue[0] = std::numeric_limits::quiet_NaN(); + for(size_t i = 0; i < sortedNumberValueEntries.size(); i++) + { + auto &value_entry = sortedNumberValueEntries[i]; + value_entry->valueInternIndex = i + 1; + internedNumberIndexToNumberValue[i + 1] = value_entry->value.number; + } + + numberValuesInterned = true; + } + protected: //updates longestStringLength and indexWithLongestString based on parameters @@ -730,6 +1144,16 @@ class SBFDSColumnData } } + //should be called when the longest string is invalidated + inline void RecomputeLongestString() + { + longestStringLength = 0; + //initialize to 0 in case there are no entities with strings + indexWithLongestString = 0; + for(auto &[s_id, s_entry] : stringIdValueToIndices) + UpdateLongestString(s_id, *s_entry->begin()); + } + //updates largestCodeSize and indexWithLargestCode based on parameters inline void UpdateLargestCode(size_t code_size, size_t index) { @@ -740,13 +1164,23 @@ class SBFDSColumnData } } + //should be called when the largest code is invalidated + inline void RecomputeLargestCode() + { + largestCodeSize = 0; + //initialize to 0 in case there are no entities with code + indexWithLargestCode = 0; + for(auto &[size, entry] : valueCodeSizeToIndices) + UpdateLargestCode(size, *entry->begin()); + } + public: //name of the column StringInternPool::StringID stringId; //stores values in sorted order and the entities that have each value - std::vector< std::pair> > sortedNumberValueIndexPairs; + std::vector> sortedNumberValueEntries; //maps a string id to a vector of indices that have that string CompactHashMap> stringIdValueToIndices; @@ -783,4 +1217,16 @@ class SBFDSColumnData size_t indexWithLargestCode; //the largest code size for this label size_t 
largestCodeSize; + + //if numberValuesInterned is true, then contains an index of each value to its location in sortedNumberValueEntries + //if a given index isn't used, then it will contain the maximum value for the index + //the 0th index is reserved for NaN, regardless of whether NaN appears in the data + std::vector internedNumberIndexToNumberValue; + + //unused / free indices in internedNumberIndexToNumberValue to make adding and removing new values efficient + //always want to fetch the lowest index to keep the interned NumberIndexToNumberValue small + FlexiblePriorityQueue, std::greater> unusedNumberValueIndices; + + //if true, then the indices of the values should be used and internedNumberIndexToValue populated + bool numberValuesInterned; }; diff --git a/src/Amalgam/SeparableBoxFilterDataStore.cpp b/src/Amalgam/SeparableBoxFilterDataStore.cpp index 389365fe..23e9c02d 100644 --- a/src/Amalgam/SeparableBoxFilterDataStore.cpp +++ b/src/Amalgam/SeparableBoxFilterDataStore.cpp @@ -15,6 +15,9 @@ void SeparableBoxFilterDataStore::BuildLabel(size_t column_index, const std::vec auto &entities_with_number_values = parametersAndBuffers.entitiesWithValues; entities_with_number_values.clear(); + //clear value interning if applied + column_data->ConvertNumberInternsToValues(); + //populate matrix and get values // maintaining the order of insertion of the entities from smallest to largest allows for better performance of the insertions // and every function called here assumes that entities are inserted in increasing order @@ -23,7 +26,7 @@ void SeparableBoxFilterDataStore::BuildLabel(size_t column_index, const std::vec EvaluableNodeImmediateValueType value_type; EvaluableNodeImmediateValue value; value_type = entities[entity_index]->GetValueAtLabelAsImmediateValue(label_id, value); - matrix[GetMatrixCellIndex(entity_index) + column_index] = value; + GetValue(entity_index, column_index) = value; column_data->InsertNextIndexValueExceptNumbers(value_type, value, 
entity_index, entities_with_number_values); } @@ -32,6 +35,53 @@ void SeparableBoxFilterDataStore::BuildLabel(size_t column_index, const std::vec std::stable_sort(begin(entities_with_number_values), end(entities_with_number_values)); column_data->AppendSortedNumberIndicesWithSortedIndices(entities_with_number_values); + + OptimizeColumn(column_index); +} + +//converts the column's number cells between raw values and interned indices +// when the column data's heuristics prefer the other form, rewriting each affected matrix cell +void SeparableBoxFilterDataStore::OptimizeColumn(size_t column_index) +{ + auto &column_data = columnData[column_index]; + + if(column_data->numberValuesInterned) + { + if(column_data->AreNumberValuesPreferredToInterns()) + { + for(auto &value_entry : column_data->sortedNumberValueEntries) + { + double value = value_entry->value.number; + for(auto entity_index : value_entry->indicesWithValue) + GetValue(entity_index, column_index).number = value; + } + + for(auto entity_index : column_data->nanIndices) + GetValue(entity_index, column_index).number = std::numeric_limits<double>::quiet_NaN(); + + for(auto entity_index : column_data->nullIndices) + GetValue(entity_index, column_index).number = std::numeric_limits<double>::quiet_NaN(); + + column_data->ConvertNumberInternsToValues(); + } + } + else if(column_data->AreNumberInternsPreferredToValues()) + { + column_data->ConvertNumberValuesToInterns(); + + for(auto &value_entry : column_data->sortedNumberValueEntries) + { + size_t value_index = value_entry->valueInternIndex; + for(auto entity_index : value_entry->indicesWithValue) + GetValue(entity_index, column_index).indirectionIndex = value_index; + } + + for(auto entity_index : column_data->nanIndices) //interned cells hold indices, so write through indirectionIndex rather than number + GetValue(entity_index, column_index).indirectionIndex = SBFDSColumnData::ValueEntry::NAN_INDEX; + + for(auto entity_index : column_data->nullIndices) + GetValue(entity_index, column_index).indirectionIndex = SBFDSColumnData::ValueEntry::NAN_INDEX; + } } void SeparableBoxFilterDataStore::RemoveColumnIndex(size_t column_index_to_remove) @@ -89,15 +137,14 @@ void SeparableBoxFilterDataStore::AddEntity(Entity *entity, size_t entity_index)
EvaluableNodeImmediateValueType value_type; EvaluableNodeImmediateValue value; value_type = entity->GetValueAtLabelAsImmediateValue(columnData[column_index]->stringId, value); - - matrix[cell_index] = value; - - columnData[column_index]->InsertIndexValue(value_type, value, entity_index); + matrix[cell_index] = columnData[column_index]->InsertIndexValue(value_type, value, entity_index); } //count this entity if(entity_index >= numEntities) numEntities = entity_index + 1; + + OptimizeAllColumns(); } void SeparableBoxFilterDataStore::RemoveEntity(Entity *entity, size_t entity_index, size_t entity_index_to_reassign) @@ -133,15 +180,19 @@ void SeparableBoxFilterDataStore::RemoveEntity(Entity *entity, size_t entity_ind //reassign index for each column for(size_t column_index = 0; column_index < columnData.size(); column_index++) { + auto &column_data = columnData[column_index]; + auto &val_to_overwrite = GetValue(entity_index, column_index); - auto &value_of_index_to_reassign = GetValue(entity_index_to_reassign, column_index); + auto type_to_overwrite = column_data->GetIndexValueType(entity_index); + + auto &value_to_reassign = GetValue(entity_index_to_reassign, column_index); auto value_type_to_reassign = columnData[column_index]->GetIndexValueType(entity_index_to_reassign); //remove the value where it is - columnData[column_index]->DeleteIndexValue(value_of_index_to_reassign, entity_index_to_reassign); + columnData[column_index]->DeleteIndexValue(value_type_to_reassign, value_to_reassign, entity_index_to_reassign); //change the destination to the value - columnData[column_index]->ChangeIndexValue(val_to_overwrite, value_type_to_reassign, value_of_index_to_reassign, entity_index); + columnData[column_index]->ChangeIndexValue(type_to_overwrite, val_to_overwrite, value_type_to_reassign, value_to_reassign, entity_index); } //copy data from entity_index_to_reassign to entity_index @@ -149,11 +200,13 @@ void SeparableBoxFilterDataStore::RemoveEntity(Entity *entity, size_t 
entity_ind //truncate matrix cache if removing the last entry, either by moving the last entity or by directly removing the last if(entity_index_to_reassign + 1 == numEntities - || (entity_index_to_reassign + 1 >= numEntities && entity_index + 1 == numEntities)) + || (entity_index_to_reassign + 1 >= numEntities && entity_index + 1 == numEntities)) DeleteLastRow(); //clean up any labels that aren't relevant RemoveAnyUnusedLabels(); + + OptimizeAllColumns(); } void SeparableBoxFilterDataStore::UpdateAllEntityLabels(Entity *entity, size_t entity_index) @@ -164,18 +217,26 @@ void SeparableBoxFilterDataStore::UpdateAllEntityLabels(Entity *entity, size_t e size_t matrix_index = GetMatrixCellIndex(entity_index); for(size_t column_index = 0; column_index < columnData.size(); column_index++) { + auto &column_data = columnData[column_index]; + EvaluableNodeImmediateValueType value_type; EvaluableNodeImmediateValue value; value_type = entity->GetValueAtLabelAsImmediateValue(columnData[column_index]->stringId, value); - columnData[column_index]->ChangeIndexValue(matrix[matrix_index], value_type, value, entity_index); - matrix[matrix_index] = value; + //update the value + auto &matrix_value = matrix[matrix_index]; + auto previous_value_type = column_data->GetIndexValueType(entity_index); + + //assign the matrix location to the updated value (which may be an index) + matrix_value = column_data->ChangeIndexValue(previous_value_type, matrix_value, value_type, value, entity_index); matrix_index++; } //clean up any labels that aren't relevant RemoveAnyUnusedLabels(); + + OptimizeAllColumns(); } void SeparableBoxFilterDataStore::UpdateEntityLabel(Entity *entity, size_t entity_index, StringInternPool::StringID label_updated) @@ -188,20 +249,25 @@ void SeparableBoxFilterDataStore::UpdateEntityLabel(Entity *entity, size_t entit if(column == end(labelIdToColumnIndex)) return; size_t column_index = column->second; + auto &column_data = columnData[column_index]; //get the new value 
EvaluableNodeImmediateValueType value_type; EvaluableNodeImmediateValue value; - value_type = entity->GetValueAtLabelAsImmediateValue(columnData[column_index]->stringId, value); + value_type = entity->GetValueAtLabelAsImmediateValue(column_data->stringId, value); //update the value auto &matrix_value = GetValue(entity_index, column_index); - columnData[column_index]->ChangeIndexValue(matrix_value, value_type, value, entity_index); - matrix_value = value; + auto previous_value_type = column_data->GetIndexValueType(entity_index); + + //assign the matrix location to the updated value (which may be an index) + matrix_value = column_data->ChangeIndexValue(previous_value_type, matrix_value, value_type, value, entity_index); //remove the label if no longer relevant if(IsColumnIndexRemovable(column_index)) RemoveColumnIndex(column_index); + else //only optimize when the column was kept; column_index may no longer refer to this column after removal + OptimizeColumn(column_index); } //populates distances_out with all entities and their distances that have a distance to target less than max_dist @@ -276,15 +342,15 @@ void SeparableBoxFilterDataStore::FindEntitiesWithinDistance(GeneralizedDistance //if there are fewer enabled_indices than the number of unique values for this feature, plus one for unknown values // it is usually faster (less distances to compute) to just compute distance for each unique value and add to associated sums // unless it happens to be that enabled_indices is very skewed - if(column_data->sortedNumberValueIndexPairs.size() < enabled_indices.size()) + if(column_data->sortedNumberValueEntries.size() < enabled_indices.size()) { - for(auto &[entity_list_value, entity_list] : column_data->sortedNumberValueIndexPairs) + for(auto &value_entry : column_data->sortedNumberValueEntries) { //get distance term that is applicable to each entity in this bucket - double distance_term = dist_params.ComputeDistanceTermRegularOneNonNull(target_value.number - entity_list_value, query_feature_index); + double distance_term =
dist_params.ComputeDistanceTermRegularOneNonNull(target_value.number - value_entry->value.number, query_feature_index); //for each bucket, add term to their sums - for(auto entity_index : *entity_list) + for(auto entity_index : value_entry->indicesWithValue) { if(!enabled_indices.contains(entity_index)) continue; @@ -323,9 +389,10 @@ void SeparableBoxFilterDataStore::FindEntitiesWithinDistance(GeneralizedDistance //else, there are less indices to consider than possible unique values, so save computation by just considering entities that are still valid for(auto entity_index : enabled_indices) { - auto &value = GetValue(entity_index, absolute_feature_index); auto value_type = column_data->GetIndexValueType(entity_index); - + auto value = column_data->GetResolvedValue(value_type, GetValue(entity_index, absolute_feature_index)); + value_type = column_data->GetResolvedValueType(value_type); + distances[entity_index] += dist_params.ComputeDistanceTermRegular(target_value, value, target_value_type, value_type, query_feature_index); //remove entity if its distance is already greater than the max_dist @@ -384,14 +451,16 @@ void SeparableBoxFilterDataStore::FindEntitiesNearestToIndexedEntity(Generalized if(dist_params->IsFeatureEnabled(i)) { size_t column_index = found->second; + auto &column_data = columnData[column_index]; - auto &value = matrix[matrix_index_base + column_index]; - auto value_type = columnData[column_index]->GetIndexValueType(search_index); + auto value_type = column_data->GetIndexValueType(search_index); + //overwrite value in case of value interning + auto value = column_data->GetResolvedValue(value_type, matrix[matrix_index_base + column_index]); + value_type = column_data->GetResolvedValueType(value_type); - PopulateNextTargetAttributes(*dist_params, + PopulateNextTargetAttributes(*dist_params, i, target_column_indices, target_values, target_value_types, - column_index, value, value_type, - dist_params->featureParams[i].featureType); + column_index, 
value, value_type); } } @@ -607,7 +676,7 @@ void SeparableBoxFilterDataStore::FindNearestEntities(GeneralizedDistance &dist_ //skip this entity in the next loops enabled_indices.erase(good_match_index); - double distance = ResolveDistanceToNonMatchTargetValues(dist_params,\ + double distance = ResolveDistanceToNonMatchTargetValues(dist_params, target_column_indices, target_values, target_value_types, partial_sums, good_match_index, num_enabled_features); sorted_results.Push(DistanceReferencePair(distance, good_match_index)); } @@ -749,12 +818,14 @@ void SeparableBoxFilterDataStore::FindNearestEntities(GeneralizedDistance &dist_ } } -void SeparableBoxFilterDataStore::DeleteEntityIndexFromColumns(size_t index) +void SeparableBoxFilterDataStore::DeleteEntityIndexFromColumns(size_t entity_index) { for(size_t i = 0; i < columnData.size(); i++) { - auto &feature_value = GetValue(index, i); - columnData[i]->DeleteIndexValue(feature_value, index); + auto &column_data = columnData[i]; + auto &feature_value = GetValue(entity_index, i); + auto feature_type = column_data->GetIndexValueType(entity_index); + columnData[i]->DeleteIndexValue(feature_type, feature_value, entity_index); } } @@ -805,9 +876,9 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G size_t query_feature_index, size_t absolute_feature_index, BitArrayIntegerSet &enabled_indices) { auto &column = columnData[absolute_feature_index]; - auto feature_type = dist_params.featureParams[query_feature_index].featureType; + auto effective_feature_type = dist_params.featureParams[query_feature_index].effectiveFeatureType; - bool value_is_null = (value_type == ENIVT_NULL || (value_type == ENIVT_NUMBER && FastIsNaN(value.number))); + bool value_is_null = EvaluableNodeImmediateValue::IsNullEquivalent(value_type, value); //need to accumulate values for nulls if the value is a null if(value_is_null) { @@ -820,7 +891,7 @@ double 
SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G //if the known-unknown term is less than unknown_unknown (this should be rare if nulls have semantic meaning) //then need to populate the rest of the cases double known_unknown_term = dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index); - if(feature_type == FDT_NOMINAL || known_unknown_term < unknown_unknown_term) + if(effective_feature_type == GeneralizedDistance::EFDT_NOMINAL || known_unknown_term < unknown_unknown_term) { BitArrayIntegerSet &known_unknown_indices = parametersAndBuffers.potentialMatchesSet; known_unknown_indices = enabled_indices; @@ -842,7 +913,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G } //if nominal, only need to compute the exact match - if(feature_type == FDT_NOMINAL) + if(effective_feature_type == GeneralizedDistance::EFDT_NOMINAL) { if(value_type == ENIVT_NUMBER) { @@ -850,7 +921,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G if(exact_index_found) { double term = dist_params.ComputeDistanceTermNominalExactMatch(query_feature_index); - AccumulatePartialSums(*column->sortedNumberValueIndexPairs[value_index].second, query_feature_index, term); + AccumulatePartialSums(column->sortedNumberValueEntries[value_index]->indicesWithValue, query_feature_index, term); } } else if(value_type == ENIVT_STRING_ID) @@ -882,7 +953,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G //didn't find the value return dist_params.ComputeDistanceTermNominalNonMatch(query_feature_index); } - else if(feature_type == FDT_CONTINUOUS_STRING) + else if(effective_feature_type == GeneralizedDistance::EFDT_CONTINUOUS_STRING) { if(value_type == ENIVT_STRING_ID) { @@ -897,7 +968,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G //the next closest string will have an edit distance of 1 return 
dist_params.ComputeDistanceTermNonNominalNonCyclicNonNullRegular(1.0, query_feature_index); } - else if(feature_type == FDT_CONTINUOUS_CODE) + else if(effective_feature_type == GeneralizedDistance::EFDT_CONTINUOUS_CODE) { //compute partial sums for all code of matching size size_t code_size = 1; @@ -918,7 +989,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G //else feature_type == FDT_CONTINUOUS_NUMERIC or FDT_CONTINUOUS_UNIVERSALLY_NUMERIC //if not a number or no numbers available, then no size - if(value_type != ENIVT_NUMBER || column->sortedNumberValueIndexPairs.size() == 0) + if(value_type != ENIVT_NUMBER || column->sortedNumberValueEntries.size() == 0) return GetMaxDistanceTermFromValue(dist_params, value, value_type, query_feature_index, absolute_feature_index); bool cyclic_feature = dist_params.IsFeatureCyclic(query_feature_index); @@ -932,12 +1003,12 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G if(exact_index_found) term = dist_params.ComputeDistanceTermNonNominalExactMatch(query_feature_index); else - term = dist_params.ComputeDistanceTermNonNominalNonNullRegular(value.number - column->sortedNumberValueIndexPairs[value_index].first, query_feature_index); + term = dist_params.ComputeDistanceTermNonNominalNonNullRegular(value.number - column->sortedNumberValueEntries[value_index]->value.number, query_feature_index); - size_t num_entities_computed = AccumulatePartialSums(*column->sortedNumberValueIndexPairs[value_index].second, query_feature_index, term); + size_t num_entities_computed = AccumulatePartialSums(column->sortedNumberValueEntries[value_index]->indicesWithValue, query_feature_index, term); //the logic below assumes there are at least two entries - size_t num_unique_number_values = column->sortedNumberValueIndexPairs.size(); + size_t num_unique_number_values = column->sortedNumberValueEntries.size(); if(num_unique_number_values <= 1) return term; @@ -967,17 +1038,18 @@ 
double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G size_t next_lower_index = 0; if(!cyclic_feature) { - if(lower_value_index > 0) + if(lower_value_index > 1) { next_lower_index = lower_value_index - 1; - lower_diff = std::abs(value.number - column->sortedNumberValueIndexPairs[next_lower_index].first); + lower_diff = std::abs(value.number - column->sortedNumberValueEntries[next_lower_index]->value.number); compute_lower = true; } } else //cyclic_feature { size_t next_index; - if(lower_value_index > 0) + //0th index is unknown + if(lower_value_index > 1) next_index = lower_value_index - 1; else next_index = num_unique_number_values - 1; @@ -986,7 +1058,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G if(next_index != value_index) { next_lower_index = next_index; - lower_diff = GeneralizedDistance::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueIndexPairs[next_lower_index].first), cycle_length); + lower_diff = GeneralizedDistance::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueEntries[next_lower_index]->value.number), cycle_length); compute_lower = true; } } @@ -1000,7 +1072,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G if(upper_value_index + 1 < num_unique_number_values) { next_upper_index = upper_value_index + 1; - upper_diff = std::abs(value.number - column->sortedNumberValueIndexPairs[next_upper_index].first); + upper_diff = std::abs(value.number - column->sortedNumberValueEntries[next_upper_index]->value.number); compute_upper = true; } } @@ -1009,8 +1081,8 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G size_t next_index; if(upper_value_index + 1 < num_unique_number_values) next_index = upper_value_index + 1; - else - next_index = 0; + else //0th index is unknown, start at 1st + next_index = 1; //make sure didn't wrap all the way around for 
cyclic features //either from the value itself or overlapping with the next_lower_index @@ -1019,7 +1091,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G if((!compute_lower || next_index != next_lower_index)) { next_upper_index = next_index; - upper_diff = GeneralizedDistance::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueIndexPairs[next_upper_index].first), cycle_length); + upper_diff = GeneralizedDistance::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueEntries[next_upper_index]->value.number), cycle_length); compute_upper = true; } else //upper and lower have overlapped, want to exit the loop @@ -1056,7 +1128,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G //use heuristic to decide whether to continue populating based on whether this diff will help the overall distance cutoffs // look at the rate of change of the difference compared to before, and how many new entities will be populated // if it is too small and doesn't fill enough (or fills too many), then stop expanding - size_t potential_entities = column->sortedNumberValueIndexPairs[next_closest_index].second->size(); + size_t potential_entities = column->sortedNumberValueEntries[next_closest_index]->indicesWithValue.size(); if(num_entities_computed + potential_entities > max_num_to_find) break; @@ -1081,7 +1153,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G } term = dist_params.ComputeDistanceTermNonNominalNonNullRegular(next_closest_diff, query_feature_index); - num_entities_computed += AccumulatePartialSums(*column->sortedNumberValueIndexPairs[next_closest_index].second, query_feature_index, term); + num_entities_computed += AccumulatePartialSums(column->sortedNumberValueEntries[next_closest_index]->indicesWithValue, query_feature_index, term); //track the rate of change of difference if(next_closest_diff - 
last_diff > largest_diff_delta) diff --git a/src/Amalgam/SeparableBoxFilterDataStore.h b/src/Amalgam/SeparableBoxFilterDataStore.h index 022808ed..306709a7 100644 --- a/src/Amalgam/SeparableBoxFilterDataStore.h +++ b/src/Amalgam/SeparableBoxFilterDataStore.h @@ -114,6 +114,16 @@ class SeparableBoxFilterDataStore // assumes column data is empty void BuildLabel(size_t column_index, const std::vector &entities); + //changes column to/from interning as would yield best performance + void OptimizeColumn(size_t column_ndex); + + //calls OptimizeColumn on all columns + inline void OptimizeAllColumns() + { + for(size_t column_index = 0; column_index < columnData.size(); column_index++) + OptimizeColumn(column_index); + } + //expand the structure by adding a new column/label/feature and populating with data from entities void AddLabels(std::vector &label_ids, const std::vector &entities) { @@ -286,18 +296,20 @@ class SeparableBoxFilterDataStore if(column == labelIdToColumnIndex.end()) return; size_t column_index = column->second; + auto &column_data = columnData[column_index]; - columnData[column_index]->numberIndices.CopyTo(enabled_entities); - columnData[column_index]->nanIndices.EraseTo(enabled_entities); + column_data->numberIndices.CopyTo(enabled_entities); + column_data->nanIndices.EraseTo(enabled_entities); //resize buffers and place each entity and value into its respective buffer entities.resize(enabled_entities.size()); values.resize(enabled_entities.size()); size_t index = 0; + auto value_type = column_data->GetUnresolvedValueType(ENIVT_NUMBER); for(auto entity_index : enabled_entities) { entities[index] = entity_index; - values[index] = GetValue(entity_index, column_index).number; + values[index] = column_data->GetResolvedValue(value_type, GetValue(entity_index, column_index)).number; index++; } } @@ -314,18 +326,20 @@ class SeparableBoxFilterDataStore if(column == labelIdToColumnIndex.end()) return; size_t column_index = column->second; + auto &column_data = 
columnData[column_index]; - columnData[column_index]->numberIndices.IntersectTo(enabled_entities); - columnData[column_index]->nanIndices.EraseTo(enabled_entities); + column_data->numberIndices.IntersectTo(enabled_entities); + column_data->nanIndices.EraseTo(enabled_entities); //resize buffers and place each entity and value into its respective buffer entities.resize(enabled_entities.size()); values.resize(enabled_entities.size()); size_t index = 0; + auto value_type = column_data->GetUnresolvedValueType(ENIVT_NUMBER); for(auto entity_index : enabled_entities) { entities[index] = entity_index; - values[index] = GetValue(entity_index, column_index).number; + values[index] = column_data->GetResolvedValue(value_type, GetValue(entity_index, column_index)).number; index++; } } @@ -414,16 +428,18 @@ class SeparableBoxFilterDataStore template inline std::function GetNumberValueFromEntityIteratorFunction(size_t column_index) { - auto number_indices_ptr = &columnData[column_index]->numberIndices; + auto column_data = columnData[column_index].get(); + auto number_indices_ptr = &column_data->numberIndices; + auto value_type = column_data->GetUnresolvedValueType(ENIVT_NUMBER); - return [&, number_indices_ptr, column_index] + return [&, number_indices_ptr, column_index, column_data, value_type] (Iter i, double &value) { size_t entity_index = *i; if(!number_indices_ptr->contains(entity_index)) return false; - value = GetValue(entity_index, column_index).number; + value = column_data->GetResolvedValue(value_type, GetValue(entity_index, column_index)).number; return true; }; } @@ -436,15 +452,17 @@ class SeparableBoxFilterDataStore if(column_index >= columnData.size()) return [](size_t i, double &value) { return false; }; - auto number_indices_ptr = &columnData[column_index]->numberIndices; + auto column_data = columnData[column_index].get(); + auto number_indices_ptr = &column_data->numberIndices; + auto value_type = column_data->GetUnresolvedValueType(ENIVT_NUMBER); - return [&, 
number_indices_ptr, column_index] + return [&, number_indices_ptr, column_index, column_data, value_type] (size_t i, double &value) { if(!number_indices_ptr->contains(i)) return false; - value = GetValue(i, column_index).number; + value = column_data->GetResolvedValue(value_type, GetValue(i, column_index)).number; return true; }; } @@ -504,7 +522,7 @@ class SeparableBoxFilterDataStore } //deletes the index and associated data - void DeleteEntityIndexFromColumns(size_t index); + void DeleteEntityIndexFromColumns(size_t entity_index); //adds a new labels to the database, populating new cells with -NaN, and updating the number of entities // assumes label_ids is not empty and num_entities is nonzero @@ -522,12 +540,15 @@ class SeparableBoxFilterDataStore auto &partial_sums = parametersAndBuffers.partialSums; const auto accum_location = partial_sums.GetAccumLocation(query_feature_index); + auto &column_data = columnData[absolute_feature_index]; + //for each found element, accumulate associated partial sums for(size_t entity_index : entity_indices) { //get value - auto &other_value = GetValue(entity_index, absolute_feature_index); - auto other_value_type = columnData[absolute_feature_index]->GetIndexValueType(entity_index); + auto other_value_type = column_data->GetIndexValueType(entity_index); + auto other_value = column_data->GetResolvedValue(other_value_type, GetValue(entity_index, absolute_feature_index)); + other_value_type = column_data->GetResolvedValueType(other_value_type); //compute term double term = dist_params.ComputeDistanceTermRegular(value, other_value, value_type, other_value_type, query_feature_index); @@ -663,8 +684,11 @@ class SeparableBoxFilterDataStore if(dist_params.IsFeatureEnabled(i)) { size_t column_index = target_column_indices[i]; - auto &other_value = matrix[matrix_base_position + column_index]; - auto other_value_type = columnData[column_index]->GetIndexValueType(other_index); + auto &column_data = columnData[column_index]; + + auto 
other_value_type = column_data->GetIndexValueType(other_index); + auto other_value = column_data->GetResolvedValue(other_value_type, matrix[matrix_base_position + column_index]); + other_value_type = column_data->GetResolvedValueType(other_value_type); dist_accum += dist_params.ComputeDistanceTermRegular(target_values[i], other_value, target_value_types[i], other_value_type, i); } @@ -681,41 +705,62 @@ class SeparableBoxFilterDataStore std::vector &target_values, std::vector &target_value_types, size_t entity_index, size_t query_feature_index) { - auto feature_type = dist_params.featureParams[query_feature_index].featureType; - - if(feature_type == FDT_NOMINAL) + switch(dist_params.featureParams[query_feature_index].effectiveFeatureType) + { + case GeneralizedDistance::EFDT_NOMINAL: return dist_params.ComputeDistanceTermNominalNonMatch(query_feature_index); - else + + case GeneralizedDistance::EFDT_CONTINUOUS_UNIVERSALLY_NUMERIC: { const size_t column_index = target_label_indices[query_feature_index]; + return dist_params.ComputeDistanceTermNonNominalNonCyclicOneNonNullRegular(target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index); + } - if(feature_type == FDT_CONTINUOUS_UNIVERSALLY_NUMERIC) - { + case GeneralizedDistance::EFDT_VALUES_UNIVERSALLY_PRECOMPUTED: + { + const size_t column_index = target_label_indices[query_feature_index]; + return dist_params.ComputeDistanceTermNumberInterned(GetValue(entity_index, column_index).indirectionIndex, query_feature_index); + } + + case GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC: + { + const size_t column_index = target_label_indices[query_feature_index]; + auto &column_data = columnData[column_index]; + if(column_data->numberIndices.contains(entity_index)) return dist_params.ComputeDistanceTermNonNominalNonCyclicOneNonNullRegular(target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index); - } - else if(feature_type == 
FDT_CONTINUOUS_NUMERIC) - { - auto &column_data = columnData[column_index]; - if(column_data->numberIndices.contains(entity_index)) - return dist_params.ComputeDistanceTermNonNominalNonCyclicOneNonNullRegular(target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index); - else - return dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index); - } - else if(feature_type == FDT_CONTINUOUS_NUMERIC_CYCLIC) - { - auto &column_data = columnData[column_index]; - if(column_data->numberIndices.contains(entity_index)) - return dist_params.ComputeDistanceTermNonNominalOneNonNullRegular(target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index); - else - return dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index); - } - else //feature_type == FDT_CONTINUOUS_CODE - { - auto &other_value = GetValue(entity_index, column_index); - auto other_value_type = columnData[column_index]->GetIndexValueType(entity_index); + else + return dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index); + } - return dist_params.ComputeDistanceTermRegular(target_values[query_feature_index], other_value, target_value_types[query_feature_index], other_value_type, query_feature_index); - } + case GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC_CYCLIC: + { + const size_t column_index = target_label_indices[query_feature_index]; + auto &column_data = columnData[column_index]; + if(column_data->numberIndices.contains(entity_index)) + return dist_params.ComputeDistanceTermNonNominalOneNonNullRegular(target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index); + else + return dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index); + } + + case GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC_PRECOMPUTED: + { + const size_t column_index = target_label_indices[query_feature_index]; + auto &column_data = 
columnData[column_index]; + if(column_data->numberIndices.contains(entity_index)) + return dist_params.ComputeDistanceTermNumberInterned(GetValue(entity_index, column_index).indirectionIndex, query_feature_index); + else + return dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index); + } + + default: //GeneralizedDistance::EFDT_CONTINUOUS_STRING or GeneralizedDistance::EFDT_CONTINUOUS_CODE + { + const size_t column_index = target_label_indices[query_feature_index]; + auto &column_data = columnData[column_index]; + auto other_value_type = column_data->GetIndexValueType(entity_index); + auto other_value = column_data->GetResolvedValue(other_value_type, GetValue(entity_index, column_index)); + + return dist_params.ComputeDistanceTermRegular(target_values[query_feature_index], other_value, target_value_types[query_feature_index], other_value_type, query_feature_index); + } } } @@ -782,50 +827,73 @@ class SeparableBoxFilterDataStore entity_index, query_feature_index); //break out of the loop before the iterator is incremented to save a few cycles - if(distance > reject_distance) - return std::make_pair(false, distance); - - if(num_uncalculated_features == 0) - break; + //do this via logic to minimize the number of branches + bool unacceptable_distance = (distance > reject_distance); + if(unacceptable_distance || num_uncalculated_features == 0) + return std::make_pair(!unacceptable_distance, distance); } - //done with computation + //shouldn't make it here return std::make_pair(true, distance); } - //populates the next target attribute in each vector based on column_index, position data, and mkdist_feature_type - // if mkdist_feature_type can be modified for efficiency, this function will update it, which is why it is passed by reference - __forceinline void PopulateNextTargetAttributes(GeneralizedDistance &dist_params, + //populates the next target attribute in each vector based on column_index, position data + //if there is a specialization of the feature 
type, it will update it and update dist_params accordingly + __forceinline void PopulateNextTargetAttributes(GeneralizedDistance &dist_params, size_t query_feature_index, std::vector &target_column_indices, std::vector &target_values, std::vector &target_value_types, size_t column_index, - EvaluableNodeImmediateValue &position_value, EvaluableNodeImmediateValueType position_value_type, - FeatureDifferenceType &mkdist_feature_type) + EvaluableNodeImmediateValue &position_value, EvaluableNodeImmediateValueType position_value_type) { target_column_indices.push_back(column_index); - if(mkdist_feature_type == FDT_NOMINAL || mkdist_feature_type == FDT_CONTINUOUS_STRING || mkdist_feature_type == FDT_CONTINUOUS_CODE) + auto &feature_type = dist_params.featureParams[query_feature_index].featureType; + auto &effective_feature_type = dist_params.featureParams[query_feature_index].effectiveFeatureType; + + if(feature_type == GeneralizedDistance::FDT_NOMINAL + || feature_type == GeneralizedDistance::FDT_CONTINUOUS_STRING + || feature_type == GeneralizedDistance::FDT_CONTINUOUS_CODE) { target_values.push_back(position_value); target_value_types.push_back(position_value_type); + + if(feature_type == GeneralizedDistance::FDT_NOMINAL) + effective_feature_type = GeneralizedDistance::EFDT_NOMINAL; + else if(feature_type == GeneralizedDistance::FDT_CONTINUOUS_STRING) + effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_STRING; + else if(feature_type == GeneralizedDistance::FDT_CONTINUOUS_CODE) + effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_CODE; } - else // mkdist_feature_type == FDT_CONTINUOUS_NUMERIC or FDT_CONTINUOUS_NUMERIC_CYCLIC + else // feature_type is some form of numeric { - //if everything is either non-existant or numeric, then can shortcut later + //looking for continuous; if not a number, so just put as nan + double position_value_numeric = (position_value_type == ENIVT_NUMBER ? 
position_value.number : std::numeric_limits::quiet_NaN()); + target_values.push_back(position_value_numeric); + target_value_types.push_back(ENIVT_NUMBER); + + //set up effective_feature_type auto &column_data = columnData[column_index]; + + //determine if all values are numeric size_t num_values_stored_as_numbers = column_data->numberIndices.size() + column_data->invalidIndices.size() + column_data->nullIndices.size(); - if(GetNumInsertedEntities() == num_values_stored_as_numbers && mkdist_feature_type == FDT_CONTINUOUS_NUMERIC) - mkdist_feature_type = FDT_CONTINUOUS_UNIVERSALLY_NUMERIC; + bool all_values_numeric = (GetNumInsertedEntities() == num_values_stored_as_numbers); - auto value_type = position_value_type; - if(value_type == ENIVT_NUMBER) + if(column_data->numberValuesInterned) { - target_values.push_back(position_value); - target_value_types.push_back(ENIVT_NUMBER); + if(all_values_numeric) + effective_feature_type = GeneralizedDistance::EFDT_VALUES_UNIVERSALLY_PRECOMPUTED; + else + effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC_PRECOMPUTED; + + dist_params.ComputeAndStoreInternedNumberValuesAndDistanceTerms(query_feature_index, position_value_numeric, &column_data->internedNumberIndexToNumberValue); } - else //looking for continuous and not a number, so just put as nan + else { - target_values.push_back(std::numeric_limits::quiet_NaN()); - target_value_types.push_back(ENIVT_NUMBER); + if(all_values_numeric && feature_type == GeneralizedDistance::FDT_CONTINUOUS_NUMERIC) + effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_UNIVERSALLY_NUMERIC; + else if(feature_type == GeneralizedDistance::FDT_CONTINUOUS_NUMERIC_CYCLIC) + effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC_CYCLIC; + else + effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC; } } } @@ -853,10 +921,9 @@ class SeparableBoxFilterDataStore if(dist_params.IsFeatureEnabled(i)) { - PopulateNextTargetAttributes(dist_params, + 
PopulateNextTargetAttributes(dist_params, i, target_column_indices, target_values, target_value_types, - column->second, position_values[i], position_value_types[i], - dist_params.featureParams[i].featureType); + column->second, position_values[i], position_value_types[i]); } } } @@ -889,7 +956,8 @@ class SeparableBoxFilterDataStore feature_params.unknownToUnknownDifference = unknown_distance_term; } - dist_params.ComputeAndStoreUncertaintyDistanceTerms(i); + dist_params.ComputeAndStoreUncertaintyDistanceTerms(i, + EvaluableNodeImmediateValue::IsNullEquivalent(target_value_types[i], target_values[i])); } } diff --git a/src/Amalgam/amlg_code/test.amlg b/src/Amalgam/amlg_code/test.amlg index 2847c612..55668ac8 100644 --- a/src/Amalgam/amlg_code/test.amlg +++ b/src/Amalgam/amlg_code/test.amlg @@ -1,4 +1,28 @@ (seq - ;(print (format (list (assoc a 3 b 4) (assoc c "c" d (null))) "code" "yaml") "\n") - (print (format (true) "code" "yaml") "\n") + (create_entities "BoxConvictionTestContainer" (null) ) + + (create_entities (list "BoxConvictionTestContainer" "vert0") (lambda + (null ##x 0 ##y 0 ##weight 2) + ) ) + + (create_entities (list "BoxConvictionTestContainer" "vert1") (lambda + (null ##x 0 ##y 1 ##weight 1) + ) ) + + (create_entities (list "BoxConvictionTestContainer" "vert2") (lambda + (null ##x 1 ##y 0 ##weight 1) + ) ) + + (create_entities (list "BoxConvictionTestContainer" "vert3") (lambda + (null ##x 2 ##y 1 ##weight 1) + ) ) + + ;should print: + ;dc: (list + ;(list "vert0" "vert1" "vert2" "vert3") + ;(list 1 1 1 1.4142135623730951) + ;) + (print "dc: " (compute_on_contained_entities "BoxConvictionTestContainer" (list + (compute_entity_distance_contributions 1 (list "x" "y") (list "vert3") (null) (null) (null) (null) 2.0 -1 (null) "fixed_seed" (null) "recompute_precise" (true)) + ))) ) \ No newline at end of file diff --git a/src/Amalgam/entity/EntityQueryBuilder.h b/src/Amalgam/entity/EntityQueryBuilder.h index 692d7dfc..24db9321 100644 --- 
a/src/Amalgam/entity/EntityQueryBuilder.h +++ b/src/Amalgam/entity/EntityQueryBuilder.h @@ -64,18 +64,18 @@ namespace EntityQueryBuilder [&dist_params](size_t i, bool found, EvaluableNode *en) { if(i < dist_params.featureParams.size()) { - auto feature_type = FDT_CONTINUOUS_NUMERIC; + auto feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC; if(found) { StringInternPool::StringID feature_type_id = EvaluableNode::ToStringIDIfExists(en); switch(feature_type_id) { - case ENBISI_nominal: feature_type = FDT_NOMINAL; break; - case ENBISI_continuous: feature_type = FDT_CONTINUOUS_NUMERIC; break; - case ENBISI_cyclic: feature_type = FDT_CONTINUOUS_NUMERIC_CYCLIC; break; - case GetStringIdFromNodeTypeFromString(ENT_STRING): feature_type = FDT_CONTINUOUS_STRING; break; - case ENBISI_code: feature_type = FDT_CONTINUOUS_CODE; break; - default: feature_type = FDT_CONTINUOUS_NUMERIC; break; + case ENBISI_nominal: feature_type = GeneralizedDistance::FDT_NOMINAL; break; + case ENBISI_continuous: feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC; break; + case ENBISI_cyclic: feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC_CYCLIC; break; + case GetStringIdFromNodeTypeFromString(ENT_STRING): feature_type = GeneralizedDistance::FDT_CONTINUOUS_STRING; break; + case ENBISI_code: feature_type = GeneralizedDistance::FDT_CONTINUOUS_CODE; break; + default: feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC; break; } } dist_params.featureParams[i].featureType = feature_type; @@ -93,7 +93,7 @@ namespace EntityQueryBuilder //get attributes based on feature type switch(dist_params.featureParams[i].featureType) { - case FDT_NOMINAL: + case GeneralizedDistance::FDT_NOMINAL: if(found && !EvaluableNode::IsNull(en)) { if(en->EvaluableNode::IsOrderedArray()) @@ -118,7 +118,7 @@ namespace EntityQueryBuilder } break; - case FDT_CONTINUOUS_NUMERIC_CYCLIC: + case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC_CYCLIC: if(found && !EvaluableNode::IsNull(en)) { 
if(en->EvaluableNode::IsOrderedArray()) @@ -139,14 +139,13 @@ namespace EntityQueryBuilder } else //can't be cyclic without a range { - dist_params.featureParams[i].featureType = FDT_CONTINUOUS_NUMERIC; + dist_params.featureParams[i].featureType = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC; } break; - case FDT_CONTINUOUS_NUMERIC: - case FDT_CONTINUOUS_UNIVERSALLY_NUMERIC: - case FDT_CONTINUOUS_STRING: - case FDT_CONTINUOUS_CODE: + case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC: + case GeneralizedDistance::FDT_CONTINUOUS_STRING: + case GeneralizedDistance::FDT_CONTINUOUS_CODE: if(found && !EvaluableNode::IsNull(en)) { if(en->EvaluableNode::IsOrderedArray()) diff --git a/src/Amalgam/evaluablenode/EvaluableNode.h b/src/Amalgam/evaluablenode/EvaluableNode.h index fc3d9245..a096a225 100644 --- a/src/Amalgam/evaluablenode/EvaluableNode.h +++ b/src/Amalgam/evaluablenode/EvaluableNode.h @@ -916,11 +916,12 @@ class EvaluableNode // compare two values based on their collective types enum EvaluableNodeImmediateValueType { - ENIVT_NOT_EXIST, //there is nothing to even hold the data - ENIVT_NULL, //no data being held - ENIVT_NUMBER, //number - ENIVT_STRING_ID, //stringID - ENIVT_CODE //code (more general than any of the above) + ENIVT_NOT_EXIST, //there is nothing to even hold the data + ENIVT_NULL, //no data being held + ENIVT_NUMBER, //number + ENIVT_STRING_ID, //stringID + ENIVT_CODE, //code (more general than any of the above) + ENIVT_NUMBER_INDIRECTION_INDEX //not a real EvaluableNode type, but an index to some data structure that has a number }; //structure that can hold the most immediate value type of an EvaluableNode @@ -992,28 +993,30 @@ union EvaluableNodeImmediateValue return false; //types are the same, just use type_1 for reference - if(type_1 == ENIVT_NUMBER) - { - if(EqualIncludingNaN(value_1.number, value_2.number)) - return false; - } + if(type_1 == ENIVT_NULL) + return true; + else if(type_1 == ENIVT_NUMBER) + return EqualIncludingNaN(value_1.number, 
value_2.number); else if(type_1 == ENIVT_STRING_ID) - { - if(value_1.stringID == value_2.stringID) - return false; - } + return (value_1.stringID == value_2.stringID); + else if(type_1 == ENIVT_NUMBER_INDIRECTION_INDEX) + return (value_1.indirectionIndex == value_2.indirectionIndex); else - { - if(EvaluableNode::AreDeepEqual(value_1.code, value_2.code)) - return false; - } + return EvaluableNode::AreDeepEqual(value_1.code, value_2.code); + } - return true; + //returns true if it is a null or null equivalent + static bool IsNullEquivalent(EvaluableNodeImmediateValueType type, EvaluableNodeImmediateValue &value) + { + return (type == ENIVT_NULL + || (type == ENIVT_NUMBER && FastIsNaN(value.number)) + || (type == ENIVT_STRING_ID && value.stringID == string_intern_pool.NOT_A_STRING_ID)); } double number; StringInternPool::StringID stringID; EvaluableNode *code; + size_t indirectionIndex; }; //used for storing a value and type together diff --git a/src/Amalgam/out.txt b/src/Amalgam/out.txt index 82c39bf5..249653cb 100644 --- a/src/Amalgam/out.txt +++ b/src/Amalgam/out.txt @@ -1227,7 +1227,7 @@ abcdef 8 ) accum_string "abcdef" - argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") + argv (list "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") bar (declare (assoc x 6) (+ x 2) @@ -1240,10 +1240,10 @@ abcdef A (assoc B 2) B 2 ) - interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" + interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 1695320985.913567 + start_time 1697958069.900312 www 1 x 12 zz 10 @@ -1270,7 +1270,7 @@ abcdef 8 ) accum_string "abcdef" - argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") + argv (list "C:\\Users\\Chris 
Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") bar (declare (assoc x 6) (+ x 2) @@ -1283,10 +1283,10 @@ abcdef A (assoc B 2) B 2 ) - interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" + interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 1695320985.913567 + start_time 1697958069.900312 www 1 x 12 zz 10 @@ -1312,7 +1312,7 @@ abcdef 8 ) accum_string "abcdef" - argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") + argv (list "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg") bar (declare (assoc x 6) (+ x 2) @@ -1325,10 +1325,10 @@ abcdef A (assoc B 2) B 2 ) - interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" + interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe" raaa 2 rwww 1 - start_time 1695320985.913567 + start_time 1697958069.900312 www 1 x 12 zz 10 @@ -1596,7 +1596,7 @@ e: - .inf 25: (assoc a 1) -current date-time in epoch: 2023-09-21-14.29.45.9780260 +current date-time in epoch: 2023-10-22-03.01.10.1536540 2020-06-07 00:22:59 1391230800 1391230800 @@ -3500,7 +3500,7 @@ deep sets --set_entity_root_permission-- RootTest -1695320986.253307 +1697958070.384504 (true) RootTest @@ -3730,7 +3730,7 @@ hello ) ) ) - (set_entity_rand_seed new_entity "ށT1Sx0є-I") + (set_entity_rand_seed new_entity "b1L0є-I") (set_entity_rand_seed (first (create_entities @@ -3743,7 +3743,7 @@ hello ) ) ) - "Tx\"J5O" + "98V:o" ) (set_entity_rand_seed (first @@ -3779,7 +3779,7 @@ hello ) ) ) - (set_entity_rand_seed new_entity "ށT1Sx0є-I") + (set_entity_rand_seed new_entity "b1L0є-I") (set_entity_rand_seed (first (create_entities @@ -4243,13 +4243,13 @@ case convictions unweighted: ) case convictions weighted 
by object (with erroneously long nominal): (assoc - TestContainerExec 0.28356810230095286 - vert0 1.3005955468751227 - vert1 1.3005955468751227 - vert2 1.1369409848092722 - vert3 1.272922458938727 - vert4 73.971185205438 - vert5 3.8897264529001325 + TestContainerExec 0.2828909712209332 + vert0 1.2974898602334366 + vert1 1.2974898602334366 + vert2 1.1342260882087178 + vert3 1.2974898602334366 + vert4 73.7945497230898 + vert5 3.880438191446671 ) case convictions x exists before: (assoc entity3 1.0000000933277426 entity4 0.9999998458521889 entity5 1.0000000608201045) @@ -4487,7 +4487,7 @@ a (list "hello" "!") (assoc a1 1.4142135623730951 a2 2 a3 1.4142135623730951) (assoc a1 1.4142135623730951 a3 1.4142135623730951) -(assoc a3 1.4142135623730951) +(assoc a1 1.4142135623730951) (assoc a1 5.0990195135927845 a2 2 a3 5.0990195135927845) (assoc a1 1 a3 1 a4 0) --accuracy tests-- @@ -4634,16 +4634,16 @@ cyclic KL: (assoc vert1 0.0020695242435298626 vert2 0.0020695242435298626 vert3 0.03622271709266012 - vert4 0.06081391029364311 + vert4 0.05872535496117577 ) cyclic conviction: (assoc - vert0 0.5137287240708814 - vert1 16.01087833672136 - vert2 16.01087833672136 - vert3 0.9147547047144656 - vert4 0.5448572656824459 + vert0 0.5072524658798658 + vert1 15.809039161461927 + vert2 15.809039161461927 + vert3 0.9032229616532002 + vert4 0.5571220443569613 ) -cyclic group kl divergence: 0.06081391029364306 +cyclic group kl divergence: 0.05872535496117583 surprisal transforms probabilities: (list (list "vert0" "vert1" "vert2" "vert3") @@ -4723,4 +4723,4 @@ Expecting 1000: 1000 concurrent entity writes successful: (true) --total execution time-- -1.1435298919677734 +1.1228001117706299