Skip to content

Commit

Permalink
Merge branch 'main' into 21799-sbfds-remove-matrix
Browse files Browse the repository at this point in the history
  • Loading branch information
howsohazard authored Dec 12, 2024
2 parents 7737246 + 8ad85c5 commit 3de000a
Show file tree
Hide file tree
Showing 4 changed files with 317 additions and 91 deletions.
53 changes: 44 additions & 9 deletions src/Amalgam/GeneralizedDistance.h
Original file line number Diff line number Diff line change
Expand Up @@ -1132,7 +1132,12 @@ class RepeatedGeneralizedDistanceEvaluator
double default_deviation = deviations_for_value->second.defaultDeviation;
if(FastIsNaN(default_deviation))
{
deviations.defaultDeviation
deviations_for_value->second.defaultDeviation = feature_attributes.deviation;

feature_data.defaultNominalMatchDistanceTerm
= distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy);

feature_data.defaultNominalNonMatchDistanceTerm
= distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy);
}
else
Expand All @@ -1144,10 +1149,15 @@ class RepeatedGeneralizedDistanceEvaluator
//divide the probability among the other classes
double prob_class_given_nonmatch = default_deviation / nonmatching_classes;

deviations.defaultDeviation
= distEvaluator->ComputeDistanceTermNominalNonmatchFromMatchProbabilities(
feature_data.defaultNominalMatchDistanceTerm =
distEvaluator->ComputeDistanceTermNominalMatchFromMatchProbabilities(
index, prob_class_given_match, high_accuracy);

feature_data.defaultNominalNonMatchDistanceTerm =
distEvaluator->ComputeDistanceTermNominalNonmatchFromMatchProbabilities(
index, prob_class_given_match, prob_class_given_nonmatch, high_accuracy);
}
return;
}
}
else if(feature_data.targetValue.nodeType == ENIVT_STRING_ID)
Expand All @@ -1169,7 +1179,12 @@ class RepeatedGeneralizedDistanceEvaluator
double default_deviation = deviations_for_sid->second.defaultDeviation;
if(FastIsNaN(default_deviation))
{
deviations.defaultDeviation
deviations_for_sid->second.defaultDeviation = feature_attributes.deviation;

feature_data.defaultNominalMatchDistanceTerm
= distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy);

feature_data.defaultNominalNonMatchDistanceTerm
= distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy);
}
else
Expand All @@ -1181,12 +1196,24 @@ class RepeatedGeneralizedDistanceEvaluator
//divide the probability among the other classes
double prob_class_given_nonmatch = default_deviation / nonmatching_classes;

deviations.defaultDeviation
feature_data.defaultNominalMatchDistanceTerm =
distEvaluator->ComputeDistanceTermNominalMatchFromMatchProbabilities(
index, prob_class_given_match, high_accuracy);

feature_data.defaultNominalNonMatchDistanceTerm
= distEvaluator->ComputeDistanceTermNominalNonmatchFromMatchProbabilities(
index, prob_class_given_match, prob_class_given_nonmatch, high_accuracy);
}
return;
}
}

//made it here, so didn't find anything in the SDM. use fallback for default nominal terms
feature_data.defaultNominalMatchDistanceTerm =
distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy);

feature_data.defaultNominalNonMatchDistanceTerm =
distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy);
}

//for the feature index, computes and stores the distance terms as measured from value to each interned value
Expand Down Expand Up @@ -1274,7 +1301,7 @@ class RepeatedGeneralizedDistanceEvaluator
return dist_term_entry->second;

if(other_value.number == feature_data.targetValue.GetValueAsNumber())
return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy);
return feature_data.defaultNominalMatchDistanceTerm;
}
else if(other_type == ENIVT_STRING_ID)
{
Expand All @@ -1283,7 +1310,7 @@ class RepeatedGeneralizedDistanceEvaluator
return dist_term_entry->second;

if(other_value.stringID == feature_data.targetValue.GetValueAsStringIDIfExists())
return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy);
return feature_data.defaultNominalMatchDistanceTerm;
}

if(EvaluableNodeImmediateValue::IsNull(other_type, other_value))
Expand All @@ -1295,7 +1322,7 @@ class RepeatedGeneralizedDistanceEvaluator
}
else
{
return distEvaluator->ComputeDistanceTermNominalUniversallySymmetricNonMatch(index, high_accuracy);
return feature_data.defaultNominalNonMatchDistanceTerm;
}
}
else
Expand Down Expand Up @@ -1363,7 +1390,7 @@ class RepeatedGeneralizedDistanceEvaluator
//returns the smallest nonmatching distance term regardless of value
__forceinline double ComputeDistanceTermNominalNonNullSmallestNonmatch(size_t index, bool high_accuracy)
{
double match_dist_term = distEvaluator->ComputeDistanceTermNominalUniversallySymmetricExactMatch(index, high_accuracy);
double match_dist_term = featureData[index].defaultNominalMatchDistanceTerm;
double smallest_nonmatch = ComputeDistanceTermNonNullNominalNextSmallest(match_dist_term, index, high_accuracy);

//if there is no such value, return infinite
Expand Down Expand Up @@ -1408,6 +1435,8 @@ class RepeatedGeneralizedDistanceEvaluator
void Clear()
{
effectiveFeatureType = EFDT_CONTINUOUS_NUMERIC;
defaultNominalMatchDistanceTerm = 0.0;
defaultNominalNonMatchDistanceTerm = 0.0;
precomputedRemainingIdenticalDistanceTerm = 0.0;
internedDistanceTerms.clear();
nominalStringDistanceTerms.clear();
Expand All @@ -1430,6 +1459,12 @@ class RepeatedGeneralizedDistanceEvaluator
//target that the distance will be computed to
EvaluableNodeImmediateValueWithType targetValue;

//the default nominal matching distance term if a term is not in the distance term matrix
double defaultNominalMatchDistanceTerm;

//the default nominal nonmatching distance term if a term is not in the distance term matrix
double defaultNominalNonMatchDistanceTerm;

//the distance term for EFDT_REMAINING_IDENTICAL_PRECOMPUTED
double precomputedRemainingIdenticalDistanceTerm;

Expand Down
6 changes: 6 additions & 0 deletions src/Amalgam/SBFDSColumnData.h
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,12 @@ class SBFDSColumnData
+ (valueCodeSizeToIndices.size() + codeIndices.size()) / 2;
}

//counts the valid data elements in the column, i.e., those that exist and are not null
// computed as the combined number of entries tracked in numberIndices, stringIdIndices, and codeIndices
inline size_t GetNumValidDataElements()
{
	const auto num_number_elements = numberIndices.size();
	const auto num_string_id_elements = stringIdIndices.size();
	const auto num_code_elements = codeIndices.size();

	return num_number_elements + num_string_id_elements + num_code_elements;
}

//returns the maximum difference between value and any other value for this column
//if empty, will return infinity
inline double GetMaxDifferenceTerm(GeneralizedDistanceEvaluator::FeatureAttributes &feature_attribs)
Expand Down
2 changes: 1 addition & 1 deletion src/Amalgam/SeparableBoxFilterDataStore.h
Original file line number Diff line number Diff line change
Expand Up @@ -1013,7 +1013,7 @@ class SeparableBoxFilterDataStore
if(feature_attribs.IsFeatureNominal())
{
if(FastIsNaN(feature_attribs.typeAttributes.nominalCount))
feature_attribs.typeAttributes.nominalCount = static_cast<double>(column_data->GetNumUniqueValues());
feature_attribs.typeAttributes.nominalCount = static_cast<double>(column_data->GetNumValidDataElements());
}
}
}
Expand Down
Loading

0 comments on commit 3de000a

Please sign in to comment.