Skip to content

Commit

Permalink
17630 & 17861: Implements number interning, improves change value eff…
Browse files Browse the repository at this point in the history
…iciency, fixes cyclic bug (#25)
  • Loading branch information
howsohazard authored Oct 27, 2023
1 parent 475d567 commit d08b199
Show file tree
Hide file tree
Showing 9 changed files with 1,059 additions and 368 deletions.
174 changes: 128 additions & 46 deletions src/Amalgam/GeneralizedDistance.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,45 @@
//If defined, will use the Laplace LK metric (default). Otherwise will use Gaussian.
#define DISTANCE_USE_LAPLACE_LK_METRIC true

//general class of feature comparisons
// align at 64-bits in order to play nice with data alignment where it is used
enum FeatureDifferenceType : uint64_t
{
FDT_NOMINAL,
//continuous, but without cycles
FDT_CONTINUOUS_NUMERIC,
//like FDT_CONTINUOUS_NUMERIC, but guarantees everything is always numeric
FDT_CONTINUOUS_UNIVERSALLY_NUMERIC,
//like FDT_CONTINUOUS_NUMERIC, but has cycles
FDT_CONTINUOUS_NUMERIC_CYCLIC,
//edit distance between strings
FDT_CONTINUOUS_STRING,
//continuous measures of the number of nodes different between two sets of code
FDT_CONTINUOUS_CODE,
};

//base data struct for holding distance parameters and metadata
//generalizes Minkowski distance, information theoretic surprisal as a distance, and Lukaszyk–Karmowski
class GeneralizedDistance
{
public:
//initialization functions

//general class of feature comparisons
// align at 32-bits in order to play nice with data alignment where it is used
enum FeatureDifferenceType : uint32_t
{
FDT_NOMINAL,
//continuous without cycles, may contain nonnumeric data
FDT_CONTINUOUS_NUMERIC,
//like FDT_CONTINUOUS_NUMERIC, but has cycles
FDT_CONTINUOUS_NUMERIC_CYCLIC,
//edit distance between strings
FDT_CONTINUOUS_STRING,
//continuous measures of the number of nodes different between two sets of code
FDT_CONTINUOUS_CODE,
};

enum EffectiveFeatureDifferenceType : uint32_t
{
EFDT_NOMINAL,
//everything is precomputed from interned values that are looked up
EFDT_VALUES_UNIVERSALLY_PRECOMPUTED,
//continuous without cycles, but everything is always numeric
EFDT_CONTINUOUS_UNIVERSALLY_NUMERIC,
//continuous without cycles, may contain nonnumeric data
EFDT_CONTINUOUS_NUMERIC,
//like FDT_CONTINUOUS_NUMERIC, but has cycles
EFDT_CONTINUOUS_NUMERIC_CYCLIC,
//continuous precomputed (cyclic or not), may contain nonnumeric data
EFDT_CONTINUOUS_NUMERIC_PRECOMPUTED,
//edit distance between strings
EFDT_CONTINUOUS_STRING,
//continuous measures of the number of nodes different between two sets of code
EFDT_CONTINUOUS_CODE,
};

//dynamically precompute and cache nominal deltas and defaults everytime the pValue is set
inline void SetAndConstrainParams()
Expand Down Expand Up @@ -70,7 +86,8 @@ class GeneralizedDistance

//computes and sets unknownToUnknownDistanceTerm and knownToUnknownDistanceTerm based on
// unknownToUnknownDifference and knownToUnknownDifference respectively
inline void ComputeAndStoreUncertaintyDistanceTerms(size_t index)
//if target_value_is_null_equivalent is true, it will update any precomputed values as necessary
inline void ComputeAndStoreUncertaintyDistanceTerms(size_t index, bool target_value_is_null_equivalent = false)
{
bool compute_accurate = NeedToPrecomputeAccurate();
bool compute_approximate = NeedToPrecomputeApproximate();
Expand Down Expand Up @@ -98,24 +115,64 @@ class GeneralizedDistance
if(feature_params.knownToUnknownDifference == feature_params.unknownToUnknownDifference)
{
feature_params.knownToUnknownDistanceTerm = feature_params.unknownToUnknownDistanceTerm;
return;
}
else
{
//compute knownToUnknownDistanceTerm
if(compute_accurate)
{
feature_params.knownToUnknownDistanceTerm.SetValue(
ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference,
index, ExactApproxValuePair::EXACT),
ExactApproxValuePair::EXACT);
}

if(compute_approximate)
{
feature_params.knownToUnknownDistanceTerm.SetValue(
ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference,
index, ExactApproxValuePair::APPROX),
ExactApproxValuePair::APPROX);
}
}

//compute knownToUnknownDistanceTerm
if(compute_accurate)
if(HasNumberInternValues(index))
{
auto &precomputed_terms = feature_params.precomputedInternDistanceTerms;

if(target_value_is_null_equivalent)
{
precomputed_terms[0] = feature_params.unknownToUnknownDistanceTerm.GetValue(defaultPrecision);
auto k_to_unk = feature_params.knownToUnknownDistanceTerm.GetValue(defaultPrecision);
for(size_t i = 1; i < precomputed_terms.size(); i++)
precomputed_terms[i] = k_to_unk;
}
else //just set the unknown value
{
precomputed_terms[0] = feature_params.knownToUnknownDistanceTerm.GetValue(defaultPrecision);
}
}
}

//for the feature index, computes and stores the distance terms as measured from value to each interned value
inline void ComputeAndStoreInternedNumberValuesAndDistanceTerms(size_t index, double value, std::vector<double> *interned_values)
{
auto &feature_params = featureParams[index];
feature_params.internedNumberIndexToNumberValue = interned_values;

if(interned_values == nullptr)
{
feature_params.knownToUnknownDistanceTerm.SetValue(
ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference,
index, ExactApproxValuePair::EXACT),
ExactApproxValuePair::EXACT);
feature_params.precomputedInternDistanceTerms.clear();
return;
}

if(compute_approximate)
feature_params.precomputedInternDistanceTerms.resize(interned_values->size());
//first entry is known-unknown distance
feature_params.precomputedInternDistanceTerms[0] = ComputeDistanceTermKnownToUnknown(index);
for(size_t i = 1; i < feature_params.precomputedInternDistanceTerms.size(); i++)
{
feature_params.knownToUnknownDistanceTerm.SetValue(
ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference,
index, ExactApproxValuePair::APPROX),
ExactApproxValuePair::APPROX);
double difference = value - interned_values->at(i);
feature_params.precomputedInternDistanceTerms[i] = ComputeDistanceTermNonNominalNonNullRegular(difference, index);
}
}

Expand Down Expand Up @@ -432,6 +489,18 @@ class GeneralizedDistance
return featureParams[index].knownToUnknownDistanceTerm.GetValue(defaultPrecision);
}

//returns true if the feature at index has interned number values
__forceinline bool HasNumberInternValues(size_t index)
{
return featureParams[index].internedNumberIndexToNumberValue != nullptr;
}

//returns the precomputed distance term for the interned number with intern_value_index
__forceinline double ComputeDistanceTermNumberInterned(size_t intern_value_index, size_t index)
{
return featureParams[index].precomputedInternDistanceTerms[intern_value_index];
}

//computes the inner term for a non-nominal with an exact match of values
__forceinline double ComputeDistanceTermNonNominalExactMatch(size_t index)
{
Expand All @@ -445,8 +514,8 @@ class GeneralizedDistance
return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight;
}

//computes the difference between two values non-nominal (e.g., continuous)
__forceinline double ComputeDifferenceTermNonNominal(double diff, size_t index)
//computes the base of the difference between two values non-nominal (e.g., continuous)
__forceinline double ComputeDifferenceTermBaseNonNominal(double diff, size_t index)
{
//compute absolute value
diff = std::abs(diff);
Expand All @@ -462,8 +531,8 @@ class GeneralizedDistance
return diff;
}

//computes the difference between two values non-nominal (e.g., continuous) that isn't cyclic
__forceinline double ComputeDifferenceTermNonNominalNonCyclic(double diff, size_t index)
//computes the base of the difference between two values non-nominal (e.g., continuous) that isn't cyclic
__forceinline double ComputeDifferenceTermBaseNonNominalNonCyclic(double diff, size_t index)
{
//compute absolute value
diff = std::abs(diff);
Expand All @@ -479,7 +548,7 @@ class GeneralizedDistance
// diff can be negative
__forceinline double ComputeDistanceTermNonNominalNonNullRegular(double diff, size_t index)
{
diff = ComputeDifferenceTermNonNominal(diff, index);
diff = ComputeDifferenceTermBaseNonNominal(diff, index);

//exponentiate and return with weight
return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight;
Expand All @@ -489,7 +558,7 @@ class GeneralizedDistance
// diff can be negative
__forceinline double ComputeDistanceTermNonNominalOneNonNullRegular(double diff, size_t index)
{
diff = ComputeDifferenceTermNonNominal(diff, index);
diff = ComputeDifferenceTermBaseNonNominal(diff, index);

//exponentiate and return with weight
return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight;
Expand All @@ -499,7 +568,7 @@ class GeneralizedDistance
// diff can be negative
__forceinline double ComputeDistanceTermNonNominalNonCyclicNonNullRegular(double diff, size_t index)
{
diff = ComputeDifferenceTermNonNominalNonCyclic(diff, index);
diff = ComputeDifferenceTermBaseNonNominalNonCyclic(diff, index);

//exponentiate and return with weight
return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight;
Expand All @@ -512,7 +581,7 @@ class GeneralizedDistance
if(FastIsNaN(diff))
return ComputeDistanceTermKnownToUnknown(index);

diff = ComputeDifferenceTermNonNominalNonCyclic(diff, index);
diff = ComputeDifferenceTermBaseNonNominalNonCyclic(diff, index);

//exponentiate and return with weight
return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight;
Expand All @@ -530,7 +599,7 @@ class GeneralizedDistance
if(IsFeatureNominal(index))
return (diff == 0.0) ? ComputeDistanceTermNominalExactMatch(index) : ComputeDistanceTermNominalNonMatch(index);

diff = ComputeDifferenceTermNonNominal(diff, index);
diff = ComputeDifferenceTermBaseNonNominal(diff, index);

return std::pow(diff, featureParams[index].weight);
}
Expand All @@ -547,7 +616,7 @@ class GeneralizedDistance
if(IsFeatureNominal(index))
return (diff == 0.0) ? ComputeDistanceTermNominalExactMatch(index) : ComputeDistanceTermNominalNonMatch(index);

diff = ComputeDifferenceTermNonNominal(diff, index);
diff = ComputeDifferenceTermBaseNonNominal(diff, index);

return diff * featureParams[index].weight;
}
Expand All @@ -556,7 +625,7 @@ class GeneralizedDistance
__forceinline double ComputeDistanceTermNonNull(double diff, size_t index, int precision)
{
if(!IsFeatureNominal(index))
diff = ComputeDifferenceTermNonNominal(diff, index);
diff = ComputeDifferenceTermBaseNonNominal(diff, index);

if(pValue == 0.0)
return std::pow(diff, featureParams[index].weight);
Expand Down Expand Up @@ -587,7 +656,7 @@ class GeneralizedDistance
{
double diff = ComputeDifference(a, b, a_type, b_type, featureParams[index].featureType);
if(FastIsNaN(diff))
return LookupNullDistanceTerm(a, b, a_type, b_type, index);;
return LookupNullDistanceTerm(a, b, a_type, b_type, index);

//if nominal, don't need to compute absolute value of diff because just need to compare to 0
if(IsFeatureNominal(index))
Expand Down Expand Up @@ -628,7 +697,7 @@ class GeneralizedDistance
__forceinline static double ComputeDifference(EvaluableNodeImmediateValue a, EvaluableNodeImmediateValue b,
EvaluableNodeImmediateValueType a_type, EvaluableNodeImmediateValueType b_type, FeatureDifferenceType feature_type)
{
if(feature_type == FDT_CONTINUOUS_NUMERIC || feature_type == FDT_CONTINUOUS_UNIVERSALLY_NUMERIC
if(feature_type == FDT_CONTINUOUS_NUMERIC
|| feature_type == FDT_CONTINUOUS_NUMERIC_CYCLIC)
{
if(a_type == ENIVT_NUMBER && b_type == ENIVT_NUMBER)
Expand Down Expand Up @@ -758,7 +827,10 @@ class GeneralizedDistance
{
public:
inline FeatureParams()
: featureType(FDT_CONTINUOUS_NUMERIC), weight(1.0), deviation(0.0),
: featureType(FDT_CONTINUOUS_NUMERIC),
effectiveFeatureType(EFDT_CONTINUOUS_NUMERIC),
weight(1.0),
internedNumberIndexToNumberValue(nullptr), deviation(0.0),
unknownToUnknownDistanceTerm(std::numeric_limits<double>::quiet_NaN()),
knownToUnknownDistanceTerm(std::numeric_limits<double>::quiet_NaN()),
unknownToUnknownDifference(std::numeric_limits<double>::quiet_NaN()),
Expand All @@ -768,16 +840,26 @@ class GeneralizedDistance
}

//the type of comparison for each feature
// this type is 64-bit aligned to make sure the whole structure is aligned
// this type is 32-bit aligned to make sure the whole structure is aligned
FeatureDifferenceType featureType;

//the effective comparison for the feature type, specialized for performance
// this type is 32-bit aligned to make sure the whole structure is aligned
EffectiveFeatureDifferenceType effectiveFeatureType;

//weight of the feature
double weight;

//distance terms for nominals
ExactApproxValuePair nominalMatchDistanceTerm;
ExactApproxValuePair nominalNonMatchDistanceTerm;

//pointer to a lookup table of indices to values if the feature is an interned number
std::vector<double> *internedNumberIndexToNumberValue;

//precomputed distance terms for each interned value looked up by intern index
std::vector<double> precomputedInternDistanceTerms;

//type attributes dependent on featureType
union
{
Expand Down
5 changes: 1 addition & 4 deletions src/Amalgam/IntegerSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,7 @@ class SortedIntegerSet
__forceinline bool contains(size_t id)
{
auto location = std::lower_bound(std::begin(integers), std::end(integers), id);
if(location == std::end(integers))
return false;

return id == *location;
return (location != std::end(integers) && id == *location);
}

//returns true if the id exists in the set
Expand Down
Loading

0 comments on commit d08b199

Please sign in to comment.