From d08b19939a2f26929cad1995dd73107571b35c4c Mon Sep 17 00:00:00 2001
From: howsohazard <143410553+howsohazard@users.noreply.github.com>
Date: Thu, 26 Oct 2023 22:12:12 -0400
Subject: [PATCH] 17630 & 17861: Implements number interning, improves change
 value efficiency, fixes cyclic bug (#25)

---
 src/Amalgam/GeneralizedDistance.h           | 174 +++--
 src/Amalgam/IntegerSet.h                    |   5 +-
 src/Amalgam/SBFDSColumnData.h               | 706 ++++++++++++++++----
 src/Amalgam/SeparableBoxFilterDataStore.cpp | 172 +++--
 src/Amalgam/SeparableBoxFilterDataStore.h   | 214 ++++--
 src/Amalgam/amlg_code/test.amlg             |  28 +-
 src/Amalgam/entity/EntityQueryBuilder.h     |  27 +-
 src/Amalgam/evaluablenode/EvaluableNode.h   |  41 +-
 src/Amalgam/out.txt                         |  60 +-
 9 files changed, 1059 insertions(+), 368 deletions(-)

diff --git a/src/Amalgam/GeneralizedDistance.h b/src/Amalgam/GeneralizedDistance.h
index 2474b4d6..0cf9213e 100644
--- a/src/Amalgam/GeneralizedDistance.h
+++ b/src/Amalgam/GeneralizedDistance.h
@@ -12,29 +12,45 @@
 //If defined, will use the Laplace LK metric (default).  Otherwise will use Gaussian.
 #define DISTANCE_USE_LAPLACE_LK_METRIC true
 
-//general class of feature comparisons
-// align at 64-bits in order to play nice with data alignment where it is used
-enum FeatureDifferenceType : uint64_t
-{
-	FDT_NOMINAL,
-	//continuous, but without cycles
-	FDT_CONTINUOUS_NUMERIC,
-	//like FDT_CONTINUOUS_NUMERIC, but guarantees everything is always numeric
-	FDT_CONTINUOUS_UNIVERSALLY_NUMERIC,
-	//like FDT_CONTINUOUS_NUMERIC, but has cycles
-	FDT_CONTINUOUS_NUMERIC_CYCLIC,
-	//edit distance between strings
-	FDT_CONTINUOUS_STRING,
-	//continuous measures of the number of nodes different between two sets of code
-	FDT_CONTINUOUS_CODE,
-};
-
 //base data struct for holding distance parameters and metadata
 //generalizes Minkowski distance, information theoretic surprisal as a distance, and Lukaszyk–Karmowski
 class GeneralizedDistance
 {
 public:
-	//initialization functions
+
+	//general class of feature comparisons
+	// align at 32-bits in order to play nice with data alignment where it is used
+	enum FeatureDifferenceType : uint32_t
+	{
+		FDT_NOMINAL,
+		//continuous without cycles, may contain nonnumeric data
+		FDT_CONTINUOUS_NUMERIC,
+		//like FDT_CONTINUOUS_NUMERIC, but has cycles
+		FDT_CONTINUOUS_NUMERIC_CYCLIC,
+		//edit distance between strings
+		FDT_CONTINUOUS_STRING,
+		//continuous measures of the number of nodes different between two sets of code
+		FDT_CONTINUOUS_CODE,
+	};
+
+	enum EffectiveFeatureDifferenceType : uint32_t
+	{
+		EFDT_NOMINAL,
+		//everything is precomputed from interned values that are looked up
+		EFDT_VALUES_UNIVERSALLY_PRECOMPUTED,
+		//continuous without cycles, but everything is always numeric
+		EFDT_CONTINUOUS_UNIVERSALLY_NUMERIC,
+		//continuous without cycles, may contain nonnumeric data
+		EFDT_CONTINUOUS_NUMERIC,
+		//like EFDT_CONTINUOUS_NUMERIC, but has cycles
+		EFDT_CONTINUOUS_NUMERIC_CYCLIC,
+		//continuous precomputed (cyclic or not), may contain nonnumeric data
+		EFDT_CONTINUOUS_NUMERIC_PRECOMPUTED,
+		//edit distance between strings
+		EFDT_CONTINUOUS_STRING,
+		//continuous measures of the number of nodes different between two sets of code
+		EFDT_CONTINUOUS_CODE,
+	};
 
 	//dynamically precompute and cache nominal deltas and defaults everytime the pValue is set
 	inline void SetAndConstrainParams()
@@ -70,7 +86,8 @@ class GeneralizedDistance
 
 	//computes and sets unknownToUnknownDistanceTerm and knownToUnknownDistanceTerm based on
 	// unknownToUnknownDifference and knownToUnknownDifference respectively
-	inline void ComputeAndStoreUncertaintyDistanceTerms(size_t index)
+	//if target_value_is_null_equivalent is true, it will update any precomputed values as necessary
+	inline void ComputeAndStoreUncertaintyDistanceTerms(size_t index, bool target_value_is_null_equivalent = false)
 	{
 		bool compute_accurate = NeedToPrecomputeAccurate();
 		bool compute_approximate = NeedToPrecomputeApproximate();
@@ -98,24 +115,64 @@ class GeneralizedDistance
 		if(feature_params.knownToUnknownDifference == feature_params.unknownToUnknownDifference)
 		{
 			feature_params.knownToUnknownDistanceTerm = feature_params.unknownToUnknownDistanceTerm;
-			return;
+		}
+		else
+		{
+			//compute knownToUnknownDistanceTerm
+			if(compute_accurate)
+			{
+				feature_params.knownToUnknownDistanceTerm.SetValue(
+					ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference,
+						index, ExactApproxValuePair::EXACT),
+					ExactApproxValuePair::EXACT);
+			}
+
+			if(compute_approximate)
+			{
+				feature_params.knownToUnknownDistanceTerm.SetValue(
+					ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference,
+						index, ExactApproxValuePair::APPROX),
+					ExactApproxValuePair::APPROX);
+			}
 		}
 
-		//compute knownToUnknownDistanceTerm
-		if(compute_accurate)
+		if(HasNumberInternValues(index))
+		{
+			auto &precomputed_terms = feature_params.precomputedInternDistanceTerms;
+
+			if(target_value_is_null_equivalent)
+			{
+				precomputed_terms[0] = feature_params.unknownToUnknownDistanceTerm.GetValue(defaultPrecision);
+				auto k_to_unk = feature_params.knownToUnknownDistanceTerm.GetValue(defaultPrecision);
+				for(size_t i = 1; i < precomputed_terms.size(); i++)
+					precomputed_terms[i] = k_to_unk;
+			}
+			else //just set the unknown value
+			{
+				precomputed_terms[0] = feature_params.knownToUnknownDistanceTerm.GetValue(defaultPrecision);
+			}			
+		}
+	}
+
+	//for the feature index, computes and stores the distance terms as measured from value to each interned value
+	inline void ComputeAndStoreInternedNumberValuesAndDistanceTerms(size_t index, double value, std::vector<double> *interned_values)
+	{
+		auto &feature_params = featureParams[index];
+		feature_params.internedNumberIndexToNumberValue = interned_values;
+
+		if(interned_values == nullptr)
 		{
-			feature_params.knownToUnknownDistanceTerm.SetValue(
-				ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference,
-					index, ExactApproxValuePair::EXACT),
-				ExactApproxValuePair::EXACT);
+			feature_params.precomputedInternDistanceTerms.clear();
+			return;
 		}
 
-		if(compute_approximate)
+		feature_params.precomputedInternDistanceTerms.resize(interned_values->size());
+		//first entry is known-unknown distance
+		feature_params.precomputedInternDistanceTerms[0] = ComputeDistanceTermKnownToUnknown(index);
+		for(size_t i = 1; i < feature_params.precomputedInternDistanceTerms.size(); i++)
 		{
-			feature_params.knownToUnknownDistanceTerm.SetValue(
-				ComputeDistanceTermNonNull(feature_params.knownToUnknownDifference,
-					index, ExactApproxValuePair::APPROX),
-				ExactApproxValuePair::APPROX);
+			double difference = value - interned_values->at(i);
+			feature_params.precomputedInternDistanceTerms[i] = ComputeDistanceTermNonNominalNonNullRegular(difference, index);
 		}
 	}
 
@@ -432,6 +489,18 @@ class GeneralizedDistance
 		return featureParams[index].knownToUnknownDistanceTerm.GetValue(defaultPrecision);
 	}
 
+	//returns true if the feature at index has interned number values
+	__forceinline bool HasNumberInternValues(size_t index)
+	{
+		return featureParams[index].internedNumberIndexToNumberValue != nullptr;
+	}
+
+	//returns the precomputed distance term for the interned number with intern_value_index
+	__forceinline double ComputeDistanceTermNumberInterned(size_t intern_value_index, size_t index)
+	{
+		return featureParams[index].precomputedInternDistanceTerms[intern_value_index];
+	}
+
 	//computes the inner term for a non-nominal with an exact match of values
 	__forceinline double ComputeDistanceTermNonNominalExactMatch(size_t index)
 	{
@@ -445,8 +514,8 @@ class GeneralizedDistance
 		return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight;
 	}
 
-	//computes the difference between two values non-nominal (e.g., continuous)
-	__forceinline double ComputeDifferenceTermNonNominal(double diff, size_t index)
+	//computes the base of the difference between two values non-nominal (e.g., continuous)
+	__forceinline double ComputeDifferenceTermBaseNonNominal(double diff, size_t index)
 	{
 		//compute absolute value
 		diff = std::abs(diff);
@@ -462,8 +531,8 @@ class GeneralizedDistance
 		return diff;
 	}
 
-	//computes the difference between two values non-nominal (e.g., continuous) that isn't cyclic
-	__forceinline double ComputeDifferenceTermNonNominalNonCyclic(double diff, size_t index)
+	//computes the base of the difference between two values non-nominal (e.g., continuous) that isn't cyclic
+	__forceinline double ComputeDifferenceTermBaseNonNominalNonCyclic(double diff, size_t index)
 	{
 		//compute absolute value
 		diff = std::abs(diff);
@@ -479,7 +548,7 @@ class GeneralizedDistance
 	// diff can be negative
 	__forceinline double ComputeDistanceTermNonNominalNonNullRegular(double diff, size_t index)
 	{
-		diff = ComputeDifferenceTermNonNominal(diff, index);
+		diff = ComputeDifferenceTermBaseNonNominal(diff, index);
 
 		//exponentiate and return with weight
 		return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight;
@@ -489,7 +558,7 @@ class GeneralizedDistance
 	// diff can be negative
 	__forceinline double ComputeDistanceTermNonNominalOneNonNullRegular(double diff, size_t index)
 	{
-		diff = ComputeDifferenceTermNonNominal(diff, index);
+		diff = ComputeDifferenceTermBaseNonNominal(diff, index);
 
 		//exponentiate and return with weight
 		return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight;
@@ -499,7 +568,7 @@ class GeneralizedDistance
 	// diff can be negative
 	__forceinline double ComputeDistanceTermNonNominalNonCyclicNonNullRegular(double diff, size_t index)
 	{
-		diff = ComputeDifferenceTermNonNominalNonCyclic(diff, index);
+		diff = ComputeDifferenceTermBaseNonNominalNonCyclic(diff, index);
 
 		//exponentiate and return with weight
 		return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight;
@@ -512,7 +581,7 @@ class GeneralizedDistance
 		if(FastIsNaN(diff))
 			return ComputeDistanceTermKnownToUnknown(index);
 
-		diff = ComputeDifferenceTermNonNominalNonCyclic(diff, index);
+		diff = ComputeDifferenceTermBaseNonNominalNonCyclic(diff, index);
 
 		//exponentiate and return with weight
 		return ExponentiateDifferenceTerm(diff, defaultPrecision) * featureParams[index].weight;
@@ -530,7 +599,7 @@ class GeneralizedDistance
 		if(IsFeatureNominal(index))
 			return (diff == 0.0) ? ComputeDistanceTermNominalExactMatch(index) : ComputeDistanceTermNominalNonMatch(index);
 
-		diff = ComputeDifferenceTermNonNominal(diff, index);
+		diff = ComputeDifferenceTermBaseNonNominal(diff, index);
 
 		return std::pow(diff, featureParams[index].weight);
 	}
@@ -547,7 +616,7 @@ class GeneralizedDistance
 		if(IsFeatureNominal(index))
 			return (diff == 0.0) ? ComputeDistanceTermNominalExactMatch(index) : ComputeDistanceTermNominalNonMatch(index);
 
-		diff = ComputeDifferenceTermNonNominal(diff, index);
+		diff = ComputeDifferenceTermBaseNonNominal(diff, index);
 
 		return diff * featureParams[index].weight;
 	}
@@ -556,7 +625,7 @@ class GeneralizedDistance
 	__forceinline double ComputeDistanceTermNonNull(double diff, size_t index, int precision)
 	{
 		if(!IsFeatureNominal(index))
-			diff = ComputeDifferenceTermNonNominal(diff, index);
+			diff = ComputeDifferenceTermBaseNonNominal(diff, index);
 
 		if(pValue == 0.0)
 			return std::pow(diff, featureParams[index].weight);
@@ -587,7 +656,7 @@ class GeneralizedDistance
 	{
 		double diff = ComputeDifference(a, b, a_type, b_type, featureParams[index].featureType);
 		if(FastIsNaN(diff))
-			return LookupNullDistanceTerm(a, b, a_type, b_type, index);;
+			return LookupNullDistanceTerm(a, b, a_type, b_type, index);
 
 		//if nominal, don't need to compute absolute value of diff because just need to compare to 0
 		if(IsFeatureNominal(index))
@@ -628,7 +697,7 @@ class GeneralizedDistance
 	__forceinline static double ComputeDifference(EvaluableNodeImmediateValue a, EvaluableNodeImmediateValue b,
 		EvaluableNodeImmediateValueType a_type, EvaluableNodeImmediateValueType b_type, FeatureDifferenceType feature_type)
 	{
-		if(feature_type == FDT_CONTINUOUS_NUMERIC || feature_type == FDT_CONTINUOUS_UNIVERSALLY_NUMERIC
+		if(feature_type == FDT_CONTINUOUS_NUMERIC
 			|| feature_type == FDT_CONTINUOUS_NUMERIC_CYCLIC)
 		{
 			if(a_type == ENIVT_NUMBER && b_type == ENIVT_NUMBER)
@@ -758,7 +827,10 @@ class GeneralizedDistance
 	{
 	public:
 		inline FeatureParams()
-			: featureType(FDT_CONTINUOUS_NUMERIC), weight(1.0), deviation(0.0),
+			: featureType(FDT_CONTINUOUS_NUMERIC),
+			effectiveFeatureType(EFDT_CONTINUOUS_NUMERIC),
+			weight(1.0),
+			internedNumberIndexToNumberValue(nullptr), deviation(0.0),
 			unknownToUnknownDistanceTerm(std::numeric_limits<double>::quiet_NaN()),
 			knownToUnknownDistanceTerm(std::numeric_limits<double>::quiet_NaN()),
 			unknownToUnknownDifference(std::numeric_limits<double>::quiet_NaN()),
@@ -768,9 +840,13 @@ class GeneralizedDistance
 		}
 
 		//the type of comparison for each feature
-		// this type is 64-bit aligned to make sure the whole structure is aligned
+		// this type is 32-bit aligned to make sure the whole structure is aligned
 		FeatureDifferenceType featureType;
 
+		//the effective comparison for the feature type, specialized for performance
+		// this type is 32-bit aligned to make sure the whole structure is aligned
+		EffectiveFeatureDifferenceType effectiveFeatureType;
+
 		//weight of the feature
 		double weight;
 
@@ -778,6 +854,12 @@ class GeneralizedDistance
 		ExactApproxValuePair nominalMatchDistanceTerm;
 		ExactApproxValuePair nominalNonMatchDistanceTerm;
 
+		//pointer to a lookup table of indices to values if the feature is an interned number
+		std::vector<double> *internedNumberIndexToNumberValue;
+
+		//precomputed distance terms for each interned value looked up by intern index
+		std::vector<double> precomputedInternDistanceTerms;
+
 		//type attributes dependent on featureType
 		union
 		{
diff --git a/src/Amalgam/IntegerSet.h b/src/Amalgam/IntegerSet.h
index 1ef7cf2b..30c25c66 100644
--- a/src/Amalgam/IntegerSet.h
+++ b/src/Amalgam/IntegerSet.h
@@ -101,10 +101,7 @@ class SortedIntegerSet
 	__forceinline bool contains(size_t id)
 	{
 		auto location = std::lower_bound(std::begin(integers), std::end(integers), id);
-		if(location == std::end(integers))
-			return false;
-
-		return id == *location;
+		return (location != std::end(integers) && id == *location);
 	}
 
 	//returns true if the id exists in the set
diff --git a/src/Amalgam/SBFDSColumnData.h b/src/Amalgam/SBFDSColumnData.h
index 9cc1bb79..f88a9adf 100644
--- a/src/Amalgam/SBFDSColumnData.h
+++ b/src/Amalgam/SBFDSColumnData.h
@@ -16,15 +16,44 @@
 class SBFDSColumnData
 {
 public:
+
+	struct ValueEntry
+	{
+		//indicates the column does not use indices
+		static constexpr size_t NO_INDEX = std::numeric_limits<size_t>::max();
+		//nan value is always the 0th index
+		static constexpr size_t NAN_INDEX = 0;
+
+		//if empty, initialize to invalid index
+		ValueEntry()
+			: value(), indicesWithValue(),
+			valueInternIndex(NO_INDEX)
+		{	}
+
+		ValueEntry(double number_value, size_t intern_index = NO_INDEX)
+			: value(number_value), indicesWithValue(),
+			valueInternIndex(intern_index)
+		{	}
+
+		ValueEntry(StringInternPool::StringID sid_value, size_t intern_index = NO_INDEX)
+			: value(sid_value), indicesWithValue(),
+			valueInternIndex(intern_index)
+		{	}
+
+		//copy constructor; takes a const reference so that const entries and temporaries can also be copied
+		ValueEntry(const ValueEntry &ve)
+			: value(ve.value), indicesWithValue(ve.indicesWithValue), valueInternIndex(ve.valueInternIndex) {	}
+
+		EvaluableNodeImmediateValue value;
+		SortedIntegerSet indicesWithValue;
+		size_t valueInternIndex;
+	};
+
 	//column needs to be named when it is created
 	inline SBFDSColumnData(StringInternPool::StringID sid)
-		: stringId(sid)
-	{	
-		indexWithLongestString = 0;
-		longestStringLength = 0;
-		indexWithLargestCode = 0;
-		largestCodeSize = 0;
-	}
+		: stringId(sid), indexWithLongestString(0), longestStringLength(0),
+		indexWithLargestCode(0), largestCodeSize(0), numberValuesInterned(false)
+	{	}
 
 	//like InsertIndexValue, but used only for building the column data from an empty column
 	//this function must be called on each index in ascending order; for example, index 2 must be called after index 1
@@ -101,16 +130,16 @@ class SBFDSColumnData
 			}
 		}
 
-		sortedNumberValueIndexPairs.reserve(num_uniques);
+		sortedNumberValueEntries.reserve(num_uniques);
 		numberIndices.ReserveNumIntegers(index_values.back().reference + 1);
 
 		for(auto &index_value : index_values)
 		{
 			//if don't have the right bucket, then need to create one
-			if(sortedNumberValueIndexPairs.size() == 0 || sortedNumberValueIndexPairs.back().first != index_value.distance)
-				sortedNumberValueIndexPairs.emplace_back(index_value.distance, std::make_unique<SortedIntegerSet>());
+			if(sortedNumberValueEntries.size() == 0 || sortedNumberValueEntries.back()->value.number != index_value.distance)
+				sortedNumberValueEntries.emplace_back(std::make_unique<ValueEntry>(index_value.distance));
 
-			sortedNumberValueIndexPairs.back().second->InsertNewLargestInteger(index_value.reference);
+			sortedNumberValueEntries.back()->indicesWithValue.InsertNewLargestInteger(index_value.reference);
 			numberIndices.insert(index_value.reference);
 		}
 	}
@@ -119,7 +148,12 @@ class SBFDSColumnData
 	__forceinline EvaluableNodeImmediateValueType GetIndexValueType(size_t index)
 	{
 		if(numberIndices.contains(index))
+		{
+			if(numberValuesInterned)
+				return ENIVT_NUMBER_INDIRECTION_INDEX;
 			return ENIVT_NUMBER;
+		}
+
 		if(stringIdIndices.contains(index))
 			return ENIVT_STRING_ID;
 		if(nullIndices.contains(index))
@@ -129,69 +163,342 @@ class SBFDSColumnData
 		return ENIVT_CODE;
 	}
 
+	//returns the value type, performing any resolution for intern lookups
+	static __forceinline EvaluableNodeImmediateValueType GetResolvedValueType(EvaluableNodeImmediateValueType value_type)
+	{
+		if(value_type == ENIVT_NUMBER_INDIRECTION_INDEX)
+			return ENIVT_NUMBER;
+		return value_type;
+	}
+
+	//returns the value type that represents the values stored in this column, performing the reverse of any resolution for intern lookups
+	__forceinline EvaluableNodeImmediateValueType GetUnresolvedValueType(EvaluableNodeImmediateValueType value_type)
+	{
+		if(value_type == ENIVT_NUMBER && numberValuesInterned)
+			return ENIVT_NUMBER_INDIRECTION_INDEX;
+		return value_type;
+	}
+
+	//returns the value performing any intern lookup if necessary
+	__forceinline EvaluableNodeImmediateValue GetResolvedValue(EvaluableNodeImmediateValueType value_type, EvaluableNodeImmediateValue value)
+	{
+		if(value_type == ENIVT_NUMBER_INDIRECTION_INDEX)
+			return EvaluableNodeImmediateValue(internedNumberIndexToNumberValue[value.indirectionIndex]);
+		return value;
+	}
+
 	//moves index from being associated with key old_value to key new_value
-	void ChangeIndexValue(EvaluableNodeImmediateValue old_value, EvaluableNodeImmediateValueType new_value_type, EvaluableNodeImmediateValue new_value, size_t index)
+	//returns the value that should be used to reference the value, which may be an index
+	//depending on the state of the column data
+	EvaluableNodeImmediateValue ChangeIndexValue(EvaluableNodeImmediateValueType old_value_type, EvaluableNodeImmediateValue old_value,
+		EvaluableNodeImmediateValueType new_value_type, EvaluableNodeImmediateValue new_value, size_t index)
 	{
 		//if new one is invalid, can quickly delete or return
 		if(new_value_type == ENIVT_NOT_EXIST)
 		{
 			if(!invalidIndices.contains(index))
 			{
-				DeleteIndexValue(old_value, index);
+				DeleteIndexValue(old_value_type, old_value, index);
 				invalidIndices.insert(index);
 			}
-			return;
+
+			if(numberValuesInterned)
+				return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX);
+			else
+				return EvaluableNodeImmediateValue();
+		}
+
+		//if the types are the same, some shortcuts may apply
+		//note that if the values match types and match resolved values, the old_value should be returned
+		//because it is already in the correct storage format for the column
+		if(old_value_type == new_value_type)
+		{
+			if(old_value_type == ENIVT_NULL)
+				return old_value;
+
+			if(old_value_type == ENIVT_NUMBER)
+			{
+				double old_number_value = GetResolvedValue(old_value_type, old_value).number;
+				double new_number_value = GetResolvedValue(new_value_type, new_value).number;
+				if(EqualIncludingNaN(old_number_value, new_number_value))
+					return old_value;
+
+				//if made it here, then at least one of the values is not a NaN
+				//if one value is a NaN, just insert or delete as regular since there's little to be saved
+				if(FastIsNaN(old_number_value))
+				{
+					nanIndices.erase(index);
+					return InsertIndexValue(new_value_type, new_value, index);
+				}
+
+				if(FastIsNaN(new_number_value))
+				{
+					DeleteIndexValue(old_value_type, old_value, index);
+					nanIndices.insert(index);
+
+					if(numberValuesInterned)
+						return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX);
+					else
+						return EvaluableNodeImmediateValue(std::numeric_limits<double>::quiet_NaN());
+				}
+
+				//if the value already exists, then put the index in the list
+				//but return the lower bound if not found so don't have to search a second time
+				//need to search for the old value before inserting, as FindExactIndexForValue is fragile when a placeholder empty entry is present
+				auto [new_value_index, new_exact_index_found] = FindExactIndexForValue(new_number_value, true);
+				auto [old_value_index, old_exact_index_found] = FindExactIndexForValue(old_number_value, true);
+
+				if(old_exact_index_found)
+				{
+					//if there are multiple entries for this number, just move the id
+					if(sortedNumberValueEntries[old_value_index]->indicesWithValue.size() > 1)
+					{
+						//erase with old_value_index first so don't need to update index
+						sortedNumberValueEntries[old_value_index]->indicesWithValue.erase(index);
+
+						if(!new_exact_index_found)
+						{
+							sortedNumberValueEntries.emplace(sortedNumberValueEntries.begin() + new_value_index, std::make_unique<ValueEntry>(new_number_value));
+							InsertFirstIndexIntoNumberValueEntry(index, new_value_index);
+						}
+						else //just insert
+						{
+							sortedNumberValueEntries[new_value_index]->indicesWithValue.insert(index);
+						}
+					}
+					else //it's the last old_number_entry
+					{
+						if(!new_exact_index_found)
+						{
+							//remove old value and update to new
+							std::unique_ptr<ValueEntry> new_value_entry = std::move(sortedNumberValueEntries[old_value_index]);
+							new_value_entry->value.number = new_number_value;
+
+							//move the other values out of the way
+							if(old_number_value < new_number_value)
+							{
+								for(size_t i = old_value_index; i + 1 < new_value_index; i++)
+									sortedNumberValueEntries[i] = std::move(sortedNumberValueEntries[i + 1]);
+
+								new_value_index--;
+							}
+							else
+							{
+								for(size_t i = old_value_index; i > new_value_index; i--)
+									sortedNumberValueEntries[i] = std::move(sortedNumberValueEntries[i - 1]);
+							}
+
+							//move new value in to empty slot created
+							sortedNumberValueEntries[new_value_index] = std::move(new_value_entry);
+						}
+						else //already has an entry for the new value, just delete as normal
+						{
+							sortedNumberValueEntries[new_value_index]->indicesWithValue.insert(index);
+							DeleteNumberValueEntry(old_value_index);
+						}
+					}
+				}
+				else //shouldn't make it here, but ensure integrity just in case
+				{
+					//insert new value in correct position
+					sortedNumberValueEntries.emplace(sortedNumberValueEntries.begin() + new_value_index,
+						std::make_unique<ValueEntry>(new_number_value));
+
+					InsertFirstIndexIntoNumberValueEntry(index, new_value_index);
+				}
+
+				if(numberValuesInterned)
+					return EvaluableNodeImmediateValue(new_value_index);
+				else
+					return EvaluableNodeImmediateValue(new_value);
+			}
+
+			if(old_value_type == ENIVT_STRING_ID)
+			{
+				if(old_value.stringID == new_value.stringID)
+					return old_value;
+
+				//try to insert the new value if not already there
+				auto [new_id_entry, inserted] = stringIdValueToIndices.emplace(new_value.stringID, nullptr);
+
+				auto old_id_entry = stringIdValueToIndices.find(old_value.stringID);
+				if(old_id_entry != end(stringIdValueToIndices))
+				{
+					//if there are multiple entries for this string, just move the id
+					if(old_id_entry->second->size() > 1)
+					{
+						if(inserted)
+							new_id_entry->second = std::make_unique<SortedIntegerSet>();
+
+						new_id_entry->second->insert(index);
+						old_id_entry->second->erase(index);
+					}
+					else //it's the last old_id_entry
+					{
+						//put the SortedIntegerSet in the new value or move the container
+						if(inserted)
+							new_id_entry->second = std::move(old_id_entry->second);
+						else
+							new_id_entry->second->insert(index);
+
+						//erase only after new_id_entry is no longer needed, as erasing may invalidate it
+						stringIdValueToIndices.erase(old_id_entry);
+					}
+				}
+				else if(inserted) //shouldn't make it here, but ensure integrity just in case
+				{
+					new_id_entry->second = std::make_unique<SortedIntegerSet>();
+					new_id_entry->second->insert(index);
+				}
+
+				//update longest string as appropriate
+				if(index == indexWithLongestString)
+					RecomputeLongestString();
+				else
+					UpdateLongestString(new_value.stringID, index);
+
+				return new_value;
+			}
+
+			if(old_value_type == ENIVT_CODE)
+			{
+				//only early exit if the pointers to the code are exactly the same,
+				// as equivalent code may be garbage collected
+				if(old_value.code == new_value.code)
+					return old_value;
+
+				size_t old_code_size = EvaluableNode::GetDeepSize(old_value.code);
+				size_t new_code_size = EvaluableNode::GetDeepSize(new_value.code);
+
+				//only need to do insert / removal logic if sizes are different
+				if(old_code_size != new_code_size)
+				{
+					auto [new_size_entry, inserted] = valueCodeSizeToIndices.emplace(new_code_size, nullptr);
+
+					auto old_size_entry = valueCodeSizeToIndices.find(old_code_size);
+					if(old_size_entry != end(valueCodeSizeToIndices))
+					{
+						//if there are multiple entries for this code size, just move the id
+						if(old_size_entry->second->size() > 1)
+						{
+							if(inserted)
+								new_size_entry->second = std::make_unique<SortedIntegerSet>();
+
+							new_size_entry->second->insert(index);
+							old_size_entry->second->erase(index);
+						}
+						else //it's the last old_size_entry
+						{
+							//put the SortedIntegerSet in the new value or move the container
+							if(inserted)
+								new_size_entry->second = std::move(old_size_entry->second);
+							else
+								new_size_entry->second->insert(index);
+
+							//erase only after new_size_entry is no longer needed, as erasing may invalidate it
+							valueCodeSizeToIndices.erase(old_size_entry);
+						}
+					}
+					else if(inserted) //shouldn't make it here, but ensure integrity just in case
+					{
+						new_size_entry->second = std::make_unique<SortedIntegerSet>();
+						new_size_entry->second->insert(index);
+					}
+				}
+
+				//update largest code as appropriate:
+				//recompute if this index held the largest code, otherwise compare against the new code size
+				if(index == indexWithLargestCode)
+					RecomputeLargestCode();
+				else
+					UpdateLargestCode(new_code_size, index);
+
+				return new_value;
+			}
+
+			if(old_value_type == ENIVT_NUMBER_INDIRECTION_INDEX)
+			{
+				if(old_value.indirectionIndex == new_value.indirectionIndex)
+					return old_value;
+			}
 		}
 
 		//delete index at old value
-		DeleteIndexValue(old_value, index);
+		DeleteIndexValue(old_value_type, old_value, index);
 
-		//add index at new value bucket 
-		InsertIndexValue(new_value_type, new_value, index);
+		//add index at new value bucket
+		return InsertIndexValue(new_value_type, new_value, index);
 	}
 
-	//deletes everything involving the value at the index
-	void DeleteIndexValue(EvaluableNodeImmediateValue value, size_t index)
+	//deletes a particular value based on the value_index
+	void DeleteNumberValueEntry(size_t value_index)
 	{
-		if(invalidIndices.EraseAndRetrieve(index))
-			return;
+		if(numberValuesInterned)
+		{
+			size_t value_intern_index = sortedNumberValueEntries[value_index]->valueInternIndex;
+			//if the last entry (off by one, including ValueEntry::NO_INDEX), can just resize
+			if(value_intern_index == internedNumberIndexToNumberValue.size() - 1)
+			{
+				internedNumberIndexToNumberValue.resize(value_intern_index);
+			}
+			else //need to actually erase it
+			{
+				internedNumberIndexToNumberValue[value_intern_index] = std::numeric_limits<double>::quiet_NaN();
+				unusedNumberValueIndices.emplace(value_intern_index);
+			}
 
-		//if value is null, just need to remove from the appropriate index
-		if(nullIndices.EraseAndRetrieve(index))
-			return;
+			//clear out any unusedNumberValueIndices at the end other than the 0th entry
+			while(internedNumberIndexToNumberValue.size() > 1 && FastIsNaN(internedNumberIndexToNumberValue.back()))
+				internedNumberIndexToNumberValue.pop_back();
+		}
 
-		if(numberIndices.EraseAndRetrieve(index))
+		sortedNumberValueEntries.erase(sortedNumberValueEntries.begin() + value_index);
+	}
+
+	//deletes everything involving the value at the index
+	void DeleteIndexValue(EvaluableNodeImmediateValueType value_type, EvaluableNodeImmediateValue value, size_t index)
+	{
+		switch(value_type)
 		{
+		case ENIVT_NOT_EXIST:
+			invalidIndices.erase(index);
+			break;
+
+		case ENIVT_NULL:
+			nullIndices.erase(index);
+			break;
+
+		case ENIVT_NUMBER:
+		case ENIVT_NUMBER_INDIRECTION_INDEX:
+			numberIndices.erase(index);
+
 			//remove, and if not a nan, then need to also remove the number
 			if(!nanIndices.EraseAndRetrieve(index))
 			{
+				auto resolved_value = GetResolvedValue(value_type, value);
+
 				//look up value
-				auto [value_index, exact_index_found] = FindExactIndexForValue(value.number);
+				auto [value_index, exact_index_found] = FindExactIndexForValue(resolved_value.number);
 				if(!exact_index_found)
 					return;
 
 				//if the bucket has only one entry, we must delete the entire bucket
-				if(sortedNumberValueIndexPairs[value_index].second->size() == 1)
-				{
-					sortedNumberValueIndexPairs.erase(sortedNumberValueIndexPairs.begin() + value_index);
-				}
+				if(sortedNumberValueEntries[value_index]->indicesWithValue.size() == 1)
+					DeleteNumberValueEntry(value_index);
 				else //else we can just remove the id from the bucket
-				{
-					sortedNumberValueIndexPairs[value_index].second->erase(index);
-				}
+					sortedNumberValueEntries[value_index]->indicesWithValue.erase(index);
 			}
+			break;
 
-			return;
-		}
-
-		if(stringIdIndices.EraseAndRetrieve(index))
+		case ENIVT_STRING_ID:
 		{
+			stringIdIndices.erase(index);
 			auto id_entry = stringIdValueToIndices.find(value.stringID);
 			if(id_entry != end(stringIdValueToIndices))
 			{
 				auto &entities = *(id_entry->second);
 				entities.erase(index);
-				
+
 				//if no more entries have the value, remove it
 				if(entities.size() == 0)
 					stringIdValueToIndices.erase(id_entry);
@@ -199,83 +506,142 @@ class SBFDSColumnData
 
 			//see if need to compute new longest string
 			if(index == indexWithLongestString)
-			{
-				longestStringLength = 0;
-				//initialize to 0 in case there are no entities with strings
-				indexWithLongestString = 0;
-				for(auto &[s_id, s_entry] : stringIdValueToIndices)
-					UpdateLongestString(s_id, *s_entry->begin());
-			}
-
-			return;
+				RecomputeLongestString();
 		}
+		break;
 
-		//if made it here, then just remove from a code value type
-		codeIndices.erase(index);
+		case ENIVT_CODE:
+		{
+			codeIndices.erase(index);
 
-		//find the entities that have the correspending size
-		size_t num_indices = EvaluableNode::GetDeepSize(value.code);
-		auto id_entry = valueCodeSizeToIndices.find(num_indices);
-		if(id_entry == end(valueCodeSizeToIndices))
-			return;
+			//find the entities that have the corresponding size
+			size_t num_indices = EvaluableNode::GetDeepSize(value.code);
+			auto id_entry = valueCodeSizeToIndices.find(num_indices);
+			if(id_entry == end(valueCodeSizeToIndices))
+				return;
+
+			//remove the entity
+			auto &entities = *(id_entry->second);
+			entities.erase(index);
+
+			if(entities.size() == 0)
+				valueCodeSizeToIndices.erase(id_entry);
+
+			//see if need to update largest code
+			if(index == indexWithLargestCode)
+				RecomputeLargestCode();
+			break;
+		}
 
-		//remove the entity
-		auto &entities = *(id_entry->second);
-		entities.erase(index);
+		default: //shouldn't make it here
+			break;
+		}
+	}
 
-		if(entities.size() == 0)
-			valueCodeSizeToIndices.erase(id_entry);
+	//inserts index into the number value entry at value_index, assigning the entry an intern index if interning
+	void InsertFirstIndexIntoNumberValueEntry(size_t index, size_t value_index)
+	{
+		ValueEntry *value_entry = sortedNumberValueEntries[value_index].get();
 
-		//see if need to update largest code
-		if(index == indexWithLargestCode)
+		value_entry->indicesWithValue.insert(index);
+		if(numberValuesInterned)
 		{
-			largestCodeSize = 0;
-			//initialize to 0 in case there are no entities with code
-			indexWithLargestCode = 0;
-			for(auto &[size, entry] : valueCodeSizeToIndices)
-				UpdateLargestCode(size, *entry->begin());
+			if(value_entry->valueInternIndex == ValueEntry::NO_INDEX)
+			{
+				//reuse the lowest unused intern index if one is available
+				if(unusedNumberValueIndices.size() > 0)
+				{
+					value_entry->valueInternIndex = unusedNumberValueIndices.top();
+
+					//make sure the value is valid
+					if(value_entry->valueInternIndex < sortedNumberValueEntries.size())
+					{
+						unusedNumberValueIndices.pop();
+					}
+					else //not valid, clear queue
+					{
+						unusedNumberValueIndices.clear();
+						//just use a new value, 0-based but leaving a spot open for NAN_INDEX
+						value_entry->valueInternIndex = sortedNumberValueEntries.size();
+					}
+				}
+				else //just use new value of the latest size, 0-based but leaving a spot open for NAN_INDEX
+				{
+					value_entry->valueInternIndex = sortedNumberValueEntries.size();
+				}
+			}
+
+			if(value_entry->valueInternIndex >= internedNumberIndexToNumberValue.size())
+				internedNumberIndexToNumberValue.resize(value_entry->valueInternIndex + 1, std::numeric_limits<double>::quiet_NaN());
+
+			internedNumberIndexToNumberValue[value_entry->valueInternIndex] = value_entry->value.number;
 		}
 	}
 
 	//inserts the value at id
-	void InsertIndexValue(EvaluableNodeImmediateValueType value_type, EvaluableNodeImmediateValue &value, size_t index)
+	//returns the value that should be used to reference the value, which may be an index
+	//depending on the state of the column data
+	EvaluableNodeImmediateValue InsertIndexValue(EvaluableNodeImmediateValueType value_type,
+		EvaluableNodeImmediateValue &value, size_t index)
 	{
 		if(value_type == ENIVT_NOT_EXIST)
 		{
 			invalidIndices.insert(index);
-			return;
+
+			if(numberValuesInterned)
+				return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX);
+			else
+				return value;
 		}
 
 		if(value_type == ENIVT_NULL)
 		{
 			nullIndices.insert(index);
-			return;
+
+			if(numberValuesInterned)
+				return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX);
+			else
+				return value;
 		}
 
-		if(value_type == ENIVT_NUMBER)
+		if(value_type == ENIVT_NUMBER || value_type == ENIVT_NUMBER_INDIRECTION_INDEX)
 		{
 			numberIndices.insert(index);
 
-			if(FastIsNaN(value.number))
+			double number_value = GetResolvedValue(value_type, value).number;
+			if(FastIsNaN(number_value))
 			{
 				nanIndices.insert(index);
-				return;
+
+				if(numberValuesInterned)
+					return EvaluableNodeImmediateValue(ValueEntry::NAN_INDEX);
+				else
+					return value;
 			}
 			
 			//if the value already exists, then put the index in the list
-			auto [value_index, exact_index_found] = FindExactIndexForValue(value.number);
+			//but return the lower bound if not found so don't have to search a second time
+			auto [value_index, exact_index_found] = FindExactIndexForValue(number_value, true);
 			if(exact_index_found)
 			{
-				sortedNumberValueIndexPairs[value_index].second->insert(index);
-				return;
+				sortedNumberValueEntries[value_index]->indicesWithValue.insert(index);
+
+				if(numberValuesInterned)
+					return EvaluableNodeImmediateValue(sortedNumberValueEntries[value_index]->valueInternIndex);
+				else
+					return value;
 			}
 
 			//insert new value in correct position
-			size_t new_value_index = FindUpperBoundIndexForValue(value.number);
-			auto inserted = sortedNumberValueIndexPairs.emplace(sortedNumberValueIndexPairs.begin() + new_value_index, value.number, std::make_unique<SortedIntegerSet>());
-			inserted->second->insert(index);
+			sortedNumberValueEntries.emplace(sortedNumberValueEntries.begin() + value_index,
+				std::make_unique<ValueEntry>(number_value));
 
-			return;
+			InsertFirstIndexIntoNumberValueEntry(index, value_index);
+
+			if(numberValuesInterned)
+				return sortedNumberValueEntries[value_index]->valueInternIndex;
+			else
+				return value;
 		}
 
 		if(value_type == ENIVT_STRING_ID)
@@ -288,11 +654,10 @@ class SBFDSColumnData
 				inserted_id_entry->second = std::make_unique<SortedIntegerSet>();
 
 			auto &ids = *(inserted_id_entry->second);
-			
 			ids.insert(index);
 
 			UpdateLongestString(value.stringID, index);
-			return;
+			return value;
 		}
 
 		//value_type == ENIVT_CODE
@@ -309,6 +674,8 @@ class SBFDSColumnData
 		size_entry->second->insert(index);
 
 		UpdateLargestCode(code_size, index);
+
+		return value;
 	}
 
 	//returns the maximum difference between value and any other value for this column
@@ -317,21 +684,20 @@ class SBFDSColumnData
 	{
 		switch(feature_params.featureType)
 		{
-		case FDT_NOMINAL:
+		case GeneralizedDistance::FDT_NOMINAL:
 			return 1.0;
 
-		case FDT_CONTINUOUS_NUMERIC:
-		case FDT_CONTINUOUS_UNIVERSALLY_NUMERIC:
-			if(sortedNumberValueIndexPairs.size() <= 1)
+		case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC:
+			if(sortedNumberValueEntries.size() <= 1)
 				return 0.0;
 
-			return sortedNumberValueIndexPairs.back().first - sortedNumberValueIndexPairs[0].first;
+			return sortedNumberValueEntries.back()->value.number - sortedNumberValueEntries[0]->value.number;
 
-		case FDT_CONTINUOUS_NUMERIC_CYCLIC:
+		case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC_CYCLIC:
 			//maximum is the other side of the cycle
 			return feature_params.typeAttributes.maxCyclicDifference / 2;
 
-		case FDT_CONTINUOUS_STRING:
+		case GeneralizedDistance::FDT_CONTINUOUS_STRING:
 			//the max difference is the worst case edit distance, of removing all the characters
 			// and adding all the new ones
 			if(value_type == ENIVT_STRING_ID)
@@ -349,7 +715,7 @@ class SBFDSColumnData
 				return static_cast<double>(longestStringLength + 1);
 			}
 
-		case FDT_CONTINUOUS_CODE:
+		case GeneralizedDistance::FDT_CONTINUOUS_CODE:
 			if(value_type == ENIVT_CODE)
 				return static_cast<double>(largestCodeSize + EvaluableNode::GetDeepSize(value.code));
 			else if(value_type == ENIVT_NULL)
@@ -370,43 +736,43 @@ class SBFDSColumnData
 	// .second: true if exact index was found, false otherwise
 	inline std::pair<size_t, bool> FindExactIndexForValue(double value, bool return_index_lower_bound = false)
 	{
-		auto target_iter = std::lower_bound(begin(sortedNumberValueIndexPairs), end(sortedNumberValueIndexPairs), value,
-			[](const auto& value_index_pair, double value)
+		auto target_iter = std::lower_bound(begin(sortedNumberValueEntries), end(sortedNumberValueEntries), value,
+			[](const auto &value_entry, double value)
 			{
-				return value_index_pair.first < value;
+				return value_entry->value.number < value;
 			});
 
-		if ((target_iter == end(sortedNumberValueIndexPairs)) || (target_iter->first != value)) // not exact match
+		if((target_iter == end(sortedNumberValueEntries)) || ((*target_iter)->value.number != value)) // not exact match
 		{
-			return std::make_pair(return_index_lower_bound ? std::distance(begin(sortedNumberValueIndexPairs), target_iter) : -1 , false);
+			return std::make_pair(return_index_lower_bound ? std::distance(begin(sortedNumberValueEntries), target_iter) : -1 , false);
 		}
 
-		return std::make_pair(std::distance(begin(sortedNumberValueIndexPairs), target_iter), true); // exact match
+		return std::make_pair(std::distance(begin(sortedNumberValueEntries), target_iter), true); // exact match
 	}
 
 	//returns the index of the lower bound of value
 	inline size_t FindLowerBoundIndexForValue(double value)
 	{
-		auto target_iter = std::lower_bound(begin(sortedNumberValueIndexPairs), end(sortedNumberValueIndexPairs), value,
-			[](const auto &value_index_pair, double value)
+		auto target_iter = std::lower_bound(begin(sortedNumberValueEntries), end(sortedNumberValueEntries), value,
+			[](const auto &value_entry, double value)
 			{
-				return value_index_pair.first < value;
+				return value_entry->value.number < value;
 			});
-		return std::distance(begin(sortedNumberValueIndexPairs), target_iter);
+		return std::distance(begin(sortedNumberValueEntries), target_iter);
 	}
 
 	//returns the index of the upper bound of value
 	inline size_t FindUpperBoundIndexForValue(double value)
 	{
-		auto target_iter = std::upper_bound(begin(sortedNumberValueIndexPairs), end(sortedNumberValueIndexPairs), value,
-			[](double value, const auto &value_index_pair)
+		auto target_iter = std::upper_bound(begin(sortedNumberValueEntries), end(sortedNumberValueEntries), value,
+			[](double value, const auto &value_entry)
 			{
-				return value < value_index_pair.first;
+				return value < value_entry->value.number;
 			});
-		return std::distance(begin(sortedNumberValueIndexPairs), target_iter);
+		return std::distance(begin(sortedNumberValueEntries), target_iter);
 	}
 
-	//given a value, returns the index at which the value should be inserted into the sortedNumberValueIndexPairs
+	//given a value, returns the index at which the value should be inserted into the sortedNumberValueEntries
 	//returns true for .second when an exact match is found, false otherwise
 	//O(log(n))
 	//cycle_length will take into account whether wrapping around is closer
@@ -416,15 +782,13 @@ class SBFDSColumnData
 		// returns the closest index (lower_bound) if an exact match is not found
 		auto [value_index, exact_index_found] = FindExactIndexForValue(value, true);
 		if(exact_index_found)
-		{
 			return std::make_pair(value_index, true);
-		}
 
 		//if only have one element (or zero), short circuit code below
-		if(sortedNumberValueIndexPairs.size() <= 1)
+		if(sortedNumberValueEntries.size() <= 1)
 			return std::make_pair(0, false);
 
-		size_t max_valid_index = sortedNumberValueIndexPairs.size() - 1;
+		size_t max_valid_index = sortedNumberValueEntries.size() - 1;
 		size_t target_index = std::min(max_valid_index, value_index); //value_index is lower bound index since no exact match
 
 		//if not cyclic or cyclic and not at the edge
@@ -434,15 +798,15 @@ class SBFDSColumnData
 			//need to check index again in case not cyclic
 			// return index with the closer difference
 			if(target_index < max_valid_index
-					&& (std::abs(sortedNumberValueIndexPairs[target_index + 1].first - value) < std::abs(sortedNumberValueIndexPairs[target_index].first - value)))
+					&& (std::abs(sortedNumberValueEntries[target_index + 1]->value.number - value) < std::abs(sortedNumberValueEntries[target_index]->value.number - value)))
 				return std::make_pair(target_index + 1, false);
 			else
 				return std::make_pair(target_index, false);
 		}
 		else //cyclic
 		{
-			double dist_to_max_index = std::abs(sortedNumberValueIndexPairs[max_valid_index].first - value);
-			double dist_to_0_index = std::abs(sortedNumberValueIndexPairs[0].first - value);
+			double dist_to_max_index = std::abs(sortedNumberValueEntries[max_valid_index]->value.number - value);
+			double dist_to_0_index = std::abs(sortedNumberValueEntries[0]->value.number - value);
 			size_t other_closest_index;
 
 			if(target_index == 0)
@@ -458,7 +822,7 @@ class SBFDSColumnData
 				other_closest_index = max_valid_index - 1;
 			}
 
-			double dist_to_other_closest_index = std::abs(sortedNumberValueIndexPairs[other_closest_index].first - value);
+			double dist_to_other_closest_index = std::abs(sortedNumberValueEntries[other_closest_index]->value.number - value);
 			if(dist_to_0_index <= dist_to_other_closest_index && dist_to_0_index <= dist_to_max_index)
 				return std::make_pair(0, false);
 			else if(dist_to_other_closest_index <= dist_to_0_index)
@@ -477,7 +841,7 @@ class SBFDSColumnData
 		if(value_type == ENIVT_NUMBER)
 		{
 			//there are no ids for this column, so return no results
-			if(sortedNumberValueIndexPairs.size() == 0)
+			if(sortedNumberValueEntries.size() == 0)
 				return;
 
 			//make a copy because passed by reference, and may need to change value for logic below
@@ -534,19 +898,19 @@ class SBFDSColumnData
 				if(between_values)
 				{
 					size_t index = value_index;
-					out.InsertInBatch(*sortedNumberValueIndexPairs[index].second);
+					out.InsertInBatch(sortedNumberValueEntries[index]->indicesWithValue);
 				}
 				else //if not within, populate with all indices not equal to value
 				{
 					//include nans
 					nanIndices.CopyTo(out);
 
-					for(auto &[bucket_val, bucket] : sortedNumberValueIndexPairs)
+					for(auto &value_entry : sortedNumberValueEntries)
 					{
-						if(bucket_val == low_number)
+						if(value_entry->value.number == low_number)
 							continue;
 
-						out.InsertInBatch(*bucket);
+						out.InsertInBatch(value_entry->indicesWithValue);
 					}
 				}
 
@@ -554,27 +918,27 @@ class SBFDSColumnData
 			}
 
 			size_t start_index = (low_number == -std::numeric_limits<double>::infinity()) ? 0 : FindLowerBoundIndexForValue(low_number);
-			size_t end_index = (high_number == std::numeric_limits<double>::infinity()) ? sortedNumberValueIndexPairs.size() : FindUpperBoundIndexForValue(high_number);
+			size_t end_index = (high_number == std::numeric_limits<double>::infinity()) ? sortedNumberValueEntries.size() : FindUpperBoundIndexForValue(high_number);
 
 			if(between_values)
 			{
 				//insert everything between the two indices
 				for(size_t i = start_index; i < end_index; i++)
-					out.InsertInBatch(*sortedNumberValueIndexPairs[i].second);
+					out.InsertInBatch(sortedNumberValueEntries[i]->indicesWithValue);
 
 				//include end_index if value matches
-				if(end_index < sortedNumberValueIndexPairs.size() && sortedNumberValueIndexPairs[end_index].first == high_number)
-					out.InsertInBatch(*sortedNumberValueIndexPairs[end_index].second);
+				if(end_index < sortedNumberValueEntries.size() && sortedNumberValueEntries[end_index]->value.number == high_number)
+					out.InsertInBatch(sortedNumberValueEntries[end_index]->indicesWithValue);
 			}
 			else //not between_values
 			{
 				//insert everything left of range
 				for(size_t i = 0; i < start_index; i++)
-					out.InsertInBatch(*sortedNumberValueIndexPairs[i].second);
+					out.InsertInBatch(sortedNumberValueEntries[i]->indicesWithValue);
 
 				//insert everything right of range
-				for(size_t i = end_index; i < sortedNumberValueIndexPairs.size(); i++)
-					out.InsertInBatch(*sortedNumberValueIndexPairs[i].second);
+				for(size_t i = end_index; i < sortedNumberValueEntries.size(); i++)
+					out.InsertInBatch(sortedNumberValueEntries[i]->indicesWithValue);
 			}
 
 		}
@@ -636,7 +1000,7 @@ class SBFDSColumnData
 
 			auto [value_index, exact_index_found] = FindExactIndexForValue(value.number);
 			if(exact_index_found)
-				out.InsertInBatch(*sortedNumberValueIndexPairs[value_index].second);
+				out.InsertInBatch(sortedNumberValueEntries[value_index]->indicesWithValue);
 		}
 		else if(value_type == ENIVT_STRING_ID)
 		{
@@ -654,16 +1018,16 @@ class SBFDSColumnData
 		if(value_type == ENIVT_NUMBER)
 		{
 			//there are no ids for this column, so return no results
-			if(sortedNumberValueIndexPairs.size() == 0)
+			if(sortedNumberValueEntries.size() == 0)
 				return;
 
 			//search left to right for max (bucket 0 is largest) or right to left for min
-			int64_t value_index = find_max ? sortedNumberValueIndexPairs.size() - 1 : 0;
+			int64_t value_index = find_max ? sortedNumberValueEntries.size() - 1 : 0;
 
-			while(value_index < static_cast<int64_t>(sortedNumberValueIndexPairs.size()) && value_index >= 0)
+			while(value_index < static_cast<int64_t>(sortedNumberValueEntries.size()) && value_index >= 0)
 			{
 				//add each index to the out indices and optionally output compute results
-				for(const auto &index : *sortedNumberValueIndexPairs[value_index].second)
+				for(const auto &index : sortedNumberValueEntries[value_index]->indicesWithValue)
 				{
 					if(indices_to_consider != nullptr && !indices_to_consider->contains(index))
 						continue;
@@ -716,6 +1080,56 @@ class SBFDSColumnData
 		}
 	}
 
+	//heuristic check for whether interning number values is expected to outperform
+	// storing raw number values, given the data currently in the column
+	inline bool AreNumberInternsPreferredToValues()
+	{
+		//interning is preferred when the unique value count is at most the square root of the
+		// count of indices holding numbers; both sides are squared so no sqrt call is needed
+		const size_t unique_count = sortedNumberValueEntries.size();
+		return !(numberIndices.size() < unique_count * unique_count);
+	}
+
+	//returns true if switching to number values would be expected to yield better results
+	// than number interning given the current data
+	inline bool AreNumberValuesPreferredToInterns()
+	{
+		//use heuristic of sqrt number of values compared to num unique values (computed with a multiply)
+		//the threshold is padded by the number of unique values so that this is never true at the same
+		// time as AreNumberInternsPreferredToValues, which would convert back and forth on every call
+		size_t num_unique_values = sortedNumberValueEntries.size();
+		return (num_unique_values * num_unique_values > numberIndices.size() + num_unique_values);
+	}
+
+	//turns off number interning, discarding the intern lookup and free index pool; no-op if already off
+	void ConvertNumberInternsToValues()
+	{
+		if(numberValuesInterned)
+		{
+			numberValuesInterned = false;
+			unusedNumberValueIndices.clear();
+			internedNumberIndexToNumberValue.clear();
+		}
+	}
+
+	//turns on number interning, assigning intern indices 1..n in sorted value order (index 0 holds NaN)
+	void ConvertNumberValuesToInterns()
+	{
+		if(numberValuesInterned)
+			return;
+
+		internedNumberIndexToNumberValue.resize(sortedNumberValueEntries.size() + 1);
+		internedNumberIndexToNumberValue[0] = std::numeric_limits<double>::quiet_NaN();
+		size_t intern_index = 1;
+		for(auto &value_entry : sortedNumberValueEntries)
+		{
+			value_entry->valueInternIndex = intern_index;
+			internedNumberIndexToNumberValue[intern_index++] = value_entry->value.number;
+		}
+
+		numberValuesInterned = true;
+	}
+
 protected:
 
 	//updates longestStringLength and indexWithLongestString based on parameters
@@ -730,6 +1144,16 @@ class SBFDSColumnData
 		}
 	}
 
+	//rebuilds longestStringLength and indexWithLongestString from scratch; call once the tracked longest is stale
+	inline void RecomputeLongestString()
+	{
+		//reset both to 0 so the result is well defined when no index holds a string
+		indexWithLongestString = 0;
+		longestStringLength = 0;
+		for(auto &string_and_indices : stringIdValueToIndices)
+			UpdateLongestString(string_and_indices.first, *(string_and_indices.second->begin()));
+	}
+
 	//updates largestCodeSize and indexWithLargestCode based on parameters
 	inline void UpdateLargestCode(size_t code_size, size_t index)
 	{
@@ -740,13 +1164,23 @@ class SBFDSColumnData
 		}
 	}
 
+	//rebuilds largestCodeSize and indexWithLargestCode from scratch; call once the tracked largest is stale
+	inline void RecomputeLargestCode()
+	{
+		//reset both to 0 so the result is well defined when no index holds code
+		indexWithLargestCode = 0;
+		largestCodeSize = 0;
+		for(auto &size_and_indices : valueCodeSizeToIndices)
+			UpdateLargestCode(size_and_indices.first, *(size_and_indices.second->begin()));
+	}
+
 public:
 
 	//name of the column
 	StringInternPool::StringID stringId;
 
 	//stores values in sorted order and the entities that have each value
-	std::vector< std::pair<double, std::unique_ptr<SortedIntegerSet>> > sortedNumberValueIndexPairs;
+	std::vector<std::unique_ptr<ValueEntry>> sortedNumberValueEntries;
 
 	//maps a string id to a vector of indices that have that string
 	CompactHashMap<StringInternPool::StringID, std::unique_ptr<SortedIntegerSet>> stringIdValueToIndices;
@@ -783,4 +1217,16 @@ class SBFDSColumnData
 	size_t indexWithLargestCode;
 	//the largest code size for this label
 	size_t largestCodeSize;
+
+	//if numberValuesInterned is true, then maps each interned number index to the number value it represents
+	//slots created when the vector grows are filled with NaN until a value is assigned to them
+	//the 0th index is reserved for NaN, regardless of whether NaN appears in the data
+	std::vector<double> internedNumberIndexToNumberValue;
+
+	//unused / free indices in internedNumberIndexToNumberValue to make adding and removing new values efficient
+	//always want to fetch the lowest index to keep internedNumberIndexToNumberValue small
+	FlexiblePriorityQueue<size_t, std::vector<size_t>, std::greater<size_t>> unusedNumberValueIndices;
+
+	//if true, then the indices of the values should be used and internedNumberIndexToNumberValue populated
+	bool numberValuesInterned;
 };
diff --git a/src/Amalgam/SeparableBoxFilterDataStore.cpp b/src/Amalgam/SeparableBoxFilterDataStore.cpp
index 389365fe..23e9c02d 100644
--- a/src/Amalgam/SeparableBoxFilterDataStore.cpp
+++ b/src/Amalgam/SeparableBoxFilterDataStore.cpp
@@ -15,6 +15,9 @@ void SeparableBoxFilterDataStore::BuildLabel(size_t column_index, const std::vec
 	auto &entities_with_number_values = parametersAndBuffers.entitiesWithValues;
 	entities_with_number_values.clear();
 
+	//clear value interning if applied
+	column_data->ConvertNumberInternsToValues();
+
 	//populate matrix and get values
 	// maintaining the order of insertion of the entities from smallest to largest allows for better performance of the insertions
 	// and every function called here assumes that entities are inserted in increasing order
@@ -23,7 +26,7 @@ void SeparableBoxFilterDataStore::BuildLabel(size_t column_index, const std::vec
 		EvaluableNodeImmediateValueType value_type;
 		EvaluableNodeImmediateValue value;
 		value_type = entities[entity_index]->GetValueAtLabelAsImmediateValue(label_id, value);
-		matrix[GetMatrixCellIndex(entity_index) + column_index] = value;
+		GetValue(entity_index, column_index) = value;
 
 		column_data->InsertNextIndexValueExceptNumbers(value_type, value, entity_index, entities_with_number_values);
 	}
@@ -32,6 +35,51 @@ void SeparableBoxFilterDataStore::BuildLabel(size_t column_index, const std::vec
 	std::stable_sort(begin(entities_with_number_values), end(entities_with_number_values));
 
 	column_data->AppendSortedNumberIndicesWithSortedIndices(entities_with_number_values);
+
+	OptimizeColumn(column_index);
+}
+
+void SeparableBoxFilterDataStore::OptimizeColumn(size_t column_index)
+{
+	auto &column_data = columnData[column_index];
+	//convert the column's cells between raw numbers and interned number indices when its heuristics prefer the other
+	if(column_data->numberValuesInterned)
+	{
+		if(column_data->AreNumberValuesPreferredToInterns())
+		{
+			for(auto &value_entry : column_data->sortedNumberValueEntries)
+			{
+				double value = value_entry->value.number;
+				for(auto entity_index : value_entry->indicesWithValue)
+					GetValue(entity_index, column_index).number = value;
+			}
+			//nan and null cells are both written as NaN once values are no longer interned
+			for(auto entity_index : column_data->nanIndices)
+				GetValue(entity_index, column_index).number = std::numeric_limits<double>::quiet_NaN();
+
+			for(auto entity_index : column_data->nullIndices)
+				GetValue(entity_index, column_index).number = std::numeric_limits<double>::quiet_NaN();
+
+			column_data->ConvertNumberInternsToValues();
+		}
+	}
+	else if(column_data->AreNumberInternsPreferredToValues())
+	{
+		column_data->ConvertNumberValuesToInterns();
+		//point each cell that holds a number at the intern index just assigned to its value entry
+		for(auto &value_entry : column_data->sortedNumberValueEntries)
+		{
+			size_t value_index = value_entry->valueInternIndex;
+			for(auto entity_index : value_entry->indicesWithValue)
+				GetValue(entity_index, column_index).indirectionIndex = value_index;
+		}
+		//nan and null cells use the NaN intern index; it is an index, so store it via indirectionIndex, not number
+		for(auto entity_index : column_data->nanIndices)
+			GetValue(entity_index, column_index).indirectionIndex = SBFDSColumnData::ValueEntry::NAN_INDEX;
+
+		for(auto entity_index : column_data->nullIndices)
+			GetValue(entity_index, column_index).indirectionIndex = SBFDSColumnData::ValueEntry::NAN_INDEX;
+	}
 }
 
 void SeparableBoxFilterDataStore::RemoveColumnIndex(size_t column_index_to_remove)
@@ -89,15 +137,14 @@ void SeparableBoxFilterDataStore::AddEntity(Entity *entity, size_t entity_index)
 		EvaluableNodeImmediateValueType value_type;
 		EvaluableNodeImmediateValue value;
 		value_type = entity->GetValueAtLabelAsImmediateValue(columnData[column_index]->stringId, value);
-
-		matrix[cell_index] = value;
-
-		columnData[column_index]->InsertIndexValue(value_type, value, entity_index);
+		matrix[cell_index] = columnData[column_index]->InsertIndexValue(value_type, value, entity_index);
 	}
 
 	//count this entity
 	if(entity_index >= numEntities)
 		numEntities = entity_index + 1;
+
+	OptimizeAllColumns();
 }
 
 void SeparableBoxFilterDataStore::RemoveEntity(Entity *entity, size_t entity_index, size_t entity_index_to_reassign)
@@ -133,15 +180,19 @@ void SeparableBoxFilterDataStore::RemoveEntity(Entity *entity, size_t entity_ind
 	//reassign index for each column
 	for(size_t column_index = 0; column_index < columnData.size(); column_index++)
 	{
+		auto &column_data = columnData[column_index];
+
 		auto &val_to_overwrite = GetValue(entity_index, column_index);
-		auto &value_of_index_to_reassign = GetValue(entity_index_to_reassign, column_index);
+		auto type_to_overwrite = column_data->GetIndexValueType(entity_index);
+
+		auto &value_to_reassign = GetValue(entity_index_to_reassign, column_index);
 		auto value_type_to_reassign = columnData[column_index]->GetIndexValueType(entity_index_to_reassign);
 
 		//remove the value where it is
-		columnData[column_index]->DeleteIndexValue(value_of_index_to_reassign, entity_index_to_reassign);
+		columnData[column_index]->DeleteIndexValue(value_type_to_reassign, value_to_reassign, entity_index_to_reassign);
 
 		//change the destination to the value
-		columnData[column_index]->ChangeIndexValue(val_to_overwrite, value_type_to_reassign, value_of_index_to_reassign, entity_index);
+		columnData[column_index]->ChangeIndexValue(type_to_overwrite, val_to_overwrite, value_type_to_reassign, value_to_reassign, entity_index);
 	}
 
 	//copy data from entity_index_to_reassign to entity_index
@@ -149,11 +200,13 @@ void SeparableBoxFilterDataStore::RemoveEntity(Entity *entity, size_t entity_ind
 
 	//truncate matrix cache if removing the last entry, either by moving the last entity or by directly removing the last
 	if(entity_index_to_reassign + 1 == numEntities
-		|| (entity_index_to_reassign + 1 >= numEntities && entity_index + 1 == numEntities))
+			|| (entity_index_to_reassign + 1 >= numEntities && entity_index + 1 == numEntities))
 		DeleteLastRow();
 
 	//clean up any labels that aren't relevant
 	RemoveAnyUnusedLabels();
+
+	OptimizeAllColumns();
 }
 
 void SeparableBoxFilterDataStore::UpdateAllEntityLabels(Entity *entity, size_t entity_index)
@@ -164,18 +217,26 @@ void SeparableBoxFilterDataStore::UpdateAllEntityLabels(Entity *entity, size_t e
 	size_t matrix_index = GetMatrixCellIndex(entity_index);
 	for(size_t column_index = 0; column_index < columnData.size(); column_index++)
 	{
+		auto &column_data = columnData[column_index];
+
 		EvaluableNodeImmediateValueType value_type;
 		EvaluableNodeImmediateValue value;
 		value_type = entity->GetValueAtLabelAsImmediateValue(columnData[column_index]->stringId, value);
 
-		columnData[column_index]->ChangeIndexValue(matrix[matrix_index], value_type, value, entity_index);
-		matrix[matrix_index] = value;
+		//update the value
+		auto &matrix_value = matrix[matrix_index];
+		auto previous_value_type = column_data->GetIndexValueType(entity_index);
+
+		//assign the matrix location to the updated value (which may be an index)
+		matrix_value = column_data->ChangeIndexValue(previous_value_type, matrix_value, value_type, value, entity_index);
 
 		matrix_index++;
 	}
 
 	//clean up any labels that aren't relevant
 	RemoveAnyUnusedLabels();
+
+	OptimizeAllColumns();
 }
 
 void SeparableBoxFilterDataStore::UpdateEntityLabel(Entity *entity, size_t entity_index, StringInternPool::StringID label_updated)
@@ -188,20 +249,25 @@ void SeparableBoxFilterDataStore::UpdateEntityLabel(Entity *entity, size_t entit
 	if(column == end(labelIdToColumnIndex))
 		return;
 	size_t column_index = column->second;
+	auto &column_data = columnData[column_index];
 
 	//get the new value
 	EvaluableNodeImmediateValueType value_type;
 	EvaluableNodeImmediateValue value;
-	value_type = entity->GetValueAtLabelAsImmediateValue(columnData[column_index]->stringId, value);
+	value_type = entity->GetValueAtLabelAsImmediateValue(column_data->stringId, value);
 
 	//update the value
 	auto &matrix_value = GetValue(entity_index, column_index);
-	columnData[column_index]->ChangeIndexValue(matrix_value, value_type, value, entity_index);
-	matrix_value = value;
+	auto previous_value_type = column_data->GetIndexValueType(entity_index);
+
+	//assign the matrix location to the updated value (which may be an index)
+	matrix_value = column_data->ChangeIndexValue(previous_value_type, matrix_value, value_type, value, entity_index);
 
 	//remove the label if no longer relevant
 	if(IsColumnIndexRemovable(column_index))
 		RemoveColumnIndex(column_index);
+	else //only optimize a column that was kept; after removal column_index no longer refers to this column
+		OptimizeColumn(column_index);
 }
 
 //populates distances_out with all entities and their distances that have a distance to target less than max_dist
@@ -276,15 +342,15 @@ void SeparableBoxFilterDataStore::FindEntitiesWithinDistance(GeneralizedDistance
 			//if there are fewer enabled_indices than the number of unique values for this feature, plus one for unknown values
 			// it is usually faster (less distances to compute) to just compute distance for each unique value and add to associated sums
 			// unless it happens to be that enabled_indices is very skewed
-			if(column_data->sortedNumberValueIndexPairs.size() < enabled_indices.size())
+			if(column_data->sortedNumberValueEntries.size() < enabled_indices.size())
 			{
-				for(auto &[entity_list_value, entity_list] : column_data->sortedNumberValueIndexPairs)
+				for(auto &value_entry : column_data->sortedNumberValueEntries)
 				{
 					//get distance term that is applicable to each entity in this bucket
-					double distance_term = dist_params.ComputeDistanceTermRegularOneNonNull(target_value.number - entity_list_value, query_feature_index);
+					double distance_term = dist_params.ComputeDistanceTermRegularOneNonNull(target_value.number - value_entry->value.number, query_feature_index);
 
 					//for each bucket, add term to their sums
-					for(auto entity_index : *entity_list)
+					for(auto entity_index : value_entry->indicesWithValue)
 					{
 						if(!enabled_indices.contains(entity_index))
 							continue;
@@ -323,9 +389,10 @@ void SeparableBoxFilterDataStore::FindEntitiesWithinDistance(GeneralizedDistance
 		//else, there are less indices to consider than possible unique values, so save computation by just considering entities that are still valid
 		for(auto entity_index : enabled_indices)
 		{
-			auto &value = GetValue(entity_index, absolute_feature_index);
 			auto value_type = column_data->GetIndexValueType(entity_index);
-
+			auto value = column_data->GetResolvedValue(value_type, GetValue(entity_index, absolute_feature_index));
+			value_type = column_data->GetResolvedValueType(value_type);
+			
 			distances[entity_index] += dist_params.ComputeDistanceTermRegular(target_value, value, target_value_type, value_type, query_feature_index);
 
 			//remove entity if its distance is already greater than the max_dist
@@ -384,14 +451,16 @@ void SeparableBoxFilterDataStore::FindEntitiesNearestToIndexedEntity(Generalized
 		if(dist_params->IsFeatureEnabled(i))
 		{
 			size_t column_index = found->second;
+			auto &column_data = columnData[column_index];
 
-			auto &value = matrix[matrix_index_base + column_index];
-			auto value_type = columnData[column_index]->GetIndexValueType(search_index);
+			auto value_type = column_data->GetIndexValueType(search_index);
+			//overwrite value in case of value interning
+			auto value = column_data->GetResolvedValue(value_type, matrix[matrix_index_base + column_index]);
+			value_type = column_data->GetResolvedValueType(value_type);
 
-			PopulateNextTargetAttributes(*dist_params,
+			PopulateNextTargetAttributes(*dist_params, i,
 				target_column_indices, target_values, target_value_types,
-				column_index, value, value_type,
-				dist_params->featureParams[i].featureType);
+				column_index, value, value_type);
 		}
 	}
 
@@ -607,7 +676,7 @@ void SeparableBoxFilterDataStore::FindNearestEntities(GeneralizedDistance &dist_
 		//skip this entity in the next loops
 		enabled_indices.erase(good_match_index);
 
-		double distance = ResolveDistanceToNonMatchTargetValues(dist_params,\
+		double distance = ResolveDistanceToNonMatchTargetValues(dist_params,
 			target_column_indices, target_values, target_value_types, partial_sums, good_match_index, num_enabled_features);
 		sorted_results.Push(DistanceReferencePair(distance, good_match_index));
 	}
@@ -749,12 +818,14 @@ void SeparableBoxFilterDataStore::FindNearestEntities(GeneralizedDistance &dist_
 	}
 }
 
-void SeparableBoxFilterDataStore::DeleteEntityIndexFromColumns(size_t index)
+void SeparableBoxFilterDataStore::DeleteEntityIndexFromColumns(size_t entity_index)
 {
 	for(size_t i = 0; i < columnData.size(); i++)
 	{
-		auto &feature_value = GetValue(index, i);
-		columnData[i]->DeleteIndexValue(feature_value, index);
+		auto &column_data = columnData[i];
+		auto &feature_value = GetValue(entity_index, i);
+		auto feature_type = column_data->GetIndexValueType(entity_index);
+		columnData[i]->DeleteIndexValue(feature_type, feature_value, entity_index);
 	}
 }
 
@@ -805,9 +876,9 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 	size_t query_feature_index, size_t absolute_feature_index, BitArrayIntegerSet &enabled_indices)
 {
 	auto &column = columnData[absolute_feature_index];
-	auto feature_type = dist_params.featureParams[query_feature_index].featureType;
+	auto effective_feature_type = dist_params.featureParams[query_feature_index].effectiveFeatureType;
 
-	bool value_is_null = (value_type == ENIVT_NULL || (value_type == ENIVT_NUMBER && FastIsNaN(value.number)));
+	bool value_is_null = EvaluableNodeImmediateValue::IsNullEquivalent(value_type, value);
 	//need to accumulate values for nulls if the value is a null
 	if(value_is_null)
 	{
@@ -820,7 +891,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 		//if the known-unknown term is less than unknown_unknown (this should be rare if nulls have semantic meaning)
 		//then need to populate the rest of the cases
 		double known_unknown_term = dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index);
-		if(feature_type == FDT_NOMINAL || known_unknown_term < unknown_unknown_term)
+		if(effective_feature_type == GeneralizedDistance::EFDT_NOMINAL || known_unknown_term < unknown_unknown_term)
 		{
 			BitArrayIntegerSet &known_unknown_indices = parametersAndBuffers.potentialMatchesSet;
 			known_unknown_indices = enabled_indices;
@@ -842,7 +913,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 	}
 
 	//if nominal, only need to compute the exact match
-	if(feature_type == FDT_NOMINAL)
+	if(effective_feature_type == GeneralizedDistance::EFDT_NOMINAL)
 	{
 		if(value_type == ENIVT_NUMBER)
 		{
@@ -850,7 +921,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 			if(exact_index_found)
 			{
 				double term = dist_params.ComputeDistanceTermNominalExactMatch(query_feature_index);
-				AccumulatePartialSums(*column->sortedNumberValueIndexPairs[value_index].second, query_feature_index, term);
+				AccumulatePartialSums(column->sortedNumberValueEntries[value_index]->indicesWithValue, query_feature_index, term);
 			}
 		}
 		else if(value_type == ENIVT_STRING_ID)
@@ -882,7 +953,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 		//didn't find the value
 		return dist_params.ComputeDistanceTermNominalNonMatch(query_feature_index);
 	}
-	else if(feature_type == FDT_CONTINUOUS_STRING)
+	else if(effective_feature_type == GeneralizedDistance::EFDT_CONTINUOUS_STRING)
 	{
 		if(value_type == ENIVT_STRING_ID)
 		{
@@ -897,7 +968,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 		//the next closest string will have an edit distance of 1
 		return dist_params.ComputeDistanceTermNonNominalNonCyclicNonNullRegular(1.0, query_feature_index);
 	}
-	else if(feature_type == FDT_CONTINUOUS_CODE)
+	else if(effective_feature_type == GeneralizedDistance::EFDT_CONTINUOUS_CODE)
 	{
 		//compute partial sums for all code of matching size
 		size_t code_size = 1;
@@ -918,7 +989,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 	//else feature_type == FDT_CONTINUOUS_NUMERIC or FDT_CONTINUOUS_UNIVERSALLY_NUMERIC
 
 	//if not a number or no numbers available, then no size
-	if(value_type != ENIVT_NUMBER || column->sortedNumberValueIndexPairs.size() == 0)
+	if(value_type != ENIVT_NUMBER || column->sortedNumberValueEntries.size() == 0)
 		return GetMaxDistanceTermFromValue(dist_params, value, value_type, query_feature_index, absolute_feature_index);
 
 	bool cyclic_feature = dist_params.IsFeatureCyclic(query_feature_index);
@@ -932,12 +1003,12 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 	if(exact_index_found)
 		term = dist_params.ComputeDistanceTermNonNominalExactMatch(query_feature_index);
 	else
-		term = dist_params.ComputeDistanceTermNonNominalNonNullRegular(value.number - column->sortedNumberValueIndexPairs[value_index].first, query_feature_index);
+		term = dist_params.ComputeDistanceTermNonNominalNonNullRegular(value.number - column->sortedNumberValueEntries[value_index]->value.number, query_feature_index);
 
-	size_t num_entities_computed = AccumulatePartialSums(*column->sortedNumberValueIndexPairs[value_index].second, query_feature_index, term);
+	size_t num_entities_computed = AccumulatePartialSums(column->sortedNumberValueEntries[value_index]->indicesWithValue, query_feature_index, term);
 
 	//the logic below assumes there are at least two entries
-	size_t num_unique_number_values = column->sortedNumberValueIndexPairs.size();
+	size_t num_unique_number_values = column->sortedNumberValueEntries.size();
 	if(num_unique_number_values <= 1)
 		return term;
 
@@ -967,17 +1038,18 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 		size_t next_lower_index = 0;
 		if(!cyclic_feature)
 		{
-			if(lower_value_index > 0)
+			if(lower_value_index > 1)
 			{
 				next_lower_index = lower_value_index - 1;
-				lower_diff = std::abs(value.number - column->sortedNumberValueIndexPairs[next_lower_index].first);
+				lower_diff = std::abs(value.number - column->sortedNumberValueEntries[next_lower_index]->value.number);
 				compute_lower = true;
 			}
 		}
 		else //cyclic_feature
 		{
 			size_t next_index;
-			if(lower_value_index > 0)
+			//0th index is unknown
+			if(lower_value_index > 1)
 				next_index = lower_value_index - 1;
 			else
 				next_index = num_unique_number_values - 1;
@@ -986,7 +1058,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 			if(next_index != value_index)
 			{
 				next_lower_index = next_index;
-				lower_diff = GeneralizedDistance::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueIndexPairs[next_lower_index].first), cycle_length);
+				lower_diff = GeneralizedDistance::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueEntries[next_lower_index]->value.number), cycle_length);
 				compute_lower = true;
 			}
 		}
@@ -1000,7 +1072,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 			if(upper_value_index + 1 < num_unique_number_values)
 			{
 				next_upper_index = upper_value_index + 1;
-				upper_diff = std::abs(value.number - column->sortedNumberValueIndexPairs[next_upper_index].first);
+				upper_diff = std::abs(value.number - column->sortedNumberValueEntries[next_upper_index]->value.number);
 				compute_upper = true;
 			}
 		}
@@ -1009,8 +1081,8 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 			size_t next_index;
 			if(upper_value_index + 1 < num_unique_number_values)
 				next_index = upper_value_index + 1;
-			else
-				next_index = 0;
+			else //0th index is unknown, start at 1st
+				next_index = 1;
 
 			//make sure didn't wrap all the way around for cyclic features
 			//either from the value itself or overlapping with the next_lower_index
@@ -1019,7 +1091,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 				if((!compute_lower || next_index != next_lower_index))
 				{
 					next_upper_index = next_index;
-					upper_diff = GeneralizedDistance::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueIndexPairs[next_upper_index].first), cycle_length);
+					upper_diff = GeneralizedDistance::ConstrainDifferenceToCyclicDifference(std::abs(value.number - column->sortedNumberValueEntries[next_upper_index]->value.number), cycle_length);
 					compute_upper = true;
 				}
 				else //upper and lower have overlapped, want to exit the loop
@@ -1056,7 +1128,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 			//use heuristic to decide whether to continue populating based on whether this diff will help the overall distance cutoffs
 			// look at the rate of change of the difference compared to before, and how many new entities will be populated
 			// if it is too small and doesn't fill enough (or fills too many), then stop expanding
-			size_t potential_entities = column->sortedNumberValueIndexPairs[next_closest_index].second->size();
+			size_t potential_entities = column->sortedNumberValueEntries[next_closest_index]->indicesWithValue.size();
 			if(num_entities_computed + potential_entities > max_num_to_find)
 				break;
 
@@ -1081,7 +1153,7 @@ double SeparableBoxFilterDataStore::PopulatePartialSumsWithSimilarFeatureValue(G
 		}
 
 		term = dist_params.ComputeDistanceTermNonNominalNonNullRegular(next_closest_diff, query_feature_index);
-		num_entities_computed += AccumulatePartialSums(*column->sortedNumberValueIndexPairs[next_closest_index].second, query_feature_index, term);
+		num_entities_computed += AccumulatePartialSums(column->sortedNumberValueEntries[next_closest_index]->indicesWithValue, query_feature_index, term);
 
 		//track the rate of change of difference
 		if(next_closest_diff - last_diff > largest_diff_delta)
diff --git a/src/Amalgam/SeparableBoxFilterDataStore.h b/src/Amalgam/SeparableBoxFilterDataStore.h
index 022808ed..306709a7 100644
--- a/src/Amalgam/SeparableBoxFilterDataStore.h
+++ b/src/Amalgam/SeparableBoxFilterDataStore.h
@@ -114,6 +114,16 @@ class SeparableBoxFilterDataStore
 	// assumes column data is empty
 	void BuildLabel(size_t column_index, const std::vector<Entity *> &entities);
 
+	//switches the column at column_index to or from interning, whichever would yield the best performance
+	void OptimizeColumn(size_t column_index);
+
+	//calls OptimizeColumn on all columns
+	inline void OptimizeAllColumns()
+	{
+		for(size_t column_index = 0; column_index < columnData.size(); column_index++)
+			OptimizeColumn(column_index);
+	}
+
 	//expand the structure by adding a new column/label/feature and populating with data from entities
 	void AddLabels(std::vector<size_t> &label_ids, const std::vector<Entity *> &entities)
 	{
@@ -286,18 +296,20 @@ class SeparableBoxFilterDataStore
 		if(column == labelIdToColumnIndex.end())
 			return;
 		size_t column_index = column->second;
+		auto &column_data = columnData[column_index];
 
-		columnData[column_index]->numberIndices.CopyTo(enabled_entities);
-		columnData[column_index]->nanIndices.EraseTo(enabled_entities);
+		column_data->numberIndices.CopyTo(enabled_entities);
+		column_data->nanIndices.EraseTo(enabled_entities);
 
 		//resize buffers and place each entity and value into its respective buffer
 		entities.resize(enabled_entities.size());
 		values.resize(enabled_entities.size());
 		size_t index = 0;
+		auto value_type = column_data->GetUnresolvedValueType(ENIVT_NUMBER);
 		for(auto entity_index : enabled_entities)
 		{
 			entities[index] = entity_index;
-			values[index] = GetValue(entity_index, column_index).number;
+			values[index] = column_data->GetResolvedValue(value_type, GetValue(entity_index, column_index)).number;
 			index++;
 		}
 	}
@@ -314,18 +326,20 @@ class SeparableBoxFilterDataStore
 		if(column == labelIdToColumnIndex.end())
 			return;
 		size_t column_index = column->second;
+		auto &column_data = columnData[column_index];
 
-		columnData[column_index]->numberIndices.IntersectTo(enabled_entities);
-		columnData[column_index]->nanIndices.EraseTo(enabled_entities);
+		column_data->numberIndices.IntersectTo(enabled_entities);
+		column_data->nanIndices.EraseTo(enabled_entities);
 
 		//resize buffers and place each entity and value into its respective buffer
 		entities.resize(enabled_entities.size());
 		values.resize(enabled_entities.size());
 		size_t index = 0;
+		auto value_type = column_data->GetUnresolvedValueType(ENIVT_NUMBER);
 		for(auto entity_index : enabled_entities)
 		{
 			entities[index] = entity_index;
-			values[index] = GetValue(entity_index, column_index).number;
+			values[index] = column_data->GetResolvedValue(value_type, GetValue(entity_index, column_index)).number;
 			index++;
 		}
 	}
@@ -414,16 +428,18 @@ class SeparableBoxFilterDataStore
 	template<typename Iter>
 	inline std::function<bool(Iter, double &)> GetNumberValueFromEntityIteratorFunction(size_t column_index)
 	{
-		auto number_indices_ptr = &columnData[column_index]->numberIndices;
+		auto column_data = columnData[column_index].get();
+		auto number_indices_ptr = &column_data->numberIndices;
+		auto value_type = column_data->GetUnresolvedValueType(ENIVT_NUMBER);
 
-		return [&, number_indices_ptr, column_index]
+		return [&, number_indices_ptr, column_index, column_data, value_type]
 		(Iter i, double &value)
 		{
 			size_t entity_index = *i;
 			if(!number_indices_ptr->contains(entity_index))
 				return false;
 
-			value = GetValue(entity_index, column_index).number;
+			value = column_data->GetResolvedValue(value_type, GetValue(entity_index, column_index)).number;
 			return true;
 		};
 	}
@@ -436,15 +452,17 @@ class SeparableBoxFilterDataStore
 		if(column_index >= columnData.size())
 			return [](size_t i, double &value) { return false; };
 
-		auto number_indices_ptr = &columnData[column_index]->numberIndices;
+		auto column_data = columnData[column_index].get();
+		auto number_indices_ptr = &column_data->numberIndices;
+		auto value_type = column_data->GetUnresolvedValueType(ENIVT_NUMBER);
 
-		return [&, number_indices_ptr, column_index]
+		return [&, number_indices_ptr, column_index, column_data, value_type]
 			(size_t i, double &value)
 			{
 				if(!number_indices_ptr->contains(i))
 					return false;
 
-				value = GetValue(i, column_index).number;
+				value = column_data->GetResolvedValue(value_type, GetValue(i, column_index)).number;
 				return true;
 			};
 	}
@@ -504,7 +522,7 @@ class SeparableBoxFilterDataStore
 	}
 
 	//deletes the index and associated data
-	void DeleteEntityIndexFromColumns(size_t index);
+	void DeleteEntityIndexFromColumns(size_t entity_index);
 
 	//adds a new labels to the database, populating new cells with -NaN, and updating the number of entities
 	// assumes label_ids is not empty and num_entities is nonzero
@@ -522,12 +540,15 @@ class SeparableBoxFilterDataStore
 		auto &partial_sums = parametersAndBuffers.partialSums;
 		const auto accum_location = partial_sums.GetAccumLocation(query_feature_index);
 
+		auto &column_data = columnData[absolute_feature_index];
+
 		//for each found element, accumulate associated partial sums
 		for(size_t entity_index : entity_indices)
 		{
 			//get value
-			auto &other_value = GetValue(entity_index, absolute_feature_index);
-			auto other_value_type = columnData[absolute_feature_index]->GetIndexValueType(entity_index);
+			auto other_value_type = column_data->GetIndexValueType(entity_index);
+			auto other_value = column_data->GetResolvedValue(other_value_type, GetValue(entity_index, absolute_feature_index));
+			other_value_type = column_data->GetResolvedValueType(other_value_type);
 
 			//compute term
 			double term = dist_params.ComputeDistanceTermRegular(value, other_value, value_type, other_value_type, query_feature_index);
@@ -663,8 +684,11 @@ class SeparableBoxFilterDataStore
 			if(dist_params.IsFeatureEnabled(i))
 			{
 				size_t column_index = target_column_indices[i];
-				auto &other_value = matrix[matrix_base_position + column_index];
-				auto other_value_type = columnData[column_index]->GetIndexValueType(other_index);
+				auto &column_data = columnData[column_index];
+
+				auto other_value_type = column_data->GetIndexValueType(other_index);
+				auto other_value = column_data->GetResolvedValue(other_value_type, matrix[matrix_base_position + column_index]);
+				other_value_type = column_data->GetResolvedValueType(other_value_type);
 
 				dist_accum += dist_params.ComputeDistanceTermRegular(target_values[i], other_value, target_value_types[i], other_value_type, i);
 			}
@@ -681,41 +705,62 @@ class SeparableBoxFilterDataStore
 		std::vector<EvaluableNodeImmediateValue> &target_values, std::vector<EvaluableNodeImmediateValueType> &target_value_types,
 		size_t entity_index, size_t query_feature_index)
 	{
-		auto feature_type = dist_params.featureParams[query_feature_index].featureType;
-
-		if(feature_type == FDT_NOMINAL)
+		switch(dist_params.featureParams[query_feature_index].effectiveFeatureType)
+		{
+		case GeneralizedDistance::EFDT_NOMINAL:
 			return dist_params.ComputeDistanceTermNominalNonMatch(query_feature_index);
-		else
+
+		case GeneralizedDistance::EFDT_CONTINUOUS_UNIVERSALLY_NUMERIC:
 		{
 			const size_t column_index = target_label_indices[query_feature_index];
+			return dist_params.ComputeDistanceTermNonNominalNonCyclicOneNonNullRegular(target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index);
+		}
 
-			if(feature_type == FDT_CONTINUOUS_UNIVERSALLY_NUMERIC)
-			{
+		case GeneralizedDistance::EFDT_VALUES_UNIVERSALLY_PRECOMPUTED:
+		{
+			const size_t column_index = target_label_indices[query_feature_index];
+			return dist_params.ComputeDistanceTermNumberInterned(GetValue(entity_index, column_index).indirectionIndex, query_feature_index);
+		}
+
+		case GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC:
+		{
+			const size_t column_index = target_label_indices[query_feature_index];
+			auto &column_data = columnData[column_index];
+			if(column_data->numberIndices.contains(entity_index))
 				return dist_params.ComputeDistanceTermNonNominalNonCyclicOneNonNullRegular(target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index);
-			}
-			else if(feature_type == FDT_CONTINUOUS_NUMERIC)
-			{
-				auto &column_data = columnData[column_index];
-				if(column_data->numberIndices.contains(entity_index))
-					return dist_params.ComputeDistanceTermNonNominalNonCyclicOneNonNullRegular(target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index);
-				else
-					return dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index);
-			}
-			else if(feature_type == FDT_CONTINUOUS_NUMERIC_CYCLIC)
-			{
-				auto &column_data = columnData[column_index];
-				if(column_data->numberIndices.contains(entity_index))
-					return dist_params.ComputeDistanceTermNonNominalOneNonNullRegular(target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index);
-				else
-					return dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index);
-			}
-			else //feature_type == FDT_CONTINUOUS_CODE
-			{
-				auto &other_value = GetValue(entity_index, column_index);
-				auto other_value_type = columnData[column_index]->GetIndexValueType(entity_index);
+			else
+				return dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index);
+		}
 
-				return dist_params.ComputeDistanceTermRegular(target_values[query_feature_index], other_value, target_value_types[query_feature_index], other_value_type, query_feature_index);
-			}
+		case GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC_CYCLIC:
+		{
+			const size_t column_index = target_label_indices[query_feature_index];
+			auto &column_data = columnData[column_index];
+			if(column_data->numberIndices.contains(entity_index))
+				return dist_params.ComputeDistanceTermNonNominalOneNonNullRegular(target_values[query_feature_index].number - GetValue(entity_index, column_index).number, query_feature_index);
+			else
+				return dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index);
+		}
+
+		case GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC_PRECOMPUTED:
+		{
+			const size_t column_index = target_label_indices[query_feature_index];
+			auto &column_data = columnData[column_index];
+			if(column_data->numberIndices.contains(entity_index))
+				return dist_params.ComputeDistanceTermNumberInterned(GetValue(entity_index, column_index).indirectionIndex, query_feature_index);
+			else
+				return dist_params.ComputeDistanceTermKnownToUnknown(query_feature_index);
+		}
+
+		default: //GeneralizedDistance::EFDT_CONTINUOUS_STRING or GeneralizedDistance::EFDT_CONTINUOUS_CODE
+		{
+			const size_t column_index = target_label_indices[query_feature_index];
+			auto &column_data = columnData[column_index];
+			auto other_value_type = column_data->GetIndexValueType(entity_index);
+			auto other_value = column_data->GetResolvedValue(other_value_type, GetValue(entity_index, column_index));
+			other_value_type = column_data->GetResolvedValueType(other_value_type); //the value was resolved above, so the type must be resolved to match
+			return dist_params.ComputeDistanceTermRegular(target_values[query_feature_index], other_value, target_value_types[query_feature_index], other_value_type, query_feature_index);
+		}
 		}
 	}
 
@@ -782,50 +827,73 @@ class SeparableBoxFilterDataStore
 				entity_index, query_feature_index);
 
 			//break out of the loop before the iterator is incremented to save a few cycles
-			if(distance > reject_distance)
-				return std::make_pair(false, distance);
-
-			if(num_uncalculated_features == 0)
-				break;
+			//do this via logic to minimize the number of branches
+			bool unacceptable_distance = (distance > reject_distance);
+			if(unacceptable_distance || num_uncalculated_features == 0)
+				return std::make_pair(!unacceptable_distance, distance);
 		}
 
-		//done with computation
+		//shouldn't make it here
 		return std::make_pair(true, distance);
 	}
 
-	//populates the next target attribute in each vector based on column_index, position data, and mkdist_feature_type
-	// if mkdist_feature_type can be modified for efficiency, this function will update it, which is why it is passed by reference
-	__forceinline void PopulateNextTargetAttributes(GeneralizedDistance &dist_params,
+	//populates the next target attribute in each vector based on column_index and the position value and its type
+	//also selects the feature's effectiveFeatureType (e.g., for interned or universally numeric columns) and updates dist_params accordingly
+	__forceinline void PopulateNextTargetAttributes(GeneralizedDistance &dist_params, size_t query_feature_index,
 		std::vector<size_t> &target_column_indices, std::vector<EvaluableNodeImmediateValue> &target_values,
 		std::vector<EvaluableNodeImmediateValueType> &target_value_types, size_t column_index,
-		EvaluableNodeImmediateValue &position_value, EvaluableNodeImmediateValueType position_value_type,
-		FeatureDifferenceType &mkdist_feature_type)
+		EvaluableNodeImmediateValue &position_value, EvaluableNodeImmediateValueType position_value_type)
 	{
 		target_column_indices.push_back(column_index);
 
-		if(mkdist_feature_type == FDT_NOMINAL || mkdist_feature_type == FDT_CONTINUOUS_STRING || mkdist_feature_type == FDT_CONTINUOUS_CODE)
+		auto &feature_type = dist_params.featureParams[query_feature_index].featureType;
+		auto &effective_feature_type = dist_params.featureParams[query_feature_index].effectiveFeatureType;
+
+		if(feature_type == GeneralizedDistance::FDT_NOMINAL
+			|| feature_type == GeneralizedDistance::FDT_CONTINUOUS_STRING
+			|| feature_type == GeneralizedDistance::FDT_CONTINUOUS_CODE)
 		{
 			target_values.push_back(position_value);
 			target_value_types.push_back(position_value_type);
+
+			if(feature_type == GeneralizedDistance::FDT_NOMINAL)
+				effective_feature_type = GeneralizedDistance::EFDT_NOMINAL;
+			else if(feature_type == GeneralizedDistance::FDT_CONTINUOUS_STRING)
+				effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_STRING;
+			else if(feature_type == GeneralizedDistance::FDT_CONTINUOUS_CODE)
+				effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_CODE;
 		}
-		else // mkdist_feature_type == FDT_CONTINUOUS_NUMERIC or FDT_CONTINUOUS_NUMERIC_CYCLIC
+		else // feature_type is some form of numeric
 		{
-			//if everything is either non-existant or numeric, then can shortcut later
+			//looking for continuous; if the position value is not a number, just use NaN
+			double position_value_numeric = (position_value_type == ENIVT_NUMBER ? position_value.number : std::numeric_limits<double>::quiet_NaN());
+			target_values.push_back(position_value_numeric);
+			target_value_types.push_back(ENIVT_NUMBER);
+
+			//set up effective_feature_type
 			auto &column_data = columnData[column_index];
+
+			//determine if all values are numeric
 			size_t num_values_stored_as_numbers = column_data->numberIndices.size() + column_data->invalidIndices.size() + column_data->nullIndices.size();
-			if(GetNumInsertedEntities() == num_values_stored_as_numbers && mkdist_feature_type == FDT_CONTINUOUS_NUMERIC)
-				mkdist_feature_type = FDT_CONTINUOUS_UNIVERSALLY_NUMERIC;
+			bool all_values_numeric = (GetNumInsertedEntities() == num_values_stored_as_numbers);
 
-			auto value_type = position_value_type;
-			if(value_type == ENIVT_NUMBER)
+			if(column_data->numberValuesInterned)
 			{
-				target_values.push_back(position_value);
-				target_value_types.push_back(ENIVT_NUMBER);
+				if(all_values_numeric)
+					effective_feature_type = GeneralizedDistance::EFDT_VALUES_UNIVERSALLY_PRECOMPUTED;
+				else
+					effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC_PRECOMPUTED;
+
+				dist_params.ComputeAndStoreInternedNumberValuesAndDistanceTerms(query_feature_index, position_value_numeric, &column_data->internedNumberIndexToNumberValue);
 			}
-			else //looking for continuous and not a number, so just put as nan
+			else
 			{
-				target_values.push_back(std::numeric_limits<double>::quiet_NaN());
-				target_value_types.push_back(ENIVT_NUMBER);
+				if(all_values_numeric && feature_type == GeneralizedDistance::FDT_CONTINUOUS_NUMERIC)
+					effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_UNIVERSALLY_NUMERIC;
+				else if(feature_type == GeneralizedDistance::FDT_CONTINUOUS_NUMERIC_CYCLIC)
+					effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC_CYCLIC;
+				else
+					effective_feature_type = GeneralizedDistance::EFDT_CONTINUOUS_NUMERIC;
 			}
 		}
 	}
@@ -853,10 +921,9 @@ class SeparableBoxFilterDataStore
 
 			if(dist_params.IsFeatureEnabled(i))
 			{
-				PopulateNextTargetAttributes(dist_params,
+				PopulateNextTargetAttributes(dist_params, i,
 					target_column_indices, target_values, target_value_types,
-					column->second, position_values[i], position_value_types[i],
-					dist_params.featureParams[i].featureType);
+					column->second, position_values[i], position_value_types[i]);
 			}
 		}
 	}
@@ -889,7 +956,8 @@ class SeparableBoxFilterDataStore
 					feature_params.unknownToUnknownDifference = unknown_distance_term;
 			}
 
-			dist_params.ComputeAndStoreUncertaintyDistanceTerms(i);
+			dist_params.ComputeAndStoreUncertaintyDistanceTerms(i,
+				EvaluableNodeImmediateValue::IsNullEquivalent(target_value_types[i], target_values[i]));
 		}
 	}
 
diff --git a/src/Amalgam/amlg_code/test.amlg b/src/Amalgam/amlg_code/test.amlg
index 2847c612..55668ac8 100644
--- a/src/Amalgam/amlg_code/test.amlg
+++ b/src/Amalgam/amlg_code/test.amlg
@@ -1,4 +1,28 @@
 (seq
-	;(print (format (list (assoc a 3 b 4) (assoc c "c" d (null))) "code" "yaml") "\n")
-	(print (format (true) "code" "yaml") "\n")
+ (create_entities "BoxConvictionTestContainer" (null) )
+
+ (create_entities (list "BoxConvictionTestContainer" "vert0") (lambda
+  (null ##x 0 ##y 0 ##weight 2)
+ ) )
+
+ (create_entities (list "BoxConvictionTestContainer" "vert1") (lambda
+  (null ##x 0 ##y 1 ##weight 1)
+ ) )
+
+ (create_entities (list "BoxConvictionTestContainer" "vert2") (lambda
+  (null ##x 1 ##y 0 ##weight 1)
+ ) )
+
+ (create_entities (list "BoxConvictionTestContainer" "vert3") (lambda
+  (null ##x 2 ##y 1 ##weight 1)
+ ) )
+
+  ;should print:
+  ;dc: (list
+	;(list "vert0" "vert1" "vert2" "vert3")
+	;(list 1 1 1 1.4142135623730951)
+  ;)
+  (print "dc: " (compute_on_contained_entities "BoxConvictionTestContainer" (list
+  (compute_entity_distance_contributions 1 (list "x" "y") (list "vert3") (null) (null) (null) (null) 2.0 -1 (null) "fixed_seed" (null) "recompute_precise" (true))
+ )))
 )
\ No newline at end of file
diff --git a/src/Amalgam/entity/EntityQueryBuilder.h b/src/Amalgam/entity/EntityQueryBuilder.h
index 692d7dfc..24db9321 100644
--- a/src/Amalgam/entity/EntityQueryBuilder.h
+++ b/src/Amalgam/entity/EntityQueryBuilder.h
@@ -64,18 +64,18 @@ namespace EntityQueryBuilder
 			[&dist_params](size_t i, bool found, EvaluableNode *en) {
 				if(i < dist_params.featureParams.size())
 				{
-					auto feature_type = FDT_CONTINUOUS_NUMERIC;
+					auto feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC;
 					if(found)
 					{
 						StringInternPool::StringID feature_type_id = EvaluableNode::ToStringIDIfExists(en);
 						switch(feature_type_id)
 						{
-						case ENBISI_nominal:								feature_type = FDT_NOMINAL;						break;
-						case ENBISI_continuous:								feature_type = FDT_CONTINUOUS_NUMERIC;			break;
-						case ENBISI_cyclic:									feature_type = FDT_CONTINUOUS_NUMERIC_CYCLIC;	break;
-						case GetStringIdFromNodeTypeFromString(ENT_STRING): feature_type = FDT_CONTINUOUS_STRING;			break;	
-						case ENBISI_code:									feature_type = FDT_CONTINUOUS_CODE;				break;
-						default:											feature_type = FDT_CONTINUOUS_NUMERIC;			break;
+						case ENBISI_nominal:								feature_type = GeneralizedDistance::FDT_NOMINAL;					break;
+						case ENBISI_continuous:								feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC;			break;
+						case ENBISI_cyclic:									feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC_CYCLIC;	break;
+						case GetStringIdFromNodeTypeFromString(ENT_STRING): feature_type = GeneralizedDistance::FDT_CONTINUOUS_STRING;			break;	
+						case ENBISI_code:									feature_type = GeneralizedDistance::FDT_CONTINUOUS_CODE;			break;
+						default:											feature_type = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC;			break;
 						}
 					}
 					dist_params.featureParams[i].featureType = feature_type;
@@ -93,7 +93,7 @@ namespace EntityQueryBuilder
 					//get attributes based on feature type
 					switch(dist_params.featureParams[i].featureType)
 					{
-					case FDT_NOMINAL:
+					case GeneralizedDistance::FDT_NOMINAL:
 						if(found && !EvaluableNode::IsNull(en))
 						{
 							if(en->EvaluableNode::IsOrderedArray())
@@ -118,7 +118,7 @@ namespace EntityQueryBuilder
 						}
 						break;
 
-					case FDT_CONTINUOUS_NUMERIC_CYCLIC:
+					case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC_CYCLIC:
 						if(found && !EvaluableNode::IsNull(en))
 						{
 							if(en->EvaluableNode::IsOrderedArray())
@@ -139,14 +139,13 @@ namespace EntityQueryBuilder
 						}
 						else //can't be cyclic without a range
 						{
-							dist_params.featureParams[i].featureType = FDT_CONTINUOUS_NUMERIC;
+							dist_params.featureParams[i].featureType = GeneralizedDistance::FDT_CONTINUOUS_NUMERIC;
 						}
 						break;
 
-					case FDT_CONTINUOUS_NUMERIC:
-					case FDT_CONTINUOUS_UNIVERSALLY_NUMERIC:
-					case FDT_CONTINUOUS_STRING:
-					case FDT_CONTINUOUS_CODE:
+					case GeneralizedDistance::FDT_CONTINUOUS_NUMERIC:
+					case GeneralizedDistance::FDT_CONTINUOUS_STRING:
+					case GeneralizedDistance::FDT_CONTINUOUS_CODE:
 						if(found && !EvaluableNode::IsNull(en))
 						{
 							if(en->EvaluableNode::IsOrderedArray())
diff --git a/src/Amalgam/evaluablenode/EvaluableNode.h b/src/Amalgam/evaluablenode/EvaluableNode.h
index fc3d9245..a096a225 100644
--- a/src/Amalgam/evaluablenode/EvaluableNode.h
+++ b/src/Amalgam/evaluablenode/EvaluableNode.h
@@ -916,11 +916,12 @@ class EvaluableNode
 // compare two values based on their collective types
 enum EvaluableNodeImmediateValueType
 {
-	ENIVT_NOT_EXIST,	//there is nothing to even hold the data
-	ENIVT_NULL,			//no data being held
-	ENIVT_NUMBER,		//number
-	ENIVT_STRING_ID,	//stringID
-	ENIVT_CODE			//code (more general than any of the above)
+	ENIVT_NOT_EXIST,			//there is nothing to even hold the data
+	ENIVT_NULL,					//no data being held
+	ENIVT_NUMBER,				//number
+	ENIVT_STRING_ID,			//stringID
+	ENIVT_CODE,					//code (more general than any of the above)
+	ENIVT_NUMBER_INDIRECTION_INDEX		//not a real EvaluableNode type, but an index to some data structure that has a number
 };
 
 //structure that can hold the most immediate value type of an EvaluableNode 
@@ -992,28 +993,30 @@ union EvaluableNodeImmediateValue
 			return false;
 
 		//types are the same, just use type_1 for reference
-		if(type_1 == ENIVT_NUMBER)
-		{
-			if(EqualIncludingNaN(value_1.number, value_2.number))
-				return false;
-		}
+		if(type_1 == ENIVT_NULL)
+			return true;
+		else if(type_1 == ENIVT_NUMBER)
+			return EqualIncludingNaN(value_1.number, value_2.number);
 		else if(type_1 == ENIVT_STRING_ID)
-		{
-			if(value_1.stringID == value_2.stringID)
-				return false;
-		}
+			return (value_1.stringID == value_2.stringID);
+		else if(type_1 == ENIVT_NUMBER_INDIRECTION_INDEX)
+			return (value_1.indirectionIndex == value_2.indirectionIndex);
 		else
-		{
-			if(EvaluableNode::AreDeepEqual(value_1.code, value_2.code))
-				return false;
-		}
+			return EvaluableNode::AreDeepEqual(value_1.code, value_2.code);
+	}
 
-		return true;
+	//returns true if type is ENIVT_NULL, or if value is a null equivalent for its type: an ENIVT_NUMBER for which FastIsNaN holds or an ENIVT_STRING_ID equal to NOT_A_STRING_ID; any other type returns false
+	static bool IsNullEquivalent(EvaluableNodeImmediateValueType type, EvaluableNodeImmediateValue &value)
+	{
+		return (type == ENIVT_NULL
+				|| (type == ENIVT_NUMBER && FastIsNaN(value.number))
+				|| (type == ENIVT_STRING_ID && value.stringID == string_intern_pool.NOT_A_STRING_ID));
+	}
 
 	double number;
 	StringInternPool::StringID stringID;
 	EvaluableNode *code;
+	size_t indirectionIndex;
 };
 
 //used for storing a value and type together
diff --git a/src/Amalgam/out.txt b/src/Amalgam/out.txt
index 82c39bf5..249653cb 100644
--- a/src/Amalgam/out.txt
+++ b/src/Amalgam/out.txt
@@ -1227,7 +1227,7 @@ abcdef
 				8
 			)
 		accum_string "abcdef"
-		argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg")
+		argv (list "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg")
 		bar (declare
 				(assoc x 6)
 				(+ x 2)
@@ -1240,10 +1240,10 @@ abcdef
 				A (assoc B 2)
 				B 2
 			)
-		interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
+		interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
 		raaa 2
 		rwww 1
-		start_time 1695320985.913567
+		start_time 1697958069.900312
 		www 1
 		x 12
 		zz 10
@@ -1270,7 +1270,7 @@ abcdef
 			8
 		)
 	accum_string "abcdef"
-	argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg")
+	argv (list "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg")
 	bar (declare
 			(assoc x 6)
 			(+ x 2)
@@ -1283,10 +1283,10 @@ abcdef
 			A (assoc B 2)
 			B 2
 		)
-	interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
+	interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
 	raaa 2
 	rwww 1
-	start_time 1695320985.913567
+	start_time 1697958069.900312
 	www 1
 	x 12
 	zz 10
@@ -1312,7 +1312,7 @@ abcdef
 			8
 		)
 	accum_string "abcdef"
-	argv (list "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg")
+	argv (list "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\src\\Amalgam\\./amlg_code/full_test.amlg")
 	bar (declare
 			(assoc x 6)
 			(+ x 2)
@@ -1325,10 +1325,10 @@ abcdef
 			A (assoc B 2)
 			B 2
 		)
-	interpreter "C:\\Users\\ChristopherHazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
+	interpreter "C:\\Users\\Chris Hazard\\Desktop\\Howso_repos\\amalgam\\x64\\MT_Release_EXE\\Amalgam.exe"
 	raaa 2
 	rwww 1
-	start_time 1695320985.913567
+	start_time 1697958069.900312
 	www 1
 	x 12
 	zz 10
@@ -1596,7 +1596,7 @@ e:
   - .inf
 
 25: (assoc a 1)
-current date-time in epoch: 2023-09-21-14.29.45.9780260
+current date-time in epoch: 2023-10-22-03.01.10.1536540
 2020-06-07 00:22:59
 1391230800
 1391230800
@@ -3500,7 +3500,7 @@ deep sets
 
 --set_entity_root_permission--
 RootTest
-1695320986.253307
+1697958070.384504
 (true)
 
 RootTest
@@ -3730,7 +3730,7 @@ hello
 				)
 			)
 	)
-	(set_entity_rand_seed new_entity "ºÞÀT1Sx0Ñ”-I´»ÿ")
+	(set_entity_rand_seed new_entity "bÁ¬ò­«1L0Ñ”-I´»ÿ")
 	(set_entity_rand_seed
 		(first
 			(create_entities
@@ -3743,7 +3743,7 @@ hello
 				)
 			)
 		)
-		"TìxÊ\"½ÖJüð‡²5O´ÿ"
+		" ºþŽ9Öí8­ÕV­:oàÿ"
 	)
 	(set_entity_rand_seed
 		(first
@@ -3779,7 +3779,7 @@ hello
 				)
 			)
 	)
-	(set_entity_rand_seed new_entity "ºÞÀT1Sx0Ñ”-I´»ÿ")
+	(set_entity_rand_seed new_entity "bÁ¬ò­«1L0Ñ”-I´»ÿ")
 	(set_entity_rand_seed
 		(first
 			(create_entities
@@ -4243,13 +4243,13 @@ case convictions unweighted:
 )
 case convictions weighted by object (with erroneously long nominal):
 (assoc
-	TestContainerExec 0.28356810230095286
-	vert0 1.3005955468751227
-	vert1 1.3005955468751227
-	vert2 1.1369409848092722
-	vert3 1.272922458938727
-	vert4 73.971185205438
-	vert5 3.8897264529001325
+	TestContainerExec 0.2828909712209332
+	vert0 1.2974898602334366
+	vert1 1.2974898602334366
+	vert2 1.1342260882087178
+	vert3 1.2974898602334366
+	vert4 73.7945497230898
+	vert5 3.880438191446671
 )
 case convictions x exists before:
 (assoc entity3 1.0000000933277426 entity4 0.9999998458521889 entity5 1.0000000608201045)
@@ -4487,7 +4487,7 @@ a
 (list "hello" "!")
 (assoc a1 1.4142135623730951 a2 2 a3 1.4142135623730951)
 (assoc a1 1.4142135623730951 a3 1.4142135623730951)
-(assoc a3 1.4142135623730951)
+(assoc a1 1.4142135623730951)
 (assoc a1 5.0990195135927845 a2 2 a3 5.0990195135927845)
 (assoc a1 1 a3 1 a4 0)
 --accuracy tests--
@@ -4634,16 +4634,16 @@ cyclic KL: (assoc
 	vert1 0.0020695242435298626
 	vert2 0.0020695242435298626
 	vert3 0.03622271709266012
-	vert4 0.06081391029364311
+	vert4 0.05872535496117577
 )
 cyclic conviction: (assoc
-	vert0 0.5137287240708814
-	vert1 16.01087833672136
-	vert2 16.01087833672136
-	vert3 0.9147547047144656
-	vert4 0.5448572656824459
+	vert0 0.5072524658798658
+	vert1 15.809039161461927
+	vert2 15.809039161461927
+	vert3 0.9032229616532002
+	vert4 0.5571220443569613
 )
-cyclic group kl divergence: 0.06081391029364306
+cyclic group kl divergence: 0.05872535496117583
 surprisal transforms
 probabilities: (list
 	(list "vert0" "vert1" "vert2" "vert3")
@@ -4723,4 +4723,4 @@ Expecting 1000: 1000
 concurrent entity writes successful: (true)
 
 --total execution time--
-1.1435298919677734
+1.1228001117706299