howsoai · howsohazard · Oct 30, 2023 · Oct 27, 2023 · Oct 27, 2023 · Oct 27, 2023
@@ -15,7 +15,7 @@
 
 ## Introduction
 
-Amalgam&trade; is a domain specific language ([DSL](https://en.wikipedia.org/wiki/Domain-specific_language)) developed primarily for [genetic programming](https://en.wikipedia.org/wiki/Generic_programming) and [instance based machine learning](https://en.wikipedia.org/wiki/Instance-based_learning), but also for simulation, agent based modeling, data storage and retrieval, the mathematics of probability theory and information theory, and game content and AI. The language format is somewhat LISP-like in that it uses parenthesized list format with prefix notation and is geared toward functional programming, where there is a one-to-one mapping between the code and the corresponding parse tree.
+Amalgam&trade; is a domain specific language ([DSL](https://en.wikipedia.org/wiki/Domain-specific_language)) developed primarily for [genetic programming](https://en.wikipedia.org/wiki/Genetic_programming) and [instance based machine learning](https://en.wikipedia.org/wiki/Instance-based_learning), but also for simulation, agent based modeling, data storage and retrieval, the mathematics of probability theory and information theory, and game content and AI. The language format is somewhat LISP-like in that it uses parenthesized list format with prefix notation and is geared toward functional programming, where there is a one-to-one mapping between the code and the corresponding parse tree.
 
 Whereas virtually all practical programming languages are primarily designed for some combination of programmer productivity and computational performance, Amalgam prioritizes code matching and merging, as well as a deep equivalence of code and data. Amalgam uses _entities_ to store code and data, with a rich query system to find entities by their _labels_. The language uses a variable stack, but all attributes and methods are stored directly as labels in entities. There is no separate class versus instance, but entities can be used as prototypes to be copied and modified. Though code and data are represented as trees from the root of each entity, graphs in code and data structures are permitted and are flattened to code using special references. Further, instead of failing early when there is an error, Amalgam supports genetic programming and code mixing by being extremely weakly typed, and attempts to find a way to execute code no matter whether types match or not.
 

@@ -950,7 +950,8 @@ class BitArrayIntegerSet
 	}
 
 	//Sets this BitArrayIntegerSet to the set that contains only the elements that both it and other contain
-	void Intersect(BitArrayIntegerSet &other)
+	// does NOT update the number of elements, so UpdateNumElements must be called
+	void IntersectInBatch(BitArrayIntegerSet &other)
 	{
 		//if no intersection, then just clear and exit
 		if(numElements == 0 || other.numElements == 0)
@@ -971,6 +972,12 @@ class BitArrayIntegerSet
 			bitBucket[i] = 0;
 
 		TrimBack();
+	}
+
+	//Sets this BitArrayIntegerSet to the set that contains only the elements that both it and other contain
+	inline void Intersect(BitArrayIntegerSet &other)
+	{
+		IntersectInBatch(other);
 		UpdateNumElements();
 	}
 
@@ -1492,12 +1499,22 @@ class EfficientIntegerSet
 	}
 
 	//removes all elements of this container from other; if in_batch is true, other's number of elements will need to be updated afterward
-	void EraseTo(BitArrayIntegerSet &other)
+	inline void EraseTo(BitArrayIntegerSet &other, bool in_batch = false)
 	{
 		if(isSisContainer)
-			other.erase(sisContainer);
+		{
+			if(in_batch)
+				other.EraseInBatch(sisContainer);
+			else
+				other.erase(sisContainer);
+		}
 		else
-			other.erase(baisContainer);
+		{
+			if(in_batch)
+				other.EraseInBatch(baisContainer);
+			else
+				other.erase(baisContainer);
+		}
 	}
 
 	//removes all elements contained by other, intended for calling in a batch
@@ -1660,12 +1677,22 @@ class EfficientIntegerSet
 	}
 
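 	//sets other to the set that contains only elements that this container and other jointly contain; if in_batch is true, other's number of elements is not updated, so UpdateNumElements must be called on it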
-	inline void IntersectTo(BitArrayIntegerSet &other)
+	inline void IntersectTo(BitArrayIntegerSet &other, bool in_batch = false)
 	{
 		if(IsSisContainer())
-			other.Intersect(sisContainer);
+		{
+			if(in_batch)
+				other.IntersectInBatch(sisContainer);
+			else
+				other.Intersect(sisContainer);
+		}
 		else
-			other.Intersect(baisContainer);
+		{
+			if(in_batch)
+				other.IntersectInBatch(baisContainer);
+			else
+				other.Intersect(baisContainer);
+		}
 	}
 
 	//flips the elements in the set starting with element 0 up to but not including up_to_id

@@ -299,11 +299,6 @@ void SeparableBoxFilterDataStore::FindEntitiesWithinDistance(GeneralizedDistance
 	distances.clear();
 	distances.resize(GetNumInsertedEntities(), 0.0);
 
-	//remove any entities that are missing labels
-	for(auto absolute_feature_index : target_column_indices)
-		columnData[absolute_feature_index]->invalidIndices.EraseInBatchFrom(enabled_indices);
-	enabled_indices.UpdateNumElements();
-
 	//for each desired feature, compute and add distance terms of possible window query candidate entities
 	for(size_t query_feature_index = 0; query_feature_index < target_column_indices.size(); query_feature_index++)
 	{
@@ -476,11 +471,6 @@ void SeparableBoxFilterDataStore::FindEntitiesNearestToIndexedEntity(Generalized
 	possible_knn_indices.erase(search_index);
 	possible_knn_indices.erase(ignore_index);
 
-	//remove invalid cases
-	for(size_t absolute_feature_index : target_column_indices)
-		columnData[absolute_feature_index]->invalidIndices.EraseInBatchFrom(possible_knn_indices);
-	possible_knn_indices.UpdateNumElements();
-
 	//if num enabled indices < top_k, return sorted distances
 	if(GetNumInsertedEntities() <= top_k || possible_knn_indices.size() <= top_k)
 		return FindAllValidElementDistances(*dist_params, target_column_indices, target_values, target_value_types, possible_knn_indices, distances_out, rand_stream);
@@ -634,11 +624,6 @@ void SeparableBoxFilterDataStore::FindNearestEntities(GeneralizedDistance &dist_
 
 	PopulateUnknownFeatureValueTerms(dist_params);
 
-	//ignore cases with missing labels
-	for(size_t i = 0; i < num_enabled_features; i++)
-		columnData[target_column_indices[i]]->invalidIndices.EraseInBatchFrom(enabled_indices);
-	enabled_indices.UpdateNumElements();
-
 	enabled_indices.erase(ignore_entity_index);
 
 	//if num enabled indices < top_k, return sorted distances

@@ -266,7 +266,9 @@ class SeparableBoxFilterDataStore
 	}
 
 	//filters out to include only entities that have the given feature
-	inline void IntersectEntitiesWithFeature(size_t feature_id, BitArrayIntegerSet &out)
+	//if in_batch is true, will update out in batch for performance,
+	//meaning its number of elements will need to be updated
+	inline void IntersectEntitiesWithFeature(size_t feature_id, BitArrayIntegerSet &out, bool in_batch)
 	{
 		if(numEntities == 0)
 		{
@@ -281,7 +283,7 @@ class SeparableBoxFilterDataStore
 			return;
 		}
 
-		columnData[column->second]->invalidIndices.EraseTo(out);
+		columnData[column->second]->invalidIndices.EraseTo(out, in_batch);
 	}
 
 	//sets out to include only entities that have the given feature and records the values into
@@ -364,7 +366,9 @@ class SeparableBoxFilterDataStore
 	}
 
 	//filters out to include only entities that don't have the given feature
-	inline void IntersectEntitiesWithoutFeature(size_t feature_id, BitArrayIntegerSet &out)
+	//if in_batch is true, will update out in batch for performance,
+	//meaning its number of elements will need to be updated
+	inline void IntersectEntitiesWithoutFeature(size_t feature_id, BitArrayIntegerSet &out, bool in_batch)
 	{
 		if(numEntities == 0)
 			return;
@@ -373,7 +377,7 @@ class SeparableBoxFilterDataStore
 		if(column == labelIdToColumnIndex.end())
 			return;
 
-		columnData[column->second]->invalidIndices.IntersectTo(out);
+		columnData[column->second]->invalidIndices.IntersectTo(out, in_batch);
 	}
 
 	//given a feature_id, value_type, and value, inserts into out all the entities that have the value
@@ -488,6 +492,7 @@ class SeparableBoxFilterDataStore
 
 	//populates distances_out with all entities and their distances that have a distance to target less than max_dist
 	//results are intersected with the enabled_indices set
+	//assumes that enabled_indices only contains indices that have valid values for all the features
 	void FindEntitiesWithinDistance(GeneralizedDistance &dist_params, std::vector<size_t> &position_label_ids,
 		std::vector<EvaluableNodeImmediateValue> &position_values, std::vector<EvaluableNodeImmediateValueType> &position_value_types,
 		double max_dist, BitArrayIntegerSet &enabled_indices, std::vector<DistanceReferencePair<size_t>> &distances_out);
@@ -496,13 +501,15 @@ class SeparableBoxFilterDataStore
 	// if expand_to_first_nonzero_distance is set, then it will expand top_k until it finds the first nonzero distance or until it includes all enabled indices
 	// if const_dist_params is true, then it will make a copy before making any modifications
 	//will not modify enabled_indices, but instead will make a copy for any modifications
+	//assumes that enabled_indices only contains indices that have valid values for all the features
 	void FindEntitiesNearestToIndexedEntity(GeneralizedDistance *dist_params_ref, std::vector<size_t> &position_label_ids,
 		bool constant_dist_params, size_t search_index, size_t top_k, BitArrayIntegerSet &enabled_indices,
 		bool expand_to_first_nonzero_distance, std::vector<DistanceReferencePair<size_t>> &distances_out,
 		size_t ignore_index = std::numeric_limits<size_t>::max(), RandomStream rand_stream = RandomStream());
 
 	//Finds the nearest neighbors
 	//enabled_indices is the set of entities to find from, and will be modified
+	//assumes that enabled_indices only contains indices that have valid values for all the features
 	void FindNearestEntities(GeneralizedDistance &dist_params, std::vector<size_t> &position_label_ids,
 		std::vector<EvaluableNodeImmediateValue> &position_values, std::vector<EvaluableNodeImmediateValueType> &position_value_types,
 		size_t top_k, size_t ignore_entity_index, BitArrayIntegerSet &enabled_indices,

@@ -695,6 +695,8 @@ EvaluableNodeReference EntityQueryCondition::GetMatchingEntities(Entity *contain
 	{
 		size_t num_to_keep = std::min(static_cast<size_t>(maxToRetrieve), matching_entities.size());
 
+		distParams.SetAndConstrainParams();
+
 		//get values for each entity
 		StochasticTieBreakingPriorityQueue<DistanceReferencePair<Entity *>> nearest_entities(randomStream.CreateOtherStreamViaRand());
 		for(size_t i = 0; i < matching_entities.size(); i++)
@@ -755,6 +757,7 @@ EvaluableNodeReference EntityQueryCondition::GetMatchingEntities(Entity *contain
 
 	case ENT_QUERY_WITHIN_GENERALIZED_DISTANCE:
 	{
+		distParams.SetAndConstrainParams();
 		//find those that match
 		for(size_t i = 0; i < matching_entities.size(); i++)
 		{

@@ -399,75 +399,6 @@ namespace EntityQueryBuilder
 				}
 			}
 		}
-
-
-		//check if any of the positions are not valid
-		bool need_exist_query = false;
-		bool has_position_data = !DoesDistanceQueryUseEntitiesInsteadOfPosition(condition_type);
-
-		//check for any disabled features (e.g., zero'd weight)
-		if(has_position_data)
-		{
-			for(size_t i = 0; i < cur_condition->distParams.featureParams.size(); i++)
-			{
-				if(!cur_condition->distParams.IsFeatureEnabled(i))
-				{
-					need_exist_query = true;
-					break;
-				}
-			}
-		}
-		else //entities may have missing data, so need exist query
-		{
-			need_exist_query = true;
-		}
-
-		if(need_exist_query)
-		{
-			//add exists query and swap, so the exists_condition is before cur_condition
-			conditions.emplace_back();
-			EntityQueryCondition *exists_condition = &(conditions.back());
-
-			//need to reretrieve the pointer in case there has been a reallocation via emplace_back
-			// don't get the end one just placed, get the one before that
-			cur_condition = &conditions[conditions.size() - 2];
-
-			//swap data and pointers
-			std::swap(*exists_condition, *cur_condition);
-			std::swap(exists_condition, cur_condition);
-
-			exists_condition->queryType = ENT_QUERY_EXISTS;
-			//if has_position_data, then will add on those needed features below
-			// but if it doesn't, then need to include all labels
-			if(!has_position_data)
-				exists_condition->existLabels = cur_condition->positionLabels;
-
-			//remove any 0 weighted features; if has_position_data, then move them to the exist query
-			// don't increment i here because if a feature is moved to the exists_condition,
-			// then a new feature is moved into that new index and that feature position needs to be rechecked
-			for(size_t i = 0; i < cur_condition->positionLabels.size();)
-			{
-				if(cur_condition->distParams.featureParams[i].weight == 0.0)
-				{
-					//only move/remove data if the right type of query
-					if(has_position_data)
-					{
-						exists_condition->existLabels.push_back(cur_condition->positionLabels[i]);
-						cur_condition->valueToCompare.erase(cur_condition->valueToCompare.begin() + i);
-						cur_condition->valueTypes.erase(cur_condition->valueTypes.begin() + i);
-					}
-
-					cur_condition->positionLabels.erase(cur_condition->positionLabels.begin() + i);
-					cur_condition->distParams.featureParams.erase(begin(cur_condition->distParams.featureParams) + i);
-					continue;
-				}
-
-				i++;
-			}
-		}
-
-		//perform this last to make sure all changes are in
-		cur_condition->distParams.SetAndConstrainParams();
 	}
 
 	//builds a query condition from cn