22173: Improves performance by implementing thread local allocations (#310)

This is an improvement to the performance of memory management in Amalgam.
It changes allocation of EvaluableNodes so that we first try to allocate
from a thread local buffer; only if that buffer is empty do we try to
allocate from the EvaluableNodeManager pool. When eagerly freeing nodes,
we also put them back in the thread local buffer.
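
As a rough sketch of the pattern (illustrative only: Node, PoolAllocator,
kTlabSize, and poolMutex are invented names rather than the actual
EvaluableNodeManager API, and the real code below refills the buffer from the
manager's preallocated nodes pool via an atomic index instead of plain new):

#include <cstddef>
#include <mutex>
#include <vector>

struct Node { /* payload elided */ };

class PoolAllocator
{
public:
	Node *Alloc()
	{
		//fast path: pop from this thread's local buffer, no locking
		if(!tlab.empty())
		{
			Node *n = tlab.back();
			tlab.pop_back();
			return n;
		}

		//slow path: refill in one batch so the shared pool is only
		// touched about once per kTlabSize allocations
		{
			std::lock_guard<std::mutex> guard(poolMutex);
			for(std::size_t i = 0; i < kTlabSize; i++)
				tlab.push_back(new Node());
		}

		Node *n = tlab.back();
		tlab.pop_back();
		return n;
	}

	void Free(Node *n)
	{
		//eagerly freed nodes go straight back to this thread's buffer
		tlab.push_back(n);
	}

private:
	static constexpr std::size_t kTlabSize = 20;
	static thread_local std::vector<Node *> tlab;
	std::mutex poolMutex;
};

thread_local std::vector<Node *> PoolAllocator::tlab;

Because freed nodes return to the freeing thread's own buffer, the common
allocate/free cycle never takes the shared lock. The buffer is also cleared at
the start of garbage collection (ClearThreadLocalAllocationBuffer in the diff
below), presumably so nodes parked in a buffer cannot go stale while the
collector rearranges the underlying pool.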

---------

Co-authored-by: Nathan Ricci <[email protected]>
Co-authored-by: Christopher J. Hazard, PhD <[email protected]>
3 people authored Nov 25, 2024
1 parent 72dddc6 commit edd4acb
Showing 6 changed files with 209 additions and 107 deletions.
2 changes: 1 addition & 1 deletion src/Amalgam/Parser.cpp
@@ -991,7 +991,7 @@ void Parser::Unparse(UnparseData &upd, EvaluableNode *tree, EvaluableNode *paren
std::swap(upd.parentNodes, references);
Unparse(upd, code_to_print, nullptr, expanded_whitespace, indentation_depth, need_initial_indent);
std::swap(upd.parentNodes, references); //put the old parentNodes back
enm.FreeNodeTree(code_to_print);
//don't free code_to_print, but let enm's destructor clean it up

return;
}
7 changes: 2 additions & 5 deletions src/Amalgam/entity/Entity.cpp
@@ -859,11 +859,8 @@ void Entity::SetRoot(EvaluableNode *_code, bool allocated_with_entity_enm, Evalu
EvaluableNode *cur_root = GetRoot();
bool entity_previously_empty = (cur_root == nullptr || cur_root->GetNumChildNodes() == 0);

if(_code == nullptr)
{
evaluableNodeManager.SetRootNode(evaluableNodeManager.AllocNode(ENT_NULL));
}
else if(allocated_with_entity_enm && metadata_modifier == EvaluableNodeManager::ENMM_NO_CHANGE)
if(_code == nullptr
|| (allocated_with_entity_enm && metadata_modifier == EvaluableNodeManager::ENMM_NO_CHANGE))
{
evaluableNodeManager.SetRootNode(_code);
}
14 changes: 13 additions & 1 deletion src/Amalgam/evaluablenode/EvaluableNode.h
@@ -19,6 +19,7 @@
#define AMALGAM_FAST_MEMORY_INTEGRITY
#endif


//forward declarations:
class EvaluableNodeManager;

@@ -145,7 +146,7 @@ class EvaluableNode
inline void InitializeType(EvaluableNodeType _type)
{
#ifdef AMALGAM_FAST_MEMORY_INTEGRITY
assert(IsEvaluableNodeTypeValid(_type));
assert(IsEvaluableNodeTypeValid(_type) || _type == ENT_DEALLOCATED);
#endif

type = _type;
@@ -171,6 +172,17 @@
attributes.individualAttribs.isIdempotent = true;
value.ConstructMappedChildNodes();
}
else if(_type == ENT_DEALLOCATED)
{
#ifdef AMALGAM_FAST_MEMORY_INTEGRITY
//use a value that makes it more apparent that something went wrong
value.numberValueContainer.numberValue = std::numeric_limits<double>::quiet_NaN();
#else
value.numberValueContainer.numberValue = 0;
#endif

value.numberValueContainer.labelStringID = StringInternPool::NOT_A_STRING_ID;
}
else
{
value.ConstructOrderedChildNodes();
192 changes: 116 additions & 76 deletions src/Amalgam/evaluablenode/EvaluableNodeManagement.cpp
@@ -59,63 +59,76 @@ EvaluableNode *EvaluableNodeManager::AllocNode(EvaluableNode *original, Evaluabl
return n;
}


void InitializeListHeadOrNode(EvaluableNode *node, EvaluableNode *parent, EvaluableNodeType child_node_type, size_t node_index, std::vector<EvaluableNode*> *ocn_buffer)
{
if(node_index == 0)
{
// parent
node->InitializeType(ENT_LIST);
std::vector<EvaluableNode *> *ocn_ptr = &node->GetOrderedChildNodesReference();
std::swap(*ocn_buffer, *ocn_ptr);
}
else
{
// child node; initialize it and add it to the list items
auto &ocn = parent->GetOrderedChildNodesReference();
ocn[node_index-1] = node;
node->InitializeType(child_node_type);
}
}

EvaluableNode *EvaluableNodeManager::AllocListNodeWithOrderedChildNodes(EvaluableNodeType child_node_type, size_t num_child_nodes)
{
if(num_child_nodes == 0)
return AllocNode(ENT_LIST);

EvaluableNode *parent = nullptr;
// Allocate from TLab

size_t num_to_alloc = 1 + num_child_nodes;
size_t num_allocated = 0;
size_t num_to_alloc = num_child_nodes + 1;
size_t num_total_nodes_needed = 0;

EvaluableNode *retval = nullptr;

//start off allocating the parent node, then switch to child_node_type
EvaluableNodeType cur_type = ENT_LIST;

//ordered child nodes destination; preallocate outside of the lock (for performance) and swap in
std::vector<EvaluableNode *> *ocn_ptr = nullptr;
std::vector<EvaluableNode *> ocn_buffer;
ocn_buffer.resize(num_child_nodes);

//outer loop needed for multithreading, but doesn't hurt anything for single threading
while(num_allocated < num_to_alloc)
{
EvaluableNode *newNode = nullptr;

while((newNode = GetNextNodeFromTLab()) && num_allocated < num_to_alloc)
{
#ifdef MULTITHREAD_SUPPORT
//attempt to allocate as many as possible using an atomic without write locking
Concurrency::ReadLock lock(managerAttributesMutex);
#endif
if(parent == nullptr)
parent = newNode;

InitializeListHeadOrNode(newNode, parent, child_node_type, num_allocated, &ocn_buffer);
num_allocated++;
}

if(num_allocated >= num_to_alloc)
{
//we got enough nodes out of the tlab
return parent;
}

{ // Not enough nodes in TLab; add some.
#ifdef MULTITHREAD_SUPPORT
Concurrency::ReadLock lock(managerAttributesMutex);
#endif

for(; num_allocated < num_to_alloc; num_allocated++)
int num_added_to_tlab = 0;
for(; num_allocated + num_added_to_tlab < num_to_alloc; num_added_to_tlab++)
{
//attempt to allocate a node and make sure it's valid
size_t allocated_index = firstUnusedNodeIndex++;
if(allocated_index < nodes.size())
{
if(nodes[allocated_index] != nullptr)
nodes[allocated_index]->InitializeType(cur_type);
else
nodes[allocated_index] = new EvaluableNode(cur_type);

//if first node, populate the parent node
if(num_allocated == 0)
{
//prep parent node
retval = nodes[allocated_index];

//get the pointer to place child elements,
// but swap out the preallocated ordered child nodes
ocn_ptr = &retval->GetOrderedChildNodesReference();
std::swap(ocn_buffer, *ocn_ptr);
if(nodes[allocated_index] == nullptr)
nodes[allocated_index] = new EvaluableNode(ENT_DEALLOCATED);


//advance type to child node type
cur_type = child_node_type;
}
else //set the appropriate child node
{
(*ocn_ptr)[num_allocated - 1] = nodes[allocated_index];
}
AddNodeToTLab(nodes[allocated_index]);
}
else
{
@@ -125,34 +138,38 @@ EvaluableNode *EvaluableNodeManager::AllocListNodeWithOrderedChildNodes(Evaluabl
}
}

//if have allocated enough, just return
if(num_allocated == num_to_alloc)
return retval;

num_total_nodes_needed = firstUnusedNodeIndex + (num_to_alloc - num_allocated);
// If we added enough nodes to the tlab, use them in the next loop iteration
if( num_added_to_tlab + num_allocated >= num_to_alloc)
continue;
}

#ifdef MULTITHREAD_SUPPORT

//don't have enough nodes, so need to attempt a write lock to allocate more
Concurrency::WriteLock write_lock(managerAttributesMutex);
// There weren't enough free nodes available to fill the tlab; allocate more
num_total_nodes_needed = firstUnusedNodeIndex + (num_to_alloc - num_allocated);
{
#ifdef MULTITHREAD_SUPPORT
//don't have enough nodes, so need to attempt a write lock to allocate more
Concurrency::WriteLock write_lock(managerAttributesMutex);

//try again after write lock to allocate a node in case another thread has performed the allocation
//already have the write lock, so don't need to worry about another thread stealing firstUnusedNodeIndex
#endif
//try again after write lock to allocate a node in case another thread has performed the allocation
//already have the write lock, so don't need to worry about another thread stealing firstUnusedNodeIndex
#endif

//if don't currently have enough free nodes to meet the needs, then expand the allocation
if(nodes.size() <= num_total_nodes_needed)
{
size_t new_num_nodes = static_cast<size_t>(allocExpansionFactor * num_total_nodes_needed) + 1;

//fill new EvaluableNode slots with nullptr
nodes.resize(new_num_nodes, nullptr);
//if don't currently have enough free nodes to meet the needs, then expand the allocation
if(nodes.size() <= num_total_nodes_needed)
{
size_t new_num_nodes = static_cast<size_t>(allocExpansionFactor * num_total_nodes_needed) + 1;

//fill new EvaluableNode slots with nullptr
nodes.resize(new_num_nodes, nullptr);
}
}
}

//shouldn't make it here
return retval;
// unreachable
assert(false);
return nullptr;
}

void EvaluableNodeManager::UpdateGarbageCollectionTrigger(size_t previous_num_nodes)
@@ -184,8 +201,10 @@ void EvaluableNodeManager::CollectGarbage()
PerformanceProfiler::StartOperation(collect_garbage_string, GetNumberOfUsedNodes());
}

ClearThreadLocalAllocationBuffer();

#ifdef MULTITHREAD_SUPPORT

//free lock so can attempt to enter write lock to collect garbage
if(memory_modification_lock != nullptr)
memory_modification_lock->unlock();
@@ -250,24 +269,38 @@ void EvaluableNodeManager::FreeAllNodes()
}

EvaluableNode *EvaluableNodeManager::AllocUninitializedNode()
{
size_t allocated_index = 0;
{
EvaluableNode *tlab_node = GetNextNodeFromTLab();
//Fast Path; get node from thread local buffer
if(tlab_node != nullptr)
return tlab_node;

#ifdef MULTITHREAD_SUPPORT
{
//attempt to allocate using an atomic without write locking

//slow path allocation; attempt to allocate using an atomic without write locking
Concurrency::ReadLock lock(managerAttributesMutex);

//attempt to allocate a node and make sure it's valid
allocated_index = firstUnusedNodeIndex++;
if(allocated_index < nodes.size())
//attempt to allocate enough nodes to refill thread local buffer
size_t first_index_to_allocate = firstUnusedNodeIndex.fetch_add(tlabSize);
size_t last_index_to_allocate = first_index_to_allocate + tlabSize;

if(last_index_to_allocate < nodes.size())
{
if(nodes[allocated_index] == nullptr)
nodes[allocated_index] = new EvaluableNode();
for(size_t i = first_index_to_allocate; i < last_index_to_allocate; i++)
{
if(nodes[i] == nullptr)
nodes[i] = new EvaluableNode(ENT_DEALLOCATED);

return nodes[allocated_index];
AddNodeToTLab(nodes[i]);
}

return GetNextNodeFromTLab();
}
//the node wasn't valid; put it back and do a write lock to allocate more
--firstUnusedNodeIndex;

//couldn't allocate enough valid nodes; reset index and allocate more
firstUnusedNodeIndex -= tlabSize;
ClearThreadLocalAllocationBuffer();
}
//don't have enough nodes, so need to attempt a write lock to allocate more
Concurrency::WriteLock write_lock(managerAttributesMutex);
@@ -276,17 +309,18 @@ EvaluableNode *EvaluableNodeManager::AllocUninitializedNode()
//already have the write lock, so don't need to worry about another thread stealing firstUnusedNodeIndex
//use the cached value for firstUnusedNodeIndex, allocated_index, to check if another thread has performed the allocation
//as other threads may have reduced firstUnusedNodeIndex, incurring more unnecessary write locks when a memory expansion is needed
#else
allocated_index = firstUnusedNodeIndex;

#endif
//reduce accesses to the atomic variable for performance
size_t allocated_index = firstUnusedNodeIndex++;

size_t num_nodes = nodes.size();
if(allocated_index < num_nodes && firstUnusedNodeIndex < num_nodes)
if(allocated_index < num_nodes)
{
if(nodes[firstUnusedNodeIndex] == nullptr)
nodes[firstUnusedNodeIndex] = new EvaluableNode();
if(nodes[allocated_index] == nullptr)
nodes[allocated_index] = new EvaluableNode();

return nodes[firstUnusedNodeIndex++];
return nodes[allocated_index];
}

//ran out, so need another node; push a bunch on the heap so don't need to reallocate as often and slow down garbage collection
@@ -295,10 +329,10 @@ EvaluableNode *EvaluableNodeManager::AllocUninitializedNode()
//fill new EvaluableNode slots with nullptr
nodes.resize(new_num_nodes, nullptr);

if(nodes[firstUnusedNodeIndex] == nullptr)
nodes[firstUnusedNodeIndex] = new EvaluableNode();
if(nodes[allocated_index] == nullptr)
nodes[allocated_index] = new EvaluableNode();

return nodes[firstUnusedNodeIndex++];
return nodes[allocated_index];
}

void EvaluableNodeManager::FreeAllNodesExceptReferencedNodes(size_t cur_first_unused_node_index)
@@ -435,6 +469,9 @@ void EvaluableNodeManager::FreeNodeTreeRecurse(EvaluableNode *tree)
}

tree->Invalidate();

tree->InitializeType(ENT_DEALLOCATED);
AddNodeToTLab(tree);
}

void EvaluableNodeManager::FreeNodeTreeWithCyclesRecurse(EvaluableNode *tree)
Expand All @@ -451,6 +488,7 @@ void EvaluableNodeManager::FreeNodeTreeWithCyclesRecurse(EvaluableNode *tree)
auto &tree_mcn = tree->GetMappedChildNodesReference();
std::swap(mcn, tree_mcn);
tree->Invalidate();
AddNodeToTLab(tree);

for(auto &[_, e] : mcn)
{
@@ -464,6 +502,7 @@ void EvaluableNodeManager::FreeNodeTreeWithCyclesRecurse(EvaluableNode *tree)
else if(tree->IsImmediate())
{
tree->Invalidate();
AddNodeToTLab(tree);
}
else //ordered
{
@@ -473,6 +512,7 @@ void EvaluableNodeManager::FreeNodeTreeWithCyclesRecurse(EvaluableNode *tree)
auto &tree_ocn = tree->GetOrderedChildNodesReference();
std::swap(ocn, tree_ocn);
tree->Invalidate();
AddNodeToTLab(tree);

for(auto &e : ocn)
{