22173: Improves performance by implementing thread local allocations #310
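A quick orientation to the pattern this PR introduces: each thread keeps a small buffer of free nodes (a thread-local allocation buffer, or TLAB) in front of EvaluableNodeManager's shared node pool, so most allocations and frees avoid the shared lock entirely. GetNextNodeFromTLab, AddNodeToTLab, ClearThreadLocalAllocationBuffer, and tlabSize are the entry points that appear throughout the diff below; representing the buffer as a bare thread_local vector here is an assumption for illustration, since the real buffer is a private member of the manager (see the "made TLab private" commit).

#include <vector>

class EvaluableNode;

//minimal sketch of the TLAB pattern, not the PR's implementation:
//allocation pops from the per-thread cache and only falls back to the
//shared, lock-protected pool when the cache is empty
static thread_local std::vector<EvaluableNode *> tlab;

inline EvaluableNode *GetNextNodeFromTLab()
{
    if(tlab.empty())
        return nullptr; //caller falls back to the shared pool
    EvaluableNode *node = tlab.back();
    tlab.pop_back();
    return node;
}

inline void AddNodeToTLab(EvaluableNode *node)
{
    tlab.push_back(node);
}

inline void ClearThreadLocalAllocationBuffer()
{
    tlab.clear();
}

Frees feed the same buffer (see FreeNodeTreeRecurse below), so a free-then-allocate cycle on one thread touches no locks at all.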

Merged: 37 commits, Nov 25, 2024
Changes from 34 commits

Commits
564cc0b
Initial commit
Nov 14, 2024
bba2079
commented out debug prints.
Nov 15, 2024
d8ea434
Initialize tlab nodes to state ENT_NULL
Nov 15, 2024
28294d0
Put deallocated nodes back into tlab.
Nov 18, 2024
9d33fed
Moving around some code.
Nov 19, 2024
70c55e0
22156: Cleanup, progress, identified issue
howsohazard Nov 19, 2024
371fbda
22156: Re-adds tlab clears in concurrency
howsohazard Nov 19, 2024
22d2a78
22156: Collects more to tlab from frees
howsohazard Nov 19, 2024
bb85f4c
Implemented AllocListNodeWithOrderedChildNodes
Nov 21, 2024
d06458c
Added asserts, marked nodes as unallocated.
Nov 22, 2024
72528d2
Initial commit
Nov 14, 2024
21d70be
commented out debug prints.
Nov 15, 2024
7d1643f
Initialize tlab nodes to state ENT_NULL
Nov 15, 2024
55f9043
Put deallocated nodes back into tlab.
Nov 18, 2024
5ff973f
Moving around some code.
Nov 19, 2024
c8fd7eb
22156: Cleanup, progress, identified issue
howsohazard Nov 19, 2024
5f24c65
22156: Re-adds tlab clears in concurrency
howsohazard Nov 19, 2024
86aa6a3
22156: Collects more to tlab from frees
howsohazard Nov 19, 2024
e54dd4d
Implemented AllocListNodeWithOrderedChildNodes
Nov 21, 2024
c722275
Added asserts, marked nodes as unallocated.
Nov 22, 2024
ee0e02a
22156: Fixes to compilation
howsohazard Nov 22, 2024
1702af5
Fixed infinite loop by updating num_total_nodes_needed.
Nov 22, 2024
29c71da
Merged in changes. Fixed loop. Removed debug prints.
Nov 22, 2024
f13c02b
Put loop fix back in.
Nov 22, 2024
5783e8b
22156: Fixes bug, memory leak remains
howsohazard Nov 22, 2024
af42e13
22156: Fixes issue, minor style changes
howsohazard Nov 23, 2024
62e8acd
Untracked settings.json
Nov 25, 2024
4738e99
Get rid of debug defines.
Nov 25, 2024
ac191de
Fixed bug when adding nodes to tlab for list.
Nov 25, 2024
a654b77
made TLab private.
Nov 25, 2024
0ecd08e
Removed unneeded include.
Nov 25, 2024
c3401e3
Removed unused variable.
Nov 25, 2024
2b3cae9
Remove debug print.
Nov 25, 2024
557cb03
Fixed bug in AllocListNodeWithOrderedChildNodes
Nov 25, 2024
966f1ed
Merge branch 'main' into 22173-thread-local-allocation
howsohazard Nov 25, 2024
f38924f
Style clean ups; comments.
Nov 25, 2024
948c011
22173: Fixes possible issue when setting an Entity root to null
howsohazard Nov 25, 2024
4 changes: 3 additions & 1 deletion src/Amalgam/Parser.cpp
@@ -199,6 +199,8 @@ EvaluableNode *Parser::GetCodeForPathToSharedNodeFromParentAToParentB(UnparseDat
if(shared_node == nullptr || a_parent == nullptr || b_parent == nullptr)
return nullptr;

//TODO 22156: allocate from enm in a way that does not invalidate tlab (but does not increase the size of EvaluableNodeManager)

//find all parent nodes of a to find collision with parent node of b, along with depth counts
EvaluableNode::ReferenceCountType a_parent_nodes;
size_t a_ancestor_depth = 1;
@@ -991,7 +993,7 @@ void Parser::Unparse(UnparseData &upd, EvaluableNode *tree, EvaluableNode *paren
std::swap(upd.parentNodes, references);
Unparse(upd, code_to_print, nullptr, expanded_whitespace, indentation_depth, need_initial_indent);
std::swap(upd.parentNodes, references); //put the old parentNodes back
enm.FreeNodeTree(code_to_print);
//don't free code_to_print, but let enm's destructor clean it up

return;
}
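The dropped FreeNodeTree call pairs with the TODO above: frees now push nodes into the thread-local buffer, so eagerly freeing code_to_print here would presumably mutate the TLAB in the middle of unparsing. Skipping the free is safe because the manager owns every node it allocates, arena-style, and reclaims them all in its destructor. A sketch of that ownership model (the scope-local manager below is hypothetical):

//sketch, assuming the arena-style ownership described above
void UnparseScratchExample()
{
    EvaluableNodeManager enm; //hypothetical local manager
    EvaluableNode *scratch = enm.AllocNode(ENT_LIST);
    //... build and unparse a temporary tree rooted at scratch ...
    //no enm.FreeNodeTree(scratch) needed: the manager tracks every node it
    //allocated in its nodes vector, so its destructor reclaims the whole tree
}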
14 changes: 13 additions & 1 deletion src/Amalgam/evaluablenode/EvaluableNode.h
@@ -19,6 +19,7 @@
#define AMALGAM_FAST_MEMORY_INTEGRITY
#endif


//forward declarations:
class EvaluableNodeManager;

@@ -145,7 +146,7 @@ class EvaluableNode
inline void InitializeType(EvaluableNodeType _type)
{
#ifdef AMALGAM_FAST_MEMORY_INTEGRITY
assert(IsEvaluableNodeTypeValid(_type));
assert(IsEvaluableNodeTypeValid(_type) || _type == ENT_DEALLOCATED);
#endif

type = _type;
@@ -171,6 +172,17 @@
attributes.individualAttribs.isIdempotent = true;
value.ConstructMappedChildNodes();
}
else if(_type == ENT_DEALLOCATED)
{
#ifdef AMALGAM_FAST_MEMORY_INTEGRITY
//use a value that makes it more apparent that something went wrong
value.numberValueContainer.numberValue = std::numeric_limits<double>::quiet_NaN();
#else
value.numberValueContainer.numberValue = 0;
#endif

value.numberValueContainer.labelStringID = StringInternPool::NOT_A_STRING_ID;
}
else
{
value.ConstructOrderedChildNodes();
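In integrity builds the new ENT_DEALLOCATED branch poisons the numeric payload with quiet_NaN, so reading a freed node contaminates downstream arithmetic visibly instead of blending in as a zero. A standalone illustration of why NaN is the better poison value (not from the PR):

#include <cassert>
#include <cmath>
#include <limits>

int main()
{
    double poisoned = std::numeric_limits<double>::quiet_NaN();

    //any arithmetic involving the poisoned value stays NaN, so a
    //use-after-free propagates loudly into results
    assert(std::isnan(poisoned * 2.0 + 1.0));

    //a 0.0 poison, by contrast, yields a plausible-looking number
    double silent = 0.0 * 2.0 + 1.0; //== 1.0, bug goes unnoticed
    return silent == 1.0 ? 0 : 1;
}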
196 changes: 121 additions & 75 deletions src/Amalgam/evaluablenode/EvaluableNodeManagement.cpp
@@ -6,6 +6,7 @@
#include <string>
#include <vector>
#include <utility>
#include <iostream>

#ifdef MULTITHREAD_SUPPORT
Concurrency::ReadWriteMutex EvaluableNodeManager::memoryModificationMutex;
@@ -59,63 +60,79 @@ EvaluableNode *EvaluableNodeManager::AllocNode(EvaluableNode *original, Evaluabl
return n;
}


void InitializeListHeadOrNode(EvaluableNode *node, EvaluableNode *parent, EvaluableNodeType child_node_type, size_t node_index, std::vector<EvaluableNode*> *ocn_buffer)
{
if(node_index == 0)
{
// parent
node->InitializeType(ENT_LIST);
std::vector<EvaluableNode *> *ocn_ptr = &node->GetOrderedChildNodesReference();
std::swap(*ocn_buffer, *ocn_ptr);
}
else
{
// child node; initialize it and add it to the list items
std::vector<EvaluableNode *> *ocn_ptr = &parent->GetOrderedChildNodesReference();
(*ocn_ptr)[node_index-1] = node;
node->InitializeType(child_node_type);
}
}

EvaluableNode *EvaluableNodeManager::AllocListNodeWithOrderedChildNodes(EvaluableNodeType child_node_type, size_t num_child_nodes)
{
if(num_child_nodes == 0)
return AllocNode(ENT_LIST);

EvaluableNode *parent = nullptr;
// Allocate from TLab

size_t num_to_alloc = 1 + num_child_nodes;
size_t num_allocated = 0;
size_t num_to_alloc = num_child_nodes + 1;
size_t num_total_nodes_needed = 0;

EvaluableNode *retval = nullptr;

//start off allocating the parent node, then switch to child_node_type
EvaluableNodeType cur_type = ENT_LIST;

//ordered child nodes destination; preallocate outside of the lock (for performance) and swap in
std::vector<EvaluableNode *> *ocn_ptr = nullptr;
std::vector<EvaluableNode *> ocn_buffer;
ocn_buffer.resize(num_child_nodes);

//outer loop needed for multithreading, but doesn't hurt anything for single threading
while(num_allocated < num_to_alloc)
{
EvaluableNode *newNode = nullptr;

while((newNode = GetNextNodeFromTLab()) && num_allocated < num_to_alloc)
{
#ifdef MULTITHREAD_SUPPORT
//attempt to allocate as many as possible using an atomic without write locking
Concurrency::ReadLock lock(managerAttributesMutex);
#endif
if(parent == nullptr)
{
parent = newNode;
}

for(; num_allocated < num_to_alloc; num_allocated++)
InitializeListHeadOrNode(newNode, parent, child_node_type, num_allocated, &ocn_buffer);
num_allocated++;
}

if(num_allocated >= num_to_alloc)
{
//we got enough nodes out of the tlab
return parent;
}

{ // Not enough nodes in TLab; add some.
#ifdef MULTITHREAD_SUPPORT
Concurrency::ReadLock lock(managerAttributesMutex);
#endif

int num_added_to_tlab = 0;
for(; num_allocated + num_added_to_tlab < num_to_alloc; num_added_to_tlab++)
{
//attempt to allocate a node and make sure it's valid
size_t allocated_index = firstUnusedNodeIndex++;
if(allocated_index < nodes.size())
{
if(nodes[allocated_index] != nullptr)
nodes[allocated_index]->InitializeType(cur_type);
else
nodes[allocated_index] = new EvaluableNode(cur_type);

//if first node, populate the parent node
if(num_allocated == 0)
if(nodes[allocated_index] == nullptr)
{
//prep parent node
retval = nodes[allocated_index];

//get the pointer to place child elements,
// but swap out the preallocated ordered child nodes
ocn_ptr = &retval->GetOrderedChildNodesReference();
std::swap(ocn_buffer, *ocn_ptr);

//advance type to child node type
cur_type = child_node_type;
}
else //set the appropriate child node
{
(*ocn_ptr)[num_allocated - 1] = nodes[allocated_index];
nodes[allocated_index] = new EvaluableNode(ENT_DEALLOCATED);
}

AddNodeToTLab(nodes[allocated_index]);
}
else
{
@@ -125,34 +142,40 @@ EvaluableNode *EvaluableNodeManager::AllocListNodeWithOrderedChildNodes(Evaluabl
}
}

//if have allocated enough, just return
if(num_allocated == num_to_alloc)
return retval;

num_total_nodes_needed = firstUnusedNodeIndex + (num_to_alloc - num_allocated);
if(num_added_to_tlab + num_allocated >= num_to_alloc)
{
// We were able to add enough nodes to tlab; use them next time through the loop
continue;
}
}

#ifdef MULTITHREAD_SUPPORT

//don't have enough nodes, so need to attempt a write lock to allocate more
Concurrency::WriteLock write_lock(managerAttributesMutex);
// There weren't enough free nodes available to fill the tlab; allocate more
num_total_nodes_needed = firstUnusedNodeIndex + (num_to_alloc - num_allocated);
{
#ifdef MULTITHREAD_SUPPORT
//don't have enough nodes, so need to attempt a write lock to allocate more
Concurrency::WriteLock write_lock(managerAttributesMutex);

//try again after write lock to allocate a node in case another thread has performed the allocation
//already have the write lock, so don't need to worry about another thread stealing firstUnusedNodeIndex
#endif
//try again after write lock to allocate a node in case another thread has performed the allocation
//already have the write lock, so don't need to worry about another thread stealing firstUnusedNodeIndex
#endif

//if don't currently have enough free nodes to meet the needs, then expand the allocation
if(nodes.size() <= num_total_nodes_needed)
{
size_t new_num_nodes = static_cast<size_t>(allocExpansionFactor * num_total_nodes_needed) + 1;

//fill new EvaluableNode slots with nullptr
nodes.resize(new_num_nodes, nullptr);
//if don't currently have enough free nodes to meet the needs, then expand the allocation
if(nodes.size() <= num_total_nodes_needed)
{
size_t new_num_nodes = static_cast<size_t>(allocExpansionFactor * num_total_nodes_needed) + 1;

//fill new EvaluableNode slots with nullptr
nodes.resize(new_num_nodes, nullptr);
}
}
}

//shouldn't make it here
return retval;
// unreachable
assert(false);
return nullptr;
}
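Summarized, the rewritten function is a three-level retry loop: drain the TLAB with no locking, refill it from the shared pool under a read lock, and only when the pool itself is exhausted take the write lock to grow nodes, then go around again. A condensed model of that control flow (the two helpers are hypothetical names for the locked sections inlined above):

//condensed sketch of the loop structure, not the verbatim implementation
while(num_allocated < num_to_alloc)
{
    //1) fast path: drain the thread-local buffer, no locking
    while(EvaluableNode *node = GetNextNodeFromTLab())
    {
        if(parent == nullptr)
            parent = node;
        InitializeListHeadOrNode(node, parent, child_node_type, num_allocated, &ocn_buffer);
        if(++num_allocated == num_to_alloc)
            return parent;
    }

    //2) refill: claim free pool slots under a read lock and push them into the TLAB
    if(RefillTLabUnderReadLock(num_to_alloc - num_allocated)) //hypothetical helper
        continue;

    //3) grow: pool exhausted, so take the write lock, resize nodes, and retry
    GrowNodePoolUnderWriteLock(num_to_alloc - num_allocated); //hypothetical helper
}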

void EvaluableNodeManager::UpdateGarbageCollectionTrigger(size_t previous_num_nodes)
@@ -184,8 +207,10 @@ void EvaluableNodeManager::CollectGarbage()
PerformanceProfiler::StartOperation(collect_garbage_string, GetNumberOfUsedNodes());
}

ClearThreadLocalAllocationBuffer();

#ifdef MULTITHREAD_SUPPORT

//free lock so can attempt to enter write lock to collect garbage
if(memory_modification_lock != nullptr)
memory_modification_lock->unlock();
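The new ClearThreadLocalAllocationBuffer() call at the top of CollectGarbage appears to preserve the buffer's one invariant: every TLAB entry points at a free slot in nodes. Collection rearranges the pool, so a cached pointer that survived it could alias a slot that is live again. The PR does not spell out this rationale, so the toy model below is an inference:

#include <vector>

struct ToyNode { bool free = true; };

static std::vector<ToyNode> pool(8);
static thread_local std::vector<ToyNode *> cache; //stand-in for the TLAB

void ToyCollectGarbage()
{
    //drop the per-thread cache first: the pass below may hand any slot to
    //live data, so a surviving cache entry could later be handed out a
    //second time while the same slot already holds a live node
    cache.clear();
    for(auto &node : pool)
        node.free = true; //toy stand-in for the real mark-and-reclaim pass
}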
@@ -250,24 +275,38 @@ void EvaluableNodeManager::FreeAllNodes()
}

EvaluableNode *EvaluableNodeManager::AllocUninitializedNode()
{
size_t allocated_index = 0;
{
EvaluableNode *tlab_node = GetNextNodeFromTLab();
//Fast Path; get node from thread local buffer
if(tlab_node != nullptr)
return tlab_node;

#ifdef MULTITHREAD_SUPPORT
{
//attempt to allocate using an atomic without write locking

//slow path allocation; attempt to allocate using an atomic without write locking
Concurrency::ReadLock lock(managerAttributesMutex);

//attempt to allocate a node and make sure it's valid
allocated_index = firstUnusedNodeIndex++;
if(allocated_index < nodes.size())
//attempt to allocate enough nodes to refill thread local buffer
size_t first_index_to_allocate = firstUnusedNodeIndex.fetch_add(tlabSize);
size_t last_index_to_allocate = first_index_to_allocate + tlabSize;

if(last_index_to_allocate < nodes.size())
{
if(nodes[allocated_index] == nullptr)
nodes[allocated_index] = new EvaluableNode();
for(size_t i = first_index_to_allocate; i < last_index_to_allocate; i++)
{
if(nodes[i] == nullptr)
nodes[i] = new EvaluableNode(ENT_DEALLOCATED);

return nodes[allocated_index];
AddNodeToTLab(nodes[i]);
}

return GetNextNodeFromTLab();
}
//the node wasn't valid; put it back and do a write lock to allocate more
--firstUnusedNodeIndex;

//couldn't allocate enough valid nodes; reset index and allocate more
firstUnusedNodeIndex -= tlabSize;
ClearThreadLocalAllocationBuffer();
}
//don't have enough nodes, so need to attempt a write lock to allocate more
Concurrency::WriteLock write_lock(managerAttributesMutex);
@@ -276,17 +315,18 @@ EvaluableNode *EvaluableNodeManager::AllocUninitializedNode()
//already have the write lock, so don't need to worry about another thread stealing firstUnusedNodeIndex
//use the cached value for firstUnusedNodeIndex, allocated_index, to check if another thread has performed the allocation
//as other threads may have reduced firstUnusedNodeIndex, incurring more unnecessary write locks when a memory expansion is needed
#else
allocated_index = firstUnusedNodeIndex;

#endif
//reduce accesses to the atomic variable for performance
size_t allocated_index = firstUnusedNodeIndex++;

size_t num_nodes = nodes.size();
if(allocated_index < num_nodes && firstUnusedNodeIndex < num_nodes)
if(allocated_index < num_nodes)
{
if(nodes[firstUnusedNodeIndex] == nullptr)
nodes[firstUnusedNodeIndex] = new EvaluableNode();
if(nodes[allocated_index] == nullptr)
nodes[allocated_index] = new EvaluableNode();

return nodes[firstUnusedNodeIndex++];
return nodes[allocated_index];
}

//ran out, so need another node; push a bunch on the heap so don't need to reallocate as often and slow down garbage collection
@@ -295,10 +335,10 @@ EvaluableNode *EvaluableNodeManager::AllocUninitializedNode()
//fill new EvaluableNode slots with nullptr
nodes.resize(new_num_nodes, nullptr);

if(nodes[firstUnusedNodeIndex] == nullptr)
nodes[firstUnusedNodeIndex] = new EvaluableNode();
if(nodes[allocated_index] == nullptr)
nodes[allocated_index] = new EvaluableNode();

return nodes[firstUnusedNodeIndex++];
return nodes[allocated_index];
}
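The rewritten AllocUninitializedNode turns every TLAB miss into a batch refill: a single fetch_add claims tlabSize consecutive slots, so the shared counter is touched once per batch instead of once per node. A compact model of the reservation step (toy globals; the write-locked slow path that grows the pool is shown above):

#include <atomic>
#include <cstddef>

constexpr std::size_t kTlabSize = 64; //assumed batch size, standing in for tlabSize
static std::atomic<std::size_t> firstUnused{0};

//claims [begin, end) if a full batch fits within pool_size; otherwise rolls
//the counter back so the caller can grow the pool under a write lock
bool TryReserveBatch(std::size_t pool_size, std::size_t &begin, std::size_t &end)
{
    begin = firstUnused.fetch_add(kTlabSize);
    end = begin + kTlabSize;
    if(end < pool_size)
        return true;

    //mirrors firstUnusedNodeIndex -= tlabSize above; the rollback can
    //interleave with other threads, which is why the write-locked path
    //re-reads the index before resizing
    firstUnused -= kTlabSize;
    return false;
}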

void EvaluableNodeManager::FreeAllNodesExceptReferencedNodes(size_t cur_first_unused_node_index)
@@ -435,6 +475,9 @@ void EvaluableNodeManager::FreeNodeTreeRecurse(EvaluableNode *tree)
}

tree->Invalidate();

tree->InitializeType(ENT_DEALLOCATED);
AddNodeToTLab(tree);
}
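FreeNodeTreeRecurse now closes the recycling loop: a freed node is poisoned as ENT_DEALLOCATED and pushed straight back into the current thread's buffer, so each free funds the next allocation on the same thread without touching the shared pool. The cycle-aware variant below adds the same AddNodeToTLab call at each of its three branches. The shared pattern, folded into a hypothetical helper (the PR inlines these lines at each site):

inline void RecycleNodeToTLab(EvaluableNode *node)
{
    node->Invalidate();                    //release the node's contents, as the call sites above do
    node->InitializeType(ENT_DEALLOCATED); //poison so later misuse is detectable
    AddNodeToTLab(node);                   //next same-thread allocation reuses it
}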

void EvaluableNodeManager::FreeNodeTreeWithCyclesRecurse(EvaluableNode *tree)
@@ -451,6 +494,7 @@ void EvaluableNodeManager::FreeNodeTreeWithCyclesRecurse(EvaluableNode *tree)
auto &tree_mcn = tree->GetMappedChildNodesReference();
std::swap(mcn, tree_mcn);
tree->Invalidate();
AddNodeToTLab(tree);

for(auto &[_, e] : mcn)
{
@@ -464,6 +508,7 @@ void EvaluableNodeManager::FreeNodeTreeWithCyclesRecurse(EvaluableNode *tree)
else if(tree->IsImmediate())
{
tree->Invalidate();
AddNodeToTLab(tree);
}
else //ordered
{
@@ -473,6 +518,7 @@ void EvaluableNodeManager::FreeNodeTreeWithCyclesRecurse(EvaluableNode *tree)
auto &tree_ocn = tree->GetOrderedChildNodesReference();
std::swap(ocn, tree_ocn);
tree->Invalidate();
AddNodeToTLab(tree);

for(auto &e : ocn)
{