Skip to content

Commit

Permalink
Precaching, randomness, speed optimization. [documentation pending].
Browse files Browse the repository at this point in the history
  • Loading branch information
nyoungbq committed Jun 14, 2024
1 parent 696a210 commit 0e2cc2a
Show file tree
Hide file tree
Showing 6 changed files with 262 additions and 31 deletions.
121 changes: 93 additions & 28 deletions src/Plugins/SimplnxCore/src/SimplnxCore/Filters/Algorithms/DBSCAN.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,22 +84,23 @@ class FindEpsilonNeighborhoodsImpl
std::vector<std::list<usize>>& m_Neighborhoods;
};

template <typename T, bool PrecacheT>
template <typename T, bool PrecacheV = true, bool RandomInitV = true>
class DBSCANTemplate
{
private:
using AbstractDataStoreT = AbstractDataStore<T>;

public:
DBSCANTemplate(DBSCAN* filter, const AbstractDataStoreT& inputDataStore, const std::unique_ptr<MaskCompare>& maskDataArray, AbstractDataStore<int32>& fIdsDataStore, float32 epsilon, int32 minPoints,
ClusterUtilities::DistanceMetric distMetric)
ClusterUtilities::DistanceMetric distMetric, std::mt19937_64::result_type seed)
: m_Filter(filter)
, m_InputDataStore(inputDataStore)
, m_Mask(maskDataArray)
, m_FeatureIds(fIdsDataStore)
, m_Epsilon(epsilon)
, m_MinPoints(minPoints)
, m_DistMetric(distMetric)
, m_Seed(seed)
{
}
~DBSCANTemplate() = default;
Expand All @@ -120,7 +121,7 @@ class DBSCANTemplate

std::vector<std::list<usize>> epsilonNeighborhoods;

if constexpr(PrecacheT)
if constexpr(PrecacheV)
{
// In-memory only with current implementation for speed with std::list
epsilonNeighborhoods = std::vector<std::list<usize>>(numTuples);
Expand All @@ -133,40 +134,85 @@ class DBSCANTemplate
m_Filter->updateProgress("Neighborhoods found.");
}

std::mt19937_64 gen(m_Seed);
std::uniform_int_distribution<usize> dist(0, numTuples - 1);

m_Filter->updateProgress("Beginning clustering...");
auto start = std::chrono::steady_clock::now();
for(usize i = 0; i < numTuples; i++)
usize i = 0;
uint8 misses = 0;
while(std::find(visited.begin(), visited.end(), false) != visited.end())
{
if(m_Filter->getCancel())
{
return;
}

if(m_Mask->isTrue(i) && !visited[i])
usize index;
if constexpr(!RandomInitV)
{
index = i;
if(i >= numTuples)
{
break;
}
i++;
}
if constexpr(RandomInitV)
{
index = dist(gen);
}

if(visited[index])
{
if(misses >= 10)
{
auto findIter = std::find(visited.begin(), visited.end(), false);
if(findIter == visited.end())
{
break;
}
index = std::distance(visited.begin(), findIter);

if constexpr(RandomInitV)
{
dist = std::uniform_int_distribution<usize>(index, numTuples - 1);
}
}
else
{
misses++;
continue;
}
}

misses = 0;

if(m_Mask->isTrue(index))
{
visited[i] = true;
visited[index] = true;
auto now = std::chrono::steady_clock::now();
//// Only send updates every 1 second
// Only send updates every 1 second
if(std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count() > 1000)
{
float32 progress = (static_cast<float32>(i) / static_cast<float32>(numTuples)) * 100.0f;
m_Filter->updateProgress(fmt::format("Scanning Data || Visited Point {} of {} || {:.2f}% Completed", i, numTuples, progress));
float32 progress = (static_cast<float32>(index) / static_cast<float32>(numTuples)) * 100.0f;
m_Filter->updateProgress(fmt::format("Scanning Data || Visited Point {} of {} || {:.2f}% Completed", index, numTuples, progress));
start = std::chrono::steady_clock::now();
}

std::list<usize> neighbors;
if constexpr(PrecacheT)
if constexpr(PrecacheV)
{
neighbors = epsilonNeighborhoods[i];
neighbors = epsilonNeighborhoods[index];
}
if constexpr(!PrecacheT)
if constexpr(!PrecacheV)
{
for(usize j = 0; j < numTuples; j++)
{
if(m_Mask->isTrue(j))
{
float64 dist = ClusterUtilities::GetDistance(m_InputDataStore, (numCompDims * i), m_InputDataStore, (numCompDims * j), numCompDims, m_DistMetric);
if(dist < m_Epsilon)
float64 distance = ClusterUtilities::GetDistance(m_InputDataStore, (numCompDims * index), m_InputDataStore, (numCompDims * j), numCompDims, m_DistMetric);
if(distance < m_Epsilon)
{
neighbors.push_back(j);
}
Expand All @@ -176,8 +222,8 @@ class DBSCANTemplate

if(static_cast<int32>(neighbors.size()) < m_MinPoints)
{
m_FeatureIds[i] = 0;
clustered[i] = true;
m_FeatureIds[index] = 0;
clustered[index] = true;
}
else
{
Expand All @@ -186,8 +232,8 @@ class DBSCANTemplate
return;
}
cluster++;
m_FeatureIds[i] = cluster;
clustered[i] = true;
m_FeatureIds[index] = cluster;
clustered[index] = true;

for(auto&& idx : neighbors)
{
Expand All @@ -198,18 +244,18 @@ class DBSCANTemplate
visited[idx] = true;

std::list<usize> neighbors_prime;
if constexpr(PrecacheT)
if constexpr(PrecacheV)
{
neighbors_prime = epsilonNeighborhoods[idx];
}
if constexpr(!PrecacheT)
if constexpr(!PrecacheV)
{
for(usize j = 0; j < numTuples; j++)
{
if(m_Mask->isTrue(j))
{
float64 dist = ClusterUtilities::GetDistance(m_InputDataStore, (numCompDims * idx), m_InputDataStore, (numCompDims * j), numCompDims, m_DistMetric);
if(dist < m_Epsilon)
float64 distance = ClusterUtilities::GetDistance(m_InputDataStore, (numCompDims * idx), m_InputDataStore, (numCompDims * j), numCompDims, m_DistMetric);
if(distance < m_Epsilon)
{
neighbors_prime.push_back(j);
}
Expand All @@ -231,6 +277,10 @@ class DBSCANTemplate
}
}
}
else
{
visited[index] = true;
}
}
m_Filter->updateProgress("Clustering Complete!");
}
Expand All @@ -243,21 +293,36 @@ class DBSCANTemplate
float32 m_Epsilon;
int32 m_MinPoints;
ClusterUtilities::DistanceMetric m_DistMetric;
std::mt19937_64::result_type m_Seed;
};

/**
 * @brief Dispatch functor that maps the runtime `cache` / `useRandom` flags onto the
 * compile-time boolean template arguments of DBSCANTemplate and runs the algorithm once.
 *
 * Each branch constructs a temporary DBSCANTemplate<T, ...> and immediately invokes it
 * through its call operator; nothing is returned.
 *
 * NOTE(review): this listing appears to be a rendered diff with removed and added lines
 * interleaved (two operator() signatures, and DBSCANTemplate<T, true> / <T, false> calls
 * without a seed next to the seeded ones). Confirm against the actual file before editing.
 */
struct DBSCANFunctor
{
// T is the element type that inputIDataArray is cast to (DataArray<T>).
template <typename T>
void operator()(bool cache, DBSCAN* filter, const IDataArray& inputIDataArray, const std::unique_ptr<MaskCompare>& maskCompare, Int32Array& fIds, float32 epsilon, int32 minPoints,
ClusterUtilities::DistanceMetric distMetric)
// cache     -> selects the second template argument (PrecacheV) of DBSCANTemplate
// useRandom -> selects the third template argument (RandomInitV) of DBSCANTemplate
// seed      -> forwarded unchanged to the DBSCANTemplate constructor
void operator()(bool cache, bool useRandom, DBSCAN* filter, const IDataArray& inputIDataArray, const std::unique_ptr<MaskCompare>& maskCompare, Int32Array& fIds, float32 epsilon, int32 minPoints,
ClusterUtilities::DistanceMetric distMetric, std::mt19937_64::result_type seed)
{
// Template arguments must be compile-time constants, so every flag combination is spelled out.
if(cache)
{
DBSCANTemplate<T, true>(filter, dynamic_cast<const DataArray<T>&>(inputIDataArray).getDataStoreRef(), maskCompare, fIds.getDataStoreRef(), epsilon, minPoints, distMetric)();
if(useRandom)
{
// dynamic_cast to a reference type: a mismatching array type throws std::bad_cast instead of yielding null.
DBSCANTemplate<T, true, true>(filter, dynamic_cast<const DataArray<T>&>(inputIDataArray).getDataStoreRef(), maskCompare, fIds.getDataStoreRef(), epsilon, minPoints, distMetric, seed)();
}
else
{
DBSCANTemplate<T, true, false>(filter, dynamic_cast<const DataArray<T>&>(inputIDataArray).getDataStoreRef(), maskCompare, fIds.getDataStoreRef(), epsilon, minPoints, distMetric, seed)();
}
}
else
{
DBSCANTemplate<T, false>(filter, dynamic_cast<const DataArray<T>&>(inputIDataArray).getDataStoreRef(), maskCompare, fIds.getDataStoreRef(), epsilon, minPoints, distMetric)();
if(useRandom)
{
DBSCANTemplate<T, false, true>(filter, dynamic_cast<const DataArray<T>&>(inputIDataArray).getDataStoreRef(), maskCompare, fIds.getDataStoreRef(), epsilon, minPoints, distMetric, seed)();
}
else
{
// The seed is still passed here even though RandomInitV is false.
DBSCANTemplate<T, false, false>(filter, dynamic_cast<const DataArray<T>&>(inputIDataArray).getDataStoreRef(), maskCompare, fIds.getDataStoreRef(), epsilon, minPoints, distMetric, seed)();
}
}
}
};
Expand Down Expand Up @@ -305,8 +370,8 @@ Result<> DBSCAN::operator()()
return MakeErrorResult(-54060, message);
}

ExecuteNeighborFunction(DBSCANFunctor{}, clusteringArray.getDataType(), m_InputValues->AllowCaching, this, clusteringArray, maskCompare, featureIds, m_InputValues->Epsilon, m_InputValues->MinPoints,
m_InputValues->DistanceMetric);
ExecuteNeighborFunction(DBSCANFunctor{}, clusteringArray.getDataType(), m_InputValues->AllowCaching, m_InputValues->UseRandom, this, clusteringArray, maskCompare, featureIds, m_InputValues->Epsilon,
m_InputValues->MinPoints, m_InputValues->DistanceMetric, m_InputValues->Seed);

updateProgress("Resizing Clustering Attribute Matrix...");
auto& featureIdsDataStore = featureIds.getDataStoreRef();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include "simplnx/Parameters/NumberParameter.hpp"
#include "simplnx/Utilities/ClusteringUtilities.hpp"

#include <random>

namespace nx::core
{
struct SIMPLNXCORE_EXPORT DBSCANInputValues
Expand All @@ -23,6 +25,8 @@ struct SIMPLNXCORE_EXPORT DBSCANInputValues
ClusterUtilities::DistanceMetric DistanceMetric;
DataPath FeatureAM;
bool AllowCaching;
bool UseRandom;
std::mt19937_64::result_type Seed;
};

/**
Expand Down
42 changes: 39 additions & 3 deletions src/Plugins/SimplnxCore/src/SimplnxCore/Filters/DBSCANFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,14 @@ using namespace nx::core;
namespace
{
const std::string k_MaskName = "temp_mask";
}

/**
 * @brief Start-state modes selectable through the "Initialization Type" choices parameter.
 *
 * The enumerator order must stay in step with the ChoicesParameter::Choices list
 * {"Iterative", "Random", "Seeded Random"}: the parameter default is given as
 * to_underlying(::AlgType::SeededRandom), i.e. the enumerator value is used as the choice index.
 *
 * NOTE(review): executeImpl computes a clock-based `seed` for the non-seeded modes but fills
 * inputValues.Seed from k_SeedValue_Key; confirm which value is meant to reach the algorithm.
 */
enum AlgType
{
// UseRandom is set to false for this value and no seed array is created.
Iterative,
// Random start; executeImpl replaces the seed with a steady_clock-derived value.
Random,
// Random start using the value read from k_SeedValue_Key.
SeededRandom
};
} // namespace

namespace nx::core
{
Expand Down Expand Up @@ -66,10 +73,18 @@ Parameters DBSCANFilter::parameters() const
Parameters params;

// Create the parameter descriptors that are needed for this filter
params.insertSeparator(Parameters::Separator{"Random Number Seed Parameters"});
params.insertLinkableParameter(std::make_unique<ChoicesParameter>(k_SeedChoice_Key, "Initialization Type", "Whether to use random or iterative for start state. See Documentation for further detail",
to_underlying(::AlgType::SeededRandom),
ChoicesParameter::Choices{"Iterative", "Random", "Seeded Random"})); // sequence dependent DO NOT REORDER
params.insert(std::make_unique<NumberParameter<uint64>>(k_SeedValue_Key, "Seed Value", "The seed fed into the random generator", std::mt19937::default_seed));
params.insert(std::make_unique<DataObjectNameParameter>(k_SeedArrayName_Key, "Stored Seed Value Array Name", "Name of array holding the seed value", "DBSCAN SeedValue"));

params.insertSeparator(Parameters::Separator{"Input Parameter(s)"});
params.insert(std::make_unique<BoolParameter>(k_UsePrecaching_Key, "Use Precaching", "If true the algorithm will be significantly faster, but it requires more memory", true));
params.insert(std::make_unique<Float32Parameter>(k_Epsilon_Key, "Epsilon", "This will be the tuple size for Cluster Attribute Matrix and the values within", 0.0001));
params.insert(std::make_unique<Int32Parameter>(k_MinPoints_Key, "Minimum Points", "This will be the tuple size for Cluster Attribute Matrix and the values within", 0.0001));
params.insert(std::make_unique<Float32Parameter>(k_Epsilon_Key, "Epsilon", "The epsilon-neighborhood around each point is queried", 0.0001));
params.insert(std::make_unique<Int32Parameter>(k_MinPoints_Key, "Minimum Points",
"The minimum number of points needed to form a 'dense region' (i.e., the minimum number of points needed to be called a cluster)", 2));
params.insert(
std::make_unique<ChoicesParameter>(k_DistanceMetric_Key, "Distance Metric", "Distance Metric type to be used for calculations", to_underlying(ClusterUtilities::DistanceMetric::Euclidean),
ChoicesParameter::Choices{"Euclidean", "Squared Euclidean", "Manhattan", "Cosine", "Pearson", "Squared Pearson"})); // sequence dependent DO NOT REORDER
Expand Down Expand Up @@ -143,6 +158,13 @@ IFilter::PreflightResult DBSCANFilter::preflightImpl(const DataStructure& dataSt
resultOutputActions.value().appendAction(std::move(createAction));
}

// For caching seed run to run
if(filterArgs.value<::AlgType>(k_SeedChoice_Key) != AlgType::Iterative)
{
auto createAction = std::make_unique<CreateArrayAction>(DataType::uint64, std::vector<usize>{1}, std::vector<usize>{1}, DataPath({filterArgs.value<std::string>(k_SeedArrayName_Key)}));
resultOutputActions.value().appendAction(std::move(createAction));
}

// Return both the resultOutputActions and the preflightUpdatedValues via std::move()
return {std::move(resultOutputActions), std::move(preflightUpdatedValues)};
}
Expand All @@ -158,6 +180,18 @@ Result<> DBSCANFilter::executeImpl(DataStructure& dataStructure, const Arguments
dataStructure.getDataRefAs<BoolArray>(maskPath).fill(true);
}

auto seed = filterArgs.value<std::mt19937_64::result_type>(k_SeedValue_Key);
if(filterArgs.value<::AlgType>(k_SeedChoice_Key) != AlgType::SeededRandom)
{
seed = static_cast<std::mt19937_64::result_type>(std::chrono::steady_clock::now().time_since_epoch().count());
}

if(filterArgs.value<::AlgType>(k_SeedChoice_Key) != AlgType::Iterative)
{
// Store Seed Value in Top Level Array
dataStructure.getDataRefAs<UInt64Array>(DataPath({filterArgs.value<std::string>(k_SeedArrayName_Key)}))[0] = seed;
}

DBSCANInputValues inputValues;

inputValues.Epsilon = filterArgs.value<float32>(k_Epsilon_Key);
Expand All @@ -170,6 +204,8 @@ Result<> DBSCANFilter::executeImpl(DataStructure& dataStructure, const Arguments
inputValues.FeatureIdsArrayPath = fIdsPath;
inputValues.FeatureAM = filterArgs.value<DataPath>(k_FeatureAMPath_Key);
inputValues.AllowCaching = filterArgs.value<bool>(k_UsePrecaching_Key);
inputValues.UseRandom = filterArgs.value<::AlgType>(k_SeedChoice_Key) != AlgType::Iterative;
inputValues.Seed = filterArgs.value<std::mt19937_64::result_type>(k_SeedValue_Key);

return DBSCAN(dataStructure, messageHandler, shouldCancel, &inputValues)();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ class SIMPLNXCORE_EXPORT DBSCANFilter : public IFilter
DBSCANFilter& operator=(DBSCANFilter&&) noexcept = delete;

// Parameter Keys
static inline constexpr StringLiteral k_SeedChoice_Key = "seed_choice";
static inline constexpr StringLiteral k_SeedValue_Key = "seed_value";
static inline constexpr StringLiteral k_SeedArrayName_Key = "seed_array_name";
static inline constexpr StringLiteral k_UsePrecaching_Key = "use_precaching";
static inline constexpr StringLiteral k_Epsilon_Key = "epsilon";
static inline constexpr StringLiteral k_MinPoints_Key = "min_points";
Expand Down
1 change: 1 addition & 0 deletions src/Plugins/SimplnxCore/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ set(${PLUGIN_NAME}UnitTest_SRCS
CreatePythonSkeletonTest.cpp
CropImageGeometryTest.cpp
CropVertexGeometryTest.cpp
DBSCANTest.cpp
DeleteDataTest.cpp
DREAM3DFileTest.cpp
ErodeDilateBadDataTest.cpp
Expand Down
Loading

0 comments on commit 0e2cc2a

Please sign in to comment.