Skip to content

Commit

Permalink
Dont loop through the labels to avoid collisions (#5672)
Browse files Browse the repository at this point in the history
* Dont loop through the labels to avoid collisions

Instead a map is used to know what labels have what original value and display text, they are uniquely defined by this.
Before was naively looping through the labels to check and merge them in a preceding step.
Same goes for intsId, it is simply tracked now instead of found by looping.

Both loops are now lookups at the cost of a little maintenance

Fixes jasp-stats/jasp-issues#2910 (extremely slow loading of semibig text file)

* Add yet another map...

Looping through the labels to find the Non-missing ones slows down the application quite a bit with bigger datasets.
so instead make a mapping in labelsTempCount() that we can easily query.
  • Loading branch information
JorisGoosen authored Sep 26, 2024
1 parent b6e2a6f commit 355382a
Show file tree
Hide file tree
Showing 5 changed files with 164 additions and 102 deletions.
209 changes: 128 additions & 81 deletions CommonData/column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -493,10 +493,8 @@ columnType Column::setValues(const stringvec & values, const stringvec & labels,
bool onlyDoubles = true,
onlyInts = true;

//Make sure we have only 1 label per value and display combo, because otherwise this will get too complicated
if(labelsMergeDuplicates() && aChange)
(*aChange) = true;

//Weve already made sure we have only 1 label per value and display combo, because otherwise this will get too complicated

intset ints; // to suggest whether this is a scalar or not we need to know whether we have more than treshold ints or not.
int tmpInt;
double tmpDbl;
Expand Down Expand Up @@ -636,10 +634,13 @@ void Column::_dbUpdateLabelOrder(bool noIncRevisionWhenBatchedPlease)

intintmap orderPerDbIds;

_highestIntsId = 0;
for(size_t i=0; i<_labels.size(); i++)
{
_labels[i]->setOrder(i);
orderPerDbIds[_labels[i]->dbId()] = i;

_highestIntsId = std::max(_highestIntsId, _labels[i]->intsId());
}

db().labelsSetOrder(orderPerDbIds);
Expand All @@ -657,6 +658,8 @@ void Column::labelsClear(bool doIncRevision)
db().labelsClear(_id);
_labels.clear();
_labelByIntsIdMap.clear();
_labelByValDis.clear();
_highestIntsId = 0;

if(doIncRevision)
incRevision(false);
Expand Down Expand Up @@ -692,6 +695,9 @@ int Column::labelsAdd(int display)

int Column::labelsAdd(const std::string &display)
{

JASPTIMER_SCOPE(Column::labelsAdd displaystring);

if(display == "")
return EmptyValues::missingValueInteger;

Expand All @@ -708,30 +714,33 @@ int Column::labelsAdd(const std::string &display)

int Column::labelsAdd(const std::string & display, const std::string & description, const Json::Value & originalValue)
{
sizetset intIds;

for(Label * label : _labels)
intIds.insert(label->intsId());

for(size_t newIntId = 0; ; newIntId++)
if(intIds.count(newIntId) == 0)
return labelsAdd(newIntId, display, true, description, originalValue);
JASPTIMER_SCOPE(Column::labelsAdd 3 args);

return labelsAdd(++_highestIntsId, display, true, description, originalValue);
}

int Column::labelsAdd(int value, const std::string & display, bool filterAllows, const std::string & description, const Json::Value & originalValue, int order, int id)
{
JASPTIMER_SCOPE(Column::labelsAdd);
JASPTIMER_SCOPE(Column::labelsAdd lotsa arg);

auto valDisplay = std::make_pair(Label::originalValueAsString(this, originalValue), display);

if(_labelByValDis.count(valDisplay))
return _labelByValDis.at(valDisplay)->intsId();

Label * label = new Label(this, display, value, filterAllows, description, originalValue, order, id);
_labels.push_back(label);

_labelByIntsIdMap[label->intsId()] = label;
_labelByIntsIdMap[label->intsId()] = label;
_labelByValDis[valDisplay] = label;

_highestIntsId = std::max(_highestIntsId, label->intsId());

_dbUpdateLabelOrder(true);
return label->intsId();
}

void Column::labelsRemoveByIntsId(std::set<int> valuesToRemove)
void Column::labelsRemoveByIntsId(std::set<int> valuesToRemove, bool updateOrder)
{
if (valuesToRemove.empty()) return;

Expand All @@ -744,7 +753,12 @@ void Column::labelsRemoveByIntsId(std::set<int> valuesToRemove)
[&](Label * label) {
if(std::find(valuesToRemove.begin(), valuesToRemove.end(), label->intsId()) != valuesToRemove.end())
{
_labelByIntsIdMap.erase(label->intsId());
_labelByIntsIdMap.erase(label->intsId());

auto valDis = std::make_pair(label->originalValueAsString(), label->labelDisplay());
if(_labelByValDis.count(valDis) && _labelByValDis.at(valDis) == label)
_labelByValDis.erase(valDis);

label->dbDelete();
delete label;
return true;
Expand All @@ -753,13 +767,14 @@ void Column::labelsRemoveByIntsId(std::set<int> valuesToRemove)
}),
_labels.end());

_dbUpdateLabelOrder();
if(updateOrder)
_dbUpdateLabelOrder();
}

strintmap Column::labelsResetValues(int & maxValue)
{
JASPTIMER_SCOPE(Column::labelsResetValues);

beginBatchedLabelsDB();

strintmap result;
Expand All @@ -780,6 +795,8 @@ strintmap Column::labelsResetValues(int & maxValue)

maxValue = labelValue;

_highestIntsId = maxValue;

endBatchedLabelsDB();

return result;
Expand All @@ -797,12 +814,14 @@ void Column::labelsRemoveBeyond(size_t indexToStartRemoving)

void Column::labelsTempReset()
{
_labelsTemp . clear();
_labelsTempDbls . clear();
_labelsTempToIndex . clear();
_labelsTempRevision = -1;
_labelsTempMaxWidth = 0;
_labelsTempNumerics = 0;
_labelsTemp . clear();
_labelsTempDbls . clear();
_labelsTempToIndex . clear();
_labelsTempRevision = -1;
_labelsTempMaxWidth = 0;
_labelsTempNumerics = 0;
_labelByNonEmptyIndex .clear();
_labelNonEmptyIndexByLabel .clear();
}

int Column::labelsTempCount()
Expand All @@ -814,15 +833,20 @@ int Column::labelsTempCount()
_labelsTemp . reserve(_labels.size());
_labelsTempDbls . reserve(_labels.size());

size_t nonEmptyIndex = 0;
for(size_t r=0; r<_labels.size(); r++)
if(!_labels[r]->isEmptyValue())
{
_labelsTemp . push_back(_labels[r]->label());
_labelsTempDbls . push_back(_labels[r]->originalValue().isDouble() ? _labels[r]->originalValue().asDouble() : EmptyValues::missingValueDouble);
_labelsTempToIndex[_labelsTemp[_labelsTemp.size()-1]] = _labelsTemp.size()-1; //We store the index in _labelsTemp in a map.
_labelByNonEmptyIndex[nonEmptyIndex] = _labels[r];
_labelNonEmptyIndexByLabel[_labels[r]] = nonEmptyIndex;

if(!std::isnan(*_labelsTempDbls.rbegin()))
_labelsTempNumerics++;

nonEmptyIndex++;
}

doubleset dblset;
Expand Down Expand Up @@ -879,15 +903,14 @@ std::string Column::labelsTempDisplay(size_t tempLabelIndex)
return _labelsTemp[tempLabelIndex];
}

Label * Column::labelByIndexNotEmpty(size_t index) const
int Column::labelIndexNonEmpty(Label *label) const
{
return !_labelNonEmptyIndexByLabel.count(label) ? -1 : _labelNonEmptyIndexByLabel.at(label);
}

Label * Column::labelByIndexNotEmpty(int index) const
{
size_t nonEmpty = 0;

for(size_t l=0; l<_labels.size(); l++)
if(!_labels[l]->isEmptyValue() && nonEmpty++ == index)
return _labels[l];

return nullptr;
return !_labelByNonEmptyIndex.count(index) ? nullptr : _labelByNonEmptyIndex.at(index);
}

size_t Column::labelCountNotEmpty() const
Expand Down Expand Up @@ -941,8 +964,13 @@ int Column::labelsDoubleValueIsTempLabelRow(double dbl)
void Column::_resetLabelValueMap()
{
_labelByIntsIdMap.clear();
_labelByValDis.clear();

for(Label * label : _labels)
_labelByIntsIdMap[label->intsId()] = label;
{
_labelByIntsIdMap[label->intsId()] = label;
_labelByValDis[std::make_pair(label->originalValueAsString(), label->labelDisplay())] = label;
}

labelsTempReset();
}
Expand Down Expand Up @@ -1272,11 +1300,21 @@ bool Column::replaceDoubleLabelFromRowWithDouble(size_t row, double dbl)
return true;
}

void Column::labelValueChanged(Label *label, double aDouble)
void Column::labelValueChanged(Label *label, double aDouble, const Json::Value & previousOriginal)
{
auto oldValDis = std::make_pair(Label::originalValueAsString(this, previousOriginal), label->labelDisplay());
bool merged = _labelByValDis.count(label->origValDisplay()) != 0;

if(merged)
labelsMergeDuplicateInto(label);

//Make sure it was registered before:
assert(_labelByValDis[oldValDis] == label);
//And that its new location is free:
assert(_labelByValDis.count(label->origValDisplay()) == 0 || _labelByValDis.at(label->origValDisplay()) == label);

//Lets assume that all occurences of a label in _dbls are the same.
//So when we encounter one that is the same as what is passed here we can return immediately

for(size_t r=0; r<_dbls.size(); r++)
if(_ints[r] == label->intsId())
{
Expand All @@ -1286,6 +1324,13 @@ void Column::labelValueChanged(Label *label, double aDouble)
_dbls[r] = aDouble;
}


_labelByValDis.erase(oldValDis);
_labelByValDis[label->origValDisplay()] = label;

if(merged)
_dbUpdateLabelOrder();

dbUpdateValues();
}

Expand All @@ -1295,12 +1340,30 @@ void Column::labelsHandleAutoSort(bool doDbUpdateEtc)
labelsOrderByValue(doDbUpdateEtc);
}

void Column::labelDisplayChanged(Label *label)
void Column::labelDisplayChanged(Label *label, const std::string & previousDisplay)
{
auto oldValDis = std::make_pair(label->originalValueAsString(), previousDisplay);
bool merged = _labelByValDis.count(label->origValDisplay()) != 0;

if(merged)
labelsMergeDuplicateInto(label);


//Make sure it was registered before:
assert(_labelByValDis[oldValDis] == label);
//And that its new location is free:
assert(_labelByValDis.count(label->origValDisplay()) == 0 || _labelByValDis.at(label->origValDisplay()) == label);

_labelByValDis.erase(oldValDis);
_labelByValDis[label->origValDisplay()] = label;

if(merged)
_dbUpdateLabelOrder();

if(_labelsTempRevision < _revision)
return; //We dont care about this change anymore if the list is out of date

size_t labelIdx = labelIndex(label);
size_t labelIdx = labelIndexNonEmpty(label);

if(_labelsTemp.size() > labelIdx)
_labelsTemp[labelIdx] = label->label();
Expand Down Expand Up @@ -1513,54 +1576,46 @@ Labelset Column::labelsByValue(const std::string & value) const
return Labelset(found.begin(), found.end());
}



Label * Column::labelByValueAndDisplay(const std::string &value, const std::string &labelText) const
{
JASPTIMER_SCOPE(Column::labelsByValueAndDisplay);

Labels found;
for(Label * label : _labels)
if(label->originalValueAsString() == value && label->label() == labelText)
return label;


return nullptr;
auto valDis = std::make_pair(value, labelText);
return _labelByValDis.count(valDis) == 0 ? nullptr : _labelByValDis.at(valDis);
}

bool Column::labelsMergeDuplicates()
void Column::labelsMergeDuplicateInto(Label * labelPrime)
{
bool thereWasDuplication = false;

for(size_t labelIndex=0; labelIndex < _labels.size(); labelIndex++)
const std::string value = labelPrime->originalValueAsString(),
labelText = labelPrime->label();
Labelset found;
std::copy_if(_labels.begin(), _labels.end(), std::inserter(found, found.begin()), [&value, &labelText](Label * label)
{
const std::string value = _labels[labelIndex]->originalValueAsString(),
labelText = _labels[labelIndex]->label();
Labels found;
std::copy_if(_labels.begin(), _labels.end(), std::back_inserter(found), [&value, &labelText](Label * label)
{
return label->originalValueAsString() == value && label->label() == labelText;
});
return label->originalValueAsString() == value && label->label() == labelText;
});

if(found.size() > 1)
{
found.erase(labelPrime);

if(found.size() > 1)
{
thereWasDuplication = true;
for(Label * otherLabel : found)
for(int & anInt : _ints)
if(anInt == otherLabel->intsId())
anInt = labelPrime->intsId();

//First one wins
for(size_t f=1; f<found.size(); f++)
for(int & anInt : _ints)
if(anInt == found[f]->intsId())
anInt = found[0]->intsId();
intset ids;

found.erase(found.begin());
intset ids;

for(Label * label : found)
ids.insert(label->intsId());

labelsRemoveByIntsId(ids);
}
for(Label * label : found)
ids.insert(label->intsId());

labelsRemoveByIntsId(ids, false);

_labelByValDis[labelPrime->origValDisplay()] = labelPrime;
labelsTempReset();
}

return thereWasDuplication;
}

bool Column::labelsRemoveOrphans()
Expand All @@ -1578,14 +1633,6 @@ bool Column::labelsRemoveOrphans()
return idsNotUsed.size();
}

int Column::labelIndex(const Label *label) const
{
for(size_t i=0; i<_labels.size(); i++)
if(_labels[i] == label)
return i;
return -1;
}

std::set<size_t> Column::labelsMoveRows(std::vector<qsizetype> rows, bool up)
{
JASPTIMER_SCOPE(Column::labelsMoveRows);
Expand Down Expand Up @@ -1921,11 +1968,11 @@ Json::Value Column::serializeLabels() const

void Column::deserializeLabelsForCopy(const Json::Value & labels)
{

labelsTempReset();

beginBatchedLabelsDB();
_labelByIntsIdMap.clear();
_labelByValDis.clear();
_labels.clear();

if (labels.isArray())
Expand Down
Loading

0 comments on commit 355382a

Please sign in to comment.