Skip to content

Commit

Permalink
YT-21253 Include HyperLogLog in YT table columnar statistics
Browse files Browse the repository at this point in the history
тестирование HLL на случайно сгенерированных данных:
p=10 показывает худшую погрешность в 9.9% (равномерное распределение на отрезке [0, 10^6), 10 HLL-групп, 1М значений, 631К уникальных)
b5399faf1a9757b07a2d2ee25bd16b8a27be7939
  • Loading branch information
alephonea committed Jul 2, 2024
1 parent a358c84 commit 877c563
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 0 deletions.
3 changes: 3 additions & 0 deletions yt/cpp/mapreduce/interface/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -1193,6 +1193,9 @@ struct TTableColumnarStatistics
/// Total data weight for all chunks for each of requested columns.
THashMap<TString, i64> ColumnDataWeight;

/// Estimated number of unique elements for each column.
THashMap<TString, ui64> ColumnEstimatedUniqueCounts;

/// Total weight of all old chunks that don't keep columnar statistics.
i64 LegacyChunksDataWeight = 0;

Expand Down
1 change: 1 addition & 0 deletions yt/cpp/mapreduce/interface/serialize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,7 @@ void Deserialize(TTableColumnarStatistics& statistics, const TNode& node)
{
const auto& nodeMap = node.AsMap();
DESERIALIZE_ITEM("column_data_weights", statistics.ColumnDataWeight);
DESERIALIZE_ITEM("column_estimated_unique_counts", statistics.ColumnEstimatedUniqueCounts);
DESERIALIZE_ITEM("legacy_chunks_data_weight", statistics.LegacyChunksDataWeight);
DESERIALIZE_ITEM("timestamp_total_weight", statistics.TimestampTotalWeight);
}
Expand Down
58 changes: 58 additions & 0 deletions yt/yt/core/misc/hyperloglog.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,32 @@
#include "hyperloglog_bias.h"
#include "farm_hash.h"

#include <yt/yt_proto/yt/core/misc/proto/hyperloglog.pb.h>

#include <cmath>

namespace NYT {

////////////////////////////////////////////////////////////////////////////////

// Forward declaration of the class template, so that the function templates
// below can mention THyperLogLog<Precision> in their signatures.
template <int Precision>
class THyperLogLog;

// The function templates below are declared ahead of the class definition
// because the class names their specializations as friends.

// Appends a textual representation of #value to #builder.
template <int Precision>
void FormatValue(TStringBuilderBase* builder, const THyperLogLog<Precision>& value, TStringBuf format);

// Stores the registers of #hyperloglog into #protoHyperLogLog.
template <int Precision>
void ToProto(
NProto::THyperLogLog* protoHyperLogLog,
const THyperLogLog<Precision>& hyperloglog);

// Restores the registers of #hyperloglog from #protoHyperLogLog.
template <int Precision>
void FromProto(
THyperLogLog<Precision>* hyperloglog,
const NProto::THyperLogLog& protoHyperLogLog);

////////////////////////////////////////////////////////////////////////////////

template <int Precision>
class THyperLogLog
{
Expand All @@ -28,7 +48,19 @@ class THyperLogLog

static ui64 EstimateCardinality(const std::vector<ui64>& values);

bool operator==(const THyperLogLog<Precision>& other) const = default;

private:
friend void ToProto<Precision>(
NProto::THyperLogLog* protoHyperLogLog,
const THyperLogLog<Precision>& hyperloglog);

friend void FromProto<Precision>(
THyperLogLog<Precision>* hyperloglog,
const NProto::THyperLogLog& protoHyperLogLog);

friend void FormatValue<Precision>(TStringBuilderBase* builder, const THyperLogLog<Precision>& value, TStringBuf format);

static constexpr ui64 RegisterCount = (ui64)1 << Precision;
static constexpr ui64 PrecisionMask = RegisterCount - 1;
static constexpr double Threshold = NDetail::Thresholds[Precision - 4];
Expand Down Expand Up @@ -134,6 +166,32 @@ ui64 THyperLogLog<Precision>::EstimateCardinality(const std::vector<ui64>& value
return state.EstimateCardinality();
}

//! Serializes the registers of #hyperloglog into the |registers| field of #protoHyperLogLog.
template <int Precision>
void ToProto(
    NProto::THyperLogLog* protoHyperLogLog,
    const THyperLogLog<Precision>& hyperloglog)
{
    // Delegate to the ToProto overload that handles the repeated field.
    auto* protoRegisters = protoHyperLogLog->mutable_registers();
    ToProto(protoRegisters, hyperloglog.ZeroCounts_);
}

//! Fills the registers of #hyperloglog from the |registers| field of #protoHyperLogLog.
/*!
 *  The number of registers in the message must be equal to the number of
 *  registers in #hyperloglog; this is checked with YT_VERIFY.
 */
template <int Precision>
void FromProto(
    THyperLogLog<Precision>* hyperloglog,
    const NProto::THyperLogLog& protoHyperLogLog)
{
    const auto& protoRegisters = protoHyperLogLog.registers();
    auto& registers = hyperloglog->ZeroCounts_;

    YT_VERIFY(protoHyperLogLog.registers_size() == std::ssize(registers));

    // The FromProto() template supports vectors but not ranges,
    // so the underlying values are copied one by one instead.
    std::copy(protoRegisters.begin(), protoRegisters.end(), registers.begin());
}

////////////////////////////////////////////////////////////////////////////////

//! Formats #value as the sequence of its register values; the format specifier is ignored.
template <int Precision>
void FormatValue(TStringBuilderBase* builder, const THyperLogLog<Precision>& value, TStringBuf /*format*/)
{
    // View the registers as a span of bytes and let the builder format the whole range.
    const std::span<const ui8> registers(value.ZeroCounts_);
    builder->AppendFormat("%v", registers);
}

////////////////////////////////////////////////////////////////////////////////

} // namespace NYT
2 changes: 2 additions & 0 deletions yt/yt_proto/yt/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ target_proto_messages(yt_proto-yt-core PRIVATE
${PROJECT_SOURCE_DIR}/yt/yt_proto/yt/core/misc/proto/bloom_filter.proto
${PROJECT_SOURCE_DIR}/yt/yt_proto/yt/core/misc/proto/error.proto
${PROJECT_SOURCE_DIR}/yt/yt_proto/yt/core/misc/proto/guid.proto
${PROJECT_SOURCE_DIR}/yt/yt_proto/yt/core/misc/proto/hyperloglog.proto
${PROJECT_SOURCE_DIR}/yt/yt_proto/yt/core/misc/proto/protobuf_helpers.proto
${PROJECT_SOURCE_DIR}/yt/yt_proto/yt/core/tracing/proto/span.proto
${PROJECT_SOURCE_DIR}/yt/yt_proto/yt/core/tracing/proto/tracing_ext.proto
Expand All @@ -54,6 +55,7 @@ target_sources(yt_proto-yt-core PRIVATE
${PROJECT_BINARY_DIR}/yt/yt_proto/yt/core/misc/proto/bloom_filter.pb.h
${PROJECT_BINARY_DIR}/yt/yt_proto/yt/core/misc/proto/error.pb.h
${PROJECT_BINARY_DIR}/yt/yt_proto/yt/core/misc/proto/guid.pb.h
${PROJECT_BINARY_DIR}/yt/yt_proto/yt/core/misc/proto/hyperloglog.pb.h
${PROJECT_BINARY_DIR}/yt/yt_proto/yt/core/misc/proto/protobuf_helpers.pb.h
${PROJECT_BINARY_DIR}/yt/yt_proto/yt/core/tracing/proto/span.pb.h
${PROJECT_BINARY_DIR}/yt/yt_proto/yt/core/tracing/proto/tracing_ext.pb.h
Expand Down
17 changes: 17 additions & 0 deletions yt/yt_proto/yt/core/misc/proto/hyperloglog.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package NYT.NProto;

option java_package = "tech.ytsaurus";
option java_multiple_files = true;

option go_package = "a.yandex-team.ru/yt/go/proto/core/misc";

////////////////////////////////////////////////////////////////////////////////

// Serialized registers of a HyperLogLog sketch, one value per register.
message THyperLogLog
{
    // Use uint32 for 8-bit integers. It is protobuf's most narrow int type.
    // The field is packed explicitly: this file has no syntax statement and is
    // therefore proto2, where repeated scalar fields are not packed by default
    // and every element would carry its own one-byte tag in addition to the value.
    // With packing, a register value below 128 takes exactly one byte on the wire,
    // because of varint encoding. Parsers accept both packed and unpacked encodings,
    // so previously written messages remain readable.
    repeated uint32 registers = 1 [packed = true];
}

////////////////////////////////////////////////////////////////////////////////
1 change: 1 addition & 0 deletions yt/yt_proto/yt/core/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ SRCS(
misc/proto/bloom_filter.proto
misc/proto/error.proto
misc/proto/guid.proto
misc/proto/hyperloglog.proto
misc/proto/protobuf_helpers.proto

tracing/proto/span.proto
Expand Down

0 comments on commit 877c563

Please sign in to comment.