From 483d23043f0dd6449418e4f30ff29929193f266b Mon Sep 17 00:00:00 2001 From: liqiang Date: Sat, 7 Dec 2024 15:33:19 +0000 Subject: [PATCH 01/11] feat: v1 --- core/prometheus/labels/TextParserSIMD.cpp | 63 ++++ core/prometheus/labels/TextParserSIMD.h | 337 ++++++++++++++++++++++ 2 files changed, 400 insertions(+) create mode 100644 core/prometheus/labels/TextParserSIMD.cpp create mode 100644 core/prometheus/labels/TextParserSIMD.h diff --git a/core/prometheus/labels/TextParserSIMD.cpp b/core/prometheus/labels/TextParserSIMD.cpp new file mode 100644 index 0000000000..5f1fdad532 --- /dev/null +++ b/core/prometheus/labels/TextParserSIMD.cpp @@ -0,0 +1,63 @@ +/* + * Copyright 2024 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "prometheus/labels/TextParserSIMD.h" + +#include + +#include +#include +#include + +#include "models/MetricEvent.h" +#include "models/PipelineEventGroup.h" +#include "models/StringView.h" +#include "prometheus/Utils.h" + +using namespace std; + +namespace logtail::prom { + + +// start to parse metric sample:test_metric{k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx + + +// parse:test_metric{k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx + + +// parse:k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx + + +// parse:"v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx + + +// parse:v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx + +// parse:, k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx +// or parse:} 9.9410452992e+10 1715829785083 # exemplarsxxx + + +// parse:9.9410452992e+10 1715829785083 # exemplarsxxx + + +// parse:1715829785083 # exemplarsxxx +// timestamp will be 1715829785.083 in OpenMetrics + + + + + +} // namespace logtail::prom diff --git a/core/prometheus/labels/TextParserSIMD.h b/core/prometheus/labels/TextParserSIMD.h new file mode 100644 index 0000000000..9b9edaf892 --- /dev/null +++ b/core/prometheus/labels/TextParserSIMD.h @@ -0,0 +1,337 @@ +/* + * Copyright 2024 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#include "logger/Logger.h" +#include "models/MetricEvent.h" +#include "models/PipelineEventGroup.h" +#include "models/StringView.h" +#include "prometheus/Utils.h" + +namespace logtail::prom { + +enum class TextState { Start, Done, Error }; + +// no strict grammar for prom +class TextParserSIMD { +public: + TextParserSIMD() = default; + explicit TextParserSIMD(bool honorTimestamps) : mHonorTimestamps(honorTimestamps) {} + + void SetDefaultTimestamp(uint64_t defaultTimestamp, uint32_t defaultNanoSec) { + mDefaultTimestamp = defaultTimestamp; + mDefaultNanoTimestamp = defaultNanoSec; + } + + PipelineEventGroup Parse(const std::string& content, uint64_t defaultTimestamp, uint32_t defaultNanoSec) { + SetDefaultTimestamp(defaultTimestamp, defaultNanoSec); + auto eGroup = PipelineEventGroup(std::make_shared()); + std::vector lines; + // pre-reserve vector size by 1024 which is experience value per line + lines.reserve(content.size() / 1024); + SplitStringView(content, '\n', lines); + for (const auto& line : lines) { + if (!IsValidMetric(line)) { + continue; + } + auto metricEvent = eGroup.CreateMetricEvent(); + if (ParseLine(line, *metricEvent)) { + eGroup.MutableEvents().emplace_back(std::move(metricEvent), false, nullptr); + } + } + + return eGroup; + } + + bool ParseLine(StringView line, MetricEvent& metricEvent) { + mState = TextState::Start; + mLabelName.clear(); + mEscape = FindFirstLetter(line.data(), line.size(), '\\').has_value(); + + HandleStart(metricEvent, line.data(), line.size()); + + if (mState == TextState::Done) { + return true; + } + + return false; + } + +private: + std::optional FindFirstLetter(const char* s, size_t len, char target) { + size_t res = 0; + while (res < len) { + if (s[res] == target) { + return res; + } + res++; + } + return std::nullopt; + } + std::optional FindFirstLetters(const char* s, size_t len, char target1, char target2) { + size_t res = 0; + while (res < len) { + if (s[res] == target1 || s[res] == target2) { + return res; + } + res++; + } + return std::nullopt; + } + + std::optional SkipTrailingWhitespace(const char* s, size_t pos) { + for (; pos > 0 && (s[pos] == ' ' || s[pos] == '\t'); --pos) { + } + if (pos == 0 && (s[pos] == ' ' || s[pos] == '\t')) { + return std::nullopt; + } + return pos; + } + + + void HandleError(const std::string& errMsg) { + LOG_WARNING(sLogger, ("text parser error parsing line", errMsg)); + mState = TextState::Error; + } + + void HandleStart(MetricEvent& metricEvent, const char* s, const size_t len) { + auto pos = SkipLeadingWhitespace(s, len, 0); + HandleMetricName(metricEvent, s + pos, len - pos); + } + void HandleMetricName(MetricEvent& metricEvent, const char* s, size_t len) { + auto pos = FindFirstLetter(s, len, '{'); + if (pos.has_value()) { + auto endPos = SkipTrailingWhitespace(s, pos.value() - 1); + if (endPos.has_value()) { + metricEvent.SetNameNoCopy(StringView(s, endPos.value() + 1)); + } else { + HandleError("error end of metric name"); + return; + } + auto nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); + HandleLabelName(metricEvent, s + nextPos, len - nextPos); + } else { + auto nextPos = FindFirstLetters(s, len, ' ', '\t'); + if (nextPos.has_value()) { + metricEvent.SetNameNoCopy(StringView(s, nextPos.value())); + auto nextNextPos = SkipLeadingWhitespace(s, len, nextPos.value()); + HandleSampleValue(metricEvent, s + nextNextPos, len - nextNextPos); + } + } + } + void HandleLabelName(MetricEvent& metricEvent, const char* s, size_t len) { + auto pos = FindFirstLetter(s, len, '='); + if (pos.has_value()) { + auto endPos = SkipTrailingWhitespace(s, pos.value() - 1); + if (endPos.has_value()) { + if (FindFirstLetter(s, endPos.value(), '"').has_value()) { + HandleError("invalid character in label name"); + return; + } + mLabelName = StringView(s, endPos.value() + 1); + } else { + HandleError("error end of metric name"); + return; + } + auto nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); + HandleLabelValue(metricEvent, s + nextPos, len - nextPos); + } else { + if (len > 0 && s[0] == '}') { + auto nextPos = SkipLeadingWhitespace(s, len, 1); + HandleSampleValue(metricEvent, s + nextPos, len - nextPos); + } else { + HandleError("invalid character in label name"); + } + } + } + void HandleLabelValue(MetricEvent& metricEvent, const char* s, size_t len) { + // left quote has been consumed + // LableValue supports escape char + if (len == 0 || s[0] != '"') { + HandleError("invalid character in label value"); + return; + } + s = s + 1; + len--; + size_t nextPos = 0; + if (mEscape) { + // slow path + // escape char + std::string labelValue; + size_t pos = 0; + for (size_t i = 0; i < len; i++) { + if (s[i] == '\\') { + if (i + 1 < len) { + switch (s[i + 1]) { + case 'n': + labelValue.push_back('\n'); + break; + case '\\': + case '\"': + labelValue.push_back(s[i + 1]); + break; + default: + labelValue.push_back('\\'); + labelValue.push_back(s[i + 1]); + break; + } + i++; + } else { + HandleError("invalid escape char"); + return; + } + } else if (s[i] == '"') { + pos = i; + break; + } else { + labelValue.push_back(s[i]); + } + } + auto sb = metricEvent.GetSourceBuffer()->CopyString(labelValue); + metricEvent.SetTag(mLabelName, StringView(sb.data, sb.size)); + nextPos = SkipLeadingWhitespace(s, len, pos + 1); + } else { + const auto pos = FindFirstLetter(s, len, '"'); + if (pos.has_value()) { + metricEvent.SetTagNoCopy(mLabelName, StringView(s, pos.value())); + nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); + } else { + HandleError("invalid character in label value"); + return; + } + } + if (s[nextPos] == ',') { + nextPos++; + nextPos = SkipLeadingWhitespace(s, len, nextPos); + if (s[nextPos] == '}') { + nextPos++; + nextPos = SkipLeadingWhitespace(s, len, nextPos); + HandleSampleValue(metricEvent, s + nextPos, len - nextPos); + return; + } + HandleLabelName(metricEvent, s + nextPos, len - nextPos); + } else if (s[nextPos] == '}') { + nextPos++; + nextPos = SkipLeadingWhitespace(s, len, nextPos); + HandleSampleValue(metricEvent, s + nextPos, len - nextPos); + } else { + HandleError("invalid character in label value"); + } + } + void HandleSampleValue(MetricEvent& metricEvent, const char* s, size_t len) { + auto pos = FindFirstLetters(s, len, ' ', '\t'); + if (!pos.has_value()) { + pos = FindFirstLetter(s, len, '#'); + } + size_t valueLen = 0; + if (pos.has_value()) { + valueLen = pos.value(); + } else { + valueLen = len; + } + if (valueLen == 0) { + HandleError("invalid sample value"); + return; + } + auto tmpSampleValue = StringView(s, valueLen); + try { + auto sampleValue = std::stod(tmpSampleValue.to_string()); + metricEvent.SetValue(sampleValue); + } catch (...) { + HandleError("invalid sample value"); + return; + } + if ((pos.has_value() && s[pos.value()] == '#') || valueLen == len) { + metricEvent.SetTimestamp(mDefaultTimestamp, mDefaultNanoTimestamp); + mState = TextState::Done; + return; + } + s = s + pos.value() + 1; + len -= pos.value() + 1; + auto nextPos = SkipLeadingWhitespace(s, len, 0); + HandleTimestamp(metricEvent, s + nextPos, len - nextPos); + } + void HandleTimestamp(MetricEvent& metricEvent, const char* s, size_t len) { + // '#' is for exemplars, and we don't need it + auto pos = FindFirstLetters(s, len, ' ', '\t'); + if (!pos.has_value()) { + pos = FindFirstLetter(s, len, '#'); + } + size_t valueLen = 0; + if (pos.has_value()) { + valueLen = pos.value(); + } else { + valueLen = len; + } + if (valueLen == 0) { + mState = TextState::Done; + return; + } + auto tmpTimestamp = StringView(s, valueLen); + double milliTimestamp = 0; + try { + milliTimestamp = std::stod(tmpTimestamp.to_string()); + } catch (...) { + HandleError("invalid timestamp"); + return; + } + + if (milliTimestamp > 1ULL << 63) { + HandleError("timestamp overflow"); + return; + } + if (milliTimestamp < 1UL << 31) { + milliTimestamp *= 1000; + } + time_t timestamp = (int64_t)milliTimestamp / 1000; + auto ns = ((int64_t)milliTimestamp % 1000) * 1000000; + if (mHonorTimestamps) { + metricEvent.SetTimestamp(timestamp, ns); + } else { + metricEvent.SetTimestamp(mDefaultTimestamp, mDefaultNanoTimestamp); + } + mState = TextState::Done; + } + + inline size_t SkipLeadingWhitespace(const char* s, size_t len, size_t pos) { + while (pos < len && (s[pos] == ' ' || s[pos] == '\t')) { + pos++; + } + return pos; + } + + TextState mState{TextState::Start}; + bool mEscape{false}; + + StringView mLabelName; + + bool mHonorTimestamps{true}; + time_t mDefaultTimestamp{0}; + uint32_t mDefaultNanoTimestamp{0}; + +#ifdef APSARA_UNIT_TEST_MAIN + friend class TextParserUnittest; +#endif +}; + +} // namespace logtail::prom From 5a86248f5b36c8fe078127805350f2a8eeb627a7 Mon Sep 17 00:00:00 2001 From: liqiang Date: Sat, 7 Dec 2024 15:33:45 +0000 Subject: [PATCH 02/11] ut --- .../prometheus/TextParserSIMDUnittest.cpp | 394 ++++++++++++++++++ 1 file changed, 394 insertions(+) create mode 100644 core/unittest/prometheus/TextParserSIMDUnittest.cpp diff --git a/core/unittest/prometheus/TextParserSIMDUnittest.cpp b/core/unittest/prometheus/TextParserSIMDUnittest.cpp new file mode 100644 index 0000000000..34d23b3622 --- /dev/null +++ b/core/unittest/prometheus/TextParserSIMDUnittest.cpp @@ -0,0 +1,394 @@ +/* + * Copyright 2024 iLogtail Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "MetricEvent.h" +#include "models/PipelineEventGroup.h" +#include "prometheus/labels/TextParser.h" +#include "prometheus/labels/TextParserSIMD.h" +#include "unittest/Unittest.h" + +using namespace std; + +namespace logtail::prom { + +bool IsDoubleEqual(double a, double b) { + return fabs(a - b) < 0.000001; +} + +class TextParserSIMDUnittest : public testing::Test { +public: + void TestParseMultipleLines() const; + void TestParseMetricWithTagsAndTimestamp() const; + void TestParseMetricWithManyTags() const; + void TestParseUnicodeLabelValue(); + + void TestParseFaliure(); + void TestParseSuccess(); + + void TestHonorTimestamps(); +}; + +void TextParserSIMDUnittest::TestParseMultipleLines() const { + auto parser = TextParserSIMD(); + const auto eGroup = parser.Parse(R"""( +# begin + +test_metric1{k1="v1", k2="v 1.0 + test_metric2{k1="v1", k2="v2"} 2.0 1234567890 +test_metric3{k1="v1",k2="v2"} 9.9410452992e+10 + test_metric4{k1="v1",k2="v2"} 9.9410452992e+10 1715829785083 + test_metric5{k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 +test_metric6{k1="v1",k2="v2",} 9.9410452992e+10 1715829785083 +test_metric7{k1="v1",k2="v2", } 9.9410452992e+10 1715829785083 +test_metric8{k1="v1", k2="v2", } 9.9410452992e+10 1715829785083 + +# end + )""", + 0, + 0); + const auto& events = &eGroup.GetEvents(); + APSARA_TEST_EQUAL(7UL, events->size()); +} +UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseMultipleLines) + +void TextParserSIMDUnittest::TestParseMetricWithTagsAndTimestamp() const { + auto parser = TextParserSIMD(); + string rawData = R"""( + test_metric{k1="v1", k2="v2"} 9.9410452992e+10 1715829785083 + test_metric2{k1="v1", k2="v2"} 2.0 1715829785083 + test_metric3{k1="v1",k2="v2"} 4.2 92233720368547758080000 + )"""; + const auto eGroup = parser.Parse(rawData, 0, 0); + + + // test_metric + const auto& events = &eGroup.GetEvents(); + const auto& event = events->front(); + const auto& metric = event.Get(); + APSARA_TEST_EQUAL("test_metric", metric->GetName().to_string()); + APSARA_TEST_EQUAL(1715829785, metric->GetTimestamp()); + APSARA_TEST_EQUAL(83000000, metric->GetTimestampNanosecond()); + APSARA_TEST_TRUE(IsDoubleEqual(9.9410452992e+10, metric->GetValue()->mValue)); + APSARA_TEST_EQUAL("v1", metric->GetTag("k1").to_string()); + APSARA_TEST_EQUAL("v2", metric->GetTag("k2").to_string()); + + // test_metric2 + const auto& event2 = events->at(1); + const auto& metric2 = event2.Get(); + APSARA_TEST_EQUAL("test_metric2", metric2->GetName().to_string()); + APSARA_TEST_EQUAL(1715829785, metric2->GetTimestamp()); + APSARA_TEST_TRUE(IsDoubleEqual(2.0, metric2->GetValue()->mValue)); + APSARA_TEST_EQUAL("v1", metric2->GetTag("k1").to_string()); + APSARA_TEST_EQUAL("v2", metric2->GetTag("k2").to_string()); + + // test_metric3 is not generated because of timestamp overflow + APSARA_TEST_EQUAL(2UL, events->size()); +} +UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseMetricWithTagsAndTimestamp) + +void TextParserSIMDUnittest::TestParseMetricWithManyTags() const { + auto parser = TextParserSIMD(); + string rawData + = R"""(container_blkio_device_usage_total{container="",device="/dev/nvme0n1",id="/",image="",major="259",minor="0",name="",namespace="",operation="Async",pod=""} 9.9410452992e+10 1715829785083)"""; + const auto eGroup = parser.Parse(rawData, 1715829785, 83000000); + const auto& events = &eGroup.GetEvents(); + APSARA_TEST_EQUAL(1UL, events->size()); + const auto& event = events->front(); + const auto& metric = event.Get(); + APSARA_TEST_EQUAL("container_blkio_device_usage_total", metric->GetName().to_string()); + APSARA_TEST_EQUAL(1715829785, metric->GetTimestamp()); + APSARA_TEST_TRUE(IsDoubleEqual(9.9410452992e+10, metric->GetValue()->mValue)); + + APSARA_TEST_EQUAL("", metric->GetTag("container").to_string()); + APSARA_TEST_EQUAL("/dev/nvme0n1", metric->GetTag("device").to_string()); + APSARA_TEST_EQUAL("/", metric->GetTag("id").to_string()); + APSARA_TEST_EQUAL("", metric->GetTag("image").to_string()); + APSARA_TEST_EQUAL("259", metric->GetTag("major").to_string()); + APSARA_TEST_EQUAL("0", metric->GetTag("minor").to_string()); + APSARA_TEST_EQUAL("", metric->GetTag("name").to_string()); + APSARA_TEST_EQUAL("", metric->GetTag("namespace").to_string()); + APSARA_TEST_EQUAL("Async", metric->GetTag("operation").to_string()); + APSARA_TEST_EQUAL("", metric->GetTag("pod").to_string()); +} +UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseMetricWithManyTags) + +void TextParserSIMDUnittest::TestParseFaliure() { + auto f = [](const std::string& content) { + TextParserSIMD parser; + PipelineEventGroup eGroup = parser.Parse(content, 0, 0); + APSARA_TEST_EQUAL(0UL, eGroup.GetEvents().size()); + }; + + // Empty lines and comments + f(""); + f(" "); + f("\t"); + f("\t \r"); + f("\t\t \n\n # foobar"); + f("#foobar"); + f("#foobar\n"); + + // invalid tags + f("a{"); + f("a { "); + f("a {foo"); + f("a {foo} 3"); + f("a {foo ="); + f("a {foo =\"bar"); + f("a {foo =\"b\\ar"); + f("a {foo = \"bar\""); + f("a {foo =\"bar\","); + f("a {foo =\"bar\" , "); + f("a {foo =\"bar\" , baz } 2"); + + // Invalid tags - see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4284 + f(R"(a{"__name__":"upsd_time_left_ns","host":"myhost", "status_OB":"true"} 12)"); + f(R"(a{host:"myhost"} 12)"); + f(R"(a{host:"myhost",foo="bar"} 12)"); + + // Empty metric name + f(R"({foo="bar"})"); + + // Invalid quotes for label value + f(R"({foo='bar'} 23)"); + f(R"({foo=`bar`} 23"); + + // Missing value + f("aaa"); + f(" aaa"); + f(" aaa "); + f(" aaa \n"); + f(R"( aa{foo="bar"} )" + + std::string("\n")); + + // Invalid value + f("foo bar"); + f("foo bar 124"); + + // Invalid timestamp + f("foo 123 bar"); +} +UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseFaliure) + +void TextParserSIMDUnittest::TestParseSuccess() { + TextParserSIMD parser; + string rawData; + // single value + rawData = "foobar 123"; + auto res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foobar"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 123.0)); + + rawData = "foobar 123.456 789\n"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foobar"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 123.456)); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 789); + + rawData = R"( + # TYPE cassandra_token_ownership_ratio gauge +cassandra_token_ownership_ratio 78.9)"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), + "cassandra_token_ownership_ratio"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 78.9)); + + // `#` char in label value + rawData = R"(foo{bar="#1 az"} 24)"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foo"); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("bar").to_string(), "#1 az"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 24.0)); + + // Incorrectly escaped backlash. This is real-world case, which must be supported. + rawData = R"(mssql_sql_server_active_transactions_sec{loginname="domain\somelogin",env="develop"} 56)"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), + "mssql_sql_server_active_transactions_sec"); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("loginname").to_string(), "domain\\somelogin"); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("env").to_string(), "develop"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 56.0)); + + rawData = R"(foo_bucket{le="10",a="#b"} 17)"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foo_bucket"); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("le").to_string(), "10"); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("a").to_string(), "#b"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 17.0)); + + // "Infinity" word - this has been added in OpenMetrics. + // See https://github.com/OpenObservability/OpenMetrics/blob/master/OpenMetrics.md + // Checks for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/924 + rawData = R"(foo Infinity + bar +Infinity + baz -infinity + aaa +inf + bbb -INF + ccc INF)"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().size(), 6UL); + APSARA_TEST_EQUAL(res.GetEvents()[0].Cast().GetName().to_string(), "foo"); + APSARA_TEST_EQUAL(res.GetEvents()[0].Cast().GetValue()->mValue, + std::numeric_limits::infinity()); + APSARA_TEST_EQUAL(res.GetEvents()[1].Cast().GetName().to_string(), "bar"); + APSARA_TEST_EQUAL(res.GetEvents()[1].Cast().GetValue()->mValue, + std::numeric_limits::infinity()); + APSARA_TEST_EQUAL(res.GetEvents()[2].Cast().GetName().to_string(), "baz"); + APSARA_TEST_EQUAL(res.GetEvents()[2].Cast().GetValue()->mValue, + -std::numeric_limits::infinity()); + APSARA_TEST_EQUAL(res.GetEvents()[3].Cast().GetName().to_string(), "aaa"); + APSARA_TEST_EQUAL(res.GetEvents()[3].Cast().GetValue()->mValue, + std::numeric_limits::infinity()); + APSARA_TEST_EQUAL(res.GetEvents()[4].Cast().GetName().to_string(), "bbb"); + APSARA_TEST_EQUAL(res.GetEvents()[4].Cast().GetValue()->mValue, + -std::numeric_limits::infinity()); + APSARA_TEST_EQUAL(res.GetEvents()[5].Cast().GetName().to_string(), "ccc"); + APSARA_TEST_EQUAL(res.GetEvents()[5].Cast().GetValue()->mValue, + std::numeric_limits::infinity()); + + // tags + rawData = R"(foo{bar="b\"a\\z"} -1.2)"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foo"); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("bar").to_string(), "b\"a\\z"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, -1.2)); + + // Empty tags + rawData = R"(foo {bar="baz",aa="",x="y"} 1 2)"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foo"); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("bar").to_string(), "baz"); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("aa").to_string(), ""); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("x").to_string(), "y"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 1.0)); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 2); + + // Multi lines with invalid line + rawData = "\t foo\t { } 0.3\t 2\naaa\n barbaz 0.34 43\n"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().size(), 2UL); + APSARA_TEST_EQUAL(res.GetEvents()[0].Cast().GetName().to_string(), "foo"); + APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents()[0].Cast().GetValue()->mValue, 0.3)); + APSARA_TEST_EQUAL(res.GetEvents()[0].Cast().GetTimestamp(), 2); + APSARA_TEST_EQUAL(res.GetEvents()[1].Cast().GetName().to_string(), "barbaz"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents()[1].Cast().GetValue()->mValue, 0.34)); + APSARA_TEST_EQUAL(res.GetEvents()[1].Cast().GetTimestamp(), 43); + + // Spaces around tags + rawData = R"(vm_accounting { name="vminsertRows", accountID = "1" , projectID= "1" } 277779100)"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "vm_accounting"); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("name").to_string(), "vminsertRows"); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("accountID").to_string(), "1"); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("projectID").to_string(), "1"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 277779100.0)); + + // Exemplars + rawData = "abc 123 456 # foobar"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "abc"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 123.0)); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 456); + + // float timestamp + rawData = "abc 123 456.789"; + res = parser.Parse(rawData, 0, 0); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "abc"); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 123.0)); + APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents().back().Cast().GetTimestamp(), 456)); + APSARA_TEST_TRUE( + IsDoubleEqual(res.GetEvents().back().Cast().GetTimestampNanosecond().value(), 789000000)); +} + +UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseSuccess) + +void TextParserSIMDUnittest::TestHonorTimestamps() { + // false + TextParserSIMD parser(false); + // has timestamp + std::string rawData = "abc 123 456"; + PipelineEventGroup res = parser.Parse(rawData, 789, 111); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 789); + APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents().back().Cast().GetTimestampNanosecond().value(), 111)); + + // no timestamp + rawData = "abc 123"; + res = parser.Parse(rawData, 789, 111); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 789); + APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents().back().Cast().GetTimestampNanosecond().value(), 111)); + + + // true + parser.mHonorTimestamps = true; + // has timestamp + rawData = "abc 123 456"; + res = parser.Parse(rawData, 789, 111); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 456); + APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents().back().Cast().GetTimestampNanosecond().value(), 0)); + + // no timestamp + rawData = "abc 123"; + res = parser.Parse(rawData, 789, 111); + APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 789); + APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents().back().Cast().GetTimestampNanosecond().value(), 111)); +} + +UNIT_TEST_CASE(TextParserSIMDUnittest, TestHonorTimestamps) + +void TextParserSIMDUnittest::TestParseUnicodeLabelValue() { + auto parser = TextParserSIMD(); + string rawData + = R"""(container_blkio_device_usage_total{container="",device="/dev/nvme0n1δΈ­ζ–‡",id="/πŸ˜€",image="",major="259",minor="0",name="",namespace="",operation="Async",pod=""} 9.9410452992e+10 1715829785083)"""; + const auto eGroup = parser.Parse(rawData, 1715829785, 83000000); + const auto& events = &eGroup.GetEvents(); + APSARA_TEST_EQUAL(1UL, events->size()); + const auto& event = events->front(); + const auto& metric = event.Get(); + APSARA_TEST_EQUAL("container_blkio_device_usage_total", metric->GetName().to_string()); + APSARA_TEST_EQUAL(1715829785, metric->GetTimestamp()); + APSARA_TEST_TRUE(IsDoubleEqual(9.9410452992e+10, metric->GetValue()->mValue)); + + APSARA_TEST_EQUAL("", metric->GetTag("container").to_string()); + APSARA_TEST_EQUAL("/dev/nvme0n1δΈ­ζ–‡", metric->GetTag("device").to_string()); + APSARA_TEST_EQUAL("/πŸ˜€", metric->GetTag("id").to_string()); + APSARA_TEST_EQUAL("", metric->GetTag("image").to_string()); + APSARA_TEST_EQUAL("259", metric->GetTag("major").to_string()); + APSARA_TEST_EQUAL("0", metric->GetTag("minor").to_string()); + APSARA_TEST_EQUAL("", metric->GetTag("name").to_string()); + APSARA_TEST_EQUAL("", metric->GetTag("namespace").to_string()); + APSARA_TEST_EQUAL("Async", metric->GetTag("operation").to_string()); + APSARA_TEST_EQUAL("", metric->GetTag("pod").to_string()); +} + +UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseUnicodeLabelValue) + +} // namespace logtail + +UNIT_TEST_MAIN From 1aa13f7ab5d5b6e836e583adb27afec154347557 Mon Sep 17 00:00:00 2001 From: liqiang Date: Sat, 7 Dec 2024 16:09:44 +0000 Subject: [PATCH 03/11] feat: simd v2 --- core/prometheus/labels/TextParserSIMD.cpp | 352 +++++++++++++++++- core/prometheus/labels/TextParserSIMD.h | 292 +-------------- core/unittest/prometheus/CMakeLists.txt | 4 + .../prometheus/TextParserBenchmark.cpp | 18 +- 4 files changed, 372 insertions(+), 294 deletions(-) diff --git a/core/prometheus/labels/TextParserSIMD.cpp b/core/prometheus/labels/TextParserSIMD.cpp index 5f1fdad532..9f503a3279 100644 --- a/core/prometheus/labels/TextParserSIMD.cpp +++ b/core/prometheus/labels/TextParserSIMD.cpp @@ -22,6 +22,7 @@ #include #include +#include "logger/Logger.h" #include "models/MetricEvent.h" #include "models/PipelineEventGroup.h" #include "models/StringView.h" @@ -31,33 +32,364 @@ using namespace std; namespace logtail::prom { +TextParserSIMD::TextParserSIMD(bool honorTimestamps) : mHonorTimestamps(honorTimestamps) { +} -// start to parse metric sample:test_metric{k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx +void TextParserSIMD::SetDefaultTimestamp(uint64_t defaultTimestamp, uint32_t defaultNanoSec) { + mDefaultTimestamp = defaultTimestamp; + mDefaultNanoTimestamp = defaultNanoSec; +} +PipelineEventGroup +TextParserSIMD::Parse(const std::string& content, uint64_t defaultTimestamp, uint32_t defaultNanoSec) { + SetDefaultTimestamp(defaultTimestamp, defaultNanoSec); + auto eGroup = PipelineEventGroup(std::make_shared()); + std::vector lines; + // pre-reserve vector size by 1024 which is experience value per line + lines.reserve(content.size() / 1024); + SplitStringView(content, '\n', lines); + for (const auto& line : lines) { + if (!IsValidMetric(line)) { + continue; + } + auto metricEvent = eGroup.CreateMetricEvent(); + if (ParseLine(line, *metricEvent)) { + eGroup.MutableEvents().emplace_back(std::move(metricEvent), false, nullptr); + } + } -// parse:test_metric{k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx + return eGroup; +} +bool TextParserSIMD::ParseLine(StringView line, MetricEvent& metricEvent) { + mState = TextState::Start; + mLabelName.clear(); + mEscape = FindFirstLetter(line.data(), line.size(), '\\').has_value(); -// parse:k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx + HandleStart(metricEvent, line.data(), line.size()); + if (mState == TextState::Done) { + return true; + } -// parse:"v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx + return false; +} +std::optional TextParserSIMD::FindFirstLetter(const char* s, size_t len, char target) { + size_t res = 0; -// parse:v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx + __m128i targetVec = _mm_set1_epi8(target); -// parse:, k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx -// or parse:} 9.9410452992e+10 1715829785083 # exemplarsxxx + while (res + 16 < len) { + __m128i chunk = _mm_loadu_si128(reinterpret_cast(&s[res])); + __m128i cmp = _mm_cmpeq_epi8(chunk, targetVec); -// parse:9.9410452992e+10 1715829785083 # exemplarsxxx + int mask = _mm_movemask_epi8(cmp); + if (mask != 0) { + return res + __builtin_ffs(mask) - 1; + } -// parse:1715829785083 # exemplarsxxx -// timestamp will be 1715829785.083 in OpenMetrics + res += 16; + } + while (res < len) { + if (s[res] == target) { + return res; + } + res++; + } + return std::nullopt; +} +std::optional TextParserSIMD::FindFirstWhiteSpace(const char* s, size_t len) { + size_t res = 0; + static __m128i sTargetVec1 = _mm_set1_epi8(' '); + static __m128i sTargetVec2 = _mm_set1_epi8('\t'); + while (res + 16 < len) { + __m128i chunk = _mm_loadu_si128(reinterpret_cast(&s[res])); + + __m128i cmp1 = _mm_cmpeq_epi8(chunk, sTargetVec1); + __m128i cmp2 = _mm_cmpeq_epi8(chunk, sTargetVec2); + + int mask1 = _mm_movemask_epi8(cmp1); + int mask2 = _mm_movemask_epi8(cmp2); + + if (mask1 != 0) { + return res + __builtin_ffs(mask1) - 1; + } + if (mask2 != 0) { + return res + __builtin_ffs(mask2) - 1; + } + + res += 16; + } + + while (res < len) { + if (s[res] == ' ' || s[res] == '\t') { + return res; + } + res++; + } + return std::nullopt; +} + +std::optional TextParserSIMD::FindWhiteSpaceAndExemplar(const char* s, size_t len) { + size_t res = 0; + + static __m128i sTargetVec1 = _mm_set1_epi8(' '); + static __m128i sTargetVec2 = _mm_set1_epi8('\t'); + static __m128i sTargetVec3 = _mm_set1_epi8('#'); + + while (res + 16 < len) { + __m128i chunk = _mm_loadu_si128(reinterpret_cast(&s[res])); + + __m128i cmp1 = _mm_cmpeq_epi8(chunk, sTargetVec1); + __m128i cmp2 = _mm_cmpeq_epi8(chunk, sTargetVec2); + __m128i cmp3 = _mm_cmpeq_epi8(chunk, sTargetVec3); + + int mask1 = _mm_movemask_epi8(cmp1); + int mask2 = _mm_movemask_epi8(cmp2); + int mask3 = _mm_movemask_epi8(cmp3); + + if (mask1 != 0) { + return res + __builtin_ffs(mask1) - 1; + } + if (mask2 != 0) { + return res + __builtin_ffs(mask2) - 1; + } + if (mask3 != 0) { + return res + __builtin_ffs(mask3) - 1; + } + + res += 16; + } + + while (res < len) { + if (s[res] == ' ' || s[res] == '\t' || s[res] == '#') { + return res; + } + res++; + } + return std::nullopt; +} + +std::optional TextParserSIMD::SkipTrailingWhitespace(const char* s, size_t pos) { + for (; pos > 0 && (s[pos] == ' ' || s[pos] == '\t'); --pos) { + } + if (pos == 0 && (s[pos] == ' ' || s[pos] == '\t')) { + return std::nullopt; + } + return pos; +} + + +void TextParserSIMD::HandleError(const std::string& errMsg) { + LOG_WARNING(sLogger, ("text parser error parsing line", errMsg)); + mState = TextState::Error; +} + +void TextParserSIMD::HandleStart(MetricEvent& metricEvent, const char* s, const size_t len) { + auto pos = SkipLeadingWhitespace(s, len, 0); + HandleMetricName(metricEvent, s + pos, len - pos); +} +void TextParserSIMD::HandleMetricName(MetricEvent& metricEvent, const char* s, size_t len) { + auto pos = FindFirstLetter(s, len, '{'); + if (pos.has_value()) { + auto endPos = SkipTrailingWhitespace(s, pos.value() - 1); + if (endPos.has_value()) { + metricEvent.SetNameNoCopy(StringView(s, endPos.value() + 1)); + } else { + HandleError("error end of metric name"); + return; + } + auto nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); + HandleLabelName(metricEvent, s + nextPos, len - nextPos); + } else { + auto nextPos = FindFirstWhiteSpace(s, len); + if (nextPos.has_value()) { + metricEvent.SetNameNoCopy(StringView(s, nextPos.value())); + auto nextNextPos = SkipLeadingWhitespace(s, len, nextPos.value()); + HandleSampleValue(metricEvent, s + nextNextPos, len - nextNextPos); + } + } +} +void TextParserSIMD::HandleLabelName(MetricEvent& metricEvent, const char* s, size_t len) { + auto pos = FindFirstLetter(s, len, '='); + if (pos.has_value()) { + auto endPos = SkipTrailingWhitespace(s, pos.value() - 1); + if (endPos.has_value()) { + if (FindFirstLetter(s, endPos.value(), '"').has_value()) { + HandleError("invalid character in label name"); + return; + } + mLabelName = StringView(s, endPos.value() + 1); + } else { + HandleError("error end of metric name"); + return; + } + auto nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); + HandleLabelValue(metricEvent, s + nextPos, len - nextPos); + } else { + if (len > 0 && s[0] == '}') { + auto nextPos = SkipLeadingWhitespace(s, len, 1); + HandleSampleValue(metricEvent, s + nextPos, len - nextPos); + } else { + HandleError("invalid character in label name"); + } + } +} +void TextParserSIMD::HandleLabelValue(MetricEvent& metricEvent, const char* s, size_t len) { + // left quote has been consumed + // LableValue supports escape char + if (len == 0 || s[0] != '"') { + HandleError("invalid character in label value"); + return; + } + s = s + 1; + len--; + size_t nextPos = 0; + if (mEscape) { + // slow path + // escape char + std::string labelValue; + size_t pos = 0; + for (size_t i = 0; i < len; i++) { + if (s[i] == '\\') { + if (i + 1 < len) { + switch (s[i + 1]) { + case 'n': + labelValue.push_back('\n'); + break; + case '\\': + case '\"': + labelValue.push_back(s[i + 1]); + break; + default: + labelValue.push_back('\\'); + labelValue.push_back(s[i + 1]); + break; + } + i++; + } else { + HandleError("invalid escape char"); + return; + } + } else if (s[i] == '"') { + pos = i; + break; + } else { + labelValue.push_back(s[i]); + } + } + auto sb = metricEvent.GetSourceBuffer()->CopyString(labelValue); + metricEvent.SetTag(mLabelName, StringView(sb.data, sb.size)); + nextPos = SkipLeadingWhitespace(s, len, pos + 1); + } else { + const auto pos = FindFirstLetter(s, len, '"'); + if (pos.has_value()) { + metricEvent.SetTagNoCopy(mLabelName, StringView(s, pos.value())); + nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); + } else { + HandleError("invalid character in label value"); + return; + } + } + if (s[nextPos] == ',') { + nextPos++; + nextPos = SkipLeadingWhitespace(s, len, nextPos); + if (s[nextPos] == '}') { + nextPos++; + nextPos = SkipLeadingWhitespace(s, len, nextPos); + HandleSampleValue(metricEvent, s + nextPos, len - nextPos); + return; + } + HandleLabelName(metricEvent, s + nextPos, len - nextPos); + } else if (s[nextPos] == '}') { + nextPos++; + nextPos = SkipLeadingWhitespace(s, len, nextPos); + HandleSampleValue(metricEvent, s + nextPos, len - nextPos); + } else { + HandleError("invalid character in label value"); + } +} +void TextParserSIMD::HandleSampleValue(MetricEvent& metricEvent, const char* s, size_t len) { + auto pos = FindWhiteSpaceAndExemplar(s, len); + size_t valueLen = 0; + if (pos.has_value()) { + valueLen = pos.value(); + } else { + valueLen = len; + } + if (valueLen == 0) { + HandleError("invalid sample value"); + return; + } + auto tmpSampleValue = StringView(s, valueLen); + try { + auto sampleValue = std::stod(tmpSampleValue.to_string()); + metricEvent.SetValue(sampleValue); + } catch (...) { + HandleError("invalid sample value"); + return; + } + if ((pos.has_value() && s[pos.value()] == '#') || valueLen == len) { + metricEvent.SetTimestamp(mDefaultTimestamp, mDefaultNanoTimestamp); + mState = TextState::Done; + return; + } + s = s + pos.value() + 1; + len -= pos.value() + 1; + auto nextPos = SkipLeadingWhitespace(s, len, 0); + HandleTimestamp(metricEvent, s + nextPos, len - nextPos); +} +void TextParserSIMD::HandleTimestamp(MetricEvent& metricEvent, const char* s, size_t len) { + // '#' is for exemplars, and we don't need it + auto pos = FindWhiteSpaceAndExemplar(s, len); + size_t valueLen = 0; + if (pos.has_value()) { + valueLen = pos.value(); + } else { + valueLen = len; + } + if (valueLen == 0) { + mState = TextState::Done; + return; + } + auto tmpTimestamp = StringView(s, valueLen); + double milliTimestamp = 0; + try { + milliTimestamp = std::stod(tmpTimestamp.to_string()); + } catch (...) { + HandleError("invalid timestamp"); + return; + } + + if (milliTimestamp > 1ULL << 63) { + HandleError("timestamp overflow"); + return; + } + if (milliTimestamp < 1UL << 31) { + milliTimestamp *= 1000; + } + time_t timestamp = (int64_t)milliTimestamp / 1000; + auto ns = ((int64_t)milliTimestamp % 1000) * 1000000; + if (mHonorTimestamps) { + metricEvent.SetTimestamp(timestamp, ns); + } else { + metricEvent.SetTimestamp(mDefaultTimestamp, mDefaultNanoTimestamp); + } + mState = TextState::Done; +} + +inline size_t TextParserSIMD::SkipLeadingWhitespace(const char* s, size_t len, size_t pos) { + while (pos < len && (s[pos] == ' ' || s[pos] == '\t')) { + pos++; + } + return pos; +} } // namespace logtail::prom diff --git a/core/prometheus/labels/TextParserSIMD.h b/core/prometheus/labels/TextParserSIMD.h index 9b9edaf892..8ab9ba2aad 100644 --- a/core/prometheus/labels/TextParserSIMD.h +++ b/core/prometheus/labels/TextParserSIMD.h @@ -22,11 +22,9 @@ #include #include -#include "logger/Logger.h" #include "models/MetricEvent.h" #include "models/PipelineEventGroup.h" #include "models/StringView.h" -#include "prometheus/Utils.h" namespace logtail::prom { @@ -36,289 +34,31 @@ enum class TextState { Start, Done, Error }; class TextParserSIMD { public: TextParserSIMD() = default; - explicit TextParserSIMD(bool honorTimestamps) : mHonorTimestamps(honorTimestamps) {} + explicit TextParserSIMD(bool honorTimestamps); - void SetDefaultTimestamp(uint64_t defaultTimestamp, uint32_t defaultNanoSec) { - mDefaultTimestamp = defaultTimestamp; - mDefaultNanoTimestamp = defaultNanoSec; - } + void SetDefaultTimestamp(uint64_t defaultTimestamp, uint32_t defaultNanoSec); - PipelineEventGroup Parse(const std::string& content, uint64_t defaultTimestamp, uint32_t defaultNanoSec) { - SetDefaultTimestamp(defaultTimestamp, defaultNanoSec); - auto eGroup = PipelineEventGroup(std::make_shared()); - std::vector lines; - // pre-reserve vector size by 1024 which is experience value per line - lines.reserve(content.size() / 1024); - SplitStringView(content, '\n', lines); - for (const auto& line : lines) { - if (!IsValidMetric(line)) { - continue; - } - auto metricEvent = eGroup.CreateMetricEvent(); - if (ParseLine(line, *metricEvent)) { - eGroup.MutableEvents().emplace_back(std::move(metricEvent), false, nullptr); - } - } + PipelineEventGroup Parse(const std::string& content, uint64_t defaultTimestamp, uint32_t defaultNanoSec); - return eGroup; - } - - bool ParseLine(StringView line, MetricEvent& metricEvent) { - mState = TextState::Start; - mLabelName.clear(); - mEscape = FindFirstLetter(line.data(), line.size(), '\\').has_value(); - - HandleStart(metricEvent, line.data(), line.size()); - - if (mState == TextState::Done) { - return true; - } - - return false; - } + bool ParseLine(StringView line, MetricEvent& metricEvent); private: - std::optional FindFirstLetter(const char* s, size_t len, char target) { - size_t res = 0; - while (res < len) { - if (s[res] == target) { - return res; - } - res++; - } - return std::nullopt; - } - std::optional FindFirstLetters(const char* s, size_t len, char target1, char target2) { - size_t res = 0; - while (res < len) { - if (s[res] == target1 || s[res] == target2) { - return res; - } - res++; - } - return std::nullopt; - } - - std::optional SkipTrailingWhitespace(const char* s, size_t pos) { - for (; pos > 0 && (s[pos] == ' ' || s[pos] == '\t'); --pos) { - } - if (pos == 0 && (s[pos] == ' ' || s[pos] == '\t')) { - return std::nullopt; - } - return pos; - } - + std::optional FindFirstLetter(const char* s, size_t len, char target); + std::optional FindFirstWhiteSpace(const char* s, size_t len); - void HandleError(const std::string& errMsg) { - LOG_WARNING(sLogger, ("text parser error parsing line", errMsg)); - mState = TextState::Error; - } + std::optional FindWhiteSpaceAndExemplar(const char* s, size_t len); - void HandleStart(MetricEvent& metricEvent, const char* s, const size_t len) { - auto pos = SkipLeadingWhitespace(s, len, 0); - HandleMetricName(metricEvent, s + pos, len - pos); - } - void HandleMetricName(MetricEvent& metricEvent, const char* s, size_t len) { - auto pos = FindFirstLetter(s, len, '{'); - if (pos.has_value()) { - auto endPos = SkipTrailingWhitespace(s, pos.value() - 1); - if (endPos.has_value()) { - metricEvent.SetNameNoCopy(StringView(s, endPos.value() + 1)); - } else { - HandleError("error end of metric name"); - return; - } - auto nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); - HandleLabelName(metricEvent, s + nextPos, len - nextPos); - } else { - auto nextPos = FindFirstLetters(s, len, ' ', '\t'); - if (nextPos.has_value()) { - metricEvent.SetNameNoCopy(StringView(s, nextPos.value())); - auto nextNextPos = SkipLeadingWhitespace(s, len, nextPos.value()); - HandleSampleValue(metricEvent, s + nextNextPos, len - nextNextPos); - } - } - } - void HandleLabelName(MetricEvent& metricEvent, const char* s, size_t len) { - auto pos = FindFirstLetter(s, len, '='); - if (pos.has_value()) { - auto endPos = SkipTrailingWhitespace(s, pos.value() - 1); - if (endPos.has_value()) { - if (FindFirstLetter(s, endPos.value(), '"').has_value()) { - HandleError("invalid character in label name"); - return; - } - mLabelName = StringView(s, endPos.value() + 1); - } else { - HandleError("error end of metric name"); - return; - } - auto nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); - HandleLabelValue(metricEvent, s + nextPos, len - nextPos); - } else { - if (len > 0 && s[0] == '}') { - auto nextPos = SkipLeadingWhitespace(s, len, 1); - HandleSampleValue(metricEvent, s + nextPos, len - nextPos); - } else { - HandleError("invalid character in label name"); - } - } - } - void HandleLabelValue(MetricEvent& metricEvent, const char* s, size_t len) { - // left quote has been consumed - // LableValue supports escape char - if (len == 0 || s[0] != '"') { - HandleError("invalid character in label value"); - return; - } - s = s + 1; - len--; - size_t nextPos = 0; - if (mEscape) { - // slow path - // escape char - std::string labelValue; - size_t pos = 0; - for (size_t i = 0; i < len; i++) { - if (s[i] == '\\') { - if (i + 1 < len) { - switch (s[i + 1]) { - case 'n': - labelValue.push_back('\n'); - break; - case '\\': - case '\"': - labelValue.push_back(s[i + 1]); - break; - default: - labelValue.push_back('\\'); - labelValue.push_back(s[i + 1]); - break; - } - i++; - } else { - HandleError("invalid escape char"); - return; - } - } else if (s[i] == '"') { - pos = i; - break; - } else { - labelValue.push_back(s[i]); - } - } - auto sb = metricEvent.GetSourceBuffer()->CopyString(labelValue); - metricEvent.SetTag(mLabelName, StringView(sb.data, sb.size)); - nextPos = SkipLeadingWhitespace(s, len, pos + 1); - } else { - const auto pos = FindFirstLetter(s, len, '"'); - if (pos.has_value()) { - metricEvent.SetTagNoCopy(mLabelName, StringView(s, pos.value())); - nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); - } else { - HandleError("invalid character in label value"); - return; - } - } - if (s[nextPos] == ',') { - nextPos++; - nextPos = SkipLeadingWhitespace(s, len, nextPos); - if (s[nextPos] == '}') { - nextPos++; - nextPos = SkipLeadingWhitespace(s, len, nextPos); - HandleSampleValue(metricEvent, s + nextPos, len - nextPos); - return; - } - HandleLabelName(metricEvent, s + nextPos, len - nextPos); - } else if (s[nextPos] == '}') { - nextPos++; - nextPos = SkipLeadingWhitespace(s, len, nextPos); - HandleSampleValue(metricEvent, s + nextPos, len - nextPos); - } else { - HandleError("invalid character in label value"); - } - } - void HandleSampleValue(MetricEvent& metricEvent, const char* s, size_t len) { - auto pos = FindFirstLetters(s, len, ' ', '\t'); - if (!pos.has_value()) { - pos = FindFirstLetter(s, len, '#'); - } - size_t valueLen = 0; - if (pos.has_value()) { - valueLen = pos.value(); - } else { - valueLen = len; - } - if (valueLen == 0) { - HandleError("invalid sample value"); - return; - } - auto tmpSampleValue = StringView(s, valueLen); - try { - auto sampleValue = std::stod(tmpSampleValue.to_string()); - metricEvent.SetValue(sampleValue); - } catch (...) { - HandleError("invalid sample value"); - return; - } - if ((pos.has_value() && s[pos.value()] == '#') || valueLen == len) { - metricEvent.SetTimestamp(mDefaultTimestamp, mDefaultNanoTimestamp); - mState = TextState::Done; - return; - } - s = s + pos.value() + 1; - len -= pos.value() + 1; - auto nextPos = SkipLeadingWhitespace(s, len, 0); - HandleTimestamp(metricEvent, s + nextPos, len - nextPos); - } - void HandleTimestamp(MetricEvent& metricEvent, const char* s, size_t len) { - // '#' is for exemplars, and we don't need it - auto pos = FindFirstLetters(s, len, ' ', '\t'); - if (!pos.has_value()) { - pos = FindFirstLetter(s, len, '#'); - } - size_t valueLen = 0; - if (pos.has_value()) { - valueLen = pos.value(); - } else { - valueLen = len; - } - if (valueLen == 0) { - mState = TextState::Done; - return; - } - auto tmpTimestamp = StringView(s, valueLen); - double milliTimestamp = 0; - try { - milliTimestamp = std::stod(tmpTimestamp.to_string()); - } catch (...) { - HandleError("invalid timestamp"); - return; - } + std::optional SkipTrailingWhitespace(const char* s, size_t pos); - if (milliTimestamp > 1ULL << 63) { - HandleError("timestamp overflow"); - return; - } - if (milliTimestamp < 1UL << 31) { - milliTimestamp *= 1000; - } - time_t timestamp = (int64_t)milliTimestamp / 1000; - auto ns = ((int64_t)milliTimestamp % 1000) * 1000000; - if (mHonorTimestamps) { - metricEvent.SetTimestamp(timestamp, ns); - } else { - metricEvent.SetTimestamp(mDefaultTimestamp, mDefaultNanoTimestamp); - } - mState = TextState::Done; - } + void HandleError(const std::string& errMsg); - inline size_t SkipLeadingWhitespace(const char* s, size_t len, size_t pos) { - while (pos < len && (s[pos] == ' ' || s[pos] == '\t')) { - pos++; - } - return pos; - } + void HandleStart(MetricEvent& metricEvent, const char* s, size_t len); + void HandleMetricName(MetricEvent& metricEvent, const char* s, size_t len); + void HandleLabelName(MetricEvent& metricEvent, const char* s, size_t len); + void HandleLabelValue(MetricEvent& metricEvent, const char* s, size_t len); + void HandleSampleValue(MetricEvent& metricEvent, const char* s, size_t len); + void HandleTimestamp(MetricEvent& metricEvent, const char* s, size_t len); + inline size_t SkipLeadingWhitespace(const char* s, size_t len, size_t pos); TextState mState{TextState::Start}; bool mEscape{false}; diff --git a/core/unittest/prometheus/CMakeLists.txt b/core/unittest/prometheus/CMakeLists.txt index ef9aeddae5..cc34473f47 100644 --- a/core/unittest/prometheus/CMakeLists.txt +++ b/core/unittest/prometheus/CMakeLists.txt @@ -36,6 +36,9 @@ target_link_libraries(prometheus_input_runner_unittest ${UT_BASE_TARGET}) add_executable(textparser_unittest TextParserUnittest.cpp) target_link_libraries(textparser_unittest ${UT_BASE_TARGET}) +add_executable(textparser_simd_unittest TextParserSIMDUnittest.cpp) +target_link_libraries(textparser_simd_unittest ${UT_BASE_TARGET}) + add_executable(scrape_config_unittest ScrapeConfigUnittest.cpp) target_link_libraries(scrape_config_unittest ${UT_BASE_TARGET}) @@ -54,6 +57,7 @@ gtest_discover_tests(scrape_scheduler_unittest) gtest_discover_tests(target_subscriber_scheduler_unittest) gtest_discover_tests(prometheus_input_runner_unittest) gtest_discover_tests(textparser_unittest) +gtest_discover_tests(textparser_simd_unittest) gtest_discover_tests(scrape_config_unittest) gtest_discover_tests(prom_utils_unittest) gtest_discover_tests(prom_asyn_unittest) diff --git a/core/unittest/prometheus/TextParserBenchmark.cpp b/core/unittest/prometheus/TextParserBenchmark.cpp index 7af03beb72..a60a9ebf34 100644 --- a/core/unittest/prometheus/TextParserBenchmark.cpp +++ b/core/unittest/prometheus/TextParserBenchmark.cpp @@ -17,6 +17,7 @@ #include #include "prometheus/labels/TextParser.h" +#include "prometheus/labels/TextParserSIMD.h" #include "unittest/Unittest.h" using namespace std; @@ -48,14 +49,14 @@ class TextParserBenchmark : public testing::Test { private: std::string mRawData = R"""( -test_metric1{k1="v1", k2="v2"} 2.0 1234567890 -test_metric2{k1="v1",k2="v2"} 9.9410452992e+10 -test_metric3{k1="v1",k2="v2"} 9.9410452992e+10 1715829785083 -test_metric4{k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 -test_metric5{k1="v1",k2="v2",} 9.9410452992e+10 1715829785083 -test_metric6{k1="v1",k2="v2", } 9.9410452992e+10 1715829785083 -test_metric7{k1="v1", k2="v2", } 9.9410452992e+10 1715829785083 -test_metric8{k1="v1", k2="v2", } 9.9410452992e+10 1715829785083 +test_metric1{k111111111111="v11111111111", k222222="v2"} 2.0 1234567890 +test_metric2{k111111111111="v11111111111",k222222="v2"} 9.9410452992e+10 +test_metric3{k111111111111="v11111111111",k222222="v2"} 9.9410452992e+10 1715829785083 +test_metric4{k111111111111="v11111111111", k222222="v2" } 9.9410452992e+10 1715829785083 +test_metric5{k111111111111="v11111111111",k222222="v2",} 9.9410452992e+10 1715829785083 +test_metric6{k111111111111="v11111111111",k222222="v2", } 9.9410452992e+10 1715829785083 +test_metric7{k111111111111="v11111111111", k222222="v2", } 9.9410452992e+10 1715829785083 +test_metric8{k111111111111="v11111111111", k222222="v2", } 9.9410452992e+10 1715829785083 )"""; std::string m100MData; std::string m1000MData; @@ -72,6 +73,7 @@ void TextParserBenchmark::TestParse100M() const { cout << "elapsed: " << elapsed.count() << " seconds" << endl; // elapsed: 1.53s in release mode // elapsed: 551MB in release mode + // 2.51s -> 1.26s when rawLine is longer if we use SIMD } void TextParserBenchmark::TestParse1000M() const { From 189fd7d37e7d4d6389c6833a18117b2c6fb931cd Mon Sep 17 00:00:00 2001 From: liqiang Date: Sat, 7 Dec 2024 16:11:23 +0000 Subject: [PATCH 04/11] chore: change to SIMDTextParser --- .../plugin/processor/inner/ProcessorPromParseMetricNative.cpp | 4 ++-- core/plugin/processor/inner/ProcessorPromParseMetricNative.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp b/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp index 95b81a569a..b3200d71e1 100644 --- a/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp +++ b/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp @@ -32,7 +32,7 @@ void ProcessorPromParseMetricNative::Process(PipelineEventGroup& eGroup) { auto timestampMilliSec = StringTo(scrapeTimestampMilliSecStr.to_string()); auto timestamp = timestampMilliSec / 1000; auto nanoSec = timestampMilliSec % 1000 * 1000000; - TextParser parser(mScrapeConfigPtr->mHonorTimestamps); + prom::TextParserSIMD parser(mScrapeConfigPtr->mHonorTimestamps); parser.SetDefaultTimestamp(timestamp, nanoSec); for (auto& e : events) { @@ -49,7 +49,7 @@ bool ProcessorPromParseMetricNative::IsSupportedEvent(const PipelineEventPtr& e) bool ProcessorPromParseMetricNative::ProcessEvent(PipelineEventPtr& e, EventsContainer& newEvents, PipelineEventGroup& eGroup, - TextParser& parser) { + prom::TextParserSIMD& parser) { if (!IsSupportedEvent(e)) { return false; } diff --git a/core/plugin/processor/inner/ProcessorPromParseMetricNative.h b/core/plugin/processor/inner/ProcessorPromParseMetricNative.h index f9c036c58a..2d7e53712b 100644 --- a/core/plugin/processor/inner/ProcessorPromParseMetricNative.h +++ b/core/plugin/processor/inner/ProcessorPromParseMetricNative.h @@ -5,7 +5,7 @@ #include "models/PipelineEventGroup.h" #include "models/PipelineEventPtr.h" #include "pipeline/plugin/interface/Processor.h" -#include "prometheus/labels/TextParser.h" +#include "prometheus/labels/TextParserSIMD.h" #include "prometheus/schedulers/ScrapeConfig.h" namespace logtail { @@ -21,7 +21,7 @@ class ProcessorPromParseMetricNative : public Processor { bool IsSupportedEvent(const PipelineEventPtr&) const override; private: - bool ProcessEvent(PipelineEventPtr&, EventsContainer&, PipelineEventGroup&, TextParser& parser); + bool ProcessEvent(PipelineEventPtr&, EventsContainer&, PipelineEventGroup&, prom::TextParserSIMD& parser); std::unique_ptr mScrapeConfigPtr; #ifdef APSARA_UNIT_TEST_MAIN From 3dfc16a6ea84119b6a31d8b4ce8443b2cc1fc44e Mon Sep 17 00:00:00 2001 From: liqiang Date: Tue, 10 Dec 2024 02:26:01 +0000 Subject: [PATCH 05/11] chore: remove simd in processor --- .../plugin/processor/inner/ProcessorPromParseMetricNative.cpp | 4 ++-- core/plugin/processor/inner/ProcessorPromParseMetricNative.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp b/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp index b3200d71e1..95b81a569a 100644 --- a/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp +++ b/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp @@ -32,7 +32,7 @@ void ProcessorPromParseMetricNative::Process(PipelineEventGroup& eGroup) { auto timestampMilliSec = StringTo(scrapeTimestampMilliSecStr.to_string()); auto timestamp = timestampMilliSec / 1000; auto nanoSec = timestampMilliSec % 1000 * 1000000; - prom::TextParserSIMD parser(mScrapeConfigPtr->mHonorTimestamps); + TextParser parser(mScrapeConfigPtr->mHonorTimestamps); parser.SetDefaultTimestamp(timestamp, nanoSec); for (auto& e : events) { @@ -49,7 +49,7 @@ bool ProcessorPromParseMetricNative::IsSupportedEvent(const PipelineEventPtr& e) bool ProcessorPromParseMetricNative::ProcessEvent(PipelineEventPtr& e, EventsContainer& newEvents, PipelineEventGroup& eGroup, - prom::TextParserSIMD& parser) { + TextParser& parser) { if (!IsSupportedEvent(e)) { return false; } diff --git a/core/plugin/processor/inner/ProcessorPromParseMetricNative.h b/core/plugin/processor/inner/ProcessorPromParseMetricNative.h index 2d7e53712b..f9c036c58a 100644 --- a/core/plugin/processor/inner/ProcessorPromParseMetricNative.h +++ b/core/plugin/processor/inner/ProcessorPromParseMetricNative.h @@ -5,7 +5,7 @@ #include "models/PipelineEventGroup.h" #include "models/PipelineEventPtr.h" #include "pipeline/plugin/interface/Processor.h" -#include "prometheus/labels/TextParserSIMD.h" +#include "prometheus/labels/TextParser.h" #include "prometheus/schedulers/ScrapeConfig.h" namespace logtail { @@ -21,7 +21,7 @@ class ProcessorPromParseMetricNative : public Processor { bool IsSupportedEvent(const PipelineEventPtr&) const override; private: - bool ProcessEvent(PipelineEventPtr&, EventsContainer&, PipelineEventGroup&, prom::TextParserSIMD& parser); + bool ProcessEvent(PipelineEventPtr&, EventsContainer&, PipelineEventGroup&, TextParser& parser); std::unique_ptr mScrapeConfigPtr; #ifdef APSARA_UNIT_TEST_MAIN From 543b70f681b912b45663bcf1ac5d3d4c915ef7f5 Mon Sep 17 00:00:00 2001 From: liqiang Date: Tue, 10 Dec 2024 02:29:49 +0000 Subject: [PATCH 06/11] chore: update code style --- core/prometheus/labels/TextParserSIMD.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/prometheus/labels/TextParserSIMD.h b/core/prometheus/labels/TextParserSIMD.h index 8ab9ba2aad..f481de93ab 100644 --- a/core/prometheus/labels/TextParserSIMD.h +++ b/core/prometheus/labels/TextParserSIMD.h @@ -45,10 +45,10 @@ class TextParserSIMD { private: std::optional FindFirstLetter(const char* s, size_t len, char target); std::optional FindFirstWhiteSpace(const char* s, size_t len); - std::optional FindWhiteSpaceAndExemplar(const char* s, size_t len); std::optional SkipTrailingWhitespace(const char* s, size_t pos); + inline size_t SkipLeadingWhitespace(const char* s, size_t len, size_t pos); void HandleError(const std::string& errMsg); @@ -58,7 +58,6 @@ class TextParserSIMD { void HandleLabelValue(MetricEvent& metricEvent, const char* s, size_t len); void HandleSampleValue(MetricEvent& metricEvent, const char* s, size_t len); void HandleTimestamp(MetricEvent& metricEvent, const char* s, size_t len); - inline size_t SkipLeadingWhitespace(const char* s, size_t len, size_t pos); TextState mState{TextState::Start}; bool mEscape{false}; From 732cd9f01220719ea02b621273c9576d8d2f98ca Mon Sep 17 00:00:00 2001 From: liqiang Date: Tue, 17 Dec 2024 03:02:09 +0000 Subject: [PATCH 07/11] feat: textparser with simd --- core/CMakeLists.txt | 2 + .../inner/ProcessorPromParseMetricNative.cpp | 4 +- .../inner/ProcessorPromParseMetricNative.h | 2 +- core/prometheus/labels/TextParser.cpp | 452 ++++++++++-------- core/prometheus/labels/TextParser.h | 38 +- core/prometheus/labels/TextParserSIMD.cpp | 395 --------------- core/prometheus/labels/TextParserSIMD.h | 76 --- .../prometheus/TextParserBenchmark.cpp | 31 +- .../prometheus/TextParserSIMDUnittest.cpp | 394 --------------- .../prometheus/TextParserUnittest.cpp | 4 +- 10 files changed, 286 insertions(+), 1112 deletions(-) delete mode 100644 core/prometheus/labels/TextParserSIMD.cpp delete mode 100644 core/prometheus/labels/TextParserSIMD.h delete mode 100644 core/unittest/prometheus/TextParserSIMDUnittest.cpp diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 3dd6af2c93..51daca6b2c 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -52,6 +52,8 @@ endif () if (NOT WITHSPL) add_definitions(-D__EXCLUDE_SPL__) +else () + add_definitions(-D__ENABLE_SSE4_2__) endif() # Default C/CXX flags. diff --git a/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp b/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp index 95b81a569a..5c28375b82 100644 --- a/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp +++ b/core/plugin/processor/inner/ProcessorPromParseMetricNative.cpp @@ -32,7 +32,7 @@ void ProcessorPromParseMetricNative::Process(PipelineEventGroup& eGroup) { auto timestampMilliSec = StringTo(scrapeTimestampMilliSecStr.to_string()); auto timestamp = timestampMilliSec / 1000; auto nanoSec = timestampMilliSec % 1000 * 1000000; - TextParser parser(mScrapeConfigPtr->mHonorTimestamps); + prom::TextParser parser(mScrapeConfigPtr->mHonorTimestamps); parser.SetDefaultTimestamp(timestamp, nanoSec); for (auto& e : events) { @@ -49,7 +49,7 @@ bool ProcessorPromParseMetricNative::IsSupportedEvent(const PipelineEventPtr& e) bool ProcessorPromParseMetricNative::ProcessEvent(PipelineEventPtr& e, EventsContainer& newEvents, PipelineEventGroup& eGroup, - TextParser& parser) { + prom::TextParser& parser) { if (!IsSupportedEvent(e)) { return false; } diff --git a/core/plugin/processor/inner/ProcessorPromParseMetricNative.h b/core/plugin/processor/inner/ProcessorPromParseMetricNative.h index f9c036c58a..3300cf25f4 100644 --- a/core/plugin/processor/inner/ProcessorPromParseMetricNative.h +++ b/core/plugin/processor/inner/ProcessorPromParseMetricNative.h @@ -21,7 +21,7 @@ class ProcessorPromParseMetricNative : public Processor { bool IsSupportedEvent(const PipelineEventPtr&) const override; private: - bool ProcessEvent(PipelineEventPtr&, EventsContainer&, PipelineEventGroup&, TextParser& parser); + bool ProcessEvent(PipelineEventPtr&, EventsContainer&, PipelineEventGroup&, prom::TextParser& parser); std::unique_ptr mScrapeConfigPtr; #ifdef APSARA_UNIT_TEST_MAIN diff --git a/core/prometheus/labels/TextParser.cpp b/core/prometheus/labels/TextParser.cpp index a3ecb394ab..97d39d598f 100644 --- a/core/prometheus/labels/TextParser.cpp +++ b/core/prometheus/labels/TextParser.cpp @@ -16,29 +16,21 @@ #include "prometheus/labels/TextParser.h" +#include + #include -#include #include #include -#include "common/StringTools.h" #include "logger/Logger.h" #include "models/MetricEvent.h" #include "models/PipelineEventGroup.h" #include "models/StringView.h" -#include "prometheus/Constants.h" #include "prometheus/Utils.h" using namespace std; -namespace logtail { - -bool IsValidNumberChar(char c) { - static const unordered_set sValidChars - = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', '-', '+', 'e', 'E', 'I', - 'N', 'F', 'T', 'Y', 'i', 'n', 'f', 't', 'y', 'X', 'x', 'N', 'n', 'A', 'a'}; - return sValidChars.count(c); -}; +namespace logtail::prom { TextParser::TextParser(bool honorTimestamps) : mHonorTimestamps(honorTimestamps) { } @@ -69,13 +61,11 @@ PipelineEventGroup TextParser::Parse(const string& content, uint64_t defaultTime } bool TextParser::ParseLine(StringView line, MetricEvent& metricEvent) { - mLine = line; - mPos = 0; mState = TextState::Start; mLabelName.clear(); - mTokenLength = 0; + mEscape = FindFirstLetter(line.data(), line.size(), '\\').has_value(); - HandleStart(metricEvent); + HandleStart(metricEvent, line.data(), line.size()); if (mState == TextState::Done) { return true; @@ -84,228 +74,316 @@ bool TextParser::ParseLine(StringView line, MetricEvent& metricEvent) { return false; } -// start to parse metric sample:test_metric{k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx -void TextParser::HandleStart(MetricEvent& metricEvent) { - SkipLeadingWhitespace(); - auto c = (mPos < mLine.size()) ? mLine[mPos] : '\0'; - if (std::isalpha(c) || c == '_' || c == ':') { - HandleMetricName(metricEvent); - } else { - HandleError("expected metric name"); +std::optional TextParser::FindFirstLetter(const char* s, size_t len, char target) { + size_t res = 0; +#if defined(__ENABLE_SSE4_2__) + __m128i targetVec = _mm_set1_epi8(target); + + while (res + 16 < len) { + __m128i chunk = _mm_loadu_si128(reinterpret_cast(&s[res])); + + __m128i cmp = _mm_cmpeq_epi8(chunk, targetVec); + + int mask = _mm_movemask_epi8(cmp); + + if (mask != 0) { + return res + __builtin_ffs(mask) - 1; + } + + res += 16; + } +#endif + + while (res < len) { + if (s[res] == target) { + return res; + } + res++; } + return std::nullopt; } -// parse:test_metric{k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx -void TextParser::HandleMetricName(MetricEvent& metricEvent) { - char c = (mPos < mLine.size()) ? mLine[mPos] : '\0'; - while (std::isalpha(c) || c == '_' || c == ':' || std::isdigit(c)) { - ++mTokenLength; - ++mPos; - c = (mPos < mLine.size()) ? mLine[mPos] : '\0'; +std::optional TextParser::FindFirstWhiteSpace(const char* s, size_t len) { + size_t res = 0; + +#if defined(__ENABLE_SSE4_2__) + static __m128i sTargetVec1 = _mm_set1_epi8(' '); + static __m128i sTargetVec2 = _mm_set1_epi8('\t'); + + while (res + 16 < len) { + __m128i chunk = _mm_loadu_si128(reinterpret_cast(&s[res])); + + __m128i cmp1 = _mm_cmpeq_epi8(chunk, sTargetVec1); + __m128i cmp2 = _mm_cmpeq_epi8(chunk, sTargetVec2); + + int mask1 = _mm_movemask_epi8(cmp1); + int mask2 = _mm_movemask_epi8(cmp2); + + if (mask1 != 0) { + return res + __builtin_ffs(mask1) - 1; + } + if (mask2 != 0) { + return res + __builtin_ffs(mask2) - 1; + } + + res += 16; } - metricEvent.SetNameNoCopy(mLine.substr(mPos - mTokenLength, mTokenLength)); - mTokenLength = 0; - SkipLeadingWhitespace(); - if (mPos < mLine.size()) { - if (mLine[mPos] == '{') { - ++mPos; - SkipLeadingWhitespace(); - HandleLabelName(metricEvent); - } else { - HandleSampleValue(metricEvent); +#endif + + while (res < len) { + if (s[res] == ' ' || s[res] == '\t') { + return res; } - } else { - HandleError("error end of metric name"); + res++; } + return std::nullopt; } -// parse:k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx -void TextParser::HandleLabelName(MetricEvent& metricEvent) { - char c = (mPos < mLine.size()) ? mLine[mPos] : '\0'; - if (std::isalpha(c) || c == '_') { - while (std::isalpha(c) || c == '_' || std::isdigit(c)) { - ++mTokenLength; - ++mPos; - c = (mPos < mLine.size()) ? mLine[mPos] : '\0'; +std::optional TextParser::FindWhiteSpaceAndExemplar(const char* s, size_t len) { + size_t res = 0; + +#if defined(__ENABLE_SSE4_2__) + static __m128i sTargetVec1 = _mm_set1_epi8(' '); + static __m128i sTargetVec2 = _mm_set1_epi8('\t'); + static __m128i sTargetVec3 = _mm_set1_epi8('#'); + + while (res + 16 < len) { + __m128i chunk = _mm_loadu_si128(reinterpret_cast(&s[res])); + + __m128i cmp1 = _mm_cmpeq_epi8(chunk, sTargetVec1); + __m128i cmp2 = _mm_cmpeq_epi8(chunk, sTargetVec2); + __m128i cmp3 = _mm_cmpeq_epi8(chunk, sTargetVec3); + + int mask1 = _mm_movemask_epi8(cmp1); + int mask2 = _mm_movemask_epi8(cmp2); + int mask3 = _mm_movemask_epi8(cmp3); + + if (mask1 != 0) { + return res + __builtin_ffs(mask1) - 1; + } + if (mask2 != 0) { + return res + __builtin_ffs(mask2) - 1; + } + if (mask3 != 0) { + return res + __builtin_ffs(mask3) - 1; + } + + res += 16; + } +#endif + + while (res < len) { + if (s[res] == ' ' || s[res] == '\t' || s[res] == '#') { + return res; } - mLabelName = mLine.substr(mPos - mTokenLength, mTokenLength); - mTokenLength = 0; - SkipLeadingWhitespace(); - if (mPos == mLine.size() || mLine[mPos] != '=') { - HandleError("expected '=' after label name"); + res++; + } + return std::nullopt; +} + +std::optional TextParser::SkipTrailingWhitespace(const char* s, size_t pos) { + for (; pos > 0 && (s[pos] == ' ' || s[pos] == '\t'); --pos) { + } + if (pos == 0 && (s[pos] == ' ' || s[pos] == '\t')) { + return std::nullopt; + } + return pos; +} + +inline size_t TextParser::SkipLeadingWhitespace(const char* s, size_t len, size_t pos) { + while (pos < len && (s[pos] == ' ' || s[pos] == '\t')) { + pos++; + } + return pos; +} + +void TextParser::HandleError(const string& errMsg) { + LOG_WARNING(sLogger, ("text parser error parsing line", errMsg)); + mState = TextState::Error; +} + +void TextParser::HandleStart(MetricEvent& metricEvent, const char* s, const size_t len) { + auto pos = SkipLeadingWhitespace(s, len, 0); + HandleMetricName(metricEvent, s + pos, len - pos); +} + +void TextParser::HandleMetricName(MetricEvent& metricEvent, const char* s, size_t len) { + auto pos = FindFirstLetter(s, len, '{'); + if (pos.has_value()) { + auto endPos = SkipTrailingWhitespace(s, pos.value() - 1); + if (endPos.has_value()) { + metricEvent.SetNameNoCopy(StringView(s, endPos.value() + 1)); + } else { + HandleError("error end of metric name"); return; } - ++mPos; - SkipLeadingWhitespace(); - HandleEqualSign(metricEvent); - } else if (c == '}') { - ++mPos; - SkipLeadingWhitespace(); - HandleSampleValue(metricEvent); + auto nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); + HandleLabelName(metricEvent, s + nextPos, len - nextPos); } else { - HandleError("invalid character in label name"); + auto nextPos = FindFirstWhiteSpace(s, len); + if (nextPos.has_value()) { + metricEvent.SetNameNoCopy(StringView(s, nextPos.value())); + auto nextNextPos = SkipLeadingWhitespace(s, len, nextPos.value()); + HandleSampleValue(metricEvent, s + nextNextPos, len - nextNextPos); + } } } -// parse:"v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx -void TextParser::HandleEqualSign(MetricEvent& metricEvent) { - if (mPos < mLine.size() && mLine[mPos] == '"') { - ++mPos; - HandleLabelValue(metricEvent); +void TextParser::HandleLabelName(MetricEvent& metricEvent, const char* s, size_t len) { + auto pos = FindFirstLetter(s, len, '='); + if (pos.has_value()) { + auto endPos = SkipTrailingWhitespace(s, pos.value() - 1); + if (endPos.has_value()) { + if (FindFirstLetter(s, endPos.value(), '"').has_value()) { + HandleError("invalid character in label name"); + return; + } + mLabelName = StringView(s, endPos.value() + 1); + } else { + HandleError("error end of metric name"); + return; + } + auto nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); + HandleLabelValue(metricEvent, s + nextPos, len - nextPos); } else { - HandleError("expected '\"' after '='"); + if (len > 0 && s[0] == '}') { + auto nextPos = SkipLeadingWhitespace(s, len, 1); + HandleSampleValue(metricEvent, s + nextPos, len - nextPos); + } else { + HandleError("invalid character in label name"); + } } } -// parse:v1", k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx -void TextParser::HandleLabelValue(MetricEvent& metricEvent) { +void TextParser::HandleLabelValue(MetricEvent& metricEvent, const char* s, size_t len) { // left quote has been consumed // LableValue supports escape char - bool escaped = false; - auto lPos = mPos; - while (mPos < mLine.size() && mLine[mPos] != '"') { - if (mLine[mPos] != '\\') { - if (escaped) { - mEscapedLabelValue.push_back(mLine[mPos]); - } - ++mPos; - ++mTokenLength; - } else { - if (escaped == false) { - // first meet escape char - escaped = true; - mEscapedLabelValue = mLine.substr(lPos, mPos - lPos).to_string(); - } - if (mPos + 1 < mLine.size()) { - // check next char, if it is valid escape char, we can consume two chars and push one escaped char - // if not, we neet to push the two chars - // valid escape char: \", \\, \n - switch (mLine[lPos + 1]) { - case '\\': - case '\"': - mEscapedLabelValue.push_back(mLine[mPos + 1]); - break; - case 'n': - mEscapedLabelValue.push_back('\n'); - break; - default: - mEscapedLabelValue.push_back('\\'); - mEscapedLabelValue.push_back(mLine[mPos + 1]); - break; + if (len == 0 || s[0] != '"') { + HandleError("invalid character in label value"); + return; + } + s = s + 1; + len--; + size_t nextPos = 0; + if (mEscape) { + // slow path + // escape char + string labelValue; + size_t pos = 0; + for (size_t i = 0; i < len; i++) { + if (s[i] == '\\') { + if (i + 1 < len) { + switch (s[i + 1]) { + case 'n': + labelValue.push_back('\n'); + break; + case '\\': + case '\"': + labelValue.push_back(s[i + 1]); + break; + default: + labelValue.push_back('\\'); + labelValue.push_back(s[i + 1]); + break; + } + i++; + } else { + HandleError("invalid escape char"); + return; } - mPos += 2; + } else if (s[i] == '"') { + pos = i; + break; } else { - mEscapedLabelValue.push_back(mLine[mPos + 1]); - ++mPos; + labelValue.push_back(s[i]); } } - } - - if (mPos == mLine.size()) { - HandleError("unexpected end of input in label value"); - return; - } - - if (!escaped) { - metricEvent.SetTagNoCopy(mLabelName, mLine.substr(mPos - mTokenLength, mTokenLength)); + auto sb = metricEvent.GetSourceBuffer()->CopyString(labelValue); + metricEvent.SetTag(mLabelName, StringView(sb.data, sb.size)); + nextPos = SkipLeadingWhitespace(s, len, pos + 1); } else { - metricEvent.SetTag(mLabelName.to_string(), mEscapedLabelValue); - mEscapedLabelValue.clear(); + const auto pos = FindFirstLetter(s, len, '"'); + if (pos.has_value()) { + metricEvent.SetTagNoCopy(mLabelName, StringView(s, pos.value())); + nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); + } else { + HandleError("invalid character in label value"); + return; + } } - mTokenLength = 0; - ++mPos; - SkipLeadingWhitespace(); - if (mPos < mLine.size() && (mLine[mPos] == ',' || mLine[mPos] == '}')) { - HandleCommaOrCloseBrace(metricEvent); + if (s[nextPos] == ',') { + nextPos++; + nextPos = SkipLeadingWhitespace(s, len, nextPos); + if (s[nextPos] == '}') { + nextPos++; + nextPos = SkipLeadingWhitespace(s, len, nextPos); + HandleSampleValue(metricEvent, s + nextPos, len - nextPos); + return; + } + HandleLabelName(metricEvent, s + nextPos, len - nextPos); + } else if (s[nextPos] == '}') { + nextPos++; + nextPos = SkipLeadingWhitespace(s, len, nextPos); + HandleSampleValue(metricEvent, s + nextPos, len - nextPos); } else { - HandleError("unexpected end of input in label value"); + HandleError("invalid character in label value"); } } -// parse:, k2="v2" } 9.9410452992e+10 1715829785083 # exemplarsxxx -// or parse:} 9.9410452992e+10 1715829785083 # exemplarsxxx -void TextParser::HandleCommaOrCloseBrace(MetricEvent& metricEvent) { - char c = (mPos < mLine.size()) ? mLine[mPos] : '\0'; - if (c == ',') { - ++mPos; - SkipLeadingWhitespace(); - HandleLabelName(metricEvent); - } else if (c == '}') { - ++mPos; - SkipLeadingWhitespace(); - HandleSampleValue(metricEvent); +void TextParser::HandleSampleValue(MetricEvent& metricEvent, const char* s, size_t len) { + auto pos = FindWhiteSpaceAndExemplar(s, len); + size_t valueLen = 0; + if (pos.has_value()) { + valueLen = pos.value(); } else { - HandleError("expected ',' or '}' after label value"); + valueLen = len; } -} - -// parse:9.9410452992e+10 1715829785083 # exemplarsxxx -void TextParser::HandleSampleValue(MetricEvent& metricEvent) { - while (mPos < mLine.size() && IsValidNumberChar(mLine[mPos])) { - ++mPos; - ++mTokenLength; - } - - if (mPos < mLine.size() && mLine[mPos] != ' ' && mLine[mPos] != '\t' && mLine[mPos] != '#') { - HandleError("unexpected end of input in sample value"); + if (valueLen == 0) { + HandleError("invalid sample value"); return; } - - auto tmpSampleValue = mLine.substr(mPos - mTokenLength, mTokenLength); - mDoubleStr = tmpSampleValue.to_string(); - + auto tmpSampleValue = StringView(s, valueLen); try { - mSampleValue = std::stod(mDoubleStr); + auto sampleValue = stod(tmpSampleValue.to_string()); + metricEvent.SetValue(sampleValue); } catch (...) { HandleError("invalid sample value"); - mTokenLength = 0; return; } - mDoubleStr.clear(); - - metricEvent.SetValue(mSampleValue); - mTokenLength = 0; - SkipLeadingWhitespace(); - if (mPos == mLine.size() || mLine[mPos] == '#' || !mHonorTimestamps) { + if ((pos.has_value() && s[pos.value()] == '#') || valueLen == len) { metricEvent.SetTimestamp(mDefaultTimestamp, mDefaultNanoTimestamp); mState = TextState::Done; - } else { - HandleTimestamp(metricEvent); + return; } + s = s + pos.value() + 1; + len -= pos.value() + 1; + auto nextPos = SkipLeadingWhitespace(s, len, 0); + HandleTimestamp(metricEvent, s + nextPos, len - nextPos); } - -// parse:1715829785083 # exemplarsxxx -// timestamp will be 1715829785.083 in OpenMetrics -void TextParser::HandleTimestamp(MetricEvent& metricEvent) { +void TextParser::HandleTimestamp(MetricEvent& metricEvent, const char* s, size_t len) { // '#' is for exemplars, and we don't need it - while (mPos < mLine.size() && IsValidNumberChar(mLine[mPos])) { - ++mPos; - ++mTokenLength; - } - if (mPos < mLine.size() && mLine[mPos] != ' ' && mLine[mPos] != '\t' && mLine[mPos] != '#') { - HandleError("unexpected end of input in sample timestamp"); - return; + auto pos = FindWhiteSpaceAndExemplar(s, len); + size_t valueLen = 0; + if (pos.has_value()) { + valueLen = pos.value(); + } else { + valueLen = len; } - - auto tmpTimestamp = mLine.substr(mPos - mTokenLength, mTokenLength); - if (tmpTimestamp.size() == 0) { + if (valueLen == 0) { mState = TextState::Done; return; } - mDoubleStr = tmpTimestamp.to_string(); + auto tmpTimestamp = StringView(s, valueLen); double milliTimestamp = 0; try { - milliTimestamp = stod(mDoubleStr); + milliTimestamp = stod(tmpTimestamp.to_string()); } catch (...) { HandleError("invalid timestamp"); - mTokenLength = 0; return; } - mDoubleStr.clear(); if (milliTimestamp > 1ULL << 63) { HandleError("timestamp overflow"); - mTokenLength = 0; return; } if (milliTimestamp < 1UL << 31) { @@ -318,21 +396,7 @@ void TextParser::HandleTimestamp(MetricEvent& metricEvent) { } else { metricEvent.SetTimestamp(mDefaultTimestamp, mDefaultNanoTimestamp); } - - mTokenLength = 0; - mState = TextState::Done; } -void TextParser::HandleError(const string& errMsg) { - LOG_WARNING(sLogger, ("text parser error parsing line", mLine.to_string() + errMsg)); - mState = TextState::Error; -} - -inline void TextParser::SkipLeadingWhitespace() { - while (mPos < mLine.length() && (mLine[mPos] == ' ' || mLine[mPos] == '\t')) { - mPos++; - } -} - -} // namespace logtail +} // namespace logtail::prom diff --git a/core/prometheus/labels/TextParser.h b/core/prometheus/labels/TextParser.h index 0eb8899667..afa1d12933 100644 --- a/core/prometheus/labels/TextParser.h +++ b/core/prometheus/labels/TextParser.h @@ -20,11 +20,13 @@ #include "models/MetricEvent.h" #include "models/PipelineEventGroup.h" +#include "models/StringView.h" -namespace logtail { +namespace logtail::prom { enum class TextState { Start, Done, Error }; +// no strict grammar for prom class TextParser { public: TextParser() = default; @@ -37,30 +39,26 @@ class TextParser { bool ParseLine(StringView line, MetricEvent& metricEvent); private: - void HandleError(const std::string& errMsg); + std::optional FindFirstLetter(const char* s, size_t len, char target); + std::optional FindFirstWhiteSpace(const char* s, size_t len); + std::optional FindWhiteSpaceAndExemplar(const char* s, size_t len); + + std::optional SkipTrailingWhitespace(const char* s, size_t pos); + inline size_t SkipLeadingWhitespace(const char* s, size_t len, size_t pos); - void HandleStart(MetricEvent& metricEvent); - void HandleMetricName(MetricEvent& metricEvent); - void HandleOpenBrace(MetricEvent& metricEvent); - void HandleLabelName(MetricEvent& metricEvent); - void HandleEqualSign(MetricEvent& metricEvent); - void HandleLabelValue(MetricEvent& metricEvent); - void HandleCommaOrCloseBrace(MetricEvent& metricEvent); - void HandleSampleValue(MetricEvent& metricEvent); - void HandleTimestamp(MetricEvent& metricEvent); - void HandleSpace(MetricEvent& metricEvent); + void HandleError(const std::string& errMsg); - inline void SkipLeadingWhitespace(); + void HandleStart(MetricEvent& metricEvent, const char* s, size_t len); + void HandleMetricName(MetricEvent& metricEvent, const char* s, size_t len); + void HandleLabelName(MetricEvent& metricEvent, const char* s, size_t len); + void HandleLabelValue(MetricEvent& metricEvent, const char* s, size_t len); + void HandleSampleValue(MetricEvent& metricEvent, const char* s, size_t len); + void HandleTimestamp(MetricEvent& metricEvent, const char* s, size_t len); TextState mState{TextState::Start}; - StringView mLine; - std::size_t mPos{0}; + bool mEscape{false}; StringView mLabelName; - std::string mEscapedLabelValue; - double mSampleValue{0.0}; - std::size_t mTokenLength{0}; - std::string mDoubleStr; bool mHonorTimestamps{true}; time_t mDefaultTimestamp{0}; @@ -71,4 +69,4 @@ class TextParser { #endif }; -} // namespace logtail +} // namespace logtail::prom diff --git a/core/prometheus/labels/TextParserSIMD.cpp b/core/prometheus/labels/TextParserSIMD.cpp deleted file mode 100644 index 9f503a3279..0000000000 --- a/core/prometheus/labels/TextParserSIMD.cpp +++ /dev/null @@ -1,395 +0,0 @@ -/* - * Copyright 2024 iLogtail Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "prometheus/labels/TextParserSIMD.h" - -#include - -#include -#include -#include - -#include "logger/Logger.h" -#include "models/MetricEvent.h" -#include "models/PipelineEventGroup.h" -#include "models/StringView.h" -#include "prometheus/Utils.h" - -using namespace std; - -namespace logtail::prom { - -TextParserSIMD::TextParserSIMD(bool honorTimestamps) : mHonorTimestamps(honorTimestamps) { -} - -void TextParserSIMD::SetDefaultTimestamp(uint64_t defaultTimestamp, uint32_t defaultNanoSec) { - mDefaultTimestamp = defaultTimestamp; - mDefaultNanoTimestamp = defaultNanoSec; -} - -PipelineEventGroup -TextParserSIMD::Parse(const std::string& content, uint64_t defaultTimestamp, uint32_t defaultNanoSec) { - SetDefaultTimestamp(defaultTimestamp, defaultNanoSec); - auto eGroup = PipelineEventGroup(std::make_shared()); - std::vector lines; - // pre-reserve vector size by 1024 which is experience value per line - lines.reserve(content.size() / 1024); - SplitStringView(content, '\n', lines); - for (const auto& line : lines) { - if (!IsValidMetric(line)) { - continue; - } - auto metricEvent = eGroup.CreateMetricEvent(); - if (ParseLine(line, *metricEvent)) { - eGroup.MutableEvents().emplace_back(std::move(metricEvent), false, nullptr); - } - } - - return eGroup; -} - -bool TextParserSIMD::ParseLine(StringView line, MetricEvent& metricEvent) { - mState = TextState::Start; - mLabelName.clear(); - mEscape = FindFirstLetter(line.data(), line.size(), '\\').has_value(); - - HandleStart(metricEvent, line.data(), line.size()); - - if (mState == TextState::Done) { - return true; - } - - return false; -} - -std::optional TextParserSIMD::FindFirstLetter(const char* s, size_t len, char target) { - size_t res = 0; - - __m128i targetVec = _mm_set1_epi8(target); - - while (res + 16 < len) { - __m128i chunk = _mm_loadu_si128(reinterpret_cast(&s[res])); - - __m128i cmp = _mm_cmpeq_epi8(chunk, targetVec); - - int mask = _mm_movemask_epi8(cmp); - - if (mask != 0) { - return res + __builtin_ffs(mask) - 1; - } - - res += 16; - } - - while (res < len) { - if (s[res] == target) { - return res; - } - res++; - } - return std::nullopt; -} -std::optional TextParserSIMD::FindFirstWhiteSpace(const char* s, size_t len) { - size_t res = 0; - - static __m128i sTargetVec1 = _mm_set1_epi8(' '); - static __m128i sTargetVec2 = _mm_set1_epi8('\t'); - - while (res + 16 < len) { - __m128i chunk = _mm_loadu_si128(reinterpret_cast(&s[res])); - - __m128i cmp1 = _mm_cmpeq_epi8(chunk, sTargetVec1); - __m128i cmp2 = _mm_cmpeq_epi8(chunk, sTargetVec2); - - int mask1 = _mm_movemask_epi8(cmp1); - int mask2 = _mm_movemask_epi8(cmp2); - - if (mask1 != 0) { - return res + __builtin_ffs(mask1) - 1; - } - if (mask2 != 0) { - return res + __builtin_ffs(mask2) - 1; - } - - res += 16; - } - - while (res < len) { - if (s[res] == ' ' || s[res] == '\t') { - return res; - } - res++; - } - return std::nullopt; -} - -std::optional TextParserSIMD::FindWhiteSpaceAndExemplar(const char* s, size_t len) { - size_t res = 0; - - static __m128i sTargetVec1 = _mm_set1_epi8(' '); - static __m128i sTargetVec2 = _mm_set1_epi8('\t'); - static __m128i sTargetVec3 = _mm_set1_epi8('#'); - - while (res + 16 < len) { - __m128i chunk = _mm_loadu_si128(reinterpret_cast(&s[res])); - - __m128i cmp1 = _mm_cmpeq_epi8(chunk, sTargetVec1); - __m128i cmp2 = _mm_cmpeq_epi8(chunk, sTargetVec2); - __m128i cmp3 = _mm_cmpeq_epi8(chunk, sTargetVec3); - - int mask1 = _mm_movemask_epi8(cmp1); - int mask2 = _mm_movemask_epi8(cmp2); - int mask3 = _mm_movemask_epi8(cmp3); - - if (mask1 != 0) { - return res + __builtin_ffs(mask1) - 1; - } - if (mask2 != 0) { - return res + __builtin_ffs(mask2) - 1; - } - if (mask3 != 0) { - return res + __builtin_ffs(mask3) - 1; - } - - res += 16; - } - - while (res < len) { - if (s[res] == ' ' || s[res] == '\t' || s[res] == '#') { - return res; - } - res++; - } - return std::nullopt; -} - -std::optional TextParserSIMD::SkipTrailingWhitespace(const char* s, size_t pos) { - for (; pos > 0 && (s[pos] == ' ' || s[pos] == '\t'); --pos) { - } - if (pos == 0 && (s[pos] == ' ' || s[pos] == '\t')) { - return std::nullopt; - } - return pos; -} - - -void TextParserSIMD::HandleError(const std::string& errMsg) { - LOG_WARNING(sLogger, ("text parser error parsing line", errMsg)); - mState = TextState::Error; -} - -void TextParserSIMD::HandleStart(MetricEvent& metricEvent, const char* s, const size_t len) { - auto pos = SkipLeadingWhitespace(s, len, 0); - HandleMetricName(metricEvent, s + pos, len - pos); -} -void TextParserSIMD::HandleMetricName(MetricEvent& metricEvent, const char* s, size_t len) { - auto pos = FindFirstLetter(s, len, '{'); - if (pos.has_value()) { - auto endPos = SkipTrailingWhitespace(s, pos.value() - 1); - if (endPos.has_value()) { - metricEvent.SetNameNoCopy(StringView(s, endPos.value() + 1)); - } else { - HandleError("error end of metric name"); - return; - } - auto nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); - HandleLabelName(metricEvent, s + nextPos, len - nextPos); - } else { - auto nextPos = FindFirstWhiteSpace(s, len); - if (nextPos.has_value()) { - metricEvent.SetNameNoCopy(StringView(s, nextPos.value())); - auto nextNextPos = SkipLeadingWhitespace(s, len, nextPos.value()); - HandleSampleValue(metricEvent, s + nextNextPos, len - nextNextPos); - } - } -} -void TextParserSIMD::HandleLabelName(MetricEvent& metricEvent, const char* s, size_t len) { - auto pos = FindFirstLetter(s, len, '='); - if (pos.has_value()) { - auto endPos = SkipTrailingWhitespace(s, pos.value() - 1); - if (endPos.has_value()) { - if (FindFirstLetter(s, endPos.value(), '"').has_value()) { - HandleError("invalid character in label name"); - return; - } - mLabelName = StringView(s, endPos.value() + 1); - } else { - HandleError("error end of metric name"); - return; - } - auto nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); - HandleLabelValue(metricEvent, s + nextPos, len - nextPos); - } else { - if (len > 0 && s[0] == '}') { - auto nextPos = SkipLeadingWhitespace(s, len, 1); - HandleSampleValue(metricEvent, s + nextPos, len - nextPos); - } else { - HandleError("invalid character in label name"); - } - } -} -void TextParserSIMD::HandleLabelValue(MetricEvent& metricEvent, const char* s, size_t len) { - // left quote has been consumed - // LableValue supports escape char - if (len == 0 || s[0] != '"') { - HandleError("invalid character in label value"); - return; - } - s = s + 1; - len--; - size_t nextPos = 0; - if (mEscape) { - // slow path - // escape char - std::string labelValue; - size_t pos = 0; - for (size_t i = 0; i < len; i++) { - if (s[i] == '\\') { - if (i + 1 < len) { - switch (s[i + 1]) { - case 'n': - labelValue.push_back('\n'); - break; - case '\\': - case '\"': - labelValue.push_back(s[i + 1]); - break; - default: - labelValue.push_back('\\'); - labelValue.push_back(s[i + 1]); - break; - } - i++; - } else { - HandleError("invalid escape char"); - return; - } - } else if (s[i] == '"') { - pos = i; - break; - } else { - labelValue.push_back(s[i]); - } - } - auto sb = metricEvent.GetSourceBuffer()->CopyString(labelValue); - metricEvent.SetTag(mLabelName, StringView(sb.data, sb.size)); - nextPos = SkipLeadingWhitespace(s, len, pos + 1); - } else { - const auto pos = FindFirstLetter(s, len, '"'); - if (pos.has_value()) { - metricEvent.SetTagNoCopy(mLabelName, StringView(s, pos.value())); - nextPos = SkipLeadingWhitespace(s, len, pos.value() + 1); - } else { - HandleError("invalid character in label value"); - return; - } - } - if (s[nextPos] == ',') { - nextPos++; - nextPos = SkipLeadingWhitespace(s, len, nextPos); - if (s[nextPos] == '}') { - nextPos++; - nextPos = SkipLeadingWhitespace(s, len, nextPos); - HandleSampleValue(metricEvent, s + nextPos, len - nextPos); - return; - } - HandleLabelName(metricEvent, s + nextPos, len - nextPos); - } else if (s[nextPos] == '}') { - nextPos++; - nextPos = SkipLeadingWhitespace(s, len, nextPos); - HandleSampleValue(metricEvent, s + nextPos, len - nextPos); - } else { - HandleError("invalid character in label value"); - } -} -void TextParserSIMD::HandleSampleValue(MetricEvent& metricEvent, const char* s, size_t len) { - auto pos = FindWhiteSpaceAndExemplar(s, len); - size_t valueLen = 0; - if (pos.has_value()) { - valueLen = pos.value(); - } else { - valueLen = len; - } - if (valueLen == 0) { - HandleError("invalid sample value"); - return; - } - auto tmpSampleValue = StringView(s, valueLen); - try { - auto sampleValue = std::stod(tmpSampleValue.to_string()); - metricEvent.SetValue(sampleValue); - } catch (...) { - HandleError("invalid sample value"); - return; - } - if ((pos.has_value() && s[pos.value()] == '#') || valueLen == len) { - metricEvent.SetTimestamp(mDefaultTimestamp, mDefaultNanoTimestamp); - mState = TextState::Done; - return; - } - s = s + pos.value() + 1; - len -= pos.value() + 1; - auto nextPos = SkipLeadingWhitespace(s, len, 0); - HandleTimestamp(metricEvent, s + nextPos, len - nextPos); -} -void TextParserSIMD::HandleTimestamp(MetricEvent& metricEvent, const char* s, size_t len) { - // '#' is for exemplars, and we don't need it - auto pos = FindWhiteSpaceAndExemplar(s, len); - size_t valueLen = 0; - if (pos.has_value()) { - valueLen = pos.value(); - } else { - valueLen = len; - } - if (valueLen == 0) { - mState = TextState::Done; - return; - } - auto tmpTimestamp = StringView(s, valueLen); - double milliTimestamp = 0; - try { - milliTimestamp = std::stod(tmpTimestamp.to_string()); - } catch (...) { - HandleError("invalid timestamp"); - return; - } - - if (milliTimestamp > 1ULL << 63) { - HandleError("timestamp overflow"); - return; - } - if (milliTimestamp < 1UL << 31) { - milliTimestamp *= 1000; - } - time_t timestamp = (int64_t)milliTimestamp / 1000; - auto ns = ((int64_t)milliTimestamp % 1000) * 1000000; - if (mHonorTimestamps) { - metricEvent.SetTimestamp(timestamp, ns); - } else { - metricEvent.SetTimestamp(mDefaultTimestamp, mDefaultNanoTimestamp); - } - mState = TextState::Done; -} - -inline size_t TextParserSIMD::SkipLeadingWhitespace(const char* s, size_t len, size_t pos) { - while (pos < len && (s[pos] == ' ' || s[pos] == '\t')) { - pos++; - } - return pos; -} - - -} // namespace logtail::prom diff --git a/core/prometheus/labels/TextParserSIMD.h b/core/prometheus/labels/TextParserSIMD.h deleted file mode 100644 index f481de93ab..0000000000 --- a/core/prometheus/labels/TextParserSIMD.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2024 iLogtail Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include - -#include "models/MetricEvent.h" -#include "models/PipelineEventGroup.h" -#include "models/StringView.h" - -namespace logtail::prom { - -enum class TextState { Start, Done, Error }; - -// no strict grammar for prom -class TextParserSIMD { -public: - TextParserSIMD() = default; - explicit TextParserSIMD(bool honorTimestamps); - - void SetDefaultTimestamp(uint64_t defaultTimestamp, uint32_t defaultNanoSec); - - PipelineEventGroup Parse(const std::string& content, uint64_t defaultTimestamp, uint32_t defaultNanoSec); - - bool ParseLine(StringView line, MetricEvent& metricEvent); - -private: - std::optional FindFirstLetter(const char* s, size_t len, char target); - std::optional FindFirstWhiteSpace(const char* s, size_t len); - std::optional FindWhiteSpaceAndExemplar(const char* s, size_t len); - - std::optional SkipTrailingWhitespace(const char* s, size_t pos); - inline size_t SkipLeadingWhitespace(const char* s, size_t len, size_t pos); - - void HandleError(const std::string& errMsg); - - void HandleStart(MetricEvent& metricEvent, const char* s, size_t len); - void HandleMetricName(MetricEvent& metricEvent, const char* s, size_t len); - void HandleLabelName(MetricEvent& metricEvent, const char* s, size_t len); - void HandleLabelValue(MetricEvent& metricEvent, const char* s, size_t len); - void HandleSampleValue(MetricEvent& metricEvent, const char* s, size_t len); - void HandleTimestamp(MetricEvent& metricEvent, const char* s, size_t len); - - TextState mState{TextState::Start}; - bool mEscape{false}; - - StringView mLabelName; - - bool mHonorTimestamps{true}; - time_t mDefaultTimestamp{0}; - uint32_t mDefaultNanoTimestamp{0}; - -#ifdef APSARA_UNIT_TEST_MAIN - friend class TextParserUnittest; -#endif -}; - -} // namespace logtail::prom diff --git a/core/unittest/prometheus/TextParserBenchmark.cpp b/core/unittest/prometheus/TextParserBenchmark.cpp index a60a9ebf34..85028a46ab 100644 --- a/core/unittest/prometheus/TextParserBenchmark.cpp +++ b/core/unittest/prometheus/TextParserBenchmark.cpp @@ -17,12 +17,11 @@ #include #include "prometheus/labels/TextParser.h" -#include "prometheus/labels/TextParserSIMD.h" #include "unittest/Unittest.h" using namespace std; -namespace logtail { +namespace logtail::prom { class TextParserBenchmark : public testing::Test { public: @@ -38,13 +37,6 @@ class TextParserBenchmark : public testing::Test { m100MData += mRawData; repeatCnt -= 1; } - - m1000MData.reserve(1000 * 1024 * 1024); - repeatCnt = 1000 * 1024 * 1024 / mRawData.size(); - while (repeatCnt > 0) { - m1000MData += mRawData; - repeatCnt -= 1; - } } private: @@ -59,7 +51,6 @@ test_metric7{k111111111111="v11111111111", k222222="v2", } 9.9410452992e+10 1715 test_metric8{k111111111111="v11111111111", k222222="v2", } 9.9410452992e+10 1715829785083 )"""; std::string m100MData; - std::string m1000MData; }; void TextParserBenchmark::TestParse100M() const { @@ -71,27 +62,11 @@ void TextParserBenchmark::TestParse100M() const { auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration elapsed = end - start; cout << "elapsed: " << elapsed.count() << " seconds" << endl; - // elapsed: 1.53s in release mode - // elapsed: 551MB in release mode - // 2.51s -> 1.26s when rawLine is longer if we use SIMD -} - -void TextParserBenchmark::TestParse1000M() const { - auto start = std::chrono::high_resolution_clock::now(); - - TextParser parser; - auto res = parser.Parse(m1000MData, 0, 0); - - auto end = std::chrono::high_resolution_clock::now(); - std::chrono::duration elapsed = end - start; - cout << "elapsed: " << elapsed.count() << " seconds" << endl; - // elapsed: 15.4s in release mode - // elapsed: 4960MB in release mode + // 2.51s -> 1.26s if we use SIMD } UNIT_TEST_CASE(TextParserBenchmark, TestParse100M) -UNIT_TEST_CASE(TextParserBenchmark, TestParse1000M) -} // namespace logtail +} // namespace logtail::prom UNIT_TEST_MAIN diff --git a/core/unittest/prometheus/TextParserSIMDUnittest.cpp b/core/unittest/prometheus/TextParserSIMDUnittest.cpp deleted file mode 100644 index 34d23b3622..0000000000 --- a/core/unittest/prometheus/TextParserSIMDUnittest.cpp +++ /dev/null @@ -1,394 +0,0 @@ -/* - * Copyright 2024 iLogtail Authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include "MetricEvent.h" -#include "models/PipelineEventGroup.h" -#include "prometheus/labels/TextParser.h" -#include "prometheus/labels/TextParserSIMD.h" -#include "unittest/Unittest.h" - -using namespace std; - -namespace logtail::prom { - -bool IsDoubleEqual(double a, double b) { - return fabs(a - b) < 0.000001; -} - -class TextParserSIMDUnittest : public testing::Test { -public: - void TestParseMultipleLines() const; - void TestParseMetricWithTagsAndTimestamp() const; - void TestParseMetricWithManyTags() const; - void TestParseUnicodeLabelValue(); - - void TestParseFaliure(); - void TestParseSuccess(); - - void TestHonorTimestamps(); -}; - -void TextParserSIMDUnittest::TestParseMultipleLines() const { - auto parser = TextParserSIMD(); - const auto eGroup = parser.Parse(R"""( -# begin - -test_metric1{k1="v1", k2="v 1.0 - test_metric2{k1="v1", k2="v2"} 2.0 1234567890 -test_metric3{k1="v1",k2="v2"} 9.9410452992e+10 - test_metric4{k1="v1",k2="v2"} 9.9410452992e+10 1715829785083 - test_metric5{k1="v1", k2="v2" } 9.9410452992e+10 1715829785083 -test_metric6{k1="v1",k2="v2",} 9.9410452992e+10 1715829785083 -test_metric7{k1="v1",k2="v2", } 9.9410452992e+10 1715829785083 -test_metric8{k1="v1", k2="v2", } 9.9410452992e+10 1715829785083 - -# end - )""", - 0, - 0); - const auto& events = &eGroup.GetEvents(); - APSARA_TEST_EQUAL(7UL, events->size()); -} -UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseMultipleLines) - -void TextParserSIMDUnittest::TestParseMetricWithTagsAndTimestamp() const { - auto parser = TextParserSIMD(); - string rawData = R"""( - test_metric{k1="v1", k2="v2"} 9.9410452992e+10 1715829785083 - test_metric2{k1="v1", k2="v2"} 2.0 1715829785083 - test_metric3{k1="v1",k2="v2"} 4.2 92233720368547758080000 - )"""; - const auto eGroup = parser.Parse(rawData, 0, 0); - - - // test_metric - const auto& events = &eGroup.GetEvents(); - const auto& event = events->front(); - const auto& metric = event.Get(); - APSARA_TEST_EQUAL("test_metric", metric->GetName().to_string()); - APSARA_TEST_EQUAL(1715829785, metric->GetTimestamp()); - APSARA_TEST_EQUAL(83000000, metric->GetTimestampNanosecond()); - APSARA_TEST_TRUE(IsDoubleEqual(9.9410452992e+10, metric->GetValue()->mValue)); - APSARA_TEST_EQUAL("v1", metric->GetTag("k1").to_string()); - APSARA_TEST_EQUAL("v2", metric->GetTag("k2").to_string()); - - // test_metric2 - const auto& event2 = events->at(1); - const auto& metric2 = event2.Get(); - APSARA_TEST_EQUAL("test_metric2", metric2->GetName().to_string()); - APSARA_TEST_EQUAL(1715829785, metric2->GetTimestamp()); - APSARA_TEST_TRUE(IsDoubleEqual(2.0, metric2->GetValue()->mValue)); - APSARA_TEST_EQUAL("v1", metric2->GetTag("k1").to_string()); - APSARA_TEST_EQUAL("v2", metric2->GetTag("k2").to_string()); - - // test_metric3 is not generated because of timestamp overflow - APSARA_TEST_EQUAL(2UL, events->size()); -} -UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseMetricWithTagsAndTimestamp) - -void TextParserSIMDUnittest::TestParseMetricWithManyTags() const { - auto parser = TextParserSIMD(); - string rawData - = R"""(container_blkio_device_usage_total{container="",device="/dev/nvme0n1",id="/",image="",major="259",minor="0",name="",namespace="",operation="Async",pod=""} 9.9410452992e+10 1715829785083)"""; - const auto eGroup = parser.Parse(rawData, 1715829785, 83000000); - const auto& events = &eGroup.GetEvents(); - APSARA_TEST_EQUAL(1UL, events->size()); - const auto& event = events->front(); - const auto& metric = event.Get(); - APSARA_TEST_EQUAL("container_blkio_device_usage_total", metric->GetName().to_string()); - APSARA_TEST_EQUAL(1715829785, metric->GetTimestamp()); - APSARA_TEST_TRUE(IsDoubleEqual(9.9410452992e+10, metric->GetValue()->mValue)); - - APSARA_TEST_EQUAL("", metric->GetTag("container").to_string()); - APSARA_TEST_EQUAL("/dev/nvme0n1", metric->GetTag("device").to_string()); - APSARA_TEST_EQUAL("/", metric->GetTag("id").to_string()); - APSARA_TEST_EQUAL("", metric->GetTag("image").to_string()); - APSARA_TEST_EQUAL("259", metric->GetTag("major").to_string()); - APSARA_TEST_EQUAL("0", metric->GetTag("minor").to_string()); - APSARA_TEST_EQUAL("", metric->GetTag("name").to_string()); - APSARA_TEST_EQUAL("", metric->GetTag("namespace").to_string()); - APSARA_TEST_EQUAL("Async", metric->GetTag("operation").to_string()); - APSARA_TEST_EQUAL("", metric->GetTag("pod").to_string()); -} -UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseMetricWithManyTags) - -void TextParserSIMDUnittest::TestParseFaliure() { - auto f = [](const std::string& content) { - TextParserSIMD parser; - PipelineEventGroup eGroup = parser.Parse(content, 0, 0); - APSARA_TEST_EQUAL(0UL, eGroup.GetEvents().size()); - }; - - // Empty lines and comments - f(""); - f(" "); - f("\t"); - f("\t \r"); - f("\t\t \n\n # foobar"); - f("#foobar"); - f("#foobar\n"); - - // invalid tags - f("a{"); - f("a { "); - f("a {foo"); - f("a {foo} 3"); - f("a {foo ="); - f("a {foo =\"bar"); - f("a {foo =\"b\\ar"); - f("a {foo = \"bar\""); - f("a {foo =\"bar\","); - f("a {foo =\"bar\" , "); - f("a {foo =\"bar\" , baz } 2"); - - // Invalid tags - see https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4284 - f(R"(a{"__name__":"upsd_time_left_ns","host":"myhost", "status_OB":"true"} 12)"); - f(R"(a{host:"myhost"} 12)"); - f(R"(a{host:"myhost",foo="bar"} 12)"); - - // Empty metric name - f(R"({foo="bar"})"); - - // Invalid quotes for label value - f(R"({foo='bar'} 23)"); - f(R"({foo=`bar`} 23"); - - // Missing value - f("aaa"); - f(" aaa"); - f(" aaa "); - f(" aaa \n"); - f(R"( aa{foo="bar"} )" - + std::string("\n")); - - // Invalid value - f("foo bar"); - f("foo bar 124"); - - // Invalid timestamp - f("foo 123 bar"); -} -UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseFaliure) - -void TextParserSIMDUnittest::TestParseSuccess() { - TextParserSIMD parser; - string rawData; - // single value - rawData = "foobar 123"; - auto res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foobar"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 123.0)); - - rawData = "foobar 123.456 789\n"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foobar"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 123.456)); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 789); - - rawData = R"( - # TYPE cassandra_token_ownership_ratio gauge -cassandra_token_ownership_ratio 78.9)"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), - "cassandra_token_ownership_ratio"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 78.9)); - - // `#` char in label value - rawData = R"(foo{bar="#1 az"} 24)"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foo"); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("bar").to_string(), "#1 az"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 24.0)); - - // Incorrectly escaped backlash. This is real-world case, which must be supported. - rawData = R"(mssql_sql_server_active_transactions_sec{loginname="domain\somelogin",env="develop"} 56)"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), - "mssql_sql_server_active_transactions_sec"); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("loginname").to_string(), "domain\\somelogin"); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("env").to_string(), "develop"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 56.0)); - - rawData = R"(foo_bucket{le="10",a="#b"} 17)"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foo_bucket"); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("le").to_string(), "10"); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("a").to_string(), "#b"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 17.0)); - - // "Infinity" word - this has been added in OpenMetrics. - // See https://github.com/OpenObservability/OpenMetrics/blob/master/OpenMetrics.md - // Checks for https://github.com/VictoriaMetrics/VictoriaMetrics/issues/924 - rawData = R"(foo Infinity - bar +Infinity - baz -infinity - aaa +inf - bbb -INF - ccc INF)"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().size(), 6UL); - APSARA_TEST_EQUAL(res.GetEvents()[0].Cast().GetName().to_string(), "foo"); - APSARA_TEST_EQUAL(res.GetEvents()[0].Cast().GetValue()->mValue, - std::numeric_limits::infinity()); - APSARA_TEST_EQUAL(res.GetEvents()[1].Cast().GetName().to_string(), "bar"); - APSARA_TEST_EQUAL(res.GetEvents()[1].Cast().GetValue()->mValue, - std::numeric_limits::infinity()); - APSARA_TEST_EQUAL(res.GetEvents()[2].Cast().GetName().to_string(), "baz"); - APSARA_TEST_EQUAL(res.GetEvents()[2].Cast().GetValue()->mValue, - -std::numeric_limits::infinity()); - APSARA_TEST_EQUAL(res.GetEvents()[3].Cast().GetName().to_string(), "aaa"); - APSARA_TEST_EQUAL(res.GetEvents()[3].Cast().GetValue()->mValue, - std::numeric_limits::infinity()); - APSARA_TEST_EQUAL(res.GetEvents()[4].Cast().GetName().to_string(), "bbb"); - APSARA_TEST_EQUAL(res.GetEvents()[4].Cast().GetValue()->mValue, - -std::numeric_limits::infinity()); - APSARA_TEST_EQUAL(res.GetEvents()[5].Cast().GetName().to_string(), "ccc"); - APSARA_TEST_EQUAL(res.GetEvents()[5].Cast().GetValue()->mValue, - std::numeric_limits::infinity()); - - // tags - rawData = R"(foo{bar="b\"a\\z"} -1.2)"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foo"); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("bar").to_string(), "b\"a\\z"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, -1.2)); - - // Empty tags - rawData = R"(foo {bar="baz",aa="",x="y"} 1 2)"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "foo"); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("bar").to_string(), "baz"); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("aa").to_string(), ""); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("x").to_string(), "y"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 1.0)); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 2); - - // Multi lines with invalid line - rawData = "\t foo\t { } 0.3\t 2\naaa\n barbaz 0.34 43\n"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().size(), 2UL); - APSARA_TEST_EQUAL(res.GetEvents()[0].Cast().GetName().to_string(), "foo"); - APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents()[0].Cast().GetValue()->mValue, 0.3)); - APSARA_TEST_EQUAL(res.GetEvents()[0].Cast().GetTimestamp(), 2); - APSARA_TEST_EQUAL(res.GetEvents()[1].Cast().GetName().to_string(), "barbaz"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents()[1].Cast().GetValue()->mValue, 0.34)); - APSARA_TEST_EQUAL(res.GetEvents()[1].Cast().GetTimestamp(), 43); - - // Spaces around tags - rawData = R"(vm_accounting { name="vminsertRows", accountID = "1" , projectID= "1" } 277779100)"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "vm_accounting"); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("name").to_string(), "vminsertRows"); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("accountID").to_string(), "1"); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTag("projectID").to_string(), "1"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 277779100.0)); - - // Exemplars - rawData = "abc 123 456 # foobar"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "abc"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 123.0)); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 456); - - // float timestamp - rawData = "abc 123 456.789"; - res = parser.Parse(rawData, 0, 0); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetName().to_string(), "abc"); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetValue()->mValue, 123.0)); - APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents().back().Cast().GetTimestamp(), 456)); - APSARA_TEST_TRUE( - IsDoubleEqual(res.GetEvents().back().Cast().GetTimestampNanosecond().value(), 789000000)); -} - -UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseSuccess) - -void TextParserSIMDUnittest::TestHonorTimestamps() { - // false - TextParserSIMD parser(false); - // has timestamp - std::string rawData = "abc 123 456"; - PipelineEventGroup res = parser.Parse(rawData, 789, 111); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 789); - APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents().back().Cast().GetTimestampNanosecond().value(), 111)); - - // no timestamp - rawData = "abc 123"; - res = parser.Parse(rawData, 789, 111); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 789); - APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents().back().Cast().GetTimestampNanosecond().value(), 111)); - - - // true - parser.mHonorTimestamps = true; - // has timestamp - rawData = "abc 123 456"; - res = parser.Parse(rawData, 789, 111); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 456); - APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents().back().Cast().GetTimestampNanosecond().value(), 0)); - - // no timestamp - rawData = "abc 123"; - res = parser.Parse(rawData, 789, 111); - APSARA_TEST_EQUAL(res.GetEvents().back().Cast().GetTimestamp(), 789); - APSARA_TEST_TRUE(IsDoubleEqual(res.GetEvents().back().Cast().GetTimestampNanosecond().value(), 111)); -} - -UNIT_TEST_CASE(TextParserSIMDUnittest, TestHonorTimestamps) - -void TextParserSIMDUnittest::TestParseUnicodeLabelValue() { - auto parser = TextParserSIMD(); - string rawData - = R"""(container_blkio_device_usage_total{container="",device="/dev/nvme0n1δΈ­ζ–‡",id="/πŸ˜€",image="",major="259",minor="0",name="",namespace="",operation="Async",pod=""} 9.9410452992e+10 1715829785083)"""; - const auto eGroup = parser.Parse(rawData, 1715829785, 83000000); - const auto& events = &eGroup.GetEvents(); - APSARA_TEST_EQUAL(1UL, events->size()); - const auto& event = events->front(); - const auto& metric = event.Get(); - APSARA_TEST_EQUAL("container_blkio_device_usage_total", metric->GetName().to_string()); - APSARA_TEST_EQUAL(1715829785, metric->GetTimestamp()); - APSARA_TEST_TRUE(IsDoubleEqual(9.9410452992e+10, metric->GetValue()->mValue)); - - APSARA_TEST_EQUAL("", metric->GetTag("container").to_string()); - APSARA_TEST_EQUAL("/dev/nvme0n1δΈ­ζ–‡", metric->GetTag("device").to_string()); - APSARA_TEST_EQUAL("/πŸ˜€", metric->GetTag("id").to_string()); - APSARA_TEST_EQUAL("", metric->GetTag("image").to_string()); - APSARA_TEST_EQUAL("259", metric->GetTag("major").to_string()); - APSARA_TEST_EQUAL("0", metric->GetTag("minor").to_string()); - APSARA_TEST_EQUAL("", metric->GetTag("name").to_string()); - APSARA_TEST_EQUAL("", metric->GetTag("namespace").to_string()); - APSARA_TEST_EQUAL("Async", metric->GetTag("operation").to_string()); - APSARA_TEST_EQUAL("", metric->GetTag("pod").to_string()); -} - -UNIT_TEST_CASE(TextParserSIMDUnittest, TestParseUnicodeLabelValue) - -} // namespace logtail - -UNIT_TEST_MAIN diff --git a/core/unittest/prometheus/TextParserUnittest.cpp b/core/unittest/prometheus/TextParserUnittest.cpp index 7af513ba2e..0474f7a981 100644 --- a/core/unittest/prometheus/TextParserUnittest.cpp +++ b/core/unittest/prometheus/TextParserUnittest.cpp @@ -23,7 +23,7 @@ using namespace std; -namespace logtail { +namespace logtail::prom { bool IsDoubleEqual(double a, double b) { return fabs(a - b) < 0.000001; @@ -388,6 +388,6 @@ void TextParserUnittest::TestParseUnicodeLabelValue() { UNIT_TEST_CASE(TextParserUnittest, TestParseUnicodeLabelValue) -} // namespace logtail +} // namespace logtail::prom UNIT_TEST_MAIN From 012acb78d8d5e5e1625f8678a78d0fa3b8bdcf55 Mon Sep 17 00:00:00 2001 From: liqiang Date: Tue, 17 Dec 2024 03:14:36 +0000 Subject: [PATCH 08/11] chore: remove textparser_simd ut --- core/unittest/prometheus/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/core/unittest/prometheus/CMakeLists.txt b/core/unittest/prometheus/CMakeLists.txt index cc34473f47..ef9aeddae5 100644 --- a/core/unittest/prometheus/CMakeLists.txt +++ b/core/unittest/prometheus/CMakeLists.txt @@ -36,9 +36,6 @@ target_link_libraries(prometheus_input_runner_unittest ${UT_BASE_TARGET}) add_executable(textparser_unittest TextParserUnittest.cpp) target_link_libraries(textparser_unittest ${UT_BASE_TARGET}) -add_executable(textparser_simd_unittest TextParserSIMDUnittest.cpp) -target_link_libraries(textparser_simd_unittest ${UT_BASE_TARGET}) - add_executable(scrape_config_unittest ScrapeConfigUnittest.cpp) target_link_libraries(scrape_config_unittest ${UT_BASE_TARGET}) @@ -57,7 +54,6 @@ gtest_discover_tests(scrape_scheduler_unittest) gtest_discover_tests(target_subscriber_scheduler_unittest) gtest_discover_tests(prometheus_input_runner_unittest) gtest_discover_tests(textparser_unittest) -gtest_discover_tests(textparser_simd_unittest) gtest_discover_tests(scrape_config_unittest) gtest_discover_tests(prom_utils_unittest) gtest_discover_tests(prom_asyn_unittest) From aa64981abc0e79318888da41aa221800bfdb9fa6 Mon Sep 17 00:00:00 2001 From: liqiang Date: Tue, 17 Dec 2024 03:30:49 +0000 Subject: [PATCH 09/11] chore: update ut --- .../processor/ProcessorPromParseMetricNativeUnittest.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/core/unittest/processor/ProcessorPromParseMetricNativeUnittest.cpp b/core/unittest/processor/ProcessorPromParseMetricNativeUnittest.cpp index c0481a1b94..4b8155e523 100644 --- a/core/unittest/processor/ProcessorPromParseMetricNativeUnittest.cpp +++ b/core/unittest/processor/ProcessorPromParseMetricNativeUnittest.cpp @@ -14,14 +14,11 @@ * limitations under the License. */ -#include "LogEvent.h" #include "MetricEvent.h" #include "StringTools.h" #include "common/JsonUtil.h" #include "models/PipelineEventGroup.h" #include "plugin/processor/inner/ProcessorPromParseMetricNative.h" -#include "prometheus/Constants.h" -#include "prometheus/labels/TextParser.h" #include "prometheus/schedulers/ScrapeScheduler.h" #include "unittest/Unittest.h" @@ -76,7 +73,6 @@ void ProcessorParsePrometheusMetricUnittest::TestProcess() { APSARA_TEST_TRUE(processor.Init(config)); // make events - auto parser = TextParser(); auto splitByLines = [](const std::string& content) { PipelineEventGroup eGroup(std::make_shared()); From 5f11f21e97ade2c79ac3b68a5b33cbeb51540707 Mon Sep 17 00:00:00 2001 From: liqiang Date: Tue, 17 Dec 2024 04:02:46 +0000 Subject: [PATCH 10/11] chore: update ut --- .../processor/ProcessorPromRelabelMetricNativeUnittest.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/unittest/processor/ProcessorPromRelabelMetricNativeUnittest.cpp b/core/unittest/processor/ProcessorPromRelabelMetricNativeUnittest.cpp index 39f4cde959..f5790cb92a 100644 --- a/core/unittest/processor/ProcessorPromRelabelMetricNativeUnittest.cpp +++ b/core/unittest/processor/ProcessorPromRelabelMetricNativeUnittest.cpp @@ -24,7 +24,7 @@ using namespace std; -namespace logtail { +namespace logtail::prom { class ProcessorPromRelabelMetricNativeUnittest : public testing::Test { public: void SetUp() override { mContext.SetConfigName("project##config_0"); } @@ -281,6 +281,6 @@ UNIT_TEST_CASE(ProcessorPromRelabelMetricNativeUnittest, TestAddAutoMetrics) UNIT_TEST_CASE(ProcessorPromRelabelMetricNativeUnittest, TestHonorLabels) -} // namespace logtail +} // namespace logtail::prom UNIT_TEST_MAIN \ No newline at end of file From 2c8459f18c430b80c4ceaf09df8f104d2625ea3c Mon Sep 17 00:00:00 2001 From: liqiang Date: Tue, 17 Dec 2024 08:25:40 +0000 Subject: [PATCH 11/11] chore: update macro define of SSE4.2 --- core/CMakeLists.txt | 4 +--- core/pipeline/plugin/PluginRegistry.cpp | 4 ++-- core/prometheus/labels/TextParser.cpp | 6 +++--- core/unittest/CMakeLists.txt | 2 +- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 51daca6b2c..6ab1f0a6a1 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -51,9 +51,7 @@ else () endif () if (NOT WITHSPL) - add_definitions(-D__EXCLUDE_SPL__) -else () - add_definitions(-D__ENABLE_SSE4_2__) + add_definitions(-D__EXCLUDE_SSE4_2__) endif() # Default C/CXX flags. diff --git a/core/pipeline/plugin/PluginRegistry.cpp b/core/pipeline/plugin/PluginRegistry.cpp index 6a0061c68a..d205aa7075 100644 --- a/core/pipeline/plugin/PluginRegistry.cpp +++ b/core/pipeline/plugin/PluginRegistry.cpp @@ -61,7 +61,7 @@ #include "plugin/processor/inner/ProcessorSplitLogStringNative.h" #include "plugin/processor/inner/ProcessorSplitMultilineLogStringNative.h" #include "plugin/processor/inner/ProcessorTagNative.h" -#if defined(__linux__) && !defined(__ANDROID__) && !defined(__EXCLUDE_SPL__) +#if defined(__linux__) && !defined(__ANDROID__) && !defined(__EXCLUDE_SSE4_2__) #include "plugin/processor/ProcessorSPL.h" #endif @@ -154,7 +154,7 @@ void PluginRegistry::LoadStaticPlugins() { RegisterProcessorCreator(new StaticProcessorCreator()); RegisterProcessorCreator(new StaticProcessorCreator()); RegisterProcessorCreator(new StaticProcessorCreator()); -#if defined(__linux__) && !defined(__ANDROID__) && !defined(__EXCLUDE_SPL__) +#if defined(__linux__) && !defined(__ANDROID__) && !defined(__EXCLUDE_SSE4_2__) if (BOOL_FLAG(enable_processor_spl)) { RegisterProcessorCreator(new StaticProcessorCreator()); } diff --git a/core/prometheus/labels/TextParser.cpp b/core/prometheus/labels/TextParser.cpp index 97d39d598f..07f8ace523 100644 --- a/core/prometheus/labels/TextParser.cpp +++ b/core/prometheus/labels/TextParser.cpp @@ -76,7 +76,7 @@ bool TextParser::ParseLine(StringView line, MetricEvent& metricEvent) { std::optional TextParser::FindFirstLetter(const char* s, size_t len, char target) { size_t res = 0; -#if defined(__ENABLE_SSE4_2__) +#if !defined(__EXCLUDE_SSE4_2__) __m128i targetVec = _mm_set1_epi8(target); while (res + 16 < len) { @@ -106,7 +106,7 @@ std::optional TextParser::FindFirstLetter(const char* s, size_t len, cha std::optional TextParser::FindFirstWhiteSpace(const char* s, size_t len) { size_t res = 0; -#if defined(__ENABLE_SSE4_2__) +#if !defined(__EXCLUDE_SSE4_2__) static __m128i sTargetVec1 = _mm_set1_epi8(' '); static __m128i sTargetVec2 = _mm_set1_epi8('\t'); @@ -142,7 +142,7 @@ std::optional TextParser::FindFirstWhiteSpace(const char* s, size_t len) std::optional TextParser::FindWhiteSpaceAndExemplar(const char* s, size_t len) { size_t res = 0; -#if defined(__ENABLE_SSE4_2__) +#if !defined(__EXCLUDE_SSE4_2__) static __m128i sTargetVec1 = _mm_set1_epi8(' '); static __m128i sTargetVec2 = _mm_set1_epi8('\t'); static __m128i sTargetVec3 = _mm_set1_epi8('#'); diff --git a/core/unittest/CMakeLists.txt b/core/unittest/CMakeLists.txt index 41f1601069..bed74003ba 100644 --- a/core/unittest/CMakeLists.txt +++ b/core/unittest/CMakeLists.txt @@ -91,7 +91,7 @@ if (UNIX) endif() # add core subdir set(UT_BASE_TARGET "unittest_base") - add_definitions(-D__EXCLUDE_SPL__) + add_definitions(-D__EXCLUDE_SSE4_2__) add_library(${UT_BASE_TARGET} SHARED ${SOURCE_FILES_CORE}) target_compile_options(${UT_BASE_TARGET} PRIVATE -Werror) add_core_subdir()