From f063820a89c6a4449a514443b1f0509fa4cbd1cd Mon Sep 17 00:00:00 2001 From: Ry Biesemeyer Date: Thu, 3 May 2018 21:51:04 +0000 Subject: [PATCH] make quoted capture less greedy when we have unambiguous separator In [logstash-plugins/logstash-filter-kv#60][], GitHub user @robcowart reports that in some scenarios we can fail to split on match of `field_split_pattern`. The example given is a partially-quoted value, with additional bytes between it and the unambiguous separator, followed by another key and quoted value. Since the quoted value isn't immediately followed by a field splitter, our semi-greedy quoted-value capture was too greedy, continuing to consume until it found a close-quote that would be followed by either a field-split or EOF. Here, we become much less greedy, capturing any sequence of characters that is _not_ the close-quote character. [logstash-plugins/logstash-filter-kv#60]: https://github.com/logstash-plugins/logstash-filter-kv/issues/60 Fixes #62 --- lib/logstash/filters/kv.rb | 4 ++-- spec/filters/kv_spec.rb | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/lib/logstash/filters/kv.rb b/lib/logstash/filters/kv.rb index a9b6920..7a3c429 100644 --- a/lib/logstash/filters/kv.rb +++ b/lib/logstash/filters/kv.rb @@ -422,8 +422,8 @@ def quoted_capture(quote_sequence, close_quote_sequence=quote_sequence) open_pattern = /#{Regexp.quote(quote_sequence)}/ close_pattern = /#{Regexp.quote(close_quote_sequence)}/ - # matches a sequence of zero or more characters that is followed by the `close_quote_sequence` - quoted_value_pattern = /(?:.)*?(?=#{Regexp.quote(close_quote_sequence)})/ + # matches a sequence of zero or more characters that are _not_ in the `close_quote_sequence` + quoted_value_pattern = /[^#{Regexp.quote(close_quote_sequence)}]*/ /#{open_pattern}(#{quoted_value_pattern})#{close_pattern}/ end diff --git a/spec/filters/kv_spec.rb b/spec/filters/kv_spec.rb index a1f65d4..6f5dbe3 100644 --- 
a/spec/filters/kv_spec.rb +++ b/spec/filters/kv_spec.rb @@ -833,6 +833,40 @@ it_behaves_like "parsing all fields and values" end + context 'multi-char field split pattern with value that begins quoted and contains more unquoted' do + let(:message) { 'foo=bar!!!!!baz="quoted stuff" and more unquoted!!!!!msg="fully-quoted with a part! of the separator"!!!!!blip="this!!!!!is it"!!!!!empty=""!!!!!non-empty="foo"' } + let(:options) { + { + "field_split_pattern" => "!!!!!" + } + } + it 'gets the right bits' do + subject.filter(event) + expect(event.get("foo")).to eq('bar') + expect(event.get("baz")).to eq('"quoted stuff" and more unquoted') + expect(event.get("msg")).to eq('fully-quoted with a part! of the separator') + expect(event.get("blip")).to eq('this!!!!!is it') + expect(event.get("empty")).to be_nil + expect(event.get("non-empty")).to eq('foo') + end + end + + context 'standard field split pattern with value that begins quoted and contains more unquoted' do + let(:message) { 'foo=bar baz="quoted stuff" and more unquoted msg="some fully-quoted message " empty="" non-empty="foo"' } + let(:options) { + { + } + } + it 'gets the right bits' do + subject.filter(event) + expect(event.get("foo")).to eq('bar') + expect(event.get("baz")).to eq('quoted stuff') # NOTE: outside the quotes is truncated because field split pattern wins. + expect(event.get("msg")).to eq('some fully-quoted message ') + expect(event.get("empty")).to be_nil + expect(event.get("non-empty")).to eq('foo') + end + end + context "field and value split multi" do let(:message) { "hello::world__foo::bar__baz::fizz__doublequoted::\"hello world\"__singlequoted::'hello world'__bracketsone::(hello world)__bracketstwo::[hello world]__bracketsthree::" } let(:options) {