From d53e041a3e8cb6a7374ddb6b7704466f3bcdd3cf Mon Sep 17 00:00:00 2001 From: Noah Treuhaft Date: Mon, 9 Dec 2024 16:06:08 -0500 Subject: [PATCH] Implement vector non-predicate search terms (#5521) Specifically, this includes regular expression, glob, keyword, and literal search terms (i.e., dag.Search and dag.RegexpSearch). --- compiler/kernel/vexpr.go | 41 ++++- compiler/parser/ztests/bytes.yaml | 2 + compiler/parser/ztests/esc-quote.yaml | 2 + compiler/parser/ztests/glob-numeric.yaml | 2 + compiler/parser/ztests/in-requires-space.yaml | 2 + compiler/parser/ztests/ipv6.yaml | 2 + .../parser/ztests/leading-quotation-mark.yaml | 2 + compiler/parser/ztests/match-double.yaml | 2 + compiler/parser/ztests/match-parentheses.yaml | 2 + compiler/parser/ztests/match.yaml | 2 + compiler/parser/ztests/search.yaml | 2 + compiler/parser/ztests/unicode-keyword.yaml | 2 + compiler/ztests/regexp-search.yaml | 2 + compiler/ztests/search-regexp-not-glob.yaml | 2 + compiler/ztests/search-type-value.yaml | 2 + runtime/sam/expr/filter.go | 10 +- runtime/vam/expr/search.go | 142 ++++++++++++++++++ .../expr/search-escaped-asterisk.yaml} | 2 + .../expr/search-escaped-equal-sign.yaml} | 2 + .../expr/search-glob.yaml} | 2 + .../expr}/search-nested-field-regexp.yaml | 2 + .../expr}/search-nested-field.yaml | 2 + .../expr}/search-primitives.yaml | 2 + .../expr/search-unescaped-asterisk.yaml} | 2 + .../search-regexp-escapes.yaml} | 2 + 25 files changed, 228 insertions(+), 9 deletions(-) create mode 100644 runtime/vam/expr/search.go rename runtime/{sam/expr/ztests/filter-escaped-asterisk.yaml => ztests/expr/search-escaped-asterisk.yaml} (87%) rename runtime/{sam/expr/ztests/filter-escaped-equal-sign.yaml => ztests/expr/search-escaped-equal-sign.yaml} (87%) rename runtime/{sam/expr/ztests/glob.yaml => ztests/expr/search-glob.yaml} (92%) rename runtime/{sam/expr/ztests => ztests/expr}/search-nested-field-regexp.yaml (95%) rename runtime/{sam/expr/ztests => ztests/expr}/search-nested-field.yaml 
(94%) rename runtime/{sam/expr/ztests => ztests/expr}/search-primitives.yaml (83%) rename runtime/{sam/expr/ztests/filter-unescaped-asterisk.yaml => ztests/expr/search-unescaped-asterisk.yaml} (88%) rename runtime/{sam/expr/ztests/regexp-escapes.yaml => ztests/search-regexp-escapes.yaml} (93%) diff --git a/compiler/kernel/vexpr.go b/compiler/kernel/vexpr.go index eeb9137aee..cd42566cfa 100644 --- a/compiler/kernel/vexpr.go +++ b/compiler/kernel/vexpr.go @@ -4,12 +4,15 @@ import ( "errors" "fmt" + "github.com/brimdata/super" "github.com/brimdata/super/compiler/dag" "github.com/brimdata/super/pkg/field" + "github.com/brimdata/super/runtime/sam/expr" "github.com/brimdata/super/runtime/sam/expr/function" vamexpr "github.com/brimdata/super/runtime/vam/expr" vamfunction "github.com/brimdata/super/runtime/vam/expr/function" "github.com/brimdata/super/zson" + "golang.org/x/text/unicode/norm" ) func (b *Builder) compileVamExpr(e dag.Expr) (vamexpr.Evaluator, error) { @@ -27,8 +30,8 @@ func (b *Builder) compileVamExpr(e dag.Expr) (vamexpr.Evaluator, error) { return vamexpr.NewLiteral(val), nil //case *dag.Var: // return vamexpr.NewVar(e.Slot), nil - //case *dag.Search: - // return b.compileSearch(e) + case *dag.Search: + return b.compileVamSearch(e) case *dag.This: return vamexpr.NewDottedExpr(b.zctx(), field.Path(e.Path)), nil case *dag.Dot: @@ -47,8 +50,8 @@ func (b *Builder) compileVamExpr(e dag.Expr) (vamexpr.Evaluator, error) { return b.compileVamCall(e) //case *dag.RegexpMatch: // return b.compileVamRegexpMatch(e) - //case *dag.RegexpSearch: - // return b.compileVamRegexpSearch(e) + case *dag.RegexpSearch: + return b.compileVamRegexpSearch(e) case *dag.RecordExpr: return b.compileVamRecordExpr(e) //case *dag.SetExpr: @@ -244,6 +247,36 @@ func (b *Builder) compileVamRecordExpr(e *dag.RecordExpr) (vamexpr.Evaluator, er return vamexpr.NewRecordExpr(b.zctx(), elems), nil } +func (b *Builder) compileVamRegexpSearch(search *dag.RegexpSearch) (vamexpr.Evaluator, error) { + e, 
err := b.compileVamExpr(search.Expr) + if err != nil { + return nil, err + } + re, err := expr.CompileRegexp(search.Pattern) + if err != nil { + return nil, err + } + return vamexpr.NewSearchRegexp(re, e), nil +} + +func (b *Builder) compileVamSearch(search *dag.Search) (vamexpr.Evaluator, error) { + val, err := zson.ParseValue(b.zctx(), search.Value) + if err != nil { + return nil, err + } + e, err := b.compileVamExpr(search.Expr) + if err != nil { + return nil, err + } + if super.TypeUnder(val.Type()) == super.TypeString { + // Do a grep-style substring search instead of an + // exact match on each value. + term := norm.NFC.Bytes(val.Bytes()) + return vamexpr.NewSearchString(string(term), e), nil + } + return vamexpr.NewSearch(search.Text, val, e), nil +} + func (b *Builder) compileVamArrayExpr(e *dag.ArrayExpr) (vamexpr.Evaluator, error) { elems, err := b.compileVamListElems(e.Elems) if err != nil { diff --git a/compiler/parser/ztests/bytes.yaml b/compiler/parser/ztests/bytes.yaml index 5383f9ee09..7a6576aba1 100644 --- a/compiler/parser/ztests/bytes.yaml +++ b/compiler/parser/ztests/bytes.yaml @@ -1,5 +1,7 @@ zed: '? 0x or 0x0e00924c694c98b6c11ef56f025f3255904f4e45' +vector: true + input: &input | {a:0x} {a:0x0e00924c694c98b6c11ef56f025f3255904f4e45} diff --git a/compiler/parser/ztests/esc-quote.yaml b/compiler/parser/ztests/esc-quote.yaml index 925e27c558..1ab7d3e48a 100644 --- a/compiler/parser/ztests/esc-quote.yaml +++ b/compiler/parser/ztests/esc-quote.yaml @@ -1,5 +1,7 @@ zed: '? "foo\"bar"' +vector: true + input: | {s:"foo\"bar"} {s:"foobar"} diff --git a/compiler/parser/ztests/glob-numeric.yaml b/compiler/parser/ztests/glob-numeric.yaml index 4188c3094c..189dd3ed0b 100644 --- a/compiler/parser/ztests/glob-numeric.yaml +++ b/compiler/parser/ztests/glob-numeric.yaml @@ -1,5 +1,7 @@ zed: '? 
*1' +vector: true + input: | "1" "a1" diff --git a/compiler/parser/ztests/in-requires-space.yaml b/compiler/parser/ztests/in-requires-space.yaml index 705e09e0b0..0d82b17a9a 100644 --- a/compiler/parser/ztests/in-requires-space.yaml +++ b/compiler/parser/ztests/in-requires-space.yaml @@ -2,6 +2,8 @@ # longer doing this zed: '? x==1 inaction' +vector: true + input: | {x:1,text:"inaction"} diff --git a/compiler/parser/ztests/ipv6.yaml b/compiler/parser/ztests/ipv6.yaml index 4d42f11a88..9b1324f957 100644 --- a/compiler/parser/ztests/ipv6.yaml +++ b/compiler/parser/ztests/ipv6.yaml @@ -1,5 +1,7 @@ zed: '? fe80::58d2:2d09:e8cb:a8ad OR ::58d2:2d09:e8cb:a8ad OR 2d09:e8cb:a8ad:: OR ::' +vector: true + input: &input | {a:fe80::58d2:2d09:e8cb:a8ad} {a:::58d2:2d09:e8cb:a8ad} diff --git a/compiler/parser/ztests/leading-quotation-mark.yaml b/compiler/parser/ztests/leading-quotation-mark.yaml index a3bed3a13b..f64930f0a4 100644 --- a/compiler/parser/ztests/leading-quotation-mark.yaml +++ b/compiler/parser/ztests/leading-quotation-mark.yaml @@ -1,5 +1,7 @@ zed: '? \"foo' +vector: true + input: | {s:"foo"} {s:"\"foo"} diff --git a/compiler/parser/ztests/match-double.yaml b/compiler/parser/ztests/match-double.yaml index bcb948932e..ec5866f424 100644 --- a/compiler/parser/ztests/match-double.yaml +++ b/compiler/parser/ztests/match-double.yaml @@ -1,5 +1,7 @@ zed: '? grep("a") grep("b")' +vector: true + input: | {s1:"a",s2:"b"} {s1:"b",s2:"a"} diff --git a/compiler/parser/ztests/match-parentheses.yaml b/compiler/parser/ztests/match-parentheses.yaml index 16234e5765..77a382233f 100644 --- a/compiler/parser/ztests/match-parentheses.yaml +++ b/compiler/parser/ztests/match-parentheses.yaml @@ -1,5 +1,7 @@ zed: '? 
(10.0.0.0/8 or 172.16.0.0/12 or 192.168.0.0/16)' +vector: true + input: | {a:1.1.1.1,b:172.16.0.1} {a:192.168.0.1,b:2.2.2.2} diff --git a/compiler/parser/ztests/match.yaml b/compiler/parser/ztests/match.yaml index 5122ef48f0..83eaaeb885 100644 --- a/compiler/parser/ztests/match.yaml +++ b/compiler/parser/ztests/match.yaml @@ -1,5 +1,7 @@ zed: grep("a") +vector: true + input: | {s1:"a",s2:"b"} {s1:"b",s2:"a"} diff --git a/compiler/parser/ztests/search.yaml b/compiler/parser/ztests/search.yaml index 8e11e525b1..9e43dcb5e1 100644 --- a/compiler/parser/ztests/search.yaml +++ b/compiler/parser/ztests/search.yaml @@ -1,5 +1,7 @@ zed: '?bar' +vector: true + input: | {s1:"foo",s2:"bar"} {s1:"foo",s2:null(string)} diff --git a/compiler/parser/ztests/unicode-keyword.yaml b/compiler/parser/ztests/unicode-keyword.yaml index 89db1868d5..00c4fed3ca 100644 --- a/compiler/parser/ztests/unicode-keyword.yaml +++ b/compiler/parser/ztests/unicode-keyword.yaml @@ -1,5 +1,7 @@ zed: '? bjørndal' +vector: true + input: '"bjørndal"' output: | diff --git a/compiler/ztests/regexp-search.yaml b/compiler/ztests/regexp-search.yaml index 32362c6c06..8ba71d3d31 100644 --- a/compiler/ztests/regexp-search.yaml +++ b/compiler/ztests/regexp-search.yaml @@ -1,5 +1,7 @@ zed: yield grep(*foo*) +vector: true + input: | "foo" 1 diff --git a/compiler/ztests/search-regexp-not-glob.yaml b/compiler/ztests/search-regexp-not-glob.yaml index d2e51d5e67..5dcce751b6 100644 --- a/compiler/ztests/search-regexp-not-glob.yaml +++ b/compiler/ztests/search-regexp-not-glob.yaml @@ -1,6 +1,8 @@ zed: | ? /.*/ +vector: true + input: &input | "a" diff --git a/compiler/ztests/search-type-value.yaml b/compiler/ztests/search-type-value.yaml index 6c5160e698..f18edffed7 100644 --- a/compiler/ztests/search-type-value.yaml +++ b/compiler/ztests/search-type-value.yaml @@ -1,6 +1,8 @@ zed: | ? 
<{x:int64}> +vector: true + input: <{x:int64}> output: | diff --git a/runtime/sam/expr/filter.go b/runtime/sam/expr/filter.go index d1630e64ec..bb183bead3 100644 --- a/runtime/sam/expr/filter.go +++ b/runtime/sam/expr/filter.go @@ -49,9 +49,9 @@ func (s *searchByPred) Eval(ectx Context, val super.Value) super.Value { return super.False } -// stringSearch is like strings.Contains() but with case-insensitive +// StringContainsFold is like strings.Contains but with case-insensitive // comparison. -func stringSearch(a, b string) bool { +func StringContainsFold(a, b string) bool { alen := len(a) blen := len(b) @@ -107,7 +107,7 @@ func (s *search) Eval(ectx Context, val super.Value) super.Value { } if errMatch == val.Walk(func(typ super.Type, body zcode.Bytes) error { if typ.ID() == super.IDString { - if stringSearch(byteconv.UnsafeString(body), s.text) { + if StringContainsFold(byteconv.UnsafeString(body), s.text) { return errMatch } return nil @@ -159,7 +159,7 @@ func NewSearchString(term string, expr Evaluator) Evaluator { term: term, expr: expr, fnm: NewFieldNameMatcher(func(b []byte) bool { - return stringSearch(byteconv.UnsafeString(b), term) + return StringContainsFold(byteconv.UnsafeString(b), term) }), } } @@ -176,7 +176,7 @@ func (s *searchString) Eval(ectx Context, val super.Value) super.Value { } if errMatch == val.Walk(func(typ super.Type, body zcode.Bytes) error { if typ.ID() == super.IDString && - stringSearch(byteconv.UnsafeString(body), s.term) { + StringContainsFold(byteconv.UnsafeString(body), s.term) { return errMatch } return nil diff --git a/runtime/vam/expr/search.go b/runtime/vam/expr/search.go new file mode 100644 index 0000000000..ca162f72dd --- /dev/null +++ b/runtime/vam/expr/search.go @@ -0,0 +1,142 @@ +package expr + +import ( + "net/netip" + "regexp" + "slices" + "unsafe" + + "github.com/brimdata/super" + "github.com/brimdata/super/runtime/sam/expr" + "github.com/brimdata/super/vector" +) + +type search struct { + e Evaluator + vectorPred 
func(vector.Any) vector.Any + stringPred func([]byte) bool + fnm *expr.FieldNameMatcher +} + +func NewSearch(s string, val super.Value, e Evaluator) Evaluator { + stringPred := func(b []byte) bool { + return expr.StringContainsFold(string(b), s) + } + var net netip.Prefix + if val.Type().ID() == super.IDNet { + net = super.DecodeNet(val.Bytes()) + } + eq := NewCompare(super.NewContext() /* XXX */, nil, nil, "==") + vectorPred := func(vec vector.Any) vector.Any { + if net.IsValid() && vector.KindOf(vec) == vector.KindIP { + out := vector.NewBoolEmpty(vec.Len(), nil) + for i := range vec.Len() { + if ip, null := vector.IPValue(vec, i); !null && net.Contains(ip) { + out.Set(i) + } + } + return out + } + return eq.eval(vec, vector.NewConst(val, vec.Len(), nil)) + } + return &search{e, vectorPred, stringPred, nil} +} + +func NewSearchRegexp(re *regexp.Regexp, e Evaluator) Evaluator { + return &search{e, nil, re.Match, expr.NewFieldNameMatcher(re.Match)} +} + +func NewSearchString(s string, e Evaluator) Evaluator { + pred := func(b []byte) bool { + return expr.StringContainsFold(string(b), s) + } + return &search{e, nil, pred, expr.NewFieldNameMatcher(pred)} +} + +func (s *search) Eval(this vector.Any) vector.Any { + return vector.Apply(true, s.eval, s.e.Eval(this)) +} + +func (s *search) eval(vecs ...vector.Any) vector.Any { + vec := vector.Under(vecs[0]) + typ := vec.Type() + if s.fnm != nil && s.fnm.Match(typ) { + return vector.NewConst(super.True, vec.Len(), nil) + } + if typ.Kind() == super.PrimitiveKind { + return s.match(vec) + } + n := vec.Len() + var index []uint32 + if view, ok := vec.(*vector.View); ok { + vec = view.Any + index = view.Index + } + switch vec := vec.(type) { + case *vector.Record: + out := vector.NewBoolEmpty(n, nil) + for _, f := range vec.Fields { + if index != nil { + f = vector.NewView(f, index) + } + out = vector.Or(out, toBool(s.eval(f))) + } + return out + case *vector.Array: + return s.evalForList(vec.Values, vec.Offsets, index, n) + 
case *vector.Set:
+		return s.evalForList(vec.Values, vec.Offsets, index, n)
+	case *vector.Map:
+		return vector.Or(s.evalForList(vec.Keys, vec.Offsets, index, n),
+			s.evalForList(vec.Values, vec.Offsets, index, n))
+	case *vector.Union:
+		return vector.Apply(true, s.eval, vec)
+	case *vector.Error:
+		return s.eval(vec.Vals)
+	}
+	panic(vec)
+}
+
+// evalForList sets out[j] when any element of the j-th list, located in offsets through index if non-nil, matches.
+func (s *search) evalForList(vec vector.Any, offsets, index []uint32, length uint32) *vector.Bool {
+	out := vector.NewBoolEmpty(length, nil)
+	var index2 []uint32
+	for j := range length {
+		slot := j // j addresses out; slot addresses offsets (they differ when index is non-nil).
+		if index != nil {
+			slot = index[j]
+		}
+		start, end := offsets[slot], offsets[slot+1]
+		if start == end {
+			continue
+		}
+		index2 = slices.Grow(index2[:0], int(end-start))[:end-start]
+		for k := range index2 {
+			index2[k] = start + uint32(k)
+		}
+		if toBool(s.eval(vector.NewView(vec, index2))).TrueCount() > 0 {
+			out.Set(j)
+		}
+	}
+	return out
+}
+
+func (s *search) match(vec vector.Any) vector.Any {
+	if vec.Type().ID() == super.IDString {
+		out := vector.NewBoolEmpty(vec.Len(), nil)
+		for i := range vec.Len() {
+			str, null := vector.StringValue(vec, i)
+			// Prevent compiler from copying str, which it thinks
+			// escapes to the heap because stringPred is a pointer.
+			bytes := unsafe.Slice(unsafe.StringData(str), len(str))
+			if !null && s.stringPred(bytes) {
+				out.Set(i)
+			}
+		}
+		return out
+	}
+	if s.vectorPred != nil {
+		return s.vectorPred(vec)
+	}
+	return vector.NewConst(super.False, vec.Len(), nil)
+}
diff --git a/runtime/sam/expr/ztests/filter-escaped-asterisk.yaml b/runtime/ztests/expr/search-escaped-asterisk.yaml
similarity index 87%
rename from runtime/sam/expr/ztests/filter-escaped-asterisk.yaml
rename to runtime/ztests/expr/search-escaped-asterisk.yaml
index 3f1d9067fb..a4305623fa 100644
--- a/runtime/sam/expr/ztests/filter-escaped-asterisk.yaml
+++ b/runtime/ztests/expr/search-escaped-asterisk.yaml
@@ -1,5 +1,7 @@
 zed: '? 
A\=\*' +vector: true + input: | {s:"A=B"} {s:"A=*"} diff --git a/runtime/sam/expr/ztests/filter-escaped-equal-sign.yaml b/runtime/ztests/expr/search-escaped-equal-sign.yaml similarity index 87% rename from runtime/sam/expr/ztests/filter-escaped-equal-sign.yaml rename to runtime/ztests/expr/search-escaped-equal-sign.yaml index 91e8915c19..0c0f6d8b0d 100644 --- a/runtime/sam/expr/ztests/filter-escaped-equal-sign.yaml +++ b/runtime/ztests/expr/search-escaped-equal-sign.yaml @@ -1,5 +1,7 @@ zed: '? A\=B' +vector: true + input: | {s:"A=B"} {s:"A=*"} diff --git a/runtime/sam/expr/ztests/glob.yaml b/runtime/ztests/expr/search-glob.yaml similarity index 92% rename from runtime/sam/expr/ztests/glob.yaml rename to runtime/ztests/expr/search-glob.yaml index e5413fc2d1..8ec29474e9 100644 --- a/runtime/sam/expr/ztests/glob.yaml +++ b/runtime/ztests/expr/search-glob.yaml @@ -1,5 +1,7 @@ zed: '? foo*' +vector: true + input: | {a:"hello",b:"there"} {a:"foox",b:"there"} diff --git a/runtime/sam/expr/ztests/search-nested-field-regexp.yaml b/runtime/ztests/expr/search-nested-field-regexp.yaml similarity index 95% rename from runtime/sam/expr/ztests/search-nested-field-regexp.yaml rename to runtime/ztests/expr/search-nested-field-regexp.yaml index 2072dbc72d..d2433d7f8f 100644 --- a/runtime/sam/expr/ztests/search-nested-field-regexp.yaml +++ b/runtime/ztests/expr/search-nested-field-regexp.yaml @@ -1,5 +1,7 @@ zed: '? 
*ar' +vector: true + input: | {a:[{bar:"foo"}]} {a:[{car:"foo"}]} diff --git a/runtime/sam/expr/ztests/search-nested-field.yaml b/runtime/ztests/expr/search-nested-field.yaml similarity index 94% rename from runtime/sam/expr/ztests/search-nested-field.yaml rename to runtime/ztests/expr/search-nested-field.yaml index 37765b9989..1df16e5387 100644 --- a/runtime/sam/expr/ztests/search-nested-field.yaml +++ b/runtime/ztests/expr/search-nested-field.yaml @@ -1,5 +1,7 @@ zed: '?b' +vector: true + input: | {a:[{b:"foo"}]} {a:[{c:"foo"}]} diff --git a/runtime/sam/expr/ztests/search-primitives.yaml b/runtime/ztests/expr/search-primitives.yaml similarity index 83% rename from runtime/sam/expr/ztests/search-primitives.yaml rename to runtime/ztests/expr/search-primitives.yaml index ead6d54e17..9ee849edec 100644 --- a/runtime/sam/expr/ztests/search-primitives.yaml +++ b/runtime/ztests/expr/search-primitives.yaml @@ -1,5 +1,7 @@ zed: '?foo' +vector: true + input: | "foo" "bar" diff --git a/runtime/sam/expr/ztests/filter-unescaped-asterisk.yaml b/runtime/ztests/expr/search-unescaped-asterisk.yaml similarity index 88% rename from runtime/sam/expr/ztests/filter-unescaped-asterisk.yaml rename to runtime/ztests/expr/search-unescaped-asterisk.yaml index 41f7c2fea4..09367d9931 100644 --- a/runtime/sam/expr/ztests/filter-unescaped-asterisk.yaml +++ b/runtime/ztests/expr/search-unescaped-asterisk.yaml @@ -1,5 +1,7 @@ zed: '? A\=*' +vector: true + input: | {s:"A=B"} {s:"A=*"} diff --git a/runtime/sam/expr/ztests/regexp-escapes.yaml b/runtime/ztests/search-regexp-escapes.yaml similarity index 93% rename from runtime/sam/expr/ztests/regexp-escapes.yaml rename to runtime/ztests/search-regexp-escapes.yaml index 352b9d2ad7..49004c8c42 100644 --- a/runtime/sam/expr/ztests/regexp-escapes.yaml +++ b/runtime/ztests/search-regexp-escapes.yaml @@ -2,6 +2,8 @@ # (https://github.com/brimdata/super/issues/3041). zed: '? 
/\f\t\n\r\(\)\*\+\.\/\?\[\]\{\}/' +vector: true + input: &input | {a:"\f\t\n\r()*+./?[]{}"}