Skip to content

Commit

Permalink
Implement vector non-predicate search terms
Browse files Browse the repository at this point in the history
Specifically, this includes regular expression, glob, keyword, and
literal search terms (i.e., dag.Search and dag.RegexpSearch).
  • Loading branch information
nwt committed Dec 9, 2024
1 parent 5058677 commit 988cc7f
Show file tree
Hide file tree
Showing 23 changed files with 224 additions and 9 deletions.
41 changes: 37 additions & 4 deletions compiler/kernel/vexpr.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@ import (
"errors"
"fmt"

"github.com/brimdata/super"
"github.com/brimdata/super/compiler/dag"
"github.com/brimdata/super/pkg/field"
"github.com/brimdata/super/runtime/sam/expr"
"github.com/brimdata/super/runtime/sam/expr/function"
vamexpr "github.com/brimdata/super/runtime/vam/expr"
vamfunction "github.com/brimdata/super/runtime/vam/expr/function"
"github.com/brimdata/super/zson"
"golang.org/x/text/unicode/norm"
)

func (b *Builder) compileVamExpr(e dag.Expr) (vamexpr.Evaluator, error) {
Expand All @@ -27,8 +30,8 @@ func (b *Builder) compileVamExpr(e dag.Expr) (vamexpr.Evaluator, error) {
return vamexpr.NewLiteral(val), nil
//case *dag.Var:
// return vamexpr.NewVar(e.Slot), nil
//case *dag.Search:
// return b.compileSearch(e)
case *dag.Search:
return b.compileVamSearch(e)
case *dag.This:
return vamexpr.NewDottedExpr(b.zctx(), field.Path(e.Path)), nil
case *dag.Dot:
Expand All @@ -47,8 +50,8 @@ func (b *Builder) compileVamExpr(e dag.Expr) (vamexpr.Evaluator, error) {
return b.compileVamCall(e)
//case *dag.RegexpMatch:
// return b.compileVamRegexpMatch(e)
//case *dag.RegexpSearch:
// return b.compileVamRegexpSearch(e)
case *dag.RegexpSearch:
return b.compileVamRegexpSearch(e)
case *dag.RecordExpr:
return b.compileVamRecordExpr(e)
//case *dag.SetExpr:
Expand Down Expand Up @@ -244,6 +247,36 @@ func (b *Builder) compileVamRecordExpr(e *dag.RecordExpr) (vamexpr.Evaluator, er
return vamexpr.NewRecordExpr(b.zctx(), elems), nil
}

// compileVamRegexpSearch compiles a regular expression search term into a
// vector evaluator. The operand expression is compiled first, then the
// pattern; the first failure is returned unchanged.
func (b *Builder) compileVamRegexpSearch(search *dag.RegexpSearch) (vamexpr.Evaluator, error) {
	operand, err := b.compileVamExpr(search.Expr)
	if err != nil {
		return nil, err
	}
	pattern, err := expr.CompileRegexp(search.Pattern)
	if err != nil {
		return nil, err
	}
	return vamexpr.NewSearchRegexp(pattern, operand), nil
}

// compileVamSearch compiles a keyword or literal search term into a vector
// evaluator. The term's value is parsed first, then the operand expression
// is compiled; the first failure is returned unchanged.
func (b *Builder) compileVamSearch(search *dag.Search) (vamexpr.Evaluator, error) {
	val, err := zson.ParseValue(b.zctx(), search.Value)
	if err != nil {
		return nil, err
	}
	operand, err := b.compileVamExpr(search.Expr)
	if err != nil {
		return nil, err
	}
	if super.TypeUnder(val.Type()) != super.TypeString {
		return vamexpr.NewSearch(search.Text, val, operand), nil
	}
	// A string term gets a grep-style substring search rather than an
	// exact match on each value. The term is NFC-normalized first.
	normalized := norm.NFC.Bytes(val.Bytes())
	return vamexpr.NewSearchString(string(normalized), operand), nil
}

func (b *Builder) compileVamArrayExpr(e *dag.ArrayExpr) (vamexpr.Evaluator, error) {
elems, err := b.compileVamListElems(e.Elems)
if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions compiler/parser/ztests/bytes.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '? 0x or 0x0e00924c694c98b6c11ef56f025f3255904f4e45'

vector: true

input: &input |
{a:0x}
{a:0x0e00924c694c98b6c11ef56f025f3255904f4e45}
Expand Down
2 changes: 2 additions & 0 deletions compiler/parser/ztests/esc-quote.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '? "foo\"bar"'

vector: true

input: |
{s:"foo\"bar"}
{s:"foobar"}
Expand Down
2 changes: 2 additions & 0 deletions compiler/parser/ztests/glob-numeric.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '? *1'

vector: true

input: |
"1"
"a1"
Expand Down
2 changes: 2 additions & 0 deletions compiler/parser/ztests/in-requires-space.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# longer doing this
zed: '? x==1 inaction'

vector: true

input: |
{x:1,text:"inaction"}
Expand Down
2 changes: 2 additions & 0 deletions compiler/parser/ztests/ipv6.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '? fe80::58d2:2d09:e8cb:a8ad OR ::58d2:2d09:e8cb:a8ad OR 2d09:e8cb:a8ad:: OR ::'

vector: true

input: &input |
{a:fe80::58d2:2d09:e8cb:a8ad}
{a:::58d2:2d09:e8cb:a8ad}
Expand Down
2 changes: 2 additions & 0 deletions compiler/parser/ztests/leading-quotation-mark.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '? \"foo'

vector: true

input: |
{s:"foo"}
{s:"\"foo"}
Expand Down
2 changes: 2 additions & 0 deletions compiler/parser/ztests/match-parentheses.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '? (10.0.0.0/8 or 172.16.0.0/12 or 192.168.0.0/16)'

vector: true

input: |
{a:1.1.1.1,b:172.16.0.1}
{a:192.168.0.1,b:2.2.2.2}
Expand Down
2 changes: 2 additions & 0 deletions compiler/parser/ztests/search.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '?bar'

vector: true

input: |
{s1:"foo",s2:"bar"}
{s1:"foo",s2:null(string)}
Expand Down
2 changes: 2 additions & 0 deletions compiler/parser/ztests/unicode-keyword.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '? bjørndal'

vector: true

input: '"bjørndal"'

output: |
Expand Down
2 changes: 2 additions & 0 deletions compiler/ztests/regexp-search.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: yield grep(*foo*)

vector: true

input: |
"foo"
1
Expand Down
2 changes: 2 additions & 0 deletions compiler/ztests/search-regexp-not-glob.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
zed: |
? /.*/
vector: true

input: &input |
"a"

Expand Down
2 changes: 2 additions & 0 deletions compiler/ztests/search-type-value.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
zed: |
? <{x:int64}>
vector: true

input: <int64> <string> <{x:int64}>

output: |
Expand Down
10 changes: 5 additions & 5 deletions runtime/sam/expr/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ func (s *searchByPred) Eval(ectx Context, val super.Value) super.Value {
return super.False
}

// stringSearch is like strings.Contains() but with case-insensitive
// StringContainsFold is like strings.Contains but with case-insensitive
// comparison.
func stringSearch(a, b string) bool {
func StringContainsFold(a, b string) bool {
alen := len(a)
blen := len(b)

Expand Down Expand Up @@ -107,7 +107,7 @@ func (s *search) Eval(ectx Context, val super.Value) super.Value {
}
if errMatch == val.Walk(func(typ super.Type, body zcode.Bytes) error {
if typ.ID() == super.IDString {
if stringSearch(byteconv.UnsafeString(body), s.text) {
if StringContainsFold(byteconv.UnsafeString(body), s.text) {
return errMatch
}
return nil
Expand Down Expand Up @@ -159,7 +159,7 @@ func NewSearchString(term string, expr Evaluator) Evaluator {
term: term,
expr: expr,
fnm: NewFieldNameMatcher(func(b []byte) bool {
return stringSearch(byteconv.UnsafeString(b), term)
return StringContainsFold(byteconv.UnsafeString(b), term)
}),
}
}
Expand All @@ -176,7 +176,7 @@ func (s *searchString) Eval(ectx Context, val super.Value) super.Value {
}
if errMatch == val.Walk(func(typ super.Type, body zcode.Bytes) error {
if typ.ID() == super.IDString &&
stringSearch(byteconv.UnsafeString(body), s.term) {
StringContainsFold(byteconv.UnsafeString(body), s.term) {
return errMatch
}
return nil
Expand Down
142 changes: 142 additions & 0 deletions runtime/vam/expr/search.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
package expr

import (
"net/netip"
"regexp"
"slices"
"unsafe"

"github.com/brimdata/super"
"github.com/brimdata/super/runtime/sam/expr"
"github.com/brimdata/super/vector"
)

// search is the Evaluator shared by NewSearch, NewSearchRegexp, and
// NewSearchString. The constructors build it with a positional literal,
// so the field order below must not change.
type search struct {
// e produces the vector to be searched; Eval applies eval to its result.
e Evaluator
// vectorPred, if non-nil, is applied by match to primitive vectors whose
// type is not string. When it is nil those vectors yield all false.
vectorPred func(vector.Any) vector.Any
// stringPred is called by match with the bytes of each non-null value
// of a string vector.
stringPred func([]byte) bool
// fnm, if non-nil, is tested against the type of each vector visited by
// eval; a match makes every slot of that vector true.
// NOTE(review): presumably this matches field names within the type;
// confirm against expr.FieldNameMatcher.
fnm *expr.FieldNameMatcher
}

// NewSearch returns an Evaluator that searches the result of e for the
// term whose text is s and whose parsed value is val.
//
// String values match when they contain s according to
// expr.StringContainsFold. Other primitive values are compared to val
// with "==", except that when val is a valid net, values in IP vectors
// match if the net contains them. No field name matcher is installed.
func NewSearch(s string, val super.Value, e Evaluator) Evaluator {
	var prefix netip.Prefix
	if val.Type().ID() == super.IDNet {
		prefix = super.DecodeNet(val.Bytes())
	}
	eq := NewCompare(super.NewContext() /* XXX */, nil, nil, "==")
	matchVector := func(vec vector.Any) vector.Any {
		if !prefix.IsValid() || vector.KindOf(vec) != vector.KindIP {
			return eq.eval(vec, vector.NewConst(val, vec.Len(), nil))
		}
		hits := vector.NewBoolEmpty(vec.Len(), nil)
		for i := range vec.Len() {
			ip, null := vector.IPValue(vec, i)
			if null || !prefix.Contains(ip) {
				continue
			}
			hits.Set(i)
		}
		return hits
	}
	matchString := func(b []byte) bool {
		return expr.StringContainsFold(string(b), s)
	}
	return &search{
		e:          e,
		vectorPred: matchVector,
		stringPred: matchString,
	}
}

// NewSearchRegexp returns an Evaluator that searches the result of e
// using re. re.Match serves both as the string value predicate and as
// the predicate given to the field name matcher.
func NewSearchRegexp(re *regexp.Regexp, e Evaluator) Evaluator {
	return &search{
		e:          e,
		stringPred: re.Match,
		fnm:        expr.NewFieldNameMatcher(re.Match),
	}
}

// NewSearchString returns an Evaluator that searches the result of e
// for s using expr.StringContainsFold. The same predicate is used for
// string values and for the field name matcher.
func NewSearchString(s string, e Evaluator) Evaluator {
	contains := func(b []byte) bool {
		return expr.StringContainsFold(string(b), s)
	}
	return &search{
		e:          e,
		stringPred: contains,
		fnm:        expr.NewFieldNameMatcher(contains),
	}
}

// Eval evaluates the operand expression on this and applies the search
// to the result via vector.Apply.
func (s *search) Eval(this vector.Any) vector.Any {
	operand := s.e.Eval(this)
	return vector.Apply(true, s.eval, operand)
}

// eval searches vecs[0] and returns a boolean vector with one slot per
// input slot. Only vecs[0] is read; the variadic signature is what
// vector.Apply is given.
//
// If a field name matcher is set and matches the vector's type, every
// slot is true. Primitive vectors go to match. Records, arrays, sets,
// maps, unions, and errors are searched recursively. Any other vector
// type panics.
func (s *search) eval(vecs ...vector.Any) vector.Any {
	vec := vector.Under(vecs[0])
	typ := vec.Type()
	switch {
	case s.fnm != nil && s.fnm.Match(typ):
		return vector.NewConst(super.True, vec.Len(), nil)
	case typ.Kind() == super.PrimitiveKind:
		return s.match(vec)
	}
	length := vec.Len()
	// For a view, work on the underlying vector and carry the view's
	// index so results are produced per view slot.
	var index []uint32
	if view, isView := vec.(*vector.View); isView {
		vec, index = view.Any, view.Index
	}
	switch inner := vec.(type) {
	case *vector.Record:
		// A record slot matches if any of its fields matches.
		result := vector.NewBoolEmpty(length, nil)
		for _, fld := range inner.Fields {
			if index != nil {
				fld = vector.NewView(fld, index)
			}
			result = vector.Or(result, toBool(s.eval(fld)))
		}
		return result
	case *vector.Array:
		return s.evalForList(inner.Values, inner.Offsets, index, length)
	case *vector.Set:
		return s.evalForList(inner.Values, inner.Offsets, index, length)
	case *vector.Map:
		// A map slot matches if any key or any value matches.
		keys := s.evalForList(inner.Keys, inner.Offsets, index, length)
		values := s.evalForList(inner.Values, inner.Offsets, index, length)
		return vector.Or(keys, values)
	case *vector.Union:
		return vector.Apply(true, s.eval, inner)
	case *vector.Error:
		return s.eval(inner.Vals)
	}
	panic(vec)
}

// evalForList searches the elements of a list-like vector (array, set,
// or the keys or values of a map) and returns a boolean vector of the
// given length in which slot j is true if any element of list j matches.
//
// vec holds the flattened elements and offsets delimits each list:
// list i spans elements offsets[i] up to but not including offsets[i+1].
// If index is non-nil, output slot j corresponds to list index[j];
// otherwise it corresponds to list j. Empty lists never match.
func (s *search) evalForList(vec vector.Any, offsets, index []uint32, length uint32) *vector.Bool {
	out := vector.NewBoolEmpty(length, nil)
	// elems is a scratch index reused across iterations to avoid
	// reallocating for every list.
	var elems []uint32
	for j := range length {
		// slot is the list's position in offsets. It is kept separate
		// from j so that the result is recorded at output position j
		// rather than at the underlying position when index is set.
		slot := j
		if index != nil {
			slot = index[j]
		}
		start, end := offsets[slot], offsets[slot+1]
		if start == end {
			continue
		}
		n := end - start
		elems = slices.Grow(elems[:0], int(n))[:n]
		for k := range n {
			elems[k] = k + start
		}
		view := vector.NewView(vec, elems)
		if toBool(s.eval(view)).TrueCount() > 0 {
			out.Set(j)
		}
	}
	return out
}

// match searches a primitive vector. For a string vector, each non-null
// value is tested with stringPred. Any other vector is handed to
// vectorPred if one is set and otherwise yields all false.
func (s *search) match(vec vector.Any) vector.Any {
	if vec.Type().ID() != super.IDString {
		if s.vectorPred == nil {
			return vector.NewConst(super.False, vec.Len(), nil)
		}
		return s.vectorPred(vec)
	}
	hits := vector.NewBoolEmpty(vec.Len(), nil)
	for i := range vec.Len() {
		str, null := vector.StringValue(vec, i)
		if null {
			continue
		}
		// Alias str's bytes instead of converting so the compiler does
		// not copy str, which it thinks escapes to the heap because
		// stringPred is a pointer.
		raw := unsafe.Slice(unsafe.StringData(str), len(str))
		if s.stringPred(raw) {
			hits.Set(i)
		}
	}
	return hits
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '? A\=\*'

vector: true

input: |
{s:"A=B"}
{s:"A=*"}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '? A\=B'

vector: true

input: |
{s:"A=B"}
{s:"A=*"}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '? foo*'

vector: true

input: |
{a:"hello",b:"there"}
{a:"foox",b:"there"}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '? *ar'

vector: true

input: |
{a:[{bar:"foo"}]}
{a:[{car:"foo"}]}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: '?b'

vector: true

input: |
{a:[{b:"foo"}]}
{a:[{c:"foo"}]}
Expand Down
Loading

0 comments on commit 988cc7f

Please sign in to comment.