From caa0006c1ca46569bde70be0c36c1f850283f16f Mon Sep 17 00:00:00 2001
From: Matthew Nibecker
Date: Thu, 19 Dec 2024 11:55:54 -0800
Subject: [PATCH] vam: Support slices on bytes and strings

Add evalStringOrBytes to the vam slice expression so that slicing a
bytes or string vector no longer panics.  When both bounds are constant,
evalStringOrBytesFast slices Const, View, Dict, String, and Bytes
vectors directly and gives up (falling back to the per-slot loop) as
soon as any value is out of bounds.  String bounds count runes, using
the now-exported sam expr.UTF8PrefixLen to convert them to byte offsets.

In the per-slot loop, out-of-bounds rows are left out of the result
vector, so null positions are recorded relative to the result vector
rather than the input slot.

Both index-type checks now return a "slice index is not a number" string
error, and evalArrayOrSlice reads per-slot from/to values at the outer
slot i instead of the view-mapped idx.

The slice ztest moves to runtime/ztests/expr and sets vector: true.
---
 runtime/sam/expr/slice.go                     |   8 +-
 runtime/vam/expr/slice.go                     | 198 +++++++++++++++++-
 .../expr/ztests => ztests/expr}/slice.yaml    |   2 +
 3 files changed, 197 insertions(+), 11 deletions(-)
 rename runtime/{sam/expr/ztests => ztests/expr}/slice.yaml (99%)

diff --git a/runtime/sam/expr/slice.go b/runtime/sam/expr/slice.go
index c8e028011b..bb7699b613 100644
--- a/runtime/sam/expr/slice.go
+++ b/runtime/sam/expr/slice.go
@@ -70,8 +70,8 @@ func (s *Slice) Eval(ectx Context, this super.Value) super.Value {
 	case *super.TypeOfBytes:
 		bytes = bytes[from:to]
 	case *super.TypeOfString:
-		bytes = bytes[utf8PrefixLen(bytes, from):]
-		bytes = bytes[:utf8PrefixLen(bytes, to-from)]
+		bytes = bytes[UTF8PrefixLen(bytes, from):]
+		bytes = bytes[:UTF8PrefixLen(bytes, to-from)]
 	case *super.TypeArray, *super.TypeSet:
 		it := bytes.Iter()
 		for k := 0; k < to && !it.Done(); k++ {
@@ -104,9 +104,9 @@ func sliceIndex(ectx Context, this super.Value, slot Evaluator, length int) (int
 	return index, nil
 }
 
-// utf8PrefixLen returns the length in bytes of the first runeCount runes in b.
+// UTF8PrefixLen returns the length in bytes of the first runeCount runes in b.
 // It returns 0 if runeCount<0 and len(b) if runeCount>utf8.RuneCount(b).
-func utf8PrefixLen(b []byte, runeCount int) int {
+func UTF8PrefixLen(b []byte, runeCount int) int {
 	var i, runeCurrent int
 	for {
 		if runeCurrent >= runeCount {
diff --git a/runtime/vam/expr/slice.go b/runtime/vam/expr/slice.go
index af0ea1991f..026a2d23d8 100644
--- a/runtime/vam/expr/slice.go
+++ b/runtime/vam/expr/slice.go
@@ -1,7 +1,10 @@
 package expr
 
 import (
+	"unicode/utf8"
+
 	"github.com/brimdata/super"
+	"github.com/brimdata/super/runtime/sam/expr"
 	"github.com/brimdata/super/vector"
 )
 
@@ -37,21 +40,21 @@ func (s *sliceExpr) eval(vecs ...vector.Any) vector.Any {
 	if s.fromEval != nil {
 		from = vecs[0]
 		if !super.IsSigned(from.Type().ID()) {
-			return vector.NewWrappedError(s.zctx, "slice: from value is not an integer", from)
+			return vector.NewStringError(s.zctx, "slice index is not a number", from.Len())
 		}
 		vecs = vecs[1:]
 	}
 	if s.toEval != nil {
 		to = vecs[0]
 		if !super.IsSigned(to.Type().ID()) {
-			return vector.NewWrappedError(s.zctx, "slice: to value is not an integer", from)
+			return vector.NewStringError(s.zctx, "slice index is not a number", to.Len())
 		}
 	}
 	switch vector.KindOf(container) {
 	case vector.KindArray, vector.KindSet:
 		return s.evalArrayOrSlice(container, from, to)
 	case vector.KindBytes, vector.KindString:
-		panic("slices on bytes and strings unsupported")
+		return s.evalStringOrBytes(container, from, to)
 	case vector.KindError:
 		return container
 	default:
@@ -64,6 +67,7 @@ func (s *sliceExpr) evalArrayOrSlice(vec, fromVec, toVec vector.Any) vector.Any
 	to, constTo := sliceIsConstIndex(toVec)
 	slowPath := !constFrom || !constTo
 	var index []uint32
+	n := vec.Len()
 	if view, ok := vec.(*vector.View); ok {
 		vec, index = view.Any, view.Index
 	}
@@ -72,7 +76,7 @@ func (s *sliceExpr) evalArrayOrSlice(vec, fromVec, toVec vector.Any) vector.Any
 	var errs []uint32
 	var innerIndex []uint32
 	var nullsOut *vector.Bool
-	for i := range vec.Len() {
+	for i := range n {
 		idx := i
 		if index != nil {
 			idx = index[i]
@@ -90,17 +94,17 @@ func (s *sliceExpr) evalArrayOrSlice(vec, fromVec, toVec vector.Any) vector.Any
 		start, end := int64(0), size
 		if fromVec != nil {
 			if slowPath {
-				from, _ = vector.IntValue(fromVec, idx)
+				from, _ = vector.IntValue(fromVec, i)
 			}
 			start = sliceIndex(from, size)
 		}
 		if toVec != nil {
 			if slowPath {
-				to, _ = vector.IntValue(toVec, idx)
+				to, _ = vector.IntValue(toVec, i)
 			}
 			end = sliceIndex(to, size)
 		}
-		if start > end || end > size || start < 0 {
+		if invalidSlice(start, end, size) {
 			errs = append(errs, i)
 			continue
 		}
@@ -127,6 +131,166 @@ func (s *sliceExpr) evalArrayOrSlice(vec, fromVec, toVec vector.Any) vector.Any
 	return out
 }
 
+func (s *sliceExpr) evalStringOrBytes(vec, fromVec, toVec vector.Any) vector.Any {
+	constFrom, isConstFrom := sliceIsConstIndex(fromVec)
+	constTo, isConstTo := sliceIsConstIndex(toVec)
+	if isConstFrom && isConstTo {
+		if out, ok := s.evalStringOrBytesFast(vec, constFrom, constTo); ok {
+			return out
+		}
+	}
+	var errs []uint32
+	newOffsets := []uint32{0}
+	var newBytes []byte
+	var nullsOut *vector.Bool
+	id := vec.Type().ID()
+	for i := range vec.Len() {
+		slice, isnull := s.bytesAt(vec, i)
+		if isnull {
+			newOffsets = append(newOffsets, newOffsets[len(newOffsets)-1])
+			if nullsOut == nil {
+				nullsOut = vector.NewBoolEmpty(vec.Len(), nil)
+			}
+			nullsOut.Set(uint32(len(newOffsets) - 2)) // slot in out, which omits error rows
+			continue
+		}
+		size := lengthOfBytesOrString(id, slice)
+		start, end := int64(0), size
+		if fromVec != nil {
+			from, _ := vector.IntValue(fromVec, i)
+			start = sliceIndex(from, size)
+		}
+		if toVec != nil {
+			to, _ := vector.IntValue(toVec, i)
+			end = sliceIndex(to, size)
+		}
+		if invalidSlice(start, end, size) {
+			errs = append(errs, i)
+			continue
+		}
+		slice = sliceBytesOrString(slice, id, start, end)
+		newBytes = append(newBytes, slice...)
+		newOffsets = append(newOffsets, newOffsets[len(newOffsets)-1]+uint32(len(slice)))
+
+	}
+	out := s.bytesOrStringVec(vec.Type(), newOffsets, newBytes, nullsOut)
+	if nullsOut != nil {
+		nullsOut.SetLen(out.Len())
+	}
+	if len(errs) > 0 {
+		errOut := vector.NewStringError(s.zctx, "slice out of bounds", uint32(len(errs)))
+		return vector.Combine(out, errs, errOut)
+	}
+	return out
+}
+
+func (s *sliceExpr) evalStringOrBytesFast(vec vector.Any, from, to int64) (vector.Any, bool) {
+	switch vec := vec.(type) {
+	case *vector.Const:
+		slice := vec.Value().Bytes()
+		id := vec.Type().ID()
+		size := lengthOfBytesOrString(id, slice)
+		start, end := int64(0), size
+		if s.fromEval != nil {
+			start = sliceIndex(from, size)
+		}
+		if s.toEval != nil {
+			end = sliceIndex(to, size)
+		}
+		if invalidSlice(start, end, size) {
+			return nil, false
+		}
+		slice = sliceBytesOrString(slice, id, start, end)
+		return vector.NewConst(super.NewValue(vec.Type(), slice), vec.Len(), vec.Nulls), true
+	case *vector.View:
+		out, ok := s.evalStringOrBytesFast(vec.Any, from, to)
+		if !ok {
+			return nil, false
+		}
+		return vector.NewView(out, vec.Index), true
+	case *vector.Dict:
+		out, ok := s.evalStringOrBytesFast(vec.Any, from, to)
+		if !ok {
+			return nil, false
+		}
+		return vector.NewDict(out, vec.Index, vec.Counts, vec.Nulls), true
+	default:
+		offsets, bytes, nullsIn := stringOrBytesContents(vec)
+		newOffsets := []uint32{0}
+		var newBytes []byte
+		id := vec.Type().ID()
+		for i := range vec.Len() {
+			slice := bytes[offsets[i]:offsets[i+1]]
+			size := lengthOfBytesOrString(id, slice)
+			start, end := int64(0), size
+			if s.fromEval != nil {
+				start = sliceIndex(from, size)
+			}
+			if s.toEval != nil {
+				end = sliceIndex(to, size)
+			}
+			if invalidSlice(start, end, size) {
+				return nil, false
+			}
+			slice = sliceBytesOrString(slice, id, start, end)
+			newBytes = append(newBytes, slice...)
+			newOffsets = append(newOffsets, newOffsets[len(newOffsets)-1]+uint32(len(slice)))
+		}
+		return s.bytesOrStringVec(vec.Type(), newOffsets, newBytes, nullsIn), true
+	}
+}
+
+func (s *sliceExpr) bytesOrStringVec(typ super.Type, offsets []uint32, bytes []byte, nulls *vector.Bool) vector.Any {
+	switch typ.ID() {
+	case super.IDBytes:
+		return vector.NewBytes(offsets, bytes, nulls)
+	case super.IDString:
+		return vector.NewString(offsets, bytes, nulls)
+	default:
+		panic(typ)
+	}
+}
+
+func (s *sliceExpr) bytesAt(val vector.Any, slot uint32) ([]byte, bool) {
+	switch val := val.(type) {
+	case *vector.String:
+		if val.Nulls.Value(slot) {
+			return nil, true
+		}
+		return val.Bytes[val.Offsets[slot]:val.Offsets[slot+1]], false
+	case *vector.Bytes:
+		if val.Nulls.Value(slot) {
+			return nil, true
+		}
+		return val.Value(slot), false
+	case *vector.Const:
+		if val.Nulls.Value(slot) {
+			return nil, true
+		}
+		b, _ := val.AsBytes()
+		return b, false
+	case *vector.Dict:
+		if val.Nulls.Value(slot) {
+			return nil, true
+		}
+		return s.bytesAt(val.Any, uint32(val.Index[slot]))
+	case *vector.View:
+		return s.bytesAt(val.Any, val.Index[slot])
+	}
+	panic(val)
+}
+
+func lengthOfBytesOrString(id int, slice []byte) int64 {
+	if id == super.IDString {
+		return int64(utf8.RuneCount(slice))
+	}
+	return int64(len(slice))
+}
+
+func invalidSlice(start, end, size int64) bool {
+	return start > end || end > size || start < 0
+}
+
 func sliceIsConstIndex(vec vector.Any) (int64, bool) {
 	if vec == nil {
 		return 0, true
@@ -144,6 +308,26 @@ func sliceIndex(idx, size int64) int64 {
 	return idx
 }
 
+func sliceBytesOrString(slice []byte, id int, start, end int64) []byte {
+	if id != super.IDString {
+		return slice[start:end]
+	}
+	// String bounds count runes, so convert them to byte offsets.
+	slice = slice[expr.UTF8PrefixLen(slice, int(start)):]
+	return slice[:expr.UTF8PrefixLen(slice, int(end-start))]
+}
+
+func stringOrBytesContents(vec vector.Any) ([]uint32, []byte, *vector.Bool) {
+	switch vec := vec.(type) {
+	case *vector.String:
+		return vec.Offsets, vec.Bytes, vec.Nulls
+	case *vector.Bytes:
+		return vec.Offs, vec.Bytes, vec.Nulls
+	default:
+		panic(vec)
+	}
+}
+
 func arrayOrSetContents(vec vector.Any) ([]uint32, vector.Any, *vector.Bool) {
 	switch vec := vec.(type) {
 	case *vector.Array:
diff --git a/runtime/sam/expr/ztests/slice.yaml b/runtime/ztests/expr/slice.yaml
similarity index 99%
rename from runtime/sam/expr/ztests/slice.yaml
rename to runtime/ztests/expr/slice.yaml
index 007b09ead1..9401948c45 100644
--- a/runtime/sam/expr/ztests/slice.yaml
+++ b/runtime/ztests/expr/slice.yaml
@@ -1,5 +1,7 @@
 zed: "cut a1:=a[1:-1],a2:=a[1:],a3:=a[:1],a4:=a[:-1],a5:=a[:-100],a6:=a[-1:],a7:=a[-2:-1],a8:=(a IS NOT NULL and len(a)>0) ? a[:a[0]-8] : null"
 
+vector: true
+
 input: |
   {a:null}
   {a:null(bytes)}