Skip to content

Commit

Permalink
vam: Support slices on bytes and strings (#5542)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattnibs authored Dec 19, 2024
1 parent a38d6b8 commit 966d3a9
Show file tree
Hide file tree
Showing 3 changed files with 197 additions and 11 deletions.
8 changes: 4 additions & 4 deletions runtime/sam/expr/slice.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ func (s *Slice) Eval(ectx Context, this super.Value) super.Value {
case *super.TypeOfBytes:
bytes = bytes[from:to]
case *super.TypeOfString:
bytes = bytes[utf8PrefixLen(bytes, from):]
bytes = bytes[:utf8PrefixLen(bytes, to-from)]
bytes = bytes[UTF8PrefixLen(bytes, from):]
bytes = bytes[:UTF8PrefixLen(bytes, to-from)]
case *super.TypeArray, *super.TypeSet:
it := bytes.Iter()
for k := 0; k < to && !it.Done(); k++ {
Expand Down Expand Up @@ -104,9 +104,9 @@ func sliceIndex(ectx Context, this super.Value, slot Evaluator, length int) (int
return index, nil
}

// utf8PrefixLen returns the length in bytes of the first runeCount runes in b.
// UTF8PrefixLen returns the length in bytes of the first runeCount runes in b.
// It returns 0 if runeCount<0 and len(b) if runeCount>utf8.RuneCount(b).
func utf8PrefixLen(b []byte, runeCount int) int {
func UTF8PrefixLen(b []byte, runeCount int) int {
var i, runeCurrent int
for {
if runeCurrent >= runeCount {
Expand Down
198 changes: 191 additions & 7 deletions runtime/vam/expr/slice.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
package expr

import (
"unicode/utf8"

"github.com/brimdata/super"
"github.com/brimdata/super/runtime/sam/expr"
"github.com/brimdata/super/vector"
)

Expand Down Expand Up @@ -37,21 +40,21 @@ func (s *sliceExpr) eval(vecs ...vector.Any) vector.Any {
if s.fromEval != nil {
from = vecs[0]
if !super.IsSigned(from.Type().ID()) {
return vector.NewWrappedError(s.zctx, "slice: from value is not an integer", from)
return vector.NewStringError(s.zctx, "slice index is not a number", from.Len())
}
vecs = vecs[1:]
}
if s.toEval != nil {
to = vecs[0]
if !super.IsSigned(to.Type().ID()) {
return vector.NewWrappedError(s.zctx, "slice: to value is not an integer", from)
return vector.NewStringError(s.zctx, "slice index is not a number", to.Len())
}
}
switch vector.KindOf(container) {
case vector.KindArray, vector.KindSet:
return s.evalArrayOrSlice(container, from, to)
case vector.KindBytes, vector.KindString:
panic("slices on bytes and strings unsupported")
return s.evalStringOrBytes(container, from, to)
case vector.KindError:
return container
default:
Expand All @@ -64,6 +67,7 @@ func (s *sliceExpr) evalArrayOrSlice(vec, fromVec, toVec vector.Any) vector.Any
to, constTo := sliceIsConstIndex(toVec)
slowPath := !constFrom || !constTo
var index []uint32
n := vec.Len()
if view, ok := vec.(*vector.View); ok {
vec, index = view.Any, view.Index
}
Expand All @@ -72,7 +76,7 @@ func (s *sliceExpr) evalArrayOrSlice(vec, fromVec, toVec vector.Any) vector.Any
var errs []uint32
var innerIndex []uint32
var nullsOut *vector.Bool
for i := range vec.Len() {
for i := range n {
idx := i
if index != nil {
idx = index[i]
Expand All @@ -90,17 +94,17 @@ func (s *sliceExpr) evalArrayOrSlice(vec, fromVec, toVec vector.Any) vector.Any
start, end := int64(0), size
if fromVec != nil {
if slowPath {
from, _ = vector.IntValue(fromVec, idx)
from, _ = vector.IntValue(fromVec, i)
}
start = sliceIndex(from, size)
}
if toVec != nil {
if slowPath {
to, _ = vector.IntValue(toVec, idx)
to, _ = vector.IntValue(toVec, i)
}
end = sliceIndex(to, size)
}
if start > end || end > size || start < 0 {
if invalidSlice(start, end, size) {
errs = append(errs, i)
continue
}
Expand All @@ -127,6 +131,166 @@ func (s *sliceExpr) evalArrayOrSlice(vec, fromVec, toVec vector.Any) vector.Any
return out
}

// evalStringOrBytes slices every bytes or string value in vec. fromVec and
// toVec hold the per-slot lower and upper indexes; either may be nil, in which
// case the corresponding bound defaults to the start or the end of the value.
//
// When both indexes are constant the work is first handed to
// evalStringOrBytesFast; if that declines, the values are processed slot by
// slot here. Null inputs yield null (empty) outputs. Slots whose resolved
// range is rejected by invalidSlice are left out of the result vector and
// recorded in errs, then merged back as "slice out of bounds" errors with
// vector.Combine.
func (s *sliceExpr) evalStringOrBytes(vec, fromVec, toVec vector.Any) vector.Any {
	constFrom, isConstFrom := sliceIsConstIndex(fromVec)
	constTo, isConstTo := sliceIsConstIndex(toVec)
	if isConstFrom && isConstTo {
		if out, ok := s.evalStringOrBytesFast(vec, constFrom, constTo); ok {
			return out
		}
	}
	var errs []uint32
	newOffsets := []uint32{0}
	var newBytes []byte
	var nullsOut *vector.Bool
	id := vec.Type().ID()
	for i := range vec.Len() {
		slice, isnull := s.bytesAt(vec, i)
		if isnull {
			newOffsets = append(newOffsets, newOffsets[len(newOffsets)-1])
			if nullsOut == nil {
				nullsOut = vector.NewBoolEmpty(vec.Len(), nil)
			}
			// The output vector omits error slots, so the null bit must be
			// placed at the position of the element just appended rather
			// than at the input slot i. The two differ as soon as an error
			// precedes a null.
			nullsOut.Set(uint32(len(newOffsets) - 2))
			continue
		}
		// Sizes and indexes are counted in runes for strings and in bytes
		// otherwise (see lengthOfBytesOrString).
		size := lengthOfBytesOrString(id, slice)
		start, end := int64(0), size
		if fromVec != nil {
			from, _ := vector.IntValue(fromVec, i)
			start = sliceIndex(from, size)
		}
		if toVec != nil {
			to, _ := vector.IntValue(toVec, i)
			end = sliceIndex(to, size)
		}
		if invalidSlice(start, end, size) {
			// No offset is appended for this slot; it is filled in later by
			// the error vector.
			errs = append(errs, i)
			continue
		}
		slice = sliceBytesOrString(slice, id, start, end)
		newBytes = append(newBytes, slice...)
		newOffsets = append(newOffsets, newOffsets[len(newOffsets)-1]+uint32(len(slice)))
	}
	out := s.bytesOrStringVec(vec.Type(), newOffsets, newBytes, nullsOut)
	if nullsOut != nil {
		// nullsOut was sized for the input; trim it to the number of values
		// actually produced.
		nullsOut.SetLen(out.Len())
	}
	if len(errs) > 0 {
		errOut := vector.NewStringError(s.zctx, "slice out of bounds", uint32(len(errs)))
		return vector.Combine(out, errs, errOut)
	}
	return out
}

// evalStringOrBytesFast applies the constant indexes from and to to every
// value reachable through vec without consulting per-slot index vectors.
// Const, View and Dict wrappers are handled by slicing the wrapped value(s)
// once and re-wrapping the result. The second return value is false when any
// value would produce an invalid range, in which case the caller is expected
// to fall back to a path that can report per-slot errors.
func (s *sliceExpr) evalStringOrBytesFast(vec vector.Any, from, to int64) (vector.Any, bool) {
	// bounds resolves the constant indexes against a value of the given size
	// and reports whether the resulting range is usable.
	bounds := func(size int64) (lo, hi int64, ok bool) {
		lo, hi = 0, size
		if s.fromEval != nil {
			lo = sliceIndex(from, size)
		}
		if s.toEval != nil {
			hi = sliceIndex(to, size)
		}
		return lo, hi, !invalidSlice(lo, hi, size)
	}
	switch vec := vec.(type) {
	case *vector.Const:
		b := vec.Value().Bytes()
		id := vec.Type().ID()
		lo, hi, ok := bounds(lengthOfBytesOrString(id, b))
		if !ok {
			return nil, false
		}
		val := super.NewValue(vec.Type(), sliceBytesOrString(b, id, lo, hi))
		return vector.NewConst(val, vec.Len(), vec.Nulls), true
	case *vector.View:
		if inner, ok := s.evalStringOrBytesFast(vec.Any, from, to); ok {
			return vector.NewView(inner, vec.Index), true
		}
		return nil, false
	case *vector.Dict:
		if inner, ok := s.evalStringOrBytesFast(vec.Any, from, to); ok {
			return vector.NewDict(inner, vec.Index, vec.Counts, vec.Nulls), true
		}
		return nil, false
	default:
		offs, data, nulls := stringOrBytesContents(vec)
		id := vec.Type().ID()
		outOffs := []uint32{0}
		var outData []byte
		for i := range vec.Len() {
			b := data[offs[i]:offs[i+1]]
			lo, hi, ok := bounds(lengthOfBytesOrString(id, b))
			if !ok {
				return nil, false
			}
			b = sliceBytesOrString(b, id, lo, hi)
			outData = append(outData, b...)
			outOffs = append(outOffs, outOffs[len(outOffs)-1]+uint32(len(b)))
		}
		// The input null vector is reused as-is since slots map one to one.
		return s.bytesOrStringVec(vec.Type(), outOffs, outData, nulls), true
	}
}

// bytesOrStringVec builds a bytes or string vector, chosen by typ, from the
// given offsets, backing bytes and nulls. It panics for any other type.
func (s *sliceExpr) bytesOrStringVec(typ super.Type, offsets []uint32, bytes []byte, nulls *vector.Bool) vector.Any {
	id := typ.ID()
	if id == super.IDBytes {
		return vector.NewBytes(offsets, bytes, nulls)
	}
	if id == super.IDString {
		return vector.NewString(offsets, bytes, nulls)
	}
	panic(typ)
}

// bytesAt returns the raw bytes stored at slot in val together with a flag
// that is true when the slot is null. View and Dict vectors are resolved by
// following their index into the wrapped vector. It panics on vector kinds it
// does not recognize.
func (s *sliceExpr) bytesAt(val vector.Any, slot uint32) ([]byte, bool) {
	switch v := val.(type) {
	case *vector.View:
		return s.bytesAt(v.Any, v.Index[slot])
	case *vector.Dict:
		if !v.Nulls.Value(slot) {
			return s.bytesAt(v.Any, uint32(v.Index[slot]))
		}
	case *vector.Const:
		if !v.Nulls.Value(slot) {
			b, _ := v.AsBytes()
			return b, false
		}
	case *vector.String:
		if !v.Nulls.Value(slot) {
			lo, hi := v.Offsets[slot], v.Offsets[slot+1]
			return v.Bytes[lo:hi], false
		}
	case *vector.Bytes:
		if !v.Nulls.Value(slot) {
			return v.Value(slot), false
		}
	default:
		panic(val)
	}
	// Every recognized case that reaches this point found a null at slot.
	return nil, true
}

// lengthOfBytesOrString returns the length of slice in the unit used for
// indexing: runes when id is super.IDString, bytes for anything else.
func lengthOfBytesOrString(id int, slice []byte) int64 {
	n := len(slice)
	if id == super.IDString {
		n = utf8.RuneCount(slice)
	}
	return int64(n)
}

// invalidSlice reports whether the resolved range start..end is unusable for
// a value of the given size, i.e. whether 0 <= start <= end <= size does not
// hold.
func invalidSlice(start, end, size int64) bool {
	inBounds := 0 <= start && start <= end && end <= size
	return !inBounds
}

func sliceIsConstIndex(vec vector.Any) (int64, bool) {
if vec == nil {
return 0, true
Expand All @@ -144,6 +308,26 @@ func sliceIndex(idx, size int64) int64 {
return idx
}

// sliceBytesOrString returns the portion of slice between start and end.
// For any id other than super.IDString the indexes are byte offsets. For
// strings they are rune counts, converted to byte offsets with
// expr.UTF8PrefixLen.
func sliceBytesOrString(slice []byte, id int, start, end int64) []byte {
	if id != super.IDString {
		return slice[start:end]
	}
	// Drop the first start runes, then keep the next end-start runes.
	tail := slice[expr.UTF8PrefixLen(slice, int(start)):]
	return tail[:expr.UTF8PrefixLen(tail, int(end-start))]
}

// stringOrBytesContents exposes the offsets, backing bytes and nulls of a
// flat string or bytes vector. It panics if vec is any other kind of vector.
func stringOrBytesContents(vec vector.Any) ([]uint32, []byte, *vector.Bool) {
	if str, ok := vec.(*vector.String); ok {
		return str.Offsets, str.Bytes, str.Nulls
	}
	if b, ok := vec.(*vector.Bytes); ok {
		return b.Offs, b.Bytes, b.Nulls
	}
	panic(vec)
}

func arrayOrSetContents(vec vector.Any) ([]uint32, vector.Any, *vector.Bool) {
switch vec := vec.(type) {
case *vector.Array:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
zed: "cut a1:=a[1:-1],a2:=a[1:],a3:=a[:1],a4:=a[:-1],a5:=a[:-100],a6:=a[-1:],a7:=a[-2:-1],a8:=(a IS NOT NULL and len(a)>0) ? a[:a[0]-8] : null"

vector: true

input: |
{a:null}
{a:null(bytes)}
Expand Down

0 comments on commit 966d3a9

Please sign in to comment.