Skip to content

Commit

Permalink
Use CSUP files in vector runtime
Browse files Browse the repository at this point in the history
The commit adds the ability to use CSUP files in vector runtime. It also
implements the NewConcurrentPuller protocol to enable parallel read of a
CSUP file in vector runtime.
  • Loading branch information
mattnibs committed Nov 26, 2024
1 parent b1f99bd commit efb9335
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 23 deletions.
9 changes: 7 additions & 2 deletions compiler/semantic/op.go
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,13 @@ func (a *analyzer) formatArg(args ast.FromArgs) string {

func (a *analyzer) semFile(name string, args ast.FromArgs) dag.Op {
format := a.formatArg(args)
if format == "" && strings.HasSuffix(name, ".parquet") {
format = "parquet"
if format == "" {
switch filepath.Ext(name) {
case ".parquet":
format = "parquet"
case ".csup":
format = "csup"
}
}
return &dag.FileScan{
Kind: "FileScan",
Expand Down
12 changes: 9 additions & 3 deletions runtime/exec/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/brimdata/super/zbuf"
"github.com/brimdata/super/zio/anyio"
"github.com/brimdata/super/zio/parquetio"
"github.com/brimdata/super/zio/vngio"
"github.com/segmentio/ksuid"
)

Expand Down Expand Up @@ -124,8 +125,8 @@ func (c *closePuller) Pull(done bool) (zbuf.Batch, error) {
}

func (e *Environment) VectorOpen(ctx context.Context, zctx *super.Context, path, format string, fields []field.Path) (vector.Puller, error) {
if format != "parquet" {
return nil, fmt.Errorf("vector runtime supports only Parquet files")
if format != "parquet" && format != "csup" {
return nil, fmt.Errorf("vector runtime supports only Parquet and CSUP files")
}
if path == "-" {
path = "stdio:stdin"
Expand All @@ -138,7 +139,12 @@ func (e *Environment) VectorOpen(ctx context.Context, zctx *super.Context, path,
if err != nil {
return nil, err
}
puller, err := parquetio.NewVectorReader(ctx, zctx, r, fields)
var puller vector.Puller
if format == "parquet" {
puller, err = parquetio.NewVectorReader(ctx, zctx, r, fields)
} else {
puller, err = vngio.NewVectorReader(ctx, zctx, r, fields)
}
if err != nil {
r.Close()
return nil, err
Expand Down
4 changes: 4 additions & 0 deletions vng/object.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ func (o *Object) NewReader(zctx *super.Context) (zio.Reader, error) {
return NewZedReader(zctx, o.meta, o.readerAt)
}

func (o *Object) Size() uint64 {
return HeaderSize + o.header.MetaSize + o.header.DataSize
}

func readMetadata(r io.Reader) (Metadata, error) {
zctx := super.NewContext()
zr := zngio.NewReader(zctx, r)
Expand Down
30 changes: 12 additions & 18 deletions zio/vngio/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package vngio
import (
"errors"
"io"
"math"

"github.com/brimdata/super"
"github.com/brimdata/super/pkg/field"
Expand All @@ -12,7 +13,7 @@ import (

type reader struct {
zctx *super.Context
objects []object
objects []*vng.Object
n int
readerAt io.ReaderAt
reader zio.Reader
Expand Down Expand Up @@ -40,14 +41,10 @@ again:
if r.n >= len(r.objects) {
return nil, nil
}
meta := r.objects[r.n]
o := r.objects[r.n]
r.n++
o, err := vng.NewObject(io.NewSectionReader(r.readerAt, meta.start, meta.offset))
if err != nil {
return nil, err
}
r.reader, err = o.NewReader(r.zctx)
if err != nil {
var err error
if r.reader, err = o.NewReader(r.zctx); err != nil {
return nil, err
}
}
Expand All @@ -59,23 +56,20 @@ again:
return v, err
}

type object struct {
start, offset int64
}

func readObjects(r io.ReaderAt) ([]object, error) {
var objects []object
func readObjects(r io.ReaderAt) ([]*vng.Object, error) {
var objects []*vng.Object
var start int64
for {
header, err := vng.ReadHeader(io.NewSectionReader(r, start, vng.HeaderSize))
// NewObject puts the right end to the passed in SectionReader and since
// the end is unkown just pass MaxInt64.
o, err := vng.NewObject(io.NewSectionReader(r, start, math.MaxInt64))
if err != nil {
if err == io.EOF && len(objects) > 0 {
return objects, nil
}
return nil, err
}
offset := int64(vng.HeaderSize) + int64(header.MetaSize+header.DataSize)
objects = append(objects, object{start, offset})
start += offset
objects = append(objects, o)
start += int64(o.Size())
}
}
73 changes: 73 additions & 0 deletions zio/vngio/vectorreader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package vngio

import (
"context"
"errors"
"io"
"sync/atomic"

"github.com/brimdata/super"
"github.com/brimdata/super/pkg/field"
"github.com/brimdata/super/runtime/vcache"
"github.com/brimdata/super/vector"
"github.com/brimdata/super/vng"
)

type VectorReader struct {
ctx context.Context
zctx *super.Context

objects []object

Check failure on line 20 in zio/vngio/vectorreader.go

View workflow job for this annotation

GitHub Actions / output-check

undefined: object

Check failure on line 20 in zio/vngio/vectorreader.go

View workflow job for this annotation

GitHub Actions / test-windows

undefined: object

Check failure on line 20 in zio/vngio/vectorreader.go

View workflow job for this annotation

GitHub Actions / test-windows

undefined: object

Check failure on line 20 in zio/vngio/vectorreader.go

View workflow job for this annotation

GitHub Actions / test

undefined: object

Check failure on line 20 in zio/vngio/vectorreader.go

View workflow job for this annotation

GitHub Actions / test

undefined: object
nextObject *atomic.Int64
projection vcache.Path
readerAt io.ReaderAt
}

func NewVectorReader(ctx context.Context, zctx *super.Context, r io.Reader, fields []field.Path) (*VectorReader, error) {
ra, ok := r.(io.ReaderAt)
if !ok {
return nil, errors.New("Super Columnar requires a seekable input")
}
objects, err := readObjects(ra)
if err != nil {
return nil, err
}
return &VectorReader{
ctx: ctx,
zctx: zctx,
objects: objects,
nextObject: &atomic.Int64{},
projection: vcache.NewProjection(fields),
readerAt: ra,
}, nil
}

func (v *VectorReader) NewConcurrentPuller() vector.Puller {
return &VectorReader{
ctx: v.ctx,
zctx: v.zctx,
objects: v.objects,
nextObject: v.nextObject,
projection: v.projection,
readerAt: v.readerAt,
}
}

func (v *VectorReader) Pull(done bool) (vector.Any, error) {
if done {
return nil, nil
}
if err := v.ctx.Err(); err != nil {
return nil, err
}
n := int(v.nextObject.Add(1) - 1)
if n >= len(v.objects) {
return nil, nil
}
meta := v.objects[n]
o, err := vng.NewObject(io.NewSectionReader(v.readerAt, meta.start, meta.offset))
if err != nil {
return nil, err
}
return vcache.NewObjectFromVNG(o).Fetch(v.zctx, v.projection)
}

0 comments on commit efb9335

Please sign in to comment.