diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index ca5e7e4..1080000 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -44,6 +44,12 @@ jobs: - name: Integration test run: make it + - name: Benchmark + run: | + make bench + go tool pprof -top cpu.out + go tool pprof -top mem.out + - uses: codecov/codecov-action@v1 with: file: ./cover.out diff --git a/Makefile b/Makefile index 32b5627..b6878ed 100644 --- a/Makefile +++ b/Makefile @@ -66,6 +66,11 @@ it: build ./columnify -schemaType bigquery -schemaFile columnifier/testdata/schema/array.bq.json -recordType jsonl columnifier/testdata/record/array.jsonl > /dev/null ./columnify -schemaType bigquery -schemaFile columnifier/testdata/schema/array.bq.json -recordType msgpack columnifier/testdata/record/array.msgpack > /dev/null +# NOTE: a large -benchtime value (or the default) makes the benchmark run take too long +.PHONY: bench +bench: + go test -bench . -v ./columnifier -benchtime 100000x -benchmem -cpuprofile cpu.out -memprofile mem.out + # Set GITHUB_TOKEN and create release git tag .PHONY: release release: diff --git a/README.md b/README.md index 48a0502..5fceb3a 100644 --- a/README.md +++ b/README.md @@ -91,15 +91,16 @@ $ parquet-tools cat -json out.parquet Currently it has some limitations from schema/record types. -- Some logical types like Decimal are unsupported. +- Some logical types like `Decimal` are unsupported. - If using `-recordType = avro`, it doesn't support a nested record has only 1 sub field. - If using `-recordType = avro`, it converts bytes fields to base64 encoded value implicitly. +- The supported values are limited depending on the record type, e.g. if you use `jsonl`, it might not be able to handle a large value. ## Development `Columnifier` reads input file(s), converts format based on given parameter, finally writes output files. -Format conversion is separated by schema / record. 
The `schema` conversion accepts input schema, then converts it to targer's via Arrow's schema. The `record` conversion is similar to schema's but intermediate is simply `map[string]interface{}`, because Arrow record isn't available as an intermediate. -`columnify` basically depends on existing modules but it contains additional modules like `avro`, `parquet` to fill insufficient features. +Format conversion is separated by schema / record. The `schema` conversion accepts an input schema, then converts it to the target's via Arrow's schema. Likewise, the `record` conversion uses Arrow's Record as the intermediate data representation. +`columnify` basically depends on existing modules but it contains additional modules like `arrow`, `avro`, `parquet` to fill in missing features. ## Release diff --git a/arrow/doc.go b/arrow/doc.go new file mode 100644 index 0000000..c4aee8b --- /dev/null +++ b/arrow/doc.go @@ -0,0 +1,11 @@ +/* + Package arrow is an extension of the Go Arrow implementation. + https://github.com/apache/arrow/tree/master/go/arrow + + The Go Arrow package still lacks some parts that we require, so + we fill them in ourselves in this package. The package structure follows + the official Arrow package's. + see also https://github.com/apache/arrow/blob/master/docs/source/status.rst + +*/ +package arrow diff --git a/arrow/json/writer.go b/arrow/json/writer.go new file mode 100644 index 0000000..df0971f --- /dev/null +++ b/arrow/json/writer.go @@ -0,0 +1,317 @@ +package json + +import ( + "encoding/json" + "errors" + "fmt" + "io" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/array" +) + +var ( + ErrMismatchFields = errors.New("arrow/json: number of records mismatch") + ErrUnsupportedType = errors.New("arrow/json: unsupported type") +) + +// Encoder wraps encoding/json.Encoder and writes array.Record based on a schema. 
+type Encoder struct { + e *json.Encoder + schema *arrow.Schema +} + +// NewWriter returns a writer that writes array.Records to the CSV file +// with the given schema. +// +// NewWriter panics if the given schema contains fields that have types that are not +// primitive types. +func NewWriter(w io.Writer, schema *arrow.Schema) *Encoder { + ww := &Encoder{ + e: json.NewEncoder(w), + schema: schema, + } + + return ww +} + +func (e *Encoder) Schema() *arrow.Schema { return e.schema } + +// Write writes a single Record as one row to the JSON file +func (e *Encoder) Write(record array.Record) error { + if !record.Schema().Equal(e.schema) { + return ErrMismatchFields + } + + recs := make([]map[string]interface{}, record.NumRows()) + for i := range recs { + recs[i] = make(map[string]interface{}, record.NumCols()) + } + + for i, col := range record.Columns() { + values, err := convertToGo(col.Data()) + if err != nil { + return err + } + for j, v := range values { + recs[j][e.schema.Field(i).Name] = v + } + } + + return e.e.Encode(recs) +} + +// convertToGo converts Arrow values to Go typed values. 
+//
+// The result has one entry per slot of data, with nil standing in for a null
+// slot. A struct slot becomes a map[string]interface{} keyed by field name and
+// a list slot becomes a []interface{}. A data type that is not handled below
+// is reported as an error wrapping ErrUnsupportedType.
+func convertToGo(data *array.Data) ([]interface{}, error) {
+	recs := make([]interface{}, 0, data.Len())
+
+	switch data.DataType().ID() {
+	case arrow.BOOL:
+		arr := array.NewBooleanData(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.INT8:
+		arr := array.NewInt8Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.INT16:
+		arr := array.NewInt16Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.INT32:
+		arr := array.NewInt32Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.INT64:
+		arr := array.NewInt64Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.UINT8:
+		arr := array.NewUint8Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.UINT16:
+		arr := array.NewUint16Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.UINT32:
+		arr := array.NewUint32Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.UINT64:
+		arr := array.NewUint64Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.FLOAT32:
+		arr := array.NewFloat32Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.FLOAT64:
+		arr := array.NewFloat64Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.STRING:
+		arr := array.NewStringData(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.BINARY:
+		arr := array.NewBinaryData(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.DATE32:
+		arr := array.NewDate32Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.DATE64:
+		arr := array.NewDate64Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.TIME32:
+		arr := array.NewTime32Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.TIME64:
+		arr := array.NewTime64Data(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.TIMESTAMP:
+		arr := array.NewTimestampData(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, arr.Value(i))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	case arrow.STRUCT:
+		arr := array.NewStructData(data)
+		defer arr.Release()
+		st, stOk := arr.DataType().(*arrow.StructType)
+		if !stOk {
+			return nil, fmt.Errorf("unsupported data type %v: %w", arr.DataType(), ErrUnsupportedType)
+		}
+		// First pass: one map per valid struct slot, nil for null slots.
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				recs = append(recs, make(map[string]interface{}, arr.NumField()))
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+		// Second pass: convert each child column and scatter its values into
+		// the per-slot maps.
+		// NOTE(review): this assumes child arrays line up index-for-index with
+		// the struct array - TODO confirm this holds for sliced struct data.
+		for i := 0; i < arr.NumField(); i++ {
+			values, err := convertToGo(arr.Field(i).Data())
+			if err != nil {
+				return nil, err
+			}
+			for j, v := range values {
+				if arr.IsValid(j) {
+					if r, ok := recs[j].(map[string]interface{}); ok {
+						r[st.Field(i).Name] = v
+					}
+				}
+			}
+		}
+
+	case arrow.LIST:
+		arr := array.NewListData(data)
+		defer arr.Release()
+		for i := 0; i < arr.Len(); i++ {
+			if arr.IsValid(i) {
+				o := i + arr.Offset()
+				bgn := int64(arr.Offsets()[o])
+				end := int64(arr.Offsets()[o+1])
+				slice := array.NewSlice(arr.ListValues(), bgn, end)
+				values, err := convertToGo(slice.Data())
+				// Release per element; a defer here would keep every slice
+				// retained until the whole column has been converted.
+				slice.Release()
+				if err != nil {
+					return nil, err
+				}
+				recs = append(recs, values)
+			} else {
+				recs = append(recs, nil)
+			}
+		}
+
+	default:
+		// Fail loudly: returning an empty slice with a nil error here would
+		// silently drop the column from the output.
+		return nil, fmt.Errorf("unsupported data type %v: %w", data.DataType(), ErrUnsupportedType)
+	}
+
+	return recs, nil
+}
diff --git a/arrow/json/writer_test.go b/arrow/json/writer_test.go new file mode 100644 index 0000000..f3ea0ff --- /dev/null +++ b/arrow/json/writer_test.go @@ -0,0 +1,475 @@ +package json + +import ( + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "reflect" + "strings" + "testing" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" +) + +func equalAsJson(left, right interface{}) bool { + l, err := json.Marshal(left) + if err != nil { + return false + } + + r, err := json.Marshal(right) + if err != nil { + return false + } + + return reflect.DeepEqual(l, r) +} + +func TestJsonWriter(t *testing.T) { + tests := []struct { + name string + }{{ + name: 
"Primitives", + }} + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + testJsonWriter(t) + }) + } +} + +func testJsonWriter(t *testing.T) { + f := new(bytes.Buffer) + + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(t, 0) + schema := arrow.NewSchema( + []arrow.Field{ + {Name: "bool", Type: arrow.FixedWidthTypes.Boolean}, + {Name: "i8", Type: arrow.PrimitiveTypes.Int8}, + {Name: "i16", Type: arrow.PrimitiveTypes.Int16}, + {Name: "i32", Type: arrow.PrimitiveTypes.Int32}, + {Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + {Name: "u8", Type: arrow.PrimitiveTypes.Uint8}, + {Name: "u16", Type: arrow.PrimitiveTypes.Uint16}, + {Name: "u32", Type: arrow.PrimitiveTypes.Uint32}, + {Name: "u64", Type: arrow.PrimitiveTypes.Uint64}, + {Name: "f32", Type: arrow.PrimitiveTypes.Float32}, + {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + {Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "bin", Type: arrow.BinaryTypes.Binary}, + {Name: "struct", Type: arrow.StructOf([]arrow.Field{ + {Name: "bool", Type: arrow.FixedWidthTypes.Boolean}, + {Name: "i8", Type: arrow.PrimitiveTypes.Int8}, + {Name: "i16", Type: arrow.PrimitiveTypes.Int16}, + {Name: "i32", Type: arrow.PrimitiveTypes.Int32}, + {Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + {Name: "u8", Type: arrow.PrimitiveTypes.Uint8}, + {Name: "u16", Type: arrow.PrimitiveTypes.Uint16}, + {Name: "u32", Type: arrow.PrimitiveTypes.Uint32}, + {Name: "u64", Type: arrow.PrimitiveTypes.Uint64}, + {Name: "f32", Type: arrow.PrimitiveTypes.Float32}, + {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + {Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "bin", Type: arrow.BinaryTypes.Binary}, + }...)}, + {Name: "list", Type: arrow.ListOf(arrow.PrimitiveTypes.Uint64)}, + }, + nil, + ) + + b := array.NewRecordBuilder(pool, schema) + defer b.Release() + + b.Field(0).(*array.BooleanBuilder).AppendValues([]bool{true, false, true}, nil) + 
b.Field(1).(*array.Int8Builder).AppendValues([]int8{-1, 0, 1}, nil) + b.Field(2).(*array.Int16Builder).AppendValues([]int16{-1, 0, 1}, nil) + b.Field(3).(*array.Int32Builder).AppendValues([]int32{-1, 0, 1}, nil) + b.Field(4).(*array.Int64Builder).AppendValues([]int64{-1, 0, 1}, nil) + b.Field(5).(*array.Uint8Builder).AppendValues([]uint8{0, 1, 2}, nil) + b.Field(6).(*array.Uint16Builder).AppendValues([]uint16{0, 1, 2}, nil) + b.Field(7).(*array.Uint32Builder).AppendValues([]uint32{0, 1, 2}, nil) + b.Field(8).(*array.Uint64Builder).AppendValues([]uint64{0, 1, 2}, nil) + b.Field(9).(*array.Float32Builder).AppendValues([]float32{0.0, 0.1, 0.2}, nil) + b.Field(10).(*array.Float64Builder).AppendValues([]float64{0.0, 0.1, 0.2}, nil) + b.Field(11).(*array.StringBuilder).AppendValues([]string{"str-0", "str-1", "str-2"}, nil) + b.Field(12).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("bin-0"), []byte("bin-1"), []byte("bin-2")}, nil) + sb := b.Field(13).(*array.StructBuilder) + sb.AppendValues([]bool{true, true, true}) + sb.FieldBuilder(0).(*array.BooleanBuilder).AppendValues([]bool{true, false, true}, nil) + sb.FieldBuilder(1).(*array.Int8Builder).AppendValues([]int8{-1, 0, 1}, nil) + sb.FieldBuilder(2).(*array.Int16Builder).AppendValues([]int16{-1, 0, 1}, nil) + sb.FieldBuilder(3).(*array.Int32Builder).AppendValues([]int32{-1, 0, 1}, nil) + sb.FieldBuilder(4).(*array.Int64Builder).AppendValues([]int64{-1, 0, 1}, nil) + sb.FieldBuilder(5).(*array.Uint8Builder).AppendValues([]uint8{0, 1, 2}, nil) + sb.FieldBuilder(6).(*array.Uint16Builder).AppendValues([]uint16{0, 1, 2}, nil) + sb.FieldBuilder(7).(*array.Uint32Builder).AppendValues([]uint32{0, 1, 2}, nil) + sb.FieldBuilder(8).(*array.Uint64Builder).AppendValues([]uint64{0, 1, 2}, nil) + sb.FieldBuilder(9).(*array.Float32Builder).AppendValues([]float32{0.0, 0.1, 0.2}, nil) + sb.FieldBuilder(10).(*array.Float64Builder).AppendValues([]float64{0.0, 0.1, 0.2}, nil) + 
sb.FieldBuilder(11).(*array.StringBuilder).AppendValues([]string{"str-0", "str-1", "str-2"}, nil) + sb.FieldBuilder(12).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("bin-0"), []byte("bin-1"), []byte("bin-2")}, nil) + lb := b.Field(14).(*array.ListBuilder) + lb.Append(true) + lb.ValueBuilder().(*array.Uint64Builder).AppendValues([]uint64{0, 0, 0}, nil) + lb.Append(true) + lb.ValueBuilder().(*array.Uint64Builder).AppendValues([]uint64{1, 11, 111}, nil) + lb.Append(true) + lb.ValueBuilder().(*array.Uint64Builder).AppendValues([]uint64{2, 22, 222}, nil) + + for _, field := range b.Fields() { + field.AppendNull() + } + + rec := b.NewRecord() + defer rec.Release() + + w := NewWriter(f, schema) + err := w.Write(rec) + if err != nil { + t.Fatal(err) + } + + want := strings.ReplaceAll(`[ +{"bin":"YmluLTA=","bool":true,"f32":0,"f64":0,"i16":-1,"i32":-1,"i64":-1,"i8":-1,"list":[0,0,0],"str":"str-0","struct":{"bin":"YmluLTA=","bool":true,"f32":0,"f64":0,"i16":-1,"i32":-1,"i64":-1,"i8":-1,"str":"str-0","u16":0,"u32":0,"u64":0,"u8":0},"u16":0,"u32":0,"u64":0,"u8":0}, +{"bin":"YmluLTE=","bool":false,"f32":0.1,"f64":0.1,"i16":0,"i32":0,"i64":0,"i8":0,"list":[1,11,111],"str":"str-1","struct":{"bin":"YmluLTE=","bool":false,"f32":0.1,"f64":0.1,"i16":0,"i32":0,"i64":0,"i8":0,"str":"str-1","u16":1,"u32":1,"u64":1,"u8":1},"u16":1,"u32":1,"u64":1,"u8":1}, +{"bin":"YmluLTI=","bool":true,"f32":0.2,"f64":0.2,"i16":1,"i32":1,"i64":1,"i8":1,"list":[2,22,222],"str":"str-2","struct":{"bin":"YmluLTI=","bool":true,"f32":0.2,"f64":0.2,"i16":1,"i32":1,"i64":1,"i8":1,"str":"str-2","u16":2,"u32":2,"u64":2,"u8":2},"u16":2,"u32":2,"u64":2,"u8":2}, +{"bin":null,"bool":null,"f32":null,"f64":null,"i16":null,"i32":null,"i64":null,"i8":null,"list":null,"str":null,"struct":null,"u16":null,"u32":null,"u64":null,"u8":null}] +`, "\n", "") + "\n" + + if got, want := f.String(), want; strings.Compare(got, want) != 0 { + t.Fatalf("invalid output:\ngot=%s\nwant=%s\n", got, want) + } +} + +func TestToGo(t 
*testing.T) { + pool := memory.NewGoAllocator() + + cases := []struct { + data *array.Data + expected interface{} + err error + }{ + // boolean + { + data: func() *array.Data { + b := array.NewBooleanBuilder(pool) + b.AppendValues([]bool{true, false, true}, nil) + return b.NewBooleanArray().Data() + }(), + expected: []bool{true, false, true}, + err: nil, + }, + + // int8 + { + data: func() *array.Data { + b := array.NewInt8Builder(pool) + b.AppendValues([]int8{-1, 0, 1}, nil) + return b.NewInt8Array().Data() + }(), + expected: []int8{-1, 0, 1}, + err: nil, + }, + + // int16 + { + data: func() *array.Data { + b := array.NewInt16Builder(pool) + b.AppendValues([]int16{-1, 0, 1}, nil) + return b.NewInt16Array().Data() + }(), + expected: []int16{-1, 0, 1}, + err: nil, + }, + + // int32 + { + data: func() *array.Data { + b := array.NewInt32Builder(pool) + b.AppendValues([]int32{-1, 0, 1}, nil) + return b.NewInt32Array().Data() + }(), + expected: []int32{-1, 0, 1}, + err: nil, + }, + + // int64 + { + data: func() *array.Data { + b := array.NewInt64Builder(pool) + b.AppendValues([]int64{-1, 0, 1}, nil) + return b.NewInt64Array().Data() + }(), + expected: []int64{-1, 0, 1}, + err: nil, + }, + + // uint8 TODO support this case + // []uint8 will be converted base64-ed string + /* + { + data: func() *array.Data { + b := array.NewUint8Builder(pool) + b.AppendValues([]uint8{0, 1, 2}, nil) + return b.NewUint8Array().Data() + }(), + expected: []uint8{0, 1, 2}, + err: nil, + }, + */ + + // uint16 + { + data: func() *array.Data { + b := array.NewUint16Builder(pool) + b.AppendValues([]uint16{0, 1, 2}, nil) + return b.NewUint16Array().Data() + }(), + expected: []uint16{0, 1, 2}, + err: nil, + }, + + // uint32 + { + data: func() *array.Data { + b := array.NewUint32Builder(pool) + b.AppendValues([]uint32{0, 1, 2}, nil) + return b.NewUint32Array().Data() + }(), + expected: []uint32{0, 1, 2}, + err: nil, + }, + + // uint64 + { + data: func() *array.Data { + b := 
array.NewUint64Builder(pool) + b.AppendValues([]uint64{0, 1, 2}, nil) + return b.NewUint64Array().Data() + }(), + expected: []uint64{0, 1, 2}, + err: nil, + }, + + // float32 + { + data: func() *array.Data { + b := array.NewFloat32Builder(pool) + b.AppendValues([]float32{0.0, 0.1, 0.2}, nil) + return b.NewFloat32Array().Data() + }(), + expected: []float32{0.0, 0.1, 0.2}, + err: nil, + }, + + // float64 + { + data: func() *array.Data { + b := array.NewFloat64Builder(pool) + b.AppendValues([]float64{0.0, 0.1, 0.2}, nil) + return b.NewFloat64Array().Data() + }(), + expected: []float64{0.0, 0.1, 0.2}, + err: nil, + }, + + // string + { + data: func() *array.Data { + b := array.NewStringBuilder(pool) + b.AppendValues([]string{"str-0", "str-1", "str-2"}, nil) + return b.NewStringArray().Data() + }(), + expected: []string{"str-0", "str-1", "str-2"}, + err: nil, + }, + + // binary + { + data: func() *array.Data { + b := array.NewBinaryBuilder(pool, arrow.BinaryTypes.Binary) + b.AppendValues([][]byte{[]byte("bin-0"), []byte("bin-1"), []byte("bin-2")}, nil) + return b.NewBinaryArray().Data() + }(), + expected: [][]byte{[]byte("bin-0"), []byte("bin-1"), []byte("bin-2")}, + err: nil, + }, + + // struct + { + data: func() *array.Data { + b := array.NewStructBuilder(pool, arrow.StructOf([]arrow.Field{ + {Name: "bool", Type: arrow.FixedWidthTypes.Boolean}, + {Name: "i8", Type: arrow.PrimitiveTypes.Int8}, + {Name: "i16", Type: arrow.PrimitiveTypes.Int16}, + {Name: "i32", Type: arrow.PrimitiveTypes.Int32}, + {Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + {Name: "u8", Type: arrow.PrimitiveTypes.Uint8}, + {Name: "u16", Type: arrow.PrimitiveTypes.Uint16}, + {Name: "u32", Type: arrow.PrimitiveTypes.Uint32}, + {Name: "u64", Type: arrow.PrimitiveTypes.Uint64}, + {Name: "f32", Type: arrow.PrimitiveTypes.Float32}, + {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + {Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "bin", Type: arrow.BinaryTypes.Binary}, + }...)) + 
b.AppendValues([]bool{true, true, true}) + b.FieldBuilder(0).(*array.BooleanBuilder).AppendValues([]bool{true, false, true}, nil) + b.FieldBuilder(1).(*array.Int8Builder).AppendValues([]int8{-1, 0, 1}, nil) + b.FieldBuilder(2).(*array.Int16Builder).AppendValues([]int16{-1, 0, 1}, nil) + b.FieldBuilder(3).(*array.Int32Builder).AppendValues([]int32{-1, 0, 1}, nil) + b.FieldBuilder(4).(*array.Int64Builder).AppendValues([]int64{-1, 0, 1}, nil) + b.FieldBuilder(5).(*array.Uint8Builder).AppendValues([]uint8{0, 1, 2}, nil) + b.FieldBuilder(6).(*array.Uint16Builder).AppendValues([]uint16{0, 1, 2}, nil) + b.FieldBuilder(7).(*array.Uint32Builder).AppendValues([]uint32{0, 1, 2}, nil) + b.FieldBuilder(8).(*array.Uint64Builder).AppendValues([]uint64{0, 1, 2}, nil) + b.FieldBuilder(9).(*array.Float32Builder).AppendValues([]float32{0.0, 0.1, 0.2}, nil) + b.FieldBuilder(10).(*array.Float64Builder).AppendValues([]float64{0.0, 0.1, 0.2}, nil) + b.FieldBuilder(11).(*array.StringBuilder).AppendValues([]string{"str-0", "str-1", "str-2"}, nil) + b.FieldBuilder(12).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("bin-0"), []byte("bin-1"), []byte("bin-2")}, nil) + b.FieldBuilder(0).(*array.BooleanBuilder).AppendValues([]bool{true, false, true}, nil) + return b.NewStructArray().Data() + }(), + expected: []map[string]interface{}{ + { + "bool": true, + "i8": -1, + "i16": -1, + "i32": -1, + "i64": -1, + "u8": 0, + "u16": 0, + "u32": 0, + "u64": 0, + "f32": 0.0, + "f64": 0.0, + "str": "str-0", + "bin": []byte("bin-0"), + }, + { + "bool": false, + "i8": 0, + "i16": 0, + "i32": 0, + "i64": 0, + "u8": 1, + "u16": 1, + "u32": 1, + "u64": 1, + "f32": 0.1, + "f64": 0.1, + "str": "str-1", + "bin": []byte("bin-1"), + }, + { + "bool": true, + "i8": 1, + "i16": 1, + "i32": 1, + "i64": 1, + "u8": 2, + "u16": 2, + "u32": 2, + "u64": 2, + "f32": 0.2, + "f64": 0.2, + "str": "str-2", + "bin": []byte("bin-2"), + }, + }, + err: nil, + }, + + // list + { + data: func() *array.Data { + b := 
array.NewListBuilder(pool, arrow.FixedWidthTypes.Boolean) + b.Append(true) + b.ValueBuilder().(*array.BooleanBuilder).AppendValues([]bool{true, false, false}, nil) + b.Append(true) + b.ValueBuilder().(*array.BooleanBuilder).AppendValues([]bool{true, true, false}, nil) + b.Append(true) + b.ValueBuilder().(*array.BooleanBuilder).AppendValues([]bool{true, true, true}, nil) + return b.NewListArray().Data() + }(), + expected: [][]bool{ + {true, false, false}, + {true, true, false}, + {true, true, true}, + }, + err: nil, + }, + } + + for _, c := range cases { + actual, err := convertToGo(c.data) + if err != c.err { + t.Errorf("expected %v, but actual %v", c.err, err) + } + if !equalAsJson(actual, c.expected) { + t.Errorf("expected %v, but actual %v", c.expected, actual) + } + } +} + +func BenchmarkWrite(b *testing.B) { + pool := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer pool.AssertSize(b, 0) + + schema := arrow.NewSchema( + []arrow.Field{ + {Name: "bool", Type: arrow.FixedWidthTypes.Boolean}, + {Name: "i8", Type: arrow.PrimitiveTypes.Int8}, + {Name: "i16", Type: arrow.PrimitiveTypes.Int16}, + {Name: "i32", Type: arrow.PrimitiveTypes.Int32}, + {Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + {Name: "u8", Type: arrow.PrimitiveTypes.Uint8}, + {Name: "u16", Type: arrow.PrimitiveTypes.Uint16}, + {Name: "u32", Type: arrow.PrimitiveTypes.Uint32}, + {Name: "u64", Type: arrow.PrimitiveTypes.Uint64}, + {Name: "f32", Type: arrow.PrimitiveTypes.Float32}, + {Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + {Name: "str", Type: arrow.BinaryTypes.String}, + {Name: "bin", Type: arrow.BinaryTypes.Binary}, + }, + nil, + ) + + bldr := array.NewRecordBuilder(pool, schema) + defer bldr.Release() + + const N = 1000 + for i := 0; i < N; i++ { + bldr.Field(0).(*array.BooleanBuilder).Append(i%10 == 0) + bldr.Field(1).(*array.Int8Builder).Append(int8(i)) + bldr.Field(2).(*array.Int16Builder).Append(int16(i)) + bldr.Field(3).(*array.Int32Builder).Append(int32(i)) + 
bldr.Field(4).(*array.Int64Builder).Append(int64(i)) + bldr.Field(5).(*array.Uint8Builder).Append(uint8(i)) + bldr.Field(6).(*array.Uint16Builder).Append(uint16(i)) + bldr.Field(7).(*array.Uint32Builder).Append(uint32(i)) + bldr.Field(8).(*array.Uint64Builder).Append(uint64(i)) + bldr.Field(9).(*array.Float32Builder).Append(float32(i)) + bldr.Field(10).(*array.Float64Builder).Append(float64(i)) + bldr.Field(11).(*array.StringBuilder).Append(fmt.Sprintf("str-%d", i)) + bldr.Field(12).(*array.BinaryBuilder).Append([]byte(fmt.Sprintf("bin-%d", i))) + } + + rec := bldr.NewRecord() + defer rec.Release() + + w := NewWriter(ioutil.Discard, schema) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := w.Write(rec) + if err != nil { + b.Fatal(err) + + } + } +} diff --git a/columnifier/parquet.go b/columnifier/parquet.go index 97f56a3..a9aea47 100644 --- a/columnifier/parquet.go +++ b/columnifier/parquet.go @@ -1,9 +1,12 @@ package columnifier import ( + "bytes" "io/ioutil" + "github.com/reproio/columnify/arrow/json" "github.com/reproio/columnify/record" + "github.com/xitongsys/parquet-go/marshal" "github.com/reproio/columnify/parquet" "github.com/reproio/columnify/schema" @@ -66,34 +69,31 @@ func NewParquetColumnifier(st string, sf string, rt string, output string, confi // Write reads, converts input binary data and write it to buffer. 
func (c *parquetColumnifier) Write(data []byte) (int, error) { - // Intermediate record type is map[string]interface{} - c.w.MarshalFunc = parquet.MarshalMap - records, err := record.FormatToMap(data, c.schema, c.rt) + // Intermediate record type is json string + c.w.MarshalFunc = marshal.MarshalJSON + records, err := record.FormatToArrow(data, c.schema, c.rt) if err != nil { return -1, err } beforeSize := c.w.Size - for _, r := range records { - if err := c.w.Write(r); err != nil { + for i := int64(0); i < records.Record.NumRows(); i++ { + s := records.Record.NewSlice(i, i+1) + defer s.Release() + + buf := &bytes.Buffer{} + w := json.NewWriter(buf, records.Record.Schema()) + if err := w.Write(s); err != nil { return -1, err } - } - afterSize := c.w.Size - // Intermediate record type is wrapped Apache Arrow record - // It requires Arrow Golang implementation more logical type supports - // ref. https://github.com/apache/arrow/blob/9c9dc2012266442d0848e4af0cf52874bc4db151/go/arrow/array/builder.go#L211 - /* - c.w.MarshalFunc = parquet.MarshalArrow - records, err := record.FormatToArrow(data, c.schema, c.rt) - if err != nil { - return err + if buf.Len() > 2 { + if err := c.w.Write(buf.String()[1 : buf.Len()-1]); err != nil { + return -1, err + } } - if err := c.w.Write(&records); err != nil { - return err - } - */ + } + afterSize := c.w.Size return int(afterSize - beforeSize), nil } diff --git a/columnifier/parquet_test.go b/columnifier/parquet_test.go index daccab8..2bc2b3c 100644 --- a/columnifier/parquet_test.go +++ b/columnifier/parquet_test.go @@ -7,9 +7,12 @@ import ( "os" "testing" + "github.com/xitongsys/parquet-go/writer" + "github.com/xitongsys/parquet-go-source/local" "github.com/xitongsys/parquet-go/reader" + columnifyParquet "github.com/reproio/columnify/parquet" "github.com/reproio/columnify/record" "github.com/reproio/columnify/schema" "github.com/xitongsys/parquet-go/parquet" @@ -137,7 +140,7 @@ func TestWriteClose(t *testing.T) { sf: 
"testdata/schema/primitives.avsc", rt: record.RecordTypeAvro, input: "testdata/record/primitives.avro", - expected: "testdata/parquet/primitives_with_bytes.parquet", + expected: "testdata/parquet/primitives.parquet", }, // primitives; Avro schema, CSV record { @@ -185,7 +188,7 @@ func TestWriteClose(t *testing.T) { sf: "testdata/schema/nullables.avsc", rt: record.RecordTypeAvro, input: "testdata/record/nullables.avro", - expected: "testdata/parquet/nullables_with_bytes.parquet", + expected: "testdata/parquet/nullables.parquet", }, // nullables; Avro schema, JSONL record { @@ -203,7 +206,14 @@ func TestWriteClose(t *testing.T) { input: "testdata/record/nullables.msgpack", expected: "testdata/parquet/nullables.parquet", }, - // TODO logicals; Avro schema, Avro record + // logicals; Avro schema, Avro record + { + st: schema.SchemaTypeAvro, + sf: "testdata/schema/logicals.avsc", + rt: record.RecordTypeAvro, + input: "testdata/record/logicals.avro", + expected: "testdata/parquet/logicals.parquet", + }, // logicals; Avro schema, CSV record { st: schema.SchemaTypeAvro, @@ -250,7 +260,7 @@ func TestWriteClose(t *testing.T) { sf: "testdata/schema/nested.avsc", rt: record.RecordTypeAvro, input: "testdata/record/nested.avro", - expected: "testdata/parquet/nested_with_bytes.parquet", + expected: "testdata/parquet/nested.parquet", }, // nested; Avro schema, JSONL record { @@ -274,7 +284,7 @@ func TestWriteClose(t *testing.T) { sf: "testdata/schema/array.avsc", rt: record.RecordTypeAvro, input: "testdata/record/array.avro", - expected: "testdata/parquet/array_with_bytes.parquet", + expected: "testdata/parquet/array.parquet", }, // array; Avro schema, JSONL record { @@ -298,7 +308,7 @@ func TestWriteClose(t *testing.T) { sf: "testdata/schema/nullable_complex.avsc", rt: record.RecordTypeAvro, input: "testdata/record/nullable_complex.avro", - expected: "testdata/parquet/nullable_complex_with_bytes.parquet", + expected: "testdata/parquet/nullable_complex.parquet", }, // 
nullable/complex; Avro schema, JSONL record { @@ -323,7 +333,7 @@ func TestWriteClose(t *testing.T) { sf: "testdata/schema/primitives.bq.json", rt: record.RecordTypeAvro, input: "testdata/record/primitives.avro", - expected: "testdata/parquet/primitives_with_bytes.parquet", + expected: "testdata/parquet/primitives.parquet", }, // primitives; BigQuery schema, CSV record { @@ -371,7 +381,7 @@ func TestWriteClose(t *testing.T) { sf: "testdata/schema/nullables.bq.json", rt: record.RecordTypeAvro, input: "testdata/record/nullables.avro", - expected: "testdata/parquet/nullables_with_bytes.parquet", + expected: "testdata/parquet/nullables.parquet", }, // nullables; BigQuery schema, JSONL record { @@ -395,7 +405,7 @@ func TestWriteClose(t *testing.T) { sf: "testdata/schema/nested.bq.json", rt: record.RecordTypeAvro, input: "testdata/record/nested.avro", - expected: "testdata/parquet/nested_with_bytes.parquet", + expected: "testdata/parquet/nested.parquet", }, // nested; BigQuery schema, JSONL record { @@ -419,7 +429,7 @@ func TestWriteClose(t *testing.T) { sf: "testdata/schema/array.bq.json", rt: record.RecordTypeAvro, input: "testdata/record/array.avro", - expected: "testdata/parquet/array_with_bytes.parquet", + expected: "testdata/parquet/array.parquet", }, // array; BigQuery schema, JSONL record { @@ -460,6 +470,7 @@ func TestWriteClose(t *testing.T) { } if err != nil { t.Errorf("expected success, but actual %v", err) + continue } // Check written file @@ -515,3 +526,56 @@ func TestWriteClose_Errors(t *testing.T) { } } } + +func BenchmarkWriteClose(b *testing.B) { + // primitives; Avro schema, JSONL record + st := schema.SchemaTypeAvro + sf := "testdata/schema/primitives.avsc" + rt := record.RecordTypeJsonl + input := "testdata/record/primitives.jsonl" + + schemaContent, err := ioutil.ReadFile(sf) + if err != nil { + b.Fatal(err) + } + + intermediateSchema, err := schema.GetSchema(schemaContent, st) + if err != nil { + b.Fatal(err) + } + + sh, err := 
schema.NewSchemaHandlerFromArrow(*intermediateSchema) + if err != nil { + b.Fatal(err) + } + + fw := columnifyParquet.NewDiscard() + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + w, err := writer.NewParquetWriter(fw, nil, 1) + if err != nil { + b.Fatal(err) + } + w.SchemaHandler = sh + w.Footer.Schema = append(w.Footer.Schema, sh.SchemaElements...) + + columnifier := &parquetColumnifier{ + w: w, + schema: intermediateSchema, + rt: rt, + } + b.Cleanup(func() { + columnifier.Close() + }) + + _, err = columnifier.WriteFromFiles([]string{input}) + if err == nil { + err = columnifier.Close() + } + if err != nil { + b.Errorf("expected error occurs, but actual it's nil") + } + } +} diff --git a/columnifier/testdata/parquet/array.parquet b/columnifier/testdata/parquet/array.parquet index 69f1196..2a71876 100644 Binary files a/columnifier/testdata/parquet/array.parquet and b/columnifier/testdata/parquet/array.parquet differ diff --git a/columnifier/testdata/parquet/array_with_bytes.parquet b/columnifier/testdata/parquet/array_with_bytes.parquet deleted file mode 100644 index 2fbb19d..0000000 Binary files a/columnifier/testdata/parquet/array_with_bytes.parquet and /dev/null differ diff --git a/columnifier/testdata/parquet/nested.parquet b/columnifier/testdata/parquet/nested.parquet index 8ed2286..6f0eb57 100644 Binary files a/columnifier/testdata/parquet/nested.parquet and b/columnifier/testdata/parquet/nested.parquet differ diff --git a/columnifier/testdata/parquet/nested_with_bytes.parquet b/columnifier/testdata/parquet/nested_with_bytes.parquet deleted file mode 100644 index 10e2491..0000000 Binary files a/columnifier/testdata/parquet/nested_with_bytes.parquet and /dev/null differ diff --git a/columnifier/testdata/parquet/nullable_complex.parquet b/columnifier/testdata/parquet/nullable_complex.parquet index 601f842..f32abc6 100644 Binary files a/columnifier/testdata/parquet/nullable_complex.parquet and b/columnifier/testdata/parquet/nullable_complex.parquet differ 
diff --git a/columnifier/testdata/parquet/nullable_complex_with_bytes.parquet b/columnifier/testdata/parquet/nullable_complex_with_bytes.parquet deleted file mode 100644 index e78285a..0000000 Binary files a/columnifier/testdata/parquet/nullable_complex_with_bytes.parquet and /dev/null differ diff --git a/columnifier/testdata/parquet/nullables.parquet b/columnifier/testdata/parquet/nullables.parquet index a9cf42c..f4623bc 100644 Binary files a/columnifier/testdata/parquet/nullables.parquet and b/columnifier/testdata/parquet/nullables.parquet differ diff --git a/columnifier/testdata/parquet/nullables_with_bytes.parquet b/columnifier/testdata/parquet/nullables_with_bytes.parquet deleted file mode 100644 index 6cc8509..0000000 Binary files a/columnifier/testdata/parquet/nullables_with_bytes.parquet and /dev/null differ diff --git a/columnifier/testdata/parquet/primitives.parquet b/columnifier/testdata/parquet/primitives.parquet index f17b6e3..2b59914 100644 Binary files a/columnifier/testdata/parquet/primitives.parquet and b/columnifier/testdata/parquet/primitives.parquet differ diff --git a/columnifier/testdata/parquet/primitives_with_bytes.parquet b/columnifier/testdata/parquet/primitives_with_bytes.parquet deleted file mode 100644 index e1ac2f7..0000000 Binary files a/columnifier/testdata/parquet/primitives_with_bytes.parquet and /dev/null differ diff --git a/columnifier/testdata/record/array.avro b/columnifier/testdata/record/array.avro index e658e79..ffd3719 100644 Binary files a/columnifier/testdata/record/array.avro and b/columnifier/testdata/record/array.avro differ diff --git a/columnifier/testdata/record/nested.avro b/columnifier/testdata/record/nested.avro index 5f6e3f1..53515a5 100644 Binary files a/columnifier/testdata/record/nested.avro and b/columnifier/testdata/record/nested.avro differ diff --git a/columnifier/testdata/record/nullable_complex.avro b/columnifier/testdata/record/nullable_complex.avro index c1ffa54..268ef2a 100644 Binary files 
a/columnifier/testdata/record/nullable_complex.avro and b/columnifier/testdata/record/nullable_complex.avro differ diff --git a/columnifier/testdata/record/nullable_complex.jsonl b/columnifier/testdata/record/nullable_complex.jsonl index 35da9e3..f8b4ee8 100644 --- a/columnifier/testdata/record/nullable_complex.jsonl +++ b/columnifier/testdata/record/nullable_complex.jsonl @@ -1,10 +1,10 @@ -{"record":null,"array":[{"boolean":true,"int":531872382,"long":7110611789091831201,"float":0.6601785,"double":0.8597064844366096,"bytes":"+wï„û׌\r7#‡\u0015‚~","string":"ivhxpcqrkytq"},{"boolean":true,"int":1568203348,"long":8411675868321286014,"float":0.57808614,"double":0.6472848747039682,"bytes":"ŒÇL\u0018","string":"xbtfamwqcaqiqv"},{"boolean":false,"int":676698507,"long":-6032067769757778270,"float":0.1851039,"double":0.7933557800199204,"bytes":"\u001Cÿœ¦","string":"ruyadxra"}]} -{"record":{"boolean":true,"int":704611415,"long":-4154316846281341480,"float":0.72661895,"double":0.17519039388445223,"bytes":"\u0005â\u0002g—U®&ü","string":"bdepmjrccto","record":{"boolean":false,"int":412585646,"long":-7466931932318538095,"float":0.9039127,"double":0.05007118033693603,"bytes":"ˆñOv","string":"wdwkyntcvjg"}},"array":null} -{"record":null,"array":[{"boolean":false,"int":15734176,"long":7235048369847869063,"float":0.04944855,"double":0.39543834390425747,"bytes":"‚\nÍM§Ìo\tӅ\nø–","string":"hhuhhfpppaaoawv"},{"boolean":false,"int":-759267670,"long":6209213668179349721,"float":0.08292121,"double":0.709668360008986,"bytes":"Mš‹\u0015Ï\u000E?\u0014ì´>àÉe","string":"xaksrfidw"},{"boolean":true,"int":78473942,"long":-7604464740561955340,"float":0.36307728,"double":0.9575836282746225,"bytes":"læ”\u0017›%\"šÄí!çy","string":"dfpudiibbpaklbq"},{"boolean":true,"int":1652609307,"long":-2323452492270821900,"float":0.4778906,"double":0.7760289718282248,"bytes":"$ý","string":"alrgtaejlkohj"}]} 
-{"record":{"boolean":false,"int":-2073860483,"long":2954556304067229886,"float":0.8766103,"double":0.6845090166889758,"bytes":"‰SóÝe ýx§","string":"igxkqmagrvckuv","record":{"boolean":false,"int":449450328,"long":-1036106565808094601,"float":0.14457196,"double":0.5111580963259849,"bytes":"\u001CÑ£¡","string":"epteapbvp"}},"array":[{"boolean":false,"int":-1863603426,"long":-2579645623053350094,"float":0.050024986,"double":0.5330182127973555,"bytes":"1r\u0012\f","string":"wgaiabrpdl"},{"boolean":true,"int":670490580,"long":-5565241018984351747,"float":0.7184204,"double":0.3597669741266478,"bytes":"\u0013ÉÜ","string":"mhbotsagiqqym"},{"boolean":true,"int":-965553496,"long":3721776566876355974,"float":0.23817366,"double":0.19794262070036395,"bytes":"odÀ9\\‘<","string":"gj"},{"boolean":false,"int":942696897,"long":410939410506230949,"float":0.47205192,"double":0.994724092677451,"bytes":"جæv›\u0007QåKᙚ‚J/ÅñC‡ùâ’k‰‹nBŒ–ô‡®ÇõÙí²H\u0005°tü+NÒ\"‹\u0004hÖrÞTG̲±ÌB\u0012˜¥k\t\u0001X/g|¸c—`?>xŠ[†’v&“\u0018ûX¥Aó“DÅ\u0006ß'_àÇ\u001B\u0001ä[ûõwWé´XD;{_¬a=\u001AããòîÜÑ\u001F\"Ò®Žfò@.‰Í\u0019ˆF®r—ÙØ\u0014I\u0007ÃcD¿g“ÊÐÝd4V,õePÍw~\fÖŒõæ?Y«ùnÿÄ\u0011N(?±™l\u001E|£z}\u001Ej[QRÏC߬ˆ¦Á¢‰6\"\u0004FIf¡«ÿ\\\u000BKëÏj˜üÒÑ\"VÊ\u0013lë>\u001A\u0006Ú·”WÄ","string":"gfwfkwogenyjas"}]} +{"record":null,"array":[{"boolean":true,"int":531872382,"long":711061178909183,"float":0.6601785,"double":0.8597064844366096,"bytes":"bytes1","string":"ivhxpcqrkytq"},{"boolean":true,"int":1568203348,"long":841167586832128,"float":0.57808614,"double":0.6472848747039682,"bytes":"bytes11","string":"xbtfamwqcaqiqv"},{"boolean":false,"int":676698507,"long":-603206776975777,"float":0.1851039,"double":0.7933557800199204,"bytes":"bytes12","string":"ruyadxra"}]} 
+{"record":{"boolean":true,"int":704611415,"long":-415431684628134,"float":0.72661895,"double":0.17519039388445223,"bytes":"bytes2","string":"bdepmjrccto","record":{"boolean":false,"int":412585646,"long":-746693193231853,"float":0.9039127,"double":0.05007118033693603,"bytes":"bytes21","string":"wdwkyntcvjg"}},"array":null} +{"record":null,"array":[{"boolean":false,"int":15734176,"long":723504836984786,"float":0.04944855,"double":0.39543834390425747,"bytes":"bytes3","string":"hhuhhfpppaaoawv"},{"boolean":false,"int":-759267670,"long":620921366817934,"float":0.08292121,"double":0.709668360008986,"bytes":"bytes31","string":"xaksrfidw"},{"boolean":true,"int":78473942,"long":-760446474056195,"float":0.36307728,"double":0.9575836282746225,"bytes":"bytes32","string":"dfpudiibbpaklbq"},{"boolean":true,"int":1652609307,"long":-232345249227082,"float":0.4778906,"double":0.7760289718282248,"bytes":"bytes33","string":"alrgtaejlkohj"}]} +{"record":{"boolean":false,"int":-2073860483,"long":295455630406722,"float":0.8766103,"double":0.6845090166889758,"bytes":"bytes4","string":"igxkqmagrvckuv","record":{"boolean":false,"int":449450328,"long":-103610656580809,"float":0.14457196,"double":0.5111580963259849,"bytes":"bytes41","string":"epteapbvp"}},"array":[{"boolean":false,"int":-1863603426,"long":-257964562305335,"float":0.050024986,"double":0.5330182127973555,"bytes":"bytes42","string":"wgaiabrpdl"},{"boolean":true,"int":670490580,"long":-556524101898435,"float":0.7184204,"double":0.3597669741266478,"bytes":"bytes43","string":"mhbotsagiqqym"},{"boolean":true,"int":-965553496,"long":372177656687635,"float":0.23817366,"double":0.19794262070036395,"bytes":"bytes44","string":"gj"},{"boolean":false,"int":942696897,"long":41093941050623,"float":0.47205192,"double":0.994724092677451,"bytes":"bytes45","string":"gfwfkwogenyjas"}]} {"record":null,"array":null} 
-{"record":null,"array":[{"boolean":false,"int":-205305957,"long":-6310159692356029115,"float":0.45849025,"double":0.4151344192952111,"bytes":"!º","string":"l"},{"boolean":false,"int":-1906477311,"long":6793894243550945905,"float":0.24969548,"double":0.8262268836347516,"bytes":"\u001AKks°÷ª¥=","string":"lsgcxkahjyvj"},{"boolean":false,"int":800510100,"long":3948065499059021988,"float":0.39302593,"double":0.4653160604460925,"bytes":"ˆªýÜ","string":"uaj"},{"boolean":false,"int":-501910644,"long":-625099817872328699,"float":0.50161093,"double":0.09707274119946419,"bytes":"Sù\u0000FÜ`ÛCÑ","string":"pffbjv"}]} -{"record":null,"array":[{"boolean":false,"int":-429281721,"long":-2463452959582765306,"float":0.21187967,"double":0.2947158537922535,"bytes":"—\u0001’¥À>w&X\u00152Ãs@","string":"vkveqmsa"},{"boolean":false,"int":-720755339,"long":-6839395060757807646,"float":0.020896614,"double":0.5549829191175112,"bytes":"~LLm","string":"ghuur"},{"boolean":false,"int":1401455895,"long":-6856426453752275413,"float":0.8850343,"double":0.2949759564640848,"bytes":"4xŽ¯- ¥²U†u‰","string":"fnqcpxjj"},{"boolean":false,"int":-851242423,"long":-8786680963009681589,"float":0.36196667,"double":0.5072582082152138,"bytes":"\u0005","string":"mqckrhofqrbnus"},{"boolean":false,"int":81523005,"long":2200754585769015537,"float":0.6559036,"double":0.7559358978229996,"bytes":"Mw2yZIkîl®","string":"qwwlknotkakvf"}]} -{"record":{"boolean":true,"int":-96518477,"long":-8852974925921340863,"float":0.9184945,"double":0.11812527931788697,"bytes":"\u0006eƒ]\u0002S¡ŒÎm„}¢XH\u0000u4÷d÷Tzád'û).leƒ=¢u¹CmV\u0007$íÿ›ÌýÕ×û\u0019Y`¨\u0014Z9°m³\u000FÎ\r+.pZPʂßÿdÌ\b¶áº\u0013¥J\u001B㮡2ë¤_\u0002x‰\u0007Â_ZŒjáâV«\fW¬Oh\rÿj¹\tߕ\u0013øútõ18š 6\nI\u001BêǯC\u0001~\\\u0002ýè€\u001D„ 
ð·¦puÚäÓ\u001EÓLh,š–Qò\u001A\\Y£\u0010J‘á+#d°Âò/ª¦„µð±#~4o݁WX\u0015<7ßD\u0019#m\u0019ìôv","string":"pxv","record":{"boolean":true,"int":56734848,"long":6267839549902900690,"float":0.71818876,"double":0.8955613920671284,"bytes":"âØ\u0010º*H:z^|7","string":"gefjhu"}},"array":null} +{"record":null,"array":[{"boolean":false,"int":-205305957,"long":-631015969235602,"float":0.45849025,"double":0.4151344192952111,"bytes":"bytes6","string":"l"},{"boolean":false,"int":-1906477311,"long":679389424355094,"float":0.24969548,"double":0.8262268836347516,"bytes":"bytes61","string":"lsgcxkahjyvj"},{"boolean":false,"int":800510100,"long":394806549905902,"float":0.39302593,"double":0.4653160604460925,"bytes":"bytes62","string":"uaj"},{"boolean":false,"int":-501910644,"long":-62509981787232,"float":0.50161093,"double":0.09707274119946419,"bytes":"bytes63","string":"pffbjv"}]} +{"record":null,"array":[{"boolean":false,"int":-429281721,"long":-246345295958276,"float":0.21187967,"double":0.2947158537922535,"bytes":"bytes7","string":"vkveqmsa"},{"boolean":false,"int":-720755339,"long":-683939506075780,"float":0.020896614,"double":0.5549829191175112,"bytes":"byte71","string":"ghuur"},{"boolean":false,"int":1401455895,"long":-685642645375227,"float":0.8850343,"double":0.2949759564640848,"bytes":"byte72","string":"fnqcpxjj"},{"boolean":false,"int":-851242423,"long":-878668096300968,"float":0.36196667,"double":0.5072582082152138,"bytes":"bytes73","string":"mqckrhofqrbnus"},{"boolean":false,"int":81523005,"long":220075458576901,"float":0.6559036,"double":0.7559358978229996,"bytes":"bytes74","string":"qwwlknotkakvf"}]} +{"record":{"boolean":true,"int":-96518477,"long":-885297492592134,"float":0.9184945,"double":0.11812527931788697,"bytes":"bytes8","string":"pxv","record":{"boolean":true,"int":56734848,"long":626783954990290,"float":0.71818876,"double":0.8955613920671284,"bytes":"bytes81","string":"gefjhu"}},"array":null} {"record":null,"array":null} 
-{"record":null,"array":[{"boolean":true,"int":-120284706,"long":3934822438577813854,"float":0.9792121,"double":0.8598279548509492,"bytes":"ƒ*å$","string":"jfqytlsrixugae"},{"boolean":true,"int":-167433176,"long":4996907008784510459,"float":0.5693579,"double":0.7272509771521923,"bytes":"özä¾","string":"laruxgvtikw"},{"boolean":false,"int":1939227671,"long":-4522720500642051954,"float":0.6620586,"double":0.2977905118815489,"bytes":"ÅÌûÄ\u00149r","string":"dhu"}]} \ No newline at end of file +{"record":null,"array":[{"boolean":true,"int":-120284706,"long":393482243857781,"float":0.9792121,"double":0.8598279548509492,"bytes":"bytes10","string":"jfqytlsrixugae"},{"boolean":true,"int":-167433176,"long":499690700878451,"float":0.5693579,"double":0.7272509771521923,"bytes":"bytes101","string":"laruxgvtikw"},{"boolean":false,"int":1939227671,"long":-452272050064205,"float":0.6620586,"double":0.2977905118815489,"bytes":"bytes102","string":"dhu"}]} \ No newline at end of file diff --git a/columnifier/testdata/record/nullable_complex.msgpack b/columnifier/testdata/record/nullable_complex.msgpack index 0c99677..b9d779f 100644 Binary files a/columnifier/testdata/record/nullable_complex.msgpack and b/columnifier/testdata/record/nullable_complex.msgpack differ diff --git a/columnifier/testdata/record/nullables.avro b/columnifier/testdata/record/nullables.avro index fe19141..28b451a 100644 Binary files a/columnifier/testdata/record/nullables.avro and b/columnifier/testdata/record/nullables.avro differ diff --git a/columnifier/testdata/record/nullables.jsonl b/columnifier/testdata/record/nullables.jsonl index da2f5d8..90493ae 100644 --- a/columnifier/testdata/record/nullables.jsonl +++ b/columnifier/testdata/record/nullables.jsonl @@ -1,10 +1,10 @@ -{"boolean":null,"int":2049911329,"long":935174337359573034,"float":null,"double":null,"bytes":null,"string":null} 
-{"boolean":null,"int":-1494730473,"long":-2022895809491630103,"float":null,"double":0.08069785324756118,"bytes":"","string":"tpwmyxc"} -{"boolean":true,"int":-1949023704,"long":5167344269218891758,"float":null,"double":0.6583549661805351,"bytes":null,"string":null} -{"boolean":false,"int":null,"long":-8670003858467235223,"float":0.13172472,"double":0.007504294905384068,"bytes":null,"string":null} -{"boolean":true,"int":null,"long":-1630961264885603867,"float":0.08742553,"double":0.5728205289212072,"bytes":"d4ÑB¿”","string":null} -{"boolean":true,"int":null,"long":null,"float":null,"double":null,"bytes":"\u0005d½œ™ÆÓØhý","string":"s"} -{"boolean":false,"int":170755098,"long":7147626639653793287,"float":0.7437153,"double":null,"bytes":null,"string":null} +{"boolean":null,"int":2049911329,"long":93517433735957,"float":null,"double":null,"bytes":null,"string":null} +{"boolean":null,"int":-1494730473,"long":-202289580949163,"float":null,"double":0.08069785324756118,"bytes":"","string":"tpwmyxc"} +{"boolean":true,"int":-1949023704,"long":516734426921889,"float":null,"double":0.6583549661805351,"bytes":null,"string":null} +{"boolean":false,"int":null,"long":-867000385846723,"float":0.13172472,"double":0.007504294905384068,"bytes":null,"string":null} +{"boolean":true,"int":null,"long":-163096126488560,"float":0.08742553,"double":0.5728205289212072,"bytes":"bytes5","string":null} +{"boolean":true,"int":null,"long":null,"float":null,"double":null,"bytes":"bytes6","string":"s"} +{"boolean":false,"int":170755098,"long":714762663965379,"float":0.7437153,"double":null,"bytes":null,"string":null} {"boolean":null,"int":null,"long":null,"float":null,"double":0.22171424755307045,"bytes":null,"string":"uusutbymi"} -{"boolean":true,"int":-433672812,"long":4602315000893829332,"float":0.43936086,"double":0.4923838260209136,"bytes":"~œ† )ï\u0006Õ'","string":null} 
+{"boolean":true,"int":-433672812,"long":460231500089382,"float":0.43936086,"double":0.4923838260209136,"bytes":"bytes9","string":null} {"boolean":null,"int":null,"long":null,"float":null,"double":0.24505978464315714,"bytes":null,"string":null} \ No newline at end of file diff --git a/columnifier/testdata/record/nullables.msgpack b/columnifier/testdata/record/nullables.msgpack index 56ee1fd..16db9a8 100644 Binary files a/columnifier/testdata/record/nullables.msgpack and b/columnifier/testdata/record/nullables.msgpack differ diff --git a/columnifier/testdata/record/primitives.avro b/columnifier/testdata/record/primitives.avro index e4618d1..3ed0624 100644 Binary files a/columnifier/testdata/record/primitives.avro and b/columnifier/testdata/record/primitives.avro differ diff --git a/go.mod b/go.mod index 864a00a..c81ca66 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.14 require ( cloud.google.com/go/bigquery v1.4.0 github.com/Songmu/go-ltsv v0.0.0-20181014062614-c30af2b7b171 - github.com/apache/arrow/go/arrow v0.0.0-20200504153628-d13e8f3ed647 + github.com/apache/arrow/go/arrow v0.0.0-20200721111830-aa51b5a60623 github.com/linkedin/goavro/v2 v2.9.7 github.com/vmihailenco/msgpack/v4 v4.3.11 github.com/xitongsys/parquet-go v1.5.2 diff --git a/go.sum b/go.sum index 82dae0b..7716a7d 100644 --- a/go.sum +++ b/go.sum @@ -28,6 +28,8 @@ github.com/Songmu/go-ltsv v0.0.0-20181014062614-c30af2b7b171 h1:nwdeQV2pNjaTv3os github.com/Songmu/go-ltsv v0.0.0-20181014062614-c30af2b7b171/go.mod h1:LBP+tS9C2iiUoR7AGPaZYY+kjXgB5eZxZKbSEBL9UFw= github.com/apache/arrow/go/arrow v0.0.0-20200504153628-d13e8f3ed647 h1:wGcHSHIBp0+NEMyXG2N0878wAl5J3yOFDU5RZECDSj8= github.com/apache/arrow/go/arrow v0.0.0-20200504153628-d13e8f3ed647/go.mod h1:QNYViu/X0HXDHw7m3KXzWSVXIbfUvJqBFe6Gj8/pYA0= +github.com/apache/arrow/go/arrow v0.0.0-20200721111830-aa51b5a60623 h1:S+5uMnRlLTFeZNP/HEezoamVyI0bcnvtIN/2ONf6VyU= +github.com/apache/arrow/go/arrow v0.0.0-20200721111830-aa51b5a60623/go.mod 
h1:QNYViu/X0HXDHw7m3KXzWSVXIbfUvJqBFe6Gj8/pYA0= github.com/apache/thrift v0.0.0-20181112125854-24918abba929 h1:ubPe2yRkS6A/X37s0TVGfuN42NV2h0BlzWj0X76RoUw= github.com/apache/thrift v0.0.0-20181112125854-24918abba929/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= diff --git a/parquet/discard.go b/parquet/discard.go new file mode 100644 index 0000000..90ac55d --- /dev/null +++ b/parquet/discard.go @@ -0,0 +1,38 @@ +package parquet + +import ( + "fmt" + + "github.com/xitongsys/parquet-go/source" +) + +// discard is an implementation of ParquetFile, just discard written data. +type discard struct{} + +func NewDiscard() *discard { + return &discard{} +} + +func (f *discard) Read(p []byte) (n int, err error) { + return -1, fmt.Errorf("never implemented: %w", ErrUnsupportedMethod) +} + +func (f *discard) Write(p []byte) (n int, err error) { + return len(p), nil +} + +func (f *discard) Seek(offset int64, whence int) (int64, error) { + return -1, fmt.Errorf("never implemented: %w", ErrUnsupportedMethod) +} + +func (f *discard) Close() error { + return nil +} + +func (f *discard) Open(name string) (source.ParquetFile, error) { + return nil, fmt.Errorf("never implemented: %w", ErrUnsupportedMethod) +} + +func (f *discard) Create(name string) (source.ParquetFile, error) { + return nil, fmt.Errorf("never implemented: %w", ErrUnsupportedMethod) +} diff --git a/parquet/doc.go b/parquet/doc.go deleted file mode 100644 index d1559df..0000000 --- a/parquet/doc.go +++ /dev/null @@ -1,12 +0,0 @@ -/* - Package parquetgo is an utility and marshaler with go-friendly error handling for parquet-go. - https://github.com/xitongsys/parquet-go - - xitongsys/parquet-go provides simple, high-level API to convert to Parquet. 
- But provided features are limited (mainly it looks main users select Go struct or JSON ), - and the error handling is sometimes too simple (panic/recovery based). - - parquetgo package enriches these points for handling Arrow based data. - -*/ -package parquet diff --git a/parquet/marshal_arrow.go b/parquet/marshal_arrow.go deleted file mode 100644 index 42a45fb..0000000 --- a/parquet/marshal_arrow.go +++ /dev/null @@ -1,266 +0,0 @@ -package parquet - -import ( - "bytes" - "encoding/base64" - "fmt" - "reflect" - - "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/array" - "github.com/reproio/columnify/record" - "github.com/xitongsys/parquet-go/common" - "github.com/xitongsys/parquet-go/layout" - "github.com/xitongsys/parquet-go/parquet" - "github.com/xitongsys/parquet-go/schema" - "github.com/xitongsys/parquet-go/types" -) - -// MarshalMap converts 1 arrow record to parquet tables. -func MarshalArrow(maybeRecord []interface{}, bgn int, end int, schemaHandler *schema.SchemaHandler) (*map[string]*layout.Table, error) { - // NOTE This marshaler expects record values aggregation has done before call - if len(maybeRecord) != 1 { - return nil, fmt.Errorf("size of records is invalid: %w", ErrInvalidParquetRecord) - } - - wrapped, recordOk := maybeRecord[0].(*record.WrappedRecord) - if !recordOk { - return nil, fmt.Errorf("unexpected input type %v: %w", reflect.TypeOf(maybeRecord[0]), ErrInvalidParquetRecord) - } - - return marshalArrowRecord(wrapped.Record, schemaHandler) -} - -func marshalArrowRecord(record array.Record, sh *schema.SchemaHandler) (*map[string]*layout.Table, error) { - tables, err := prepareTables(sh) - if err != nil { - return nil, err - } - - keys := make([]string, 0, len(record.Schema().Fields())) - for _, f := range record.Schema().Fields() { - keys = append(keys, common.HeadToUpper(f.Name)) - } - - for i, c := range record.Columns() { - childPathMap := sh.PathMap.Children[keys[i]] - data := c.Data() - tables, err = 
marshalArrowData(data, tables, sh, childPathMap, 0, 0) - if err != nil { - return nil, err - } - } - - return &tables, nil -} - -func marshalArrowData(data *array.Data, tables map[string]*layout.Table, sh *schema.SchemaHandler, pathMap *schema.PathMapType, rl int32, dl int32) (map[string]*layout.Table, error) { - pathStr := pathMap.Path - - var info *common.Tag - if i, ok := sh.MapIndex[pathStr]; ok { - info = sh.Infos[i] - } else { - return nil, fmt.Errorf("schema not found to path %v: %w", pathStr, ErrInvalidParquetSchema) - } - - switch data.DataType().ID() { - case arrow.BOOL: - values := array.NewBooleanData(data) - for i := 0; i < values.Len(); i++ { - v, deltaDl, err := arrowPrimitiveToDataPageSource(values.Value(i), values.IsValid(i), info) - if err != nil { - return nil, err - } - tables[pathStr].Values = append(tables[pathStr].Values, v) - tables[pathStr].DefinitionLevels = append(tables[pathStr].DefinitionLevels, dl+deltaDl) - tables[pathStr].RepetitionLevels = append(tables[pathStr].RepetitionLevels, rl) - } - - case arrow.UINT32: - values := array.NewUint32Data(data) - for i := 0; i < values.Len(); i++ { - v, deltaDl, err := arrowPrimitiveToDataPageSource(values.Value(i), values.IsValid(i), info) - if err != nil { - return nil, err - } - tables[pathStr].Values = append(tables[pathStr].Values, v) - tables[pathStr].DefinitionLevels = append(tables[pathStr].DefinitionLevels, dl+deltaDl) - tables[pathStr].RepetitionLevels = append(tables[pathStr].RepetitionLevels, rl) - } - - case arrow.UINT64: - values := array.NewUint64Data(data) - for i := 0; i < values.Len(); i++ { - v, deltaDl, err := arrowPrimitiveToDataPageSource(values.Value(i), values.IsValid(i), info) - if err != nil { - return nil, err - } - tables[pathStr].Values = append(tables[pathStr].Values, v) - tables[pathStr].DefinitionLevels = append(tables[pathStr].DefinitionLevels, dl+deltaDl) - tables[pathStr].RepetitionLevels = append(tables[pathStr].RepetitionLevels, rl) - } - - case arrow.FLOAT32: 
- values := array.NewFloat32Data(data) - for i := 0; i < values.Len(); i++ { - v, deltaDl, err := arrowPrimitiveToDataPageSource(values.Value(i), values.IsValid(i), info) - if err != nil { - return nil, err - } - tables[pathStr].Values = append(tables[pathStr].Values, v) - tables[pathStr].DefinitionLevels = append(tables[pathStr].DefinitionLevels, dl+deltaDl) - tables[pathStr].RepetitionLevels = append(tables[pathStr].RepetitionLevels, rl) - } - - case arrow.FLOAT64: - values := array.NewFloat64Data(data) - for i := 0; i < values.Len(); i++ { - v, deltaDl, err := arrowPrimitiveToDataPageSource(values.Value(i), values.IsValid(i), info) - if err != nil { - return nil, err - } - tables[pathStr].Values = append(tables[pathStr].Values, v) - tables[pathStr].DefinitionLevels = append(tables[pathStr].DefinitionLevels, dl+deltaDl) - tables[pathStr].RepetitionLevels = append(tables[pathStr].RepetitionLevels, rl) - } - - case arrow.STRING: - values := array.NewStringData(data) - for i := 0; i < values.Len(); i++ { - v, deltaDl, err := arrowPrimitiveToDataPageSource(values.Value(i), values.IsValid(i), info) - if err != nil { - return nil, err - } - tables[pathStr].Values = append(tables[pathStr].Values, v) - tables[pathStr].DefinitionLevels = append(tables[pathStr].DefinitionLevels, dl+deltaDl) - tables[pathStr].RepetitionLevels = append(tables[pathStr].RepetitionLevels, rl) - } - - case arrow.BINARY: - values := array.NewBinaryData(data) - for i := 0; i < values.Len(); i++ { - v, deltaDl, err := arrowPrimitiveToDataPageSource(values.Value(i), values.IsValid(i), info) - if err != nil { - return nil, err - } - tables[pathStr].Values = append(tables[pathStr].Values, v) - tables[pathStr].DefinitionLevels = append(tables[pathStr].DefinitionLevels, dl+deltaDl) - tables[pathStr].RepetitionLevels = append(tables[pathStr].RepetitionLevels, rl) - } - - case arrow.STRUCT: - values := array.NewStructData(data) - st, stOk := values.DataType().(*arrow.StructType) - if !stOk { - return nil, 
fmt.Errorf("unsupported data type %v: %w", values.DataType(), ErrInvalidParquetRecord) - } - keys := make([]string, 0, len(st.Fields())) - for _, f := range st.Fields() { - keys = append(keys, common.HeadToUpper(f.Name)) - } - deltaDl := int32(0) - if info.RepetitionType == parquet.FieldRepetitionType_OPTIONAL { - deltaDl = 1 - } - for i := 0; i < values.NumField(); i++ { - childPathMap := pathMap.Children[keys[i]] - data := values.Field(i).Data() - var err error - tables, err = marshalArrowData(data, tables, sh, childPathMap, rl, dl+deltaDl) - if err != nil { - return nil, err - } - } - - case arrow.LIST: - values := array.NewListData(data) - for i := 0; i < values.Len(); i++ { - o := i + values.Offset() - bgn := int64(values.Offsets()[o]) - end := int64(values.Offsets()[o+1]) - slice := array.NewSlice(values.ListValues(), bgn, end) - - // first - if slice.Len() > 0 { - first := array.NewSlice(slice, 0, 1) - var err error - tables, err = marshalArrowData(first.Data(), tables, sh, pathMap, rl, dl+1) - if err != nil { - return nil, err - } - } - - // repeated; repetition level += max repetition level - if slice.Len() > 1 { - repeated := array.NewSlice(slice, 1, int64(slice.Len())) - maxRl, err := sh.MaxRepetitionLevel(common.StrToPath(pathStr)) - if err != nil { - return nil, err - } - tables, err = marshalArrowData(repeated.Data(), tables, sh, pathMap, rl+maxRl, dl+1) - if err != nil { - return nil, err - } - - } - } - - default: - return nil, fmt.Errorf("unsupported type %v: %w", data.DataType(), ErrInvalidParquetRecord) - } - - return tables, nil -} - -func arrowPrimitiveToDataPageSource(value interface{}, isValid bool, info *common.Tag) (interface{}, int32, error) { - switch info.RepetitionType { - case parquet.FieldRepetitionType_REQUIRED: - if isValid { - if v, err := formatArrowPrimitive(value, info); err != nil { - return nil, -1, err - } else { - return v, 0, nil - } - } else { - return nil, -1, fmt.Errorf("null for required field %v: %w", info, 
ErrInvalidParquetRecord) - } - case parquet.FieldRepetitionType_OPTIONAL: - if isValid { - if v, err := formatArrowPrimitive(value, info); err != nil { - return nil, -1, err - } else { - return v, 1, nil - } - } else { - return nil, 0, nil - } - default: - return nil, -1, fmt.Errorf("invalid field repetition type for %v: %w", info, ErrInvalidParquetRecord) - } -} - -func formatArrowPrimitive(value interface{}, info *common.Tag) (interface{}, error) { - pT, cT := types.TypeNameToParquetType(info.Type, info.BaseType) - - var s string - if (*pT == parquet.Type_BYTE_ARRAY || *pT == parquet.Type_FIXED_LEN_BYTE_ARRAY) && cT == nil { - bin, binOk := value.([]byte) - if !binOk { - return nil, fmt.Errorf("%v is not []byte: %w", value, ErrInvalidParquetRecord) - } - - var buf bytes.Buffer - encoder := base64.NewEncoder(base64.StdEncoding, &buf) - defer func() { _ = encoder.Close() }() - - if _, err := encoder.Write(bin); err != nil { - return nil, err - } - s = buf.String() - } else { - s = fmt.Sprintf("%v", value) - } - - return types.StrToParquetType(s, pT, cT, int(info.Length), int(info.Scale)), nil -} diff --git a/parquet/marshal_arrow_test.go b/parquet/marshal_arrow_test.go deleted file mode 100644 index d82da5d..0000000 --- a/parquet/marshal_arrow_test.go +++ /dev/null @@ -1,527 +0,0 @@ -package parquet - -import ( - "reflect" - "testing" - - "github.com/reproio/columnify/record" - "github.com/reproio/columnify/schema" - "github.com/xitongsys/parquet-go/layout" - - "github.com/apache/arrow/go/arrow" - "github.com/apache/arrow/go/arrow/array" - "github.com/apache/arrow/go/arrow/memory" -) - -func TestNewArrowSchemaFromAvroSchema(t *testing.T) { - cases := []struct { - input func(s *schema.IntermediateSchema) []interface{} - schema *schema.IntermediateSchema - expect *map[string]*layout.Table - err error - }{ - // Only primitives - { - input: func(s *schema.IntermediateSchema) []interface{} { - pool := memory.NewGoAllocator() - b := array.NewRecordBuilder(pool, 
s.ArrowSchema) - - b.Field(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) - b.Field(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, []bool{true, true}) - b.Field(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) - b.Field(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) - b.Field(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) - b.Field(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, true}) - b.Field(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) - - return []interface{}{record.NewWrappedRecord(b)} - }, - schema: schema.NewIntermediateSchema( - arrow.NewSchema( - []arrow.Field{ - { - Name: "boolean", - Type: arrow.FixedWidthTypes.Boolean, - Nullable: false, - }, - { - Name: "int", - Type: arrow.PrimitiveTypes.Uint32, - Nullable: false, - }, - { - Name: "long", - Type: arrow.PrimitiveTypes.Uint64, - Nullable: false, - }, - { - Name: "float", - Type: arrow.PrimitiveTypes.Float32, - Nullable: false, - }, - { - Name: "double", - Type: arrow.PrimitiveTypes.Float64, - Nullable: false, - }, - { - Name: "bytes", - Type: arrow.BinaryTypes.Binary, - Nullable: false, - }, - { - Name: "string", - Type: arrow.BinaryTypes.String, - Nullable: false, - }, - }, nil), - "primitives"), - expect: &map[string]*layout.Table{ - "Primitives.Boolean": { - Values: []interface{}{false, true}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Primitives.Int": { - Values: []interface{}{int32(1), int32(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Primitives.Long": { - Values: []interface{}{int64(1), int64(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Primitives.Float": { - Values: []interface{}{float32(1.1), float32(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: 
[]int32{0, 0}, - }, - "Primitives.Double": { - Values: []interface{}{float64(1.1), float64(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Primitives.Bytes": { - Values: []interface{}{base64Str([]byte("foo"), t), base64Str([]byte("bar"), t)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Primitives.String": { - Values: []interface{}{"foo", "bar"}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - }, - err: nil, - }, - - // Nested - { - input: func(s *schema.IntermediateSchema) []interface{} { - pool := memory.NewGoAllocator() - b := array.NewRecordBuilder(pool, s.ArrowSchema) - - b.Field(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) - b.Field(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, []bool{true, true}) - b.Field(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) - b.Field(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) - b.Field(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) - b.Field(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, true}) - b.Field(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) - sb := b.Field(7).(*array.StructBuilder) - sb.AppendValues([]bool{true, true}) - sb.FieldBuilder(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) - sb.FieldBuilder(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, []bool{true, true}) - sb.FieldBuilder(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) - sb.FieldBuilder(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) - sb.FieldBuilder(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) - sb.FieldBuilder(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, 
true}) - sb.FieldBuilder(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) - - return []interface{}{record.NewWrappedRecord(b)} - }, - schema: schema.NewIntermediateSchema( - arrow.NewSchema( - []arrow.Field{ - { - Name: "boolean", - Type: arrow.FixedWidthTypes.Boolean, - Nullable: false, - }, - { - Name: "int", - Type: arrow.PrimitiveTypes.Uint32, - Nullable: false, - }, - { - Name: "long", - Type: arrow.PrimitiveTypes.Uint64, - Nullable: false, - }, - { - Name: "float", - Type: arrow.PrimitiveTypes.Float32, - Nullable: false, - }, - { - Name: "double", - Type: arrow.PrimitiveTypes.Float64, - Nullable: false, - }, - { - Name: "bytes", - Type: arrow.BinaryTypes.Binary, - Nullable: false, - }, - { - Name: "string", - Type: arrow.BinaryTypes.String, - Nullable: false, - }, - { - Name: "record", - Type: arrow.StructOf( - []arrow.Field{ - { - Name: "boolean", - Type: arrow.FixedWidthTypes.Boolean, - Nullable: false, - }, - { - Name: "int", - Type: arrow.PrimitiveTypes.Uint32, - Nullable: false, - }, - { - Name: "long", - Type: arrow.PrimitiveTypes.Uint64, - Nullable: false, - }, - { - Name: "float", - Type: arrow.PrimitiveTypes.Float32, - Nullable: false, - }, - { - Name: "double", - Type: arrow.PrimitiveTypes.Float64, - Nullable: false, - }, - { - Name: "bytes", - Type: arrow.BinaryTypes.Binary, - Nullable: false, - }, - { - Name: "string", - Type: arrow.BinaryTypes.String, - Nullable: false, - }, - }..., - ), - Nullable: false, - }, - }, - nil), - "nested"), - expect: &map[string]*layout.Table{ - "Nested.Boolean": { - Values: []interface{}{false, true}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Int": { - Values: []interface{}{int32(1), int32(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Long": { - Values: []interface{}{int64(1), int64(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Float": { - Values: 
[]interface{}{float32(1.1), float32(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Double": { - Values: []interface{}{float64(1.1), float64(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Bytes": { - Values: []interface{}{base64Str([]byte("foo"), t), base64Str([]byte("bar"), t)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.String": { - Values: []interface{}{"foo", "bar"}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Boolean": { - Values: []interface{}{false, true}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Int": { - Values: []interface{}{int32(1), int32(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Long": { - Values: []interface{}{int64(1), int64(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Float": { - Values: []interface{}{float32(1.1), float32(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Double": { - Values: []interface{}{float64(1.1), float64(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Bytes": { - Values: []interface{}{base64Str([]byte("foo"), t), base64Str([]byte("bar"), t)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.String": { - Values: []interface{}{"foo", "bar"}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - }, - err: nil, - }, - - // Array - { - input: func(s *schema.IntermediateSchema) []interface{} { - pool := memory.NewGoAllocator() - b := array.NewRecordBuilder(pool, s.ArrowSchema) - - b.Field(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) - b.Field(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, 
[]bool{true, true}) - b.Field(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) - b.Field(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) - b.Field(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) - b.Field(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, true}) - b.Field(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) - lb := b.Field(7).(*array.ListBuilder) - sb := lb.ValueBuilder().(*array.StructBuilder) - lb.Append(true) - sb.AppendValues([]bool{true, true}) - sb.FieldBuilder(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) - sb.FieldBuilder(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, []bool{true, true}) - sb.FieldBuilder(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) - sb.FieldBuilder(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) - sb.FieldBuilder(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) - sb.FieldBuilder(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, true}) - sb.FieldBuilder(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) - lb.Append(true) - sb.AppendValues([]bool{true, true}) - sb.FieldBuilder(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) - sb.FieldBuilder(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, []bool{true, true}) - sb.FieldBuilder(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) - sb.FieldBuilder(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) - sb.FieldBuilder(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) - sb.FieldBuilder(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, true}) - 
sb.FieldBuilder(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) - - return []interface{}{record.NewWrappedRecord(b)} - }, - schema: schema.NewIntermediateSchema( - arrow.NewSchema( - []arrow.Field{ - { - Name: "boolean", - Type: arrow.FixedWidthTypes.Boolean, - Nullable: false, - }, - { - Name: "int", - Type: arrow.PrimitiveTypes.Uint32, - Nullable: false, - }, - { - Name: "long", - Type: arrow.PrimitiveTypes.Uint64, - Nullable: false, - }, - { - Name: "float", - Type: arrow.PrimitiveTypes.Float32, - Nullable: false, - }, - { - Name: "double", - Type: arrow.PrimitiveTypes.Float64, - Nullable: false, - }, - { - Name: "bytes", - Type: arrow.BinaryTypes.Binary, - Nullable: false, - }, - { - Name: "string", - Type: arrow.BinaryTypes.String, - Nullable: false, - }, - { - Name: "array", - Type: arrow.ListOf( - arrow.StructOf( - []arrow.Field{ - { - Name: "boolean", - Type: arrow.FixedWidthTypes.Boolean, - Nullable: false, - }, - { - Name: "int", - Type: arrow.PrimitiveTypes.Uint32, - Nullable: false, - }, - { - Name: "long", - Type: arrow.PrimitiveTypes.Uint64, - Nullable: false, - }, - { - Name: "float", - Type: arrow.PrimitiveTypes.Float32, - Nullable: false, - }, - { - Name: "double", - Type: arrow.PrimitiveTypes.Float64, - Nullable: false, - }, - { - Name: "bytes", - Type: arrow.BinaryTypes.Binary, - Nullable: false, - }, - { - Name: "string", - Type: arrow.BinaryTypes.String, - Nullable: false, - }, - }..., - )), - Nullable: false, - }, - }, nil), - "arrays"), - expect: &map[string]*layout.Table{ - "Arrays.Boolean": { - Values: []interface{}{false, true}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Int": { - Values: []interface{}{int32(1), int32(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Long": { - Values: []interface{}{int64(1), int64(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Float": { - Values: 
[]interface{}{float32(1.1), float32(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Double": { - Values: []interface{}{float64(1.1), float64(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Bytes": { - Values: []interface{}{base64Str([]byte("foo"), t), base64Str([]byte("bar"), t)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.String": { - Values: []interface{}{"foo", "bar"}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Array.Boolean": { - Values: []interface{}{false, true, false, true}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - "Arrays.Array.Int": { - Values: []interface{}{int32(1), int32(2), int32(1), int32(2)}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - "Arrays.Array.Long": { - Values: []interface{}{int64(1), int64(2), int64(1), int64(2)}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - "Arrays.Array.Float": { - Values: []interface{}{float32(1.1), float32(2.2), float32(1.1), float32(2.2)}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - "Arrays.Array.Double": { - Values: []interface{}{float64(1.1), float64(2.2), float64(1.1), float64(2.2)}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - "Arrays.Array.Bytes": { - Values: []interface{}{base64Str([]byte("foo"), t), base64Str([]byte("bar"), t), base64Str([]byte("foo"), t), base64Str([]byte("bar"), t)}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - "Arrays.Array.String": { - Values: []interface{}{"foo", "bar", "foo", "bar"}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - }, - err: nil, - }, - } - - for _, c := range cases { - sh, err := 
schema.NewSchemaHandlerFromArrow(*c.schema) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - tables, err := MarshalArrow(c.input(c.schema), 0, 1, sh) - - if err != c.err { - t.Errorf("expected: %v, but actual: %v\n", c.err, err) - } - - for k, v := range *c.expect { - actual := (*tables)[k] - - if !reflect.DeepEqual(actual.Values, v.Values) { - t.Errorf("expected: %v, but actual: %v\n", v.Values, actual.Values) - } - - if !reflect.DeepEqual(actual.DefinitionLevels, v.DefinitionLevels) { - t.Errorf("expected: %v, but actual: %v\n", v.DefinitionLevels, actual.DefinitionLevels) - } - - if !reflect.DeepEqual(actual.RepetitionLevels, v.RepetitionLevels) { - t.Errorf("expected: %v, but actual: %v\n", v.RepetitionLevels, actual.RepetitionLevels) - } - } - } -} diff --git a/parquet/marshal_map.go b/parquet/marshal_map.go deleted file mode 100644 index 7f1a55f..0000000 --- a/parquet/marshal_map.go +++ /dev/null @@ -1,26 +0,0 @@ -package parquet - -import ( - "encoding/json" - - "github.com/xitongsys/parquet-go/layout" - "github.com/xitongsys/parquet-go/marshal" - "github.com/xitongsys/parquet-go/schema" -) - -// MarshalMap converts []map[string]interface{} to parquet tables. -func MarshalMap(sources []interface{}, bgn int, end int, schemaHandler *schema.SchemaHandler) (*map[string]*layout.Table, error) { - jsons := make([]interface{}, 0, end-bgn) - - for _, d := range sources[bgn:end] { - e, err := json.Marshal(d) - if err != nil { - return nil, err - } - jsons = append(jsons, string(e)) - } - - // NOTE: reuse existing JSON marshaler. 
Implementing it ourselves is high cost - // NOTE: it requires redundant map -> json -> map conversions - return marshal.MarshalJSON(jsons, bgn, end, schemaHandler) -} diff --git a/parquet/marshal_map_test.go b/parquet/marshal_map_test.go deleted file mode 100644 index f1005b8..0000000 --- a/parquet/marshal_map_test.go +++ /dev/null @@ -1,588 +0,0 @@ -package parquet - -import ( - "bytes" - "encoding/base64" - "reflect" - "testing" - - "github.com/apache/arrow/go/arrow" - "github.com/reproio/columnify/schema" - "github.com/xitongsys/parquet-go/layout" -) - -func base64Str(d []byte, t *testing.T) string { - var buf bytes.Buffer - encoder := base64.NewEncoder(base64.StdEncoding, &buf) - - _, err := encoder.Write(d) - if err != nil { - t.Fatalf("invalid test case: %v", err) - } - - err = encoder.Close() - if err != nil { - t.Fatalf("invalid test case: %v", err) - } - - return buf.String() -} - -func TestMarshalMap(t *testing.T) { - cases := []struct { - input []interface{} - bgn int - end int - schema *schema.IntermediateSchema - expect *map[string]*layout.Table - err error - }{ - // Only primitives - { - input: []interface{}{ - map[string]interface{}{ - "boolean": false, - "bytes": []byte("foo"), - "double": 1.1, - "float": 1.1, - "int": 1, - "long": 1, - "string": "foo", - }, - map[string]interface{}{ - "boolean": true, - "bytes": []byte("bar"), - "double": 2.2, - "float": 2.2, - "int": 2, - "long": 2, - "string": "bar", - }, - }, - bgn: 0, - end: 2, - schema: schema.NewIntermediateSchema( - arrow.NewSchema( - []arrow.Field{ - { - Name: "boolean", - Type: arrow.FixedWidthTypes.Boolean, - Nullable: false, - }, - { - Name: "int", - Type: arrow.PrimitiveTypes.Uint32, - Nullable: false, - }, - { - Name: "long", - Type: arrow.PrimitiveTypes.Uint64, - Nullable: false, - }, - { - Name: "float", - Type: arrow.PrimitiveTypes.Float32, - Nullable: false, - }, - { - Name: "double", - Type: arrow.PrimitiveTypes.Float64, - Nullable: false, - }, - { - Name: "bytes", - Type: 
arrow.BinaryTypes.Binary, - Nullable: false, - }, - { - Name: "string", - Type: arrow.BinaryTypes.String, - Nullable: false, - }, - }, nil), - "primitives"), - expect: &map[string]*layout.Table{ - "Primitives.Boolean": { - Values: []interface{}{false, true}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Primitives.Int": { - Values: []interface{}{int32(1), int32(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Primitives.Long": { - Values: []interface{}{int64(1), int64(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Primitives.Float": { - Values: []interface{}{float32(1.1), float32(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Primitives.Double": { - Values: []interface{}{float64(1.1), float64(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Primitives.Bytes": { - Values: []interface{}{base64Str([]byte("foo"), t), base64Str([]byte("bar"), t)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Primitives.String": { - Values: []interface{}{"foo", "bar"}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - }, - err: nil, - }, - - // Nested - { - input: []interface{}{ - map[string]interface{}{ - "boolean": false, - "bytes": []byte("foo"), - "double": 1.1, - "float": 1.1, - "int": 1, - "long": 1, - "string": "foo", - "record": map[string]interface{}{ - "boolean": false, - "bytes": []byte("foo"), - "double": 1.1, - "float": 1.1, - "int": 1, - "long": 1, - "string": "foo", - }, - }, - map[string]interface{}{ - "boolean": true, - "bytes": []byte("bar"), - "double": 2.2, - "float": 2.2, - "int": 2, - "long": 2, - "string": "bar", - "record": map[string]interface{}{ - "boolean": true, - "bytes": []byte("bar"), - "double": 2.2, - "float": 2.2, - "int": 2, - "long": 2, - "string": "bar", - }, - }, - }, - bgn: 0, - end: 2, - schema: 
schema.NewIntermediateSchema( - arrow.NewSchema( - []arrow.Field{ - { - Name: "boolean", - Type: arrow.FixedWidthTypes.Boolean, - }, - { - Name: "int", - Type: arrow.PrimitiveTypes.Uint32, - }, - { - Name: "long", - Type: arrow.PrimitiveTypes.Uint64, - }, - { - Name: "float", - Type: arrow.PrimitiveTypes.Float32, - }, - { - Name: "double", - Type: arrow.PrimitiveTypes.Float64, - }, - { - Name: "bytes", - Type: arrow.BinaryTypes.Binary, - }, - { - Name: "string", - Type: arrow.BinaryTypes.String, - }, - { - Name: "record", - Type: arrow.StructOf( - []arrow.Field{ - { - Name: "boolean", - Type: arrow.FixedWidthTypes.Boolean, - Nullable: false, - }, - { - Name: "int", - Type: arrow.PrimitiveTypes.Uint32, - Nullable: false, - }, - { - Name: "long", - Type: arrow.PrimitiveTypes.Uint64, - Nullable: false, - }, - { - Name: "float", - Type: arrow.PrimitiveTypes.Float32, - Nullable: false, - }, - { - Name: "double", - Type: arrow.PrimitiveTypes.Float64, - Nullable: false, - }, - { - Name: "bytes", - Type: arrow.BinaryTypes.Binary, - Nullable: false, - }, - { - Name: "string", - Type: arrow.BinaryTypes.String, - Nullable: false, - }, - }..., - ), - Nullable: false, - }, - }, - nil), - "nested"), - expect: &map[string]*layout.Table{ - "Nested.Boolean": { - Values: []interface{}{false, true}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Int": { - Values: []interface{}{int32(1), int32(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Long": { - Values: []interface{}{int64(1), int64(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Float": { - Values: []interface{}{float32(1.1), float32(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Double": { - Values: []interface{}{float64(1.1), float64(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Bytes": { - Values: 
[]interface{}{base64Str([]byte("foo"), t), base64Str([]byte("bar"), t)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.String": { - Values: []interface{}{"foo", "bar"}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Boolean": { - Values: []interface{}{false, true}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Int": { - Values: []interface{}{int32(1), int32(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Long": { - Values: []interface{}{int64(1), int64(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Float": { - Values: []interface{}{float32(1.1), float32(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Double": { - Values: []interface{}{float64(1.1), float64(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.Bytes": { - Values: []interface{}{base64Str([]byte("foo"), t), base64Str([]byte("bar"), t)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Nested.Record.String": { - Values: []interface{}{"foo", "bar"}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - }, - err: nil, - }, - - // Array - { - input: []interface{}{ - map[string]interface{}{ - "boolean": false, - "bytes": []byte("foo"), - "double": 1.1, - "float": 1.1, - "int": 1, - "long": 1, - "string": "foo", - "array": []interface{}{ - map[string]interface{}{ - "boolean": false, - "bytes": []byte("foo"), - "double": 1.1, - "float": 1.1, - "int": 1, - "long": 1, - "string": "foo", - }, - map[string]interface{}{ - "boolean": true, - "bytes": []byte("bar"), - "double": 2.2, - "float": 2.2, - "int": 2, - "long": 2, - "string": "bar", - }, - }, - }, - map[string]interface{}{ - "boolean": true, - "bytes": []byte("bar"), - "double": 2.2, - 
"float": 2.2, - "int": 2, - "long": 2, - "string": "bar", - "array": []interface{}{ - map[string]interface{}{ - "boolean": false, - "bytes": []byte("foo"), - "double": 1.1, - "float": 1.1, - "int": 1, - "long": 1, - "string": "foo", - }, - map[string]interface{}{ - "boolean": true, - "bytes": []byte("bar"), - "double": 2.2, - "float": 2.2, - "int": 2, - "long": 2, - "string": "bar", - }, - }, - }, - }, - bgn: 0, - end: 2, - schema: schema.NewIntermediateSchema( - arrow.NewSchema( - []arrow.Field{ - { - Name: "boolean", - Type: arrow.FixedWidthTypes.Boolean, - Nullable: false, - }, - { - Name: "int", - Type: arrow.PrimitiveTypes.Uint32, - Nullable: false, - }, - { - Name: "long", - Type: arrow.PrimitiveTypes.Uint64, - Nullable: false, - }, - { - Name: "float", - Type: arrow.PrimitiveTypes.Float32, - Nullable: false, - }, - { - Name: "double", - Type: arrow.PrimitiveTypes.Float64, - Nullable: false, - }, - { - Name: "bytes", - Type: arrow.BinaryTypes.Binary, - Nullable: false, - }, - { - Name: "string", - Type: arrow.BinaryTypes.String, - Nullable: false, - }, - { - Name: "array", - Type: arrow.ListOf( - arrow.StructOf( - []arrow.Field{ - { - Name: "boolean", - Type: arrow.FixedWidthTypes.Boolean, - Nullable: false, - }, - { - Name: "int", - Type: arrow.PrimitiveTypes.Uint32, - Nullable: false, - }, - { - Name: "long", - Type: arrow.PrimitiveTypes.Uint64, - Nullable: false, - }, - { - Name: "float", - Type: arrow.PrimitiveTypes.Float32, - Nullable: false, - }, - { - Name: "double", - Type: arrow.PrimitiveTypes.Float64, - Nullable: false, - }, - { - Name: "bytes", - Type: arrow.BinaryTypes.Binary, - Nullable: false, - }, - { - Name: "string", - Type: arrow.BinaryTypes.String, - Nullable: false, - }, - }..., - )), - Nullable: false, - }, - }, nil), - "arrays"), - expect: &map[string]*layout.Table{ - "Arrays.Boolean": { - Values: []interface{}{false, true}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Int": { - Values: 
[]interface{}{int32(1), int32(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Long": { - Values: []interface{}{int64(1), int64(2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Float": { - Values: []interface{}{float32(1.1), float32(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Double": { - Values: []interface{}{float64(1.1), float64(2.2)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Bytes": { - Values: []interface{}{base64Str([]byte("foo"), t), base64Str([]byte("bar"), t)}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.String": { - Values: []interface{}{"foo", "bar"}, - DefinitionLevels: []int32{0, 0}, - RepetitionLevels: []int32{0, 0}, - }, - "Arrays.Array.Boolean": { - Values: []interface{}{false, true, false, true}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - "Arrays.Array.Int": { - Values: []interface{}{int32(1), int32(2), int32(1), int32(2)}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - "Arrays.Array.Long": { - Values: []interface{}{int64(1), int64(2), int64(1), int64(2)}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - "Arrays.Array.Float": { - Values: []interface{}{float32(1.1), float32(2.2), float32(1.1), float32(2.2)}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - "Arrays.Array.Double": { - Values: []interface{}{float64(1.1), float64(2.2), float64(1.1), float64(2.2)}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - "Arrays.Array.Bytes": { - Values: []interface{}{base64Str([]byte("foo"), t), base64Str([]byte("bar"), t), base64Str([]byte("foo"), t), base64Str([]byte("bar"), t)}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: 
[]int32{0, 1, 0, 1}, - }, - "Arrays.Array.String": { - Values: []interface{}{"foo", "bar", "foo", "bar"}, - DefinitionLevels: []int32{1, 1, 1, 1}, - RepetitionLevels: []int32{0, 1, 0, 1}, - }, - }, - err: nil, - }, - } - - for _, c := range cases { - sh, err := schema.NewSchemaHandlerFromArrow(*c.schema) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - - tables, err := MarshalMap(c.input, c.bgn, c.end, sh) - if err != c.err { - t.Errorf("expected: %v, but actual: %v\n", c.err, err) - } - - for k, v := range *c.expect { - actual := (*tables)[k] - - if !reflect.DeepEqual(actual.Values, v.Values) { - t.Errorf("values: expected: %v, but actual: %v\n", v.Values, actual.Values) - } - - if !reflect.DeepEqual(actual.DefinitionLevels, v.DefinitionLevels) { - t.Errorf("definition levels: expected: %v, but actual: %v\n", v.DefinitionLevels, actual.DefinitionLevels) - } - if !reflect.DeepEqual(actual.RepetitionLevels, v.RepetitionLevels) { - t.Errorf("repetition levels: expected: %v, but actual: %v\n", v.RepetitionLevels, actual.RepetitionLevels) - } - } - } -} diff --git a/parquet/parquet.go b/parquet/parquet.go deleted file mode 100644 index cba88ca..0000000 --- a/parquet/parquet.go +++ /dev/null @@ -1,56 +0,0 @@ -package parquet - -import ( - "errors" - "fmt" - - "github.com/xitongsys/parquet-go/common" - "github.com/xitongsys/parquet-go/layout" - "github.com/xitongsys/parquet-go/schema" -) - -var ( - ErrInvalidParquetSchema = errors.New("invalid parquet schema") - ErrInvalidParquetRecord = errors.New("invalid parquet record") - ErrUnsupportedMethod = errors.New("unsupported method") -) - -// prepareTables returns tables from fields(non record) in schema elements. 
-func prepareTables(schemaHandler *schema.SchemaHandler) (map[string]*layout.Table, error) { - numSchemaElements := len(schemaHandler.SchemaElements) - if len(schemaHandler.Infos) != numSchemaElements { - return nil, fmt.Errorf("sizes of SchemaElement and Infos don't match: %w", ErrInvalidParquetSchema) - } - if len(schemaHandler.MapIndex) != numSchemaElements { - return nil, fmt.Errorf("sizes of SchemaElement and MapIndex don't match: %w", ErrInvalidParquetSchema) - } - - tables := make(map[string]*layout.Table) - for i, e := range schemaHandler.SchemaElements { - if e.GetNumChildren() == 0 { // fields(non record) - pathStr := schemaHandler.IndexMap[int32(i)] - path := common.StrToPath(pathStr) - - maxDefinitionLevel, err := schemaHandler.MaxDefinitionLevel(path) - if err != nil { - return nil, err - } - - maxRepetitionLevel, err := schemaHandler.MaxRepetitionLevel(path) - if err != nil { - return nil, err - } - - tables[pathStr] = &layout.Table{ - Path: path, - MaxDefinitionLevel: maxDefinitionLevel, - MaxRepetitionLevel: maxRepetitionLevel, - RepetitionType: e.GetRepetitionType(), - Schema: schemaHandler.SchemaElements[schemaHandler.MapIndex[pathStr]], - Info: schemaHandler.Infos[i], - } - } - } - - return tables, nil -} diff --git a/parquet/stdio.go b/parquet/stdio.go index 90895e5..9ca5b9f 100644 --- a/parquet/stdio.go +++ b/parquet/stdio.go @@ -1,6 +1,7 @@ package parquet import ( + "errors" "fmt" "io" "os" @@ -8,6 +9,10 @@ import ( "github.com/xitongsys/parquet-go/source" ) +var ( + ErrUnsupportedMethod = errors.New("unsupported method") +) + // stdioFile is an implementation of ParquetFile, just writing data to stdout. 
type stdioFile struct { in io.ReadCloser diff --git a/record/arrow.go b/record/arrow.go index a5647ea..ab8699f 100644 --- a/record/arrow.go +++ b/record/arrow.go @@ -2,10 +2,12 @@ package record import ( "fmt" + "reflect" + "strconv" + "time" "github.com/apache/arrow/go/arrow" "github.com/apache/arrow/go/arrow/array" - "github.com/apache/arrow/go/arrow/memory" ) type WrappedRecord struct { @@ -18,34 +20,32 @@ func NewWrappedRecord(b *array.RecordBuilder) *WrappedRecord { } } -func formatMapToArrowRecord(s *arrow.Schema, maps []map[string]interface{}) (*WrappedRecord, error) { - pool := memory.NewGoAllocator() - b := array.NewRecordBuilder(pool, s) - defer b.Release() - - for _, m := range maps { - for i, f := range s.Fields() { - if v, ok := m[f.Name]; ok { - if _, err := formatMapToArrowField(b.Field(i), f.Type, v); err != nil { - return nil, err - } - } else { - b.Field(i).AppendNull() +func formatMapToArrowRecord(b *array.RecordBuilder, m map[string]interface{}) (*array.RecordBuilder, error) { + for i, f := range b.Schema().Fields() { + if v, ok := m[f.Name]; ok { + if _, err := formatMapToArrowField(b.Field(i), f.Type, f.Nullable, v); err != nil { + return nil, err } + } else if f.Nullable { + b.Field(i).AppendNull() + } else { + return nil, fmt.Errorf("unconvertable record field with type %v, name %v: %w", f.Type, f.Name, ErrUnconvertibleRecord) } } - return NewWrappedRecord(b), nil + return b, nil } func formatMapToArrowStruct(b *array.StructBuilder, s *arrow.StructType, m map[string]interface{}) (*array.StructBuilder, error) { for i, f := range s.Fields() { if v, ok := m[f.Name]; ok { - if _, err := formatMapToArrowField(b.FieldBuilder(i), f.Type, v); err != nil { + if _, err := formatMapToArrowField(b.FieldBuilder(i), f.Type, f.Nullable, v); err != nil { return nil, err } - } else { + } else if f.Nullable { b.FieldBuilder(i).AppendNull() + } else { + return nil, fmt.Errorf("unconvertable struct field with type %v, name %v: %w", f.Type, f.Name, 
ErrUnconvertibleRecord) } } @@ -55,7 +55,8 @@ func formatMapToArrowStruct(b *array.StructBuilder, s *arrow.StructType, m map[s func formatMapToArrowList(b *array.ListBuilder, l *arrow.ListType, list []interface{}) (*array.ListBuilder, error) { for _, e := range list { - if _, err := formatMapToArrowField(b.ValueBuilder(), l.Elem(), e); err != nil { + // NOTE list type always accepts null values? + if _, err := formatMapToArrowField(b.ValueBuilder(), l.Elem(), true, e); err != nil { return nil, err } } @@ -63,15 +64,98 @@ func formatMapToArrowList(b *array.ListBuilder, l *arrow.ListType, list []interf return b, nil } -func formatMapToArrowField(b array.Builder, t arrow.DataType, v interface{}) (array.Builder, error) { +func formatMapToArrowField(b array.Builder, t arrow.DataType, nullable bool, v interface{}) (array.Builder, error) { + if v == nil && nullable { + b.AppendNull() + return b, nil + } + switch t.ID() { case arrow.BOOL: vb, builderOk := b.(*array.BooleanBuilder) - vv, valueOk := v.(bool) - if builderOk && valueOk { + if !builderOk { + return nil, fmt.Errorf("builder %v is wrong: %w", v, ErrUnconvertibleRecord) + } + if vv, valueOk := v.(bool); valueOk { vb.Append(vv) } else { - return nil, fmt.Errorf("unexpected input %v as bool: %w", v, ErrUnconvertibleRecord) + return nil, fmt.Errorf("unexpected input %v typed %v as bool: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) + } + + case arrow.INT32: + vb, builderOk := b.(*array.Int32Builder) + if !builderOk { + return nil, fmt.Errorf("builder %v is wrong: %w", v, ErrUnconvertibleRecord) + } + switch vv := v.(type) { + case int: + vb.Append(int32(vv)) + case int8: + vb.Append(int32(vv)) + case int16: + vb.Append(int32(vv)) + case int32: + vb.Append(int32(vv)) + case int64: + vb.Append(int32(vv)) + case uint: + vb.Append(int32(vv)) + case uint8: + vb.Append(int32(vv)) + case uint16: + vb.Append(int32(vv)) + case uint32: + vb.Append(int32(vv)) + case uint64: + vb.Append(int32(vv)) + case float64: + 
vb.Append(int32(vv)) + case string: + vvv, err := strconv.ParseInt(vv, 10, 32) + if err != nil { + return nil, err + } + vb.Append(int32(vvv)) + default: + return nil, fmt.Errorf("unexpected input %v typed %v as int32: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) + } + + case arrow.INT64: + vb, builderOk := b.(*array.Int64Builder) + if !builderOk { + return nil, fmt.Errorf("builder %v is wrong: %w", v, ErrUnconvertibleRecord) + } + switch vv := v.(type) { + case int: + vb.Append(int64(vv)) + case int8: + vb.Append(int64(vv)) + case int16: + vb.Append(int64(vv)) + case int32: + vb.Append(int64(vv)) + case int64: + vb.Append(int64(vv)) + case uint: + vb.Append(int64(vv)) + case uint8: + vb.Append(int64(vv)) + case uint16: + vb.Append(int64(vv)) + case uint32: + vb.Append(int64(vv)) + case uint64: + vb.Append(int64(vv)) + case float64: + vb.Append(int64(vv)) + case string: + vvv, err := strconv.ParseInt(vv, 10, 64) + if err != nil { + return nil, err + } + vb.Append(vvv) + default: + return nil, fmt.Errorf("unexpected input %v typed %v as int64: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) } case arrow.UINT32: @@ -102,8 +186,14 @@ func formatMapToArrowField(b array.Builder, t arrow.DataType, v interface{}) (ar vb.Append(uint32(vv)) case float64: vb.Append(uint32(vv)) + case string: + vvv, err := strconv.ParseUint(vv, 10, 64) + if err != nil { + return nil, err + } + vb.Append(uint32(vvv)) default: - return nil, fmt.Errorf("unexpected input %v as uint32: %w", v, ErrUnconvertibleRecord) + return nil, fmt.Errorf("unexpected input %v typed %v as uint32: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) } case arrow.UINT64: @@ -130,8 +220,14 @@ func formatMapToArrowField(b array.Builder, t arrow.DataType, v interface{}) (ar vb.Append(uint64(vv)) case float64: vb.Append(uint64(vv)) + case string: + vvv, err := strconv.ParseUint(vv, 10, 64) + if err != nil { + return nil, err + } + vb.Append(vvv) default: - return nil, fmt.Errorf("unexpected input %v as uint64: 
%w", v, ErrUnconvertibleRecord) + return nil, fmt.Errorf("unexpected input %v typed %v as uint64: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) } case arrow.FLOAT32: @@ -141,29 +237,53 @@ func formatMapToArrowField(b array.Builder, t arrow.DataType, v interface{}) (ar } switch vv := v.(type) { case float32: - vb.Append(float32(vv)) + vb.Append(vv) case float64: vb.Append(float32(vv)) + case string: + vvv, err := strconv.ParseFloat(vv, 32) + if err != nil { + return nil, err + } + vb.Append(float32(vvv)) default: - return nil, fmt.Errorf("unexpected input %v as float32: %w", v, ErrUnconvertibleRecord) + return nil, fmt.Errorf("unexpected input %v typed %v as float32: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) } case arrow.FLOAT64: vb, builderOk := b.(*array.Float64Builder) - vv, valueOk := v.(float64) - if builderOk && valueOk { + if !builderOk { + return nil, fmt.Errorf("builder %v is wrong: %w", b, ErrUnconvertibleRecord) + } + switch vv := v.(type) { + case float32: + f64, err := strconv.ParseFloat(fmt.Sprint(vv), 64) + if err != nil { + return nil, err + } + vb.Append(f64) + case float64: vb.Append(vv) - } else { - return nil, fmt.Errorf("unexpected input %v as float64: %w", v, ErrUnconvertibleRecord) + case string: + vvv, err := strconv.ParseFloat(vv, 64) + if err != nil { + return nil, err + } + vb.Append(vvv) + default: + return nil, fmt.Errorf("unexpected input %v typed %v as float64: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) } case arrow.STRING: vb, builderOk := b.(*array.StringBuilder) + if !builderOk { + return nil, fmt.Errorf("builder %v is wrong: %w", v, ErrUnconvertibleRecord) + } vv, valueOk := v.(string) - if builderOk && valueOk { + if valueOk { vb.Append(vv) } else { - return nil, fmt.Errorf("unexpected input %v as string: %w", v, ErrUnconvertibleRecord) + return nil, fmt.Errorf("unexpected input %v typed %v as string: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) } case arrow.BINARY: @@ -177,50 +297,225 @@ func 
formatMapToArrowField(b array.Builder, t arrow.DataType, v interface{}) (ar case []byte: vb.Append(vv) default: - return nil, fmt.Errorf("unexpected input %v as binary: %w", v, ErrUnconvertibleRecord) + return nil, fmt.Errorf("unexpected input %v typed %v as binary: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) + } + + case arrow.DATE32: + vb, builderOk := b.(*array.Date32Builder) + if !builderOk { + return nil, fmt.Errorf("builder %v is wrong: %w", b, ErrUnconvertibleRecord) + } + switch vv := v.(type) { + case int: + vb.Append(arrow.Date32(vv)) + case int8: + vb.Append(arrow.Date32(vv)) + case int16: + vb.Append(arrow.Date32(vv)) + case int32: + vb.Append(arrow.Date32(vv)) + case int64: + vb.Append(arrow.Date32(vv)) + case uint: + vb.Append(arrow.Date32(vv)) + case uint8: + vb.Append(arrow.Date32(vv)) + case uint16: + vb.Append(arrow.Date32(vv)) + case uint32: + vb.Append(arrow.Date32(vv)) + case uint64: + vb.Append(arrow.Date32(vv)) + case float64: + vb.Append(arrow.Date32(vv)) + case time.Time: + _, _, d := vv.Date() + vb.Append(arrow.Date32(d - 1)) + default: + return nil, fmt.Errorf("unexpected input %v typed %v as Date32: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) + } + + case arrow.DATE64: + vb, builderOk := b.(*array.Date64Builder) + if !builderOk { + return nil, fmt.Errorf("builder %v is wrong: %w", b, ErrUnconvertibleRecord) + } + switch vv := v.(type) { + case int: + vb.Append(arrow.Date64(vv)) + case int8: + vb.Append(arrow.Date64(vv)) + case int16: + vb.Append(arrow.Date64(vv)) + case int32: + vb.Append(arrow.Date64(vv)) + case int64: + vb.Append(arrow.Date64(vv)) + case uint: + vb.Append(arrow.Date64(vv)) + case uint8: + vb.Append(arrow.Date64(vv)) + case uint16: + vb.Append(arrow.Date64(vv)) + case uint32: + vb.Append(arrow.Date64(vv)) + case uint64: + vb.Append(arrow.Date64(vv)) + case float64: + vb.Append(arrow.Date64(vv)) + case time.Time: + _, _, d := vv.Date() + vb.Append(arrow.Date64(d - 1)) + default: + return nil, 
fmt.Errorf("unexpected input %v typed %v as Date64: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) + } + + case arrow.TIME32: + vb, builderOk := b.(*array.Time32Builder) + if !builderOk { + return nil, fmt.Errorf("builder %v is wrong: %w", b, ErrUnconvertibleRecord) + } + switch vv := v.(type) { + case int: + vb.Append(arrow.Time32(vv)) + case int8: + vb.Append(arrow.Time32(vv)) + case int16: + vb.Append(arrow.Time32(vv)) + case int32: + vb.Append(arrow.Time32(vv)) + case int64: + vb.Append(arrow.Time32(vv)) + case uint: + vb.Append(arrow.Time32(vv)) + case uint8: + vb.Append(arrow.Time32(vv)) + case uint16: + vb.Append(arrow.Time32(vv)) + case uint32: + vb.Append(arrow.Time32(vv)) + case uint64: + vb.Append(arrow.Time32(vv)) + case float64: + vb.Append(arrow.Time32(vv)) + case time.Duration: + vb.Append(arrow.Time32(vv.Milliseconds())) + default: + return nil, fmt.Errorf("unexpected input %v typed %v as Time32: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) + } + + case arrow.TIME64: + vb, builderOk := b.(*array.Time64Builder) + if !builderOk { + return nil, fmt.Errorf("builder %v is wrong: %w", b, ErrUnconvertibleRecord) + } + switch vv := v.(type) { + case int: + vb.Append(arrow.Time64(vv)) + case int8: + vb.Append(arrow.Time64(vv)) + case int16: + vb.Append(arrow.Time64(vv)) + case int32: + vb.Append(arrow.Time64(vv)) + case int64: + vb.Append(arrow.Time64(vv)) + case uint: + vb.Append(arrow.Time64(vv)) + case uint8: + vb.Append(arrow.Time64(vv)) + case uint16: + vb.Append(arrow.Time64(vv)) + case uint32: + vb.Append(arrow.Time64(vv)) + case uint64: + vb.Append(arrow.Time64(vv)) + case float64: + vb.Append(arrow.Time64(vv)) + case time.Duration: + vb.Append(arrow.Time64(vv.Microseconds())) + default: + return nil, fmt.Errorf("unexpected input %v typed %v as Time64: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) + } + + case arrow.TIMESTAMP: + vb, builderOk := b.(*array.TimestampBuilder) + if !builderOk { + return nil, fmt.Errorf("builder %v is 
wrong: %w", b, ErrUnconvertibleRecord) + } + switch vv := v.(type) { + case int: + vb.Append(arrow.Timestamp(vv)) + case int8: + vb.Append(arrow.Timestamp(vv)) + case int16: + vb.Append(arrow.Timestamp(vv)) + case int32: + vb.Append(arrow.Timestamp(vv)) + case int64: + vb.Append(arrow.Timestamp(vv)) + case uint: + vb.Append(arrow.Timestamp(vv)) + case uint8: + vb.Append(arrow.Timestamp(vv)) + case uint16: + vb.Append(arrow.Timestamp(vv)) + case uint32: + vb.Append(arrow.Timestamp(vv)) + case uint64: + vb.Append(arrow.Timestamp(vv)) + case float64: + vb.Append(arrow.Timestamp(vv)) + case time.Time: + tt, ok := t.(*arrow.TimestampType) + if !ok { + return nil, fmt.Errorf("unexpected type %v as Timestamp: %w", t, ErrUnconvertibleRecord) + } + switch tt.Unit { + case arrow.Millisecond: + vb.Append(arrow.Timestamp(vv.UnixNano() / 1000000)) + case arrow.Microsecond: + vb.Append(arrow.Timestamp(vv.UnixNano() / 1000)) + default: + return nil, fmt.Errorf("unexpected input %v typed %v as Timestamp: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) + } + default: + return nil, fmt.Errorf("unexpected input %v typed %v as Timestamp: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) } case arrow.STRUCT: vb, builderOk := b.(*array.StructBuilder) st, structOk := t.(*arrow.StructType) if builderOk && structOk { - if v != nil { - vb.Append(true) - vv, valueOk := v.(map[string]interface{}) - if !valueOk { - return nil, fmt.Errorf("unexpected input %v as struct: %w", v, ErrUnconvertibleRecord) - } else if _, err := formatMapToArrowStruct(vb, st, vv); err != nil { - return nil, err - } - } else { - vb.Append(false) + vb.Append(true) + vv, valueOk := v.(map[string]interface{}) + if !valueOk { + return nil, fmt.Errorf("unexpected input %v as struct: %w", v, ErrUnconvertibleRecord) + } else if _, err := formatMapToArrowStruct(vb, st, vv); err != nil { + return nil, err } } else { - return nil, fmt.Errorf("unexpected input %v as struct: %w", v, ErrUnconvertibleRecord) + return nil, 
fmt.Errorf("unexpected input %v typed %v as struct: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) } case arrow.LIST: vb, builderOk := b.(*array.ListBuilder) lt, listOk := t.(*arrow.ListType) if builderOk && listOk { - if v != nil { - vb.Append(true) - vv, valueOk := v.([]interface{}) - if !valueOk { - return nil, fmt.Errorf("unexpected input %v as list: %w", v, ErrUnconvertibleRecord) - } - if _, err := formatMapToArrowList(vb, lt, vv); err != nil { - return nil, err - } - } else { - vb.Append(false) + vb.Append(true) + vv, valueOk := v.([]interface{}) + if !valueOk { + return nil, fmt.Errorf("unexpected input %v as list: %w", v, ErrUnconvertibleRecord) + } + if _, err := formatMapToArrowList(vb, lt, vv); err != nil { + return nil, err } } else { - return nil, fmt.Errorf("unexpected input %v as list: %w", v, ErrUnconvertibleRecord) + return nil, fmt.Errorf("unexpected input %v typed %v as list: %w", v, reflect.TypeOf(v), ErrUnconvertibleRecord) } default: - return nil, fmt.Errorf("unconvertable type %v: %w", t.ID(), ErrUnconvertibleRecord) + return nil, fmt.Errorf("unconvertable type %v: %w", t, ErrUnconvertibleRecord) } return b, nil diff --git a/record/arrow_test.go b/record/arrow_test.go index 02eda1d..7381c9f 100644 --- a/record/arrow_test.go +++ b/record/arrow_test.go @@ -430,17 +430,23 @@ func TestNewArrowSchemaFromAvroSchema(t *testing.T) { }, } + pool := memory.NewGoAllocator() for _, c := range cases { expectedRecord := c.expected(c.schema) - actual, err := formatMapToArrowRecord(c.schema.ArrowSchema, c.input) + b := array.NewRecordBuilder(pool, c.schema.ArrowSchema) + defer b.Release() - if err != c.err { - t.Errorf("expected: %v, but actual: %v\n", c.err, err) + for _, v := range c.input { + _, err := formatMapToArrowRecord(b, v) + if err != c.err { + t.Errorf("expected: %v, but actual: %v\n", c.err, err) + } } - if !reflect.DeepEqual(actual, expectedRecord) { - t.Errorf("values: expected: %v, but actual: %v\n", expectedRecord, actual) + r := 
NewWrappedRecord(b) + if !reflect.DeepEqual(r, expectedRecord) { + t.Errorf("values: expected: %v, but actual: %v\n", expectedRecord, r) } } } diff --git a/record/avro.go b/record/avro.go index 156bee7..168907c 100644 --- a/record/avro.go +++ b/record/avro.go @@ -4,6 +4,8 @@ import ( "bytes" "fmt" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" "github.com/reproio/columnify/schema" "github.com/linkedin/goavro/v2" @@ -57,10 +59,30 @@ func FormatAvroToMap(data []byte) ([]map[string]interface{}, error) { } func FormatAvroToArrow(s *schema.IntermediateSchema, data []byte) (*WrappedRecord, error) { - maps, err := FormatAvroToMap(data) + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, s.ArrowSchema) + defer b.Release() + + r, err := goavro.NewOCFReader(bytes.NewReader(data)) if err != nil { return nil, err } - return formatMapToArrowRecord(s.ArrowSchema, maps) + for r.Scan() { + v, err := r.Read() + if err != nil { + return nil, err + } + + m, mapOk := v.(map[string]interface{}) + if !mapOk { + return nil, fmt.Errorf("invalid value %v: %w", v, ErrUnconvertibleRecord) + } + + if _, err = formatMapToArrowRecord(b, flattenAvroUnion(m)); err != nil { + return nil, err + } + } + + return NewWrappedRecord(b), nil } diff --git a/record/avro_test.go b/record/avro_test.go index b68f9e2..35e2991 100644 --- a/record/avro_test.go +++ b/record/avro_test.go @@ -5,6 +5,11 @@ import ( "reflect" "testing" + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/reproio/columnify/schema" + "github.com/linkedin/goavro/v2" ) @@ -136,3 +141,147 @@ func TestFormatAvroToMap(t *testing.T) { } } } + +func TestFormatAvroToArrow(t *testing.T) { + cases := []struct { + input []byte + schema *schema.IntermediateSchema + expected func(s *schema.IntermediateSchema) *WrappedRecord + isErr bool + }{ + { + input: func() []byte { + w := &bytes.Buffer{} + + r, err := 
goavro.NewOCFWriter(goavro.OCFConfig{ + W: w, + Schema: ` +{ + "type": "record", + "name": "Primitives", + "fields" : [ + {"name": "boolean", "type": "boolean"}, + {"name": "int", "type": "int"}, + {"name": "long", "type": "long"}, + {"name": "float", "type": "float"}, + {"name": "double", "type": "double"}, + {"name": "bytes", "type": "bytes"}, + {"name": "string", "type": "string"} + ] +} +`, + }) + if err != nil { + t.Fatal(err) + } + + err = r.Append([]map[string]interface{}{ + { + "boolean": false, + "bytes": string([]byte("foo")), + "double": 1.1, + "float": 1.1, + "int": 1, + "long": 1, + "string": "foo", + }, + { + "boolean": true, + "bytes": string([]byte("bar")), + "double": 2.2, + "float": 2.2, + "int": 2, + "long": 2, + "string": "bar", + }, + }) + if err != nil { + t.Fatal(err) + } + + return w.Bytes() + }(), + schema: schema.NewIntermediateSchema( + arrow.NewSchema( + []arrow.Field{ + { + Name: "boolean", + Type: arrow.FixedWidthTypes.Boolean, + Nullable: false, + }, + { + Name: "int", + Type: arrow.PrimitiveTypes.Uint32, + Nullable: false, + }, + { + Name: "long", + Type: arrow.PrimitiveTypes.Uint64, + Nullable: false, + }, + { + Name: "float", + Type: arrow.PrimitiveTypes.Float32, + Nullable: false, + }, + { + Name: "double", + Type: arrow.PrimitiveTypes.Float64, + Nullable: false, + }, + { + Name: "bytes", + Type: arrow.BinaryTypes.Binary, + Nullable: false, + }, + { + Name: "string", + Type: arrow.BinaryTypes.String, + Nullable: false, + }, + }, nil), + "primitives"), + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, s.ArrowSchema) + defer b.Release() + + b.Field(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) + b.Field(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, []bool{true, true}) + b.Field(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) + 
b.Field(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) + b.Field(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) + b.Field(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, true}) + b.Field(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) + + return NewWrappedRecord(b) + }, + isErr: false, + }, + + // Not avro + { + input: []byte("not-valid-avro"), + schema: schema.NewIntermediateSchema( + arrow.NewSchema([]arrow.Field{}, nil), + ""), + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + return nil + }, + isErr: true, + }, + } + + for _, c := range cases { + actual, err := FormatAvroToArrow(c.schema, c.input) + + if err != nil != c.isErr { + t.Errorf("expected: %v, but actual: %v\n", c.isErr, err) + } + + expectedRecord := c.expected(c.schema) + if !reflect.DeepEqual(actual, expectedRecord) { + t.Errorf("expected: %v, but actual: %v\n", expectedRecord, actual) + } + } +} diff --git a/record/csv.go b/record/csv.go index a12ed71..c5f3f7e 100644 --- a/record/csv.go +++ b/record/csv.go @@ -7,6 +7,9 @@ import ( "strconv" "strings" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/reproio/columnify/schema" ) @@ -89,10 +92,62 @@ func FormatCsvToMap(s *schema.IntermediateSchema, data []byte, delimiter delimit } func FormatCsvToArrow(s *schema.IntermediateSchema, data []byte, delimiter delimiter) (*WrappedRecord, error) { - maps, err := FormatCsvToMap(s, data, delimiter) + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, s.ArrowSchema) + defer b.Release() + + names, err := getFieldNamesFromSchema(s) if err != nil { return nil, err } - return formatMapToArrowRecord(s.ArrowSchema, maps) + reader := csv.NewReader(strings.NewReader(string(data))) + reader.Comma = rune(delimiter) + + numFields := len(names) + for { + values, err := reader.Read() + if err == 
io.EOF { + break + } + if err != nil { + return nil, err + } + + if numFields != len(values) { + return nil, fmt.Errorf("incompleted value %v: %w", values, ErrUnconvertibleRecord) + } + + e := make(map[string]interface{}) + for i, v := range values { + // bool + if v != "0" && v != "1" { + if vv, err := strconv.ParseBool(v); err == nil { + e[names[i]] = vv + continue + } + } + + // int + if vv, err := strconv.ParseInt(v, 10, 64); err == nil { + e[names[i]] = vv + continue + } + + // float + if vv, err := strconv.ParseFloat(v, 64); err == nil { + e[names[i]] = vv + continue + } + + // others; to string + e[names[i]] = v + } + + if _, err := formatMapToArrowRecord(b, e); err != nil { + return nil, err + } + } + + return NewWrappedRecord(b), nil } diff --git a/record/csv_test.go b/record/csv_test.go index a75406f..13c0e25 100644 --- a/record/csv_test.go +++ b/record/csv_test.go @@ -4,6 +4,9 @@ import ( "reflect" "testing" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/apache/arrow/go/arrow" "github.com/reproio/columnify/schema" ) @@ -188,3 +191,179 @@ true 2 2 2.2 2.2 bar bar`), } } } + +func TestFormatCsvToArrow(t *testing.T) { + cases := []struct { + schema *schema.IntermediateSchema + input []byte + delimiter delimiter + expected func(s *schema.IntermediateSchema) *WrappedRecord + isErr bool + }{ + // csv; Primitives + { + schema: schema.NewIntermediateSchema( + arrow.NewSchema( + []arrow.Field{ + { + Name: "boolean", + Type: arrow.FixedWidthTypes.Boolean, + Nullable: false, + }, + { + Name: "int", + Type: arrow.PrimitiveTypes.Uint32, + Nullable: false, + }, + { + Name: "long", + Type: arrow.PrimitiveTypes.Uint64, + Nullable: false, + }, + { + Name: "float", + Type: arrow.PrimitiveTypes.Float32, + Nullable: false, + }, + { + Name: "double", + Type: arrow.PrimitiveTypes.Float64, + Nullable: false, + }, + { + Name: "bytes", + Type: arrow.BinaryTypes.Binary, + Nullable: false, + }, + { + Name: "string", + Type: 
arrow.BinaryTypes.String, + Nullable: false, + }, + }, nil), + "primitives"), + input: []byte(`false,1,1,1.1,1.1,"foo","foo" +true,2,2,2.2,2.2,"bar","bar"`), + delimiter: CsvDelimiter, + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, s.ArrowSchema) + defer b.Release() + + b.Field(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) + b.Field(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, []bool{true, true}) + b.Field(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) + b.Field(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) + b.Field(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) + b.Field(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, true}) + b.Field(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) + + return NewWrappedRecord(b) + }, + isErr: false, + }, + + // tsv; Primitives + { + schema: schema.NewIntermediateSchema( + arrow.NewSchema( + []arrow.Field{ + { + Name: "boolean", + Type: arrow.FixedWidthTypes.Boolean, + Nullable: false, + }, + { + Name: "int", + Type: arrow.PrimitiveTypes.Uint32, + Nullable: false, + }, + { + Name: "long", + Type: arrow.PrimitiveTypes.Uint64, + Nullable: false, + }, + { + Name: "float", + Type: arrow.PrimitiveTypes.Float32, + Nullable: false, + }, + { + Name: "double", + Type: arrow.PrimitiveTypes.Float64, + Nullable: false, + }, + { + Name: "bytes", + Type: arrow.BinaryTypes.Binary, + Nullable: false, + }, + { + Name: "string", + Type: arrow.BinaryTypes.String, + Nullable: false, + }, + }, nil), + "primitives"), + input: []byte(`false 1 1 1.1 1.1 foo foo +true 2 2 2.2 2.2 bar bar`), + delimiter: TsvDelimiter, + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, 
s.ArrowSchema) + defer b.Release() + + b.Field(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) + b.Field(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, []bool{true, true}) + b.Field(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) + b.Field(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) + b.Field(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) + b.Field(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, true}) + b.Field(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) + + return NewWrappedRecord(b) + }, + isErr: false, + }, + + // Not csv + { + schema: schema.NewIntermediateSchema( + arrow.NewSchema([]arrow.Field{}, nil), + "primitives", + ), + input: []byte("not-valid-csv"), + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + return nil + }, + isErr: true, + }, + + // Not tsv + { + schema: schema.NewIntermediateSchema( + arrow.NewSchema([]arrow.Field{}, nil), + "primitives", + ), + input: []byte("not-valid-tsv"), + delimiter: TsvDelimiter, + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + return nil + }, + isErr: true, + }, + } + + for _, c := range cases { + actual, err := FormatCsvToArrow(c.schema, c.input, c.delimiter) + + if err != nil != c.isErr { + t.Errorf("expected: %v, but actual: %v\n", c.isErr, err) + } + + expectedRecord := c.expected(c.schema) + if !reflect.DeepEqual(actual, expectedRecord) { + t.Errorf("expected: %v, but actual: %v\n", expectedRecord, actual) + } + } +} diff --git a/record/jsonl.go b/record/jsonl.go index 48144cd..74a1fe2 100644 --- a/record/jsonl.go +++ b/record/jsonl.go @@ -4,6 +4,9 @@ import ( "encoding/json" "strings" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/reproio/columnify/schema" ) @@ -29,10 +32,25 @@ func FormatJsonlToMap(data 
[]byte) ([]map[string]interface{}, error) { } func FormatJsonlToArrow(s *schema.IntermediateSchema, data []byte) (*WrappedRecord, error) { - maps, err := FormatJsonlToMap(data) - if err != nil { - return nil, err + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, s.ArrowSchema) + defer b.Release() + + for _, l := range strings.Split(string(data), "\n") { + if l == "" { + // skip blank line + continue + } + + var e map[string]interface{} + if err := json.Unmarshal([]byte(l), &e); err != nil { + return nil, err + } + + if _, err := formatMapToArrowRecord(b, e); err != nil { + return nil, err + } } - return formatMapToArrowRecord(s.ArrowSchema, maps) + return NewWrappedRecord(b), nil } diff --git a/record/jsonl_test.go b/record/jsonl_test.go index db07a9e..a5d744e 100644 --- a/record/jsonl_test.go +++ b/record/jsonl_test.go @@ -3,6 +3,11 @@ package record import ( "reflect" "testing" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/reproio/columnify/schema" ) func TestFormatJsonlToMap(t *testing.T) { @@ -60,3 +65,101 @@ func TestFormatJsonlToMap(t *testing.T) { } } } + +func TestFormatJsonlToArrow(t *testing.T) { + cases := []struct { + input []byte + schema *schema.IntermediateSchema + expected func(s *schema.IntermediateSchema) *WrappedRecord + isErr bool + }{ + // Primitives + { + input: []byte( + `{"boolean": false, "int": 1, "long": 1, "float": 1.1, "double": 1.1, "bytes": "foo", "string": "foo"} +{"boolean": true, "int": 2, "long": 2, "float": 2.2, "double": 2.2, "bytes": "bar", "string": "bar"}`, + ), + schema: schema.NewIntermediateSchema( + arrow.NewSchema( + []arrow.Field{ + { + Name: "boolean", + Type: arrow.FixedWidthTypes.Boolean, + Nullable: false, + }, + { + Name: "int", + Type: arrow.PrimitiveTypes.Uint32, + Nullable: false, + }, + { + Name: "long", + Type: arrow.PrimitiveTypes.Uint64, + Nullable: false, + }, + { + Name: "float", + Type: 
arrow.PrimitiveTypes.Float32, + Nullable: false, + }, + { + Name: "double", + Type: arrow.PrimitiveTypes.Float64, + Nullable: false, + }, + { + Name: "bytes", + Type: arrow.BinaryTypes.Binary, + Nullable: false, + }, + { + Name: "string", + Type: arrow.BinaryTypes.String, + Nullable: false, + }, + }, nil), + "primitives"), + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, s.ArrowSchema) + defer b.Release() + + b.Field(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) + b.Field(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, []bool{true, true}) + b.Field(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) + b.Field(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) + b.Field(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) + b.Field(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, true}) + b.Field(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) + + return NewWrappedRecord(b) + }, + isErr: false, + }, + + // Not JSONL + { + input: []byte("not-valid-json"), + schema: schema.NewIntermediateSchema( + arrow.NewSchema([]arrow.Field{}, nil), + ""), + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + return nil + }, + isErr: true, + }, + } + + for _, c := range cases { + actual, err := FormatJsonlToArrow(c.schema, c.input) + + if err != nil != c.isErr { + t.Errorf("expected: %v, but actual: %v\n", c.isErr, err) + } + + expectedRecord := c.expected(c.schema) + if !reflect.DeepEqual(actual, expectedRecord) { + t.Errorf("expected: %v, but actual: %v\n", expectedRecord, actual) + } + } +} diff --git a/record/ltsv.go b/record/ltsv.go index c76c1d9..eabc590 100644 --- a/record/ltsv.go +++ b/record/ltsv.go @@ -4,6 +4,9 @@ import ( "strconv" "strings" + 
"github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/reproio/columnify/schema" "github.com/Songmu/go-ltsv" @@ -54,10 +57,48 @@ func FormatLtsvToMap(data []byte) ([]map[string]interface{}, error) { } func FormatLtsvToArrow(s *schema.IntermediateSchema, data []byte) (*WrappedRecord, error) { - maps, err := FormatLtsvToMap(data) - if err != nil { - return nil, err + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, s.ArrowSchema) + defer b.Release() + + for _, l := range strings.Split(string(data), "\n") { + v := map[string]string{} + + err := ltsv.Unmarshal([]byte(l), &v) + if err != nil { + return nil, err + } + + m := make(map[string]interface{}) + for k, v := range v { + // bool + if v != "0" && v != "1" { + if vv, err := strconv.ParseBool(v); err == nil { + m[k] = vv + continue + } + } + + // int + if vv, err := strconv.ParseInt(v, 10, 64); err == nil { + m[k] = vv + continue + } + + // float + if vv, err := strconv.ParseFloat(v, 64); err == nil { + m[k] = vv + continue + } + + // others; to string + m[k] = v + } + + if _, err := formatMapToArrowRecord(b, m); err != nil { + return nil, err + } } - return formatMapToArrowRecord(s.ArrowSchema, maps) + return NewWrappedRecord(b), nil } diff --git a/record/ltsv_test.go b/record/ltsv_test.go index 3b0ff18..b784d44 100644 --- a/record/ltsv_test.go +++ b/record/ltsv_test.go @@ -3,6 +3,11 @@ package record import ( "reflect" "testing" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/reproio/columnify/schema" ) func TestFormatLtsvToMap(t *testing.T) { @@ -58,3 +63,99 @@ boolean:true int:2 long:2 float:2.2 double:2.2 bytes:bar string:bar`), } } } + +func TestFormatLtsvToArrow(t *testing.T) { + cases := []struct { + input []byte + schema *schema.IntermediateSchema + expected func(s *schema.IntermediateSchema) *WrappedRecord + isErr bool + }{ + // Primitives + { + input: 
[]byte(`boolean:false int:1 long:1 float:1.1 double:1.1 bytes:foo string:foo +boolean:true int:2 long:2 float:2.2 double:2.2 bytes:bar string:bar`), + schema: schema.NewIntermediateSchema( + arrow.NewSchema( + []arrow.Field{ + { + Name: "boolean", + Type: arrow.FixedWidthTypes.Boolean, + Nullable: false, + }, + { + Name: "int", + Type: arrow.PrimitiveTypes.Uint32, + Nullable: false, + }, + { + Name: "long", + Type: arrow.PrimitiveTypes.Uint64, + Nullable: false, + }, + { + Name: "float", + Type: arrow.PrimitiveTypes.Float32, + Nullable: false, + }, + { + Name: "double", + Type: arrow.PrimitiveTypes.Float64, + Nullable: false, + }, + { + Name: "bytes", + Type: arrow.BinaryTypes.Binary, + Nullable: false, + }, + { + Name: "string", + Type: arrow.BinaryTypes.String, + Nullable: false, + }, + }, nil), + "primitives"), + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, s.ArrowSchema) + defer b.Release() + + b.Field(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) + b.Field(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, []bool{true, true}) + b.Field(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) + b.Field(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) + b.Field(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) + b.Field(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, true}) + b.Field(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) + + return NewWrappedRecord(b) + }, + isErr: false, + }, + + // Not LTSV + { + input: []byte("not-valid-ltsv"), + schema: schema.NewIntermediateSchema( + arrow.NewSchema([]arrow.Field{}, nil), + ""), + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + return nil + }, + isErr: true, + }, + } + + for _, c := range cases { + actual, err := 
FormatLtsvToArrow(c.schema, c.input) + + if err != nil != c.isErr { + t.Errorf("expected: %v, but actual: %v\n", c.isErr, err) + } + + expectedRecord := c.expected(c.schema) + if !reflect.DeepEqual(actual, expectedRecord) { + t.Errorf("expected: %v, but actual: %v\n", expectedRecord, actual) + } + } +} diff --git a/record/msgpack.go b/record/msgpack.go index 7200ed2..fe677c8 100644 --- a/record/msgpack.go +++ b/record/msgpack.go @@ -5,6 +5,9 @@ import ( "fmt" "io" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/reproio/columnify/schema" "github.com/vmihailenco/msgpack/v4" @@ -34,10 +37,28 @@ func FormatMsgpackToMap(data []byte) ([]map[string]interface{}, error) { } func FormatMsgpackToArrow(s *schema.IntermediateSchema, data []byte) (*WrappedRecord, error) { - maps, err := FormatMsgpackToMap(data) - if err != nil { - return nil, err + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, s.ArrowSchema) + defer b.Release() + + d := msgpack.NewDecoder(bytes.NewReader(data)) + for { + arr, err := d.DecodeInterface() + if err == io.EOF { + break + } else if err != nil { + return nil, err + } + + m, mapOk := arr.(map[string]interface{}) + if !mapOk { + return nil, fmt.Errorf("invalid input %v: %w", arr, ErrUnconvertibleRecord) + } + + if _, err = formatMapToArrowRecord(b, m); err != nil { + return nil, err + } } - return formatMapToArrowRecord(s.ArrowSchema, maps) + return NewWrappedRecord(b), nil } diff --git a/record/msgpack_test.go b/record/msgpack_test.go index 7206c35..84779db 100644 --- a/record/msgpack_test.go +++ b/record/msgpack_test.go @@ -5,6 +5,11 @@ import ( "errors" "reflect" "testing" + + "github.com/apache/arrow/go/arrow" + "github.com/apache/arrow/go/arrow/array" + "github.com/apache/arrow/go/arrow/memory" + "github.com/reproio/columnify/schema" ) func TestFormatMsgpackToMap(t *testing.T) { @@ -71,3 +76,110 @@ func TestFormatMsgpackToMap(t *testing.T) { } } } + +func 
TestFormatMsgpackToArrow(t *testing.T) { + cases := []struct { + input []byte + schema *schema.IntermediateSchema + expected func(s *schema.IntermediateSchema) *WrappedRecord + err error + }{ + // Primitives + { + // examples/record/primitives.msgpack + input: bytes.Join([][]byte{ + []byte("\x87\xa7\x62\x6f\x6f\x6c\x65\x61\x6e\xc2\xa3\x69\x6e\x74\x01\xa4"), + []byte("\x6c\x6f\x6e\x67\x01\xa5\x66\x6c\x6f\x61\x74\xcb\x3f\xf1\x99\x99"), + []byte("\x99\x99\x99\x9a\xa6\x64\x6f\x75\x62\x6c\x65\xcb\x3f\xf1\x99\x99"), + []byte("\x99\x99\x99\x9a\xa5\x62\x79\x74\x65\x73\xa3\x66\x6f\x6f\xa6\x73"), + []byte("\x74\x72\x69\x6e\x67\xa3\x66\x6f\x6f\x87\xa7\x62\x6f\x6f\x6c\x65"), + []byte("\x61\x6e\xc3\xa3\x69\x6e\x74\x02\xa4\x6c\x6f\x6e\x67\x02\xa5\x66"), + []byte("\x6c\x6f\x61\x74\xcb\x40\x01\x99\x99\x99\x99\x99\x9a\xa6\x64\x6f"), + []byte("\x75\x62\x6c\x65\xcb\x40\x01\x99\x99\x99\x99\x99\x9a\xa5\x62\x79"), + []byte("\x74\x65\x73\xa3\x62\x61\x72\xa6\x73\x74\x72\x69\x6e\x67\xa3\x62"), + []byte("\x61\x72"), + }, []byte("")), + schema: schema.NewIntermediateSchema( + arrow.NewSchema( + []arrow.Field{ + { + Name: "boolean", + Type: arrow.FixedWidthTypes.Boolean, + Nullable: false, + }, + { + Name: "int", + Type: arrow.PrimitiveTypes.Uint32, + Nullable: false, + }, + { + Name: "long", + Type: arrow.PrimitiveTypes.Uint64, + Nullable: false, + }, + { + Name: "float", + Type: arrow.PrimitiveTypes.Float32, + Nullable: false, + }, + { + Name: "double", + Type: arrow.PrimitiveTypes.Float64, + Nullable: false, + }, + { + Name: "bytes", + Type: arrow.BinaryTypes.Binary, + Nullable: false, + }, + { + Name: "string", + Type: arrow.BinaryTypes.String, + Nullable: false, + }, + }, nil), + "primitives"), + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + pool := memory.NewGoAllocator() + b := array.NewRecordBuilder(pool, s.ArrowSchema) + defer b.Release() + + b.Field(0).(*array.BooleanBuilder).AppendValues([]bool{false, true}, []bool{true, true}) + 
b.Field(1).(*array.Uint32Builder).AppendValues([]uint32{1, 2}, []bool{true, true}) + b.Field(2).(*array.Uint64Builder).AppendValues([]uint64{1, 2}, []bool{true, true}) + b.Field(3).(*array.Float32Builder).AppendValues([]float32{1.1, 2.2}, []bool{true, true}) + b.Field(4).(*array.Float64Builder).AppendValues([]float64{1.1, 2.2}, []bool{true, true}) + b.Field(5).(*array.BinaryBuilder).AppendValues([][]byte{[]byte("foo"), []byte("bar")}, []bool{true, true}) + b.Field(6).(*array.StringBuilder).AppendValues([]string{"foo", "bar"}, []bool{true, true}) + + return NewWrappedRecord(b) + }, + err: nil, + }, + + // Not map type + { + input: []byte("\xa7compact"), + schema: schema.NewIntermediateSchema( + arrow.NewSchema([]arrow.Field{}, nil), + ""), + expected: func(s *schema.IntermediateSchema) *WrappedRecord { + return nil + }, + err: ErrUnconvertibleRecord, + }, + } + + for _, c := range cases { + actual, err := FormatMsgpackToArrow(c.schema, c.input) + + if !errors.Is(err, c.err) { + t.Errorf("expected: %v, but actual: %v\n", c.err, err) + } + + expectedRecord := c.expected(c.schema) + if !reflect.DeepEqual(actual, expectedRecord) { + t.Errorf("expected: %v, but actual: %v\n", expectedRecord, actual) + } + } +} diff --git a/schema/avro.go b/schema/avro.go index 74edb1a..81ada46 100644 --- a/schema/avro.go +++ b/schema/avro.go @@ -11,8 +11,8 @@ import ( var ( avroPrimitivesToArrow = map[avro.PrimitiveType]arrow.DataType{ avro.AvroPrimitiveType_Boolean: arrow.FixedWidthTypes.Boolean, - avro.AvroPrimitiveType_Int: arrow.PrimitiveTypes.Uint32, - avro.AvroPrimitiveType_Long: arrow.PrimitiveTypes.Uint64, + avro.AvroPrimitiveType_Int: arrow.PrimitiveTypes.Int32, + avro.AvroPrimitiveType_Long: arrow.PrimitiveTypes.Int64, avro.AvroPrimitiveType_Float: arrow.PrimitiveTypes.Float32, avro.AvroPrimitiveType_Double: arrow.PrimitiveTypes.Float64, avro.AvroPrimitiveType_String: arrow.BinaryTypes.String, diff --git a/schema/avro_test.go b/schema/avro_test.go index 933108f..b02ea93 100644 
--- a/schema/avro_test.go +++ b/schema/avro_test.go @@ -42,12 +42,12 @@ func TestNewArrowSchemaFromAvroSchema(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint32, + Type: arrow.PrimitiveTypes.Int32, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -114,12 +114,12 @@ func TestNewArrowSchemaFromAvroSchema(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint32, + Type: arrow.PrimitiveTypes.Int32, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -153,12 +153,12 @@ func TestNewArrowSchemaFromAvroSchema(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint32, + Type: arrow.PrimitiveTypes.Int32, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -264,12 +264,12 @@ func TestNewArrowSchemaFromAvroSchema(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint32, + Type: arrow.PrimitiveTypes.Int32, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -304,12 +304,12 @@ func TestNewArrowSchemaFromAvroSchema(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint32, + Type: arrow.PrimitiveTypes.Int32, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { diff --git a/schema/bigquery.go b/schema/bigquery.go index 3e23a91..7cdb0b3 100644 --- a/schema/bigquery.go +++ b/schema/bigquery.go @@ -10,14 +10,14 @@ import ( var ( bqPrimitivesToArrow = map[bigquery.FieldType]arrow.DataType{ bigquery.BooleanFieldType: arrow.FixedWidthTypes.Boolean, - bigquery.IntegerFieldType: arrow.PrimitiveTypes.Uint64, + bigquery.IntegerFieldType: arrow.PrimitiveTypes.Int64, bigquery.FloatFieldType: arrow.PrimitiveTypes.Float64, - 
bigquery.NumericFieldType: arrow.PrimitiveTypes.Uint64, bigquery.StringFieldType: arrow.BinaryTypes.String, bigquery.BytesFieldType: arrow.BinaryTypes.Binary, bigquery.DateFieldType: arrow.FixedWidthTypes.Date32, bigquery.TimeFieldType: arrow.FixedWidthTypes.Time64us, bigquery.TimestampFieldType: arrow.FixedWidthTypes.Timestamp_us, + // bigquery.NumericFieldType: Unsupported // bigquery.DateTimeFieldType: Unsupported } ) diff --git a/schema/bigquery_test.go b/schema/bigquery_test.go index bd028c4..c5adfc8 100644 --- a/schema/bigquery_test.go +++ b/schema/bigquery_test.go @@ -64,12 +64,12 @@ func TestNewArrowSchemaFromBigquerySchema(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -189,12 +189,12 @@ func TestNewArrowSchemaFromBigquerySchema(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -228,12 +228,12 @@ func TestNewArrowSchemaFromBigquerySchema(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -357,12 +357,12 @@ func TestNewArrowSchemaFromBigquerySchema(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -397,12 +397,12 @@ func TestNewArrowSchemaFromBigquerySchema(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { Name: "long", - Type: 
arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { diff --git a/schema/parquet.go b/schema/parquet.go index d6d8eb9..641a6da 100644 --- a/schema/parquet.go +++ b/schema/parquet.go @@ -13,8 +13,8 @@ import ( var ( arrowToParquetPrimitiveType = map[arrow.DataType]string{ arrow.FixedWidthTypes.Boolean: "BOOLEAN", - arrow.PrimitiveTypes.Uint32: "INT32", - arrow.PrimitiveTypes.Uint64: "INT64", + arrow.PrimitiveTypes.Int32: "INT32", + arrow.PrimitiveTypes.Int64: "INT64", arrow.PrimitiveTypes.Float32: "FLOAT", arrow.PrimitiveTypes.Float64: "DOUBLE", arrow.BinaryTypes.Binary: "BYTE_ARRAY", diff --git a/schema/parquet_test.go b/schema/parquet_test.go index a88d325..89016cc 100644 --- a/schema/parquet_test.go +++ b/schema/parquet_test.go @@ -29,12 +29,12 @@ func TestNewSchemaHandlerFromArrow(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint32, + Type: arrow.PrimitiveTypes.Int32, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -119,12 +119,12 @@ func TestNewSchemaHandlerFromArrow(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint32, + Type: arrow.PrimitiveTypes.Int32, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -158,12 +158,12 @@ func TestNewSchemaHandlerFromArrow(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint32, + Type: arrow.PrimitiveTypes.Int32, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -293,12 +293,12 @@ func TestNewSchemaHandlerFromArrow(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint32, + Type: arrow.PrimitiveTypes.Int32, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, { @@ -333,12 +333,12 @@ func 
TestNewSchemaHandlerFromArrow(t *testing.T) { }, { Name: "int", - Type: arrow.PrimitiveTypes.Uint32, + Type: arrow.PrimitiveTypes.Int32, Nullable: false, }, { Name: "long", - Type: arrow.PrimitiveTypes.Uint64, + Type: arrow.PrimitiveTypes.Int64, Nullable: false, }, {