-
Notifications
You must be signed in to change notification settings - Fork 31
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added ingestion by CSV and URL (#14)
* Added ingestion by CSV and URL * path fix
- Loading branch information
Showing
11 changed files
with
4,208 additions
and
1,717 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
// Copyright (c) Roman Atachiants and contributors. All rights reserved. | ||
// Licensed under the MIT license. See LICENSE file in the project root for details. | ||
|
||
package block | ||
|
||
import ( | ||
"bytes" | ||
"encoding/csv" | ||
"io" | ||
|
||
"github.com/kelindar/talaria/internal/column" | ||
"github.com/kelindar/talaria/internal/encoding/typeof" | ||
) | ||
|
||
// FromCSVBy creates a block from a comma-separated file. It repartitions the batch by a given partition key at the same time. | ||
func FromCSVBy(input []byte, partitionBy string, filter *typeof.Schema, computed ...*column.Computed) ([]Block, error) { | ||
const max = 10000000 // 10MB | ||
|
||
rdr := csv.NewReader(bytes.NewReader(input)) | ||
|
||
// Read the header first | ||
r, err := rdr.Read() | ||
header := r | ||
|
||
// Find the partition index | ||
partitionIdx, ok := findString(header, partitionBy) | ||
if !ok { | ||
return nil, errPartitionNotFound | ||
} | ||
|
||
// The resulting set of blocks, repartitioned and chunked | ||
blocks := make([]Block, 0, 128) | ||
|
||
// Create presto columns and iterate | ||
result, size := make(map[string]column.Columns, 16), 0 | ||
for { | ||
r, err = rdr.Read() | ||
if err == io.EOF { | ||
break | ||
} else if err != nil { | ||
return nil, err | ||
} | ||
|
||
if size >= max { | ||
pending, err := makeBlocks(result) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
size = 0 // Reset the size | ||
blocks = append(blocks, pending...) | ||
result = make(map[string]column.Columns, 16) | ||
} | ||
|
||
// Get the partition value, must be a string | ||
partition, ok := convertToString(r[partitionIdx]) | ||
if !ok { | ||
return nil, errPartitionNotFound | ||
} | ||
|
||
// Get the block for that partition | ||
columns, exists := result[partition] | ||
if !exists { | ||
columns = column.MakeColumns(filter) | ||
result[partition] = columns | ||
} | ||
|
||
// Prepare a row for transformation | ||
row := newRow(filter.Clone(), len(r)) | ||
for i, v := range r { | ||
row.Set(header[i], v) | ||
} | ||
|
||
// Append computed columns and fill nulls for the row | ||
size += row.Transform(computed, filter).AppendTo(columns) | ||
size += columns.FillNulls() | ||
} | ||
|
||
// Write the last chunk | ||
last, err := makeBlocks(result) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
blocks = append(blocks, last...) | ||
return blocks, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
// Copyright (c) Roman Atachiants and contributors. All rights reserved. | ||
// Licensed under the MIT license. See LICENSE file in the project root for details. | ||
|
||
package block | ||
|
||
import ( | ||
"io/ioutil" | ||
"testing" | ||
|
||
"github.com/kelindar/talaria/internal/encoding/typeof" | ||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestFromCSV(t *testing.T) { | ||
const testFile = "../../../test/test4.csv" | ||
|
||
o, err := ioutil.ReadFile(testFile) | ||
assert.NotEmpty(t, o) | ||
assert.NoError(t, err) | ||
|
||
b, err := FromCSVBy(o, "raisedCurrency", &typeof.Schema{ | ||
"raisedCurrency": typeof.String, | ||
"raisedAmt": typeof.Float64, | ||
}) | ||
assert.NoError(t, err) | ||
assert.Equal(t, 3, len(b)) | ||
|
||
for _, v := range b { | ||
assert.Contains(t, []string{"EUR", "CAD", "USD"}, string(v.Key)) | ||
} | ||
|
||
v, err := b[0].Select(typeof.Schema{"raisedAmt": typeof.String}) | ||
assert.NoError(t, err) | ||
assert.True(t, v["raisedAmt"].Size() > 0) | ||
assert.Equal(t, typeof.Float64, v["raisedAmt"].Kind()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
// Copyright (c) Roman Atachiants and contributors. All rights reserved. | ||
// Licensed under the MIT license. See LICENSE file in the project root for details. | ||
|
||
package block | ||
|
||
import (
	"context"
	"net/url"
	"path/filepath"
	"strings"

	"github.com/kelindar/loader"
	"github.com/kelindar/talaria/internal/column"
	"github.com/kelindar/talaria/internal/encoding/typeof"
	"github.com/kelindar/talaria/internal/monitor/errors"
)
|
||
// FromURLBy creates a block from a remote url which should be loaded. It repartitions the batch by a given partition key at the same time. | ||
func FromURLBy(uri string, partitionBy string, filter *typeof.Schema, computed ...*column.Computed) ([]Block, error) { | ||
var handler func([]byte, string, *typeof.Schema, ...*column.Computed) ([]Block, error) | ||
switch strings.ToLower(filepath.Ext(uri)) { | ||
case ".orc": | ||
handler = FromOrcBy | ||
case ".csv": | ||
handler = FromCSVBy | ||
default: | ||
return nil, errors.Newf("block: unsupported file extension %s", filepath.Ext(uri)) | ||
} | ||
|
||
l := loader.New() | ||
b, err := l.Load(context.Background(), uri) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
return handler(b, partitionBy, filter, computed...) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
// Copyright (c) Roman Atachiants and contributors. All rights reserved. | ||
// Licensed under the MIT license. See LICENSE file in the project root for details. | ||
|
||
package block | ||
|
||
import ( | ||
"path/filepath" | ||
"testing" | ||
|
||
"github.com/kelindar/talaria/internal/encoding/typeof" | ||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestFromURL(t *testing.T) { | ||
p, err := filepath.Abs("../../../test/test4.csv") | ||
assert.NoError(t, err) | ||
|
||
b, err := FromURLBy("file:///"+p, "raisedCurrency", &typeof.Schema{ | ||
"raisedCurrency": typeof.String, | ||
"raisedAmt": typeof.Float64, | ||
}) | ||
assert.NoError(t, err) | ||
assert.Equal(t, 3, len(b)) | ||
|
||
for _, v := range b { | ||
assert.Contains(t, []string{"EUR", "CAD", "USD"}, string(v.Key)) | ||
} | ||
|
||
v, err := b[0].Select(typeof.Schema{"raisedAmt": typeof.String}) | ||
assert.NoError(t, err) | ||
assert.True(t, v["raisedAmt"].Size() > 0) | ||
assert.Equal(t, typeof.Float64, v["raisedAmt"].Kind()) | ||
} |
Oops, something went wrong.