From 903aa41f9fd2c2807b5cb8800ec777370a8ca68b Mon Sep 17 00:00:00 2001
From: stanbrub
Date: Thu, 12 Oct 2023 20:32:10 -0600
Subject: [PATCH] Added benchmarks for defaults, nulls, and string arrays

---
 .../standard/parquet/ParquetCodecTest.java    | 44 ++++++++++++-------
 .../parquet/ParquetSingleColTest.java         | 44 ++++++++++++++-----
 .../standard/parquet/ParquetTestRunner.java   | 25 ++++++++---
 3 files changed, 82 insertions(+), 31 deletions(-)

diff --git a/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetCodecTest.java b/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetCodecTest.java
index 2f39970b..9f3024e2 100644
--- a/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetCodecTest.java
+++ b/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetCodecTest.java
@@ -8,85 +8,99 @@
  * generated by the "write" tests is used by the "read" tests
  */
 @TestMethodOrder(OrderAnnotation.class)
-public class ParquetCodecTest {
+class ParquetCodecTest {
     final ParquetTestRunner runner = new ParquetTestRunner(this);
-    final String[] usedColumns = {"str10K", "long10K", "int10K", "short10K", "bigDec10K", "array5", "vector5"};
+    final String[] usedColumns = {"str10K", "long10K", "int10K", "short10K", "bigDec10K", "intArr5", "intVec5"};
 
     @BeforeEach
-    public void setup() {
+    void setup() {
         runner.setScaleFactors(5, 1);
     }
 
     @Test
     @Order(1)
-    public void writeMultiColSnappy() {
+    void writeMultiColSnappy() {
         runner.runWriteTest("ParquetWrite- Snappy Multi Col -Static", "SNAPPY", usedColumns);
     }
 
     @Test
     @Order(2)
-    public void readMultiColSnappy() {
+    void readMultiColSnappy() {
         runner.runReadTest("ParquetRead- Snappy Multi Col -Static");
     }
 
     @Test
     @Order(3)
-    public void writeMultiColZstd() {
+    void writeMultiColZstd() {
         runner.runWriteTest("ParquetWrite- Zstd Multi Col -Static", "ZSTD", usedColumns);
     }
 
     @Test
     @Order(4)
-    public void readMultiColZstd() {
+    void readMultiColZstd() {
         runner.runReadTest("ParquetRead- Zstd Multi Col -Static");
     }
 
     @Test
     @Order(5)
-    public void writeMultiColLzo() {
+    void writeMultiColLzo() {
         runner.runWriteTest("ParquetWrite- Lzo Multi Col -Static", "LZO", usedColumns);
     }
 
     @Test
     @Order(6)
-    public void readMultiColLzo() {
+    void readMultiColLzo() {
         runner.runReadTest("ParquetRead- Lzo Multi Col -Static");
     }
 
     @Test
     @Order(7)
-    public void writeMultiColLz4Raw() {
+    void writeMultiColLz4Raw() {
         runner.runWriteTest("ParquetWrite- Lz4Raw Multi Col -Static", "LZ4_RAW", usedColumns);
     }
 
     @Test
     @Order(8)
-    public void readMultiColLz4Raw() {
+    void readMultiColLz4Raw() {
         runner.runReadTest("ParquetRead- Lz4Raw Multi Col -Static");
     }
 
     @Test
     @Order(9)
-    public void writeMultiColGzip() {
+    void writeMultiColGzip() {
         runner.runWriteTest("ParquetWrite- Gzip Multi Col -Static", "GZIP", usedColumns);
     }
 
     @Test
     @Order(10)
-    public void readMultiColGzip() {
+    void readMultiColGzip() {
         runner.runReadTest("ParquetRead- Gzip Multi Col -Static");
     }
 
     @Test
     @Order(11)
-    public void writeMultiColNone() {
+    void writeMultiColNone() {
         runner.runWriteTest("ParquetWrite- No Codec Multi Col -Static", "NONE", usedColumns);
     }
 
     @Test
     @Order(12)
-    public void readMultiColNone() {
+    void readMultiColNone() {
         runner.runReadTest("ParquetRead- No Codec Multi Col -Static");
     }
+
+    @Test
+    @Order(13)
+    void writeMultiColDefaultSnappy() {
+        runner.useParquetDefaultSettings();
+        runner.runWriteTest("ParquetWrite- Snappy Multi Col Defaults -Static", "SNAPPY", usedColumns);
+    }
+
+    @Test
+    @Order(14)
+    void readMultiColDefaultSnappy() {
+        runner.useParquetDefaultSettings();
runner.runReadTest("ParquetRead- Snappy Multi Col Defaults -Static"); + } } diff --git a/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetSingleColTest.java b/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetSingleColTest.java index f3e380e5..540b6b5a 100644 --- a/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetSingleColTest.java +++ b/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetSingleColTest.java @@ -5,49 +5,73 @@ /** * Standard tests for writing single column parquet for different column types. */ -public class ParquetSingleColTest { +class ParquetSingleColTest { final ParquetTestRunner runner = new ParquetTestRunner(this); @Test - public void writeOneStringCol() { + void writeOneStringCol() { runner.setScaleFactors(5, 15); runner.runWriteTest("ParquetWrite- 1 String Col -Static", "SNAPPY", "str10K"); } @Test - public void writeOneBigDecimalCol() { + void writeOneBigDecimalCol() { runner.setScaleFactors(5, 6); runner.runWriteTest("ParquetWrite- 1 Big Decimal Col -Static", "SNAPPY", "bigDec10K"); } @Test - public void writeOneLongCol() { + void writeOneLongCol() { runner.setScaleFactors(5, 15); runner.runWriteTest("ParquetWrite- 1 Long Col -Static", "SNAPPY", "long10K"); } @Test - public void writeOneIntCol() { + void writeOneIntCol() { runner.setScaleFactors(5, 30); runner.runWriteTest("ParquetWrite- 1 Int Col -Static", "SNAPPY", "int10K"); } @Test - public void writeOneShortCol() { + void writeOneShortCol() { runner.setScaleFactors(5, 35); runner.runWriteTest("ParquetWrite- 1 Short Col -Static", "SNAPPY", "short10K"); } @Test - public void writeOneArrayCol() { + void writeOneInt1KArrayCol() { runner.setScaleFactors(0.10, 1); - runner.runWriteTest("ParquetWrite- 1 Int Array Col -Static", "SNAPPY", "array1K"); + runner.runWriteTest("ParquetWrite- 1 Array Col of 1K Ints -Static", "SNAPPY", "intArr1K"); } @Test - public void writeOneVectorCol() { + void writeOneInt1KVectorCol() { runner.setScaleFactors(0.10, 1); - runner.runWriteTest("ParquetWrite- 1 Int Vector Col -Static", "SNAPPY", "vector1K"); + runner.runWriteTest("ParquetWrite- 1 Vector Col of 1K Ints -Static", "SNAPPY", "intVec1K"); + } + + @Test + void writeOneInt5ArrayCol() { + runner.setScaleFactors(2, 4); + runner.runWriteTest("ParquetWrite- 1 Array Col of 5 Ints -Static", "SNAPPY", "intArr5"); + } + + @Test + void writeOneInt5VectorCol() { + runner.setScaleFactors(2, 4); + runner.runWriteTest("ParquetWrite- 1 Vector Col of 5 Ints -Static", "SNAPPY", "intVec5"); + } + + @Test + void writeOneObjectArrayCol() { + runner.setScaleFactors(2, 2); + runner.runWriteTest("ParquetWrite- 1 Array Col of 3 Strings and 2 Nulls -Static", "SNAPPY", "objArr5"); + } + + @Test + void writeOneObjectVectorCol() { + runner.setScaleFactors(2, 1); + runner.runWriteTest("ParquetWrite- 1 Vector Col of 3 String and 2 Nulls -Static", "SNAPPY", "objVec5"); } } diff --git a/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetTestRunner.java b/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetTestRunner.java index 5a034d6d..644bba57 100644 --- a/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetTestRunner.java +++ b/src/it/java/io/deephaven/benchmark/tests/standard/parquet/ParquetTestRunner.java @@ -13,11 +13,13 @@ * Test reading and writing parquet files with various data types and compression codecs. 
  */
 class ParquetTestRunner {
+    final String parquetCfg = "max_dictionary_keys=2000000, max_dictionary_size=20000000, target_page_size=2000000";
     final Object testInst;
     final Bench api;
     private double rowCountFactor = 1;
     private int scaleFactor = 1;
     private long scaleRowCount;
+    private boolean useParquetDefaultSettings = false;
 
     ParquetTestRunner(Object testInst) {
         this.testInst = testInst;
@@ -36,6 +38,14 @@ void setScaleFactors(double rowCountFactor, int scaleFactor) {
         this.scaleRowCount = (long) (api.propertyAsIntegral("scale.row.count", "100000") * rowCountFactor);
         this.scaleFactor = scaleFactor;
     }
+
+    /**
+     * Use the default settings in deephaven-core for parquet dictionary and page size instead of the
+     * defaults used for benchmarks.
+     */
+    void useParquetDefaultSettings() {
+        this.useParquetDefaultSettings = true;
+    }
 
     /**
      * Run a benchmark that measures parquet read performance. This test always runs after a corresponding write test.
@@ -76,8 +86,7 @@ void runWriteTest(String testName, String codec, String... columnNames) {
         bench_api_metrics_snapshot()
         begin_time = time.perf_counter_ns()
         write(
-            source, '/data/source.ptr.parquet', compression_codec_name='${codec}',
-            max_dictionary_keys=2000000, max_dictionary_size=20000000, target_page_size=2000000
+            source, '/data/source.ptr.parquet', compression_codec_name='${codec}'${parquetSettings}
         )
         end_time = time.perf_counter_ns()
         bench_api_metrics_snapshot()
@@ -93,6 +102,7 @@ void runWriteTest(String testName, String codec, String... columnNames) {
         q = q.replace("${scaleFactor}", "" + scaleFactor);
         q = q.replace("${codec}", codec.equalsIgnoreCase("none") ? "UNCOMPRESSED" : codec);
         q = q.replace("${generators}", getGenerators(columnNames));
+        q = q.replace("${parquetSettings}", useParquetDefaultSettings ? "" : (",\n    " + parquetCfg));
         runTest(testName, q);
     }
 
@@ -143,16 +153,19 @@ String getGenerators(String... columnNames) {
     String getGenerator(final String columnName) {
         var array5 = "java.util.stream.IntStream.range((int)(ii % 5),(int)((ii % 5) + 5)).toArray()";
         var array1K = "java.util.stream.IntStream.range((int)(ii % 1000),(int)((ii % 1000) + 1000)).toArray()";
+        var objArr5 = "java.util.stream.Stream.of(`1`,null,`3`,null,`5`).toArray()";
         var gen = switch (columnName) {
             case "str10K" -> "(`` + (ii % 10000))";
             case "long10K" -> "(ii % 10000)";
             case "int10K" -> "((int)(ii % 10000))";
             case "short10K" -> "((short)(ii % 10000))";
             case "bigDec10K" -> "java.math.BigDecimal.valueOf(ii % 10000)";
-            case "array5" -> array5;
-            case "vector5" -> "new io.deephaven.vector.IntVectorDirect(" + array5 + ")";
-            case "array1K" -> array1K;
-            case "vector1K" -> "new io.deephaven.vector.IntVectorDirect(" + array1K + ")";
+            case "intArr5" -> array5;
+            case "intVec5" -> "new io.deephaven.vector.IntVectorDirect(" + array5 + ")";
+            case "intArr1K" -> array1K;
+            case "intVec1K" -> "new io.deephaven.vector.IntVectorDirect(" + array1K + ")";
+            case "objArr5" -> objArr5;
+            case "objVec5" -> "new io.deephaven.vector.ObjectVectorDirect(" + objArr5 + ")";
             default -> throw new RuntimeException("Undefined column: " + columnName);
         };
         return "(ii % 10 == 0) ? null : " + gen;
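
Note (not part of the patch): the effect of the new ${parquetSettings} placeholder in
runWriteTest is easiest to see with the substitution isolated. The sketch below is
hypothetical -- the class name ParquetSettingsExample and the render helper are
illustrative, and the query is trimmed to the write() call -- but parquetCfg and the
useParquetDefaultSettings flag mirror the fields added above.

public class ParquetSettingsExample {
    // Same overrides the benchmark runner pins by default (copied from parquetCfg above)
    static final String parquetCfg =
            "max_dictionary_keys=2000000, max_dictionary_size=20000000, target_page_size=2000000";

    // Renders the write() portion of the Deephaven query the runner builds
    static String render(String codec, boolean useParquetDefaultSettings) {
        var q = """
                write(
                    source, '/data/source.ptr.parquet', compression_codec_name='${codec}'${parquetSettings}
                )""";
        q = q.replace("${codec}", codec.equalsIgnoreCase("none") ? "UNCOMPRESSED" : codec);
        // When default settings are requested, the placeholder collapses to nothing;
        // otherwise the overrides are appended as extra keyword arguments to write()
        q = q.replace("${parquetSettings}", useParquetDefaultSettings ? "" : (",\n    " + parquetCfg));
        return q;
    }

    public static void main(String[] args) {
        System.out.println(render("SNAPPY", false)); // pinned dictionary/page-size overrides
        System.out.println(render("SNAPPY", true));  // deephaven-core's own parquet defaults
    }
}

With useParquetDefaultSettings() in effect, the generated write() call carries no extra
keyword arguments, so deephaven-core's built-in dictionary and page-size defaults apply;
otherwise the overrides in parquetCfg keep benchmark runs comparable across releases.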