Added benchmarks for defaults, nulls, and string arrays
stanbrub committed Oct 13, 2023
1 parent 4586c35 commit 903aa41
Showing 3 changed files with 82 additions and 31 deletions.
ParquetCodecTest.java
@@ -8,85 +8,99 @@
* generated by the "write" tests is used by the "read" tests
*/
@TestMethodOrder(OrderAnnotation.class)
-public class ParquetCodecTest {
+class ParquetCodecTest {
final ParquetTestRunner runner = new ParquetTestRunner(this);
-final String[] usedColumns = {"str10K", "long10K", "int10K", "short10K", "bigDec10K", "array5", "vector5"};
+final String[] usedColumns = {"str10K", "long10K", "int10K", "short10K", "bigDec10K", "intArr5", "intVec5"};

@BeforeEach
-public void setup() {
+void setup() {
runner.setScaleFactors(5, 1);
}

@Test
@Order(1)
-public void writeMultiColSnappy() {
+void writeMultiColSnappy() {
runner.runWriteTest("ParquetWrite- Snappy Multi Col -Static", "SNAPPY", usedColumns);
}

@Test
@Order(2)
-public void readMultiColSnappy() {
+void readMultiColSnappy() {
runner.runReadTest("ParquetRead- Snappy Multi Col -Static");
}

@Test
@Order(3)
-public void writeMultiColZstd() {
+void writeMultiColZstd() {
runner.runWriteTest("ParquetWrite- Zstd Multi Col -Static", "ZSTD", usedColumns);
}

@Test
@Order(4)
-public void readMultiColZstd() {
+void readMultiColZstd() {
runner.runReadTest("ParquetRead- Zstd Multi Col -Static");
}

@Test
@Order(5)
-public void writeMultiColLzo() {
+void writeMultiColLzo() {
runner.runWriteTest("ParquetWrite- Lzo Multi Col -Static", "LZO", usedColumns);
}

@Test
@Order(6)
-public void readMultiColLzo() {
+void readMultiColLzo() {
runner.runReadTest("ParquetRead- Lzo Multi Col -Static");
}

@Test
@Order(7)
-public void writeMultiColLz4Raw() {
+void writeMultiColLz4Raw() {
runner.runWriteTest("ParquetWrite- Lz4Raw Multi Col -Static", "LZ4_RAW", usedColumns);
}

@Test
@Order(8)
-public void readMultiColLz4Raw() {
+void readMultiColLz4Raw() {
runner.runReadTest("ParquetRead- Lz4Raw Multi Col -Static");
}

@Test
@Order(9)
-public void writeMultiColGzip() {
+void writeMultiColGzip() {
runner.runWriteTest("ParquetWrite- Gzip Multi Col -Static", "GZIP", usedColumns);
}

@Test
@Order(10)
-public void readMultiColGzip() {
+void readMultiColGzip() {
runner.runReadTest("ParquetRead- Gzip Multi Col -Static");
}

@Test
@Order(11)
-public void writeMultiColNone() {
+void writeMultiColNone() {
runner.runWriteTest("ParquetWrite- No Codec Multi Col -Static", "NONE", usedColumns);
}

@Test
@Order(12)
-public void readMultiColNone() {
+void readMultiColNone() {
runner.runReadTest("ParquetRead- No Codec Multi Col -Static");
}

+@Test
+@Order(13)
+void writeMultiColDefaultSnappy() {
+runner.useParquetDefaultSettings();
+runner.runWriteTest("ParquetWrite- Snappy Multi Col Defaults -Static", "SNAPPY", usedColumns);
+}
+
+@Test
+@Order(14)
+void readMultiColDefaultSnappy() {
+runner.useParquetDefaultSettings();
+runner.runReadTest("ParquetRead- Snappy Multi Col Defaults -Static");
+}

}
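Each read test above consumes the parquet file produced by its paired write test, which is why the class pins execution order. A minimal sketch of the JUnit 5 pattern in play (the class and method names here are hypothetical):

```java
import org.junit.jupiter.api.MethodOrderer.OrderAnnotation;
import org.junit.jupiter.api.Order;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestMethodOrder;

// @TestMethodOrder(OrderAnnotation.class) makes JUnit run @Order(n) methods
// in ascending order, so a read test can rely on its paired write test's output.
@TestMethodOrder(OrderAnnotation.class)
class WriteThenReadTest {
    @Test
    @Order(1)
    void write() {
        // produce the parquet file (e.g. /data/source.ptr.parquet)
    }

    @Test
    @Order(2)
    void read() {
        // consume the file written by write()
    }
}
```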
ParquetSingleColTest.java
@@ -5,49 +5,73 @@
/**
* Standard tests for writing single column parquet for different column types.
*/
-public class ParquetSingleColTest {
+class ParquetSingleColTest {
final ParquetTestRunner runner = new ParquetTestRunner(this);

@Test
-public void writeOneStringCol() {
+void writeOneStringCol() {
runner.setScaleFactors(5, 15);
runner.runWriteTest("ParquetWrite- 1 String Col -Static", "SNAPPY", "str10K");
}

@Test
-public void writeOneBigDecimalCol() {
+void writeOneBigDecimalCol() {
runner.setScaleFactors(5, 6);
runner.runWriteTest("ParquetWrite- 1 Big Decimal Col -Static", "SNAPPY", "bigDec10K");
}

@Test
-public void writeOneLongCol() {
+void writeOneLongCol() {
runner.setScaleFactors(5, 15);
runner.runWriteTest("ParquetWrite- 1 Long Col -Static", "SNAPPY", "long10K");
}

@Test
-public void writeOneIntCol() {
+void writeOneIntCol() {
runner.setScaleFactors(5, 30);
runner.runWriteTest("ParquetWrite- 1 Int Col -Static", "SNAPPY", "int10K");
}

@Test
-public void writeOneShortCol() {
+void writeOneShortCol() {
runner.setScaleFactors(5, 35);
runner.runWriteTest("ParquetWrite- 1 Short Col -Static", "SNAPPY", "short10K");
}

@Test
-public void writeOneArrayCol() {
+void writeOneInt1KArrayCol() {
runner.setScaleFactors(0.10, 1);
runner.runWriteTest("ParquetWrite- 1 Int Array Col -Static", "SNAPPY", "array1K");
runner.runWriteTest("ParquetWrite- 1 Array Col of 1K Ints -Static", "SNAPPY", "intArr1K");
}

@Test
-public void writeOneVectorCol() {
+void writeOneInt1KVectorCol() {
runner.setScaleFactors(0.10, 1);
runner.runWriteTest("ParquetWrite- 1 Int Vector Col -Static", "SNAPPY", "vector1K");
runner.runWriteTest("ParquetWrite- 1 Vector Col of 1K Ints -Static", "SNAPPY", "intVec1K");
}

+@Test
+void writeOneInt5ArrayCol() {
+runner.setScaleFactors(2, 4);
+runner.runWriteTest("ParquetWrite- 1 Array Col of 5 Ints -Static", "SNAPPY", "intArr5");
+}
+
+@Test
+void writeOneInt5VectorCol() {
+runner.setScaleFactors(2, 4);
+runner.runWriteTest("ParquetWrite- 1 Vector Col of 5 Ints -Static", "SNAPPY", "intVec5");
+}
+
+@Test
+void writeOneObjectArrayCol() {
+runner.setScaleFactors(2, 2);
+runner.runWriteTest("ParquetWrite- 1 Array Col of 3 Strings and 2 Nulls -Static", "SNAPPY", "objArr5");
+}
+
+@Test
+void writeOneObjectVectorCol() {
+runner.setScaleFactors(2, 1);
+runner.runWriteTest("ParquetWrite- 1 Vector Col of 3 Strings and 2 Nulls -Static", "SNAPPY", "objVec5");
+}

}
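The two scale factors used throughout these tests are applied in ParquetTestRunner (below): the first multiplies the base row count, while the second is substituted into the query as ${scaleFactor}. A small sketch of the row-count arithmetic, assuming the default scale.row.count of 100000 (the class name here is hypothetical):

```java
// Mirrors the arithmetic in ParquetTestRunner.setScaleFactors:
// scaleRowCount = (long) (api.propertyAsIntegral("scale.row.count", "100000") * rowCountFactor)
public class ScaleFactorMath {
    public static void main(String[] args) {
        long baseRowCount = 100_000; // assumed default for "scale.row.count"

        // writeOneStringCol: setScaleFactors(5, 15) -> 500000 generated rows
        System.out.println((long) (baseRowCount * 5.0));

        // writeOneInt1KArrayCol: setScaleFactors(0.10, 1) -> 10000 rows of 1K-int arrays
        System.out.println((long) (baseRowCount * 0.10));
    }
}
```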
ParquetTestRunner.java
@@ -13,11 +13,13 @@
* Test reading and writing parquet files with various data types and compression codecs.
*/
class ParquetTestRunner {
+final String parquetCfg = "max_dictionary_keys=2000000, max_dictionary_size=20000000, target_page_size=2000000";
final Object testInst;
final Bench api;
private double rowCountFactor = 1;
private int scaleFactor = 1;
private long scaleRowCount;
+private boolean useParquetDefaultSettings = false;

ParquetTestRunner(Object testInst) {
this.testInst = testInst;
@@ -36,6 +38,14 @@ void setScaleFactors(double rowCountFactor, int scaleFactor) {
this.scaleRowCount = (long) (api.propertyAsIntegral("scale.row.count", "100000") * rowCountFactor);
this.scaleFactor = scaleFactor;
}

+/**
+* Use the default settings in deephaven-core for parquet dictionary and page size instead of the
+* benchmark overrides defined in parquetCfg
+*/
+void useParquetDefaultSettings() {
+this.useParquetDefaultSettings = true;
+}

/**
* Run a benchmark that measures parquet read performance. This test always runs after a corresponding write test.
@@ -76,8 +86,7 @@ void runWriteTest(String testName, String codec, String... columnNames) {
bench_api_metrics_snapshot()
begin_time = time.perf_counter_ns()
write(
-source, '/data/source.ptr.parquet', compression_codec_name='${codec}',
-max_dictionary_keys=2000000, max_dictionary_size=20000000, target_page_size=2000000
+source, '/data/source.ptr.parquet', compression_codec_name='${codec}'${parquetSettings}
)
end_time = time.perf_counter_ns()
bench_api_metrics_snapshot()
@@ -93,6 +102,7 @@ void runWriteTest(String testName, String codec, String... columnNames) {
q = q.replace("${scaleFactor}", "" + scaleFactor);
q = q.replace("${codec}", codec.equalsIgnoreCase("none") ? "UNCOMPRESSED" : codec);
q = q.replace("${generators}", getGenerators(columnNames));
q = q.replace("${parquetSettings}", useParquetDefaultSettings ? "" : (",\n " + parquetCfg));
runTest(testName, q);
}
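The new ${parquetSettings} token controls whether the benchmark's dictionary and page-size overrides are appended to the generated write(...) call: they are included by default and dropped once useParquetDefaultSettings() has been called. A self-contained sketch of that substitution (the class name is hypothetical, and the query is trimmed to one line):

```java
public class ParquetSettingsSubstitution {
    static final String parquetCfg =
            "max_dictionary_keys=2000000, max_dictionary_size=20000000, target_page_size=2000000";

    static String writeCall(String codec, boolean useParquetDefaultSettings) {
        String q = "write(source, '/data/source.ptr.parquet', compression_codec_name='${codec}'${parquetSettings})";
        q = q.replace("${codec}", codec.equalsIgnoreCase("none") ? "UNCOMPRESSED" : codec);
        return q.replace("${parquetSettings}", useParquetDefaultSettings ? "" : (", " + parquetCfg));
    }

    public static void main(String[] args) {
        // Benchmark overrides (the default path):
        System.out.println(writeCall("SNAPPY", false));
        // write(source, ..., compression_codec_name='SNAPPY', max_dictionary_keys=2000000, ...)

        // After runner.useParquetDefaultSettings(), deephaven-core's own defaults apply:
        System.out.println(writeCall("SNAPPY", true));
        // write(source, ..., compression_codec_name='SNAPPY')
    }
}
```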

@@ -143,16 +153,19 @@ String getGenerators(String... columnNames) {
String getGenerator(final String columnName) {
var array5 = "java.util.stream.IntStream.range((int)(ii % 5),(int)((ii % 5) + 5)).toArray()";
var array1K = "java.util.stream.IntStream.range((int)(ii % 1000),(int)((ii % 1000) + 1000)).toArray()";
+var objArr5 = "java.util.stream.Stream.of(`1`,null,`3`,null,`5`).toArray()";
var gen = switch (columnName) {
case "str10K" -> "(`` + (ii % 10000))";
case "long10K" -> "(ii % 10000)";
case "int10K" -> "((int)(ii % 10000))";
case "short10K" -> "((short)(ii % 10000))";
case "bigDec10K" -> "java.math.BigDecimal.valueOf(ii % 10000)";
case "array5" -> array5;
case "vector5" -> "new io.deephaven.vector.IntVectorDirect(" + array5 + ")";
case "array1K" -> array1K;
case "vector1K" -> "new io.deephaven.vector.IntVectorDirect(" + array1K + ")";
case "intArr5" -> array5;
case "intVec5" -> "new io.deephaven.vector.IntVectorDirect(" + array5 + ")";
case "intArr1K" -> array1K;
case "intVec1K" -> "new io.deephaven.vector.IntVectorDirect(" + array1K + ")";
case "objArr5" -> objArr5;
case "objVec5" -> "new io.deephaven.vector.ObjectVectorDirect(" + objArr5 + ")";
default -> throw new RuntimeException("Undefined column: " + columnName);
};
return "(ii % 10 == 0) ? null : " + gen;
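For reference, the new generator expressions evaluate as follows: intArr5 is a five-int window starting at ii % 5, and objArr5 mixes three strings with two nulls (the backticks in the query string are Deephaven string literals, rendered as Java strings here). A quick sketch for a hypothetical row index:

```java
import java.util.Arrays;
import java.util.stream.IntStream;
import java.util.stream.Stream;

public class GeneratorPreview {
    public static void main(String[] args) {
        long ii = 7; // hypothetical row index

        int[] intArr5 = IntStream.range((int) (ii % 5), (int) ((ii % 5) + 5)).toArray();
        Object[] objArr5 = Stream.of("1", null, "3", null, "5").toArray();

        System.out.println(Arrays.toString(intArr5)); // [2, 3, 4, 5, 6]
        System.out.println(Arrays.toString(objArr5)); // [1, null, 3, null, 5]
        // Per the surrounding return statement, every 10th row (ii % 10 == 0)
        // becomes null instead of a generated value.
    }
}
```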
