From 88a00c1991ac2cd5296bf9364e0799203e78b984 Mon Sep 17 00:00:00 2001 From: Karthik Ramgopal Date: Tue, 16 Jan 2024 12:18:48 -0800 Subject: [PATCH] Improve performance of avro codegen (#535) Co-authored-by: Karthik Ramgopal --- .../codegen/own/AvroUtilCodeGenPlugin.java | 122 +++++++----------- .../codegen/SpecificRecordGeneratorUtil.java | 3 +- 2 files changed, 52 insertions(+), 73 deletions(-) diff --git a/avro-builder/builder/src/main/java/com/linkedin/avroutil1/builder/operations/codegen/own/AvroUtilCodeGenPlugin.java b/avro-builder/builder/src/main/java/com/linkedin/avroutil1/builder/operations/codegen/own/AvroUtilCodeGenPlugin.java index e741e9a8..2a9d3dd4 100644 --- a/avro-builder/builder/src/main/java/com/linkedin/avroutil1/builder/operations/codegen/own/AvroUtilCodeGenPlugin.java +++ b/avro-builder/builder/src/main/java/com/linkedin/avroutil1/builder/operations/codegen/own/AvroUtilCodeGenPlugin.java @@ -21,13 +21,11 @@ import com.squareup.javapoet.JavaFile; import java.io.File; import java.nio.file.Path; -import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.HashSet; import java.util.List; import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; import org.slf4j.Logger; @@ -57,85 +55,65 @@ public void createOperations(BuilderPluginContext context) { } private void generateCode(OperationContext opContext) { - //mkdir any output folders that don't exist - if (!config.getOutputSpecificRecordClassesRoot().exists() && !config.getOutputSpecificRecordClassesRoot() - .mkdirs()) { - throw new IllegalStateException( - "unable to create destination folder " + config.getOutputSpecificRecordClassesRoot()); + // Make sure the output folder exists + File outputFolder = config.getOutputSpecificRecordClassesRoot(); + if (!outputFolder.exists() && !outputFolder.mkdirs()) { + throw new IllegalStateException("unable to create output folder " + outputFolder); } + final Path outputDirectoryPath = outputFolder.toPath(); - final AtomicInteger schemaCounter = new AtomicInteger(0); - final int schemaChunkSize = 500; - Collection> allNamedSchemaList = opContext.getAvroSchemas().stream().flatMap(schema -> { - if (schema instanceof AvroNamedSchema) { - return Stream.of((AvroNamedSchema) schema); - } else if (AvroType.UNION.equals(schema.type())) { - return ((AvroUnionSchema) schema).getTypes() - .stream() - .map(schemaOrRef -> (AvroNamedSchema) schemaOrRef.getSchema()); - } else { - return Stream.empty(); - } - }).collect(Collectors.groupingBy(it -> schemaCounter.getAndIncrement() / schemaChunkSize)).values(); - + // Collect all named schemas. + long collectStart = System.currentTimeMillis(); + Collection allNamedSchemas = opContext.getAvroSchemas() + .stream() + .flatMap(schema -> { + if (schema instanceof AvroNamedSchema) { + return Stream.of((AvroNamedSchema) schema); + } else if (AvroType.UNION.equals(schema.type())) { + return ((AvroUnionSchema) schema).getTypes() + .stream() + .filter(schemaOrRef -> schemaOrRef.getSchema() instanceof AvroNamedSchema) + .map(schemaOrRef -> (AvroNamedSchema) schemaOrRef.getSchema()); + } else { + return Stream.empty(); + } + }) + .flatMap(namedSchema -> { + // Collect inner schemas. + return Stream.concat(Stream.of(namedSchema), + SpecificRecordGeneratorUtil.getNestedInternalSchemaList(namedSchema).stream()); + }) + .filter(namedSchema -> { + // Skip codegen if schema is on classpath and config says to skip + return !config.shouldSkipCodegenIfSchemaOnClasspath() || !doesSchemaExistOnClasspath(namedSchema, + opContext.getLookupSchemaSet()); + }) + .collect(Collectors.toMap(AvroNamedSchema::getFullName, Function.identity(), (s1, s2) -> s1)) + .values(); + long collectEnd = System.currentTimeMillis(); + LOGGER.info("Collected {} named avro schemas in {} millis", allNamedSchemas.size(), collectEnd - collectStart); + + // Generate avro binding java classes. long genStart = System.currentTimeMillis(); - final SpecificRecordGenerationConfig generationConfig = SpecificRecordGenerationConfig.getBroadCompatibilitySpecificRecordGenerationConfig( AvroJavaStringRepresentation.fromJson(config.getStringRepresentation().toString()), AvroJavaStringRepresentation.fromJson(config.getMethodStringRepresentation().toString()), config.getMinAvroVersion(), config.isUtf8EncodingPutByIndexEnabled()); - - // Make sure the output folder exists - File outputFolder = config.getOutputSpecificRecordClassesRoot(); - if (!outputFolder.exists() && !outputFolder.mkdirs()) { - throw new IllegalStateException("unable to create output folder " + outputFolder); - } - final Path outputDirectoryPath = outputFolder.toPath(); final SpecificRecordClassGenerator generator = new SpecificRecordClassGenerator(); - - int totalGeneratedClasses = allNamedSchemaList.parallelStream().map(allNamedSchemas -> { - HashSet alreadyGeneratedSchemaNames = new HashSet<>(); - List generatedSpecificClasses = new ArrayList<>(allNamedSchemas.size()); - for (AvroNamedSchema namedSchema : allNamedSchemas) { - try { - if (!alreadyGeneratedSchemaNames.contains(namedSchema.getFullName())) { - // skip codegen if schema is on classpath and config says to skip - if (config.shouldSkipCodegenIfSchemaOnClasspath() && - doesSchemaExistOnClasspath(namedSchema, opContext.getLookupSchemaSet())) { - continue; - } - - //top level schema - alreadyGeneratedSchemaNames.add(namedSchema.getFullName()); - generatedSpecificClasses.add(generator.generateSpecificClass(namedSchema, generationConfig)); - - // generate internal schemas if not already present - List internalSchemaList = - SpecificRecordGeneratorUtil.getNestedInternalSchemaList(namedSchema); - for (AvroNamedSchema namedInternalSchema : internalSchemaList) { - if (!alreadyGeneratedSchemaNames.contains(namedInternalSchema.getFullName())) { - // skip codegen for nested schemas if schema is on classpath and config says to skip - if (config.shouldSkipCodegenIfSchemaOnClasspath() && - doesSchemaExistOnClasspath(namedInternalSchema, opContext.getLookupSchemaSet())) { - continue; - } - - generatedSpecificClasses.add(generator.generateSpecificClass(namedInternalSchema, generationConfig)); - alreadyGeneratedSchemaNames.add(namedInternalSchema.getFullName()); - } - } - } - } catch (Exception e) { - throw new RuntimeException("failed to generate class for " + namedSchema.getFullName(), e); - } + List generatedClasses = allNamedSchemas.parallelStream().map(namedSchema -> { + try { + // Top level schema + return generator.generateSpecificClass(namedSchema, generationConfig); + } catch (Exception e) { + throw new RuntimeException("failed to generate class for " + namedSchema.getFullName(), e); } - writeJavaFilesToDisk(generatedSpecificClasses, outputDirectoryPath); - return generatedSpecificClasses.size(); - }).reduce(0, Integer::sum); - + }).collect(Collectors.toList()); long genEnd = System.currentTimeMillis(); - LOGGER.info("Generated {} java source files in {} millis", totalGeneratedClasses, genEnd - genStart); + LOGGER.info("Generated {} java source files in {} millis", generatedClasses.size(), genEnd - genStart); + + // Write to disk. + writeJavaFilesToDisk(generatedClasses, outputDirectoryPath); } private boolean doesSchemaExistOnClasspath(AvroNamedSchema schema, SchemaSet schemaSet) { @@ -162,7 +140,7 @@ private void writeJavaFilesToDisk(Collection javaFiles, Path outputFol }).reduce(0, Integer::sum); long writeEnd = System.currentTimeMillis(); - LOGGER.info("wrote out {} generated java source files under {} in {} millis", filesWritten, outputFolderPath, + LOGGER.info("Wrote out {} generated java source files under {} in {} millis", filesWritten, outputFolderPath, writeEnd - writeStart); } } diff --git a/avro-codegen/src/main/java/com/linkedin/avroutil1/codegen/SpecificRecordGeneratorUtil.java b/avro-codegen/src/main/java/com/linkedin/avroutil1/codegen/SpecificRecordGeneratorUtil.java index ad878294..9e9b219d 100644 --- a/avro-codegen/src/main/java/com/linkedin/avroutil1/codegen/SpecificRecordGeneratorUtil.java +++ b/avro-codegen/src/main/java/com/linkedin/avroutil1/codegen/SpecificRecordGeneratorUtil.java @@ -24,6 +24,7 @@ import com.squareup.javapoet.TypeName; import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; import java.util.LinkedList; import java.util.List; @@ -206,7 +207,7 @@ public static List getNestedInternalSchemaList(AvroNamedSchema switch (type) { case ENUM: case FIXED: - return new ArrayList<>(); + return Collections.emptyList(); case RECORD: return getNestedInternalSchemaListForRecord((AvroRecordSchema) topLevelSchema); default: