Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

work on new code-gen - generate SCHEMA$ for enum classes #310

Merged
merged 1 commit into from
Apr 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion avro-codegen/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ dependencies {
exclude group: "com.thoughtworks.paranamer", module: "paranamer-ant"
exclude group: "org.slf4j"
}
testImplementation 'net.openhft:compiler:2.3.6'
testImplementation 'net.openhft:compiler:2.4.1'
}

jar {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,22 @@

package com.linkedin.avroutil1.codegen;

import com.linkedin.avroutil1.compatibility.HelperConsts;
import com.linkedin.avroutil1.compatibility.SourceCodeUtils;
import com.linkedin.avroutil1.model.AvroEnumSchema;
import com.linkedin.avroutil1.model.AvroNamedSchema;
import com.linkedin.avroutil1.model.AvroType;
import com.linkedin.avroutil1.writer.avsc.AvscSchemaWriter;
import com.squareup.javapoet.ClassName;
import com.squareup.javapoet.CodeBlock;
import com.squareup.javapoet.FieldSpec;
import com.squareup.javapoet.JavaFile;
import com.squareup.javapoet.TypeSpec;
import javax.lang.model.element.Modifier;
import javax.tools.JavaFileObject;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.StringJoiner;


/**
Expand Down Expand Up @@ -56,13 +63,7 @@ protected JavaFileObject generateSpecificEnum(AvroEnumSchema enumSchema, Specifi
}

//add public final static SCHEMA$
ClassName avroSchemaType = ClassName.get("org.apache.avro", "Schema");
classBuilder.alwaysQualify(avroSchemaType.simpleName()); //no import statements
classBuilder.addField(FieldSpec
.builder(avroSchemaType, "SCHEMA$", Modifier.PUBLIC, Modifier.FINAL, Modifier.STATIC)
.initializer("null") //TODO - provide avsc string here
.build()
);
addSchema$ToGeneratedClass(classBuilder, enumSchema);

//create file object
TypeSpec classSpec = classBuilder.build();
Expand All @@ -73,4 +74,52 @@ protected JavaFileObject generateSpecificEnum(AvroEnumSchema enumSchema, Specifi

return javaFile.toJavaFileObject();
}

/**
* adds "public final static Schema SCHEMA$" field to generated classes for named avro types.
* the field is defined as:
* public final static Schema SCHEMA$ =
* com.linkedin.avroutil1.compatibility.AvroCompatibilityHelper.parse(avsc1, avsc2, avsc3 ...)
* where the arguments are pieces of the input schema's self-contained (fully-inlined) avsc
* representation. java does not allow string literals to be > 64K in size, so large avsc literals
* are chunked and the var-args Helper.parse() is used.
* @param classBuilder builder for a class being generated
* @param classSchema schema of the class being generated
*/
protected void addSchema$ToGeneratedClass(TypeSpec.Builder classBuilder, AvroNamedSchema classSchema) {
ClassName avroSchemaType = ClassName.get("org.apache.avro", "Schema");
classBuilder.alwaysQualify(avroSchemaType.simpleName()); //no import statements

//get fully-inlined single-line avsc from schema
AvscSchemaWriter avscWriter = new AvscSchemaWriter();
String avsc = avscWriter.writeSingle(classSchema).getContents();

//JVM spec spec says string literals cant be over 65535 bytes in size (this isnt simply the
//character count as horrible wide unicode characters could be involved).
//for details see https://docs.oracle.com/javase/specs/jvms/se8/html/jvms-4.html#jvms-4.4.7
//we add some extra safety margin
String parseFormat;
Object[] parseFormatArgs;
if (avsc.getBytes(StandardCharsets.UTF_8).length > 64000) {
//not 100% safe as argument is in characters and should be bytes ...
List<String> chunks = SourceCodeUtils.safeSplit(avsc, 20000);
StringJoiner csv = new StringJoiner(", ");
for (int i = 1; i <= chunks.size(); i++) {
//"$1S, $2S, ... $NS"
csv.add("$" + i + "S");
}
parseFormat = HelperConsts.HELPER_FQCN + ".parse(" + csv + ")";
parseFormatArgs = chunks.toArray(new Object[] {});
} else {
//no need to split anything
parseFormat = HelperConsts.HELPER_FQCN + ".parse($1S)";
parseFormatArgs = new Object[] {avsc};
}
classBuilder.addField(FieldSpec
.builder(avroSchemaType, "SCHEMA$", Modifier.PUBLIC, Modifier.FINAL, Modifier.STATIC)
//TODO - use strict parsing
.initializer(CodeBlock.of(parseFormat, parseFormatArgs))
.build()
radai-rosenblatt marked this conversation as resolved.
Show resolved Hide resolved
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,18 @@ public void testSimpleEnum() throws Exception {

CompilerHelper.assertCompiles(javaSourceFile);
}

@Test
public void testHugeEnum() throws Exception {
radai-rosenblatt marked this conversation as resolved.
Show resolved Hide resolved
String avsc = TestUtil.load("schemas/SimpleEnumWithHugeDoc.avsc");
SpecificRecordClassGenerator generator = new SpecificRecordClassGenerator();
AvscParser parser = new AvscParser();
AvscParseResult result = parser.parse(avsc);
Assert.assertNull(result.getParseError());
AvroEnumSchema enumSchema = (AvroEnumSchema) result.getTopLevelSchema();
Assert.assertNotNull(enumSchema);
JavaFileObject javaSourceFile = generator.generateSpecificRecordClass(enumSchema, SpecificRecordGenerationConfig.BROAD_COMPATIBILITY);

CompilerHelper.assertCompiles(javaSourceFile);
}
}

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ public static String transformParseCalls(

String argToParseCall;
if (largeString && !alreadyVararg) {
List<String> pieces = safeSplit(stringLiteral, MAX_STRING_LITERAL_SIZE);
List<String> pieces = SourceCodeUtils.safeSplit(stringLiteral, MAX_STRING_LITERAL_SIZE);
StringBuilder argBuilder = new StringBuilder(stringLiteral.length()); //at least
argBuilder.append("new StringBuilder()");
for (String piece : pieces) {
Expand Down Expand Up @@ -984,53 +984,4 @@ private static String addImports(String code, Collection<String> importStatement
String newImports = joiner.toString();
return code.substring(0, endOfImports) + "\n" + newImports + "\n" + code.substring(endOfImports);
}

/**
 * Breaks a large java string literal into consecutive pieces of at most
 * maxChunkSize characters each. A cut is never made at a position that
 * escapesNear() reports as being close to an escape sequence or quote; the
 * cut position is moved backwards until it is clear.
 * @param javaStringLiteral large string literal
 * @param maxChunkSize max size of any returned piece, in characters
 * @return pieces that, concatenated in order, reform the argument
 */
static List<String> safeSplit(String javaStringLiteral, int maxChunkSize) {
  List<String> pieces = new ArrayList<>(javaStringLiteral.length() / maxChunkSize);
  String rest = javaStringLiteral;
  while (rest.length() > maxChunkSize) {
    //start at the largest allowed cut and back off while it lands near an escape
    int cut;
    for (cut = maxChunkSize; cut > 0; cut--) {
      if (!escapesNear(rest, cut)) {
        break;
      }
    }
    if (cut <= 0) {
      //should never happen ...
      throw new IllegalStateException("unable to split " + javaStringLiteral);
    }
    pieces.add(rest.substring(0, cut));
    rest = rest.substring(cut);
  }
  //whatever is left over (if anything) fits in a single piece
  if (rest.length() > 0) {
    pieces.add(rest);
  }
  return pieces;
}

/**
 * Tells whether a backslash, double quote or single quote appears at the
 * given index or within the 5 characters before it (a 6 character window,
 * matching the length of a java unicode escape). The character at position 0
 * of the literal is never examined.
 * @param literal string literal to look for escape sequences in
 * @param index index at (and before) which to look for escapes
 * @return true if any such character is found in the window
 */
static boolean escapesNear(String literal, int index) {
  //exclusive lower bound of the scanned window
  int floor = Math.max(0, index - 6);
  //the scan includes index itself because the first char of the next fragment
  //must not be an "interesting" character either
  int pos = index;
  while (pos > floor) {
    switch (literal.charAt(pos)) {
      case '\\':
      case '"':
      case '\'':
        return true;
      default:
        pos--;
    }
  }
  return false;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Copyright 2022 LinkedIn Corp.
* Licensed under the BSD 2-Clause License (the "License").
* See License in the project root for license information.
*/

package com.linkedin.avroutil1.compatibility;

import java.util.ArrayList;
import java.util.List;

/**
 * Static helpers for manipulating java source code (currently: splitting
 * oversized string literals into smaller pieces).
 */
public class SourceCodeUtils {

  /**
   * size (in characters, counting the candidate cut position itself) of the window
   * scanned backwards for escape-related characters. 6 is the length of a java
   * unicode escape (backslash, 'u' and 4 hex digits), the longest common escape.
   */
  private static final int ESCAPE_SCAN_WINDOW = 6;

  private SourceCodeUtils() {
    //util class
  }

  /**
   * splits a large java string literal into smaller pieces in a safe way.
   * by safe we mean it avoids splitting anywhere near an escape sequence or quote,
   * and never between the two halves of a UTF-16 surrogate pair (a lone surrogate
   * cannot be encoded when the pieces are later written out as source code).
   * @param javaStringLiteral large string literal (must not be null)
   * @param maxChunkSize max chunk size in characters, must be positive
   * @return smaller string literals that can be joined to reform the argument.
   *         empty if the argument is empty
   * @throws IllegalArgumentException if maxChunkSize is not positive
   * @throws IllegalStateException if no safe cut position exists within maxChunkSize characters
   * TODO - change this method to calculate chunk sizes in utf-8 bytes
   */
  public static List<String> safeSplit(String javaStringLiteral, int maxChunkSize) {
    if (maxChunkSize <= 0) {
      //fail clearly instead of with a division by zero or a negative list capacity below
      throw new IllegalArgumentException("maxChunkSize must be positive but was " + maxChunkSize);
    }
    String remainder = javaStringLiteral;
    List<String> results = new ArrayList<>(remainder.length() / maxChunkSize + 1);
    while (remainder.length() > maxChunkSize) {
      //cutIndex is always < remainder.length() here, so charAt(cutIndex) is valid
      int cutIndex = maxChunkSize;
      while (cutIndex > 0
          && (escapesNear(remainder, cutIndex) || splitsSurrogatePair(remainder, cutIndex))) {
        cutIndex--;
      }
      if (cutIndex <= 0) {
        //should never happen for realistic chunk sizes. the (potentially huge) literal
        //is deliberately left out of the message
        throw new IllegalStateException("unable to split string literal of length "
            + javaStringLiteral.length() + " into chunks of at most " + maxChunkSize + " characters");
      }
      String piece = remainder.substring(0, cutIndex);
      results.add(piece);
      remainder = remainder.substring(cutIndex);
    }
    if (!remainder.isEmpty()) {
      results.add(remainder);
    }
    return results;
  }

  /**
   * returns true if there's a backslash, double quote or single quote at the given
   * index or within the 5 characters before it (a window of 6 characters in total,
   * the length of a java unicode escape). the character at position 0 of the literal
   * is never examined.
   * @param literal string literal to look for escape sequences in
   * @param index index at (and before) which to look for escapes. must be a valid
   *              index into the literal if positive
   * @return true if any escape-related character is found in the window
   */
  static boolean escapesNear(String literal, int index) {
    //we start at index because we dont want the char at the start of the next fragment
    //to be an "interesting" character either
    for (int i = index; i > Math.max(0, index - ESCAPE_SCAN_WINDOW); i--) {
      char c = literal.charAt(i);
      if (c == '\\' || c == '"' || c == '\'') {
        return true;
      }
    }
    return false;
  }

  /**
   * returns true if cutting the literal right before the given index would separate
   * a high surrogate from the low surrogate that follows it.
   * @param literal string being split
   * @param index candidate cut position (first character of the next piece)
   * @return true if the cut would land inside a surrogate pair
   */
  private static boolean splitsSurrogatePair(String literal, int index) {
    return index > 0
        && index < literal.length()
        && Character.isHighSurrogate(literal.charAt(index - 1))
        && Character.isLowSurrogate(literal.charAt(index));
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,62 +6,12 @@

package com.linkedin.avroutil1.compatibility;

import java.util.Arrays;
import java.util.Collections;
import org.testng.Assert;
import org.testng.annotations.Test;


public class CodeTransformationsTest {

/**
 * Exercises CodeTransformations.safeSplit() with a chunk size of 10 on plain
 * input, input containing a quote, and input containing a unicode escape at
 * various offsets relative to the chunk boundary.
 */
@Test
public void testSafeSplit() {
  //plain text, no escapes anywhere
  assertSplitsInto("1234567890abcdefghij", "1234567890", "abcdefghij");
  assertSplitsInto("1234567890abcdefghijAB", "1234567890", "abcdefghij", "AB");
  assertSplitsInto("1234567890", "1234567890");
  //dont chop at '
  assertSplitsInto("123456789'abcdefghij", "12345678", "9'abcdefgh", "ij");
  //unicode escapes not on the boundary
  assertSplitsInto("xx\\u1234xxxxxxxxxxxx", "xx\\u1234xx", "xxxxxxxxxx");
  assertSplitsInto("xxxx\\u1234xxxxxxxxxx", "xxxx\\u1234", "xxxxxxxxxx");
  //unicode escapes cross the boundary
  assertSplitsInto("xxxxx\\u1234xxxxxxxxx", "xxxx", "x\\u1234xxx", "xxxxxx");
  assertSplitsInto("xxxxxx\\u1234xxxxxxxx", "xxxxx", "x\\u1234xxx", "xxxxx");
  assertSplitsInto("xxxxxxx\\u1234xxxxxxx", "xxxxxx", "x\\u1234xxx", "xxxx");
  assertSplitsInto("xxxxxxxx\\u1234xxxxxx", "xxxxxxx", "x\\u1234xxx", "xxx");
  assertSplitsInto("xxxxxxxxx\\u1234xxxxx", "xxxxxxxx", "x\\u1234xxx", "xx");
  assertSplitsInto("xxxxxxxxxx\\u1234xxxx", "xxxxxxxxx", "x\\u1234xxx", "x");
  //same input asserted a second time
  assertSplitsInto("xxxxxxxxxx\\u1234xxxx", "xxxxxxxxx", "x\\u1234xxx", "x");
  assertSplitsInto("xxxxxxxxxxx\\u1234xxx", "xxxxxxxxxx", "x\\u1234xxx");
}

/**
 * Asserts that splitting the input with a max chunk size of 10 yields exactly the given pieces.
 * @param input string to split
 * @param expectedPieces pieces expected from the split, in order
 */
private static void assertSplitsInto(String input, String... expectedPieces) {
  Assert.assertEquals(Arrays.asList(expectedPieces), CodeTransformations.safeSplit(input, 10));
}

@Test
public void testFindEndOfSchemaDeclaration() {
String normal = "public static final org.apache.avro.Schema SCHEMA$ = whatever(\"{json}\"); fluff";
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Copyright 2022 LinkedIn Corp.
* Licensed under the BSD 2-Clause License (the "License").
* See License in the project root for license information.
*/

package com.linkedin.avroutil1.compatibility;

import org.testng.Assert;
import org.testng.annotations.Test;

import java.util.Arrays;
import java.util.Collections;

/**
 * Tests for SourceCodeUtils.
 */
public class SourceCodeUtilsTest {

  /**
   * Exercises SourceCodeUtils.safeSplit() with a chunk size of 10 on plain
   * input, input containing a quote, and input containing a unicode escape at
   * various offsets relative to the chunk boundary.
   */
  @Test
  public void testSafeSplit() {
    //plain text, no escapes anywhere
    assertSplitsInto("1234567890abcdefghij", "1234567890", "abcdefghij");
    assertSplitsInto("1234567890abcdefghijAB", "1234567890", "abcdefghij", "AB");
    assertSplitsInto("1234567890", "1234567890");
    //dont chop at '
    assertSplitsInto("123456789'abcdefghij", "12345678", "9'abcdefgh", "ij");
    //unicode escapes not on the boundary
    assertSplitsInto("xx\\u1234xxxxxxxxxxxx", "xx\\u1234xx", "xxxxxxxxxx");
    assertSplitsInto("xxxx\\u1234xxxxxxxxxx", "xxxx\\u1234", "xxxxxxxxxx");
    //unicode escapes cross the boundary
    assertSplitsInto("xxxxx\\u1234xxxxxxxxx", "xxxx", "x\\u1234xxx", "xxxxxx");
    assertSplitsInto("xxxxxx\\u1234xxxxxxxx", "xxxxx", "x\\u1234xxx", "xxxxx");
    assertSplitsInto("xxxxxxx\\u1234xxxxxxx", "xxxxxx", "x\\u1234xxx", "xxxx");
    assertSplitsInto("xxxxxxxx\\u1234xxxxxx", "xxxxxxx", "x\\u1234xxx", "xxx");
    assertSplitsInto("xxxxxxxxx\\u1234xxxxx", "xxxxxxxx", "x\\u1234xxx", "xx");
    assertSplitsInto("xxxxxxxxxx\\u1234xxxx", "xxxxxxxxx", "x\\u1234xxx", "x");
    //same input asserted a second time
    assertSplitsInto("xxxxxxxxxx\\u1234xxxx", "xxxxxxxxx", "x\\u1234xxx", "x");
    assertSplitsInto("xxxxxxxxxxx\\u1234xxx", "xxxxxxxxxx", "x\\u1234xxx");
  }

  /**
   * Asserts that splitting the input with a max chunk size of 10 yields exactly the given pieces.
   * @param input string to split
   * @param expectedPieces pieces expected from the split, in order
   */
  private static void assertSplitsInto(String input, String... expectedPieces) {
    Assert.assertEquals(Arrays.asList(expectedPieces), SourceCodeUtils.safeSplit(input, 10));
  }
}
2 changes: 1 addition & 1 deletion helper/tests/helper-tests-common/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies {
api project(":helper:helper")
//we use this module as an easy way to "export" libraries for use by other test modules
api "commons-io:commons-io:2.6"
api "net.openhft:compiler:2.3.6"
api "net.openhft:compiler:2.4.1"

implementation "args4j:args4j:2.33"

Expand Down
2 changes: 1 addition & 1 deletion test-common/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ plugins {
dependencies {
//we use this module as an easy way to "export" libraries for use by other test modules
api "commons-io:commons-io:2.6"
api "net.openhft:compiler:2.3.6"
api "net.openhft:compiler:2.4.1"
api "org.testng:testng:6.14.3"
}