From c9581ce05167d9d40c15b0b0712628b77fc8ccc5 Mon Sep 17 00:00:00 2001 From: andsel Date: Tue, 19 Nov 2024 11:15:02 +0100 Subject: [PATCH 1/2] [Test] add test to verify that BufferedTokenizerExt doesn't apply encoding twice --- .../common/BufferedTokenizerExtTest.java | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtTest.java b/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtTest.java index 5638cffd83b..d43f6f07b88 100644 --- a/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtTest.java +++ b/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtTest.java @@ -28,6 +28,8 @@ import org.logstash.RubyTestBase; import org.logstash.RubyUtil; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.List; import static org.junit.Assert.assertEquals; @@ -88,4 +90,25 @@ public void shouldTokenizeEmptyPayloadWithNewline() { tokens = (RubyArray) sut.extract(context, RubyUtil.RUBY.newString("\n\n\n")); assertEquals(List.of("", "", ""), tokens); } + + @Test + public void shouldRespectTheCharset() { + final String input = new String(new byte[] {(byte) 0xA3}, StandardCharsets.ISO_8859_1); + sut.extract(context, RubyUtil.RUBY.newString(input)); + + // send a delimiter to trigger the output of the only token + RubyArray tokens = (RubyArray) sut.extract(context, RubyUtil.RUBY.newString("\n")); + + String token = (String) tokens.iterator().next(); + assertEquals("£", token); + assertEqualsBytes(new byte[] {(byte) 0xC2, (byte) 0xA3}, token.getBytes(StandardCharsets.UTF_8)); + assertEqualsBytes(new byte[] {(byte) 0xA3}, token.getBytes(StandardCharsets.ISO_8859_1)); + } + + private void assertEqualsBytes(byte[] expected, byte[] actual) { + assertEquals(expected.length, actual.length); + for (int i = 0; i < expected.length; i++) { + assertEquals(expected[i], actual[i]); + } + } } \ No newline at end of file From 
f718ef4c5e80eb8f7f4d4bfab7dad1c7041a6863 Mon Sep 17 00:00:00 2001 From: andsel Date: Tue, 19 Nov 2024 15:34:43 +0100 Subject: [PATCH 2/2] [Test] add test to check encoding preservation, which fails with the new implementation of BufferedTokenizerExt due to JRuby type conversion --- .../org/logstash/common/BufferedTokenizerExtTest.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtTest.java b/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtTest.java index d43f6f07b88..5b68a930317 100644 --- a/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtTest.java +++ b/logstash-core/src/test/java/org/logstash/common/BufferedTokenizerExtTest.java @@ -111,4 +111,14 @@ private void assertEqualsBytes(byte[] expected, byte[] actual) { assertEquals(expected[i], actual[i]); } } + + @Test + public void testEncodingIsPreserved() { + RubyString rubyString = RubyString.newString(RUBY, new byte[]{(byte) 0xA3}); + + IRubyObject rubyInput = rubyString.force_encoding(context, RUBY.newString("ISO8859-1")); + sut.extract(context, rubyInput); + + assertEqualsBytes(new byte[]{(byte) 0xA3}, ((RubyString) sut.flush(context)).getBytes()); + } } \ No newline at end of file