From 3e51932ff3795efe4e03b8a12d451d468a0b1142 Mon Sep 17 00:00:00 2001
From: Andy Boothe
Date: Sat, 28 Sep 2024 19:38:57 -0500
Subject: [PATCH] fix #49

---
 .../java/com/sigpwned/chardet4j/Chardet.java  |   4 +-
 .../com/sigpwned/chardet4j/ChardetTest.java   | 186 ++++++++++++++++++
 2 files changed, 188 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/sigpwned/chardet4j/Chardet.java b/src/main/java/com/sigpwned/chardet4j/Chardet.java
index 79576a0..0434ab0 100644
--- a/src/main/java/com/sigpwned/chardet4j/Chardet.java
+++ b/src/main/java/com/sigpwned/chardet4j/Chardet.java
@@ -354,13 +354,13 @@ public static DecodedInputStreamReader decode(InputStream input, String declared
       return new DecodedInputStreamReader(bomed, bomed.bom().get().getCharset());
 
     // If there is no BOM, then read some bytes to detect the charset.
-    final byte[] buf = ByteStreams.readNBytes(input, DECODE_DETECT_BUFSIZE);
+    final byte[] buf = ByteStreams.readNBytes(bomed, DECODE_DETECT_BUFSIZE);
 
     // Note that charset cannot be null, since we check defaultCharset above.
     Charset charset = detectCharset(buf, declaredEncoding).orElse(defaultCharset);
 
     return new DecodedInputStreamReader(
-        new SequenceInputStream(new ByteArrayInputStream(buf), input), charset);
+        new SequenceInputStream(new ByteArrayInputStream(buf), bomed), charset);
   }
 
   /**
diff --git a/src/test/java/com/sigpwned/chardet4j/ChardetTest.java b/src/test/java/com/sigpwned/chardet4j/ChardetTest.java
index 730bd88..65ab933 100644
--- a/src/test/java/com/sigpwned/chardet4j/ChardetTest.java
+++ b/src/test/java/com/sigpwned/chardet4j/ChardetTest.java
@@ -19,17 +19,26 @@
  */
 package com.sigpwned.chardet4j;
 
+import static java.util.Arrays.asList;
+import static java.util.Objects.requireNonNull;
+import static org.hamcrest.CoreMatchers.anyOf;
 import static org.hamcrest.CoreMatchers.is;
 import static org.hamcrest.MatcherAssert.assertThat;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.Reader;
+import java.io.SequenceInputStream;
+import java.io.StringWriter;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.List;
+import java.util.Optional;
 import org.junit.Test;
 import com.google.common.io.CharStreams;
 import com.google.common.io.Resources;
+import com.sigpwned.chardet4j.io.DecodedInputStreamReader;
 
 public class ChardetTest {
   @Test
@@ -169,4 +178,181 @@ public void longTest() throws IOException {
 
     assertThat(charset, is(StandardCharsets.UTF_8));
   }
+
+  public static class TestableCharset {
+    public final boolean standard;
+    public final String charsetName;
+    public final ByteOrderMark bom;
+
+    public TestableCharset(boolean standard, String charsetName, ByteOrderMark bom) {
+      this.standard = standard;
+      this.charsetName = requireNonNull(charsetName);
+      this.bom = requireNonNull(bom);
+    }
+
+    public Optional<Charset> getCharset() {
+      try {
+        return Optional.of(Charset.forName(charsetName));
+      } catch (UnsupportedCharsetException e) {
+        return Optional.empty();
+      }
+    }
+  }
+
+  public static byte[] concat(byte[] xs, byte[] ys) {
+    byte[] zs = new byte[xs.length + ys.length];
+    System.arraycopy(xs, 0, zs, 0, xs.length);
+    System.arraycopy(ys, 0, zs, xs.length, ys.length);
+    return zs;
+  }
+
+  /**
+   * These are the charsets we'll test detection with. We'll test detection with and without a BOM.
+   */
+  public static final List<TestableCharset> DETECT_CHARSET_TEST_CHARSETS =
+      asList(new TestableCharset(true, "UTF-16BE", ByteOrderMark.UTF_16BE),
+          new TestableCharset(true, "UTF-16LE", ByteOrderMark.UTF_16LE),
+          new TestableCharset(true, "UTF-8", ByteOrderMark.UTF_8),
+          new TestableCharset(false, "UTF-32BE", ByteOrderMark.UTF_32BE),
+          new TestableCharset(false, "UTF-32LE", ByteOrderMark.UTF_32LE),
+          new TestableCharset(false, "UTF-1", ByteOrderMark.UTF_1),
+          new TestableCharset(false, "UTF-EBCDIC", ByteOrderMark.UTF_EBCDIC));
+
+  /**
+   * Encode a known text in a variety of charsets and check that each one is detected correctly.
+   */
+  @Test
+  public void detectCharsetTest() throws IOException {
+    // Stopping by Woods on a Snowy Evening, by Robert Frost
+    // We'll encode this in various charsets and decode them
+    // We use a text without diacritics to avoid any issues with encoding. We're not here to test
+    // the correctness of charset implementations, only correct application of same.
+    // Note: The poem is public domain. NOTE(review): "sound’s" contains U+2019, which is not ASCII.
+    final String originalText = "Whose woods these are I think I know. \n"
+        + "His house is in the village though; \n" + "He will not see me stopping here \n"
+        + "To watch his woods fill up with snow. \n" + "\n"
+        + "My little horse must think it queer \n" + "To stop without a farmhouse near \n"
+        + "Between the woods and frozen lake \n" + "The darkest evening of the year. \n" + "\n"
+        + "He gives his harness bells a shake \n" + "To ask if there is some mistake. \n"
+        + "The only other sound’s the sweep \n" + "Of easy wind and downy flake. \n" + "\n"
+        + "The woods are lovely, dark and deep, \n" + "But I have promises to keep, \n"
+        + "And miles to go before I sleep, \n" + "And miles to go before I sleep.";
+
+    // Standard charsets must be supported by the JVM; unsupported non-standard ones are skipped
+    for (TestableCharset testableCharset : DETECT_CHARSET_TEST_CHARSETS) {
+      if (!testableCharset.getCharset().isPresent()) {
+        if (testableCharset.standard)
+          throw new AssertionError(
+              "JVM does not support standard charset " + testableCharset.charsetName);
+        continue;
+      }
+
+      final Charset charset = testableCharset.getCharset().get();
+
+
+      // Make sure we get the right charset when we detect WITHOUT a BOM
+      final byte[] plainEncodedText = originalText.getBytes(charset);
+      final Charset plainDetectedCharset = Chardet.detectCharset(plainEncodedText).get();
+      if (testableCharset.charsetName.equals("UTF-8")) {
+        // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text.
+        assertThat(plainDetectedCharset,
+            anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
+      } else {
+        assertThat(plainDetectedCharset, is(charset));
+      }
+
+      // Make sure we get the right charset when we detect WITH a BOM
+      final byte[] bomEncodedText =
+          concat(testableCharset.bom.getBytes(), originalText.getBytes(charset));
+      final Charset bomDetectedCharset = Chardet.detectCharset(bomEncodedText).get();
+      if (testableCharset.charsetName.equals("UTF-8")) {
+        // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text.
+        assertThat(bomDetectedCharset,
+            anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
+      } else {
+        assertThat(bomDetectedCharset, is(charset));
+      }
+    }
+  }
+
+  /**
+   * These are the charsets we'll test decoding with. We'll test decoding with and without a BOM.
+   */
+  public static final List<TestableCharset> DECODE_TEST_CHARSETS =
+      asList(new TestableCharset(true, "UTF-16BE", ByteOrderMark.UTF_16BE),
+          new TestableCharset(true, "UTF-16LE", ByteOrderMark.UTF_16LE),
+          new TestableCharset(true, "UTF-8", ByteOrderMark.UTF_8),
+          new TestableCharset(false, "UTF-32BE", ByteOrderMark.UTF_32BE),
+          new TestableCharset(false, "UTF-32LE", ByteOrderMark.UTF_32LE),
+          new TestableCharset(false, "UTF-1", ByteOrderMark.UTF_1),
+          new TestableCharset(false, "UTF-EBCDIC", ByteOrderMark.UTF_EBCDIC));
+
+  /**
+   * Test the ability to decode an InputStream
+   * 
+   * @see Chardet#decode(java.io.InputStream, Charset)
+   */
+  @Test
+  public void decodeTest() throws IOException {
+    // Stopping by Woods on a Snowy Evening, by Robert Frost
+    // We'll encode this in various charsets and decode them
+    // We use a text without diacritics to avoid any issues with encoding. We're not here to test
+    // the correctness of charset implementations, only correct application of same.
+    // Note: The poem is public domain. NOTE(review): "sound’s" contains U+2019, which is not ASCII.
+    final String originalText = "Whose woods these are I think I know. \n"
+        + "His house is in the village though; \n" + "He will not see me stopping here \n"
+        + "To watch his woods fill up with snow. \n" + "\n"
+        + "My little horse must think it queer \n" + "To stop without a farmhouse near \n"
+        + "Between the woods and frozen lake \n" + "The darkest evening of the year. \n" + "\n"
+        + "He gives his harness bells a shake \n" + "To ask if there is some mistake. \n"
+        + "The only other sound’s the sweep \n" + "Of easy wind and downy flake. \n" + "\n"
+        + "The woods are lovely, dark and deep, \n" + "But I have promises to keep, \n"
+        + "And miles to go before I sleep, \n" + "And miles to go before I sleep.";
+
+    for (TestableCharset testableCharset : DECODE_TEST_CHARSETS) {
+      if (!testableCharset.getCharset().isPresent()) {
+        if (testableCharset.standard)
+          throw new AssertionError(
+              "JVM does not support standard charset " + testableCharset.charsetName);
+        continue;
+      }
+
+      final Charset charset = testableCharset.getCharset().get();
+
+      final byte[] encodedText = originalText.getBytes(charset);
+
+      // Make sure we get the right charset when we decode WITHOUT a BOM
+      final StringWriter plainWriter = new StringWriter();
+      try (DecodedInputStreamReader plainReader =
+          Chardet.decode(new ByteArrayInputStream(encodedText), charset)) {
+        final Charset detectedCharset = plainReader.charset();
+        if (testableCharset.charsetName.equals("UTF-8")) {
+          // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text.
+          assertThat(detectedCharset,
+              anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
+        } else {
+          assertThat(detectedCharset, is(charset));
+        }
+        CharStreams.copy(plainReader, plainWriter);
+      }
+      assertThat(plainWriter.toString(), is(originalText));
+
+      // Make sure we get the right charset when we decode WITH a BOM
+      final StringWriter bomWriter = new StringWriter();
+      try (DecodedInputStreamReader bomReader = Chardet
+          .decode(new SequenceInputStream(new ByteArrayInputStream(testableCharset.bom.getBytes()),
+              new ByteArrayInputStream(encodedText)), charset)) {
+        final Charset detectedCharset = bomReader.charset();
+        if (testableCharset.charsetName.equals("UTF-8")) {
+          // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text.
+          assertThat(detectedCharset,
+              anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
+        } else {
+          assertThat(detectedCharset, is(charset));
+        }
+        CharStreams.copy(bomReader, bomWriter);
+      }
+      assertThat(bomWriter.toString(), is(originalText));
+    }
+  }
 }