Merge pull request #50 from sigpwned/bug/49/no-longer-detects-without-bom

Fix bug where BOMs were not respected in InputStream decode
sigpwned · Sep 29, 2024 · ed4cb89 · ed4cb89
2 parents d7bb898 + 3e51932
commit ed4cb89
Show file tree

Hide file tree

Showing 2 changed files with 188 additions and 2 deletions.
diff --git a/src/main/java/com/sigpwned/chardet4j/Chardet.java b/src/main/java/com/sigpwned/chardet4j/Chardet.java
@@ -354,13 +354,13 @@ public static DecodedInputStreamReader decode(InputStream input, String declared
       return new DecodedInputStreamReader(bomed, bomed.bom().get().getCharset());
 
     // If there is no BOM, then read some bytes to detect the charset.
-    final byte[] buf = ByteStreams.readNBytes(input, DECODE_DETECT_BUFSIZE);
+    final byte[] buf = ByteStreams.readNBytes(bomed, DECODE_DETECT_BUFSIZE);
 
     // Note that charset cannot be null, since we check defaultCharset above.
     Charset charset = detectCharset(buf, declaredEncoding).orElse(defaultCharset);
 
     return new DecodedInputStreamReader(
-        new SequenceInputStream(new ByteArrayInputStream(buf), input), charset);
+        new SequenceInputStream(new ByteArrayInputStream(buf), bomed), charset);
   }
 
   /**

diff --git a/src/test/java/com/sigpwned/chardet4j/ChardetTest.java b/src/test/java/com/sigpwned/chardet4j/ChardetTest.java
@@ -19,17 +19,26 @@
  */
 package com.sigpwned.chardet4j;
 
+import static java.util.Arrays.asList;
+import static java.util.Objects.requireNonNull;
+import static org.hamcrest.CoreMatchers.anyOf;
 import static org.hamcrest.CoreMatchers.is;
 import static org.hamcrest.MatcherAssert.assertThat;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.Reader;
+import java.io.SequenceInputStream;
+import java.io.StringWriter;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.List;
+import java.util.Optional;
 import org.junit.Test;
 import com.google.common.io.CharStreams;
 import com.google.common.io.Resources;
+import com.sigpwned.chardet4j.io.DecodedInputStreamReader;
 
 public class ChardetTest {
   @Test
@@ -169,4 +178,181 @@ public void longTest() throws IOException {
 
     assertThat(charset, is(StandardCharsets.UTF_8));
   }
+
+  public static class TestableCharset {
+    public final boolean standard;
+    public final String charsetName;
+    public final ByteOrderMark bom;
+
+    public TestableCharset(boolean standard, String charsetName, ByteOrderMark bom) {
+      this.standard = standard;
+      this.charsetName = requireNonNull(charsetName);
+      this.bom = requireNonNull(bom);
+    }
+
+    public Optional<Charset> getCharset() {
+      try {
+        return Optional.of(Charset.forName(charsetName));
+      } catch (UnsupportedCharsetException e) {
+        return Optional.empty();
+      }
+    }
+  }
+
+  public static byte[] concat(byte[] xs, byte[] ys) {
+    byte[] zs = new byte[xs.length + ys.length];
+    System.arraycopy(xs, 0, zs, 0, xs.length);
+    System.arraycopy(ys, 0, zs, xs.length, ys.length);
+    return zs;
+  }
+
+  /**
+   * These are the charsets we'll test detection with. We'll test detection both with and
+   * without a BOM.
+   */
+  public static final List<TestableCharset> DETECT_CHARSET_TEST_CHARSETS =
+      asList(new TestableCharset(true, "UTF-16BE", ByteOrderMark.UTF_16BE),
+          new TestableCharset(true, "UTF-16LE", ByteOrderMark.UTF_16LE),
+          new TestableCharset(true, "UTF-8", ByteOrderMark.UTF_8),
+          new TestableCharset(false, "UTF-32BE", ByteOrderMark.UTF_32BE),
+          new TestableCharset(false, "UTF-32LE", ByteOrderMark.UTF_32LE),
+          new TestableCharset(false, "UTF-1", ByteOrderMark.UTF_1),
+          new TestableCharset(false, "UTF-EBCDIC", ByteOrderMark.UTF_EBCDIC));
+
+  /**
+   * Test a variety of charsets using a known text and detect them
+   */
+  @Test
+  public void detectCharsetTest() throws IOException {
+    // Stopping by Woods on a Snowy Evening, by Robert Frost
+    // We'll encode this in various charsets and decode them
+    // We use a text without diacritics to avoid any issues with encoding. We're not here to test
+    // the correctness of charset implementations, only correct application of same.
+    // Note: The poem is public domain.
+    final String originalText = "Whose woods these are I think I know.   \n"
+        + "His house is in the village though;   \n" + "He will not see me stopping here   \n"
+        + "To watch his woods fill up with snow.   \n" + "\n"
+        + "My little horse must think it queer   \n" + "To stop without a farmhouse near   \n"
+        + "Between the woods and frozen lake   \n" + "The darkest evening of the year.   \n" + "\n"
+        + "He gives his harness bells a shake   \n" + "To ask if there is some mistake.   \n"
+        + "The only other sound’s the sweep   \n" + "Of easy wind and downy flake.   \n" + "\n"
+        + "The woods are lovely, dark and deep,   \n" + "But I have promises to keep,   \n"
+        + "And miles to go before I sleep,   \n" + "And miles to go before I sleep.";
+
+    // Charsets flagged as standard must be supported by the JVM; the rest are skipped if absent
+    for (TestableCharset testableCharset : DETECT_CHARSET_TEST_CHARSETS) {
+      if (!testableCharset.getCharset().isPresent()) {
+        if (testableCharset.standard)
+          throw new AssertionError(
+              "JVM does not support standard charset " + testableCharset.charsetName);
+        continue;
+      }
+
+      final Charset charset = testableCharset.getCharset().get();
+
+
+      // Make sure we get the right charset when we decode WITHOUT a BOM
+      final byte[] plainEncodedText = originalText.getBytes(charset);
+      final Charset plainDetectedCharset = Chardet.detectCharset(plainEncodedText).get();
+      if (testableCharset.charsetName.equals("UTF-8")) {
+        // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text.
+        assertThat(plainDetectedCharset,
+            anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
+      } else {
+        assertThat(plainDetectedCharset, is(charset));
+      }
+
+      // Make sure we get the right charset when we decode WITH a BOM
+      final byte[] bomEncodedText =
+          concat(testableCharset.bom.getBytes(), originalText.getBytes(charset));
+      final Charset bomDetectedCharset = Chardet.detectCharset(bomEncodedText).get();
+      if (testableCharset.charsetName.equals("UTF-8")) {
+        // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text.
+        assertThat(bomDetectedCharset,
+            anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
+      } else {
+        assertThat(bomDetectedCharset, is(charset));
+      }
+    }
+  }
+
+  /**
+   * These are the charsets we'll test decoding with. We'll test decoding with/out a BOM.
+   */
+  public static final List<TestableCharset> DECODE_TEST_CHARSETS =
+      asList(new TestableCharset(true, "UTF-16BE", ByteOrderMark.UTF_16BE),
+          new TestableCharset(true, "UTF-16LE", ByteOrderMark.UTF_16LE),
+          new TestableCharset(true, "UTF-8", ByteOrderMark.UTF_8),
+          new TestableCharset(false, "UTF-32BE", ByteOrderMark.UTF_32BE),
+          new TestableCharset(false, "UTF-32LE", ByteOrderMark.UTF_32LE),
+          new TestableCharset(false, "UTF-1", ByteOrderMark.UTF_1),
+          new TestableCharset(false, "UTF-EBCDIC", ByteOrderMark.UTF_EBCDIC));
+
+  /**
+   * Test the ability to decode an InputStream
+   * 
+   * @see Chardet#decode(java.io.InputStream, Charset)
+   */
+  @Test
+  public void decodeTest() throws IOException {
+    // Stopping by Woods on a Snowy Evening, by Robert Frost
+    // We'll encode this in various charsets and decode them
+    // We use a text without diacritics to avoid any issues with encoding. We're not here to test
+    // the correctness of charset implementations, only correct application of same.
+    // Note: The poem is public domain.
+    final String originalText = "Whose woods these are I think I know.   \n"
+        + "His house is in the village though;   \n" + "He will not see me stopping here   \n"
+        + "To watch his woods fill up with snow.   \n" + "\n"
+        + "My little horse must think it queer   \n" + "To stop without a farmhouse near   \n"
+        + "Between the woods and frozen lake   \n" + "The darkest evening of the year.   \n" + "\n"
+        + "He gives his harness bells a shake   \n" + "To ask if there is some mistake.   \n"
+        + "The only other sound’s the sweep   \n" + "Of easy wind and downy flake.   \n" + "\n"
+        + "The woods are lovely, dark and deep,   \n" + "But I have promises to keep,   \n"
+        + "And miles to go before I sleep,   \n" + "And miles to go before I sleep.";
+
+    for (TestableCharset testableCharset : DECODE_TEST_CHARSETS) {
+      if (!testableCharset.getCharset().isPresent()) {
+        if (testableCharset.standard)
+          throw new AssertionError(
+              "JVM does not support standard charset " + testableCharset.charsetName);
+        continue;
+      }
+
+      final Charset charset = testableCharset.getCharset().get();
+
+      final byte[] encodedText = originalText.getBytes(charset);
+
+      // Make sure we get the right charset when we decode WITHOUT a BOM
+      final StringWriter plainWriter = new StringWriter();
+      try (DecodedInputStreamReader plainReader =
+          Chardet.decode(new ByteArrayInputStream(encodedText), charset)) {
+        final Charset detectedCharset = plainReader.charset();
+        if (testableCharset.charsetName.equals("UTF-8")) {
+          // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text.
+          assertThat(detectedCharset,
+              anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
+        } else {
+          assertThat(detectedCharset, is(charset));
+        }
+        CharStreams.copy(plainReader, plainWriter);
+      }
+      assertThat(plainWriter.toString(), is(originalText));
+
+      // Make sure we get the right charset when we decode WITH a BOM
+      final StringWriter bomWriter = new StringWriter();
+      try (DecodedInputStreamReader bomReader = Chardet
+          .decode(new SequenceInputStream(new ByteArrayInputStream(testableCharset.bom.getBytes()),
+              new ByteArrayInputStream(encodedText)), charset)) {
+        final Charset detectedCharset = bomReader.charset();
+        if (testableCharset.charsetName.equals("UTF-8")) {
+          // Over the wire, UTF-8 is indistinguishable from ISO-8859-1 for this text.
+          assertThat(detectedCharset,
+              anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
+        } else {
+          assertThat(detectedCharset, is(charset));
+        }
+        CharStreams.copy(bomReader, bomWriter);
+      }
+      assertThat(bomWriter.toString(), is(originalText));
+    }
+  }
 }