Skip to content

Commit

Permalink
Merge pull request #50 from sigpwned/bug/49/no-longer-detects-without…
Browse files Browse the repository at this point in the history
…-bom

Fix bug where BOMs were not respected in InputStream decode
  • Loading branch information
sigpwned authored Sep 29, 2024
2 parents d7bb898 + 3e51932 commit ed4cb89
Show file tree
Hide file tree
Showing 2 changed files with 188 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/main/java/com/sigpwned/chardet4j/Chardet.java
Original file line number Diff line number Diff line change
Expand Up @@ -354,13 +354,13 @@ public static DecodedInputStreamReader decode(InputStream input, String declared
return new DecodedInputStreamReader(bomed, bomed.bom().get().getCharset());

// If there is no BOM, then read some bytes to detect the charset.
final byte[] buf = ByteStreams.readNBytes(input, DECODE_DETECT_BUFSIZE);
final byte[] buf = ByteStreams.readNBytes(bomed, DECODE_DETECT_BUFSIZE);

// Note that charset cannot be null, since we check defaultCharset above.
Charset charset = detectCharset(buf, declaredEncoding).orElse(defaultCharset);

return new DecodedInputStreamReader(
new SequenceInputStream(new ByteArrayInputStream(buf), input), charset);
new SequenceInputStream(new ByteArrayInputStream(buf), bomed), charset);
}

/**
Expand Down
186 changes: 186 additions & 0 deletions src/test/java/com/sigpwned/chardet4j/ChardetTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,26 @@
*/
package com.sigpwned.chardet4j;

import static java.util.Arrays.asList;
import static java.util.Objects.requireNonNull;
import static org.hamcrest.CoreMatchers.anyOf;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.SequenceInputStream;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.List;
import java.util.Optional;
import org.junit.Test;
import com.google.common.io.CharStreams;
import com.google.common.io.Resources;
import com.sigpwned.chardet4j.io.DecodedInputStreamReader;

public class ChardetTest {
@Test
Expand Down Expand Up @@ -169,4 +178,181 @@ public void longTest() throws IOException {

assertThat(charset, is(StandardCharsets.UTF_8));
}

/**
 * One charset exercised by the tests in this class, paired with the byte order mark the tests
 * prepend to text encoded in that charset.
 */
public static class TestableCharset {
  /**
   * Whether the tests treat a charset name the JVM cannot resolve as a failure instead of
   * skipping the entry.
   */
  public final boolean standard;

  /** Name used to look the charset up in the running JVM. Never null. */
  public final String charsetName;

  /** Byte order mark associated with this charset. Never null. */
  public final ByteOrderMark bom;

  /**
   * @param standard see {@link #standard}
   * @param charsetName charset name to resolve, must not be null
   * @param bom byte order mark for the charset, must not be null
   * @throws NullPointerException if {@code charsetName} or {@code bom} is null
   */
  public TestableCharset(boolean standard, String charsetName, ByteOrderMark bom) {
    this.standard = standard;
    this.charsetName = requireNonNull(charsetName);
    this.bom = requireNonNull(bom);
  }

  /**
   * Resolves {@link #charsetName} against the running JVM.
   *
   * @return the charset, or empty when the JVM has no charset registered under that name
   */
  public Optional<Charset> getCharset() {
    if (!Charset.isSupported(charsetName)) {
      return Optional.empty();
    }
    return Optional.of(Charset.forName(charsetName));
  }
}

/**
 * Joins two byte arrays into a newly allocated array.
 *
 * @param xs bytes that come first in the result
 * @param ys bytes that follow {@code xs} in the result
 * @return a new array of length {@code xs.length + ys.length}; neither input is modified
 */
public static byte[] concat(byte[] xs, byte[] ys) {
  // Pre-size the buffer to the exact total so it never has to grow.
  final ByteArrayOutputStream joined = new ByteArrayOutputStream(xs.length + ys.length);
  // The (byte[], int, int) overload is used because it does not declare IOException.
  joined.write(xs, 0, xs.length);
  joined.write(ys, 0, ys.length);
  return joined.toByteArray();
}

/**
 * The charsets used by {@link #detectCharsetTest()}. Each one is tested for detection both
 * without and with its byte order mark. Entries flagged as standard must be resolvable by the
 * JVM; the others are skipped when the JVM does not provide them.
 *
 * <p>
 * The list is unmodifiable so that one test cannot alter the fixture seen by another.
 */
public static final List<TestableCharset> DETECT_CHARSET_TEST_CHARSETS =
    java.util.Collections.unmodifiableList(
        asList(new TestableCharset(true, "UTF-16BE", ByteOrderMark.UTF_16BE),
            new TestableCharset(true, "UTF-16LE", ByteOrderMark.UTF_16LE),
            new TestableCharset(true, "UTF-8", ByteOrderMark.UTF_8),
            new TestableCharset(false, "UTF-32BE", ByteOrderMark.UTF_32BE),
            new TestableCharset(false, "UTF-32LE", ByteOrderMark.UTF_32LE),
            new TestableCharset(false, "UTF-1", ByteOrderMark.UTF_1),
            new TestableCharset(false, "UTF-EBCDIC", ByteOrderMark.UTF_EBCDIC)));

/**
 * Encodes a known text in each of {@link #DETECT_CHARSET_TEST_CHARSETS} and checks that
 * {@code Chardet.detectCharset(byte[])} reports the charset that was used, first for the bare
 * encoded bytes and then for the same bytes prefixed with the charset's byte order mark.
 *
 * @throws AssertionError if a charset flagged as standard is missing from the JVM, if no charset
 *         is detected at all, or if the detected charset is not an accepted one
 */
@Test
public void detectCharsetTest() throws IOException {
  // Stopping by Woods on a Snowy Evening, by Robert Frost
  // We'll encode this in various charsets and detect them.
  // The text is almost entirely ASCII (the only exception is the typographic apostrophe in
  // "sound’s"). We're not here to test the correctness of charset implementations, only
  // correct application of same.
  // Note: The poem is public domain.
  final String originalText = "Whose woods these are I think I know. \n"
      + "His house is in the village though; \n" + "He will not see me stopping here \n"
      + "To watch his woods fill up with snow. \n" + "\n"
      + "My little horse must think it queer \n" + "To stop without a farmhouse near \n"
      + "Between the woods and frozen lake \n" + "The darkest evening of the year. \n" + "\n"
      + "He gives his harness bells a shake \n" + "To ask if there is some mistake. \n"
      + "The only other sound’s the sweep \n" + "Of easy wind and downy flake. \n" + "\n"
      + "The woods are lovely, dark and deep, \n" + "But I have promises to keep, \n"
      + "And miles to go before I sleep, \n" + "And miles to go before I sleep.";

  for (TestableCharset testableCharset : DETECT_CHARSET_TEST_CHARSETS) {
    // Resolve once; only entries flagged as standard are mandatory, the rest are skipped.
    final Optional<Charset> maybeCharset = testableCharset.getCharset();
    if (!maybeCharset.isPresent()) {
      if (testableCharset.standard) {
        throw new AssertionError(
            "JVM does not support standard charset " + testableCharset.charsetName);
      }
      continue;
    }

    final Charset charset = maybeCharset.get();
    final boolean utf8 = testableCharset.charsetName.equals("UTF-8");

    // Make sure we get the right charset when we detect WITHOUT a BOM
    final byte[] plainEncodedText = originalText.getBytes(charset);
    final Charset plainDetectedCharset = Chardet.detectCharset(plainEncodedText)
        .orElseThrow(() -> new AssertionError("No charset detected for "
            + testableCharset.charsetName + " text without a BOM"));
    if (utf8) {
      // The text is nearly all ASCII, so reporting ISO-8859-1 instead of UTF-8 is accepted.
      assertThat(plainDetectedCharset,
          anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
    } else {
      assertThat(plainDetectedCharset, is(charset));
    }

    // Make sure we get the right charset when we detect WITH a BOM
    final byte[] bomEncodedText =
        concat(testableCharset.bom.getBytes(), originalText.getBytes(charset));
    final Charset bomDetectedCharset = Chardet.detectCharset(bomEncodedText)
        .orElseThrow(() -> new AssertionError("No charset detected for "
            + testableCharset.charsetName + " text with a BOM"));
    if (utf8) {
      // Same tolerance as the no-BOM pass.
      // NOTE(review): with a UTF-8 BOM present this could probably assert UTF-8 strictly;
      // confirm whether detectCharset(byte[]) honors BOMs before tightening.
      assertThat(bomDetectedCharset,
          anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
    } else {
      assertThat(bomDetectedCharset, is(charset));
    }
  }
}

/**
 * The charsets used by {@link #decodeTest()}. Each one is tested for decoding both without and
 * with its byte order mark. Entries flagged as standard must be resolvable by the JVM; the
 * others are skipped when the JVM does not provide them.
 *
 * <p>
 * The list is unmodifiable so that one test cannot alter the fixture seen by another.
 */
public static final List<TestableCharset> DECODE_TEST_CHARSETS =
    java.util.Collections.unmodifiableList(
        asList(new TestableCharset(true, "UTF-16BE", ByteOrderMark.UTF_16BE),
            new TestableCharset(true, "UTF-16LE", ByteOrderMark.UTF_16LE),
            new TestableCharset(true, "UTF-8", ByteOrderMark.UTF_8),
            new TestableCharset(false, "UTF-32BE", ByteOrderMark.UTF_32BE),
            new TestableCharset(false, "UTF-32LE", ByteOrderMark.UTF_32LE),
            new TestableCharset(false, "UTF-1", ByteOrderMark.UTF_1),
            new TestableCharset(false, "UTF-EBCDIC", ByteOrderMark.UTF_EBCDIC)));

/**
 * Round-trips a known text through {@code Chardet.decode} for every entry of
 * {@link #DECODE_TEST_CHARSETS}: the text is encoded, wrapped in an input stream, decoded again
 * and compared with the original. Each charset is exercised twice, once as bare encoded bytes
 * and once with the charset's byte order mark placed in front of them.
 *
 * @see Chardet#decode(java.io.InputStream, Charset)
 */
@Test
public void decodeTest() throws IOException {
  // "Stopping by Woods on a Snowy Evening" by Robert Frost (public domain).
  // The text is nearly pure ASCII; the typographic apostrophe in "sound’s" is the only
  // exception. The goal is to check that the right charset gets applied, not to validate the
  // charset implementations themselves.
  final String originalText = "Whose woods these are I think I know. \n"
      + "His house is in the village though; \n"
      + "He will not see me stopping here \n"
      + "To watch his woods fill up with snow. \n"
      + "\n"
      + "My little horse must think it queer \n"
      + "To stop without a farmhouse near \n"
      + "Between the woods and frozen lake \n"
      + "The darkest evening of the year. \n"
      + "\n"
      + "He gives his harness bells a shake \n"
      + "To ask if there is some mistake. \n"
      + "The only other sound’s the sweep \n"
      + "Of easy wind and downy flake. \n"
      + "\n"
      + "The woods are lovely, dark and deep, \n"
      + "But I have promises to keep, \n"
      + "And miles to go before I sleep, \n"
      + "And miles to go before I sleep.";

  for (TestableCharset candidate : DECODE_TEST_CHARSETS) {
    final Optional<Charset> resolved = candidate.getCharset();
    if (!resolved.isPresent()) {
      // A missing optional charset is skipped; a missing standard one is a hard failure.
      if (candidate.standard) {
        throw new AssertionError(
            "JVM does not support standard charset " + candidate.charsetName);
      }
      continue;
    }

    final Charset charset = resolved.get();
    // For UTF-8 input either UTF-8 or ISO-8859-1 is accepted as the reported charset.
    final boolean utf8 = candidate.charsetName.equals("UTF-8");
    final byte[] encodedText = originalText.getBytes(charset);

    // Pass 1: encoded bytes WITHOUT a BOM.
    // NOTE(review): the expected charset is also handed to decode as its second argument,
    // presumably as the fallback default; confirm against Chardet.decode.
    final StringWriter plainWriter = new StringWriter();
    try (DecodedInputStreamReader plainReader =
        Chardet.decode(new ByteArrayInputStream(encodedText), charset)) {
      final Charset reported = plainReader.charset();
      if (!utf8) {
        assertThat(reported, is(charset));
      } else {
        assertThat(reported,
            anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
      }
      CharStreams.copy(plainReader, plainWriter);
    }
    assertThat(plainWriter.toString(), is(originalText));

    // Pass 2: the same bytes WITH the charset's BOM in front. The decoded text is compared
    // with the original, so the BOM must not show up in the output.
    final StringWriter bomWriter = new StringWriter();
    final SequenceInputStream bomPrefixed =
        new SequenceInputStream(new ByteArrayInputStream(candidate.bom.getBytes()),
            new ByteArrayInputStream(encodedText));
    try (DecodedInputStreamReader bomReader = Chardet.decode(bomPrefixed, charset)) {
      final Charset reported = bomReader.charset();
      if (!utf8) {
        assertThat(reported, is(charset));
      } else {
        assertThat(reported,
            anyOf(is(StandardCharsets.ISO_8859_1), is(StandardCharsets.UTF_8)));
      }
      CharStreams.copy(bomReader, bomWriter);
    }
    assertThat(bomWriter.toString(), is(originalText));
  }
}
}

0 comments on commit ed4cb89

Please sign in to comment.