Skip to content

Commit

Permalink
Emit warnings when dumping binary strings
Browse files Browse the repository at this point in the history
Because of it's Ruby 1.8 heritage, the C extension doesn't care
much about strings encoding. We should get stricter over time.
  • Loading branch information
byroot committed Oct 24, 2024
1 parent cecf04f commit 4de4399
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 23 deletions.
13 changes: 7 additions & 6 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ static VALUE mJSON, cState, mString_Extend, eGeneratorError, eNestingError, Enco

static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend, i_encode;

static int usascii_encindex, utf8_encindex, binary_encindex;

/* Converts in_string to a JSON string (without the wrapping '"'
* characters) in FBuffer out_buffer.
*
Expand Down Expand Up @@ -498,7 +500,7 @@ static VALUE mString_to_json_raw_object(VALUE self)
VALUE result = rb_hash_new();
rb_hash_aset(result, rb_funcall(mJSON, i_create_id, 0), rb_class_name(rb_obj_class(self)));
ary = rb_funcall(self, i_unpack, 1, rb_str_new2("C*"));
rb_hash_aset(result, rb_str_new2("raw"), ary);
rb_hash_aset(result, rb_utf8_str_new_lit("raw"), ary);
return result;
}

Expand Down Expand Up @@ -749,8 +751,6 @@ static void generate_json_array(FBuffer *buffer, VALUE Vstate, JSON_Generator_St
fbuffer_append_char(buffer, ']');
}

static int usascii_encindex, utf8_encindex, binary_encindex;

static inline int enc_utf8_compatible_p(int enc_idx)
{
if (enc_idx == usascii_encindex) return 1;
Expand All @@ -764,13 +764,14 @@ static inline VALUE ensure_valid_encoding(VALUE str)
VALUE utf8_string;
if (RB_UNLIKELY(!enc_utf8_compatible_p(encindex))) {
if (encindex == binary_encindex) {
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
// TODO: Deprecate in 2.8.0
// TODO: Remove in 3.0.0
utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex);
switch (rb_enc_str_coderange(utf8_string)) {
case ENC_CODERANGE_7BIT:
return utf8_string;
case ENC_CODERANGE_VALID:
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
// TODO: Raise in 3.0.0
rb_warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
return utf8_string;
break;
}
Expand Down
25 changes: 14 additions & 11 deletions ext/json/ext/parser/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -1795,9 +1795,12 @@ static VALUE convert_encoding(VALUE source)

if (encindex == binary_encindex) {
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
// TODO: Deprecate in 2.8.0
// TODO: Remove in 3.0.0
return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
VALUE utf8_string = rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
switch (rb_enc_str_coderange(utf8_string)) {
case ENC_CODERANGE_7BIT:
case ENC_CODERANGE_VALID:
return utf8_string;
}
}

return rb_str_conv_enc(source, rb_enc_from_index(encindex), rb_utf8_encoding());
Expand Down Expand Up @@ -1946,15 +1949,15 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
}


#line 1950 "parser.c"
#line 1953 "parser.c"
enum {JSON_start = 1};
enum {JSON_first_final = 10};
enum {JSON_error = 0};

enum {JSON_en_main = 1};


#line 858 "parser.rl"
#line 861 "parser.rl"


/*
Expand All @@ -1972,16 +1975,16 @@ static VALUE cParser_parse(VALUE self)
GET_PARSER;


#line 1976 "parser.c"
#line 1979 "parser.c"
{
cs = JSON_start;
}

#line 875 "parser.rl"
#line 878 "parser.rl"
p = json->source;
pe = p + json->len;

#line 1985 "parser.c"
#line 1988 "parser.c"
{
if ( p == pe )
goto _test_eof;
Expand Down Expand Up @@ -2015,7 +2018,7 @@ case 1:
cs = 0;
goto _out;
tr2:
#line 850 "parser.rl"
#line 853 "parser.rl"
{
char *np = JSON_parse_value(json, p, pe, &result, 0);
if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;}
Expand All @@ -2025,7 +2028,7 @@ cs = 0;
if ( ++p == pe )
goto _test_eof10;
case 10:
#line 2029 "parser.c"
#line 2032 "parser.c"
switch( (*p) ) {
case 13: goto st10;
case 32: goto st10;
Expand Down Expand Up @@ -2114,7 +2117,7 @@ case 9:
_out: {}
}

#line 878 "parser.rl"
#line 881 "parser.rl"

if (cs >= JSON_first_final && p == pe) {
return result;
Expand Down
9 changes: 6 additions & 3 deletions ext/json/ext/parser/parser.rl
Original file line number Diff line number Diff line change
Expand Up @@ -690,9 +690,12 @@ static VALUE convert_encoding(VALUE source)

if (encindex == binary_encindex) {
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
// TODO: Deprecate in 2.8.0
// TODO: Remove in 3.0.0
return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
VALUE utf8_string = rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
switch (rb_enc_str_coderange(utf8_string)) {
case ENC_CODERANGE_7BIT:
case ENC_CODERANGE_VALID:
return utf8_string;
}
}

return rb_str_conv_enc(source, rb_enc_from_index(encindex), rb_utf8_encoding());
Expand Down
2 changes: 1 addition & 1 deletion lib/json/add/bigdecimal.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def self.json_create(object)
def as_json(*)
{
JSON.create_id => self.class.name,
'b' => _dump,
'b' => _dump.force_encoding(Encoding::UTF_8),
}
end

Expand Down
9 changes: 7 additions & 2 deletions test/json/json_generator_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -471,8 +471,13 @@ def test_valid_utf8_in_different_encoding
wrong_encoding_string = utf8_string.b
# This behavior is historical. Not necessary desirable. We should deprecated it.
# The pure and java version of the gem already don't behave this way.
assert_equal utf8_string.to_json, wrong_encoding_string.to_json
assert_equal JSON.dump(utf8_string), JSON.dump(wrong_encoding_string)
assert_warning(/UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0/) do
assert_equal utf8_string.to_json, wrong_encoding_string.to_json
end

assert_warning(/UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0/) do
assert_equal JSON.dump(utf8_string), JSON.dump(wrong_encoding_string)
end
end

def test_string_ext_included_calls_super
Expand Down

0 comments on commit 4de4399

Please sign in to comment.