diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 7bd8adf8..b4547ba3 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -150,7 +150,7 @@ module Private PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/ - CHARACTER_REFERENCES = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/ DEFAULT_ENTITIES_PATTERNS = {} default_entities = ['gt', 'lt', 'quot', 'apos', 'amp'] default_entities.each do |term| @@ -570,8 +570,12 @@ def unnormalize( string, entities=nil, filter=nil ) return rv if matches.size == 0 rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') + if m.start_with?("x") + code_point = Integer(m[1..-1], 16) + else + code_point = Integer(m, 10) + end + [code_point].pack('U*') } matches.collect!{|x|x[0]}.compact! if filter diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb index bf8d2190..4bb5da5c 100644 --- a/test/parse/test_character_reference.rb +++ b/test/parse/test_character_reference.rb @@ -13,5 +13,11 @@ def test_linear_performance_many_preceding_zeros REXML::Document.new('') end end + + def test_hex_precedding_zero + parser = REXML::Parsers::PullParser.new("a�x61;") + parser.pull # :start_element + assert_equal("a�x61;", parser.pull[1]) # :text + end end end