Work around the UTF-8 decoder fallback index on .NET (CP34951)

_codecs.cs:   when decoding UTF-8 on .NET (detected by the absence of the
              Mono.Runtime type) use a fallback buffer that adds
              bytesUnknown.Length back to the reported index and ignores a
              repeated Fallback call; raise a decode error (not an encode
              error) for undecodable bytes.
PythonOps.cs: add a UnicodeDecodeError overload carrying the bad bytes and index.
codecs_test.py: add test_cp34951.

NOTE(review): this patch was re-flowed from a whitespace-collapsed copy.
Indentation and the placement of blank context lines are reconstructed, so
apply it with whitespace tolerance (e.g. git apply --ignore-whitespace).

diff --git a/Languages/IronPython/IronPython.Modules/_codecs.cs b/Languages/IronPython/IronPython.Modules/_codecs.cs
index 2905d54018..d492ace438 100644
--- a/Languages/IronPython/IronPython.Modules/_codecs.cs
+++ b/Languages/IronPython/IronPython.Modules/_codecs.cs
@@ -641,18 +641,22 @@ private static PythonTuple DoDecode(Encoding encoding, object input, string erro
 
 #if FEATURE_ENCODING    // DecoderFallback
             encoding = (Encoding)encoding.Clone();
-
             ExceptionFallBack fallback = null;
             if (fAlwaysThrow) {
                 encoding.DecoderFallback = DecoderFallback.ExceptionFallback;
             } else {
-                fallback = new ExceptionFallBack(bytes);
+                fallback = (encoding is UTF8Encoding && DotNet) ?
+                    // This is a workaround for a bug, see ExceptionFallbackBufferUtf8DotNet
+                    // for more details.
+                    new ExceptionFallBackUtf8DotNet(bytes):
+                    new ExceptionFallBack(bytes);
                 encoding.DecoderFallback = fallback;
             }
 #endif
             string decoded = encoding.GetString(bytes, 0, bytes.Length);
             int badByteCount = 0;
 
+
 #if FEATURE_ENCODING    // DecoderFallback
             if (!fAlwaysThrow) {
                 byte[] badBytes = fallback.buffer.badBytes;
@@ -666,6 +670,14 @@ private static PythonTuple DoDecode(Encoding encoding, object input, string erro
             return tuple;
         }
 
+
+        internal static readonly bool DotNet;
+
+        static PythonCodecs() {
+            DotNet = Type.GetType("Mono.Runtime") == null;
+        }
+
+
         private static int CheckPreamble(Encoding enc, string buffer) {
             byte[] preamble = enc.GetPreamble();
 
@@ -723,6 +735,11 @@ private static PythonTuple DoEncode(Encoding encoding, object input, string erro
     class ExceptionFallBack : DecoderFallback {
         internal ExceptionFallbackBuffer buffer;
 
+        // This ctor can be removed as soon as workaround for utf8 encoding in .net is
+        // no longer necessary.
+        protected ExceptionFallBack() {
+        }
+
         public ExceptionFallBack(byte[] bytes) {
             buffer = new ExceptionFallbackBuffer(bytes);
         }
@@ -738,16 +755,17 @@ public override int MaxCharCount {
 
     class ExceptionFallbackBuffer : DecoderFallbackBuffer {
         internal byte[] badBytes;
-        private byte[] inputBytes;
+        protected byte[] inputBytes;
+
         public ExceptionFallbackBuffer(byte[] bytes) {
             inputBytes = bytes;
         }
 
         public override bool Fallback(byte[] bytesUnknown, int index) {
             if (index > 0 && index + bytesUnknown.Length != inputBytes.Length) {
-                throw PythonOps.UnicodeEncodeError("failed to decode bytes at index {0}", index);
+                throw PythonOps.UnicodeDecodeError(
+                    String.Format("failed to decode bytes at index: {0}", index), bytesUnknown, index);
             }
-
             // just some bad bytes at the end
             badBytes = bytesUnknown;
             return false;
@@ -765,6 +783,40 @@ public override int Remaining {
             get { return 0; }
         }
     }
+
+    // This class can be removed as soon as workaround for utf8 encoding in .net is
+    // no longer necessary.
+    class ExceptionFallBackUtf8DotNet : ExceptionFallBack {
+        public ExceptionFallBackUtf8DotNet(byte[] bytes) {
+            buffer = new ExceptionFallbackBufferUtf8DotNet(bytes);
+        }
+    }
+
+    // This class can be removed as soon as workaround for utf8 encoding in .net is
+    // no longer necessary.
+    class ExceptionFallbackBufferUtf8DotNet : ExceptionFallbackBuffer {
+        private bool ignoreNext = false;
+
+        public ExceptionFallbackBufferUtf8DotNet(byte[] bytes) : base(bytes) {
+        }
+
+        public override bool Fallback(byte[] bytesUnknown, int index) {
+            // In case of dot net and utf-8 value of index does not conform to documentation provided by
+            // Microsoft http://msdn.microsoft.com/en-us/library/bdftay9c%28v=vs.100%29.aspx
+            // The value of index is mysteriously decreased by the size of bytesUnknown
+            // Tested on Windows 7 64, .NET 4.0.30319.18408, all recommended patches as of 06.02.2014
+            if (ignoreNext) {
+                // dot net sometimes calls second time after this method returns false
+                // if this is the case, do nothing
+                return false;
+            }
+            // adjust index
+            index = index + bytesUnknown.Length;
+            ignoreNext = true;
+            return base.Fallback(bytesUnknown, index);
+        }
+
+    }
 #endif
 }
 
diff --git a/Languages/IronPython/IronPython/Runtime/Operations/PythonOps.cs b/Languages/IronPython/IronPython/Runtime/Operations/PythonOps.cs
index 3ea885fa21..daf310c7ce 100644
--- a/Languages/IronPython/IronPython/Runtime/Operations/PythonOps.cs
+++ b/Languages/IronPython/IronPython/Runtime/Operations/PythonOps.cs
@@ -4053,6 +4053,10 @@ public static Exception UnicodeDecodeError(string format, params object[] args)
             return new System.Text.DecoderFallbackException(string.Format(format, args));
         }
 
+        public static Exception UnicodeDecodeError(string message, byte[] bytesUnknown, int index) {
+            return new System.Text.DecoderFallbackException(message, bytesUnknown, index);
+        }
+
         public static Exception UnicodeEncodeError(string format, params object[] args) {
             return new System.Text.EncoderFallbackException(string.Format(format, args));
         }
diff --git a/Languages/IronPython/Tests/modules/io_related/codecs_test.py b/Languages/IronPython/Tests/modules/io_related/codecs_test.py
index 990ce01679..07f41e70af 100644
--- a/Languages/IronPython/Tests/modules/io_related/codecs_test.py
+++ b/Languages/IronPython/Tests/modules/io_related/codecs_test.py
@@ -347,6 +347,27 @@ def test_utf_8_decode():
     AreEqual(new_str, u'abc')
     AreEqual(size, 3)
 
+
+def test_cp34951():
+    def internal_cp34951(sample1):
+        AreEqual(codecs.utf_8_decode(sample1), (u'12\u20ac\x0a', 6))
+        sample1 = sample1[:-1] # drop the trailing newline: "12" + euro sign
+        AreEqual(codecs.utf_8_decode(sample1), (u'12\u20ac', 5))
+        sample1 = sample1[:-1] # drop the last euro-sign byte: "12" + incomplete sequence
+        AreEqual(codecs.utf_8_decode(sample1), (u'12', 2))
+
+        sample1 = sample1 + '\x7f' # makes it invalid
+        try:
+            codecs.utf_8_decode(sample1)
+        except UnicodeDecodeError:
+            pass
+        else:
+            Assert(False, "expected UnicodeDecodeError not raised")
+
+    internal_cp34951(b'\x31\x32\xe2\x82\xac\x0a') # "12" + euro sign + newline
+    internal_cp34951(b'\xef\xbb\xbf\x31\x32\xe2\x82\xac\x0a') # same bytes preceded by a UTF-8 BOM
+
+
 def test_utf_8_encode():
     '''
     '''