From dba757d2bd4ca319c716f0f2eee019f85ed1a702 Mon Sep 17 00:00:00 2001 From: rocky Date: Sun, 23 Apr 2023 06:05:54 -0400 Subject: [PATCH] unicode tweaks - Python 2.x can't accept unicode strings other than \u, so check whether a unicode string is ASCII or not. --- xdis/cross_types.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/xdis/cross_types.py b/xdis/cross_types.py index 0f1c0195..d027e8fa 100644 --- a/xdis/cross_types.py +++ b/xdis/cross_types.py @@ -19,6 +19,13 @@ """ +# From +# https://stackoverflow.com/questions/196345/how-to-check-if-a-string-in-python-is-in-ascii +def is_ascii(s: str) -> bool: + """Check if the characters in string s are in ASCII, U+0-U+7F.""" + return len(s) == len(s.encode()) + + class LongTypeForPython3(int): """ Define a Python3 long integer type which exists in @@ -28,9 +35,9 @@ class LongTypeForPython3(int): def __init__(self, value): self.value = value - def __repr__(self): + def __repr__(self) -> str: """ - Replacement __str__ and str() for Python3. + Replacement repr() and str() for Python3. This ensures we get the "L" suffix on long types. """ return f"""{self.value}L""" @@ -45,15 +52,25 @@ class UnicodeForPython3(str): def __init__(self, value): self.value = value - def __repr__(self): - """ - Replacement __str__ and str() for Python3. - This ensures we get the "u" suffix on unicode types. + def __repr__(self) -> str: + r""" + Replacement repr() and str() for Python3. + This ensures we get the "u" prefix on unicode types, + and also \u when the string is not ASCII representable """ try: - value = self.value.decode("utf-8") + utf8_value = self.value.decode("utf-8") # Do we need to handle utf-16 and utf-32? except UnicodeDecodeError: return f"""u'{str(self.value)[1:]}'""" - else: - return f"""u'{str(value)}'""" + + if is_ascii(utf8_value): + return f"""u'{utf8_value}'""" + + # Turn the unicode character into its Unicode code point, + # but strip off the leading "0x". 
+ stripped_utf8 = utf8_value[len("0x") :] + unicode_codepoint = "".join( + (c if is_ascii(c) else hex(ord(c)) for c in stripped_utf8) + ) + return rf"""u'\u{unicode_codepoint}'"""