Skip to content

Commit

Permalink
unicode tweaks -
Browse files Browse the repository at this point in the history
Python 2.x can't accept unicode strings other than \u, so
check whether a unicode string is ASCII or not.
  • Loading branch information
rocky committed Apr 23, 2023
1 parent c21eebc commit dba757d
Showing 1 changed file with 26 additions and 9 deletions.
35 changes: 26 additions & 9 deletions xdis/cross_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@
"""


# From
# https://stackoverflow.com/questions/196345/how-to-check-if-a-string-in-python-is-in-ascii
def is_ascii(s: str) -> bool:
    """Return True if every character of ``s`` is ASCII (U+0000-U+007F).

    The empty string counts as ASCII. Strings containing lone surrogate
    code points (which cannot be encoded at all) are reported as
    non-ASCII rather than raising ``UnicodeEncodeError``.
    """
    try:
        # The ascii codec rejects anything above U+007F, including
        # surrogates, so a successful encode means "all ASCII".
        s.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True


class LongTypeForPython3(int):
"""
Define a Python3 long integer type which exists in
Expand All @@ -28,9 +35,9 @@ class LongTypeForPython3(int):
def __init__(self, value):
self.value = value

def __repr__(self):
def __repr__(self) -> str:
"""
Replacement __str__ and str() for Python3.
Replacement repr() and str() for Python3.
This ensures we get the "L" suffix on long types.
"""
return f"""{self.value}L"""
Expand All @@ -45,15 +52,25 @@ class UnicodeForPython3(str):
def __init__(self, value):
self.value = value

def __repr__(self):
"""
Replacement __str__ and str() for Python3.
This ensures we get the "u" suffix on unicode types.
def __repr__(self) -> str:
r"""
Replacement repr() and str() for Python3.
This ensures we get the "u" prefix on unicode types,
and also \u escapes when the string is not ASCII representable.
"""
try:
value = self.value.decode("utf-8")
utf8_value = self.value.decode("utf-8")
# Do we need to handle utf-16 and utf-32?
except UnicodeDecodeError:
return f"""u'{str(self.value)[1:]}'"""
else:
return f"""u'{str(value)}'"""

if is_ascii(utf8_value):
return f"""u'{utf8_value}'"""

# Turn the unicode character into its Unicode code point,
# but strip off the leading "0x".
stripped_utf8 = utf8_value[len("0x") :]
unicode_codepoint = "".join(
(c if is_ascii(c) else hex(ord(c)) for c in stripped_utf8)
)
return rf"""u'\u{unicode_codepoint}'"""

0 comments on commit dba757d

Please sign in to comment.