From 1c98d766f8f24284fa7fedbdf5b0cfcad1b92fcc Mon Sep 17 00:00:00 2001 From: Calle Svensson Date: Sun, 12 Apr 2020 01:09:49 +0200 Subject: [PATCH 1/6] Detect and fix invalid variable names --- .../02_invalid_variable_name1.pyc | Bin 0 -> 179 bytes .../02_invalid_variable_name2.pyc | Bin 0 -> 169 bytes .../02_invalid_variable_name3.pyc | Bin 0 -> 168 bytes xdis/code.py | 63 ++++++++++++++++++ xdis/main.py | 12 +++- xdis/util.py | 22 ++++++ 6 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 test/bytecode_3.7/02_invalid_variable_name1.pyc create mode 100644 test/bytecode_3.7/02_invalid_variable_name2.pyc create mode 100644 test/bytecode_3.7/02_invalid_variable_name3.pyc diff --git a/test/bytecode_3.7/02_invalid_variable_name1.pyc b/test/bytecode_3.7/02_invalid_variable_name1.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87a0d25c5fa5d0dda1101fcb00307b7ed0da7692 GIT binary patch literal 179 zcmZ?b<>g`k0-oJEaXvu$F^B^L%s_?%5Em-|i4=w?h7`sq#uTO~rc~xsh7@LThEzsz zhGs@ah7^`y22IvmtRAU3Ir+C(!}E)BQeH9wHT_~#@Y7_u#aMBRvGNvgrGl-3I#`Xm zg7Piaf}+g4l9dcaoIr(O;um*nMPhD2PO70^LFFwDo80`A(wtN~kTZ(efCLL82m%1t C4<^R| literal 0 HcmV?d00001 diff --git a/test/bytecode_3.7/02_invalid_variable_name2.pyc b/test/bytecode_3.7/02_invalid_variable_name2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3caadd7e0a8e02162ac868c24d28cfadb89891b9 GIT binary patch literal 169 zcmZ?b<>g`k0-oJEaXvu$F^B^L%s_?%5Em-|i4=w?h7`sq#uTO~rc~xsh7@LThEzsz zhGs@ah7^`y22IvmtRAU3Ir+C(!}E)BQeH9wHT_~#@Y7_u#aMBRvGNvkT7Jg`k0-oJEaXvu$F^B^L%s_?%5Em-|i4=w?h7`sq#uTO~rc~xsh7@LThEzsz zhGs@ah7^`y22IvmtRAU3Ir+C(!}E)BQeH9wHT_~#@Y7_u#aMBRvGNv^LB%cBf}+g4 tl9dcaoIqJH@rygPA~ClhC)H4|pz;=nO>TZlX-=vg$nIh`Ai=^2f&esPBk2GD literal 0 HcmV?d00001 diff --git a/xdis/code.py b/xdis/code.py index f26162a1..3d855220 100644 --- a/xdis/code.py +++ b/xdis/code.py @@ -15,7 +15,10 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. from xdis import PYTHON3, PYTHON_VERSION +from xdis.util import UniqueSuffixSet import inspect, types +import ast +import re class Code3: @@ -637,3 +640,63 @@ def code_has_star_star_arg(code): """Return True iff The code object has a variable keyword parameter (**kwargs-like).""" return (code.co_flags & 8) != 0 + + +# From: https://stackoverflow.com/questions/36330860/pythonically-check-if-a-variable-name-is-valid +def is_valid_variable_name(name): + """Returns True iff + the argument is a valid Python variable name""" + if not re.fullmatch('^[_a-zA-Z][_0-9a-zA-Z]*$', name): + return False + try: + ast.parse('{} = None'.format(name)) + return True + except (SyntaxError, ValueError, TypeError): + return False + + +def is_valid_variable_names(code): + """Return True iff all of the co_names are valid Python identifier names""" + return all(is_valid_variable_name(name) for name in code.co_names) + + +def fix_variable_name(name): + """Converts an invalid python variable name into a valid variable name similar to the input""" + # Replace invalid character with underscore + name = re.sub('[^_0-9a-zA-Z]', '_', name) + # Replace leading digit with underscore + name = re.sub('^[0-9]', '_', name) + if not is_valid_variable_name(name): + return '_' + name + else: + return name + + +def fix_variable_names(code): + """Modifies a code object, transforming all invalid names into valid names and avoiding collisions.""" + valid_names = UniqueSuffixSet() + fixed_names = [] + for co_name in code.co_names: + fixed_name = valid_names.add(fix_variable_name(co_name)) + fixed_names.append(fixed_name) + + args = [code.co_argcount] + if PYTHON3: + args.append(code.co_kwonlyargcount) + args += [ + code.co_nlocals, + code.co_stacksize, + code.co_flags, + code.co_code, + code.co_consts, + tuple(fixed_names), # replace code.co_names with fixed version + code.co_varnames, + code.co_filename, + code.co_name, + code.co_firstlineno, + code.co_lnotab, + code.co_freevars, + code.co_cellvars + ] + + return types.CodeType(*args) diff --git a/xdis/main.py b/xdis/main.py index 22406ed7..74178986 100644 --- a/xdis/main.py +++ b/xdis/main.py @@ -37,7 +37,7 @@ from xdis import IS_PYPY from xdis.bytecode import Bytecode -from xdis.code import iscode, code2compat, code3compat +from xdis.code import iscode, code2compat, code3compat, is_valid_variable_names, fix_variable_names from xdis.load import check_object_path, load_module from xdis.util import format_code_info from xdis.version import VERSION @@ -68,6 +68,7 @@ def show_module_header( source_size=None, header=True, show_filename=True, + warn_invalid_variables=True, ): real_out = out or sys.stdout @@ -107,7 +108,8 @@ def show_module_header( real_out.write("# Source code size mod 2**32: %d bytes\n" % source_size) if show_filename: real_out.write("# Embedded file name: %s\n" % co.co_filename) - + if warn_invalid_variables and not is_valid_variable_names(co): + real_out.write("# WARNING: Code contains variables with invalid Python variable names.\n") def disco( bytecode_version, @@ -121,6 +123,8 @@ def disco( asm_format=False, show_bytes=False, dup_lines=False, + warn_invalid_variables=True, + fix_invalid_variables=True, ): """ diassembles and deparses a given code block 'co' @@ -138,11 +142,15 @@ def disco( source_size, header, show_filename=False, + warn_invalid_variables=warn_invalid_variables ) # store final output stream for case of error real_out = out or sys.stdout + if fix_invalid_variables: + co = fix_variable_names(co) + if co.co_filename and not asm_format: real_out.write(format_code_info(co, bytecode_version) + "\n") pass diff --git a/xdis/util.py b/xdis/util.py index 83b4c5d3..22a3e063 100644 --- a/xdis/util.py +++ b/xdis/util.py @@ -253,3 +253,25 @@ def show_code(co, version, file=None, is_pypy=False): print(code_info(co, version, is_pypy=is_pypy)) else: file.write(code_info(co, version) + "\n") + +class UniqueSuffixSet: + """A set that will add a numerical suffix to an added value to make sure values are unique""" + + def __init__(self, initial_values=[]): + """Construct the initial set of value from an iterable of unique values""" + self.values = set(initial_values) + if len(self.values) != len(initial_values): + raise ValueError("Initial values not unique, %d != %d" % (len(self.values), len(initial_values))) + + def add(self, value_candidate): + """Add a new value to the set and return the actual value, including suffix, added""" + if value_candidate in self.values: + for suffix_number in range(len(self.values)): + value = '%s_%d' % (value_candidate, suffix_number) + if value not in self.values: + break + else: + value = value_candidate + assert value not in self.values, "The value was found in the set even though it should not be there." + self.values.add(value) + return value From 6d00806ce3f18a4b51d924a20f4e3cf7bc43fc99 Mon Sep 17 00:00:00 2001 From: Calle Svensson Date: Sun, 12 Apr 2020 01:20:37 +0200 Subject: [PATCH 2/6] Add command line arguments for fixing and detecting variable names --- xdis/bin/pydisasm.py | 14 ++++++++++++-- xdis/main.py | 5 ++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/xdis/bin/pydisasm.py b/xdis/bin/pydisasm.py index c35533e8..779d99b0 100644 --- a/xdis/bin/pydisasm.py +++ b/xdis/bin/pydisasm.py @@ -36,8 +36,18 @@ default=False, help="Show only the module header information", ) +@click.option( + "--warn-invalid-vars/--nowarn-invalid-vars", + default=True, + help="warn about invalid variable names", +) +@click.option( + "--fix-invalid-vars/--nofix-invalid-vars", + default=True, + help="fix the names for variables with invalid names", +) @click.argument("files", nargs=-1, type=click.Path(readable=True), required=True) -def main(asm, show_bytes, header, files): +def main(asm, show_bytes, header, warn_invalid_vars, fix_invalid_vars, files): """Disassembles a Python bytecode file. We handle bytecode for virtually every release of Python and some releases of PyPy. @@ -72,7 +82,7 @@ def main(asm, show_bytes, header, files): ) continue - disassemble_file(path, sys.stdout, asm, header, show_bytes) + disassemble_file(path, sys.stdout, asm, header, show_bytes, warn_invalid_vars=warn_invalid_vars, fix_invalid_vars=fix_invalid_vars) return diff --git a/xdis/main.py b/xdis/main.py index 74178986..ffec6c96 100644 --- a/xdis/main.py +++ b/xdis/main.py @@ -264,7 +264,7 @@ def disco_loop_asm_format(opc, version, co, real_out, fn_name_map, all_fns): def disassemble_file( - filename, outstream=sys.stdout, asm_format=False, header=False, show_bytes=False + filename, outstream=sys.stdout, asm_format=False, header=False, show_bytes=False, warn_invalid_vars=True, fix_invalid_vars=True ): """ disassemble Python byte-code file (.pyc) @@ -285,6 +285,7 @@ def disassemble_file( magic_int, source_size, show_filename=True, + warn_invalid_variables=warn_invalid_vars, ) else: @@ -298,6 +299,8 @@ def disassemble_file( source_size, asm_format=asm_format, show_bytes=show_bytes, + warn_invalid_variables=warn_invalid_vars, + fix_invalid_variables=fix_invalid_vars, ) # print co.co_filename return filename, co, version, timestamp, magic_int From 5cd05edb70d9e900408e1eff9e700a666865e1a5 Mon Sep 17 00:00:00 2001 From: Calle Svensson Date: Sun, 12 Apr 2020 01:47:25 +0200 Subject: [PATCH 3/6] More compatible formatting of string --- xdis/code.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xdis/code.py b/xdis/code.py index 3d855220..56f4071e 100644 --- a/xdis/code.py +++ b/xdis/code.py @@ -649,7 +649,7 @@ def is_valid_variable_name(name): if not re.fullmatch('^[_a-zA-Z][_0-9a-zA-Z]*$', name): return False try: - ast.parse('{} = None'.format(name)) + ast.parse('{} = None' % name) return True except (SyntaxError, ValueError, TypeError): return False From bb7af5b5fdfa58f7f034a88bd0755eb935923cc2 Mon Sep 17 00:00:00 2001 From: Calle Svensson Date: Sun, 12 Apr 2020 01:49:02 +0200 Subject: [PATCH 4/6] More compatible formatting of string - with correct syntax this time --- xdis/code.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xdis/code.py b/xdis/code.py index 56f4071e..bb5a0ad8 100644 --- a/xdis/code.py +++ b/xdis/code.py @@ -649,7 +649,7 @@ def is_valid_variable_name(name): if not re.fullmatch('^[_a-zA-Z][_0-9a-zA-Z]*$', name): return False try: - ast.parse('{} = None' % name) + ast.parse('%s = None' % name) return True except (SyntaxError, ValueError, TypeError): return False From 4636d4abbcb62051c64488603d33b666dc6466f7 Mon Sep 17 00:00:00 2001 From: Calle Svensson Date: Sun, 12 Apr 2020 02:24:13 +0200 Subject: [PATCH 5/6] Added pytests for variable naming detection --- pytest/test_disasm.py | 19 +++++++++++++++++++ xdis/code.py | 12 ++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/pytest/test_disasm.py b/pytest/test_disasm.py index 74d88e81..25680f6b 100644 --- a/pytest/test_disasm.py +++ b/pytest/test_disasm.py @@ -70,3 +70,22 @@ def test_funcoutput(capfd, test_tuple, function_to_test): with open(filename_expected + ".got", "w") as out: out.write(got) assert got == expected + + @pytest.mark.parametrize( + ("obfuscated_bytecode_file", "expected_variable_name"), + [ + ("../test/bytecode_3.7/02_invalid_variable_name1.pyc", "y____Hello___"), + ("../test/bytecode_3.7/02_invalid_variable_name2.pyc", "_for"), + ("../test/bytecode_3.7/02_invalid_variable_name3.pyc", "_x"), + ], + ) + def test_obfuscation(obfuscated_bytecode_file, expected_variable_name): + INVALID_VARS_ERROR_MSG = "# WARNING: Code contains variables with invalid Python variable names." + testfile = os.path.join(get_srcdir(), obfuscated_bytecode_file) + resout = StringIO() + disassemble_file(testfile, resout, header=True, warn_invalid_vars=True) + assert INVALID_VARS_ERROR_MSG in resout.getvalue(), "Warning about invalid variables not found when disassembling %s" % obfuscated_bytecode_file + + resout = StringIO() + disassemble_file(testfile, resout, warn_invalid_vars=False, fix_invalid_vars=True) + assert expected_variable_name in resout.getvalue(), "Expected obfuscated variable in testfile %s to be repaired to %s" % (obfuscated_bytecode_file, expected_variable_name) diff --git a/xdis/code.py b/xdis/code.py index bb5a0ad8..56993fee 100644 --- a/xdis/code.py +++ b/xdis/code.py @@ -646,7 +646,7 @@ def code_has_star_star_arg(code): def is_valid_variable_name(name): """Returns True iff the argument is a valid Python variable name""" - if not re.fullmatch('^[_a-zA-Z][_0-9a-zA-Z]*$', name): + if not re.match('^[_a-zA-Z][_0-9a-zA-Z]*$', name): return False try: ast.parse('%s = None' % name) @@ -688,13 +688,13 @@ def fix_variable_names(code): code.co_stacksize, code.co_flags, code.co_code, - code.co_consts, + tuple(code.co_consts), tuple(fixed_names), # replace code.co_names with fixed version - code.co_varnames, - code.co_filename, - code.co_name, + tuple(code.co_varnames), + str(code.co_filename), + str(code.co_name), code.co_firstlineno, - code.co_lnotab, + code.co_lnotab if type(code.co_lnotab) == bytes else code.co_lnotab.encode('utf-8'), code.co_freevars, code.co_cellvars ] From 9bb567fb4ed9afae37db1590ee4ff522e5226c1d Mon Sep 17 00:00:00 2001 From: Calle Svensson Date: Sun, 12 Apr 2020 02:29:04 +0200 Subject: [PATCH 6/6] Changed defaults for warning and fixing of variable names --- xdis/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xdis/main.py b/xdis/main.py index ffec6c96..7d8fc78b 100644 --- a/xdis/main.py +++ b/xdis/main.py @@ -124,7 +124,7 @@ def disco( show_bytes=False, dup_lines=False, warn_invalid_variables=True, - fix_invalid_variables=True, + fix_invalid_variables=False, ): """ diassembles and deparses a given code block 'co' @@ -264,7 +264,7 @@ def disco_loop_asm_format(opc, version, co, real_out, fn_name_map, all_fns): def disassemble_file( - filename, outstream=sys.stdout, asm_format=False, header=False, show_bytes=False, warn_invalid_vars=True, fix_invalid_vars=True + filename, outstream=sys.stdout, asm_format=False, header=False, show_bytes=False, warn_invalid_vars=True, fix_invalid_vars=False ): """ disassemble Python byte-code file (.pyc)