@pytest.mark.parametrize(
    ("obfuscated_bytecode_file", "expected_variable_name"),
    [
        ("../test/bytecode_3.7/02_invalid_variable_name1.pyc", "y____Hello___"),
        ("../test/bytecode_3.7/02_invalid_variable_name2.pyc", "_for"),
        ("../test/bytecode_3.7/02_invalid_variable_name3.pyc", "_x"),
    ],
)
def test_obfuscation(obfuscated_bytecode_file, expected_variable_name):
    """Check warning and repair of invalid variable names in bytecode.

    Each parametrized bytecode file is disassembled twice: once with the
    module header and warnings enabled, expecting the invalid-name warning
    in the output, and once with name fixing enabled, expecting the
    repaired spelling of the name in the output.
    """
    bytecode_path = os.path.join(get_srcdir(), obfuscated_bytecode_file)
    warning_text = "# WARNING: Code contains variables with invalid Python variable names."

    # Pass 1: header output must carry the warning.
    header_out = StringIO()
    disassemble_file(bytecode_path, header_out, header=True, warn_invalid_vars=True)
    header_listing = header_out.getvalue()
    assert warning_text in header_listing, (
        "Warning about invalid variables not found when disassembling %s"
        % obfuscated_bytecode_file
    )

    # Pass 2: a fresh stream, warnings off, names repaired.
    fixed_out = StringIO()
    disassemble_file(bytecode_path, fixed_out, warn_invalid_vars=False, fix_invalid_vars=True)
    fixed_listing = fixed_out.getvalue()
    assert expected_variable_name in fixed_listing, (
        "Expected obfuscated variable in testfile %s to be repaired to %s"
        % (obfuscated_bytecode_file, expected_variable_name)
    )
index 00000000..ea283ba9 Binary files /dev/null and b/test/bytecode_3.7/02_invalid_variable_name3.pyc differ diff --git a/xdis/bin/pydisasm.py b/xdis/bin/pydisasm.py index c35533e8..779d99b0 100644 --- a/xdis/bin/pydisasm.py +++ b/xdis/bin/pydisasm.py @@ -36,8 +36,18 @@ default=False, help="Show only the module header information", ) +@click.option( + "--warn-invalid-vars/--nowarn-invalid-vars", + default=True, + help="warn about invalid variable names", +) +@click.option( + "--fix-invalid-vars/--nofix-invalid-vars", + default=True, + help="fix the names for variables with invalid names", +) @click.argument("files", nargs=-1, type=click.Path(readable=True), required=True) -def main(asm, show_bytes, header, files): +def main(asm, show_bytes, header, warn_invalid_vars, fix_invalid_vars, files): """Disassembles a Python bytecode file. We handle bytecode for virtually every release of Python and some releases of PyPy. @@ -72,7 +82,7 @@ def main(asm, show_bytes, header, files): ) continue - disassemble_file(path, sys.stdout, asm, header, show_bytes) + disassemble_file(path, sys.stdout, asm, header, show_bytes, warn_invalid_vars=warn_invalid_vars, fix_invalid_vars=fix_invalid_vars) return diff --git a/xdis/code.py b/xdis/code.py index f26162a1..56993fee 100644 --- a/xdis/code.py +++ b/xdis/code.py @@ -15,7 +15,10 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
# From: https://stackoverflow.com/questions/36330860/pythonically-check-if-a-variable-name-is-valid
def is_valid_variable_name(name):
    """Return True iff ``name`` is a valid Python variable name.

    The regular expression restricts the name to ASCII letters, digits and
    underscores that do not start with a digit. Parsing an assignment to
    the name then rejects reserved words such as ``for`` or ``None``.
    """
    if not re.match('^[_a-zA-Z][_0-9a-zA-Z]*$', name):
        return False
    try:
        ast.parse('%s = None' % name)
        return True
    except (SyntaxError, ValueError, TypeError):
        return False


def _is_valid_code_name(name):
    """Return True iff every dot-separated part of ``name`` is a valid identifier.

    ``co_names`` holds more than variable names: ``import os.path`` stores
    the dotted module name ``'os.path'`` there. Such an entry is legitimate
    and must neither trigger a warning nor be rewritten.
    """
    return all(is_valid_variable_name(part) for part in name.split('.'))


def is_valid_variable_names(code):
    """Return True iff all of the co_names are valid Python identifier names.

    Dotted names whose components are all valid identifiers (module paths
    used by import statements) are accepted as valid.
    """
    return all(_is_valid_code_name(name) for name in code.co_names)


def fix_variable_name(name):
    """Convert an invalid Python variable name into a similar valid one.

    Every character outside ``[_0-9a-zA-Z]`` becomes an underscore and a
    leading digit is replaced by an underscore. If the result is still not
    a valid name (for example a keyword), an underscore is prepended.
    """
    # Replace invalid character with underscore
    name = re.sub('[^_0-9a-zA-Z]', '_', name)
    # Replace leading digit with underscore
    name = re.sub('^[0-9]', '_', name)
    if not is_valid_variable_name(name):
        return '_' + name
    return name


def fix_variable_names(code):
    """Return a copy of ``code`` whose invalid ``co_names`` are made valid.

    Names that are already valid (including dotted module names) are kept
    exactly as they are. Each invalid name is repaired with
    ``fix_variable_name`` and, where needed, given a numeric suffix so it
    collides neither with a valid name nor with another repaired name.
    The input code object itself is not modified.
    """
    # Reserve every already-valid name first so that a repaired name can
    # never take the spelling of a valid name appearing later in co_names.
    valid_names = UniqueSuffixSet(
        set(name for name in code.co_names if _is_valid_code_name(name))
    )
    fixed_names = []
    for co_name in code.co_names:
        if _is_valid_code_name(co_name):
            fixed_names.append(co_name)
        else:
            fixed_names.append(valid_names.add(fix_variable_name(co_name)))
    new_names = tuple(fixed_names)

    # Python 3.8 changed the positional layout of the CodeType constructor
    # (co_posonlyargcount was added), so the argument list below is only
    # correct for older interpreters. Native code objects on 3.8+ provide
    # replace(), which copies every other field unchanged.
    if isinstance(code, types.CodeType) and hasattr(code, "replace"):
        return code.replace(co_names=new_names)

    lnotab = code.co_lnotab
    if not isinstance(lnotab, bytes):
        lnotab = lnotab.encode('utf-8')

    args = [code.co_argcount]
    if PYTHON3:
        args.append(code.co_kwonlyargcount)
    args += [
        code.co_nlocals,
        code.co_stacksize,
        code.co_flags,
        code.co_code,
        tuple(code.co_consts),
        new_names,  # replace code.co_names with fixed version
        tuple(code.co_varnames),
        str(code.co_filename),
        str(code.co_name),
        code.co_firstlineno,
        lnotab,
        code.co_freevars,
        code.co_cellvars,
    ]

    return types.CodeType(*args)
class UniqueSuffixSet(object):
    """A set that adds a numerical suffix to an added value to keep values unique.

    The values currently held are available in the ``values`` attribute.
    """

    def __init__(self, initial_values=None):
        """Construct the initial set of values from an iterable of unique values.

        ``initial_values`` may be any iterable, including a generator, or
        None for an empty set. Raises ValueError if it contains duplicates.
        """
        # Materialize once: a generator has no len() and can only be
        # consumed a single time.
        initial_items = [] if initial_values is None else list(initial_values)
        self.values = set(initial_items)
        if len(self.values) != len(initial_items):
            raise ValueError(
                "Initial values not unique, %d != %d"
                % (len(self.values), len(initial_items))
            )

    def add(self, value_candidate):
        """Add a new value to the set and return the value actually added.

        If ``value_candidate`` is not yet present it is added unchanged.
        Otherwise ``_0``, ``_1``, ... is appended until an unused value is
        found, and that suffixed value is added and returned.
        """
        value = value_candidate
        suffix_number = 0
        # The set is finite, so some suffix is always free and this ends.
        while value in self.values:
            value = '%s_%d' % (value_candidate, suffix_number)
            suffix_number += 1
        self.values.add(value)
        return value