Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect and fix invalid variable names #59

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added test/bytecode_3.7/02_invalid_variable_name1.pyc
Binary file not shown.
Binary file added test/bytecode_3.7/02_invalid_variable_name2.pyc
Binary file not shown.
Binary file added test/bytecode_3.7/02_invalid_variable_name3.pyc
Binary file not shown.
14 changes: 12 additions & 2 deletions xdis/bin/pydisasm.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,18 @@
default=False,
help="Show only the module header information",
)
@click.option(
"--warn-invalid-vars/--nowarn-invalid-vars",
default=True,
help="warn about invalid variable names",
)
@click.option(
"--fix-invalid-vars/--nofix-invalid-vars",
default=True,
help="fix the names for variables with invalid names",
)
@click.argument("files", nargs=-1, type=click.Path(readable=True), required=True)
def main(asm, show_bytes, header, files):
def main(asm, show_bytes, header, warn_invalid_vars, fix_invalid_vars, files):
"""Disassembles a Python bytecode file.

We handle bytecode for virtually every release of Python and some releases of PyPy.
Expand Down Expand Up @@ -72,7 +82,7 @@ def main(asm, show_bytes, header, files):
)
continue

disassemble_file(path, sys.stdout, asm, header, show_bytes)
disassemble_file(path, sys.stdout, asm, header, show_bytes, warn_invalid_vars=warn_invalid_vars, fix_invalid_vars=fix_invalid_vars)
return


Expand Down
63 changes: 63 additions & 0 deletions xdis/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

from xdis import PYTHON3, PYTHON_VERSION
from xdis.util import UniqueSuffixSet
import inspect, types
import ast
import re


class Code3:
Expand Down Expand Up @@ -637,3 +640,63 @@ def code_has_star_star_arg(code):
"""Return True iff
The code object has a variable keyword parameter (**kwargs-like)."""
return (code.co_flags & 8) != 0


# From: https://stackoverflow.com/questions/36330860/pythonically-check-if-a-variable-name-is-valid
def is_valid_variable_name(name):
"""Returns True iff
the argument is a valid Python variable name"""
if not re.fullmatch('^[_a-zA-Z][_0-9a-zA-Z]*$', name):
return False
try:
ast.parse('{} = None'.format(name))
ZetaTwo marked this conversation as resolved.
Show resolved Hide resolved
return True
except (SyntaxError, ValueError, TypeError):
return False


def is_valid_variable_names(code):
"""Return True iff all of the co_names are valid Python identifier names"""
return all(is_valid_variable_name(name) for name in code.co_names)


def fix_variable_name(name):
"""Converts an invalid python variable name into a valid variable name similar to the input"""
# Replace invalid character with underscore
name = re.sub('[^_0-9a-zA-Z]', '_', name)
# Replace leading digit with underscore
name = re.sub('^[0-9]', '_', name)
if not is_valid_variable_name(name):
return '_' + name
else:
return name


def fix_variable_names(code):
"""Modifies a code object, transforming all invalid names into valid names and avoiding collisions."""
valid_names = UniqueSuffixSet()
fixed_names = []
for co_name in code.co_names:
fixed_name = valid_names.add(fix_variable_name(co_name))
fixed_names.append(fixed_name)

args = [code.co_argcount]
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole section is ugly

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Left a comment about that. Ideal is to add "replace" methods to the xdis code type and and a method to convert that code type to the native code type.

It is more work that we originally bargained for but after its done things will be much nicer I think.

if PYTHON3:
args.append(code.co_kwonlyargcount)
args += [
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Recently I've come to learn that Python 3.8 there is now a "replace" method on Code. See https://docs.python.org/3/library/types.html#types.CodeType

So xdis in its code type should add this method (for all types of CodeType) . So the way ths might work to create an xdis code type and then convert that to the local code type.

I realize this is a digression from the main topic here, so it can be deferred. I do want to note it becuase this kind of code is pervasive and ugly and error prone.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds good. I'm not super familiar with code objects especially not accross different Python versions and I'm also not familiar with this codebase so I don't know what Python versions xdis is aming to be compatible with but I'll gladly change this part of the code anytime as it is by far the worst part of the whole PR.

code.co_nlocals,
code.co_stacksize,
code.co_flags,
code.co_code,
code.co_consts,
tuple(fixed_names), # replace code.co_names with fixed version
code.co_varnames,
code.co_filename,
code.co_name,
code.co_firstlineno,
code.co_lnotab,
code.co_freevars,
code.co_cellvars
]

return types.CodeType(*args)
17 changes: 14 additions & 3 deletions xdis/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

from xdis import IS_PYPY
from xdis.bytecode import Bytecode
from xdis.code import iscode, code2compat, code3compat
from xdis.code import iscode, code2compat, code3compat, is_valid_variable_names, fix_variable_names
from xdis.load import check_object_path, load_module
from xdis.util import format_code_info
from xdis.version import VERSION
Expand Down Expand Up @@ -68,6 +68,7 @@ def show_module_header(
source_size=None,
header=True,
show_filename=True,
warn_invalid_variables=True,
):

real_out = out or sys.stdout
Expand Down Expand Up @@ -107,7 +108,8 @@ def show_module_header(
real_out.write("# Source code size mod 2**32: %d bytes\n" % source_size)
if show_filename:
real_out.write("# Embedded file name: %s\n" % co.co_filename)

if warn_invalid_variables and not is_valid_variable_names(co):
real_out.write("# WARNING: Code contains variables with invalid Python variable names.\n")

def disco(
bytecode_version,
Expand All @@ -121,6 +123,8 @@ def disco(
asm_format=False,
show_bytes=False,
dup_lines=False,
warn_invalid_variables=True,
fix_invalid_variables=True,
):
"""
diassembles and deparses a given code block 'co'
Expand All @@ -138,11 +142,15 @@ def disco(
source_size,
header,
show_filename=False,
warn_invalid_variables=warn_invalid_variables
)

# store final output stream for case of error
real_out = out or sys.stdout

if fix_invalid_variables:
co = fix_variable_names(co)

if co.co_filename and not asm_format:
real_out.write(format_code_info(co, bytecode_version) + "\n")
pass
Expand Down Expand Up @@ -256,7 +264,7 @@ def disco_loop_asm_format(opc, version, co, real_out, fn_name_map, all_fns):


def disassemble_file(
filename, outstream=sys.stdout, asm_format=False, header=False, show_bytes=False
filename, outstream=sys.stdout, asm_format=False, header=False, show_bytes=False, warn_invalid_vars=True, fix_invalid_vars=True
):
"""
disassemble Python byte-code file (.pyc)
Expand All @@ -277,6 +285,7 @@ def disassemble_file(
magic_int,
source_size,
show_filename=True,
warn_invalid_variables=warn_invalid_vars,
)

else:
Expand All @@ -290,6 +299,8 @@ def disassemble_file(
source_size,
asm_format=asm_format,
show_bytes=show_bytes,
warn_invalid_variables=warn_invalid_vars,
fix_invalid_variables=fix_invalid_vars,
)
# print co.co_filename
return filename, co, version, timestamp, magic_int
Expand Down
22 changes: 22 additions & 0 deletions xdis/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,3 +253,25 @@ def show_code(co, version, file=None, is_pypy=False):
print(code_info(co, version, is_pypy=is_pypy))
else:
file.write(code_info(co, version) + "\n")

class UniqueSuffixSet:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could not come up with a better name for this class

"""A set that will add a numerical suffix to an added value to make sure values are unique"""

def __init__(self, initial_values=[]):
"""Construct the initial set of value from an iterable of unique values"""
self.values = set(initial_values)
if len(self.values) != len(initial_values):
raise ValueError("Initial values not unique, %d != %d" % (len(self.values), len(initial_values)))

def add(self, value_candidate):
"""Add a new value to the set and return the actual value, including suffix, added"""
if value_candidate in self.values:
for suffix_number in range(len(self.values)):
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the first name is already in the set, it's enough to try len(self.values) additional names by the pigeonhole principle.

value = '%s_%d' % (value_candidate, suffix_number)
if value not in self.values:
break
else:
value = value_candidate
assert value not in self.values, "The value was found in the set even though it should not be there."
self.values.add(value)
return value