Skip to content

Commit

Permalink
add .adt pretty-printer in format_adt.py
Browse files Browse the repository at this point in the history
This makes it easier to inspect .adt files manually without the burden
of fully parsing them.

The formatter works by exploiting the almost-Python syntax of the ADT.
It tweaks it into valid Python, then parses and formats it using
Python's ast.literal_eval and pprint.pformat functions. After this, it
reverses the transformation to produce output very much like BAP's.
  • Loading branch information
katrinafyi committed Oct 16, 2023
1 parent c4406e9 commit aacd974
Showing 1 changed file with 118 additions and 0 deletions.
118 changes: 118 additions & 0 deletions scripts/format_adt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
# vim: noai:ts=2:sw=2:expandtab

"""
format_adt.py implements pretty-printing of BAP .adt files
by translating the ADT into Python syntax, then parsing and
formatting the python.
Although this eval()s, it is made safe by using ast.literal_eval
which only supports parsing of a literal Python expression.
"""

import re
import ast
import sys
import pprint
import typing
import argparse

# Constructor names ("heads") that appear in BAP .adt files.
# The list was generated from sample files with:
#   grep -E '[A-Z][a-zA-Z]+\(' *.adt --only-matching | sort | uniq | tr -d '(' | xargs -n1 printf "'%s', "
heads = [
  'Annotation', 'Arg', 'Args', 'ARSHIFT', 'Attr', 'Attrs', 'Blk', 'Blks',
  'Both', 'Call', 'Concat', 'Def', 'Defs', 'Direct', 'EQ', 'Extract', 'Goto',
  'Imm', 'In', 'Indirect', 'Int', 'Jmps', 'LittleEndian', 'Load', 'LOW',
  'Mem', 'Memmap', 'NEQ', 'NOT', 'Out', 'Phis', 'PLUS', 'Program', 'Project',
  'Region', 'Section', 'Sections', 'SIGNED', 'Store', 'Sub', 'Subs', 'Tid',
  'UNSIGNED', 'Var',
]
heads_joined = '|'.join(heads)

# Matches either a complete quoted string literal (group 1 is None) or a head
# immediately followed by "(" (group 1 is the head name). Consuming string
# literals as whole tokens keeps text such as "call Sub(x)" from being
# rewritten. Compiled once at import time rather than on every call.
_preprocess_re = re.compile(
  r'"[^"\\]*(?:\\.[^"\\]*)*"'
  r"|'[^'\\]*(?:\\.[^'\\]*)*'"
  rf'|\b({heads_joined})[(]',
  re.DOTALL,
)

def preprocess(data: str) -> str:
  """
  Preprocesses BAP ADT intrinsics (like Program, Subs, ...) into tuple syntax.
  For example, Program(1, 2, 3) becomes ('Program', 1, 2, 3).

  Only the names listed in `heads` are rewritten, and only when they occur
  outside of string literals and start at a word boundary. The contents of
  quoted strings are returned unchanged.

  data: the raw text of an .adt file.
  Returns the text with every recognised `Head(` replaced by `('Head', `.
  """
  def to_tuple(match):
    head = match[1]
    if head is None:
      # A string literal was matched: emit it verbatim.
      return match[0]
    return '(' + repr(head) + ', '

  return _preprocess_re.sub(to_tuple, data)

class DoubleQuoteStr(str):
  """
  A str whose repr() is always a double-quoted Python string literal.

  pprint uses repr() to render values, so wrapping strings in this class
  makes the formatted output use "..." instead of Python's preferred '...'.
  """

  def __repr__(self):
    r = super().__repr__()
    quote, body = r[0], r[1:-1]

    if quote == '"':
      # Python only chooses double quotes when the text contains ' but no ",
      # so the literal is already in the desired form.
      return r

    def requote(match):
      token = match[0]
      if token == "\\'":
        # An escaped apostrophe needs no escape inside double quotes.
        return "'"
      if token == '"':
        # A bare double quote must now be escaped.
        return '\\"'
      # Any other escape sequence (\\\\, \\n, \\x..) is kept as it is.
      return token

    # Walk the body one escape sequence at a time so that a backslash which
    # belongs to "\\\\" is never mistaken for the start of "\\'".
    return '"' + re.sub(r'\\.|"', requote, body, flags=re.DOTALL) + '"'

class UnderscoreInt(int):
  """
  An int whose repr() groups digits with underscores, e.g. 1_234_567.
  """

  def __repr__(self):
    # The '_' format spec inserts an underscore between every three digits.
    return format(self, '_')

Exp = tuple | list | str | int
def clean(data: Exp) -> Exp:
"""
Intermediate step before formatting to tweak pprint's formatting.
This ensures we match BAP as close as possible with double-quoted strings
and underscores in Tid.
"""
if isinstance(data, tuple) and data[0] == 'Tid' and not isinstance(data[1], UnderscoreInt):
return clean((data[0], UnderscoreInt(data[1]), ) + data[2:])
if isinstance(data, str):
return DoubleQuoteStr(data)
if isinstance(data, (list, tuple)):
return data.__class__(map(clean, data))
return data

def postprocess(data: str) -> str:
  """
  Postprocesses the formatted Python expression to restore the BAP-style intrinsics.

  Three rewrites are applied in a single left-to-right pass:
    ("Head", x, ...)  becomes  Head(x, ...)   (a newline after the comma is kept)
    ("Head",)         becomes  Head()         (heads without arguments)
    ,)                becomes  )              (any other one-element tuple)

  Quoted string literals are consumed as whole tokens and emitted verbatim,
  so a ",)" inside a string is never altered.

  data: the text produced by pprint for the cleaned expression.
  Returns the text with BAP constructor syntax restored.
  """
  token_re = re.compile(
    rf'[(]"({heads_joined})",(\s|[)])'
    r'|"[^"\\]*(?:\\.[^"\\]*)*"'
    r"|'[^'\\]*(?:\\.[^'\\]*)*'"
    r'|,[)]',
    re.DOTALL,
  )

  def restore(match):
    head, follow = match[1], match[2]
    if head is not None:
      if follow == ')':
        # One-element tuple: the head had no arguments.
        return head + '()'
      # Drop the space after the comma but keep a line break, so wrapped
      # output retains pprint's layout.
      return head + '(' + ('\n' if follow == '\n' else '')
    if match[0] == ',)':
      return ')'
    # A string literal: leave untouched.
    return match[0]

  return token_re.sub(restore, data)


def main(args):
  """
  Formats one ADT document according to the parsed command-line arguments.

  args must provide:
    input:  readable text file containing the ADT.
    output: writable text file for the result (used unless update is set).
    width:  indent value handed to pprint.
    update: when true, the result is written back to the file named by
            input.name instead of to output.
  """
  source, sink = args.input, args.output
  indent, in_place = args.width, args.update

  text = source.read()

  # ADT text -> Python literal -> cleaned tree -> pretty text -> ADT text.
  tree = clean(ast.literal_eval(preprocess(text)))
  formatted = postprocess(
    pprint.pformat(tree, indent=indent, underscore_numbers=False)
  )

  if not in_place:
    sink.write(formatted)
    sink.write('\n')
    sink.flush()
    return

  # Release the read handle before reopening the same path for writing.
  source.close()
  with open(source.name, 'w') as rewritten:
    rewritten.write(formatted)
    rewritten.write('\n')

if __name__ == '__main__':
  # Command-line entry point: build the parser, validate, then run main().
  parser = argparse.ArgumentParser(description="pretty formats BAP ADT files.")

  parser.add_argument(
    'input',
    nargs='?',
    type=argparse.FileType('r'),
    default=sys.stdin,
    help="input .adt file (default: stdin)",
  )

  # An explicit output file and in-place update cannot be combined.
  destination = parser.add_mutually_exclusive_group()
  destination.add_argument(
    'output',
    nargs='?',
    type=argparse.FileType('w'),
    default=sys.stdout,
    help="output file name (default: stdout)",
  )

  parser.add_argument(
    '--width', '-w',
    default=1,
    type=int,
    help="indent size in spaces (default: 1)",
  )

  destination.add_argument(
    '--update', '-i',
    action='store_true',
    help="write output back to the input file (default: false)",
  )

  cli_args = parser.parse_args()

  # Updating in place needs a real file to write back to.
  reading_stdin = cli_args.input is sys.stdin
  if reading_stdin and cli_args.update:
    parser.error('argument --update/-i: not allowed with stdin input')

  main(cli_args)

0 comments on commit aacd974

Please sign in to comment.