# swig_debug_parser.py

# Copyright 2016 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Uses SWIG output to convert C++ code snippets to C# code snippets.

Basic Usage: swig_debug_parser.py -d <swig_debug_output> -s <cs_sources>

When running SWIG, use `-debug-top 4` and pipe the output to a file to collect
the debug information about the SWIG parse state. This information can be used
by this script to generate a mapping from C++ identifiers to C# identifiers.
Then, the script looks for code snippets in the comments of the C# source
(denoted by areas in comments wrapped in backticks) and replaces all C++
identifiers with the appropriate C# identifiers.
"""

import re

from absl import app
from absl import flags
from absl import logging

# Command-line flags. `srcs` and `debug_top` are marked as required in the
# `__main__` guard at the bottom of this file; `namespace` is optional.
FLAGS = flags.FLAGS
flags.DEFINE_spaceseplist('srcs',
                          None,
                          'The C# source files to process in-place.',
                          short_name='s')
# The help text spells the SWIG option exactly as it must be typed
# (`-debug-top 4`), matching the usage notes in the module docstring.
flags.DEFINE_string('debug_top',
                    None,
                    'The SWIG output file generated by running SWIG with the '
                    'argument `-debug-top 4`.',
                    short_name='d')
flags.DEFINE_string('namespace',
                    None,
                    'Place all C# identifiers in the given namespace.',
                    short_name='n')

# Matches strings of the form '+++ SomeToken --------------------------'
NODE_HEADER_REGEX = re.compile(r'\+\+\+ ([a-zA-Z0-9_:]+) -+')

# Matches strings of the form '    | Key - "StringValue"'
NODE_FIELD_REGEX = re.compile(r' *\| ([a-zA-Z0-9_:]+) *- "(.+)"')

# Matches strings of the form '    | Key - 0xdeadbeef'
NODE_HEX_FIELD_REGEX = re.compile(r' *\| ([a-zA-Z0-9_:]+) *- (0x[0-9a-f]+)')

# Matches strings of the form '... `some::function::name(int, std::string)` ...'
# The captured identifier must start with a letter, underscore or colon, but
# may contain digits afterwards so that names such as `math::Vector3::Dot` are
# captured in full rather than being cut off at the first digit.
FUNCTION_NAME_REGEX = re.compile(r'`([a-zA-Z_:][a-zA-Z0-9_:]*)[^`]*`')

# When parsing nodes it is useful to know what kind of node is being parsed, so
# a special key is added to each node so that it is aware of what list it came
# from.
NODE_TYPE_KEY = '__type__'

# We have to store children in a special sub-dict because some of the child node
# identifiers can clash with field keys.
NODE_CHILDREN_KEY = '__children__'


class PeekableIter(object):
  """A simple iterator wrapper that supplies a peek function.

  This is useful when doing things like parsing text, where you often want to
  see the next token before consuming it.

  The wrapper supports both the explicit `next()` method and the Python 3
  iterator protocol (`__next__`), so it can be used in `for` loops as well.
  """

  def __init__(self, iterable):
    self.iterator = iter(iterable)
    # The value fetched by peek() that has not been consumed yet, or None when
    # nothing is buffered.
    self.next_value = None
    # Tracks whether `next_value` currently holds a buffered value. A separate
    # flag is needed so that a buffered value of None is not mistaken for
    # "nothing buffered".
    self._has_peeked = False

  def __iter__(self):
    return self

  def __next__(self):
    """Returns the next value being iterated over.

    Raises:
      StopIteration: The underlying iterator is exhausted.
    """
    if self._has_peeked:
      value = self.next_value
      self.next_value = None
      self._has_peeked = False
      return value
    return next(self.iterator)

  def next(self):
    """Returns the next value being iterated over."""
    return self.__next__()

  def peek(self):
    """Returns the next value being iterated over without consuming it.

    Raises:
      StopIteration: The underlying iterator is exhausted.
    """
    if not self._has_peeked:
      self.next_value = next(self.iterator)
      self._has_peeked = True
    return self.next_value


def indentation(line):
  """Counts the whitespace characters at the start of `line`.

  Args:
    line: The string to measure.

  Returns:
    The difference in length between `line` and `line` with its leading
    whitespace stripped.
  """
  without_leading_space = line.lstrip()
  return len(line) - len(without_leading_space)


def parse_children(node, it, indent):
  """Parses one child node and attaches it to the given parent node.

  Child nodes start with three '+', followed by the node name, followed by a
  series of -'s.

  SWIG Debug output looks something like the following:

     +++ somenode ----------------------------------------
     | field        - "Value"
     | another_field - "Some other value"

           +++ childnode ----------------------------------------
           | child_field    - "blah blah blah"

  Each node has a header which declares what kind of node it is, followed by
  zero or more key value pairs, followed by zero or more child nodes which
  follow the same pattern.

  Sometimes there is a blank line, or a line consisting of only a vertical pipe
  character between the final field and the child nodes.

  The parsed child is appended to the list stored under its node name inside
  the parent's NODE_CHILDREN_KEY dict, and is tagged with its node name under
  NODE_TYPE_KEY.

  Args:
    node: The parent node that receives the parsed child.
    it: A PeekableIter to iterate over the lines of the swig debug output. The
        next line must be the child's header line.
    indent: The indentation level of the child being parsed. Used to determine
        when to recurse into children or return.
  """
  header_line = it.next()
  node_kind = NODE_HEADER_REGEX.search(header_line).group(1)

  # Create the bookkeeping containers before descending so the parent already
  # has a slot for this kind of child.
  siblings_by_kind = node.setdefault(NODE_CHILDREN_KEY, {})
  siblings = siblings_by_kind.setdefault(node_kind, [])

  parsed_child = parse_node(it, indent)
  parsed_child[NODE_TYPE_KEY] = node_kind
  siblings.append(parsed_child)


def consume_empty_lines(it):
  """Advances past filler lines that may follow a node.

  A filler line is either an empty string or a line that contains nothing but
  a single pipe character once surrounding whitespace is removed. The iterator
  is left positioned on the first line that is not filler.

  Args:
    it: A PeekableIter to iterate over the lines of the swig debug output.

  Raises:
    StopIteration: The iterator reached the end.
  """
  while True:
    upcoming = it.peek()
    # pylint: disable=g-explicit-bool-comparison
    if upcoming != '' and upcoming.strip() != '|':
      return
    it.next()


def parse_node(it, indent):
  """Parses the body of a node: its fields first, then its child nodes.

  The header line of the node is expected to have been consumed already.
  Field lines are read for as long as they sit at exactly `indent` and do not
  start a new node. Lines indented deeper than `indent` are then handed to
  parse_children. Running out of input at any point ends parsing and returns
  whatever has been collected so far.

  Args:
    it: A PeekableIter to iterate over the lines of the swig debug output.
    indent: The current indentation level. Used to determine when to recurse
        into children or return.

  Returns:
    A dict containing the parsed fields.
  """
  parsed = {}
  try:
    # Phase one: key/value fields belonging to this node.
    while True:
      line = it.peek()
      if indentation(line) != indent:
        break
      # A '+++' marker at this level is the start of a new node, not a field.
      if '+++' in line:
        break

      quoted_field = NODE_FIELD_REGEX.search(line)
      if quoted_field:
        parsed[quoted_field.group(1)] = quoted_field.group(2)
      else:
        hex_field = NODE_HEX_FIELD_REGEX.search(line)
        if hex_field:
          parsed[hex_field.group(1)] = int(hex_field.group(2), 16)
      # Lines matching neither pattern are skipped.
      it.next()

    consume_empty_lines(it)

    # Phase two: any more deeply indented lines are child nodes.
    while True:
      child_indent = indentation(it.peek())
      if child_indent <= indent:
        break
      parse_children(parsed, it, child_indent)
      consume_empty_lines(it)
  except StopIteration:
    # End of input: return what was gathered up to this point.
    pass

  return parsed


def parse_swig_debug_top(it):
  """Parses the output of `swig -debug-top 4`.

  Parses the debug output into a series of nested dictionaries, which we can
  use to generate a map of C++ classes and enums to C# classes and enums.

  Args:
    it: A PeekableIter to iterate over the lines of the swig debug output.

  Returns:
    A dictionary representing the root node of the parse tree. The parsed
    top-level node is stored in it as a child.
  """
  # The debug output begins with a bunch of stuff we don't care about. Skip
  # lines until one starts with +++, which indicates the root node of the tree.
  while not it.peek().startswith('+++'):
    it.next()

  tree = {}
  parse_children(tree, it, 0)
  return tree


def gather_subtitution_data(node,
                            identifier_metadata_map,
                            file_module_map,
                            current_class=None,
                            includes=None):
  """Builds metadata needed to perform identifier substitution on C# sources.

  This is basically doing the first phase of two-phase parsing. We need to build
  a mapping between C++ identifiers to C# identifiers. However, the C# class
  names are not necessarily known until after the nodes are parsed. To solve
  that, a second mapping between filenames and the modules they represent is
  used. The identifier map stores some metadata, including what file it was
  declared in. Those together can later be used to evaluate what class a given
  identifier should belong to.

  Both maps are updated in place; nothing is returned.

  Args:
    node: The current node being scanned for metadata.
    identifier_metadata_map: A map between C++ identifiers and the metadata
        needed to determine their C# identifier.
    file_module_map: The map between files and which module they represent.
    current_class: The class that the data in the current node refers to.
    includes: The list of includes that we had to parse to get to this object.
        May be None, which is treated as an empty list.
  """
  # Check if we've recursed into an included file.
  local_includes = includes or []
  node_type = node.get(NODE_TYPE_KEY)
  if node_type in ('include', 'import'):
    name = node.get('name')
    if name:
      # Copy before appending so the caller's list (shared with sibling nodes)
      # is left untouched. Copy `local_includes` rather than `includes`, since
      # the latter may be None.
      local_includes = list(local_includes)
      local_includes.append(name)
      # If this include file has declared that its contents is part of a module,
      # record the mapping between the file name and the module name.
      module = node.get('module')
      if module and isinstance(name, str):
        file_module_map[name] = module

  # Add classes and nodes to the identifier map.
  elif node_type in ('class', 'struct'):
    current_class = node.get('name')
    metadata = {'includes': local_includes}
    if FLAGS.namespace:
      metadata['namespace'] = FLAGS.namespace
    identifier_metadata_map[current_class] = metadata

  # Add C++ function declarations to the identifier map.
  elif node_type == 'cdecl':
    symname = node.get('sym:name')
    name = node.get('name')
    # Check for a usable name *before* qualifying it with the class name;
    # otherwise a nameless declaration would be recorded as 'Class::None'.
    if name and symname:
      if current_class:
        name = '%s::%s' % (current_class, name)
      metadata = {
          'includes': local_includes,
          'symname': symname,
      }
      if FLAGS.namespace:
        metadata['namespace'] = FLAGS.namespace
      identifier_metadata_map[name] = metadata

  # Recurse into all children nodes and repeat
  child_nodes = node.get(NODE_CHILDREN_KEY, {})
  for children in child_nodes.values():
    for child_node in children:
      gather_subtitution_data(child_node, identifier_metadata_map,
                              file_module_map, current_class, local_includes)


def resolve_module(includes, file_module_map):
  """Finds the module declared by the most recently included file.

  The files that SWIG operates on can recursively include other files. Only the
  most recent module declaration matters, so the include list is walked from
  its last entry to its first and the first file with a module wins.

  Args:
    includes: The list of includes that we had to parse to get to this object.
    file_module_map: The map between files and which module they represent.

  Returns:
    The current module, if any, for the given list of includes; None otherwise.
  """
  declared_modules = (
      file_module_map.get(filename) for filename in reversed(includes))
  return next((module for module in declared_modules if module), None)


def generated_substitution_map(identifier_metadata_map, file_module_map):
  """Turns the collected metadata into a dict of C++ to C# identifiers.

  Each C# name is assembled from up to three parts, in this order: the
  namespace, the module resolved from the include chain, and the symbol name.
  Parts that are missing or empty are left out, and the remaining parts are
  joined with '.'. Identifiers whose metadata has no includes are skipped.

  Args:
    identifier_metadata_map: A map between C++ identifiers and the metadata
        needed to determine their C# identifier.
    file_module_map: The map between files and which module they represent.

  Returns:
    A dict of fully qualified C++ identifiers and the C# identifiers they map
    to.
  """
  cs_names = {}

  for cpp_name, info in identifier_metadata_map.items():
    include_chain = info.get('includes')
    if not include_chain:
      continue

    name_parts = (
        info.get('namespace'),
        resolve_module(include_chain, file_module_map),
        info.get('symname'),
    )
    cs_names[cpp_name] = '.'.join(part for part in name_parts if part)

  return cs_names


def perform_substitution(match, substitution_map):
  """Substitutes C++ identifiers with C# identifiers.

  We want to perform substitutions on function names, but not accidentally hit
  anything else in the string. For example, if the line looks like this:

  /// Returns true if `firebase::crash::Initialize()` has been called.

  Then we want the final string to be:

  /// Returns true if `Firebase.Crash.Initialize()` has been called.

  The regex looks for identifiers enclosed within backticks ignoring things
  like parentheses. If we did the substitution directly, the backticks and
  parentheses would be lost. Instead, the captured identifier (in this case,
  'firebase::crash::Initialize') is swapped out by position inside the whole
  match ('`firebase::crash::Initialize()`'), so that the surrounding characters
  are preserved exactly, including any later text in the snippet that happens
  to contain the same identifier as a substring.

  Args:
    match: The re.Match object representing the match. Group 1 must be the
        captured C++ identifier.
    substitution_map: The dict of potential substitutions.

  Returns:
    The new C# code resulting from performing the substitutions, or the
    unmodified matched text when the identifier has no (non-empty) entry in
    the substitution map.
  """
  full_match = match.group(0)
  cs = substitution_map.get(match.group(1))
  if not cs:
    return full_match

  # Convert the capture's absolute span into offsets within the matched text,
  # then splice the C# name into exactly that span.
  match_start = match.start(0)
  begin = match.start(1) - match_start
  end = match.end(1) - match_start
  return full_match[:begin] + cs + full_match[end:]


def apply_substitution(file_content, substitution_map):
  """Rewrites every backtick-quoted snippet found in the given text.

  Each match of FUNCTION_NAME_REGEX is passed to perform_substitution together
  with the substitution map, and is replaced by whatever that returns.

  Args:
    file_content: The text on which to perform the substitutions.
    substitution_map: The dict of potential substitutions.

  Returns:
    A new string with all substitutions performed.
  """

  def _rewrite_snippet(found):
    return perform_substitution(found, substitution_map)

  return FUNCTION_NAME_REGEX.sub(_rewrite_snippet, file_content)


def main(unused_argv):
  """Converts references to C++ identifiers into C# identifiers.

  Given the output of running SWIG with the argument `-debug-top 4`, convert all
  references to C++ identifiers into C# identifiers in the given files,
  optionally prepending all C# identifiers with the namespace given by -n

  Each source file is read, rewritten in memory and written back in place. A
  failure to write a file is logged as a warning and processing continues with
  the next file.

  Args:
    unused_argv: Positional command-line arguments; ignored.
  """
  with open(FLAGS.debug_top, 'r') as debug_file:
    debug_file_content = debug_file.read()

  # Parse the debug output into a format we can work with.
  debug_line_iter = PeekableIter(debug_file_content.splitlines())
  debug_data = parse_swig_debug_top(debug_line_iter)

  # Gather the data we need to perform the substitutions.
  identifier_metadata_map = {}
  file_module_map = {}
  gather_subtitution_data(debug_data, identifier_metadata_map, file_module_map)

  substitution_map = generated_substitution_map(identifier_metadata_map,
                                                file_module_map)

  for src in FLAGS.srcs:
    with open(src, 'r') as cs_file:
      file_content = cs_file.read()

    # Apply the maps to the C# code.
    file_content = apply_substitution(file_content, substitution_map)

    try:
      with open(src, 'w') as cs_file:
        cs_file.write(file_content)
    except IOError as e:
      # Report the path. If opening for write failed, `cs_file` still refers to
      # the already-closed read handle, which is not useful in a log message.
      logging.warning('Unable to patch file %s (%s)', src, str(e))


if __name__ == '__main__':
  # Both the SWIG debug dump and the list of C# sources must be supplied.
  for required_flag in ('debug_top', 'srcs'):
    flags.mark_flag_as_required(required_flag)
  app.run(main)