From 6d6a5d946e2198de47099878373adf6eb64617b0 Mon Sep 17 00:00:00 2001
From: James Gross <45212823+rakuy0@users.noreply.github.com>
Date: Thu, 16 Nov 2023 15:19:10 -0500
Subject: [PATCH] Deal with uninitialized data sections in PE files and tackle
 some recursion issues (#622)

---
 envi/codeflow.py                         | 83 +++++++++++++++++++-----
 envi/const.py                            |  1 +
 vivisect/__init__.py                     |  6 +-
 vivisect/analysis/generic/codeblocks.py  | 10 +++
 vivisect/analysis/generic/funcentries.py |  2 +
 vivisect/base.py                         |  8 +--
 vivisect/parsers/pe.py                   |  2 +
 vivisect/tests/testvivisect.py           |  1 -
 vivisect/tools/graphutil.py              | 14 +++-
 9 files changed, 104 insertions(+), 23 deletions(-)

diff --git a/envi/codeflow.py b/envi/codeflow.py
index 795cbd456..db287d80f 100644
--- a/envi/codeflow.py
+++ b/envi/codeflow.py
@@ -49,11 +49,17 @@ def __init__(self, mem, persist=False, exptable=True, recurse=True):
         self._cf_recurse = recurse
         self._cf_exptable = exptable
         self._cf_blocks = []
+
+        self._cf_blocked = collections.OrderedDict()
+        self._cf_delaying = collections.defaultdict(set)
+        self._cf_delayed = collections.defaultdict(set)
+        self._calls_from = {}
+
         self._dynamic_branch_handlers = []
 
     def _cb_opcode(self, va, op, branches):
         '''
-        Extend CodeFlowContext and implement this method to recieve
+        Extend CodeFlowContext and implement this method to receive
         a callback for every newly discovered opcode.
         '''
         return branches
@@ -70,7 +76,7 @@ def _cb_noflow(self, va, tva):
         '''
         Implement this method to receive a callback when a given code
         branch is skipped due to being in the noflow dictionary.
-        ( likely due to prodedural branch to noreturn address )
+        ( likely due to procedural branch to noreturn address )
         '''
         pass
 
@@ -89,8 +95,8 @@ def _cb_branchtable(self, tableva, ptrva, destva):
 
     def _cb_dynamic_branch(self, va, op, bflags, branches):
         '''
-        if codeflow finds a branch to a non-discrete value (eg. to a register)
-        we handle it here.  by default, we simply track the dynamic branch in a global
+        if codeflow finds a branch to a non-discrete value (eg: to a register)
+        we handle it here. By default, we simply track the dynamic branch in a global
         VaSet which is added to every workspace.
         '''
         '''
@@ -128,7 +134,7 @@ def addFunctionDef(self, fva, calls_from):
 
     def addCodeFlow(self, va, arch=envi.ARCH_DEFAULT):
         '''
-        Do code flow disassembly from the specified address.  Returnes a list
+        Do code flow disassembly from the specified address. Returns a list
         of the procedural branch targets discovered during code flow...
 
         Set persist=True to store 'opdone' and never disassemble the same thing twice
@@ -179,7 +185,8 @@ def addCodeFlow(self, va, arch=envi.ARCH_DEFAULT):
 
                 bva, bflags = branches.pop()
 
-                # look for dynamic branches (ie. branches which don't have a known target).  assume at least one branch
+                # look for dynamic branches (ie. branches which don't have a known target).
+                # Assume at least one branch
                 if bva is None:
                     self._cb_dynamic_branch(va, op, bflags, branches)
 
@@ -217,6 +224,9 @@ def addCodeFlow(self, va, arch=envi.ARCH_DEFAULT):
                     if not self._mem.probeMemory(bva, 1, e_const.MM_EXEC):
                         continue
 
+                    if self._mem.probeMemory(bva, 1, e_const.MM_UNINIT):
+                        continue
+
                     if bflags & envi.BR_PROC:
 
                         # Record that the current code flow has a call from it
@@ -225,18 +235,18 @@ def addCodeFlow(self, va, arch=envi.ARCH_DEFAULT):
 
                         if bva != nextva:  # NOTE: avoid call 0 constructs
 
-                            # Now we decend so we do deepest func callbacks first!
+                            # Now we descend so we do deepest func callbacks first!
                             if self._cf_recurse:
                                 # descend into functions, but make sure we don't descend into
                                 # recursive functions
                                 if bva in self._cf_blocks:
-                                    logger.debug("not recursing to function 0x%x (at 0x%x): it's already in analysis call path (ie. it called *this* func)", 
+                                    logger.debug("not recursing to function 0x%x (at 0x%x): it's already in analysis call path (ie. it called *this* func)",
                                             bva, va)
                                     logger.debug("call path: \t" + ", ".join([hex(x) for x in self._cf_blocks]))
-                                    # the function that we want to make prodcedural
+                                    # the function that we want to make procedural
                                     # called us so we can't call to make it procedural
                                     # until it's done
-                                    cf_eps[bva] = bflags
+                                    cf_eps[bva] = (startva, bflags)
                                 else:
                                     logger.debug("descending into function 0x%x (from 0x%x)", bva, va)
                                     self.addEntryPoint(bva, arch=bflags)
@@ -249,6 +259,17 @@ def addCodeFlow(self, va, arch=envi.ARCH_DEFAULT):
 
                             # We only go up to procedural branches, not across
                             continue
+
+                    # we're jumping to a function we're in the middle of
+                    # it's effectively a call from, but we should block
+                    # until the other finishes processing to avoid some...odd
+                    # issues with noret detection
+                    if bva in self._cf_blocks and op.iflags & envi.IF_BRANCH:
+                        if self._cf_recurse and startva != bva:
+                            self._cf_delayed[startva].add(bva)
+                            self._cf_delaying[bva].add(startva)
+
+                        continue
                 except Exception as e:
                     logger.warning("codeflow: %r", e, exc_info=True)
 
@@ -257,10 +278,23 @@ def addCodeFlow(self, va, arch=envi.ARCH_DEFAULT):
 
         # remove our local blocks from global block stack
         self._cf_blocks.pop()
-        while cf_eps:
-            fva, arch = cf_eps.popitem()
-            if not self._mem.isFunction(fva):
-                self.addEntryPoint(fva, arch=arch)
+        for fva, (pva, othrarch) in cf_eps.items():
+            if fva in self._cf_blocks:
+                self._cf_blocked[fva] = (pva, othrarch)
+            else:
+                if not self._mem.isFunction(fva):
+                    self.addEntryPoint(fva, arch=othrarch)
+
+        fallback = collections.OrderedDict()
+        items = list(self._cf_blocked.items())
+        for fva, (pva, othrarch) in items:  # values are the (start va, branch flags) pairs stored in _cf_blocked
+            if fva not in self._cf_blocks and not self._mem.isFunction(fva):
+                self._funcs.pop(fva, None)
+                self._cf_blocked.pop(fva, None)
+                self.addEntryPoint(fva, arch=othrarch)
+            else:
+                fallback[fva] = (pva, othrarch)  # keep this entry's own pair, not the current call's arch
+        self._cf_blocked = fallback
 
         return list(calls_from.keys())
 
@@ -291,7 +325,26 @@ def addEntryPoint(self, va, arch=envi.ARCH_DEFAULT):
         # logger.debug('addEntryPoint(0x%x): calls_from: %r', va, calls_from)
 
         # Finally, notify the callback of a new function
-        self._cb_function(va, {'CallsFrom': calls_from})
+        # we gotta hold some of these off for a bit
+        if va not in self._cf_delayed:
+            self._cb_function(va, {'CallsFrom': calls_from})
+            # remove this function from any blocking lists
+            if va in self._cf_delaying:
+                todo = []
+                for blocked in self._cf_delaying[va]:
+                    self._cf_delayed[blocked].discard(va)
+                    if len(self._cf_delayed[blocked]) == 0:
+                        todo.append(blocked)
+
+                self._cf_delaying.pop(va, None)
+                for ova in todo:
+                    self._cf_delayed.pop(ova, None)
+                    calls = self._calls_from.pop(ova, {})
+                    self._cb_function(ova, {'CallsFrom': calls})
+        else:
+            # stash these off for later
+            self._calls_from[va] = calls_from
+
         return va
 
     def flushFunction(self, fva):
diff --git a/envi/const.py b/envi/const.py
index 28d146ab8..d12412dba 100644
--- a/envi/const.py
+++ b/envi/const.py
@@ -18,6 +18,7 @@
 MM_WRITE = 0x2
 MM_EXEC = 0x1
 MM_SHARED = 0x08
+MM_UNINIT = 0x10
 
 MM_READ_WRITE = MM_READ | MM_WRITE
 MM_READ_EXEC = MM_READ | MM_EXEC
diff --git a/vivisect/__init__.py b/vivisect/__init__.py
index 8214b1914..a6d8e1a76 100644
--- a/vivisect/__init__.py
+++ b/vivisect/__init__.py
@@ -21,8 +21,9 @@
 import envi
 import envi.exc as e_exc
 import envi.bits as e_bits
-import envi.common as e_common
 import envi.memory as e_mem
+import envi.const as e_const
+import envi.common as e_common
 import envi.config as e_config
 import envi.bytesig as e_bytesig
 import envi.symstore.resolver as e_resolv
@@ -964,6 +965,9 @@ def findPointers(self, cache=True):
 
         for mva, msize, mperm, mname in self.getMemoryMaps():
 
+            if mperm & e_const.MM_UNINIT:
+                continue
+
             offset, bytes = self.getByteDef(mva)
             maxsize = len(bytes) - size
 
diff --git a/vivisect/analysis/generic/codeblocks.py b/vivisect/analysis/generic/codeblocks.py
index 707e835e6..70a85a547 100644
--- a/vivisect/analysis/generic/codeblocks.py
+++ b/vivisect/analysis/generic/codeblocks.py
@@ -8,6 +8,7 @@
 import collections
 
 import envi
+import envi.const as e_const
 
 from vivisect.const import REF_CODE, LOC_POINTER, LOC_OP
 
@@ -97,6 +98,12 @@ def analyzeFunction(vw, funcva):
                 if rflags & envi.BR_DEREF:
                     continue
 
+                mmap = vw.getMemoryMap(tova)
+                if mmap:
+                    mva, msize, mperm, mname = mmap
+                    if mperm & e_const.MM_UNINIT:
+                        continue
+
                 branch = True
                 todo.append(tova)
 
@@ -136,6 +143,9 @@ def analyzeFunction(vw, funcva):
         # (like during dynamic branch analysis)
         try:
             bsize = blocks[bva]
+            if bsize == 0:
+                continue
+
             tmpcb = vw.getCodeBlock(bva)
             # sometimes codeblocks can be deleted if owned by multiple functions
             if bva not in oldblocks or tmpcb is None:
diff --git a/vivisect/analysis/generic/funcentries.py b/vivisect/analysis/generic/funcentries.py
index 180372cfe..fdc2ba93c 100644
--- a/vivisect/analysis/generic/funcentries.py
+++ b/vivisect/analysis/generic/funcentries.py
@@ -31,6 +31,8 @@ def analyze(vw):
         # Segment permissions check for likely code stuff at all
         if not mapflags & e_const.MM_EXEC:
             continue
+        if mapflags & e_const.MM_UNINIT:
+            continue
 
         i = 0
         maxsize = mapsize - 4
diff --git a/vivisect/base.py b/vivisect/base.py
index 394d9b019..f191c83c6 100644
--- a/vivisect/base.py
+++ b/vivisect/base.py
@@ -728,8 +728,8 @@ def _mcb_WorkspaceServer(self, name, wshost):
     def _fmcb_Thunk(self, funcva, th, thunkname):
         # If the function being made a thunk is registered
         # in NoReturnApis, update codeflow...
-        if self.getMeta('NoReturnApis').get( thunkname.lower() ):
-            self.cfctx.addNoReturnAddr( funcva )
+        if self.getMeta('NoReturnApis').get(thunkname.lower()):
+            self.cfctx.addNoReturnAddr(funcva)
 
     def _fmcb_CallsFrom(self, funcva, th, callsfrom):
         for va in callsfrom:
@@ -823,7 +823,7 @@ def _cb_function(self, fva, fmeta):
 
         fname = vw.getName( fva )
         if vw.getMeta('NoReturnApis').get( fname.lower() ):
-            self._cf_noret[ fva ] = True
+            self._cf_noret[fva] = True
 
         if len( vw.getFunctionBlocks( fva )) == 1:
             return
@@ -833,7 +833,7 @@ def _cb_function(self, fva, fmeta):
             va = lva[0]
             ctup = vw.getCodeBlock(va)
             if ctup and fva == ctup[2] and vw.getFunctionMeta(fva, 'BlockCount', default=0) == 1:
-                self._cf_noret[ fva ] = True
+                self._cf_noret[fva] = True
                 break
 
     def _cb_branchtable(self, tablebase, tableva, destva):
diff --git a/vivisect/parsers/pe.py b/vivisect/parsers/pe.py
index f55547bfa..11ca1a023 100644
--- a/vivisect/parsers/pe.py
+++ b/vivisect/parsers/pe.py
@@ -286,6 +286,8 @@ def loadPeIntoWorkspace(vw, pe, filename=None, baseaddr=None):
             mapflags |= e_const.MM_EXEC
         if chars & PE.IMAGE_SCN_CNT_CODE:
             mapflags |= e_const.MM_EXEC
+        if chars & PE.IMAGE_SCN_CNT_UNINITIALIZED_DATA:
+            mapflags |= e_const.MM_UNINIT
 
         secrva = sec.VirtualAddress
         secvsize = sec.VirtualSize
diff --git a/vivisect/tests/testvivisect.py b/vivisect/tests/testvivisect.py
index c9a1fe1fd..3cfa43933 100644
--- a/vivisect/tests/testvivisect.py
+++ b/vivisect/tests/testvivisect.py
@@ -373,7 +373,6 @@ def test_cli_xrefs(self):
         self.assertIn("From: 0x0804fe94, To: 0x080490d0, Type: Code, Flags: 0x00010001\n", output)
         self.chgrp_vw.canvas.clearCanvas()
 
-
     def test_loc_types(self):
         '''
         Test that we have data consistency in locations
diff --git a/vivisect/tools/graphutil.py b/vivisect/tools/graphutil.py
index b0e556ad1..1a1ecb286 100644
--- a/vivisect/tools/graphutil.py
+++ b/vivisect/tools/graphutil.py
@@ -3,13 +3,17 @@
 Some glue code to do workspace related things based on visgraph
 '''
 import time
-import envi
 import logging
-import vivisect
 import collections
+
+import envi
+import envi.const as e_const
+
 import visgraph.pathcore as vg_pathcore
 import visgraph.graphcore as vg_graphcore
 
+import vivisect
+
 xrskip = envi.BR_PROC | envi.BR_DEREF
 
 logger = logging.getLogger(__name__)
@@ -475,6 +479,12 @@ def buildFunctionGraph(vw, fva, revloop=False, g=None):
             if xrflags & xrskip:
                 continue
 
+            mmap = vw.getMemoryMap(xrto)
+            if mmap:
+                mva, msize, mperm, mname = mmap
+                if mperm & e_const.MM_UNINIT:
+                    continue
+
             if not g.hasNode(xrto):
                 cblock = vw.getCodeBlock(xrto)
                 if cblock is None: