-
Notifications
You must be signed in to change notification settings - Fork 454
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Extract Rust specific strings from binaries #791 #836
Changes from 1 commit
a75661e
e7c7595
8a394bb
e9ca68e
45978ea
4cbffaf
f128d19
80dce99
13c8920
27958fb
e074722
dbf7ad1
bbd3d53
4839543
3ebd075
226486e
c46410e
8fabe4b
8bd3711
74f3a91
2d5bf95
76d5f84
b02fc6a
39e814c
02288d7
797e5e3
657d497
73afe8b
267862e
9fe75c7
a67f9f2
07a7558
5a6fdb6
1650f8b
6cdccb3
62405fe
c98450d
57fc902
ef27592
1909255
6011ea7
890ba55
df20ec1
2fdb823
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,8 +8,9 @@ | |
|
||
import pefile | ||
import binary2strings as b2s | ||
|
||
from floss.results import StaticString, StringEncoding | ||
from floss.language.utils import find_lea_xrefs, get_struct_string_candidates | ||
from floss.language.utils import find_lea_xrefs, find_mov_xrefs, find_push_xrefs, get_struct_string_candidates | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
@@ -68,7 +69,7 @@ def split_strings(static_strings: List[StaticString], address: int) -> None: | |
return | ||
|
||
|
||
def extract_rust_strings(sample: str, min_length: int) -> List[StaticString]: | ||
def extract_rust_strings(sample: pathlib.Path, min_length: int) -> List[StaticString]: | ||
""" | ||
Extract Rust strings from a sample | ||
""" | ||
|
@@ -104,9 +105,11 @@ def get_string_blob_strings(pe: pefile.PE, min_length: int) -> Iterable[StaticSt | |
static_strings = filter_and_transform_utf8_strings(strings, start_rdata) | ||
|
||
struct_string_addrs = map(lambda c: c.address, get_struct_string_candidates(pe)) | ||
xrefs = find_lea_xrefs(pe) | ||
xrefs_lea = find_lea_xrefs(pe) | ||
xrefs_push = find_push_xrefs(pe) | ||
xrefs_mov = find_mov_xrefs(pe) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we want to run all of these scans every time, for both architectures, or only the ones relevant to the sample's architecture (32-bit vs. 64-bit)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The push and mov scans are specific to the i386 architecture; this has been handled in the latest commit. Thanks. |
||
|
||
for addr in itertools.chain(struct_string_addrs, xrefs): | ||
for addr in itertools.chain(struct_string_addrs, xrefs_lea, xrefs_push, xrefs_mov): | ||
address = addr - image_base - virtual_address + pointer_to_raw_data | ||
|
||
if not (start_rdata <= address < end_rdata): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -160,6 +160,109 @@ def find_lea_xrefs(pe: pefile.PE) -> Iterable[VA]: | |
yield xref | ||
|
||
|
||
def find_i386_push_xrefs(buf: bytes) -> Iterable[VA]:
    """
    scan the given bytes for 32-bit `PUSH imm32` instructions (opcode 0x68),
    yielding each 32-bit little-endian immediate as a candidate virtual address.

    this is a raw byte-pattern scan, not a disassembly: every 0x68 byte that is
    followed by at least four more bytes yields a candidate, so callers must
    validate the results themselves (e.g. against the image address range).

    candidates are yielded in order of their offset within `buf`.
    """
    push_insn_re = re.compile(
        rb"""
        \x68                      # 68 aa aa 00 00    push 0xaaaa
        (?=(?P<address>....))     # imm32 operand, captured without being consumed
        """,
        re.DOTALL | re.VERBOSE,
    )

    # the operand is matched inside a lookahead so that the scan resumes at the
    # very next byte. if the operand were consumed, a stray 0x68 byte (such as
    # the operand of `push 0x68`, encoded 6a 68) would swallow the four bytes
    # after it and hide a genuine PUSH instruction that starts there.
    for match in push_insn_re.finditer(buf):
        yield int.from_bytes(match.group("address"), "little")
|
||
|
||
def find_push_xrefs(pe: pefile.PE) -> Iterable[VA]: | ||
""" | ||
scan the executable sections of the given PE file | ||
for PUSH instructions that reference valid memory addresses, | ||
yielding the virtual addresses. | ||
""" | ||
low, high = get_image_range(pe) | ||
|
||
for section in pe.sections: | ||
if not section.IMAGE_SCN_MEM_EXECUTE: | ||
continue | ||
|
||
code = section.get_data() | ||
|
||
if pe.FILE_HEADER.Machine == pefile.MACHINE_TYPE["IMAGE_FILE_MACHINE_AMD64"]: | ||
xrefs: Iterable[VA] = [] # no push instructions on amd64 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we be sure string references would never be pushed? Probably, but can you add some context/references around this? The code comment is a little misleading currently. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've carefully checked amd64 binaries and, so far, I haven't come across any mov or push instructions that reference strings. However, given the complexity of the instruction set, I can't rule out that some rare cases escaped my notice. To ensure completeness, I've included this in the latest commit. |
||
elif pe.FILE_HEADER.Machine == pefile.MACHINE_TYPE["IMAGE_FILE_MACHINE_I386"]: | ||
xrefs = find_i386_push_xrefs(code) | ||
else: | ||
raise ValueError("unhandled architecture") | ||
|
||
for xref in xrefs: | ||
if low <= xref < high: | ||
yield xref | ||
|
||
|
||
def find_i386_mov_xrefs(buf: bytes) -> Iterable[VA]:
    """
    scan the given bytes for 32-bit `MOV r32, imm32` instructions,
    yielding each 32-bit little-endian immediate as a candidate virtual address.

    only the opcodes that target eax, ecx, edx, ebx, esi, and edi are matched.

    this is a raw byte-pattern scan, not a disassembly: every matching opcode
    byte that is followed by at least four more bytes yields a candidate, so
    callers must validate the results themselves (e.g. against the image
    address range).

    candidates are yielded in order of their offset within `buf`.
    """
    mov_insn_re = re.compile(
        rb"""
        (?:
              \xB8                # b8 aa aa 00 00    mov eax,0xaaaa
            | \xB9                # b9 aa aa 00 00    mov ecx,0xaaaa
            | \xBA                # ba aa aa 00 00    mov edx,0xaaaa
            | \xBB                # bb aa aa 00 00    mov ebx,0xaaaa
            | \xBE                # be aa aa 00 00    mov esi,0xaaaa
            | \xBF                # bf aa aa 00 00    mov edi,0xaaaa
        )
        (?=(?P<address>....))     # imm32 operand, captured without being consumed
        """,
        re.DOTALL | re.VERBOSE,
    )

    # the operand is matched inside a lookahead so that the scan resumes at the
    # very next byte. if the operand were consumed, a stray opcode-valued byte
    # would swallow the four bytes after it and hide a genuine MOV instruction
    # that starts there.
    for match in mov_insn_re.finditer(buf):
        yield int.from_bytes(match.group("address"), "little")
|
||
|
||
def find_mov_xrefs(pe: pefile.PE) -> Iterable[VA]: | ||
""" | ||
scan the executable sections of the given PE file | ||
for MOV instructions that reference valid memory addresses, | ||
yielding the virtual addresses. | ||
""" | ||
low, high = get_image_range(pe) | ||
|
||
for section in pe.sections: | ||
if not section.IMAGE_SCN_MEM_EXECUTE: | ||
continue | ||
|
||
code = section.get_data() | ||
|
||
if pe.FILE_HEADER.Machine == pefile.MACHINE_TYPE["IMAGE_FILE_MACHINE_AMD64"]: | ||
xrefs: Iterable[VA] = [] # no mov instructions on amd64 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same comment as above. |
||
elif pe.FILE_HEADER.Machine == pefile.MACHINE_TYPE["IMAGE_FILE_MACHINE_I386"]: | ||
xrefs = find_i386_mov_xrefs(code) | ||
else: | ||
raise ValueError("unhandled architecture") | ||
|
||
for xref in xrefs: | ||
if low <= xref < high: | ||
yield xref | ||
|
||
|
||
def get_max_section_size(pe: pefile.PE) -> int:
    """
    get the size of the largest section, as seen on disk.

    returns 0 when the PE has no sections, rather than raising ValueError
    from `max()` on an empty sequence.
    """
    return max((section.SizeOfRawData for section in pe.sections), default=0)
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you please share some current coverage output for 32 and 64 bit samples?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently, both are at approximately 90% coverage.
coverage32.txt
coverage64.txt
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you repeat this for a few random samples and just share the stats here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On random samples from VT:
result.txt
Average: 94.5%
Low: 88%
High: 99%
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
great, that looks pretty promising!