diff --git a/CHANGELOG.md b/CHANGELOG.md index ec3369aef..92760e64c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ - vmray: load more analysis archives @mr-tz - dynamic: only check file limitations for static file formats @mr-tz +- vmray: skip non-printable strings @mike-hunhoff ### capa Explorer Web diff --git a/capa/features/extractors/strings.py b/capa/features/extractors/strings.py index 3596e49d8..5a1ecf8e3 100644 --- a/capa/features/extractors/strings.py +++ b/capa/features/extractors/strings.py @@ -9,6 +9,7 @@ # See the License for the specific language governing permissions and limitations under the License. import re +import string import contextlib from collections import namedtuple @@ -19,6 +20,7 @@ UNICODE_RE_4 = re.compile(b"((?:[%s]\x00){%d,})" % (ASCII_BYTE, 4)) REPEATS = [b"A", b"\x00", b"\xfe", b"\xff"] SLICE_SIZE = 4096 +PRINTABLE_CHAR_SET = set(string.printable) String = namedtuple("String", ["s", "offset"]) @@ -84,3 +86,7 @@ def extract_unicode_strings(buf, n=4): for match in r.finditer(buf): with contextlib.suppress(UnicodeDecodeError): yield String(match.group().decode("utf-16"), match.start()) + + +def is_printable_str(s: str) -> bool: + return set(s).issubset(PRINTABLE_CHAR_SET) diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py index 6ded3a4fc..871189e54 100644 --- a/capa/features/extractors/vmray/call.py +++ b/capa/features/extractors/vmray/call.py @@ -12,6 +12,7 @@ from capa.features.insn import API, Number from capa.features.common import String, Feature from capa.features.address import Address +from capa.features.extractors.strings import is_printable_str from capa.features.extractors.vmray.models import PARAM_TYPE_INT, PARAM_TYPE_STR, Param, FunctionCall, hexint from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle @@ -27,11 +28,9 @@ def get_call_param_features(param: Param, ch: CallHandle) -> Iterator[tuple[Feat if param.deref.type_ in PARAM_TYPE_INT: yield Number(hexint(param.deref.value)), ch.address elif param.deref.type_ in PARAM_TYPE_STR: - # TODO(mr-tz): remove FPS like " \\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\x09\\x0a\\x0b\\x0c\\x0d\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\..." - # https://github.com/mandiant/capa/issues/2432 - - # parsing the data up to here results in double-escaped backslashes, remove those here - yield String(param.deref.value.replace("\\\\", "\\")), ch.address + if is_printable_str(param.deref.value): + # parsing the data up to here results in double-escaped backslashes, remove those here + yield String(param.deref.value.replace("\\\\", "\\")), ch.address else: logger.debug("skipping deref param type %s", param.deref.type_) elif param.value is not None: