Skip to content

Commit

Permalink
binary file detection for latin1 (#589)
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek authored Aug 1, 2024
1 parent dcf0a75 commit 5f05701
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 8 deletions.
9 changes: 8 additions & 1 deletion credsweeper/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,14 @@ def is_binary(data: bytes) -> bool:
return True
if b"\0\0" in data:
return True
return False
non_ascii_cnt = 0
for i in data[:MAX_LINE_LENGTH]:
if 0x20 > i and i not in (0x09, 0x0A, 0x0D) or 0x7E < i < 0xA0:
# control byte: below space but not tab, line feed or carriage return, or within 0x7F..0x9F (DEL and the Latin-1 C1 controls)
non_ascii_cnt += 1
chunk_len = float(MAX_LINE_LENGTH if MAX_LINE_LENGTH < len(data) else len(data))
# an experiment on 255217 binary files showed avg = 0.268264 ± 0.168767, so the lower bound (avg minus deviation, about 0.1) is used as the threshold
return 0.1 < non_ascii_cnt / chunk_len

@staticmethod
def read_file(path: Union[str, Path], encodings: Optional[List[str]] = None) -> List[str]:
Expand Down
19 changes: 12 additions & 7 deletions tests/utils/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from lxml.etree import XMLSyntaxError

from credsweeper.common.constants import Chars, DEFAULT_ENCODING, UTF_8, MAX_LINE_LENGTH, CHUNK_STEP_SIZE, CHUNK_SIZE, \
OVERLAP_SIZE
OVERLAP_SIZE, LATIN_1, UTF_16
from credsweeper.utils import Util
from tests import AZ_DATA, AZ_STRING, SAMPLES_PATH

Expand Down Expand Up @@ -309,14 +309,19 @@ def test_is_elf_n(self):
self.assertFalse(Util.is_elf(data))

def test_is_binary_p(self):
self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_32")))
self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_32_le")))
self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_32_be")))
self.assertTrue(Util.is_binary(b"\0\0\0\0"))
# unsupported encoding
self.assertTrue(Util.is_binary(AZ_STRING.encode("utf_32")))
self.assertTrue(Util.is_binary(AZ_STRING.encode("utf_32_le")))
self.assertTrue(Util.is_binary(AZ_STRING.encode("utf_32_be")))
# utf-16 is supported but must be decoded before Util.is_binary()
self.assertTrue(Util.is_binary(AZ_STRING.encode(UTF_16)))
self.assertTrue(Util.is_binary(AZ_STRING.encode("utf_16_le")))
self.assertTrue(Util.is_binary(AZ_STRING.encode("utf_16_be")))

def test_is_binary_n(self):
self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_16")))
self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_16_le")))
self.assertFalse(Util.is_elf(AZ_STRING.encode("utf_16_be")))
self.assertFalse(Util.is_binary("Üben von Xylophon und Querflöte ist ja zweckmäßig".encode(LATIN_1)))
self.assertFalse(Util.is_binary(b"\x7Ffew unprintable letters\x00"))

def test_is_ascii_entropy_validate_p(self):
self.assertTrue(Util.is_ascii_entropy_validate(b''))
Expand Down

0 comments on commit 5f05701

Please sign in to comment.