-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathai.py
53 lines (48 loc) · 1.91 KB
/
ai.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import glob
import random
import string
import pickle
import time
from sklearn import tree
# Characters treated as acceptable when sampling: digits, ASCII letters and
# punctuation. Whitespace is not part of this set.
acceptable_chars = "".join(
    (string.digits, string.ascii_letters, string.punctuation)
)
# Every byte value 0-255 whose character falls outside the acceptable set.
unprintable_bytes_list = [
    code for code in range(256) if chr(code) not in acceptable_chars
]
# The same values packed into a bytes object.
unprintable_bytes = bytes(unprintable_bytes_list)
# Jar files located exactly one directory below the current working directory.
all_files = glob.glob("*/*.jar")
def get_random_string():
    """Return a random run of acceptable bytes taken from a random jar file.

    A file is drawn at random from ``all_files`` and read in full. A 10-byte
    window is sampled at a random offset, every byte listed in
    ``unprintable_bytes`` is deleted from it, and the remainder is accepted
    when it is at least 5 bytes long and still occurs verbatim in the file
    (deleting bytes may splice together bytes that are never adjacent).

    Sampling is bounded: a file that is too small, or that yields nothing
    after ``max_attempts_per_file`` windows, is abandoned and another file is
    drawn. A short file therefore no longer makes ``random.randint`` raise
    ``ValueError``, and a file without any usable segment no longer causes an
    endless loop.

    Returns:
        bytes: a segment of 5 to 10 acceptable bytes present in the file.

    Raises:
        IndexError: if ``all_files`` is empty (raised by ``random.choice``).
        OSError: if the chosen file cannot be opened or read.
        RuntimeError: if no usable segment is found within the retry budget.
    """
    window_size = 10
    min_length = 5
    max_attempts_per_file = 1000
    max_files = 50

    for _file_try in range(max_files):
        random_file = random.choice(all_files)
        with open(random_file, "rb") as f:
            file_bytes = f.read()

        # A file shorter than the minimum can never produce a valid segment.
        if len(file_bytes) < min_length:
            continue

        # Clamp to 0 so files shorter than one window are sampled as a whole
        # instead of handing randint an empty range.
        last_start = max(0, len(file_bytes) - window_size)
        for _attempt in range(max_attempts_per_file):
            start_point = random.randint(0, last_start)
            window = file_bytes[start_point:start_point + window_size]
            # Remove all non-printable characters
            random_segment = window.translate(None, unprintable_bytes)
            # Reject segments that only exist because stripping joined
            # bytes that are not contiguous anywhere in the file.
            if len(random_segment) >= min_length and random_segment in file_bytes:
                return random_segment

    raise RuntimeError(
        f"no printable segment of at least {min_length} bytes found "
        f"after trying {max_files} files"
    )
# Step 1: sample the byte segments that will act as the feature set.
print("finding sample segments")
random_tests = [get_random_string() for _ in range(2000)]
# Persist the sampled segments; the file name carries the current time.time().
with open(f"random_tests{time.time()}.pkl", "wb") as f:
    pickle.dump(random_tests, f)

# Step 2: build one boolean row per jar (is each segment present in the file?).
print("testing sample segments")
classifier = tree.DecisionTreeClassifier(max_depth=5)
data = {"input_data": [], "classification": []}
for file in all_files:
    with open(file, "rb") as f:
        file_bytes = f.read()
    # Paths starting with "rats" are labelled malware, everything else benign.
    if file.startswith("rats"):
        label = "malware"
    else:
        label = "benign"
    data["classification"].append(label)
    data["input_data"].append(
        [segment in file_bytes for segment in random_tests]  # type: ignore
    )

# Step 3: train the decision tree and print its rules as text.
print("fitting classifier")
classifier.fit(data["input_data"], data["classification"])
classifier_result = tree.export_text(classifier, feature_names=random_tests)
print(classifier_result)

# Step 4: pickle the fitted classifier under a time-stamped file name.
print("saving classifier")
with open(f"classifier{time.time()}.pkl", "wb") as f:
    pickle.dump(classifier, f)