-
Notifications
You must be signed in to change notification settings - Fork 0
/
CrispDM.py
95 lines (73 loc) · 4.05 KB
/
CrispDM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import httpagentparser
import pandas as pd
import requests
from sklearn.ensemble import GradientBoostingClassifier
# Load the stored fingerprints used for training. The path is resolved
# relative to the current working directory, not to this file.
df = pd.read_json('../../WebstormProjects/fingerprint-frontend/prediction/fingerprintsnew.json')
# Drop the columns that are not used as features.
df = df.drop(['_id', '_class', 'process'], axis=1)
# Fetch the latest fingerprint data from the local API. The timeout keeps the
# script from hanging forever when the service is unresponsive, and
# raise_for_status() stops an HTTP error body from being parsed as data.
response = requests.get('http://localhost:8080/api/bot-data', timeout=10)
response.raise_for_status()
# Decode the response body as JSON and flatten nested objects into
# dot-separated columns (e.g. 'id.timestamp').
fingerprint = response.json()
df_latest = pd.json_normalize(fingerprint)
df_latest = df_latest.drop(['id.timestamp', 'id.date', 'process'], axis=1)
# Append the fetched rows after the stored ones; the code further down this
# file treats the final row of the combined frame as the sample to classify.
df = pd.concat([df, df_latest], ignore_index=True)
# Replace the string "None" in the 'bot' label column with 0.
df['bot'] = df['bot'].replace({'None': 0})
# Fill every remaining missing value with 0.
df = df.fillna(0)
# Drop the columns that are not used as model features.
df = df.drop(
    ['languages', 'distinctiveProps', 'documentElementKeys', 'functionBind', 'windowExternal'],
    axis=1,
)
# Add one 0/1 column per automation marker (slimerjs, phantomjs, headless,
# electron) found in appVersion, matched case-insensitively. fillna(0) above
# turns a missing appVersion into the integer 0, so the column is converted
# to lower-case text first instead of calling .lower() on each raw value.
app_version = df['appVersion'].astype(str).str.lower()
for marker in ('slimerjs', 'phantomjs', 'headless', 'electron'):
    # regex=False: the markers are plain substrings, not patterns.
    df[marker] = app_version.str.contains(marker, regex=False).astype(int)
df = df.drop(['appVersion'], axis=1)
# Cast the label and the flag-like feature columns to bool.
# NOTE(review): astype(bool) maps every non-empty string to True; confirm
# these columns never hold textual values such as 'False'.
bool_columns = [
    'bot',
    'android',
    'documentFocus',
    'notificationPermissions',
    'pluginsArray',
    'webDriver',
    'slimerjs',
    'phantomjs',
    'headless',
    'electron',
]
df[bool_columns] = df[bool_columns].astype(bool)
def _ua_field(parsed, section, key):
    """Return parsed[section][key], or 'Unknown' when it is missing or None."""
    value = (parsed.get(section) or {}).get(key)
    return 'Unknown' if value is None else value


# Parse every user agent into the nested dict produced by httpagentparser.
# Missing user agents were filled with 0 earlier in this file, so the column
# is coerced to text to make sure the parser always receives a string.
df['userAgent'] = df['userAgent'].astype(str).apply(httpagentparser.detect)
# Extract browser / OS / platform details into flat columns. A section or a
# key inside it (typically 'version') may be absent or None when the parser
# could not detect it; those cases fall back to 'Unknown' instead of raising
# KeyError or leaking None into the categorical columns.
for column, section, key in (
    ('browserNameUA', 'browser', 'name'),
    ('browserVersionUA', 'browser', 'version'),
    ('osNameUA', 'os', 'name'),
    ('platformName', 'platform', 'name'),
    ('platformVersion', 'platform', 'version'),
):
    df[column] = df['userAgent'].apply(_ua_field, args=(section, key))
# The parsed dicts are no longer needed once the flat columns exist.
df = df.drop(['userAgent'], axis=1)
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each category so the dummy columns are not fully redundant.
categorical_columns = ['browserEngineKind', 'browserKind', 'webGlVendor', 'webGlRenderer', 'browserNameUA', 'browserVersionUA', 'osNameUA', 'platformName', 'platformVersion']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
# Sort the columns alphabetically so the column order is deterministic.
df_encoded = df_encoded.reindex(sorted(df_encoded.columns), axis=1)
# The last row is the freshly fetched sample to classify. Take an explicit
# copy so that clearing its label below writes to an independent frame
# instead of a slice of df_encoded (avoids SettingWithCopyWarning).
df_latest = df_encoded.iloc[[-1]].copy()
df_latest['bot'] = None
# Remove that sample from the training data.
df_encoded = df_encoded.drop(df_encoded.tail(1).index)
# Split the encoded training rows into the target (y) and the features (X).
y = df_encoded['bot']
X = df_encoded.drop(columns='bot')
# Build a gradient-boosting classifier (150 trees, depth 5) and fit it on
# the training rows.
clf = GradientBoostingClassifier(max_depth=5, n_estimators=150).fit(X, y)
# Predict the label of the held-out latest sample and print it.
print(clf.predict(df_latest.drop(columns=['bot'])))
# Evaluate the model on the test set
# print(confusion_matrix(y_test, clf.predict(X_test)))
# report = classification_report(y_test, clf.predict(X_test), output_dict=True)
# print('Accuracy: ', report['accuracy'])
# print('Precision: ', report['False']['precision'])
# print('Recall: ', report['False']['recall'])
# print('F1 Score: ', report['False']['f1-score'])
# print('Precision: ', report['True']['precision'])
# print('Recall: ', report['True']['recall'])
# print('F1 Score: ', report['True']['f1-score'])
#
# print(balanced_accuracy_score(y_test, clf.predict(X_test)))