diff --git a/exam.py b/exam.py
new file mode 100644
index 0000000..9104789
--- /dev/null
+++ b/exam.py
@@ -0,0 +1,14 @@
+from collections import Counter
+
+
+def fun(num, ids, rem):
+    # Count each id, then lay the ids out in ascending order of
+    # frequency so that removing `rem` items drops the rarest ids first.
+    # (`num` is unused but kept for the original signature.)
+    counts = Counter(ids)
+    temp = []
+    for key, count in sorted(counts.items(), key=lambda item: item[1]):
+        temp += [key] * count
+    return len(set(temp[rem:]))
+
+print(fun(6, [1, 1, 1, 2, 3, 2], 2))
diff --git a/features_extraction.py b/features_extraction.py
new file mode 100644
index 0000000..1c6397a
--- /dev/null
+++ b/features_extraction.py
@@ -0,0 +1,199 @@
+from bs4 import BeautifulSoup
+import urllib.request
+import re
+
+
+def having_ip_address(url):
+    # Dotted-decimal or hexadecimal IPv4 address anywhere in the URL.
+    ipv4 = r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}"
+    hex_ipv4 = r"0x[0-9A-Fa-f]{1,2}(\.0x[0-9A-Fa-f]{1,2}){3}"
+    match = re.search(ipv4, url) or re.search(hex_ipv4, url)
+    return 1 if match else -1
+
+def url_length(url):
+    if len(url) < 54:
+        return 1
+    if 54 <= len(url) <= 75:
+        return 0
+    return -1
+
+def tiny_url(url):
+    url = url.replace("www.", '')  # the original discarded this result
+    return 1 if len(url) < 7 else -1
+
+def having_at_symbol(url):
+    return 1 if re.search('@', url) else -1
+
+def double_slash_redirecting(url):
+    # A "//" found past position 7 sits beyond the scheme, i.e. a redirect.
+    last_double_slash = url.rfind('//')
+    return 1 if (last_double_slash > 7 or last_double_slash == -1) else -1
+
+def prefix_suffix(domain):
+    return 1 if re.search('-', domain) else -1
+
+def having_sub_domain(url):
+    # For IP-based URLs, skip past the address before counting dots
+    # (the original tested == -1, i.e. "no IP found", by mistake).
+    if having_ip_address(url) == 1:
+        match = re.search(
+            r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
+            r'([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5]))'
+            r'|(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}', url)
+        if match:
+            url = url[match.end():]
+    num_dots = [x.start() for x in re.finditer(r'\.', url)]
+    if len(num_dots) <= 3:
+        return 1
+    if len(num_dots) == 4:
+        return 0
+    return -1
+
+def favicon(wiki, soup, domain):
+    # A favicon loaded from the page's own domain (or a relative path)
+    # is treated as benign.
+    for head in soup.find_all('head'):
+        for link in head.find_all('link', href=True):
+            dots = [x.start() for x in re.finditer(r'\.', link['href'])]
+            return 1 if wiki in link['href'] or len(dots) == 1 or domain in link['href'] else -1
+    return 1
+
+def https_token(url):
+    match = re.search("^https?://", url)
+    if match:
+        url = url[match.end():]
+    return -1 if re.search('https?', url) else 1
+
+def request_url(wiki, soup, domain):
+    # Share of <img>/<audio>/<embed>/<iframe> sources loaded from the
+    # page's own domain; mostly-external media is suspicious.
+    i = 0
+    success = 0
+    for tag in soup.find_all(['img', 'audio', 'embed', 'iframe'], src=True):
+        dots = [x.start() for x in re.finditer(r'\.', tag['src'])]
+        if wiki in tag['src'] or domain in tag['src'] or len(dots) == 1:
+            success += 1
+        i += 1
+    try:
+        percentage = success / float(i) * 100
+    except ZeroDivisionError:
+        return 1
+    if percentage < 22.0:
+        return 1
+    if percentage < 61.0:
+        return 0
+    return -1
+
+def url_of_anchor(wiki, soup, domain):
+    i = 0
+    unsafe = 0
+    for a in soup.find_all('a', href=True):
+        href = a['href']
+        if "#" in href or "javascript" in href.lower() or "mailto" in href.lower() \
+                or not (wiki in href or domain in href):
+            unsafe += 1
+        i += 1
+    try:
+        percentage = unsafe / float(i) * 100
+    except ZeroDivisionError:
+        return 1
+    if percentage < 31.0:
+        return 1
+    if percentage < 67.0:
+        return 0
+    return -1
+
+def links_in_tags(wiki, soup, domain):
+    i = 0
+    success = 0
+    for link in soup.find_all('link', href=True):
+        dots = [x.start() for x in re.finditer(r'\.', link['href'])]
+        if wiki in link['href'] or domain in link['href'] or len(dots) == 1:
+            success += 1
+        i += 1
+    for script in soup.find_all('script', src=True):
+        dots = [x.start() for x in re.finditer(r'\.', script['src'])]
+        if wiki in script['src'] or domain in script['src'] or len(dots) == 1:
+            success += 1
+        i += 1
+    try:
+        percentage = success / float(i) * 100
+    except ZeroDivisionError:
+        return 1
+    if percentage < 17.0:
+        return 1
+    if percentage < 65.0:
+        return 0
+    return -1
+
+def sfh(wiki, soup, domain):
+    # Server form handler: empty/about:blank actions are a phishing
+    # sign; off-domain actions are suspicious.
+    for form in soup.find_all('form', action=True):
+        if form['action'] in ("", "about:blank"):
+            return -1
+        if wiki not in form['action'] and domain not in form['action']:
+            return 0
+        return 1
+    return 1
+
+def submitting_to_email(soup):
+    for form in soup.find_all('form', action=True):
+        return 1 if "mailto:" in form['action'] else -1
+    return -1
+
+def i_frame(soup):
+    # HTML parsers lowercase tag/attribute names, so the original
+    # 'i_frame'/'frameBorder' lookups could never match anything.
+    for frame in soup.find_all('iframe', width=True, height=True, frameborder=True):
+        if frame['width'] == "0" and frame['height'] == "0" and frame['frameborder'] == "0":
+            return 0
+        if frame['width'] == "0" or frame['height'] == "0" or frame['frameborder'] == "0":
+            return -1
+    return 1
+
+def port(url):
+    pattern = r"https?://(?:w{1,3}\.)?[^\s.]+(?:\.[a-z]+)*(?::\d+)?(?![^<]*(?:</\w+>|/?>))"
+    return 1 if re.search(pattern, url) else -1
+
+def https_in_url_domain(url):
+    pattern = r"^(?:http://|www\.|https://)([^/]+)"
+    return 1 if re.search(pattern, url) else -1
+
+def get_hostname_from_url(url):
+    # Strip the scheme and a leading "www.", then cut at the first "/".
+    hostname = re.sub(r"^(?:https?://)?(?:www\.)?", "", url)
+    slash = hostname.find('/')
+    if slash != -1:
+        hostname = hostname[:slash]
+    return hostname
+
+def main(url):
+    # The original parsed the URL string itself as HTML; the soup-based
+    # checks need the page content, so fetch it (empty page on failure).
+    try:
+        html = urllib.request.urlopen(url, timeout=10).read()
+    except Exception:
+        html = ""
+    soup = BeautifulSoup(html, 'html.parser')
+    hostname = get_hostname_from_url(url)
+    status = [
+        having_ip_address(url),
+        url_length(url),
+        tiny_url(url),
+        having_at_symbol(url),
+        double_slash_redirecting(url),
+        prefix_suffix(hostname),
+        having_sub_domain(url),
+        https_token(url),
+        favicon(url, soup, hostname),
+        port(url),
+        https_in_url_domain(url),
+        request_url(url, soup, hostname),
+        url_of_anchor(url, soup, hostname),
+        links_in_tags(url, soup, hostname),
+        sfh(url, soup, hostname),
+        submitting_to_email(soup),
+        i_frame(soup),
+    ]
+    return status
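For review, a minimal smoke test of the feature extractor (illustrative only: "http://example.com/" is a placeholder URL, and the call hits the network):

    import features_extraction as fe

    features = fe.main("http://example.com/")
    print(len(features))  # 17 values, each in {-1, 0, 1}
    print(features)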
diff --git a/run_alg.py b/run_alg.py
new file mode 100644
index 0000000..8ca589e
--- /dev/null
+++ b/run_alg.py
@@ -0,0 +1,52 @@
+#!C:\Users\grred\AppData\Local\Programs\Python\Python36\python.exe
+import time
+import site
+site.addsitedir('C:/Users/grred/AppData/Local/Programs/Python/Python36/lib/site-packages')
+
+import numpy as np
+import pandas as pd
+from flask import Flask, jsonify, request
+
+import features_extraction as fe
+
+app = Flask(__name__)
+
+def random_forests(dataset, class_labels, test_size):
+    from sklearn.model_selection import train_test_split
+    from sklearn.ensemble import RandomForestClassifier
+
+    # The split mirrors the offline experiments; only the training
+    # portion is used to fit the deployed model.
+    X_train, X_test, y_train, y_test = train_test_split(
+        dataset, class_labels, test_size=test_size, random_state=42)
+    model = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=42)
+    model.fit(X_train, y_train)
+    return model
+
+@app.route('/main', methods=['GET', 'POST'])
+def main():
+    url = request.args.get('url')
+    print(f"url is {url}")
+    df = pd.read_csv("C:/Users/grred/OneDrive/Desktop/datasets/modified_dataset.csv")
+    # Split the labels off *before* dropping the column; the original
+    # took the last column of the already-truncated frame.
+    class_labels = df.iloc[:, -1]
+    dataset = df.iloc[:, :-1]
+    start_time = time.time()
+    # Retraining on every request is wasteful but kept for simplicity;
+    # the model could be trained once at startup instead.
+    model = random_forests(dataset, class_labels, 0.3)
+    return_features = fe.main(url)
+    feature_vector = np.array(return_features).reshape(1, -1)
+    end_time = time.time()
+    print("runtime = " + str(end_time - start_time) + " seconds")
+    result = {
+        'url': url,
+        'feature_string': ''.join(str(i) for i in return_features),
+        'run_time': str(end_time - start_time),
+        'result_label': str(model.predict(feature_vector)[0])
+    }
+    return jsonify(code=0, msg=result)
+
+if __name__ == '__main__':
+    app.run(port=5000, debug=True)
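Once the Flask app is up (python run_alg.py), the /main endpoint can be exercised with a sketch like the following (assumes the third-party requests package; the URL is again a placeholder):

    import requests

    resp = requests.get("http://localhost:5000/main",
                        params={"url": "http://example.com/"})
    print(resp.json())  # {'code': 0, 'msg': {'url': ..., 'feature_string': ..., 'result_label': ...}}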
diff --git a/run_algorithms.py b/run_algorithms.py
new file mode 100644
index 0000000..b5e4e66
--- /dev/null
+++ b/run_algorithms.py
@@ -0,0 +1,278 @@
+import time
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.feature_selection import VarianceThreshold
+from sklearn.metrics import accuracy_score
+
+def calculate_metrics(y_test, Y_predicted):
+    from sklearn import metrics
+    from sklearn.metrics import confusion_matrix
+
+    accuracy = round(metrics.accuracy_score(y_test, Y_predicted) * 100, 2)
+    print("accuracy = " + str(accuracy) + "%")
+    confusion_mat = confusion_matrix(y_test, Y_predicted)
+    print(confusion_mat)
+
+    # Per-class sensitivity and specificity from the confusion matrix.
+    print("TP\tFP\tFN\tTN\tSensitivity\tSpecificity")
+    for i in range(confusion_mat.shape[0]):
+        TP = float(confusion_mat[i, i])
+        FP = float(confusion_mat[:, i].sum()) - TP
+        FN = float(confusion_mat[i, :].sum()) - TP
+        TN = float(confusion_mat.sum()) - TP - FP - FN
+        sensitivity = round(TP / (TP + FN), 2)
+        specificity = round(TN / (TN + FP), 2)
+        print(f"{TP}\t{FP}\t{FN}\t{TN}\t{sensitivity}\t\t{specificity}")
+
+    print("f1 score =", metrics.f1_score(y_test, Y_predicted))
+    return accuracy
+
+def drop_low_variance_columns(X_train, X_test, threshold):
+    # Variance-threshold feature selection, fitted on the training
+    # split only; the same columns are dropped from both splits.
+    var_thres = VarianceThreshold(threshold=threshold)
+    var_thres.fit(X_train)
+    kept = X_train.columns[var_thres.get_support()]
+    constant_columns = [c for c in X_train.columns if c not in kept]
+    print("dropped columns:", constant_columns)
+    return X_train.drop(constant_columns, axis=1), X_test.drop(constant_columns, axis=1)
+
+def neural_network_with_FS(dataset, class_labels, test_size):
+    from sklearn.model_selection import train_test_split
+    from sklearn.neural_network import MLPClassifier
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        dataset, class_labels, test_size=test_size, random_state=42)
+    X_train, X_test = drop_low_variance_columns(X_train, X_test, threshold=0.5)
+    model = MLPClassifier(hidden_layer_sizes=(10,), activation='logistic', random_state=42)
+    model.fit(X_train, y_train)
+    return y_test, model.predict(X_test)
+
+def neural_network_without_FS(dataset, class_labels, test_size):
+    from sklearn.model_selection import train_test_split
+    from sklearn.neural_network import MLPClassifier
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        dataset, class_labels, test_size=test_size, random_state=42)
+    model = MLPClassifier(hidden_layer_sizes=(50,), activation='logistic', random_state=42)
+    model.fit(X_train, y_train)
+    return y_test, model.predict(X_test)
+
+def random_forests_with_FS(dataset, class_labels, test_size):
+    from sklearn.model_selection import train_test_split
+    from sklearn.ensemble import RandomForestClassifier
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        dataset, class_labels, test_size=test_size, random_state=42)
+    X_train, X_test = drop_low_variance_columns(X_train, X_test, threshold=0.45)
+    model = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=42)
+    model.fit(X_train, y_train)
+    return y_test, model.predict(X_test)
+
+def random_forests_without_FS(dataset, class_labels, test_size):
+    from sklearn.model_selection import train_test_split
+    from sklearn.ensemble import RandomForestClassifier
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        dataset, class_labels, test_size=test_size, random_state=42)
+    model = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=42)
+    model.fit(X_train, y_train)
+    return y_test, model.predict(X_test)
+
+def support_vector_machines_with_FS(dataset, class_labels, test_size):
+    from sklearn import svm
+    from sklearn.model_selection import train_test_split
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        dataset, class_labels, test_size=test_size, random_state=42)
+    X_train, X_test = drop_low_variance_columns(X_train, X_test, threshold=0.45)
+    model = svm.SVC(kernel='rbf', C=2.0)
+    model.fit(X_train, y_train)
+    return y_test, model.predict(X_test)
+
+def support_vector_machines_without_FS(dataset, class_labels, test_size):
+    from sklearn import svm
+    from sklearn.model_selection import train_test_split
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        dataset, class_labels, test_size=test_size, random_state=31)
+    model = svm.SVC(kernel='rbf', C=2.0)
+    model.fit(X_train, y_train)
+    return y_test, model.predict(X_test)
+
+def kmeans_without_FS(dataset, class_labels, test_size):
+    from scipy.spatial.distance import cdist
+    from sklearn.cluster import KMeans
+    from sklearn.model_selection import train_test_split
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        dataset, class_labels, test_size=test_size, random_state=42)
+    # Elbow method: mean distance to the nearest centroid for k = 1..9.
+    distortions = []
+    K = range(1, 10)
+    for k in K:
+        kmeanModel = KMeans(n_clusters=k).fit(X_train)
+        distortions.append(
+            sum(np.min(cdist(X_train, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X_train.shape[0])
+    plt.plot(K, distortions, 'bx-')
+    plt.xlabel('k')
+    plt.ylabel('Distortion')
+    plt.title('The Elbow Method showing the optimal k')
+    plt.show()
+    # Cluster ids are arbitrary, so scoring them against the class
+    # labels with accuracy_score is only a rough proxy.
+    kmeans_model = KMeans(n_clusters=4).fit(X_train)
+    return accuracy_score(y_test, kmeans_model.predict(X_test)) * 100
+
+def kmeans_with_FS(dataset, class_labels, test_size):
+    from scipy.spatial.distance import cdist
+    from sklearn.cluster import KMeans
+    from sklearn.model_selection import train_test_split
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        dataset, class_labels, test_size=test_size, random_state=42)
+    X_train, X_test = drop_low_variance_columns(X_train, X_test, threshold=0.45)
+    distortions = []
+    K = range(1, 10)
+    for k in K:
+        kmeanModel = KMeans(n_clusters=k).fit(X_train)
+        distortions.append(
+            sum(np.min(cdist(X_train, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X_train.shape[0])
+    plt.plot(K, distortions, 'bx-')
+    plt.xlabel('k')
+    plt.ylabel('Distortion')
+    plt.title('The Elbow Method showing the optimal k')
+    plt.show()
+    kmeans_model = KMeans(n_clusters=4).fit(X_train)
+    return accuracy_score(y_test, kmeans_model.predict(X_test)) * 100
+
+def main(dataset, class_labels):
+    test_size = 0.3
+    run_time_with_FS = []
+    run_time_without_FS = []
+    accuracy_with_FS = []
+    accuracy_without_FS = []
+
+    ''' Implementation of Neural Networks '''
+    print("\nrunning neural networks with Feature Selection...")
+    start_time = time.time()
+    y_test, Y_predicted = neural_network_with_FS(dataset, class_labels, test_size)
+    accuracy_with_FS.append(calculate_metrics(y_test, Y_predicted))
+    end_time = time.time()
+    print("runtime = " + str(end_time - start_time) + " seconds")
+    run_time_with_FS.append(end_time - start_time)
+
+    print("\nrunning neural networks without Feature Selection...")
+    start_time = time.time()
+    y_test, Y_predicted = neural_network_without_FS(dataset, class_labels, test_size)
+    accuracy_without_FS.append(calculate_metrics(y_test, Y_predicted))
+    end_time = time.time()
+    print("runtime = " + str(end_time - start_time) + " seconds")
+    run_time_without_FS.append(end_time - start_time)
+
+    ''' Implementation of Random Forest '''
+    print("\nrunning random forests with Feature Selection...")
+    start_time = time.time()
+    y_test, Y_predicted = random_forests_with_FS(dataset, class_labels, test_size)
+    accuracy_with_FS.append(calculate_metrics(y_test, Y_predicted))
+    end_time = time.time()
+    print("runtime = " + str(end_time - start_time) + " seconds")
+    run_time_with_FS.append(end_time - start_time)
+
+    print("\nrunning random forests without Feature Selection...")
+    start_time = time.time()
+    y_test, Y_predicted = random_forests_without_FS(dataset, class_labels, test_size)
+    accuracy_without_FS.append(calculate_metrics(y_test, Y_predicted))
+    end_time = time.time()
+    print("runtime = " + str(end_time - start_time) + " seconds")
+    run_time_without_FS.append(end_time - start_time)
+
+    ''' Implementation of SVM '''
+    print("\nrunning support vector machines with Feature Selection...")
+    start_time = time.time()
+    y_test, Y_predicted = support_vector_machines_with_FS(dataset, class_labels, test_size)
+    accuracy_with_FS.append(calculate_metrics(y_test, Y_predicted))
+    end_time = time.time()
+    print("runtime = " + str(end_time - start_time) + " seconds")
+    run_time_with_FS.append(end_time - start_time)
+
+    print("\nrunning support vector machines without Feature Selection...")
+    start_time = time.time()
+    y_test, Y_predicted = support_vector_machines_without_FS(dataset, class_labels, test_size)
+    accuracy_without_FS.append(calculate_metrics(y_test, Y_predicted))
+    end_time = time.time()
+    print("runtime = " + str(end_time - start_time) + " seconds")
+    run_time_without_FS.append(end_time - start_time)
+
+    ''' Implementation of K-Means '''
+    print("\nrunning KMeans with Feature Selection...")
+    start_time = time.time()
+    # Use the score KMeans actually returns; the original discarded it
+    # and appended a hardcoded 81.1 (and subtracted 2 s from the runtime).
+    accuracy_with_FS.append(kmeans_with_FS(dataset, class_labels, test_size))
+    end_time = time.time()
+    print("runtime = " + str(end_time - start_time) + " seconds")
+    run_time_with_FS.append(end_time - start_time)
+
+    print("\nrunning KMeans without Feature Selection...")
+    start_time = time.time()
+    accuracy_without_FS.append(kmeans_without_FS(dataset, class_labels, test_size))
+    end_time = time.time()
+    print("runtime = " + str(end_time - start_time) + " seconds")
+    run_time_without_FS.append(end_time - start_time)
+
+    algorithms = ['NN', 'RF', 'SVM', 'K-Means']
+    for i in range(4):
+        print(algorithms[i], "runtimes with and without FS:", run_time_with_FS[i], run_time_without_FS[i],
+              'accuracy with and without FS:', accuracy_with_FS[i], accuracy_without_FS[i])
+
+    labels = ['NeuralNetworks', 'RandomForest', 'SVM', 'K-Means']
+    # Plot for computational time
+    computation_time = [run_time_with_FS, run_time_without_FS]
+    X = np.arange(len(labels))
+    width = 0.35
+    fig, pl = plt.subplots()
+    pl.bar(X - width / 2, computation_time[0], color='b', width=width)
+    pl.bar(X + width / 2, computation_time[1], color='g', width=width)
+    pl.set_ylabel('Computational Time in Seconds')
+    pl.set_title('Computational time of algorithms')
+    pl.set_xticks(X)
+    pl.set_xticklabels(labels)
+    pl.legend(['With Feature Selection', 'Without Feature Selection'])
+    plt.show()
+
+    # Plot for Accuracy
+    accuracy = [accuracy_with_FS, accuracy_without_FS]
+    print(accuracy)
+    X = np.arange(len(labels))
+    width = 0.35
+    fig, pl = plt.subplots()
+    pl.bar(X - width / 2, accuracy[0], color='b', width=width)
+    pl.bar(X + width / 2, accuracy[1], color='g', width=width)
+    pl.set_ylabel('accuracy')
+    pl.set_title('Accuracy of algorithms')
+    pl.set_xticks(X)
+    pl.set_xticklabels(labels)
+    pl.legend(['With Feature Selection', 'Without Feature Selection'])
+    plt.show()
+
+
+if __name__ == '__main__':
+    print("Choose the Dataset")
+    print("(i)University of California Irvine (ii)Kaggle ")
+    choice = input()
+    start_time = time.time()
+    if choice == 'i':
+        df = pd.read_csv("C:/Users/grred/OneDrive/Desktop/datasets/dataset.csv")
+        # Take the labels before dropping the last column; the original
+        # dropped it first and then read labels from the truncated frame.
+        class_labels = df.iloc[:, -1]
+        dataset = df.iloc[:, :-1]
+    else:
+        dataset = pd.read_csv("C:/Users/grred/OneDrive/Desktop/datasets/Kaggle_dataset.csv")
+        class_labels = pd.read_csv("C:/Users/grred/OneDrive/Desktop/datasets/Target_Labels.csv").iloc[:, 0]
+    main(dataset, class_labels)
+    end_time = time.time()
+    print("runtime = " + str(end_time - start_time) + " seconds")
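run_algorithms.py is driven interactively; a typical session looks like the sketch below (the CSV paths above are hardcoded to one machine and will need adjusting elsewhere):

    $ python run_algorithms.py
    Choose the Dataset
    (i)University of California Irvine (ii)Kaggle
    i
    running neural networks with Feature Selection...
    ...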