
Merge pull request #218 from DedSecInside/add_nlp
Add website classification
KingAkeem authored Sep 8, 2021
2 parents b5f7213 + e721fff commit 74e2436
Showing 7 changed files with 1,612 additions and 3 deletions.
3 changes: 1 addition & 2 deletions .gitignore
@@ -30,7 +30,6 @@ torBot

venv/
.venv/
*.csv
.DS_Store
.env
data/*.csv
data/*.csv
113 changes: 112 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -21,6 +21,7 @@ python-dotenv = "^0.10.2"
threadsafe = "^1.0.0"
progress = "^1.5.0"
numpy = "^1.20.2"
scikit-learn = "^0.24.2"

[tool.poetry.dev-dependencies]
pytest = "^6.2.3"
12 changes: 12 additions & 0 deletions src/nlp/README.md
@@ -0,0 +1,12 @@
# Natural Language Processing Library

This library provides tools for performing natural language processing on websites.
It is currently in its infancy and can only be used for testing.

To test gathering data, use:
`python3 gather_data.py`
* This will generate the data necessary to train the classification model

To predict the classification of a website, use:
`python3 main.py -website https://www.github.com`
* Add the `-accuracy` argument to view the accuracy of the prediction
33 changes: 33 additions & 0 deletions src/nlp/gather_data.py
@@ -0,0 +1,33 @@
import csv
from pathlib import Path


def write_data():
"""
Writes the training data from the csv file to a directory based on the
scikit-learn.datasets `load_files` specification.
dataset source: https://www.kaggle.com/hetulmehta/website-classification
e.g.
container_folder/
category_1_folder/
file_1.txt file_2.txt file_3.txt ... file_42.txt
category_2_folder/
file_43.txt file_44.txt ...
"""

with open('website_classification.csv') as csvfile:
website_reader = csv.reader(csvfile, delimiter=',')
for row in website_reader:
[id, website, content, category] = row
if category != 'category':
category = category.replace('/', '+')
dir_name = f"training_data/{category}"
Path(dir_name).mkdir(parents=True, exist_ok=True)
with open(f'{dir_name}/{id}.txt', mode='w+') as txtfile:
txtfile.write(content)


if __name__ == "__main__":
write_data()
44 changes: 44 additions & 0 deletions src/nlp/main.py
@@ -0,0 +1,44 @@
import argparse
import requests
import numpy as np

from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.datasets import load_files

# get html for site
parser = argparse.ArgumentParser(description='Classify Website')
parser.add_argument('-website', type=str, help='Website to categorize')
parser.add_argument('-accuracy', action='store_true', help='Print accuracy')
args = parser.parse_args()
soup = BeautifulSoup(requests.get(args.website).text, features='html.parser')
html = soup.get_text()

# create classifier
clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier())
])
dataset = load_files('training_data')
x_train, x_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target
)
clf.fit(x_train, y_train)


website = 'Unknown'
if soup.title:
    website = soup.title.text

# predict returns an array of class indices; map the first one to its category name
predicted = clf.predict([html])
print(f'The category of {website} is {dataset.target_names[predicted[0]]}')

if args.accuracy:
    # accuracy of the trained pipeline on the held-out test split
    accuracy = np.mean(clf.predict(x_test) == y_test)
    print(f'Accuracy: {accuracy:.2%}')
