
Merge pull request #218 from DedSecInside/add_nlp
Add website classification
KingAkeem authored Sep 8, 2021
2 parents b5f7213 + e721fff commit 74e2436
Showing 7 changed files with 1,612 additions and 3 deletions.
3 changes: 1 addition & 2 deletions .gitignore
@@ -30,7 +30,6 @@ torBot

venv/
.venv/
*.csv
.DS_Store
.env
data/*.csv
data/*.csv
113 changes: 112 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -21,6 +21,7 @@ python-dotenv = "^0.10.2"
threadsafe = "^1.0.0"
progress = "^1.5.0"
numpy = "^1.20.2"
scikit-learn = "^0.24.2"

[tool.poetry.dev-dependencies]
pytest = "^6.2.3"
12 changes: 12 additions & 0 deletions src/nlp/README.md
@@ -0,0 +1,12 @@
# Natural Language Processing Library

This library provides tools for performing natural language processing on websites.
It is currently in its infancy and can only be used for testing.

To test gathering data, use:
`python3 gather_data.py`
* This will generate the data necessary to train the classification model

To predict the classification of a website, use:
`python3 main.py -website https://www.github.com`
* Add the `-accuracy` argument to view the accuracy of the prediction
33 changes: 33 additions & 0 deletions src/nlp/gather_data.py
@@ -0,0 +1,33 @@
import csv
from pathlib import Path


def write_data():
"""
Writes the training data from the csv file to a directory based on the
scikit-learn.datasets `load_files` specification.
dataset source: https://www.kaggle.com/hetulmehta/website-classification
e.g.
container_folder/
category_1_folder/
file_1.txt file_2.txt file_3.txt ... file_42.txt
category_2_folder/
file_43.txt file_44.txt ...
"""

with open('website_classification.csv') as csvfile:
website_reader = csv.reader(csvfile, delimiter=',')
for row in website_reader:
[id, website, content, category] = row
if category != 'category':
category = category.replace('/', '+')
dir_name = f"training_data/{category}"
Path(dir_name).mkdir(parents=True, exist_ok=True)
with open(f'{dir_name}/{id}.txt', mode='w+') as txtfile:
txtfile.write(content)


if __name__ == "__main__":
write_data()
44 changes: 44 additions & 0 deletions src/nlp/main.py
@@ -0,0 +1,44 @@
import argparse
import requests
import numpy as np

from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.datasets import load_files

# get html for site
parser = argparse.ArgumentParser(description='Classify Website')
parser.add_argument('-website', type=str, help='Website to categorize')
parser.add_argument('-accuracy', action='store_true', help='Print accuracy')
args = parser.parse_args()
soup = BeautifulSoup(requests.get(args.website).text, features='html.parser')
html = soup.get_text()

# create classifier
clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier())
])
dataset = load_files('training_data')
x_train, x_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target
)
clf.fit(x_train, y_train)


website = 'Unknown'
if soup.title:
    website = soup.title.text

# predict returns an array of class indices; map the first one to its category name
predicted = clf.predict([html])
print(f'The category of {website} is {dataset.target_names[predicted[0]]}')

if args.accuracy:
    # accuracy of the trained pipeline on the held-out test split
    accuracy = np.mean(clf.predict(x_test) == y_test)
    print(f'Accuracy: {accuracy:.2%}')
