Review notes: line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch, so whitespace-only details (blank
context lines, trailing spaces on removed lines) are a best guess. The "index"
post-image ids predate the review fixes made to the "+" lines of
.vscode/settings.json and scripts/download_base_datasets.py.

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 5416d7b..a701c10 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,6 +1,12 @@
 {
+    "editor.codeActionsOnSave": {
+        "source.fixAll": "explicit",
+        "source.organizeImports": "always"
+    },
     "cSpell.words": [
         "Kaggle",
+        "maxsplit",
+        "ngram",
         "ozlerhakan",
         "phishingemails",
         "rtatman",
diff --git a/phishnet.py b/phishnet.py
index bd5ed18..97f4648 100644
--- a/phishnet.py
+++ b/phishnet.py
@@ -1,18 +1,22 @@
+"""
+This module uses GPT-2 from Hugging Face's transformers library to generate text based on a prompt.
+"""
+
 from transformers import GPT2Tokenizer, GPT2LMHeadModel
 
 tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 model = GPT2LMHeadModel.from_pretrained('gpt2')
 
-prompt_text = "Urgent: Your account has been compromised. Please click the link immediately to protect your information:"
+PROMPT_TEXT = "Urgent: Your account has been compromised"
 
-inputs = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors='pt')
+inputs = tokenizer.encode(PROMPT_TEXT, add_special_tokens=False, return_tensors='pt')
 outputs = model.generate(
-    inputs, 
-    max_length=100, 
-    num_return_sequences=1, 
-    temperature=0.7, 
-    top_p=0.9, 
-    do_sample=True, 
+    inputs,
+    max_length=100,
+    num_return_sequences=1,
+    temperature=0.7,
+    top_p=0.9,
+    do_sample=True,
     no_repeat_ngram_size=2
 )
 
diff --git a/scripts/download_base_datasets.py b/scripts/download_base_datasets.py
index 747d5fe..9edf539 100644
--- a/scripts/download_base_datasets.py
+++ b/scripts/download_base_datasets.py
@@ -1,16 +1,29 @@
+"""
+This module downloads various datasets from Kaggle and saves them locally.
+"""
+
 import os
 import requests
 from tqdm import tqdm
 from kaggle.api.kaggle_api_extended import KaggleApi
 
 def download_file(url, filename):
-    with requests.get(url, stream=True) as r:
-        total_length = int(r.headers.get('content-length'))
+    """
+    Downloads a file from the given URL and saves it to the specified filename.
+
+    Raises requests.HTTPError on a 4xx/5xx response so that an error page is
+    never written to disk as if it were the dataset.
+    """
+    # Without a timeout, requests can wait forever on an unresponsive server.
+    with requests.get(url, stream=True, timeout=10) as r:
+        r.raise_for_status()
+        # Content-Length is absent on chunked responses; tqdm accepts total=None.
+        content_length = r.headers.get('content-length')
+        total_kb = int(content_length) // 1024 if content_length else None
         with open(filename, 'wb') as f:
-            for chunk in tqdm(r.iter_content(chunk_size=1024), total=total_length//1024, unit='KB', desc=f'Downloading {filename}'):
+            for chunk in tqdm(r.iter_content(chunk_size=1024), total=total_kb, unit='KB', desc=f'Downloading {filename}'):
                 if chunk:
                     f.write(chunk)
 
-
 if not os.path.exists('datasets'):
     os.makedirs('datasets')
@@ -30,10 +43,10 @@ def download_file(url, filename):
 
 # Download Kaggle datasets
 for dataset in kaggle_datasets:
-    dataset_key = dataset.split('/')[-1]
+    dataset_key = dataset.rsplit('/', maxsplit=1)[-1]
     dataset_path = f'datasets/{dataset_key}'
     if not os.path.exists(dataset_path):
         print(f'Trying to download {dataset_key}...')
         api.dataset_download_files(dataset, path='datasets', unzip=True, quiet=False)
 
-print("Datasets downloaded and renamed successfully.")
\ No newline at end of file
+print("Datasets downloaded and renamed successfully.")