diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 0000000..b4608e2
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1 @@
+* @sirlolcat
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
new file mode 100644
index 0000000..0f39daf
--- /dev/null
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,31 @@
+name: Pylint
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+    types: [opened, synchronize, reopened]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install pylint
+      - name: Analysing the code with pylint
+        run: |
+          pylint $(git ls-files '*.py')
diff --git a/.gitignore b/.gitignore
index 68bc17f..f5fe7e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,3 +158,5 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+datasets
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..a701c10
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,19 @@
+{
+    "editor.codeActionsOnSave": {
+        "source.fixAll": "explicit",
+        "source.organizeImports": "always"
+    },
+    "cSpell.words": [
+        "Kaggle",
+        "maxsplit",
+        "ngram",
+        "ozlerhakan",
+        "phishingemails",
+        "rtatman",
+        "subhajournal",
+        "suraj",
+        "tqdm",
+        "venky",
+        "wcukierski"
+    ]
+}
\ No newline at end of file
diff --git a/phishnet.py b/phishnet.py
new file mode 100644
index 0000000..97f4648
--- /dev/null
+++ b/phishnet.py
@@ -0,0 +1,28 @@
+"""
+This module uses GPT-2 from Hugging Face's transformers library to generate text based on a prompt.
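+
+Sampling is enabled (do_sample=True) with temperature and nucleus (top_p)
+filtering, and no_repeat_ngram_size=2 blocks repeated bigrams, so each run
+produces a different, less repetitive continuation of the prompt.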
+""" + +from transformers import GPT2Tokenizer, GPT2LMHeadModel + +tokenizer = GPT2Tokenizer.from_pretrained('gpt2') +model = GPT2LMHeadModel.from_pretrained('gpt2') + +PROMPT_TEXT = "Urgent: Your account has been compromised" + +inputs = tokenizer.encode(PROMPT_TEXT, add_special_tokens=False, return_tensors='pt') +outputs = model.generate( + inputs, + max_length=100, + num_return_sequences=1, + temperature=0.7, + top_p=0.9, + do_sample=True, + no_repeat_ngram_size=2 +) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..70db91d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,32 @@ +accelerate==0.24.1 +bleach==6.1.0 +certifi==2023.11.17 +charset-normalizer==3.3.2 +filelock==3.13.1 +fsspec==2023.10.0 +huggingface-hub==0.19.4 +idna==3.4 +Jinja2==3.1.2 +kaggle==1.5.16 +MarkupSafe==2.1.3 +mpmath==1.3.0 +networkx==3.2.1 +numpy==1.26.2 +packaging==23.2 +psutil==5.9.6 +python-dateutil==2.8.2 +python-slugify==8.0.1 +PyYAML==6.0.1 +regex==2023.10.3 +requests==2.31.0 +safetensors==0.4.0 +six==1.16.0 +sympy==1.12 +text-unidecode==1.3 +tokenizers==0.15.0 +torch==2.1.1 +tqdm==4.66.1 +transformers==4.35.2 +typing_extensions==4.8.0 +urllib3==2.1.0 +webencodings==0.5.1 diff --git a/scripts/download_base_datasets.py b/scripts/download_base_datasets.py new file mode 100644 index 0000000..9edf539 --- /dev/null +++ b/scripts/download_base_datasets.py @@ -0,0 +1,45 @@ +""" +This module downloads various datasets from Kaggle and saves them locally. +""" + +import os +import requests +from tqdm import tqdm +from kaggle.api.kaggle_api_extended import KaggleApi + +def download_file(url, filename): + """ + Downloads a file from the given URL and saves it to the specified filename. + """ + with requests.get(url, stream=True, timeout=10) as r: # Added timeout + total_length = int(r.headers.get('content-length')) + with open(filename, 'wb') as f: + for chunk in tqdm(r.iter_content(chunk_size=1024), total=total_length//1024, unit='KB', desc=f'Downloading {filename}'): + if chunk: + f.write(chunk) +if not os.path.exists('datasets'): + os.makedirs('datasets') + +# Kaggle datasets +kaggle_datasets = [ + "wcukierski/enron-email-dataset", + "rtatman/fraudulent-email-corpus", + "venky73/spam-mails-dataset", + "subhajournal/phishingemails", + "suraj520/customer-support-ticket-dataset", + "ozlerhakan/spam-or-not-spam-dataset", +] + +# Initialize Kaggle API +api = KaggleApi() +api.authenticate() + +# Download Kaggle datasets +for dataset in kaggle_datasets: + dataset_key = dataset.rsplit('/', maxsplit=1)[-1] + dataset_path = f'datasets/{dataset_key}' + if not os.path.exists(dataset_path): + print(f'Trying to download {dataset_key}...') + api.dataset_download_files(dataset, path='datasets', unzip=True, quiet=False) + +print("Datasets downloaded and renamed successfully.")