chore: update .gitignore
feat: add dataset download script

feat: add phishnet driver script

chore: create pylint action

PyLint action now runs on pull requests to the main branch.

Signed-off-by: dann <[email protected]>

chore: adjust pylint.yml

style: adjust PyLint style

chore: add CODEOWNERS

Signed-off-by: dann <[email protected]>

fix: adjust pylint action
0x5844 committed Nov 25, 2023
1 parent 993ef1c commit c88219b
Showing 7 changed files with 153 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -0,0 +1 @@
* @sirlolcat
31 changes: 31 additions & 0 deletions .github/workflows/pylint.yml
@@ -0,0 +1,31 @@
name: Pylint

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
    types: [opened, synchronize, reopened]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10"]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pylint
      - name: Analysing the code with pylint
        run: |
          pylint $(git ls-files '*.py')
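The job lints every tracked Python file in one pass, and pylint exits non-zero whenever it emits messages, which is what fails the build. A minimal local equivalent, sketched in Python under the assumption that git and pylint are already on PATH:

# Sketch of the CI lint step run locally; assumes git and pylint are installed.
import subprocess

# Mirror `git ls-files '*.py'` to collect the tracked Python sources.
files = subprocess.run(
    ['git', 'ls-files', '*.py'],
    capture_output=True, text=True, check=True,
).stdout.split()

# Run pylint over the same file list the workflow uses; a non-zero
# exit code here is what would fail the GitHub Action.
subprocess.run(['pylint', *files], check=True)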
2 changes: 2 additions & 0 deletions .gitignore
@@ -158,3 +158,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

datasets
19 changes: 19 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,19 @@
{
  "editor.codeActionsOnSave": {
    "source.fixAll": true,
    "source.organizeImports": "always"
  },
  "cSpell.words": [
    "Kaggle",
    "maxsplit",
    "ngram",
    "ozlerhakan",
    "phishingemails",
    "rtatman",
    "subhajournal",
    "suraj",
    "tqdm",
    "venky",
    "wcukierski"
  ]
}
23 changes: 23 additions & 0 deletions phishnet.py
@@ -0,0 +1,23 @@
"""
This module uses GPT-2 from Hugging Face's transformers library to generate text based on a prompt.
"""

from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

PROMPT_TEXT = "Urgent: Your account has been compromised"

inputs = tokenizer.encode(PROMPT_TEXT, add_special_tokens=False, return_tensors='pt')
outputs = model.generate(
inputs,
max_length=100,
num_return_sequences=1,
temperature=0.7,
top_p=0.9,
do_sample=True,
no_repeat_ngram_size=2
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
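One caveat with this call: GPT-2 ships without a pad token, so model.generate() logs a warning and falls back to the EOS token. A variant that makes that choice explicit and also passes the attention mask the tokenizer produces (an illustration, not part of this commit):

# Variant sketch: reuse EOS as pad and pass an explicit attention mask.
# Illustration only; this is not the committed code.
enc = tokenizer(PROMPT_TEXT, return_tensors='pt')
outputs = model.generate(
    enc['input_ids'],
    attention_mask=enc['attention_mask'],
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no native pad token
    max_length=100,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)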
32 changes: 32 additions & 0 deletions requirements.txt
@@ -0,0 +1,32 @@
accelerate==0.24.1
bleach==6.1.0
certifi==2023.11.17
charset-normalizer==3.3.2
filelock==3.13.1
fsspec==2023.10.0
huggingface-hub==0.19.4
idna==3.4
Jinja2==3.1.2
kaggle==1.5.16
MarkupSafe==2.1.3
mpmath==1.3.0
networkx==3.2.1
numpy==1.26.2
packaging==23.2
psutil==5.9.6
python-dateutil==2.8.2
python-slugify==8.0.1
PyYAML==6.0.1
regex==2023.10.3
requests==2.31.0
safetensors==0.4.0
six==1.16.0
sympy==1.12
text-unidecode==1.3
tokenizers==0.15.0
torch==2.1.1
tqdm==4.66.1
transformers==4.35.2
typing_extensions==4.8.0
urllib3==2.1.0
webencodings==0.5.1
45 changes: 45 additions & 0 deletions scripts/download_base_datasets.py
@@ -0,0 +1,45 @@
"""
This module downloads various datasets from Kaggle and saves them locally.
"""

import os
import requests
from tqdm import tqdm
from kaggle.api.kaggle_api_extended import KaggleApi

def download_file(url, filename):
"""
Downloads a file from the given URL and saves it to the specified filename.
"""
with requests.get(url, stream=True, timeout=10) as r: # Added timeout
total_length = int(r.headers.get('content-length'))
with open(filename, 'wb') as f:
for chunk in tqdm(r.iter_content(chunk_size=1024), total=total_length//1024, unit='KB', desc=f'Downloading {filename}'):
if chunk:
f.write(chunk)
if not os.path.exists('datasets'):
os.makedirs('datasets')

# Kaggle datasets
kaggle_datasets = [
"wcukierski/enron-email-dataset",
"rtatman/fraudulent-email-corpus",
"venky73/spam-mails-dataset",
"subhajournal/phishingemails",
"suraj520/customer-support-ticket-dataset",
"ozlerhakan/spam-or-not-spam-dataset",
]

# Initialize Kaggle API
api = KaggleApi()
api.authenticate()

# Download Kaggle datasets
for dataset in kaggle_datasets:
dataset_key = dataset.rsplit('/', maxsplit=1)[-1]
dataset_path = f'datasets/{dataset_key}'
if not os.path.exists(dataset_path):
print(f'Trying to download {dataset_key}...')
api.dataset_download_files(dataset, path='datasets', unzip=True, quiet=False)

print("Datasets downloaded and renamed successfully.")
