Skip to content

Commit

Permalink
Merge pull request #68 from SADiLaR/feature/pdf-mass-upload-test
Browse files Browse the repository at this point in the history
added mass pdf upload command for testing
  • Loading branch information
daniel-gray-tangent authored Jun 7, 2024
2 parents 29a4a5d + caeda56 commit 6248706
Show file tree
Hide file tree
Showing 13 changed files with 243 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,5 @@ app/static_files/
app/media/
/app/logging/
/logging/
/pdf_uploads/
/pdf_upload_completed/
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@ list:
@echo "ruff-fix - Run ruff check --fix"
@echo "pre-commit-install - Install pre-commit"
@echo "dev-quick-install - Run all the necessary commands to start the project"
@echo "dev-mass-pdf-upload - Run command to upload all pdf files in the media folder"
@echo "make-messages - Run command to ensure translation .po files are created"
@echo "compile-messages - Run command to ensure translation .mo files are created"
@echo "docker-shell - Access the container shell"
@echo "check - Run the Django check command"

up:
@docker compose up
Expand Down Expand Up @@ -93,6 +96,9 @@ dev-quick-install:
echo "Creating superuser"
@make create-super-user

# Run the dev_pdf_mass_upload Django management command in a one-off "web"
# container; --rm removes the container once the command exits.
dev-mass-pdf-upload:
@docker compose run --rm web python manage.py dev_pdf_mass_upload

docker-shell:
docker exec -it sadilar-terminology-web bash

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,5 @@ Docker Volumes for production:

* /media
* /logging
* /pdf_uploads
* /pdf_upload_completed
Empty file.
Empty file.
107 changes: 107 additions & 0 deletions app/general/management/commands/dev_pdf_mass_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import os
import random
import shutil

import magic
from django.core.files.base import ContentFile
from django.core.management.base import BaseCommand

from general.models import DocumentFile
from general.service.extract_text import GetTextError, GetTextFromPDF


class Command(BaseCommand):
    """Development helper that bulk-imports PDF files as ``DocumentFile`` rows.

    Every file found under ``dir_main`` (searched recursively) is inspected
    with libmagic. PDFs are stored in the database and moved to
    ``dir_completed``; anything else, and any PDF whose text cannot be
    extracted, is moved to ``dir_error``.
    """

    help = "Mass PDF uploader for testing purposes."

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Source directory and the two destinations for processed files.
        self.dir_main = "/pdf_uploads/"
        self.dir_completed = "/pdf_upload_completed/completed/"
        self.dir_error = "/pdf_upload_completed/error/"

    def handle(self, *args, **options):
        """Create the destination directories and process every file in ``dir_main``."""
        os.system("clear")
        print("Mass file uploader for testing purposes.")

        self.create_directory(self.dir_completed)
        self.create_directory(self.dir_error)

        for root, _dirs, files in os.walk(self.dir_main):
            for file in files:
                file_path = os.path.join(root, file)

                # Check if the file is a PDF file and save the data
                self.handle_file(file_path, file)

    def handle_file(self, file_path, file):
        """Import a single file and move it out of the upload directory.

        Args:
            file_path: Full path of the file on disk.
            file: Bare file name; it is used as the document title.
        """
        # Get the file type
        file_type = magic.from_file(file_path, mime=True)

        # Check if the file is a PDF file
        directory = self.check_file_type(file_type)
        self.print_pdf_file(file)

        # If the file is a PDF, save the data and move the file afterwards
        if directory:
            data = {
                "title": file.strip(),
                "file": file.strip(),
                "uploaded_file": file_path,
            }
            # save_data returns False when the text could not be extracted;
            # such a file was not imported, so it must not land in "completed".
            # Any other return value (including None) counts as success.
            if self.save_data(data) is False:
                directory = self.dir_error

            self.move_file(file_path, file, directory)

        # If the file is not a PDF file, print an error message and move the file to the error directory
        else:
            self.print_error()
            # Move the file to the error directory
            self.move_file(file_path, file, self.dir_error)

    def check_file_type(self, file_type):
        """Return ``dir_completed`` for the PDF MIME type, otherwise ``None``."""
        return self.dir_completed if file_type == "application/pdf" else None

    def move_file(self, file_path, file, directory):
        """Move ``file_path`` into ``directory`` unless a file of that name is already there."""
        # os.path.join works whether or not ``directory`` ends with a separator.
        destination = os.path.join(directory, file)
        if not os.path.isfile(destination):
            shutil.move(file_path, directory)
        else:
            print(
                f"The file '{os.path.basename(destination)}' already exists in the destination directory."
            )

    def print_pdf_file(self, file):
        """Print the file name in green."""
        print("\n")
        print("\033[92m" + file + "\033[0m")

    def print_error(self):
        """Print the "only PDFs allowed" message in red."""
        print("\n")
        print("\033[91m" + "Only PDF files are allowed" + "\033[0m")

    def save_data(self, data):
        """Extract the text of a PDF and store it as a ``DocumentFile``.

        Args:
            data: Mapping with ``"title"`` (document title and stored file
                name) and ``"uploaded_file"`` (path of the PDF on disk).

        Returns:
            ``True`` when the document was saved, ``False`` when text
            extraction raised ``GetTextError``.
        """
        # Generate a random number for the institution ID
        random_number = random.randint(1, 20)

        try:
            # Scrapes the PDF file and extracts the text
            document_data = GetTextFromPDF(data["uploaded_file"]).to_text()
        except GetTextError as e:
            print(f"Error: {e}")
            return False

        # ContentFile expects the file *content*; handing it the path string
        # would store the path text as the uploaded document.
        with open(data["uploaded_file"], "rb") as pdf_file:
            content_file = ContentFile(pdf_file.read(), name=data["title"])

        instance = DocumentFile(
            title=data["title"],
            document_data=document_data,
            uploaded_file=content_file,
            document_type="Glossary",
            institution_id=random_number,
        )
        instance.save()
        return True

    def create_directory(self, directory):
        """Create ``directory`` (and parents) if missing; report failures without raising."""
        try:
            os.makedirs(directory, exist_ok=True)
        except OSError as error:
            print(f"Directory '{directory}' can not be created. Error: {error}")
Empty file added app/general/service/__init__.py
Empty file.
27 changes: 27 additions & 0 deletions app/general/service/extract_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from pypdf import PdfReader
from pypdf.errors import PdfStreamError


class GetTextError(Exception):
    """Error raised when text extraction from a PDF fails."""


class GetTextFromPDF:
    """Extract the text content of a PDF with ``pypdf``."""

    def __init__(self, uploaded_file):
        # Whatever ``PdfReader`` accepts; callers in this project pass either
        # a path string or a file object opened in binary mode.
        self.uploaded_file = uploaded_file

    def to_text(self):
        """Return the text of every page joined by single spaces.

        Returns:
            The extracted text, or ``None`` when no file was supplied
            (``uploaded_file`` is falsy).

        Raises:
            GetTextError: If ``pypdf`` raises ``PdfStreamError`` while
                reading the file. The original error is kept as
                ``__cause__``.
        """
        if not self.uploaded_file:
            return None

        # Read the PDF file and extract text
        try:
            reader = PdfReader(self.uploaded_file)
            page_texts = [page.extract_text() for page in reader.pages]
        except PdfStreamError as error:
            # Chain the cause so the underlying parser failure stays visible.
            raise GetTextError(
                "The uploaded PDF file is corrupted or not fully downloaded."
            ) from error

        return " ".join(page_texts)
65 changes: 65 additions & 0 deletions app/general/tests/test_dev_mass_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
import unittest
from unittest.mock import MagicMock

from faker import Faker

from general.management.commands.dev_pdf_mass_upload import Command
from general.models import DocumentFile, Institution


class TestHandleFile(unittest.TestCase):
    """Tests for the ``dev_pdf_mass_upload`` management command."""

    def setUp(self):
        # Replace the collaborators of handle_file with mocks so each test
        # can check which of them were invoked.
        self.command = Command()
        self.command.check_file_type = MagicMock()
        self.command.move_file = MagicMock()
        self.command.print_error = MagicMock()
        self.command.print_pdf_file = MagicMock()
        self.command.save_data = MagicMock()
        self.test_dir = os.getenv("TESTING_DIR", "/app/general/tests/files")
        # os.path.join supplies the separator between directory and file
        # name, with or without a trailing slash on TESTING_DIR.
        self.test_file = os.path.join(self.test_dir, "Lorem.pdf")
        self.fake = Faker()

    def test_handle_file_pdf(self):
        """A file reported as PDF is saved and moved, with no error printed."""
        self.command.check_file_type.return_value = self.test_dir
        self.command.handle_file(self.test_file, self.test_file)
        self.command.check_file_type.assert_called_once()
        self.command.move_file.assert_called_once()
        self.command.save_data.assert_called_once()
        self.command.print_pdf_file.assert_called_once()
        self.command.print_error.assert_not_called()

    def test_handle_file_non_pdf(self):
        """A file reported as non-PDF is moved and flagged but never saved."""
        self.command.check_file_type.return_value = None
        self.command.handle_file(self.test_file, self.test_file)
        self.command.check_file_type.assert_called_once()
        self.command.move_file.assert_called_once()
        self.command.save_data.assert_not_called()
        self.command.print_pdf_file.assert_called_once()
        self.command.print_error.assert_called_once()

    def test_check_file_type_pdf(self):
        """The real check_file_type maps the PDF MIME type to dir_completed."""
        # setUp swaps check_file_type for a MagicMock, so use a fresh command
        # to exercise the actual implementation.
        command = Command()
        self.assertEqual(command.check_file_type("application/pdf"), command.dir_completed)
        self.assertIsNone(command.check_file_type("text/plain"))

    def test_save_data(self):
        """save_data stores a DocumentFile with the given title."""
        self.command = Command()
        # Create some Institutions instances for testing
        for _ in range(20):
            Institution.objects.create(
                name=self.fake.company(),
                abbreviation=self.fake.company_suffix(),
                url=self.fake.url(),
                email=self.fake.company_email(),
                logo="",
            )

        data = {
            "title": "Test file",
            "file": "Test file",
            "uploaded_file": self.test_file,
        }

        self.command.save_data(data)

        document_file = DocumentFile.objects.get(title="Test file")
        self.assertEqual(document_file.title, "Test file")
30 changes: 30 additions & 0 deletions app/general/tests/test_extract_text_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import os
import unittest

from general.service.extract_text import GetTextFromPDF


class TestExtractTextService(unittest.TestCase):
    """Checks ``GetTextFromPDF`` against the ``Lorem.pdf`` fixture."""

    def setUp(self):
        base_dir = os.getenv("TESTING_DIR", "/app/general/tests/files")
        self.file_mock = f"{base_dir}/Lorem.pdf"

    def _extracted_words(self):
        """Return the fixture's extracted text split on whitespace."""
        with open(self.file_mock, "rb") as pdf_handle:
            # Extraction has to happen while the file handle is still open.
            extracted = GetTextFromPDF(pdf_handle).to_text()
        return extracted.strip().split()

    def test_in_text(self):
        """A word present in the fixture appears in the extracted text."""
        self.assertIn("turpis.", self._extracted_words())

    def test_not_in_text(self):
        """A word absent from the fixture is not in the extracted text."""
        self.assertNotIn("notintext.", self._extracted_words())
2 changes: 2 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ services:
volumes:
- ./app:/app
- ./logging:/logging
- ./pdf_uploads:/pdf_uploads
- ./pdf_upload_completed:/pdf_upload_completed
ports:
- "8000:8000"
depends_on:
Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
django-extensions
pygraphviz
ruff
faker
1 change: 1 addition & 0 deletions requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
-r requirements.txt
django-extensions
ruff
faker

0 comments on commit 6248706

Please sign in to comment.