-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #68 from SADiLaR/feature/pdf-mass-upload-test
added mass pdf upload command for testing
- Loading branch information
Showing
13 changed files
with
243 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,3 +37,5 @@ app/static_files/ | |
app/media/ | ||
/app/logging/ | ||
/logging/ | ||
/pdf_uploads/ | ||
/pdf_upload_completed/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,3 +56,5 @@ Docker Volumes for production: | |
|
||
* /media | ||
* /logging | ||
* /pdf_uploads | ||
* /pdf_upload_completed |
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
import os | ||
import random | ||
import shutil | ||
|
||
import magic | ||
from django.core.files.base import ContentFile | ||
from django.core.management.base import BaseCommand | ||
|
||
from general.models import DocumentFile | ||
from general.service.extract_text import GetTextError, GetTextFromPDF | ||
|
||
|
||
class Command(BaseCommand):
    """Bulk-import the files found under ``dir_main`` for testing purposes.

    Every file below ``dir_main`` is inspected. PDFs are stored as
    ``DocumentFile`` records and moved to ``dir_completed``; non-PDF files,
    and PDFs whose text could not be extracted, are moved to ``dir_error``.
    """

    help = "Mass PDF uploader for testing purposes."

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Directory that is scanned (recursively) for files to import.
        self.dir_main = "/pdf_uploads/"
        # Destination for files that were imported.
        self.dir_completed = "/pdf_upload_completed/completed/"
        # Destination for files that were rejected or failed to import.
        self.dir_error = "/pdf_upload_completed/error/"

    def handle(self, *args, **options):
        """Create the destination directories and process every file in ``dir_main``."""
        os.system("clear")
        print("Mass file uploader for testing purposes.")

        self.create_directory(self.dir_completed)
        self.create_directory(self.dir_error)

        for root, _dirs, files in os.walk(self.dir_main):
            for file in files:
                # Check if the file is a PDF file and save the data
                self.handle_file(os.path.join(root, file), file)

    def handle_file(self, file_path, file):
        """Import one file and move it to the matching destination directory.

        Args:
            file_path: Full path of the file on disk.
            file: Bare file name; used as the document title and for the
                duplicate check in the destination directory.
        """
        # Detect the MIME type from the file content, not from its extension.
        file_type = magic.from_file(file_path, mime=True)

        directory = self.check_file_type(file_type)
        self.print_pdf_file(file)

        # Not a PDF: report it and park the file in the error directory.
        if not directory:
            self.print_error()
            self.move_file(file_path, file, self.dir_error)
            return

        data = {
            "title": file.strip(),
            "file": file.strip(),
            "uploaded_file": file_path,
        }
        saved = self.save_data(data)

        # A file that could not be saved must not end up in "completed".
        # Only an explicit ``False`` counts as failure so that overrides of
        # ``save_data`` which return nothing keep the previous behaviour.
        if saved is False:
            directory = self.dir_error

        self.move_file(file_path, file, directory)

    def check_file_type(self, file_type):
        """Return ``dir_completed`` for the PDF MIME type, otherwise ``None``."""
        return self.dir_completed if file_type == "application/pdf" else None

    def move_file(self, file_path, file, directory):
        """Move ``file_path`` into ``directory`` unless a file of that name is already there.

        Args:
            file_path: Full path of the file to move.
            file: Bare file name, used to detect an existing file in ``directory``.
            directory: Destination directory.
        """
        # os.path.join works whether or not ``directory`` ends with a separator.
        destination = os.path.join(directory, file)
        if os.path.isfile(destination):
            print(
                f"The file '{os.path.basename(destination)}' already exists in the destination directory."
            )
            return
        shutil.move(file_path, directory)

    def print_pdf_file(self, file):
        """Print the file name in green."""
        print("\n")
        print("\033[92m" + file + "\033[0m")

    def print_error(self):
        """Print the "only PDF files" error message in red."""
        print("\n")
        print("\033[91m" + "Only PDF files are allowed" + "\033[0m")

    def save_data(self, data):
        """Extract the text of a PDF and store it as a ``DocumentFile``.

        Args:
            data: Mapping with the keys ``"title"`` (document title, also
                used as the stored file name) and ``"uploaded_file"`` (path
                of the PDF on disk).

        Returns:
            ``True`` when the document was saved, ``False`` when the text
            extraction raised ``GetTextError``.
        """
        # Generate a random number for the institution ID
        random_number = random.randint(1, 20)

        try:
            # Scrapes the PDF file and extracts the text
            document_data = GetTextFromPDF(data["uploaded_file"]).to_text()
        except GetTextError as e:
            print(f"Error: {e}")
            return False

        # Upload the actual PDF bytes; wrapping the path string itself would
        # store a file whose content is just the path text.
        with open(data["uploaded_file"], "rb") as pdf_file:
            content_file = ContentFile(pdf_file.read(), name=data["title"])

        instance = DocumentFile(
            title=data["title"],
            document_data=document_data,
            uploaded_file=content_file,
            document_type="Glossary",
            institution_id=random_number,
        )
        instance.save()
        return True

    def create_directory(self, directory):
        """Create ``directory`` (and missing parents); print a message on failure."""
        try:
            os.makedirs(directory, exist_ok=True)
        except OSError as error:
            print(f"Directory '{directory}' can not be created. Error: {error}")
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from pypdf import PdfReader | ||
from pypdf.errors import PdfStreamError | ||
|
||
|
||
class GetTextError(Exception):
    """Raised when the text of a PDF file cannot be extracted."""
|
||
|
||
class GetTextFromPDF:
    """Extract the plain text of a PDF using ``pypdf``."""

    def __init__(self, uploaded_file):
        # Handed to ``PdfReader`` unchanged when the text is extracted.
        self.uploaded_file = uploaded_file

    def to_text(self):
        """Return the text of all pages joined by single spaces.

        Returns:
            The extracted text, or ``None`` when ``uploaded_file`` is falsy.

        Raises:
            GetTextError: If ``pypdf`` raises ``PdfStreamError`` while
                reading the file.
        """
        if not self.uploaded_file:
            return None

        try:
            reader = PdfReader(self.uploaded_file)
            return " ".join(page.extract_text() for page in reader.pages)
        except PdfStreamError as error:
            # Chain the original error so the traceback keeps the root cause.
            raise GetTextError(
                "The uploaded PDF file is corrupted or not fully downloaded."
            ) from error
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import os | ||
import unittest | ||
from unittest.mock import MagicMock | ||
|
||
from faker import Faker | ||
|
||
from general.management.commands.dev_pdf_mass_upload import Command | ||
from general.models import DocumentFile, Institution | ||
|
||
|
||
class TestHandleFile(unittest.TestCase):
    """Tests for the ``dev_pdf_mass_upload`` management command."""

    def setUp(self):
        # The collaborators of ``handle_file`` are mocked so that no file is
        # moved and nothing is written to the database by those tests.
        self.command = Command()
        self.command.check_file_type = MagicMock()
        self.command.move_file = MagicMock()
        self.command.print_error = MagicMock()
        self.command.print_pdf_file = MagicMock()
        self.command.save_data = MagicMock()
        self.test_dir = os.getenv("TESTING_DIR", "/app/general/tests/files")
        # os.path.join inserts the separator that plain concatenation dropped.
        self.test_file = os.path.join(self.test_dir, "Lorem.pdf")
        self.fake = Faker()

    def test_handle_file_pdf(self):
        """A PDF is saved and moved, and no error is printed."""
        self.command.check_file_type.return_value = self.test_dir
        self.command.handle_file(self.test_file, os.path.basename(self.test_file))
        self.command.check_file_type.assert_called_once()
        self.command.move_file.assert_called_once()
        self.command.save_data.assert_called_once()
        self.command.print_pdf_file.assert_called_once()
        self.command.print_error.assert_not_called()

    def test_handle_file_non_pdf(self):
        """A non-PDF is moved and reported as an error without being saved."""
        self.command.check_file_type.return_value = None
        self.command.handle_file(self.test_file, os.path.basename(self.test_file))
        self.command.check_file_type.assert_called_once()
        self.command.move_file.assert_called_once()
        self.command.save_data.assert_not_called()
        self.command.print_pdf_file.assert_called_once()
        self.command.print_error.assert_called_once()

    def test_check_file_type_pdf(self):
        """The real ``check_file_type`` maps the PDF MIME type to ``dir_completed``."""
        # Use an unmocked command: asserting on the MagicMock installed in
        # setUp would pass regardless of the implementation.
        command = Command()
        self.assertEqual(command.check_file_type("application/pdf"), command.dir_completed)
        self.assertNotEqual(command.check_file_type("application/pdf"), self.test_dir)
        self.assertIsNone(command.check_file_type("text/plain"))

    def test_save_data(self):
        """``save_data`` stores a ``DocumentFile`` with the given title."""
        self.command = Command()
        # Create some Institutions instances for testing.
        # NOTE(review): save_data picks institution_id from 1..20, so this
        # presumably relies on these rows receiving those ids - confirm.
        for _ in range(20):
            Institution.objects.create(
                name=self.fake.company(),
                abbreviation=self.fake.company_suffix(),
                url=self.fake.url(),
                email=self.fake.company_email(),
                logo="",
            )

        data = {
            "title": "Test file",
            "file": "Test file",
            "uploaded_file": self.test_file,
        }

        self.command.save_data(data)

        document_file = DocumentFile.objects.get(title="Test file")
        self.assertEqual(document_file.title, "Test file")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import os | ||
import unittest | ||
|
||
from general.service.extract_text import GetTextFromPDF | ||
|
||
|
||
class TestExtractTextService(unittest.TestCase):
    """Tests for ``GetTextFromPDF`` against the ``Lorem.pdf`` fixture."""

    def setUp(self):
        base_dir = os.getenv("TESTING_DIR", "/app/general/tests/files")
        self.file_mock = f"{base_dir}/Lorem.pdf"

    def _extract_words(self):
        """Return the whitespace-separated words extracted from the fixture."""
        # The extraction runs while the file handle is still open.
        with open(self.file_mock, "rb") as handle:
            extracted = GetTextFromPDF(handle).to_text()
        return extracted.strip().split()

    def test_in_text(self):
        """A word that occurs in the fixture is found in the extracted text."""
        self.assertIn("turpis.", self._extract_words())

    def test_not_in_text(self):
        """A word that does not occur in the fixture is absent from the text."""
        self.assertNotIn("notintext.", self._extract_words())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,3 +2,4 @@ | |
django-extensions | ||
pygraphviz | ||
ruff | ||
faker |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
-r requirements.txt | ||
django-extensions | ||
ruff | ||
faker |