Skip to content

Commit

Permalink
Merge pull request #72 from SADiLaR/feature/add-admin-service
Browse files: browse the repository at this point in the history
Updated the admin form to use the GetTextFromPDF service
  • Loading branch information
daniel-gray-tangent authored Jun 10, 2024
2 parents be4cc1c + 9c60c3c commit 8f639f3
Showing 1 changed file with 10 additions and 23 deletions.
33 changes: 10 additions & 23 deletions app/general/admin.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import magic
from django.contrib import admin
from django.forms import HiddenInput, ModelForm
from pypdf import PdfReader
from pypdf.errors import PdfStreamError
from simple_history.admin import SimpleHistoryAdmin

from general.service.extract_text import GetTextError, GetTextFromPDF

from .models import DocumentFile, Institution, Language, Project, Subject


Expand Down Expand Up @@ -34,8 +34,14 @@ def clean(self):
if file_type != "application/pdf":
self.add_error("uploaded_file", "Only PDF files are allowed.")

# Extract text from PDF file
cleaned_data["document_data"] = self.pdf_to_text(uploaded_file)
try:
# Extract text from PDF file
cleaned_data["document_data"] = GetTextFromPDF(uploaded_file).to_text()

except GetTextError:
return self.add_error(
"uploaded_file", "The uploaded PDF file is corrupted or not fully downloaded."
)

cleaned_data["mime_type"] = file_type

Expand All @@ -52,25 +58,6 @@ def clean(self):

return cleaned_data

def pdf_to_text(self, uploaded_file):
    """Extract the text of every page of an uploaded PDF as one string.

    Args:
        uploaded_file: The uploaded file object handed to ``PdfReader``.
            May be falsy when no file was submitted.

    Returns:
        The text of all pages joined with single spaces, or ``None`` when
        no file was given or the PDF could not be read. In the unreadable
        case a validation error is also attached to the ``uploaded_file``
        field via ``self.add_error``.
    """
    # Guard clause: nothing to extract when no file was submitted.
    if not uploaded_file:
        return None

    # Imported locally so this method does not depend on the module-level
    # import list. PdfReadError is the base class of PdfStreamError in pypdf,
    # so catching it also covers the other read failures pypdf raises for
    # damaged or empty files, not just broken streams.
    from pypdf.errors import PdfReadError

    try:
        reader = PdfReader(uploaded_file)
        # str.join already returns a str, so no extra conversion is needed.
        return " ".join(page.extract_text() for page in reader.pages)
    except PdfReadError:
        self.add_error(
            "uploaded_file", "The uploaded PDF file is corrupted or not fully downloaded."
        )
        # Make the "no text available" result explicit for the caller.
        return None


class DocumentFileAdmin(SimpleHistoryAdmin):
ordering = ["title"]
Expand Down

0 comments on commit 8f639f3

Please sign in to comment.