From 9c60c3c95cb5314bb510c02cdeba8c758ffca360 Mon Sep 17 00:00:00 2001 From: Daniel Gray Date: Fri, 7 Jun 2024 14:01:27 +0200 Subject: [PATCH] updated admin with GetTextFromPDF service - removed pdf_to_text --- app/general/admin.py | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/app/general/admin.py b/app/general/admin.py index e91dd51f..a9b55bd0 100644 --- a/app/general/admin.py +++ b/app/general/admin.py @@ -1,10 +1,10 @@ import magic from django.contrib import admin from django.forms import HiddenInput, ModelForm -from pypdf import PdfReader -from pypdf.errors import PdfStreamError from simple_history.admin import SimpleHistoryAdmin +from general.service.extract_text import GetTextError, GetTextFromPDF + from .models import DocumentFile, Institution, Language, Project, Subject @@ -34,8 +34,14 @@ def clean(self): if file_type != "application/pdf": self.add_error("uploaded_file", "Only PDF files are allowed.") - # Extract text from PDF file - cleaned_data["document_data"] = self.pdf_to_text(uploaded_file) + try: + # Extract text from PDF file + cleaned_data["document_data"] = GetTextFromPDF(uploaded_file).to_text() + + except GetTextError: + return self.add_error( + "uploaded_file", "The uploaded PDF file is corrupted or not fully downloaded." + ) cleaned_data["mime_type"] = file_type @@ -52,25 +58,6 @@ def clean(self): return cleaned_data - def pdf_to_text(self, uploaded_file): - if uploaded_file: - text_list = [] - # Read the PDF file and extract text - try: - reader = PdfReader(uploaded_file) - for page in reader.pages: - text_list.append(page.extract_text()) - - get_pdf_text = " ".join(text_list) - - return str(get_pdf_text) - - except PdfStreamError: - return self.add_error( - "uploaded_file", "The uploaded PDF file is corrupted or not fully downloaded." - ) - return None - class DocumentFileAdmin(SimpleHistoryAdmin): ordering = ["title"]