Skip to content

Commit

Permalink
Merge pull request #114 from SADiLaR/rework-text-extraction
Browse files Browse the repository at this point in the history
Rework text extraction
  • Loading branch information
friedelwolff authored Aug 17, 2024
2 parents 401f92d + 1128177 commit b0b2f02
Show file tree
Hide file tree
Showing 7 changed files with 157 additions and 238 deletions.
31 changes: 14 additions & 17 deletions app/general/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from django.utils.translation import gettext as _
from simple_history.admin import SimpleHistoryAdmin

from general.service.extract_text import GetTextError, GetTextFromPDF
from general.service.extract_text import GetTextError, pdf_to_text

from .models import DocumentFile, Institution, Language, Project, Subject

Expand Down Expand Up @@ -34,29 +34,26 @@ def clean(self):
file_type = magic.from_buffer(uploaded_file.read(), mime=True)
if file_type != "application/pdf":
self.add_error("uploaded_file", _("Only PDF files are allowed."))

try:
# Extract text from PDF file
cleaned_data["document_data"] = GetTextFromPDF(uploaded_file).to_text()

except GetTextError:
return self.add_error(
"uploaded_file", _("The uploaded file is corrupted or not fully downloaded.")
)

cleaned_data["mime_type"] = file_type

uploaded_file.seek(0) # Reset file pointer after read
limit = 10 * 1024 * 1024
if uploaded_file.size and uploaded_file.size > limit:
self.add_error("uploaded_file", _("File size must not exceed 10MB."))
if not self.has_error("uploaded_file"):
# Don't parse if validation above failed
try:
cleaned_data["document_data"] = pdf_to_text(uploaded_file)
except GetTextError:
return self.add_error(
"uploaded_file",
_("The uploaded file is corrupted or not fully downloaded."),
)
uploaded_file.seek(0) # Reset file pointer after read

if not url and not uploaded_file:
self.add_error("url", _("Either URL or uploaded file must be provided."))
self.add_error("uploaded_file", _("Either URL or uploaded file must be provided."))

if uploaded_file:
limit = 10 * 1024 * 1024
if uploaded_file.size and uploaded_file.size > limit:
self.add_error("uploaded_file", _("File size must not exceed 10MB."))

return cleaned_data


Expand Down
109 changes: 0 additions & 109 deletions app/general/management/commands/dev_pdf_mass_upload.py

This file was deleted.

54 changes: 54 additions & 0 deletions app/general/management/commands/import_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# TODO:
# - Provide better command-line parameters for control, e.g.
# - import for given institution
# - associate with specific language(s)/subject(s)
# - make usable outside Docker

import os
import random

import magic
from django.core.files.base import ContentFile
from django.core.management.base import BaseCommand

from general.models import DocumentFile
from general.service.extract_text import GetTextError, pdf_to_text


class Command(BaseCommand):
    """Management command that imports PDF files from a directory tree.

    Each file with a ``.pdf`` extension whose detected MIME type is
    ``application/pdf`` is stored as a ``DocumentFile`` together with the
    text extracted from it by ``pdf_to_text``.
    """

    help = "Mass PDF uploader for testing purposes"

    def add_arguments(self, parser):
        """Register the positional ``directory`` argument."""
        parser.add_argument("directory", help="Directory with files to import")

    def handle(self, *args, **options):
        """Walk ``options["directory"]`` and process every PDF-named file."""
        for root, _dirs, files in os.walk(options["directory"]):
            for file in files:
                # Compare case-insensitively so that e.g. "SCAN.PDF" is not
                # silently skipped.
                if os.path.splitext(file)[1].lower() != ".pdf":
                    continue
                self.handle_file(os.path.join(root, file), file)

    def handle_file(self, file_path, file_name):
        """Import one file if its content is detected as a PDF.

        Args:
            file_path: Path of the file on disk.
            file_name: Base name of the file; echoed and used as the title.
        """
        # Use the command's own streams (not print) so that output can be
        # captured or redirected via call_command(stdout=..., stderr=...).
        self.stdout.write(file_name)
        file_type = magic.from_file(file_path, mime=True)
        if file_type == "application/pdf":
            self.save_data(file_path, file_name)
        else:
            self.stderr.write("Only PDF files are allowed")

    def save_data(self, file_path, file_name):
        """Create and save a ``DocumentFile`` for the PDF at ``file_path``.

        A ``GetTextError`` raised during text extraction is reported and the
        file is skipped; the import of the remaining files continues.
        """
        with open(file_path, "rb") as f:
            content_file = ContentFile(f.read(), name=file_name)

        try:
            instance = DocumentFile(
                title=file_name,
                document_data=pdf_to_text(file_path),
                uploaded_file=content_file,
                document_type="Glossary",
                # NOTE(review): presumably institutions with ids 1-20 exist in
                # the target database (test fixtures?) — confirm.
                institution_id=random.randint(1, 20),
            )
            instance.save()
        except GetTextError as e:
            self.stderr.write(f"Error: {e}")
43 changes: 22 additions & 21 deletions app/general/service/extract_text.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,30 @@
from pypdf import PdfReader
from pypdf.errors import PdfStreamError
# TODO:
# - remove unneeded whitespace (e.g. multiple consecutive spaces)
# - remove unprintable characters, or replace them with some symbol
# character so that excerpts look better.
# - consider removing a few too common words, like single digits "1", etc.
# or maybe anything that occurs too frequently in the full-text index that
# could cause a full-table scan.
# - consider multilingual stemming to enhance chances of success in a multi-
# lingual setup... hard!


class GetTextError(Exception):
pass


class GetTextFromPDF:
def __init__(self, uploaded_file):
self.uploaded_file = uploaded_file
def pdf_to_text(pdf):
# imports postponed, as they will normally not be needed frequently
from pypdf import PdfReader
from pypdf.errors import PdfStreamError

def to_text(self):
if self.uploaded_file:
text_list = []
# Read the PDF file and extract text
try:
reader = PdfReader(self.uploaded_file)
for page in reader.pages:
text_list.append(page.extract_text())
text_list = []
try:
for page in PdfReader(pdf).pages:
text_list.append(page.extract_text())
except PdfStreamError:
raise GetTextError("The uploaded PDF file is corrupted or not fully downloaded.")
except Exception:
raise GetTextError("Error during text extraction from PDF file.")

get_pdf_text = " ".join(text_list)

return str(get_pdf_text)

except PdfStreamError:
raise GetTextError("The uploaded PDF file is corrupted or not fully downloaded.")
except Exception:
raise GetTextError("Error during text extraction from PDF file.")
return " ".join(text_list)
69 changes: 0 additions & 69 deletions app/general/tests/test_dev_mass_upload.py

This file was deleted.

32 changes: 10 additions & 22 deletions app/general/tests/test_extract_text_service.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,18 @@
import os
import unittest

from general.service.extract_text import GetTextFromPDF
from general.service.extract_text import pdf_to_text


class TestExtractTextService(unittest.TestCase):
def setUp(self):
test_dir = os.getenv("TESTING_DIR", "/app/general/tests/files")
self.file_mock = test_dir + "/Lorem.pdf"

def test_in_text(self):
with open(self.file_mock, "rb") as file:
pypdf = GetTextFromPDF(file)

result = pypdf.to_text().strip()

words = result.split()

self.assertIn("turpis.", words)

def test_not_in_text(self):
with open(self.file_mock, "rb") as file:
pypdf = GetTextFromPDF(file)

result = pypdf.to_text().strip()

words = result.split()

self.assertNotIn("notintext.", words)
self.file_name = os.path.join(test_dir, "Lorem.pdf")

def test_text_extraction(self):
with open(self.file_name, "rb") as file:
text = pdf_to_text(file)
self.assertIn("fermentum turpis.", text)
self.assertNotIn("notintext.", text)
self.assertGreater(len(text), 1470, "Too little text extracted")
self.assertGreater(len(text.split()), 220, "Too few words (spaces) extracted")
Loading

0 comments on commit b0b2f02

Please sign in to comment.