From 35a7c8bb05b3b8d11a39454154b5e54fc2d71cac Mon Sep 17 00:00:00 2001
From: Stuart Owen <stuart.owen@manchester.ac.uk>
Date: Thu, 11 Apr 2024 15:59:42 +0100
Subject: [PATCH] remove the double checking and updating of the mime type
 #1820

---
 lib/seek/content_extraction.rb | 32 ++------------------
 test/unit/content_blob_test.rb | 55 ----------------------------------
 2 files changed, 2 insertions(+), 85 deletions(-)

diff --git a/lib/seek/content_extraction.rb b/lib/seek/content_extraction.rb
index b5e4190731..5168e878d8 100644
--- a/lib/seek/content_extraction.rb
+++ b/lib/seek/content_extraction.rb
@@ -69,7 +69,6 @@ def extract_text_from_pdf
         Docsplit.extract_text(pdf_filepath, output: converted_storage_directory) unless File.exist?(txt_filepath)
         File.read(txt_filepath)
       rescue Docsplit::ExtractionFailed => e
-        extract_text_from_pdf if double_check_mime_type
         Rails.logger.error("Problem with extracting text from pdf #{id} #{e}")
         ''
       end
@@ -78,11 +77,7 @@ def extract_text_from_pdf
 
     def to_csv(sheet = 1, trim = false)
       return '' unless is_excel?
-      begin
-        spreadsheet_to_csv(filepath, sheet, trim, Seek::Config.jvm_memory_allocation)
-      rescue SysMODB::SpreadsheetExtractionException
-        to_csv(sheet, trim) if double_check_mime_type
-      end
+      spreadsheet_to_csv(filepath, sheet, trim, Seek::Config.jvm_memory_allocation)
     end
 
     def extract_csv()
@@ -90,34 +85,11 @@ def extract_csv()
     end
 
     def to_spreadsheet_xml
-      begin
-        spreadsheet_to_xml(filepath, Seek::Config.jvm_memory_allocation)
-      rescue SysMODB::SpreadsheetExtractionException=>e
-        if double_check_mime_type
-          to_spreadsheet_xml
-        else
-          raise e
-        end
-      end
+      spreadsheet_to_xml(filepath, Seek::Config.jvm_memory_allocation)
     end
 
     private
 
-    # checks the type using mime magic, and updates if found to be different. This is to help cases where extraction
-    # fails due to the mime type being incorrectly set
-    #
-    # @return boolean - the mime type was changed
-    def double_check_mime_type
-      suggested_type = mime_magic_content_type
-      if suggested_type && suggested_type != content_type
-        update_column(:content_type, suggested_type)
-        true
-      else
-        false
-      end
-    end
-
-
     # filters special characters, keeping alphanumeric characters, hyphen ('-'), underscore('_') and newlines
     def filter_text_content(content)
       content.gsub(/[^-_0-9a-z \n]/i, ' ')
diff --git a/test/unit/content_blob_test.rb b/test/unit/content_blob_test.rb
index f92574fc13..111a1cafd7 100644
--- a/test/unit/content_blob_test.rb
+++ b/test/unit/content_blob_test.rb
@@ -900,61 +900,6 @@ def test_exception_when_both_data_and_io_object
     refute File.exist?(txt_path)
   end
 
-  test 'fix mime type after failed csv extraction' do
-    blob = FactoryBot.create(:image_content_blob, content_type:'application/excel', original_filename:'image.xls')
-    assert blob.is_excel?
-
-    text = blob.to_csv
-
-    assert text.blank?
-
-    blob.reload
-
-    refute blob.is_excel?
-    assert_equal 'image/png',blob.content_type
-  end
-
-  test 'fix mime type after failed pdf contents for search' do
-    blob = FactoryBot.create(:image_content_blob, content_type: 'application/msword', original_filename: 'image.doc')
-    assert blob.is_pdf_convertable?
-
-    assert_empty blob.pdf_contents_for_search
-
-    blob.reload
-
-    refute blob.is_pdf_convertable?
-    assert_equal 'image/png', blob.content_type
-
-    # incorrectly described as pdf
-    blob = FactoryBot.create(:image_content_blob, content_type: 'application/pdf', original_filename: 'image.pdf')
-
-    assert_empty blob.pdf_contents_for_search
-
-    blob.reload
-
-    refute blob.is_pdf_convertable?
-    assert_equal 'image/png', blob.content_type
-
-    # handles when the file is actually broken, rather than failing due to the mime type
-    blob = FactoryBot.create(:broken_pdf_content_blob)
-    assert_empty blob.pdf_contents_for_search
-    assert_equal 'application/pdf', blob.content_type
-  end
-
-  test 'fix mime type after spreadsheet xml fail' do
-    blob = FactoryBot.create(:image_content_blob, content_type:'application/msexcel', original_filename:'image.xls')
-    assert blob.is_extractable_spreadsheet?
-
-    assert_raises(SysMODB::SpreadsheetExtractionException) do
-      blob.to_spreadsheet_xml
-    end
-
-    blob.reload
-
-    refute blob.is_extractable_spreadsheet?
-    assert_equal 'image/png',blob.content_type
-  end
-
   test 'tmp_io_objects in tmp dir are deleted' do
     file = Tempfile.new('testing-content-blob')
     file.write('test test test')