From 35a7c8bb05b3b8d11a39454154b5e54fc2d71cac Mon Sep 17 00:00:00 2001 From: Stuart Owen Date: Thu, 11 Apr 2024 15:59:42 +0100 Subject: [PATCH] remove the double checking and updating of the mime type #1820 --- lib/seek/content_extraction.rb | 32 ++------------------ test/unit/content_blob_test.rb | 55 ---------------------------------- 2 files changed, 2 insertions(+), 85 deletions(-) diff --git a/lib/seek/content_extraction.rb b/lib/seek/content_extraction.rb index b5e4190731..5168e878d8 100644 --- a/lib/seek/content_extraction.rb +++ b/lib/seek/content_extraction.rb @@ -69,7 +69,6 @@ def extract_text_from_pdf Docsplit.extract_text(pdf_filepath, output: converted_storage_directory) unless File.exist?(txt_filepath) File.read(txt_filepath) rescue Docsplit::ExtractionFailed => e - extract_text_from_pdf if double_check_mime_type Rails.logger.error("Problem with extracting text from pdf #{id} #{e}") '' end @@ -78,11 +77,7 @@ def extract_text_from_pdf def to_csv(sheet = 1, trim = false) return '' unless is_excel? - begin - spreadsheet_to_csv(filepath, sheet, trim, Seek::Config.jvm_memory_allocation) - rescue SysMODB::SpreadsheetExtractionException - to_csv(sheet, trim) if double_check_mime_type - end + spreadsheet_to_csv(filepath, sheet, trim, Seek::Config.jvm_memory_allocation) end def extract_csv() @@ -90,34 +85,11 @@ def extract_csv() end def to_spreadsheet_xml - begin - spreadsheet_to_xml(filepath, Seek::Config.jvm_memory_allocation) - rescue SysMODB::SpreadsheetExtractionException=>e - if double_check_mime_type - to_spreadsheet_xml - else - raise e - end - end + spreadsheet_to_xml(filepath, Seek::Config.jvm_memory_allocation) end private - # checks the type using mime magic, and updates if found to be different. This is to help cases where extraction - # fails due to the mime type being incorrectly set - # - # @return boolean - the mime type was changed - def double_check_mime_type - suggested_type = mime_magic_content_type - if suggested_type && suggested_type != content_type - update_column(:content_type, suggested_type) - true - else - false - end - end - - # filters special characters, keeping alphanumeric characters, hyphen ('-'), underscore('_') and newlines def filter_text_content(content) content.gsub(/[^-_0-9a-z \n]/i, ' ') diff --git a/test/unit/content_blob_test.rb b/test/unit/content_blob_test.rb index f92574fc13..111a1cafd7 100644 --- a/test/unit/content_blob_test.rb +++ b/test/unit/content_blob_test.rb @@ -900,61 +900,6 @@ def test_exception_when_both_data_and_io_object refute File.exist?(txt_path) end - test 'fix mime type after failed csv extraction' do - blob = FactoryBot.create(:image_content_blob, content_type:'application/excel', original_filename:'image.xls') - assert blob.is_excel? - - text = blob.to_csv - - assert text.blank? - - blob.reload - - refute blob.is_excel? - assert_equal 'image/png',blob.content_type - end - - test 'fix mime type after failed pdf contents for search' do - blob = FactoryBot.create(:image_content_blob, content_type: 'application/msword', original_filename: 'image.doc') - assert blob.is_pdf_convertable? - - assert_empty blob.pdf_contents_for_search - - blob.reload - - refute blob.is_pdf_convertable? - assert_equal 'image/png', blob.content_type - - # incorrectly described as pdf - blob = FactoryBot.create(:image_content_blob, content_type: 'application/pdf', original_filename: 'image.pdf') - - assert_empty blob.pdf_contents_for_search - - blob.reload - - refute blob.is_pdf_convertable? - assert_equal 'image/png', blob.content_type - - # handles when the file is actually broken, rather than failing due to the mime type - blob = FactoryBot.create(:broken_pdf_content_blob) - assert_empty blob.pdf_contents_for_search - assert_equal 'application/pdf', blob.content_type - end - - test 'fix mime type after spreadsheet xml fail' do - blob = FactoryBot.create(:image_content_blob, content_type:'application/msexcel', original_filename:'image.xls') - assert blob.is_extractable_spreadsheet? - - assert_raises(SysMODB::SpreadsheetExtractionException) do - blob.to_spreadsheet_xml - end - - blob.reload - - refute blob.is_extractable_spreadsheet? - assert_equal 'image/png',blob.content_type - end - test 'tmp_io_objects in tmp dir are deleted' do file = Tempfile.new('testing-content-blob') file.write('test test test')