Skip to content

Commit

Permalink
remove the double checking and updating of the mime type seek4science…
Browse files Browse the repository at this point in the history
  • Loading branch information
stuzart committed Apr 16, 2024
1 parent e8e0dee commit 35a7c8b
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 85 deletions.
32 changes: 2 additions & 30 deletions lib/seek/content_extraction.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ def extract_text_from_pdf
Docsplit.extract_text(pdf_filepath, output: converted_storage_directory) unless File.exist?(txt_filepath)
File.read(txt_filepath)
rescue Docsplit::ExtractionFailed => e
extract_text_from_pdf if double_check_mime_type
Rails.logger.error("Problem with extracting text from pdf #{id} #{e}")
''
end
Expand All @@ -78,46 +77,19 @@ def extract_text_from_pdf

# Returns '' unless is_excel?; otherwise hands filepath, sheet, trim and
# Seek::Config.jvm_memory_allocation to spreadsheet_to_csv.
#
# sheet - passed straight through to spreadsheet_to_csv (defaults to 1)
# trim  - passed straight through to spreadsheet_to_csv (defaults to false)
#
# NOTE(review): this text looks like a diff view without +/- markers, so the begin/rescue wrapper and the
# bare spreadsheet_to_csv call after it are probably the removed and added versions of the same statement
# rather than code meant to run together - confirm against the repository.
def to_csv(sheet = 1, trim = false)
return '' unless is_excel?
begin
spreadsheet_to_csv(filepath, sheet, trim, Seek::Config.jvm_memory_allocation)
rescue SysMODB::SpreadsheetExtractionException
# calls to_csv again only when double_check_mime_type returns a truthy value; otherwise the exception is swallowed
to_csv(sheet, trim) if double_check_mime_type
end
spreadsheet_to_csv(filepath, sheet, trim, Seek::Config.jvm_memory_allocation)
end

# Reads the file at filepath and returns its contents as-is; no parsing or filtering is applied here.
#
# @return String - the raw contents of the file at filepath
# @raise SystemCallError - propagated from File.read when the file cannot be read (e.g. Errno::ENOENT)
def extract_csv
  File.read(filepath)
end

# Hands filepath and Seek::Config.jvm_memory_allocation to spreadsheet_to_xml.
#
# NOTE(review): this text looks like a diff view without +/- markers, so the begin/rescue wrapper and the
# bare spreadsheet_to_xml call after it are probably the removed and added versions of the same statement
# rather than code meant to run together - confirm against the repository.
def to_spreadsheet_xml
begin
spreadsheet_to_xml(filepath, Seek::Config.jvm_memory_allocation)
rescue SysMODB::SpreadsheetExtractionException=>e
# calls to_spreadsheet_xml again when double_check_mime_type returns a truthy value; otherwise the
# original exception is re-raised to the caller
if double_check_mime_type
to_spreadsheet_xml
else
raise e
end
end
spreadsheet_to_xml(filepath, Seek::Config.jvm_memory_allocation)
end

private

# asks mime magic for the content type and, when it gives an answer that differs from the stored content_type,
# writes that answer to the content_type column. Intended for cases where extraction fails because the
# mime type was set incorrectly
#
# @return boolean - true when the content_type column was updated, false otherwise
def double_check_mime_type
  detected_type = mime_magic_content_type
  # nothing to correct when mime magic gives no answer, or agrees with what is already stored
  return false unless detected_type
  return false if detected_type == content_type

  update_column(:content_type, detected_type)
  true
end


# filters special characters, keeping alphanumeric characters, hyphen ('-'), underscore('_') and newlines
def filter_text_content(content)
content.gsub(/[^-_0-9a-z \n]/i, ' ')
Expand Down
55 changes: 0 additions & 55 deletions test/unit/content_blob_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -900,61 +900,6 @@ def test_exception_when_both_data_and_io_object
refute File.exist?(txt_path)
end

# A blob built from the image factory but labelled as application/excel: to_csv is expected to come back
# blank, and after a reload the stored content type is expected to be image/png.
test 'fix mime type after failed csv extraction' do
  mislabelled = FactoryBot.create(:image_content_blob,
                                  content_type: 'application/excel',
                                  original_filename: 'image.xls')
  assert mislabelled.is_excel?

  assert mislabelled.to_csv.blank?

  # pick up whatever was written to the database during the extraction attempt
  mislabelled.reload

  refute mislabelled.is_excel?
  assert_equal 'image/png', mislabelled.content_type
end

# Covers three blobs in turn: an image factory blob labelled as msword, one labelled as pdf, and the
# broken pdf factory blob. The first two are expected to end up with content type image/png; the broken
# pdf is expected to keep application/pdf.
test 'fix mime type after failed pdf contents for search' do
  # incorrectly described as a word document
  word_labelled = FactoryBot.create(:image_content_blob,
                                    content_type: 'application/msword',
                                    original_filename: 'image.doc')
  assert word_labelled.is_pdf_convertable?

  assert_empty word_labelled.pdf_contents_for_search

  word_labelled.reload

  refute word_labelled.is_pdf_convertable?
  assert_equal 'image/png', word_labelled.content_type

  # incorrectly described as pdf
  pdf_labelled = FactoryBot.create(:image_content_blob,
                                   content_type: 'application/pdf',
                                   original_filename: 'image.pdf')

  assert_empty pdf_labelled.pdf_contents_for_search

  pdf_labelled.reload

  refute pdf_labelled.is_pdf_convertable?
  assert_equal 'image/png', pdf_labelled.content_type

  # handles when the file is actually broken, rather than failing due to the mime type
  broken_pdf = FactoryBot.create(:broken_pdf_content_blob)
  assert_empty broken_pdf.pdf_contents_for_search
  assert_equal 'application/pdf', broken_pdf.content_type
end

# A blob built from the image factory but labelled as application/msexcel: to_spreadsheet_xml is expected
# to raise, and after a reload the stored content type is expected to be image/png.
test 'fix mime type after spreadsheet xml fail' do
  mislabelled = FactoryBot.create(:image_content_blob,
                                  content_type: 'application/msexcel',
                                  original_filename: 'image.xls')
  assert mislabelled.is_extractable_spreadsheet?

  assert_raises(SysMODB::SpreadsheetExtractionException) { mislabelled.to_spreadsheet_xml }

  # pick up whatever was written to the database during the failed extraction
  mislabelled.reload

  refute mislabelled.is_extractable_spreadsheet?
  assert_equal 'image/png', mislabelled.content_type
end

test 'tmp_io_objects in tmp dir are deleted' do
file = Tempfile.new('testing-content-blob')
file.write('test test test')
Expand Down

0 comments on commit 35a7c8b

Please sign in to comment.