From b42b2df885d1fee10c1fc273e529074b892771c4 Mon Sep 17 00:00:00 2001 From: conorom Date: Tue, 7 Jun 2022 18:22:56 -0400 Subject: [PATCH] CharacterizeJob non-ASCII filename bug --- app/jobs/characterize_job.rb | 2 +- spec/jobs/characterize_job_spec.rb | 26 +++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/app/jobs/characterize_job.rb b/app/jobs/characterize_job.rb index 4ad91fff9c..e10e4848fa 100644 --- a/app/jobs/characterize_job.rb +++ b/app/jobs/characterize_job.rb @@ -62,7 +62,7 @@ def characterize(file_set, _file_id, filepath) # rubocop:disable Metrics/AbcSize file_set.date_modified = Hyrax::TimeService.time_in_utc if file_set.characterization_proxy.original_checksum.first != previous_checksum # set title to label if that's how it was before this characterization - file_set.title = [file_set.characterization_proxy.original_name] if reset_title + file_set.title = [file_set.characterization_proxy.original_name.force_encoding("UTF-8")] if reset_title # always set the label to the original_name file_set.label = file_set.characterization_proxy.original_name diff --git a/spec/jobs/characterize_job_spec.rb b/spec/jobs/characterize_job_spec.rb index 45575904f4..5da91804e3 100644 --- a/spec/jobs/characterize_job_spec.rb +++ b/spec/jobs/characterize_job_spec.rb @@ -92,12 +92,13 @@ allow(file_set).to receive(:characterization_proxy).and_call_original end - context 'title and label were the previously the same' do + context 'title and label were previously the same' do let(:title) { ['old_filename.jpg'] } let(:label) { 'old_filename.jpg' } before do - allow(file_set).to receive_message_chain(:characterization_proxy, :original_name).and_return('new_filename.jpg') # rubocop:disable RSpec/MessageChain + allow(file_set).to receive_message_chain(:characterization_proxy, :original_name) + .and_return(String.new('new_filename.jpg', encoding: 'ASCII-8BIT')) # rubocop:disable RSpec/MessageChain end it 'sets title to label' do @@ -107,14 +108,33 @@ expect(file_set.title).to eq ['new_filename.jpg'] expect(file_set.label).to eq 'new_filename.jpg' end + + # https://github.com/samvera/hyrax/issues/5671 + context 'original_name, which has encoding set to ASCII-8BIT, contains non-ASCII characters' do + before do + allow(file_set).to receive_message_chain(:characterization_proxy, :original_name) + .and_return(String.new('ファイル.txt', encoding: 'ASCII-8BIT')) # rubocop:disable RSpec/MessageChain + end + + it 'does not raise an error, and still sets title to label' do + expect(file).to receive(:save!) + expect(file_set).to receive(:update_index) + expect { described_class.perform_now(file_set, file.id) } + .not_to raise_error(Encoding::UndefinedConversionError, '"\xE3" from ASCII-8BIT to UTF-8') + expect(file_set.title).to eq ['ファイル.txt'] + expect(file_set.label).to eq 'ファイル.txt' + end + end end context 'title and label were not previously the same' do let(:title) { ['My User-Entered Title'] } let(:label) { 'old_filename.jpg' } + let(:original_name) { 'new_filename.jpg' } before do - allow(file_set).to receive_message_chain(:characterization_proxy, :original_name).and_return('new_filename.jpg') # rubocop:disable RSpec/MessageChain + allow(file_set).to receive_message_chain(:characterization_proxy, :original_name) + .and_return(String.new('new_filename.jpg', encoding: 'ASCII-8BIT')) # rubocop:disable RSpec/MessageChain end it 'assumes a user-entered title value and leaves title as-is' do