From 28bdd74288602b955983542cd3c7acde3930532d Mon Sep 17 00:00:00 2001 From: Andrew Wallace Date: Thu, 18 Jul 2019 11:30:55 -0700 Subject: [PATCH] Make ingest metadata-only. - doesn't pass `remote-files` to the hyrax actor stack, so that no files will be imported into californica - removes customized characterization logic - removes command-line fits from Dockerfile and fits servlet from docker-compose.yml --- .rubocop.yml | 1 - .travis.yml | 4 - Dockerfile | 7 -- app/importers/californica_mapper.rb | 47 +------ app/jobs/hyrax/characterize_job.rb | 32 ----- app/lib/californica/corrupt_file_error.rb | 33 ----- app/lib/californica/is_valid_image.rb | 16 --- .../initializers/characterization_service.rb | 117 ------------------ docker-compose.yml | 6 - docker.env | 2 - spec/importers/californica_importer_spec.rb | 14 --- spec/importers/californica_mapper_spec.rb | 55 +------- spec/jobs/hyrax/characterize_job_spec.rb | 36 ------ spec/lib/californica/is_valid_image_spec.rb | 19 --- spec/rails_helper.rb | 1 - spec/system/import_and_show_work_spec.rb | 5 - 16 files changed, 3 insertions(+), 392 deletions(-) delete mode 100644 app/jobs/hyrax/characterize_job.rb delete mode 100644 app/lib/californica/corrupt_file_error.rb delete mode 100644 app/lib/californica/is_valid_image.rb delete mode 100644 config/initializers/characterization_service.rb delete mode 100644 spec/jobs/hyrax/characterize_job_spec.rb delete mode 100644 spec/lib/californica/is_valid_image_spec.rb diff --git a/.rubocop.yml b/.rubocop.yml index 2ad2ea33..4d0c2507 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -4,7 +4,6 @@ inherit_gem: AllCops: TargetRubyVersion: 2.4 Exclude: - - 'fits-*/**/**' - 'db/**/*.rb' - 'vendor/**/*' - 'tmp/**/*' diff --git a/.travis.yml b/.travis.yml index 274ae2bd..890c1b4e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,10 +25,6 @@ env: before_install: - gem update --system - gem install bundler:1.17.3 - - curl -sLO https://projects.iq.harvard.edu/files/fits/files/fits-0.8.6_1.zip - - unzip fits-0.8.6_1.zip - - chmod +x fits-0.8.6/fits.sh - - export PATH="$(pwd)/fits-0.8.6:$PATH" before_script: - bundle exec rake db:create - ln --symbolic /usr/lib/chromium-browser/chromedriver "${HOME}/bin/chromedriver" diff --git a/Dockerfile b/Dockerfile index d3de731b..bcce6f71 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,5 @@ FROM ruby:2.5 -# Install fits -RUN mkdir /fits -WORKDIR /fits -ADD https://github.com/harvard-lts/fits/releases/download/1.4.0/fits-1.4.0.zip /fits/ -RUN unzip fits-1.4.0.zip -d /fits -ENV PATH "/fits:$PATH" - RUN apt-get update -qq # Add https support to apt to download yarn & newer node RUN apt-get install -y apt-transport-https diff --git a/app/importers/californica_mapper.rb b/app/importers/californica_mapper.rb index dbf5d412..c37c712b 100644 --- a/app/importers/californica_mapper.rb +++ b/app/importers/californica_mapper.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true class CalifornicaMapper < Darlingtonia::HashMapper - attr_reader :missing_file_log, :import_file_path, :row_number + attr_reader :row_number CALIFORNICA_TERMS_MAP = { alternative_title: ["AltTitle.other", @@ -57,8 +57,6 @@ class CalifornicaMapper < Darlingtonia::HashMapper DELIMITER = '|~|' def initialize(attributes = {}) - @missing_file_log = ENV['MISSING_FILE_LOG'] || "#{::Rails.root}/log/missing_files_#{Rails.env}" - @import_file_path = attributes[:import_file_path] || ENV['IMPORT_FILE_PATH'] || '/opt/data' @row_number = attributes[:row_number] super() end @@ -76,7 +74,7 @@ def self.required_headers def fields # The fields common to all object types - common_fields = CALIFORNICA_TERMS_MAP.keys + [:remote_files, :visibility, :member_of_collections_attributes, :license] + common_fields = CALIFORNICA_TERMS_MAP.keys + [:visibility, :member_of_collections_attributes, :license] # Pages additionally have a field :in_works_ids, which defines their parent work return common_fields + [:in_works_ids] if ['ChildWork', 'Page'].include?(metadata["Object Type"]) common_fields @@ -90,33 +88,6 @@ def collection? object_type&.downcase&.chomp == 'collection' end - ## - # Take a filename and: - # 1) Check that it exists. Log it to a missing files log if it doesn't. - # 2) Turn the filename into a file:// url - # 3) Pass it to the actor stack in the remote_files param. This means that - # it will be processed by the CreateWithRemoteFilesActor - # Using the remote_files param to ingest local files is misleading. - # However, it lets us fetch the file from disk in a background job - # instead of creating a Hyrax::UploadedFile object while the CSV is - # being parsed, which gives us a performance advantage. - def remote_files - return [] if collection? - if metadata['File Name'].nil? - File.open(@missing_file_log, 'a') { |file| file.puts "Work #{ark} is missing a filename" } - return [] - end - file_name = file_uri_base_path.join(master_file_path).to_s - file_exists = File.exist?(file_name) - return_value = [] - if file_exists - return_value = [{ url: file_uri_for(name: metadata['File Name']) }] - else - File.open(@missing_file_log, 'a') { |file| file.puts "Work #{ark} has an invalid file: #{file_name} not found" } - end - return_value - end - def visibility value_from_csv = metadata['Visibility']&.squish&.downcase visibility_mapping.fetch(value_from_csv, Hydra::AccessControls::AccessRight::VISIBILITY_TEXT_VALUE_PUBLIC) @@ -284,18 +255,4 @@ def in_works_ids record_page_sequence [parent_work.id] end - - private - - def file_uri_for(name:) - uri = URI('file:///') - uri.path = file_uri_base_path.join(name).to_s - uri.to_s - end - - # Prefer the import_file_path that's been explicitly passed to this instance of CalifornicaMapper - # if it exists. - def file_uri_base_path - Pathname.new(@import_file_path) - end end diff --git a/app/jobs/hyrax/characterize_job.rb b/app/jobs/hyrax/characterize_job.rb deleted file mode 100644 index 6bd7d1c6..00000000 --- a/app/jobs/hyrax/characterize_job.rb +++ /dev/null @@ -1,32 +0,0 @@ -# frozen_string_literal: true -# Local override of the class at https://github.com/samvera/hyrax/blob/master/app/jobs/characterize_job.rb -# We are overriding this locally so we can improve and customize the error logging. -class Hyrax::CharacterizeJob < Hyrax::ApplicationJob - queue_as Hyrax.config.ingest_queue_name - - # Characterizes the file at 'filepath' if available, otherwise, pulls a copy from the repository - # and runs characterization on that file. - # @param [FileSet] file_set - # @param [String] file_id identifier for a Hydra::PCDM::File - # @param [String, NilClass] filepath the cached file within the Hyrax.config.working_path - def perform(file_set, file_id, filepath = nil) - raise "#{file_set.class.characterization_proxy} was not found for FileSet #{file_set.id}" unless file_set.characterization_proxy? - filepath = Hyrax::WorkingDirectory.find_or_retrieve(file_id, file_set.id) unless filepath && File.exist?(filepath) - Hydra::Works::CharacterizationService.run(file_set.characterization_proxy, filepath) - Rails.logger.debug "Ran characterization on #{file_set.characterization_proxy.id} (#{file_set.characterization_proxy.mime_type})" - file_set.characterization_proxy.save! - file_set.update_index - file_set.parent&.in_collections&.each(&:update_index) - - # Check that MiniMagick agrees that this is a TIFF - if Californica::IsValidImage.new(image: filepath).valid? - # Continue to create derivatives - CreateDerivativesJob.perform_later(file_set, file_id, filepath) - else - # If there is a MiniMagick error, record an error and don't continue to - # create derivatives for that file - error = Californica::CorruptFileError.new(file_set_id: file_set.id, mime_type: file_set.characterization_proxy.mime_type) - Rails.logger.error error.message - end - end -end diff --git a/app/lib/californica/corrupt_file_error.rb b/app/lib/californica/corrupt_file_error.rb deleted file mode 100644 index 2f4d14bf..00000000 --- a/app/lib/californica/corrupt_file_error.rb +++ /dev/null @@ -1,33 +0,0 @@ -# frozen_string_literal: true -# A specialized error that can be used to write actionable logs about failed file characterization. -# Eventually, we might also use this to create a report of corrupt files. -module Californica - class CorruptFileError < StandardError - def initialize(file_set_id:, mime_type:) - @file_set_id = file_set_id - @mime_type = mime_type - end - - def ark - FileSet.find(@file_set_id).parent.ark - rescue - "Unknown ark" - end - - def work_id - FileSet.find(@file_set_id).parent.id - rescue - "Unknown work_id" - end - - def import_url - FileSet.find(@file_set_id).import_url - rescue - "Unknown import path" - end - - def message - "event: unexpected file characterization: ark: #{ark}, work_id: #{work_id}, import_url: #{import_url}, mime_type: #{@mime_type}" - end - end -end diff --git a/app/lib/californica/is_valid_image.rb b/app/lib/californica/is_valid_image.rb deleted file mode 100644 index ac7386dd..00000000 --- a/app/lib/californica/is_valid_image.rb +++ /dev/null @@ -1,16 +0,0 @@ -# frozen_string_literal: true - -module Californica - class IsValidImage - def initialize(image:) - @image = image - end - - def valid? - MiniMagick::Image.new(@image).identify - return true - rescue MiniMagick::Error - return false - end - end -end diff --git a/config/initializers/characterization_service.rb b/config/initializers/characterization_service.rb deleted file mode 100644 index b2650e70..00000000 --- a/config/initializers/characterization_service.rb +++ /dev/null @@ -1,117 +0,0 @@ -# frozen_string_literal: true -require 'hydra-file_characterization' -require 'nokogiri' - -module Hydra::Works - class CharacterizationService - # @param [Hydra::PCDM::File] object which has properties to recieve characterization values. - # @param [String, File] source for characterization to be run on. File object or path on disk. - # If none is provided, it will assume the binary content already present on the object. - # @param [Hash] options to be passed to characterization. parser_mapping:, parser_class:, tools: - def self.run(object, source = nil, options = {}) - new(object, source, options).characterize - end - - attr_accessor :object, :source, :mapping, :parser_class, :tools - - def initialize(object, source, options) - @object = object - @source = source - @mapping = options.fetch(:parser_mapping, Hydra::Works::Characterization.mapper) - @parser_class = options.fetch(:parser_class, Hydra::Works::Characterization::FitsDocument) - end - - # Get given source into form that can be passed to Hydra::FileCharacterization - # Use Hydra::FileCharacterization to extract metadata (an OM XML document) - # Get the terms (and their values) from the extracted metadata - # Assign the values of the terms to the properties of the object - def characterize - content = source_to_content - extracted_md = extract_metadata(content) - terms = parse_metadata(extracted_md) - store_metadata(terms) - end - - protected - - # @return content of object if source is nil; otherwise, return a File or the source - def source_to_content - return object.content if source.nil? - # do not read the file into memory It could be huge... - return File.open(source) if source.is_a? String - source.rewind - source.read - end - - def extract_metadata(content) - if ENV['FITS_SERVLET_URL'] - Hydra::FileCharacterization.characterize(content, file_name, :fits_servlet) - else - Hydra::FileCharacterization.characterize(content, file_name, :fits) do |cfg| - cfg[:fits] = Hydra::Derivatives.fits_path - end - end - end - - # Determine the filename to send to Hydra::FileCharacterization. If no source is present, - # use the name of the file from the object; otherwise, use the supplied source. - def file_name - if source - source.is_a?(File) ? File.basename(source.path) : File.basename(source) - else - object.original_name.nil? ? "original_file" : object.original_name - end - end - - # Use OM to parse metadata - def parse_metadata(metadata) - omdoc = parser_class.new - omdoc.ng_xml = Nokogiri::XML(metadata) if metadata.present? - omdoc.__cleanup__ if omdoc.respond_to? :__cleanup__ - characterization_terms(omdoc) - end - - # Get proxy terms and values from the parser - def characterization_terms(omdoc) - h = {} - omdoc.class.terminology.terms.each_pair do |key, target| - # a key is a proxy if its target responds to proxied_term - next unless target.respond_to? :proxied_term - begin - h[key] = omdoc.send(key) - rescue NoMethodError - next - end - end - h.delete_if { |_k, v| v.empty? } - end - - # Assign values of the instance properties from the metadata mapping :prop => val - def store_metadata(terms) - terms.each_pair do |term, value| - property = property_for(term) - next if property.nil? - # Array-ify the value to avoid a conditional here - Array(value).each { |v| append_property_value(property, v) } - end - end - - # Check parser_config then self for matching term. - # Return property symbol or nil - def property_for(term) - if mapping.key?(term) && object.respond_to?(mapping[term]) - mapping[term] - elsif object.respond_to?(term) - term - end - end - - def append_property_value(property, value) - # We don't want multiple mime_types; this overwrites each time to accept last value - value = object.send(property) + [value] unless property == :mime_type - # We don't want multiple heights / widths, pick the max - value = value.map(&:to_i).max.to_s if property == :height || property == :width - object.send("#{property}=", value) - end - end -end diff --git a/docker-compose.yml b/docker-compose.yml index 310b3ce6..a7f799c1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,7 +10,6 @@ services: - solr - fedora_test - solr_test - - fits env_file: - ./docker.env stdin_open: true @@ -99,11 +98,6 @@ services: ports: - "8985:8983" - fits: - image: harvardlts/fitsservlet_container:latest - ports: - - "8889:8080" - # iiif: # image: uclalibrary/cantaloupe # environment: diff --git a/docker.env b/docker.env index 946bf3c1..b74ff11f 100644 --- a/docker.env +++ b/docker.env @@ -23,10 +23,8 @@ FEDORA_TEST_BASE_PATH=/test FEDORA_TEST_URL=http://fedora_test:8080/rest FEDORA_URL=http://fedora:8080/rest FEDORA_USER=fedoraAdmin -FITS_SERVLET_URL=http://fits:8080/fits GEONAMES_USERNAME= IMPORT_FILE_PATH=/opt/data -MISSING_FILE_LOG=log/missing_files.log RAILS_HOST=localhost RAILS_SERVE_STATIC_FILES=true REDIS_CABLE_DB=1 diff --git a/spec/importers/californica_importer_spec.rb b/spec/importers/californica_importer_spec.rb index e9d679bd..c91775ea 100644 --- a/spec/importers/californica_importer_spec.rb +++ b/spec/importers/californica_importer_spec.rb @@ -9,11 +9,6 @@ let(:user) { FactoryBot.create(:admin) } let(:csv_path) { 'spec/fixtures/example.csv' } - # Cleanup log files after each test run - after do - File.delete(ENV['MISSING_FILE_LOG']) if File.exist?(ENV['MISSING_FILE_LOG']) - end - describe 'CSV import' do it 'has an import_file_path' do expect(importer.import_file_path).to eq csv_import.import_file_path @@ -116,14 +111,5 @@ end end end - - context 'when the image file is missing' do - let(:csv_path) { 'spec/fixtures/example-missingimage.csv' } - - it "records missing files" do - importer.import - expect(File.readlines(ENV['MISSING_FILE_LOG']).each(&:chomp!).last).to match(/missing_file.tif/) - end - end end end diff --git a/spec/importers/californica_mapper_spec.rb b/spec/importers/californica_mapper_spec.rb index a130b091..bdb046ee 100644 --- a/spec/importers/californica_mapper_spec.rb +++ b/spec/importers/californica_mapper_spec.rb @@ -3,7 +3,7 @@ require 'rails_helper' RSpec.describe CalifornicaMapper do - subject(:mapper) { described_class.new(import_file_path: fixture_path) } + subject(:mapper) { described_class.new } let(:metadata) do { "AltTitle.other" => "alternative title", # alternative_title @@ -60,7 +60,6 @@ end before { mapper.metadata = metadata } - after { File.delete(ENV['MISSING_FILE_LOG']) if File.exist?(ENV['MISSING_FILE_LOG']) } it "maps resource type to local authority values, if possible" do expect(mapper.resource_type).to contain_exactly( @@ -78,57 +77,6 @@ expect(mapper.map_field(:dlcs_collection_name)).to contain_exactly("Connell (Will) Papers, 1928-1961") end - context 'with a blank filename' do - let(:metadata) do - { "Item ARK" => "21198/zz0002nq4w", - "Title" => "Protesters with signs in gallery of Los Angeles County Supervisors" \ - "hearing over eminent domain for construction of Harbor Freeway, Calif., 1947", - "Type.typeOfResource" => "still image", - "Subject" => "Express highways--California--Los Angeles County--Design and construction|~|" \ - "Eminent domain--California--Los Angeles|~|Demonstrations--California--Los Angeles County|~|" \ - "Transportation|~|Government|~|Activism|~|Interstate 10", - "Publisher.publisherName" => "Los Angeles Daily News", - "Format.medium" => "1 photograph", - "Rights.countryCreation" => "US", - "Name.repository" => "University of California, Los Angeles. $b Library Special Collections", - "Description.caption" => "This example does not have a caption.", - # "File Name" => nil, - "Coverage.geographic" => "Los Angeles (Calif.)", - "Name.subject" => "Los Angeles County (Calif.). $b Board of Supervisors", - "Photographer" => "Ansel Adams", - "Language" => "English", - "Uniform title" => "Protesters with signs in gallery of Los Angeles County Supervisors", - "Support" => "Support" } - end - - it "does not throw an error if File Name is empty" do - expect(mapper.remote_files).to eq [] - end - - it "logs the missing file" do - mapper.remote_files - File.open(ENV['MISSING_FILE_LOG']) do |file| - expect(file.read).to eq "Work ark:/21198/zz0002nq4w is missing a filename\n" - end - end - end - - context 'with a blank filename for a "Collection" row' do - let(:metadata) do - { "Item ARK" => "123/abc", - "Object Type" => "Collection", - "Title" => "Collection Title" } - end - - it 'doesn\'t log a missing file' do - FileUtils.touch(ENV['MISSING_FILE_LOG']) - expect(mapper.remote_files).to eq [] - File.open(ENV['MISSING_FILE_LOG']) do |file| - expect(file.read).to eq "" - end - end - end - describe '#fields' do it 'has expected fields' do expect(mapper.fields).to include( @@ -156,7 +104,6 @@ :place_of_origin, :publisher, :repository, - :remote_files, :resource_type, :rights_country, :rights_holder, diff --git a/spec/jobs/hyrax/characterize_job_spec.rb b/spec/jobs/hyrax/characterize_job_spec.rb deleted file mode 100644 index d5a9bbe9..00000000 --- a/spec/jobs/hyrax/characterize_job_spec.rb +++ /dev/null @@ -1,36 +0,0 @@ -# frozen_string_literal: true -require 'rails_helper' - -RSpec.describe Hyrax::CharacterizeJob do - context 'a row has a corrupt file', :clean, :inline_jobs do - let(:corrupt_file) { File.join(fixture_path, 'images', 'corrupted', 'food.tif') } - let(:file_set) { FactoryBot.create(:file_set) } - let(:original_logger) { Rails.logger } - let(:output) { StringIO.new } - let(:work) { FactoryBot.create(:work) } - let(:file_id) { file_set.files.first.id } - let(:characterization_error) { output.string } - - before do - original_logger - Rails.logger = Logger.new(output) - Hydra::Works::AddFileToFileSet.call(file_set, File.open(corrupt_file, 'rb'), :original_file) - work.ordered_members << file_set - work.save - described_class.perform_now(file_set, file_id) - end - - after do - Rails.logger = original_logger - end - - it 'reports corrupt files in the log' do - expect(characterization_error).to match(/event: unexpected file characterization/) - expect(characterization_error).to match(/ark: #{Work.last.ark}/) - expect(characterization_error).to match(/work_id: #{Work.last.id}/) - expect(characterization_error).to match(/food.tif/) - # Different versions of FITS return different mime types - expect(characterization_error).to match(/mime_type: image\/tiff/).or match(/mime_type: application\/octet-stream/) - end - end -end diff --git a/spec/lib/californica/is_valid_image_spec.rb b/spec/lib/californica/is_valid_image_spec.rb deleted file mode 100644 index be8e4c17..00000000 --- a/spec/lib/californica/is_valid_image_spec.rb +++ /dev/null @@ -1,19 +0,0 @@ -# frozen_string_literal: true -require 'rails_helper' - -RSpec.describe Californica::IsValidImage do - let(:corrupted_file) { Rails.root.join('spec', 'fixtures', 'images', 'corrupted', 'food.tif') } - let(:good_file) { Rails.root.join('spec', 'fixtures', 'images', 'good', 'food.tif') } - let(:is_valid_image_with_corrupted_file) { described_class.new(image: corrupted_file) } - let(:is_valid_image) { described_class.new(image: good_file) } - - describe '#valid?' do - it 'returns false if the image is currupted' do - expect(is_valid_image_with_corrupted_file.valid?).to eq(false) - end - - it 'returns true if the image is valid' do - expect(is_valid_image.valid?).to eq(true) - end - end -end diff --git a/spec/rails_helper.rb b/spec/rails_helper.rb index 314cfd88..7a8d3032 100644 --- a/spec/rails_helper.rb +++ b/spec/rails_helper.rb @@ -13,7 +13,6 @@ require 'sidekiq/api' ENV['IMPORT_FILE_PATH'] = "#{::Rails.root}/spec/fixtures" -ENV['MISSING_FILE_LOG'] = "#{::Rails.root}/log/missing_files_test" ENV['IMPORT_PATH'] = "#{::Rails.root}/spec/fixtures" ENV['UPLOAD_PATH'] = "#{::Rails.root}/tmp/uploads" ENV['CACHE_PATH'] = "#{::Rails.root}/tmp/uploads/cache" diff --git a/spec/system/import_and_show_work_spec.rb b/spec/system/import_and_show_work_spec.rb index 37bc5f0e..4bacadfd 100644 --- a/spec/system/import_and_show_work_spec.rb +++ b/spec/system/import_and_show_work_spec.rb @@ -10,11 +10,6 @@ let(:user) { FactoryBot.create(:admin) } let(:collection) { Collection.find_or_create_by_ark('ark:/111/222') } - # Cleanup log files after each test run - after do - File.delete(ENV['MISSING_FILE_LOG']) if File.exist?(ENV['MISSING_FILE_LOG']) - end - context "importing the same object twice" do let(:first_csv_import) { FactoryBot.create(:csv_import, user: user, manifest: first_manifest) } let(:first_manifest) { Rack::Test::UploadedFile.new(File.join(fixture_path, 'coordinates_example.csv'), 'text/csv') }