diff --git a/.github/workflows/ci-cache.yml b/.github/workflows/ci-cache.yml index 484af1e5..5159e840 100644 --- a/.github/workflows/ci-cache.yml +++ b/.github/workflows/ci-cache.yml @@ -33,6 +33,7 @@ jobs: mkdir /opt/scholarspace-minter mkdir /opt/scholarspace/fedora-data mkdir /opt/scholarspace/solr-data + mkdir /opt/scholarspace/scholarspace-ingest cd /opt/scholarspace # Checkout the repository code - name: Check out repository code diff --git a/Dockerfile b/Dockerfile index d468f8db..d3b7de5b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,6 +9,7 @@ RUN mkdir -p /opt/scholarspace/scholarspace-hyrax \ && mkdir -p /opt/scholarspace/scholarspace-tmp \ && mkdir -p /opt/scholarspace/scholarspace-minter \ && mkdir -p /opt/scholarspace/scholarspace-derivatives \ + && mkdir -p /opt/scholarspace/scholarspace-ingest \ && chmod 775 -R /opt/scholarspace/scholarspace-derivatives WORKDIR /opt/scholarspace/scholarspace-hyrax diff --git a/Gemfile b/Gemfile index 576944c0..f8581083 100644 --- a/Gemfile +++ b/Gemfile @@ -64,8 +64,7 @@ gem 'riiif', '~> 2.0' gem 'cookies_eu' -#gem 'bulkrax', git: 'https://github.com/samvera-labs/bulkrax.git' -gem 'bulkrax', '2.3.0' +gem 'bulkrax', '8.1.0' gem 'willow_sword', github: 'notch8/willow_sword' diff --git a/Gemfile.lock b/Gemfile.lock index 24c7f5d3..79d861e0 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -104,7 +104,7 @@ GEM babel-transpiler (0.7.0) babel-source (>= 4.0, < 6) execjs (~> 2.0) - bagit (0.4.5) + bagit (0.4.6) docopt (~> 0.5.0) validatable (~> 1.6) base64 (0.2.0) @@ -162,18 +162,21 @@ GEM signet (~> 0.8) typhoeus builder (3.2.4) - bulkrax (2.3.0) - bagit (~> 0.4) + bulkrax (8.1.0) + bagit (~> 0.4.6) coderay + denormalize_fields iso8601 (~> 0.9.0) kaminari language_list (~> 1.2, >= 1.2.1) - libxml-ruby (~> 3.1.0) + libxml-ruby (~> 3.2.4) loofah (>= 2.2.3) + marcel oai (>= 0.4, < 2.x) rack (>= 2.0.6) rails (>= 5.1.6) rdf (>= 2.0.2, < 4.0) + rubyzip simple_form byebug (11.1.3) cancancan (1.17.0) @@ -221,6 +224,8 @@ GEM 
declarative-builder (0.1.0) declarative-option (< 0.2.0) declarative-option (0.1.0) + denormalize_fields (1.3.0) + activerecord (>= 4.1.14, < 8.0.0) deprecation (1.1.0) activesupport devise (4.9.2) @@ -577,7 +582,7 @@ GEM multi_json libv8-node (16.19.0.1-x86_64-darwin) libv8-node (16.19.0.1-x86_64-linux) - libxml-ruby (3.1.0) + libxml-ruby (3.2.4) link_header (0.0.8) linkeddata (3.1.6) equivalent-xml (~> 0.6) @@ -1062,7 +1067,7 @@ DEPENDENCIES blacklight_range_limit bootsnap (>= 1.1.0) bootstrap-sass (~> 3.0) - bulkrax (= 2.3.0) + bulkrax (= 8.1.0) byebug capybara (>= 2.15) chosen-rails diff --git a/README.md b/README.md index b69b8b76..60a451ad 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ a separate user for the app, but it is not necessary. That user will need to ow /opt/scholarspace/certs /opt/scholarspace/scholarspace-tmp /opt/scholarspace/scholarspace-minter + /opt/scholarspace/scholarspace-ingest ``` 6. In `/opt/scholarspace/scholarspace-hyrax` run `cp example.env .env` to create the local environment file. 7. Edit `.env` to add the following values: @@ -174,16 +175,6 @@ echo $CR_PAT | docker login ghcr.io -u [USERNAME] --password-stdin ## Setting up a new production instance -### (Optional) Install etd-loader - -* Install the **etd-loader** application in `/opt/etd-loader` as per instructions at https://github.com/gwu-libraries/etd-loader - -* When configuring `config.py`, ensure that it contains the following values: - ``` - ingest_path = "/opt/scholarspace/scholarspace-hyrax" - ingest_command = "rake RAILS_ENV=production gwss:ingest_etd" - ``` - ### Migrating Production Database In the app-server container (i.e. 
through `docker exec -it scholarspace-hyrax_app-server_1 /bin/sh`, followed by `su scholarspace`), run: diff --git a/app/models/ability.rb b/app/models/ability.rb index 783958ee..4cd1b013 100644 --- a/app/models/ability.rb +++ b/app/models/ability.rb @@ -42,4 +42,15 @@ def contentadmins_can_create_curation_concerns can :index, Hydra::AccessControls::Embargo can :index, Hydra::AccessControls::Lease end + + # Added for Bulkrax 5.0.0+ + def can_import_works? + # can_create_any_work? + admin? or contentadmin_user? + end + + def can_export_works? + # can_create_any_work? + admin? or contentadmin_user? + end end diff --git a/app/models/collection.rb b/app/models/collection.rb index 87e71166..f31d4a17 100644 --- a/app/models/collection.rb +++ b/app/models/collection.rb @@ -4,4 +4,8 @@ class Collection < ActiveFedora::Base # You can replace these metadata if they're not suitable include Hyrax::BasicMetadata self.indexer = Hyrax::CollectionWithBasicMetadataIndexer + + property :bulkrax_identifier, predicate: ::RDF::URI("https://iro.bl.uk/resource#bulkraxIdentifier"), multiple: false do |index| + index.as :stored_searchable, :facetable + end end diff --git a/app/models/file_set.rb b/app/models/file_set.rb index 393de0ee..0c503479 100644 --- a/app/models/file_set.rb +++ b/app/models/file_set.rb @@ -1,4 +1,10 @@ # Generated by hyrax:models:install class FileSet < ActiveFedora::Base +# include ::Hyrax::FileSetBehavior + + property :bulkrax_identifier, predicate: ::RDF::URI("https://iro.bl.uk/resource#bulkraxIdentifier"), multiple: false do |index| + index.as :stored_searchable, :facetable + end + include ::Hyrax::FileSetBehavior end diff --git a/app/models/gw_etd.rb b/app/models/gw_etd.rb index d2c72a78..435cad1b 100644 --- a/app/models/gw_etd.rb +++ b/app/models/gw_etd.rb @@ -28,5 +28,9 @@ class GwEtd < ActiveFedora::Base index.as :stored_searchable, :facetable end + property :bulkrax_identifier, predicate: ::RDF::URI("https://iro.bl.uk/resource#bulkraxIdentifier"), multiple: 
false do |index| + index.as :stored_searchable, :facetable + end + include ::Hyrax::BasicMetadata end diff --git a/app/models/gw_journal_issue.rb b/app/models/gw_journal_issue.rb index eb61c5fd..51eb987a 100644 --- a/app/models/gw_journal_issue.rb +++ b/app/models/gw_journal_issue.rb @@ -24,6 +24,10 @@ class GwJournalIssue < ActiveFedora::Base index.as :stored_searchable end + property :bulkrax_identifier, predicate: ::RDF::URI("https://iro.bl.uk/resource#bulkraxIdentifier"), multiple: false do |index| + index.as :stored_searchable, :facetable + end + # This must be included at the end, because it finalizes the metadata # schema (by adding accepts_nested_attributes) include ::Hyrax::BasicMetadata diff --git a/app/models/gw_work.rb b/app/models/gw_work.rb index 2c85a7ca..addc060b 100644 --- a/app/models/gw_work.rb +++ b/app/models/gw_work.rb @@ -16,5 +16,9 @@ class GwWork < ActiveFedora::Base index.as :stored_searchable end + property :bulkrax_identifier, predicate: ::RDF::URI("https://iro.bl.uk/resource#bulkraxIdentifier"), multiple: false do |index| + index.as :stored_searchable, :facetable + end + include ::Hyrax::BasicMetadata -end \ No newline at end of file +end diff --git a/bin/importer b/bin/importer new file mode 100644 index 00000000..996d805d --- /dev/null +++ b/bin/importer @@ -0,0 +1,146 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require_relative '../config/environment' + +require 'slop' + +def main(opts = {}) + check_required_params + + update = opts[:importer_id].present? 
+ port = opts[:port].presence + url = build_url(opts.delete(:importer_id), opts.delete(:url), port) + + headers = { 'Content-Type' => 'application/json' } + headers['Authorization'] = "Token: #{opts.delete(:auth_token)}" + params = build_params(opts) + + logger.info("POST to #{url} - PARAMS #{params}") + + conn = Faraday.new( + url: url, + headers: headers + ) + + response = if update + conn.put do |request| + request.body = params.to_json + end + else + conn.post do |request| + request.body = params.to_json + end + end + + puts "#{response.status} - #{response.body.truncate(200)}" +end + +def check_required_params + if opts[:importer_id].blank? && invalid?(opts) + puts 'Missing required parameters' + help + end + + if opts[:auth_token].blank? # rubocop:disable Style/GuardClause + puts 'Missing Authentication Token --auth_token' + exit + end +end + +def invalid?(opts) + required_params.each do |p| + return true if opts[p.to_sym].blank? + end + return false +end + +def required_params + Bulkrax.api_definition['bulkrax']['importer'].map { |key, value| key if value['required'] == true }.compact +end + +def build_params(opts = {}) + params = {} + params[:commit] = opts.delete(:commit) + parser_fields = { + metadata_file_name: opts.delete(:metadata_file_name), + metadata_format: opts.delete(:metadata_format), + rights_statement: opts.delete(:rights_statement), + override_rights_statement: opts.delete(:override_rights_statement), + import_file_path: opts.delete(:import_file_path), + metadata_prefix: opts.delete(:metadata_prefix), + set: opts.delete(:set), + collection_name: opts.delete(:collection_name) + }.compact + params[:importer] = opts.compact + params[:importer][:user_id] = opts.delete(:user_id) + params[:importer][:admin_set_id] = opts.delete(:admin_set_id) + params[:importer][:parser_fields] = parser_fields || {} + return params.compact +end + +def build_url(importer_id, url, port = nil) + if url.nil? + protocol = Rails.application.config.force_ssl ? 
'https://' : 'http://' + host = Rails.application.config.action_mailer.default_url_options[:host] + url = "#{protocol}#{host}" + url = "#{url}:#{port}" if port + end + path = Bulkrax::Engine.routes.url_helpers.polymorphic_path(Bulkrax::Importer) + url = File.join(url, path) + url = File.join(url, importer_id) if importer_id + return url +end + +def logger + Rails.logger +end + +def version + puts "Bulkrax #{Bulkrax::VERSION}" + puts "Slop #{Slop::VERSION}" +end + +# Format the help for the CLI +def help + puts 'CREATE:' + puts ' bin/importer --name "My Import" --parser_klass Bulkrax::CsvParser --commit "Create and Import" --import_file_path /data/tmp/import.csv --auth_token 12345' + puts 'UPDATE:' + puts ' bin/importer --importer_id 1 --commit "Update and Re-Import (update metadata only)" --import_file_path /data/tmp/import.csv --auth_token 12345' + puts 'PARAMETERS:' + Bulkrax.api_definition['bulkrax']['importer'].each_pair do |key, value| + next if key == 'parser_fields' + puts " --#{key}" + value.each_pair do |k, v| + next if k == 'contained_in' + puts " #{k}: #{v}" + end + end + puts ' --url' + puts " Repository URL" + exit +end + +# Setup the options +options = Slop.parse do |o| + o.on '--version', 'Print the version' do + version + exit + end + + o.on '--help', 'Print help' do + help + exit + end + + Bulkrax.api_definition['bulkrax']['importer'].each_pair do |key, value| + if value['required'].blank? 
+ o.string "--#{key}", value['definition'], default: nil + else + o.string "--#{key}", value['definition'] + end + end + o.string '--url', 'Repository URL' +end + +main(options.to_hash) diff --git a/config/environments/test.rb b/config/environments/test.rb index c284a0f7..13aac196 100644 --- a/config/environments/test.rb +++ b/config/environments/test.rb @@ -41,5 +41,5 @@ # config.action_view.raise_on_missing_translations = true config.permanent_url_base = "https://scholarspace-etds.library.gwu.edu/" - config.active_job.queue_adapter = :test + config.active_job.queue_adapter = :sidekiq end diff --git a/config/initializers/bulkrax.rb b/config/initializers/bulkrax.rb index ca5d0d08..c926236a 100644 --- a/config/initializers/bulkrax.rb +++ b/config/initializers/bulkrax.rb @@ -7,8 +7,16 @@ # ] # WorkType to use as the default if none is specified in the import - # Default is the first returned by Hyrax.config.curation_concerns - config.default_work_type = 'GwWork' + # Default is the first returned by Hyrax.config.curation_concerns, stringified + config.default_work_type = "GwWork" + + # Factory Class to use when generating and saving objects + config.object_factory = Bulkrax::ObjectFactory + # Use this for a Postgres-backed Valkyrized Hyrax + # config.object_factory = Bulkrax::ValkyrieObjectFactory + + # Queue name for imports + config.ingest_queue_name = :import # Path to store pending imports # config.import_path = 'tmp/imports' @@ -33,23 +41,18 @@ # config.field_mappings = { # "Bulkrax::OaiDcParser" => { **individual field mappings go here*** } # } + config.field_mappings['Bulkrax::CsvParser'] = { - "contributor" => { from: ["contributor", split: ';' ] }, - "creator" => { from: ["creator"], split: "; " }, - "date_created" => { from: ["date_created"], split: ';' }, - "description" => { from: ["description"] }, - "identifier" => { from: ["identifier"], split: ';' }, - "related_url" => { from: ["related_url"] }, - "rights_statement" => { from: ["rights_statement"] }, - 
"license" => { from: ["license"], split: ';' }, - "source_identifier" => { from: ["source_identifier"] }, - "keyword" => { from: ["keyword"], split: ';' }, - "title" => { from: ["title"] }, - "doi" => {from: ["doi"], split: ';'}, - "resource_type" => { from: ["resource_type"], split: ';' }, - "gw_affiliation" => { from: ["gw_affiliation"], split: ';' }, - 'parents' => { from: ['parents'], related_parents_field_mapping: true }, - 'children' => { from: ['children'], related_children_field_mapping: true } + # Setting source_identifier: true makes bulkrax_identifier a mandatory field, + # so it MUST be present in the CSV row for EVERY item (regardless of type, so this includes FileSets as well) + 'bulkrax_identifier' => { from: ['bulkrax_identifier'], source_identifier: true }, + 'keyword' => { from: ['keyword'], split: true }, + 'advisor' => { from: ['advisor'], split: true }, + 'doi' => { from: ['doi'], split: true }, + 'committee_member' => { from: ['committee_member'], split: true }, + 'gw_affiliation' => { from: ['gw_affiliation'], split: true }, + 'file' => { from: ['file'], split: '\;' }, + 'parents' => { from: ['parents'], split: '\;', related_parents_field_mapping: true }, } # Add to, or change existing mappings as follows @@ -62,7 +65,7 @@ # (For more info on importing relationships, see Bulkrax Wiki: https://github.com/samvera-labs/bulkrax/wiki/Configuring-Bulkrax#parent-child-relationship-field-mappings) # # # e.g. 
to add the required source_identifier field - # # config.field_mappings["Bulkrax::CsvParser"]["source_id"] = { from: ["old_source_id"], source_identifier: true } + # # config.field_mappings["Bulkrax::CsvParser"]["source_id"] = { from: ["old_source_id"], source_identifier: true, search_field: 'source_id_sim' } # If you want Bulkrax to fill in source_identifiers for you, see below # To duplicate a set of mappings from one parser to another @@ -75,11 +78,28 @@ # It is given two aruguments, self at the time of call and the index of the reocrd # config.fill_in_blank_source_identifiers = ->(parser, index) { "b-#{parser.importer.id}-#{index}"} # or use a uuid - #config.fill_in_blank_source_identifiers = ->(parser, index) { SecureRandom.uuid } + # config.fill_in_blank_source_identifiers = ->(parser, index) { SecureRandom.uuid } # Properties that should not be used in imports/exports. They are reserved for use by Hyrax. # config.reserved_properties += ['my_field'] + + # List of Questioning Authority properties that are controlled via YAML files in + # the config/authorities/ directory. For example, the :rights_statement property + # is controlled by the active terms in config/authorities/rights_statements.yml + # Defaults: 'rights_statement' and 'license' + # config.qa_controlled_properties += ['my_field'] + + # Specify the delimiter regular expression for splitting an attribute's values into a multi-value array. + #config.multi_value_element_split_on = /\s*[:;|]\s*/.freeze + config.multi_value_element_split_on = ';'.freeze + + # Specify the delimiter for joining an attribute's multi-value array into a string. Note: the + # specific delimiter should likely be present in the multi_value_element_split_on expression. 
+ config.multi_value_element_join_on = ' | ' end # Sidebar for hyrax 3+ support -#Hyrax::DashboardController.sidebar_partials[:repository_content] << "hyrax/dashboard/sidebar/bulkrax_sidebar_additions" if Object.const_defined?(:Hyrax) && ::Hyrax::DashboardController&.respond_to?(:sidebar_partials) \ No newline at end of file +# rubocop:disable Style/IfUnlessModifier +if Object.const_defined?(:Hyrax) && ::Hyrax::DashboardController&.respond_to?(:sidebar_partials) + Hyrax::DashboardController.sidebar_partials[:repository_content] << "hyrax/dashboard/sidebar/bulkrax_sidebar_additions" +end diff --git a/db/schema.rb b/db/schema.rb index e963d30e..304acc64 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 2024_02_08_142942) do +ActiveRecord::Schema.define(version: 2024_03_07_053156) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -40,6 +40,10 @@ t.datetime "last_succeeded_at" t.string "importerexporter_type", default: "Bulkrax::Importer" t.integer "import_attempts", default: 0 + t.string "status_message", default: "Pending" + t.index ["identifier", "importerexporter_id", "importerexporter_type"], name: "bulkrax_identifier_idx" + t.index ["importerexporter_id", "importerexporter_type"], name: "bulkrax_entries_importerexporter_idx" + t.index ["type"], name: "index_bulkrax_entries_on_type" end create_table "bulkrax_exporter_runs", force: :cascade do |t| @@ -70,6 +74,9 @@ t.date "finish_date" t.string "work_visibility" t.string "workflow_status" + t.boolean "include_thumbnails", default: false + t.boolean "generated_metadata", default: false + t.string "status_message", default: "Pending" t.index ["user_id"], name: "index_bulkrax_exporters_on_user_id" end @@ -110,9 +117,22 @@ t.boolean "validate_only" t.datetime "last_error_at" t.datetime "last_succeeded_at" + t.string 
"status_message", default: "Pending" t.index ["user_id"], name: "index_bulkrax_importers_on_user_id" end + create_table "bulkrax_pending_relationships", force: :cascade do |t| + t.bigint "importer_run_id", null: false + t.string "parent_id", null: false + t.string "child_id", null: false + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.integer "order", default: 0 + t.index ["child_id"], name: "index_bulkrax_pending_relationships_on_child_id" + t.index ["importer_run_id"], name: "index_bulkrax_pending_relationships_on_importer_run_id" + t.index ["parent_id"], name: "index_bulkrax_pending_relationships_on_parent_id" + end + create_table "bulkrax_statuses", force: :cascade do |t| t.string "status_message" t.string "error_class" @@ -124,6 +144,9 @@ t.string "runnable_type" t.datetime "created_at", null: false t.datetime "updated_at", null: false + t.index ["error_class"], name: "index_bulkrax_statuses_on_error_class" + t.index ["runnable_id", "runnable_type"], name: "bulkrax_statuses_runnable_idx" + t.index ["statusable_id", "statusable_type"], name: "bulkrax_statuses_statusable_idx" end create_table "checksum_audit_logs", id: :serial, force: :cascade do |t| @@ -682,6 +705,7 @@ add_foreign_key "bulkrax_exporter_runs", "bulkrax_exporters", column: "exporter_id" add_foreign_key "bulkrax_importer_runs", "bulkrax_importers", column: "importer_id" + add_foreign_key "bulkrax_pending_relationships", "bulkrax_importer_runs", column: "importer_run_id" add_foreign_key "collection_type_participants", "hyrax_collection_types" add_foreign_key "curation_concerns_operations", "users" add_foreign_key "mailboxer_conversation_opt_outs", "mailboxer_conversations", column: "conversation_id", name: "mb_opt_outs_on_conversations_id" diff --git a/docker-compose.yml b/docker-compose.yml index db72b3d7..cadac627 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -116,6 +116,7 @@ services: - ${NGINX_CERT_DIR}:/opt/scholarspace/certs - 
${NGINX_KEY_DIR}:/opt/scholarspace/keys - /opt/scholarspace/scholarspace-derivatives:/opt/scholarspace/scholarspace-derivatives + - /opt/scholarspace/scholarspace-ingest:/opt/scholarspace/scholarspace-ingest - app-hyrax:/opt/scholarspace # Uncomment for development # - /opt/scholarspace/scholarspace-hyrax:/opt/scholarspace/scholarspace-hyrax @@ -149,6 +150,7 @@ services: - ${NGINX_CERT_DIR}:/opt/scholarspace/certs - ${NGINX_KEY_DIR}:/opt/scholarspace/keys - /opt/scholarspace/scholarspace-derivatives:/opt/scholarspace/scholarspace-derivatives + - /opt/scholarspace/scholarspace-ingest:/opt/scholarspace/scholarspace-ingest - app-hyrax:/opt/scholarspace # Uncomment for development # - /opt/scholarspace/scholarspace-hyrax:/opt/scholarspace/scholarspace-hyrax diff --git a/lib/tasks/ingest_bulkrax_prep.rake b/lib/tasks/ingest_bulkrax_prep.rake new file mode 100644 index 00000000..b53082e5 --- /dev/null +++ b/lib/tasks/ingest_bulkrax_prep.rake @@ -0,0 +1,279 @@ +require 'fileutils' +require 'nokogiri' +require 'rake' +require 'zip' + +namespace :gwss do + desc "Creates a bulkrax zip for all of the ProQuest ETD zip files in a folder" + task :ingest_pq_etds, [:filepath] do |t, args| + + @degree_etd_map = {} + + def get_metadata_doc_path(pq_files_dir) + xml_paths = Dir.glob("#{pq_files_dir}/*_DATA.xml") + pq_xml_file_path = xml_paths.first + pq_xml_file_path + end + + def get_etd_doc(xml_file_path) + File.open(xml_file_path) { |f| Nokogiri::XML(f) } + end + + def get_abstract(doc) + abstract_text_array = [] + doc.xpath("//DISS_content/DISS_abstract/DISS_para").each do |p| + abstract_text_array << p.text + end + abstract_text = Nokogiri::HTML(abstract_text_array.join("\n")).text + end + + def fullname(person_node) + lastname = person_node.xpath("DISS_name/DISS_surname").text + firstname = person_node.xpath("DISS_name/DISS_fname").text + middlename = person_node.xpath("DISS_name/DISS_middle").text + + fullname = lastname + ", " + firstname + fullname = fullname + " " + 
middlename unless middlename.empty? + fullname + end + + def get_creators(doc) + creators_array = [] + contributors_array = [] + doc.xpath("//DISS_authorship/DISS_author").each do |author_node| + author_type = author_node.attribute('type').text + + if author_type == 'primary' + creators_array << fullname(author_node) + else + contributors_array << fullname(author_node) + end + end + + {'creators' => creators_array, 'contributors' => contributors_array} + end + + def get_node_value(doc, xpath) + doc.xpath(xpath).text + end + + def get_keywords(doc) + keyword_array = [] + doc.xpath("//DISS_description/DISS_categorization/DISS_keyword").text.split(',') do |k| + keyword_array << k.strip() + end + keyword_array + end + + def get_date_created(doc) + comp_date = doc.xpath("//DISS_description/DISS_dates/DISS_comp_date").text + if !comp_date.empty? and comp_date.length >= 4 + comp_date[0..3] + else + nil + end + end + + def is_embargoed?(doc) + sales_restric = doc.xpath("//DISS_restriction/DISS_sales_restriction") + return false if sales_restric.empty? + rmv = sales_restric.attribute('remove') + return false if rmv.nil? + # else + true + end + + def get_embargo_date(doc) + sales_restric = doc.xpath("//DISS_restriction/DISS_sales_restriction") + return nil if sales_restric.empty? + return nil if sales_restric.attribute('remove').text.empty? 
+ sales_restric.attribute('remove').text + end + + def get_advisors(doc) + advisors = [] + doc.xpath("//DISS_description/DISS_advisor").each do |advisor_node| + advisors << fullname(advisor_node) + end + advisors + end + + def get_committee_members(doc) + committee_members = [] + doc.xpath("//DISS_description/DISS_cmte_member").each do |committee_member_node| + committee_members << fullname(committee_member_node) + end + committee_members + end + + def convert_to_iso(date_str) + date = Date.strptime(date_str, '%m/%d/%Y') + date.strftime('%Y-%m-%d') + end + + def build_resource_type_degree_mapping + etd_degree_map = YAML.load_file('config/etd_degree_map.yml') + @degree_etd_map = {} + degree_categories = etd_degree_map.keys + # Flip etd_degree_map to create degree_etd_map + # So that for any given degree, we can get back whether it's a masters or a doctorate + degree_categories.each do |degree_category| + etd_degree_map[degree_category].each do |degree_name| + # upcase each degree (just in case) and ignore "."s + @degree_etd_map[degree_name.upcase.delete('.')] = degree_category + end + end + end + + def extract_metadata(doc) + work_metadata = Hash.new + work_metadata['model'] = 'GwEtd' + work_metadata['title'] = get_node_value(doc, "//DISS_description/DISS_title") + creators = get_creators(doc) + work_metadata['creator'] = creators['creators'].join(';') + work_metadata['contributor'] = creators['contributors'].join(';') + work_metadata['language'] = get_node_value(doc, "//DISS_description/DISS_categorization/DISS_language") + work_metadata['description'] = get_abstract(doc) + work_metadata['keyword'] = get_keywords(doc).join(';') + degree = get_node_value(doc, "//DISS_description/DISS_degree") + work_metadata['degree'] = degree + work_metadata['resource_type'] = @degree_etd_map[degree.upcase.delete('.')] + work_metadata['advisor'] = get_advisors(doc).join(';') + work_metadata['gw_affiliation'] = get_node_value(doc, 
"//DISS_description/DISS_institution/DISS_inst_contact") + etd_date_created = get_date_created(doc) + work_metadata['date_created'] = etd_date_created unless etd_date_created.nil? + work_metadata['committee_member'] = get_committee_members(doc).join(';') + work_metadata['rights_statement'] = 'http://rightsstatements.org/vocab/InC/1.0/' + # Can't currently load this license because this Bulkrax code + # https://github.com/samvera/bulkrax/blob/v8.1.0/app/models/concerns/bulkrax/import_behavior.rb#L145-L146 + # will try to match it with http://www.europeana.eu/portal/rights/rr-r.html/ (slash-terminated) + # -- not finding a match, Bulkrax will throw an error. + # work_metadata['license'] = 'http://www.europeana.eu/portal/rights/rr-r.html' + work_metadata + end + + def hash_array_to_csv_array(hash_array) + hash_keys = hash_array.flat_map(&:keys).uniq + # header row + csv_array = [hash_keys] + hash_array.each do |row| + csv_array << hash_keys.map {|key| row[key]} + end + csv_array + end + + def write_csv(csv_array, csv_path) + CSV.open(csv_path, 'w') do |csv| + csv_array.each do |row| + csv << row + end + end + end + + def repair_filename(filepath) + # translate spaces in the filename portion to _ + if File.dirname(filepath) == '.' + File.basename(filepath).tr(' ', '_') + else + File.join(File.dirname(filepath), File.basename(filepath).tr(' ', '_')) + end + end + + build_resource_type_degree_mapping + puts "build_resource_type_degree_mapping: " + puts @degree_etd_map + + # create folder for metadata.csv and files folder + + # if running spec tests, add /test/ to the tmp file path to prevent filling /tmp/bulkrax_zip when tests are run + if Rails.env.test? 
+ bulkrax_zip_path = "#{ENV['TEMP_FILE_BASE']}/test/bulkrax_zip" + else + bulkrax_zip_path = "#{ENV['TEMP_FILE_BASE']}/bulkrax_zip" + end + + bulkrax_files_path = "#{bulkrax_zip_path}/files" + puts "File.exists?(bulkrax_zip_path) = #{File.exists?(bulkrax_zip_path)}" + FileUtils.makedirs("#{bulkrax_files_path}") unless File.exists?(bulkrax_zip_path) + + # get all ETD zip files in the args.filepath folder + path_to_zips = args.filepath + + works_metadata = [] + filesets_metadata = [] + + zip_paths = Dir.glob("#{path_to_zips}/etdadmin*.zip") + puts("zip_paths: #{zip_paths}") + zip_paths.each do |zip_path| + # for each ETD zip file: + puts("Processing #{zip_path}") + zip_file = Zip::File.open(zip_path) + zip_file_basename = File.basename(zip_path, '.zip') # e.g. etdadmin_upload_353614 + zip_file_dir = "#{bulkrax_files_path}/#{zip_file_basename}" # e.g. bulkrax_zip/files/etdadmin_upload_353614 + Dir.mkdir(zip_file_dir) unless File.exists?(zip_file_dir) + + attachment_file_paths = [] + zip_file.each do |entry| + puts(" Extracting #{entry.name}") + entry_name_clean = repair_filename(entry.name) + zip_file.extract(entry, "#{zip_file_dir}/#{entry_name_clean}") + # skip directories - these get their own entries in a zip file + attachment_file_paths << "#{entry_name_clean}" if !entry.name_is_directory? + end + + # 1. extract the work metadata and add to the works metadata array + xml_file_path = get_metadata_doc_path(zip_file_dir) + etd_doc = get_etd_doc(xml_file_path) + puts "xml is located at: #{xml_file_path}" + etd_md = extract_metadata(etd_doc) + parent_work_identifier = SecureRandom.uuid + etd_md['bulkrax_identifier'] = parent_work_identifier + works_metadata << etd_md + + # Set up embargo info that will be applied below to all file attachments + etd_is_embargoed = is_embargoed?(etd_doc) + if etd_is_embargoed + embargo_date = get_embargo_date(etd_doc) + if !embargo_date.nil? 
+ embargo_release_date = convert_to_iso(embargo_date) + else + embargo_release_date = nil + end + end + + # 2. extract the attachment files paths and add to the filesets metadata array + # Remove the metadata xml file so we don't go and attach it to the work + attachment_file_paths.delete(File.basename(xml_file_path)) + attachment_file_paths.each do |fp| + fp_basename = File.basename(fp) + puts "path = #{fp}, basename = #{fp_basename}" + file_md = Hash.new + file_md['model'] = 'FileSet' + safe_fp = "#{zip_file_basename}/#{fp}" + file_md['file'] = safe_fp + file_md['title'] = fp_basename + file_md['bulkrax_identifier'] = SecureRandom.uuid + file_md['parents'] = parent_work_identifier + + # Add embargo info + if etd_is_embargoed + file_md['visibility'] = 'embargo' + file_md['visibility_during_embargo'] = 'restricted' + file_md['visibility_after_embargo'] = 'open' + file_md['embargo_release_date'] = embargo_release_date + end + filesets_metadata << file_md + end + end + + all_md = works_metadata + filesets_metadata + + csv_rows = hash_array_to_csv_array(all_md) + bulkrax_csv_filepath = "#{bulkrax_zip_path}/metadata.csv" + write_csv(csv_rows, bulkrax_csv_filepath) + + # FUTURE EXPANSION: Zip up the bulkrax ingest manifest and files + # zip up the working folder + # Consider a system command here? 
Not so simple with rubyzip + end +end diff --git a/spec/features/bulkrax_upload_spec.rb b/spec/features/bulkrax_upload_spec.rb new file mode 100644 index 00000000..792871f1 --- /dev/null +++ b/spec/features/bulkrax_upload_spec.rb @@ -0,0 +1,91 @@ +require 'rails_helper' +require 'csv' + +Rails.application.load_tasks + +RSpec.describe "Deposit files through Bulkrax" do + + before :all do + # remove the folder so it doesn't repeatedly add new works when ingest task is run + FileUtils.rm_rf("#{Rails.root}/tmp/test/bulkrax_zip") + + Rake::Task["gwss:ingest_pq_etds"].invoke("#{Rails.root}/spec/fixtures/etd_zips") + end + + it 'generates deposit file structure via gwss:ingest_pq_etds task' do + expect(File.directory?("#{Rails.root}/tmp/test/bulkrax_zip")).to be true + expect(File.directory?("#{Rails.root}/tmp/test/bulkrax_zip/files")).to be true + + expect(File.directory?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_1")).to be true + expect(File.file?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_1/Ab_gwu_0075A_16593_DATA.xml")).to be true + expect(File.file?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_1/Ab_gwu_0075A_16593.pdf")).to be true + + expect(File.directory?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_2")).to be true + expect(File.file?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_2/Ab_gwu_0076A_12345_DATA.xml")).to be true + expect(File.file?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_2/Ab_gwu_0076A_12345.pdf")).to be true + + expect(File.file?("#{Rails.root}/tmp/test/bulkrax_zip/metadata.csv")).to be true + end + + it 'generates accurate CSV file for import' do + csv_rows = CSV.read("#{Rails.root}/tmp/test/bulkrax_zip/metadata.csv") + + headers_arr = csv_rows[0] + + expect(headers_arr).to eq(["model", "title", "creator", "contributor", "language", + "description", "keyword", "degree", "resource_type", "advisor", "gw_affiliation", + "date_created", "committee_member", 
"rights_statement", "bulkrax_identifier", + "file", "parents", "visibility", "visibility_during_embargo", + "visibility_after_embargo", "embargo_release_date"]) + + # check that there are five rows - one for header, one for each of the etds, one for each of the files + expect(csv_rows.count).to eq(5) + + first_work_metadata = csv_rows[1] + second_work_metadata = csv_rows[2] + first_file_data = csv_rows[3] + second_file_data = csv_rows[4] + + expect(first_work_metadata.include?("GwEtd")).to be true + expect(second_work_metadata.include?("GwEtd")).to be true + + expect(first_file_data.include?("embargo")).to be true + expect(second_file_data.include?("embargo")).to be true + + expect(first_file_data.include?("restricted")).to be true + expect(second_file_data.include?("restricted")).to be true + end + + it 'can deposit works via bulkrax import' do + admin_user = FactoryBot.create(:admin_user) + etds_admin_set = Hyrax::AdministrativeSet.new(title: ['ETDs']) + etds_admin_set = Hyrax.persister.save(resource: etds_admin_set) + Hyrax::AdminSetCreateService.call!(admin_set: etds_admin_set, creating_user: admin_user) + + sign_in_user(admin_user) + + visit '/importers/new' + + fill_in('importer_name', with: "Test Bulkrax Import") + select('ETDs', from: 'importer_admin_set_id') + select('CSV - Comma Separated Values', from: 'importer_parser_klass') + + import_parser_radio_button_elements = page.all('//*[@id="importer_parser_fields_file_style_specify_a_path_on_the_server"]') + import_parser_radio_button_elements.last.click + + import_parser_file_path_elements = page.all('//*[@id="importer_parser_fields_import_file_path"]') + import_parser_file_path_elements.last.fill_in with: "#{Rails.root}/tmp/test/bulkrax_zip/metadata.csv" + + click_on("Create and Import") + + # the 'expect' statements below are not super specific, but the test will fail if any step in the deposit fails, so feels robust enough + + # check if both works are created + work_1 = GwEtd.where(title: "A False Work 
For Testing Purposes").first + work_2 = GwEtd.where(title: "Another False Work For Bulkrax Testing Purposes").first + + # check if both works get an embargo ID + expect(work_1.embargo_id.present?).to be true + expect(work_2.embargo_id.present?).to be true + end +end diff --git a/spec/fixtures/etd_zips/etdadmin_upload_1.zip b/spec/fixtures/etd_zips/etdadmin_upload_1.zip new file mode 100644 index 00000000..ec0e4ecf Binary files /dev/null and b/spec/fixtures/etd_zips/etdadmin_upload_1.zip differ diff --git a/spec/fixtures/etd_zips/etdadmin_upload_2.zip b/spec/fixtures/etd_zips/etdadmin_upload_2.zip new file mode 100644 index 00000000..5c040605 Binary files /dev/null and b/spec/fixtures/etd_zips/etdadmin_upload_2.zip differ diff --git a/spec/rails_helper.rb b/spec/rails_helper.rb index dcb6421c..22a21aad 100644 --- a/spec/rails_helper.rb +++ b/spec/rails_helper.rb @@ -9,6 +9,9 @@ require_relative "../spec/support/fedora_cleaner" require_relative "../spec/support/solr_cleaner" +require 'sidekiq/testing' +Sidekiq::Testing.inline! + # Add additional requires below this line. Rails is not loaded until this point! # Requires supporting ruby files with custom matchers and macros, etc, in