diff --git a/.github/workflows/ci-cache.yml b/.github/workflows/ci-cache.yml index 484af1e5..5159e840 100644 --- a/.github/workflows/ci-cache.yml +++ b/.github/workflows/ci-cache.yml @@ -33,6 +33,7 @@ jobs: mkdir /opt/scholarspace-minter mkdir /opt/scholarspace/fedora-data mkdir /opt/scholarspace/solr-data + mkdir /opt/scholarspace/scholarspace-ingest cd /opt/scholarspace # Checkout the repository code - name: Check out repository code diff --git a/Dockerfile b/Dockerfile index d468f8db..d3b7de5b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,6 +9,7 @@ RUN mkdir -p /opt/scholarspace/scholarspace-hyrax \ && mkdir -p /opt/scholarspace/scholarspace-tmp \ && mkdir -p /opt/scholarspace/scholarspace-minter \ && mkdir -p /opt/scholarspace/scholarspace-derivatives \ + && mkdir -p /opt/scholarspace/scholarspace-ingest \ && chmod 775 -R /opt/scholarspace/scholarspace-derivatives WORKDIR /opt/scholarspace/scholarspace-hyrax diff --git a/Gemfile b/Gemfile index 576944c0..f8581083 100644 --- a/Gemfile +++ b/Gemfile @@ -64,8 +64,7 @@ gem 'riiif', '~> 2.0' gem 'cookies_eu' -#gem 'bulkrax', git: 'https://github.com/samvera-labs/bulkrax.git' -gem 'bulkrax', '2.3.0' +gem 'bulkrax', '8.1.0' gem 'willow_sword', github: 'notch8/willow_sword' diff --git a/Gemfile.lock b/Gemfile.lock index 24c7f5d3..79d861e0 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -104,7 +104,7 @@ GEM babel-transpiler (0.7.0) babel-source (>= 4.0, < 6) execjs (~> 2.0) - bagit (0.4.5) + bagit (0.4.6) docopt (~> 0.5.0) validatable (~> 1.6) base64 (0.2.0) @@ -162,18 +162,21 @@ GEM signet (~> 0.8) typhoeus builder (3.2.4) - bulkrax (2.3.0) - bagit (~> 0.4) + bulkrax (8.1.0) + bagit (~> 0.4.6) coderay + denormalize_fields iso8601 (~> 0.9.0) kaminari language_list (~> 1.2, >= 1.2.1) - libxml-ruby (~> 3.1.0) + libxml-ruby (~> 3.2.4) loofah (>= 2.2.3) + marcel oai (>= 0.4, < 2.x) rack (>= 2.0.6) rails (>= 5.1.6) rdf (>= 2.0.2, < 4.0) + rubyzip simple_form byebug (11.1.3) cancancan (1.17.0) @@ -221,6 +224,8 @@ GEM 
declarative-builder (0.1.0) declarative-option (< 0.2.0) declarative-option (0.1.0) + denormalize_fields (1.3.0) + activerecord (>= 4.1.14, < 8.0.0) deprecation (1.1.0) activesupport devise (4.9.2) @@ -577,7 +582,7 @@ GEM multi_json libv8-node (16.19.0.1-x86_64-darwin) libv8-node (16.19.0.1-x86_64-linux) - libxml-ruby (3.1.0) + libxml-ruby (3.2.4) link_header (0.0.8) linkeddata (3.1.6) equivalent-xml (~> 0.6) @@ -1062,7 +1067,7 @@ DEPENDENCIES blacklight_range_limit bootsnap (>= 1.1.0) bootstrap-sass (~> 3.0) - bulkrax (= 2.3.0) + bulkrax (= 8.1.0) byebug capybara (>= 2.15) chosen-rails diff --git a/README.md b/README.md index b69b8b76..60a451ad 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ a separate user for the app, but it is not necessary. That user will need to ow /opt/scholarspace/certs /opt/scholarspace/scholarspace-tmp /opt/scholarspace/scholarspace-minter + /opt/scholarspace/scholarspace-ingest ``` 6. In `/opt/scholarspace/scholarspace-hyrax` run `cp example.env .env` to create the local environment file. 7. Edit `.env` to add the following values: @@ -174,16 +175,6 @@ echo $CR_PAT | docker login ghcr.io -u [USERNAME] --password-stdin ## Setting up a new production instance -### (Optional) Install etd-loader - -* Install the **etd-loader** application in `/opt/etd-loader` as per instructions at https://github.com/gwu-libraries/etd-loader - -* When configuring `config.py`, ensure that it contains the following values: - ``` - ingest_path = "/opt/scholarspace/scholarspace-hyrax" - ingest_command = "rake RAILS_ENV=production gwss:ingest_etd" - ``` - ### Migrating Production Database In the app-server container (i.e. 
through `docker exec -it scholarspace-hyrax_app-server_1 /bin/sh`, followed by `su scholarspace`), run: diff --git a/app/models/ability.rb b/app/models/ability.rb index 783958ee..4cd1b013 100644 --- a/app/models/ability.rb +++ b/app/models/ability.rb @@ -42,4 +42,15 @@ def contentadmins_can_create_curation_concerns can :index, Hydra::AccessControls::Embargo can :index, Hydra::AccessControls::Lease end + + # Added for Bulkrax 5.0.0+ + def can_import_works? + # can_create_any_work? + admin? or contentadmin_user? + end + + def can_export_works? + # can_create_any_work? + admin? or contentadmin_user? + end end diff --git a/app/models/collection.rb b/app/models/collection.rb index 87e71166..f31d4a17 100644 --- a/app/models/collection.rb +++ b/app/models/collection.rb @@ -4,4 +4,8 @@ class Collection < ActiveFedora::Base # You can replace these metadata if they're not suitable include Hyrax::BasicMetadata self.indexer = Hyrax::CollectionWithBasicMetadataIndexer + + property :bulkrax_identifier, predicate: ::RDF::URI("https://iro.bl.uk/resource#bulkraxIdentifier"), multiple: false do |index| + index.as :stored_searchable, :facetable + end end diff --git a/app/models/file_set.rb b/app/models/file_set.rb index 393de0ee..0c503479 100644 --- a/app/models/file_set.rb +++ b/app/models/file_set.rb @@ -1,4 +1,10 @@ # Generated by hyrax:models:install class FileSet < ActiveFedora::Base +# include ::Hyrax::FileSetBehavior + + property :bulkrax_identifier, predicate: ::RDF::URI("https://iro.bl.uk/resource#bulkraxIdentifier"), multiple: false do |index| + index.as :stored_searchable, :facetable + end + include ::Hyrax::FileSetBehavior end diff --git a/app/models/gw_etd.rb b/app/models/gw_etd.rb index d2c72a78..435cad1b 100644 --- a/app/models/gw_etd.rb +++ b/app/models/gw_etd.rb @@ -28,5 +28,9 @@ class GwEtd < ActiveFedora::Base index.as :stored_searchable, :facetable end + property :bulkrax_identifier, predicate: ::RDF::URI("https://iro.bl.uk/resource#bulkraxIdentifier"), multiple: 
false do |index| + index.as :stored_searchable, :facetable + end + include ::Hyrax::BasicMetadata end diff --git a/app/models/gw_journal_issue.rb b/app/models/gw_journal_issue.rb index eb61c5fd..51eb987a 100644 --- a/app/models/gw_journal_issue.rb +++ b/app/models/gw_journal_issue.rb @@ -24,6 +24,10 @@ class GwJournalIssue < ActiveFedora::Base index.as :stored_searchable end + property :bulkrax_identifier, predicate: ::RDF::URI("https://iro.bl.uk/resource#bulkraxIdentifier"), multiple: false do |index| + index.as :stored_searchable, :facetable + end + # This must be included at the end, because it finalizes the metadata # schema (by adding accepts_nested_attributes) include ::Hyrax::BasicMetadata diff --git a/app/models/gw_work.rb b/app/models/gw_work.rb index 2c85a7ca..addc060b 100644 --- a/app/models/gw_work.rb +++ b/app/models/gw_work.rb @@ -16,5 +16,9 @@ class GwWork < ActiveFedora::Base index.as :stored_searchable end + property :bulkrax_identifier, predicate: ::RDF::URI("https://iro.bl.uk/resource#bulkraxIdentifier"), multiple: false do |index| + index.as :stored_searchable, :facetable + end + include ::Hyrax::BasicMetadata -end \ No newline at end of file +end diff --git a/bin/importer b/bin/importer new file mode 100644 index 00000000..996d805d --- /dev/null +++ b/bin/importer @@ -0,0 +1,146 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require_relative '../config/environment' + +require 'slop' + +def main(opts = {}) + check_required_params + + update = opts[:importer_id].present? 
+ port = opts[:port].presence + url = build_url(opts.delete(:importer_id), opts.delete(:url), port) + + headers = { 'Content-Type' => 'application/json' } + headers['Authorization'] = "Token: #{opts.delete(:auth_token)}" + params = build_params(opts) + + logger.info("POST to #{url} - PARAMS #{params}") + + conn = Faraday.new( + url: url, + headers: headers + ) + + response = if update + conn.put do |request| + request.body = params.to_json + end + else + conn.post do |request| + request.body = params.to_json + end + end + + puts "#{response.status} - #{response.body.truncate(200)}" +end + +def check_required_params + if opts[:importer_id].blank? && invalid?(opts) + puts 'Missing required parameters' + help + end + + if opts[:auth_token].blank? # rubocop:disable Style/GuardClause + puts 'Missing Authentication Token --auth_token' + exit + end +end + +def invalid?(opts) + required_params.each do |p| + return true if opts[p.to_sym].blank? + end + return false +end + +def required_params + Bulkrax.api_definition['bulkrax']['importer'].map { |key, value| key if value['required'] == true }.compact +end + +def build_params(opts = {}) + params = {} + params[:commit] = opts.delete(:commit) + parser_fields = { + metadata_file_name: opts.delete(:metadata_file_name), + metadata_format: opts.delete(:metadata_format), + rights_statement: opts.delete(:rights_statement), + override_rights_statement: opts.delete(:override_rights_statement), + import_file_path: opts.delete(:import_file_path), + metadata_prefix: opts.delete(:metadata_prefix), + set: opts.delete(:set), + collection_name: opts.delete(:collection_name) + }.compact + params[:importer] = opts.compact + params[:importer][:user_id] = opts.delete(:user_id) + params[:importer][:admin_set_id] = opts.delete(:admin_set_id) + params[:importer][:parser_fields] = parser_fields || {} + return params.compact +end + +def build_url(importer_id, url, port = nil) + if url.nil? + protocol = Rails.application.config.force_ssl ? 
'https://' : 'http://' + host = Rails.application.config.action_mailer.default_url_options[:host] + url = "#{protocol}#{host}" + url = "#{url}:#{port}" if port + end + path = Bulkrax::Engine.routes.url_helpers.polymorphic_path(Bulkrax::Importer) + url = File.join(url, path) + url = File.join(url, importer_id) if importer_id + return url +end + +def logger + Rails.logger +end + +def version + puts "Bulkrax #{Bulkrax::VERSION}" + puts "Slop #{Slop::VERSION}" +end + +# Format the help for the CLI +def help + puts 'CREATE:' + puts ' bin/importer --name "My Import" --parser_klass Bulkrax::CsvParser --commit "Create and Import" --import_file_path /data/tmp/import.csv --auth_token 12345' + puts 'UPDATE:' + puts ' bin/importer --importer_id 1 --commit "Update and Re-Import (update metadata only)" --import_file_path /data/tmp/import.csv --auth_token 12345' + puts 'PARAMETERS:' + Bulkrax.api_definition['bulkrax']['importer'].each_pair do |key, value| + next if key == 'parser_fields' + puts " --#{key}" + value.each_pair do |k, v| + next if k == 'contained_in' + puts " #{k}: #{v}" + end + end + puts ' --url' + puts " Repository URL" + exit +end + +# Setup the options +options = Slop.parse do |o| + o.on '--version', 'Print the version' do + version + exit + end + + o.on '--help', 'Print help' do + help + exit + end + + Bulkrax.api_definition['bulkrax']['importer'].each_pair do |key, value| + if value['required'].blank? 
+ o.string "--#{key}", value['definition'], default: nil + else + o.string "--#{key}", value['definition'] + end + end + o.string '--url', 'Repository URL' +end + +main(options.to_hash) diff --git a/config/environments/test.rb b/config/environments/test.rb index c284a0f7..13aac196 100644 --- a/config/environments/test.rb +++ b/config/environments/test.rb @@ -41,5 +41,5 @@ # config.action_view.raise_on_missing_translations = true config.permanent_url_base = "https://scholarspace-etds.library.gwu.edu/" - config.active_job.queue_adapter = :test + config.active_job.queue_adapter = :sidekiq end diff --git a/config/initializers/bulkrax.rb b/config/initializers/bulkrax.rb index ca5d0d08..c926236a 100644 --- a/config/initializers/bulkrax.rb +++ b/config/initializers/bulkrax.rb @@ -7,8 +7,16 @@ # ] # WorkType to use as the default if none is specified in the import - # Default is the first returned by Hyrax.config.curation_concerns - config.default_work_type = 'GwWork' + # Default is the first returned by Hyrax.config.curation_concerns, stringified + config.default_work_type = "GwWork" + + # Factory Class to use when generating and saving objects + config.object_factory = Bulkrax::ObjectFactory + # Use this for a Postgres-backed Valkyrized Hyrax + # config.object_factory = Bulkrax::ValkyrieObjectFactory + + # Queue name for imports + config.ingest_queue_name = :import # Path to store pending imports # config.import_path = 'tmp/imports' @@ -33,23 +41,18 @@ # config.field_mappings = { # "Bulkrax::OaiDcParser" => { **individual field mappings go here*** } # } + config.field_mappings['Bulkrax::CsvParser'] = { - "contributor" => { from: ["contributor", split: ';' ] }, - "creator" => { from: ["creator"], split: "; " }, - "date_created" => { from: ["date_created"], split: ';' }, - "description" => { from: ["description"] }, - "identifier" => { from: ["identifier"], split: ';' }, - "related_url" => { from: ["related_url"] }, - "rights_statement" => { from: ["rights_statement"] }, - 
"license" => { from: ["license"], split: ';' }, - "source_identifier" => { from: ["source_identifier"] }, - "keyword" => { from: ["keyword"], split: ';' }, - "title" => { from: ["title"] }, - "doi" => {from: ["doi"], split: ';'}, - "resource_type" => { from: ["resource_type"], split: ';' }, - "gw_affiliation" => { from: ["gw_affiliation"], split: ';' }, - 'parents' => { from: ['parents'], related_parents_field_mapping: true }, - 'children' => { from: ['children'], related_children_field_mapping: true } + # Setting source_identifier: true makes bulkrax_identifier a mandatory field, + # so it MUST be present in the CSV row for EVERY item (regardless of type, so this includes FileSets as well) + 'bulkrax_identifier' => { from: ['bulkrax_identifier'], source_identifier: true }, + 'keyword' => { from: ['keyword'], split: true }, + 'advisor' => { from: ['advisor'], split: true }, + 'doi' => { from: ['doi'], split: true }, + 'committee_member' => { from: ['committee_member'], split: true }, + 'gw_affiliation' => { from: ['gw_affiliation'], split: true }, + 'file' => { from: ['file'], split: '\;' }, + 'parents' => { from: ['parents'], split: '\;', related_parents_field_mapping: true }, } # Add to, or change existing mappings as follows @@ -62,7 +65,7 @@ # (For more info on importing relationships, see Bulkrax Wiki: https://github.com/samvera-labs/bulkrax/wiki/Configuring-Bulkrax#parent-child-relationship-field-mappings) # # # e.g. 
to add the required source_identifier field - # # config.field_mappings["Bulkrax::CsvParser"]["source_id"] = { from: ["old_source_id"], source_identifier: true } + # # config.field_mappings["Bulkrax::CsvParser"]["source_id"] = { from: ["old_source_id"], source_identifier: true, search_field: 'source_id_sim' } # If you want Bulkrax to fill in source_identifiers for you, see below # To duplicate a set of mappings from one parser to another @@ -75,11 +78,28 @@ # It is given two aruguments, self at the time of call and the index of the reocrd # config.fill_in_blank_source_identifiers = ->(parser, index) { "b-#{parser.importer.id}-#{index}"} # or use a uuid - #config.fill_in_blank_source_identifiers = ->(parser, index) { SecureRandom.uuid } + # config.fill_in_blank_source_identifiers = ->(parser, index) { SecureRandom.uuid } # Properties that should not be used in imports/exports. They are reserved for use by Hyrax. # config.reserved_properties += ['my_field'] + + # List of Questioning Authority properties that are controlled via YAML files in + # the config/authorities/ directory. For example, the :rights_statement property + # is controlled by the active terms in config/authorities/rights_statements.yml + # Defaults: 'rights_statement' and 'license' + # config.qa_controlled_properties += ['my_field'] + + # Specify the delimiter regular expression for splitting an attribute's values into a multi-value array. + #config.multi_value_element_split_on = /\s*[:;|]\s*/.freeze + config.multi_value_element_split_on = ';'.freeze + + # Specify the delimiter for joining an attribute's multi-value array into a string. Note: the + # specific delimiter should likely be present in the multi_value_element_split_on expression. 
+ config.multi_value_element_join_on = ' | ' end # Sidebar for hyrax 3+ support -#Hyrax::DashboardController.sidebar_partials[:repository_content] << "hyrax/dashboard/sidebar/bulkrax_sidebar_additions" if Object.const_defined?(:Hyrax) && ::Hyrax::DashboardController&.respond_to?(:sidebar_partials) \ No newline at end of file +# rubocop:disable Style/IfUnlessModifier +if Object.const_defined?(:Hyrax) && ::Hyrax::DashboardController&.respond_to?(:sidebar_partials) + Hyrax::DashboardController.sidebar_partials[:repository_content] << "hyrax/dashboard/sidebar/bulkrax_sidebar_additions" +end diff --git a/db/schema.rb b/db/schema.rb index e963d30e..304acc64 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema.define(version: 2024_02_08_142942) do +ActiveRecord::Schema.define(version: 2024_03_07_053156) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -40,6 +40,10 @@ t.datetime "last_succeeded_at" t.string "importerexporter_type", default: "Bulkrax::Importer" t.integer "import_attempts", default: 0 + t.string "status_message", default: "Pending" + t.index ["identifier", "importerexporter_id", "importerexporter_type"], name: "bulkrax_identifier_idx" + t.index ["importerexporter_id", "importerexporter_type"], name: "bulkrax_entries_importerexporter_idx" + t.index ["type"], name: "index_bulkrax_entries_on_type" end create_table "bulkrax_exporter_runs", force: :cascade do |t| @@ -70,6 +74,9 @@ t.date "finish_date" t.string "work_visibility" t.string "workflow_status" + t.boolean "include_thumbnails", default: false + t.boolean "generated_metadata", default: false + t.string "status_message", default: "Pending" t.index ["user_id"], name: "index_bulkrax_exporters_on_user_id" end @@ -110,9 +117,22 @@ t.boolean "validate_only" t.datetime "last_error_at" t.datetime "last_succeeded_at" + t.string 
"status_message", default: "Pending" t.index ["user_id"], name: "index_bulkrax_importers_on_user_id" end + create_table "bulkrax_pending_relationships", force: :cascade do |t| + t.bigint "importer_run_id", null: false + t.string "parent_id", null: false + t.string "child_id", null: false + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + t.integer "order", default: 0 + t.index ["child_id"], name: "index_bulkrax_pending_relationships_on_child_id" + t.index ["importer_run_id"], name: "index_bulkrax_pending_relationships_on_importer_run_id" + t.index ["parent_id"], name: "index_bulkrax_pending_relationships_on_parent_id" + end + create_table "bulkrax_statuses", force: :cascade do |t| t.string "status_message" t.string "error_class" @@ -124,6 +144,9 @@ t.string "runnable_type" t.datetime "created_at", null: false t.datetime "updated_at", null: false + t.index ["error_class"], name: "index_bulkrax_statuses_on_error_class" + t.index ["runnable_id", "runnable_type"], name: "bulkrax_statuses_runnable_idx" + t.index ["statusable_id", "statusable_type"], name: "bulkrax_statuses_statusable_idx" end create_table "checksum_audit_logs", id: :serial, force: :cascade do |t| @@ -682,6 +705,7 @@ add_foreign_key "bulkrax_exporter_runs", "bulkrax_exporters", column: "exporter_id" add_foreign_key "bulkrax_importer_runs", "bulkrax_importers", column: "importer_id" + add_foreign_key "bulkrax_pending_relationships", "bulkrax_importer_runs", column: "importer_run_id" add_foreign_key "collection_type_participants", "hyrax_collection_types" add_foreign_key "curation_concerns_operations", "users" add_foreign_key "mailboxer_conversation_opt_outs", "mailboxer_conversations", column: "conversation_id", name: "mb_opt_outs_on_conversations_id" diff --git a/docker-compose.yml b/docker-compose.yml index db72b3d7..cadac627 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -116,6 +116,7 @@ services: - ${NGINX_CERT_DIR}:/opt/scholarspace/certs - 
${NGINX_KEY_DIR}:/opt/scholarspace/keys - /opt/scholarspace/scholarspace-derivatives:/opt/scholarspace/scholarspace-derivatives + - /opt/scholarspace/scholarspace-ingest:/opt/scholarspace/scholarspace-ingest - app-hyrax:/opt/scholarspace # Uncomment for development # - /opt/scholarspace/scholarspace-hyrax:/opt/scholarspace/scholarspace-hyrax @@ -149,6 +150,7 @@ services: - ${NGINX_CERT_DIR}:/opt/scholarspace/certs - ${NGINX_KEY_DIR}:/opt/scholarspace/keys - /opt/scholarspace/scholarspace-derivatives:/opt/scholarspace/scholarspace-derivatives + - /opt/scholarspace/scholarspace-ingest:/opt/scholarspace/scholarspace-ingest - app-hyrax:/opt/scholarspace # Uncomment for development # - /opt/scholarspace/scholarspace-hyrax:/opt/scholarspace/scholarspace-hyrax diff --git a/lib/tasks/ingest_bulkrax_prep.rake b/lib/tasks/ingest_bulkrax_prep.rake new file mode 100644 index 00000000..b53082e5 --- /dev/null +++ b/lib/tasks/ingest_bulkrax_prep.rake @@ -0,0 +1,279 @@ +require 'fileutils' +require 'nokogiri' +require 'rake' +require 'zip' + +namespace :gwss do + desc "Creates a bulkrax zip for all of the ProQuest ETD zip files in a folder" + task :ingest_pq_etds, [:filepath] do |t, args| + + @degree_etd_map = {} + + def get_metadata_doc_path(pq_files_dir) + xml_paths = Dir.glob("#{pq_files_dir}/*_DATA.xml") + pq_xml_file_path = xml_paths.first + pq_xml_file_path + end + + def get_etd_doc(xml_file_path) + File.open(xml_file_path) { |f| Nokogiri::XML(f) } + end + + def get_abstract(doc) + abstract_text_array = [] + doc.xpath("//DISS_content/DISS_abstract/DISS_para").each do |p| + abstract_text_array << p.text + end + abstract_text = Nokogiri::HTML(abstract_text_array.join("\n")).text + end + + def fullname(person_node) + lastname = person_node.xpath("DISS_name/DISS_surname").text + firstname = person_node.xpath("DISS_name/DISS_fname").text + middlename = person_node.xpath("DISS_name/DISS_middle").text + + fullname = lastname + ", " + firstname + fullname = fullname + " " + 
middlename unless middlename.empty? + fullname + end + + def get_creators(doc) + creators_array = [] + contributors_array = [] + doc.xpath("//DISS_authorship/DISS_author").each do |author_node| + author_type = author_node.attribute('type').text + + if author_type == 'primary' + creators_array << fullname(author_node) + else + contributors_array << fullname(author_node) + end + end + + {'creators' => creators_array, 'contributors' => contributors_array} + end + + def get_node_value(doc, xpath) + doc.xpath(xpath).text + end + + def get_keywords(doc) + keyword_array = [] + doc.xpath("//DISS_description/DISS_categorization/DISS_keyword").text.split(',') do |k| + keyword_array << k.strip() + end + keyword_array + end + + def get_date_created(doc) + comp_date = doc.xpath("//DISS_description/DISS_dates/DISS_comp_date").text + if !comp_date.empty? and comp_date.length >= 4 + comp_date[0..3] + else + nil + end + end + + def is_embargoed?(doc) + sales_restric = doc.xpath("//DISS_restriction/DISS_sales_restriction") + return false if sales_restric.empty? + rmv = sales_restric.attribute('remove') + return false if rmv.nil? + # else + true + end + + def get_embargo_date(doc) + sales_restric = doc.xpath("//DISS_restriction/DISS_sales_restriction") + return nil if sales_restric.empty? + return nil if sales_restric.attribute('remove').text.empty? 
+ sales_restric.attribute('remove').text + end + + def get_advisors(doc) + advisors = [] + doc.xpath("//DISS_description/DISS_advisor").each do |advisor_node| + advisors << fullname(advisor_node) + end + advisors + end + + def get_committee_members(doc) + committee_members = [] + doc.xpath("//DISS_description/DISS_cmte_member").each do |committee_member_node| + committee_members << fullname(committee_member_node) + end + committee_members + end + + def convert_to_iso(date_str) + date = Date.strptime(date_str, '%m/%d/%Y') + date.strftime('%Y-%m-%d') + end + + def build_resource_type_degree_mapping + etd_degree_map = YAML.load_file('config/etd_degree_map.yml') + @degree_etd_map = {} + degree_categories = etd_degree_map.keys + # Flip etd_degree_map to create degree_etd_map + # So that for any given degree, we can get back whether it's a masters or a doctorate + degree_categories.each do |degree_category| + etd_degree_map[degree_category].each do |degree_name| + # upcase each degree (just in case) and ignore "."s + @degree_etd_map[degree_name.upcase.delete('.')] = degree_category + end + end + end + + def extract_metadata(doc) + work_metadata = Hash.new + work_metadata['model'] = 'GwEtd' + work_metadata['title'] = get_node_value(doc, "//DISS_description/DISS_title") + creators = get_creators(doc) + work_metadata['creator'] = creators['creators'].join(';') + work_metadata['contributor'] = creators['contributors'].join(';') + work_metadata['language'] = get_node_value(doc, "//DISS_description/DISS_categorization/DISS_language") + work_metadata['description'] = get_abstract(doc) + work_metadata['keyword'] = get_keywords(doc).join(';') + degree = get_node_value(doc, "//DISS_description/DISS_degree") + work_metadata['degree'] = degree + work_metadata['resource_type'] = @degree_etd_map[degree.upcase.delete('.')] + work_metadata['advisor'] = get_advisors(doc).join(';') + work_metadata['gw_affiliation'] = get_node_value(doc, 
"//DISS_description/DISS_institution/DISS_inst_contact") + etd_date_created = get_date_created(doc) + work_metadata['date_created'] = etd_date_created unless etd_date_created.nil? + work_metadata['committee_member'] = get_committee_members(doc).join(';') + work_metadata['rights_statement'] = 'http://rightsstatements.org/vocab/InC/1.0/' + # Can't currently load this license because this Bulkrax code + # https://github.com/samvera/bulkrax/blob/v8.1.0/app/models/concerns/bulkrax/import_behavior.rb#L145-L146 + # will try to match it with http://www.europeana.eu/portal/rights/rr-r.html/ (slash-terminated) + # -- not finding a match, Bulkrax will throw an error. + # work_metadata['license'] = 'http://www.europeana.eu/portal/rights/rr-r.html' + work_metadata + end + + def hash_array_to_csv_array(hash_array) + hash_keys = hash_array.flat_map(&:keys).uniq + # header row + csv_array = [hash_keys] + hash_array.each do |row| + csv_array << hash_keys.map {|key| row[key]} + end + csv_array + end + + def write_csv(csv_array, csv_path) + CSV.open(csv_path, 'w') do |csv| + csv_array.each do |row| + csv << row + end + end + end + + def repair_filename(filepath) + # translate spaces in the filename portion to _ + if File.dirname(filepath) == '.' + File.basename(filepath).tr(' ', '_') + else + File.join(File.dirname(filepath), File.basename(filepath).tr(' ', '_')) + end + end + + build_resource_type_degree_mapping + puts "build_resource_type_degree_mapping: " + puts @degree_etd_map + + # create folder for metadata.csv and files folder + + # if running spec tests, add /test/ to the tmp file path to prevent filling /tmp/bulkrax_zip when tests are run + if Rails.env.test? 
+ bulkrax_zip_path = "#{ENV['TEMP_FILE_BASE']}/test/bulkrax_zip" + else + bulkrax_zip_path = "#{ENV['TEMP_FILE_BASE']}/bulkrax_zip" + end + + bulkrax_files_path = "#{bulkrax_zip_path}/files" + puts "File.exists?(bulkrax_zip_path) = #{File.exists?(bulkrax_zip_path)}" + FileUtils.makedirs("#{bulkrax_files_path}") unless File.exists?(bulkrax_zip_path) + + # get all ETD zip files in the args.filepath folder + path_to_zips = args.filepath + + works_metadata = [] + filesets_metadata = [] + + zip_paths = Dir.glob("#{path_to_zips}/etdadmin*.zip") + puts("zip_paths: #{zip_paths}") + zip_paths.each do |zip_path| + # for each ETD zip file: + puts("Processing #{zip_path}") + zip_file = Zip::File.open(zip_path) + zip_file_basename = File.basename(zip_path, '.zip') # e.g. etdadmin_upload_353614 + zip_file_dir = "#{bulkrax_files_path}/#{zip_file_basename}" # e.g. bulkrax_zip/files/etdadmin_upload_353614 + Dir.mkdir(zip_file_dir) unless File.exists?(zip_file_dir) + + attachment_file_paths = [] + zip_file.each do |entry| + puts(" Extracting #{entry.name}") + entry_name_clean = repair_filename(entry.name) + zip_file.extract(entry, "#{zip_file_dir}/#{entry_name_clean}") + # skip directories - these get their own entries in a zip file + attachment_file_paths << "#{entry_name_clean}" if !entry.name_is_directory? + end + + # 1. extract the work metadata and add to the works metadata array + xml_file_path = get_metadata_doc_path(zip_file_dir) + etd_doc = get_etd_doc(xml_file_path) + puts "xml is located at: #{xml_file_path}" + etd_md = extract_metadata(etd_doc) + parent_work_identifier = SecureRandom.uuid + etd_md['bulkrax_identifier'] = parent_work_identifier + works_metadata << etd_md + + # Set up embargo info that will be applied below to all file attachments + etd_is_embargoed = is_embargoed?(etd_doc) + if etd_is_embargoed + embargo_date = get_embargo_date(etd_doc) + if !embargo_date.nil? 
+ embargo_release_date = convert_to_iso(embargo_date) + else + embargo_release_date = nil + end + end + + # 2. extract the attachment files paths and add to the filesets metadata array + # Remove the metadata xml file so we don't go and attach it to the work + attachment_file_paths.delete(File.basename(xml_file_path)) + attachment_file_paths.each do |fp| + fp_basename = File.basename(fp) + puts "path = #{fp}, basename = #{fp_basename}" + file_md = Hash.new + file_md['model'] = 'FileSet' + safe_fp = "#{zip_file_basename}/#{fp}" + file_md['file'] = safe_fp + file_md['title'] = fp_basename + file_md['bulkrax_identifier'] = SecureRandom.uuid + file_md['parents'] = parent_work_identifier + + # Add embargo info + if etd_is_embargoed + file_md['visibility'] = 'embargo' + file_md['visibility_during_embargo'] = 'restricted' + file_md['visibility_after_embargo'] = 'open' + file_md['embargo_release_date'] = embargo_release_date + end + filesets_metadata << file_md + end + end + + all_md = works_metadata + filesets_metadata + + csv_rows = hash_array_to_csv_array(all_md) + bulkrax_csv_filepath = "#{bulkrax_zip_path}/metadata.csv" + write_csv(csv_rows, bulkrax_csv_filepath) + + # FUTURE EXPANSION: Zip up the bulkrax ingest manifest and files + # zip up the working folder + # Consider a system command here? 
Not so simple with rubyzip + end +end diff --git a/spec/features/bulkrax_upload_spec.rb b/spec/features/bulkrax_upload_spec.rb new file mode 100644 index 00000000..792871f1 --- /dev/null +++ b/spec/features/bulkrax_upload_spec.rb @@ -0,0 +1,91 @@ +require 'rails_helper' +require 'csv' + +Rails.application.load_tasks + +RSpec.describe "Deposit files through Bulkrax" do + + before :all do + # remove the folder so it doesn't repeatedly add new works when ingest task is run + FileUtils.rm_rf("#{Rails.root}/tmp/test/bulkrax_zip") + + Rake::Task["gwss:ingest_pq_etds"].invoke("#{Rails.root}/spec/fixtures/etd_zips") + end + + it 'generates deposit file structure via gwss:ingest_pq_etds task' do + expect(File.directory?("#{Rails.root}/tmp/test/bulkrax_zip")).to be true + expect(File.directory?("#{Rails.root}/tmp/test/bulkrax_zip/files")).to be true + + expect(File.directory?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_1")).to be true + expect(File.file?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_1/Ab_gwu_0075A_16593_DATA.xml")).to be true + expect(File.file?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_1/Ab_gwu_0075A_16593.pdf")).to be true + + expect(File.directory?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_2")).to be true + expect(File.file?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_2/Ab_gwu_0076A_12345_DATA.xml")).to be true + expect(File.file?("#{Rails.root}/tmp/test/bulkrax_zip/files/etdadmin_upload_2/Ab_gwu_0076A_12345.pdf")).to be true + + expect(File.file?("#{Rails.root}/tmp/test/bulkrax_zip/metadata.csv")).to be true + end + + it 'generates accurate CSV file for import' do + csv_rows = CSV.read("#{Rails.root}/tmp/test/bulkrax_zip/metadata.csv") + + headers_arr = csv_rows[0] + + expect(headers_arr).to eq(["model", "title", "creator", "contributor", "language", + "description", "keyword", "degree", "resource_type", "advisor", "gw_affiliation", + "date_created", "committee_member", 
"rights_statement", "bulkrax_identifier", + "file", "parents", "visibility", "visibility_during_embargo", + "visibility_after_embargo", "embargo_release_date"]) + + # check that there are five rows - one for header, one for each of the etds, one for each of the files + expect(csv_rows.count).to eq(5) + + first_work_metadata = csv_rows[1] + second_work_metadata = csv_rows[2] + first_file_data = csv_rows[3] + second_file_data = csv_rows[4] + + expect(first_work_metadata.include?("GwEtd")).to be true + expect(second_work_metadata.include?("GwEtd")).to be true + + expect(first_file_data.include?("embargo")).to be true + expect(second_file_data.include?("embargo")).to be true + + expect(first_file_data.include?("restricted")).to be true + expect(second_file_data.include?("restricted")).to be true + end + + it 'can deposit works via bulkrax import' do + admin_user = FactoryBot.create(:admin_user) + etds_admin_set = Hyrax::AdministrativeSet.new(title: ['ETDs']) + etds_admin_set = Hyrax.persister.save(resource: etds_admin_set) + Hyrax::AdminSetCreateService.call!(admin_set: etds_admin_set, creating_user: admin_user) + + sign_in_user(admin_user) + + visit '/importers/new' + + fill_in('importer_name', with: "Test Bulkrax Import") + select('ETDs', from: 'importer_admin_set_id') + select('CSV - Comma Separated Values', from: 'importer_parser_klass') + + import_parser_radio_button_elements = page.all('//*[@id="importer_parser_fields_file_style_specify_a_path_on_the_server"]') + import_parser_radio_button_elements.last.click + + import_parser_file_path_elements = page.all('//*[@id="importer_parser_fields_import_file_path"]') + import_parser_file_path_elements.last.fill_in with: "#{Rails.root}/tmp/test/bulkrax_zip/metadata.csv" + + click_on("Create and Import") + + # the 'expect' statements below are not super specific, but the test will fail if any step in the deposit fails, so feels robust enough + + # check if both works are created + work_1 = GwEtd.where(title: "A False Work 
For Testing Purposes").first + work_2 = GwEtd.where(title: "Another False Work For Bulkrax Testing Purposes").first + + # check if both works get an embargo ID + expect(work_1.embargo_id.present?).to be true + expect(work_2.embargo_id.present?).to be true + end +end diff --git a/spec/fixtures/etd_zips/etdadmin_upload_1.zip b/spec/fixtures/etd_zips/etdadmin_upload_1.zip new file mode 100644 index 00000000..ec0e4ecf Binary files /dev/null and b/spec/fixtures/etd_zips/etdadmin_upload_1.zip differ diff --git a/spec/fixtures/etd_zips/etdadmin_upload_2.zip b/spec/fixtures/etd_zips/etdadmin_upload_2.zip new file mode 100644 index 00000000..5c040605 Binary files /dev/null and b/spec/fixtures/etd_zips/etdadmin_upload_2.zip differ diff --git a/spec/rails_helper.rb b/spec/rails_helper.rb index dcb6421c..22a21aad 100644 --- a/spec/rails_helper.rb +++ b/spec/rails_helper.rb @@ -9,6 +9,9 @@ require_relative "../spec/support/fedora_cleaner" require_relative "../spec/support/solr_cleaner" +require 'sidekiq/testing' +Sidekiq::Testing.inline! + # Add additional requires below this line. Rails is not loaded until this point! # Requires supporting ruby files with custom matchers and macros, etc, in