diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 476f0c53..dbaf0203 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -14,6 +14,10 @@ jobs: ruby: ['3.1', '3.2'] steps: - uses: actions/checkout@v3 + # required to avoid https://github.com/actions/runner-images/issues/37 + # because faraday depends on patron, which requires curl headers to build + - name: Install cURL Headers + run: sudo apt-get update && sudo apt-get install libcurl4-openssl-dev - name: Set up Ruby uses: ruby/setup-ruby@v1 with: diff --git a/.rubocop.yml b/.rubocop.yml index bc7ef392..7c662511 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -25,6 +25,10 @@ RSpec/DescribeClass: - 'spec/features/**/*' - 'spec/integration/**/*' +RSpec/MultipleMemoizedHelpers: + Exclude: + - 'spec/lib/earthworks/harvester_spec.rb' + Gemspec/DeprecatedAttributeAssignment: # new in 1.30 Enabled: true Gemspec/RequireMFA: # new in 1.23 @@ -338,4 +342,4 @@ Rails/WhereMissing: # new in 2.16 Rails/WhereNot: # new in 2.8 Enabled: true Rails/WhereNotWithMultipleConditions: # new in 2.17 - Enabled: true \ No newline at end of file + Enabled: true diff --git a/Gemfile b/Gemfile index 29737a8d..ff24d9a2 100644 --- a/Gemfile +++ b/Gemfile @@ -33,6 +33,7 @@ group :development do # Display performance information such as SQL time and flame graphs for each request in your browser. # Can be configured to work on production as well see: https://github.com/MiniProfiler/rack-mini-profiler/blob/master/README.md gem 'rack-mini-profiler', '~> 2.0' + gem 'debug' end group :development, :test do @@ -74,7 +75,7 @@ gem 'newrelic_rpm' gem 'twitter-typeahead-rails' gem 'blacklight_range_limit', '~> 7.0' gem 'redis', '~> 5.0' -gem 'geo_combine' +gem 'geo_combine', github: 'OpenGeoMetadata/GeoCombine' gem 'geo_monitor', '~> 0.7', github: 'geoblacklight/geo_monitor' gem 'sidekiq', '~> 7.0' gem 'whenever', require: false diff --git a/Gemfile.lock b/Gemfile.lock index ddeba53d..4deb6c35 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,3 +1,17 @@ +GIT + remote: https://github.com/OpenGeoMetadata/GeoCombine.git + revision: c2c54279da4be9ef6105d5a8bf3fd30d552aa8ad + specs: + geo_combine (0.7.0) + activesupport + faraday-net_http_persistent (~> 2.0) + git + json-schema + nokogiri + rsolr + sanitize + thor + GIT remote: https://github.com/geoblacklight/geo_monitor.git revision: b4cad62d8bdd43c42e5f31f56fe62e666c5f03d2 @@ -154,6 +168,9 @@ GEM database_cleaner-core (~> 2.0.0) database_cleaner-core (2.0.1) date (3.3.3) + debug (1.7.1) + irb (>= 1.5.0) + reline (>= 0.3.1) deep_merge (1.2.2) deprecation (1.1.0) activesupport @@ -215,27 +232,31 @@ GEM factory_bot_rails (6.2.0) factory_bot (~> 6.2.0) railties (>= 5.0.0) - faraday (1.10.3) + faraday (1.9.0) faraday-em_http (~> 1.0) faraday-em_synchrony (~> 1.0) faraday-excon (~> 1.1) - faraday-httpclient (~> 1.0) + faraday-httpclient (< 3) faraday-multipart (~> 1.0) - faraday-net_http (~> 1.0) - faraday-net_http_persistent (~> 1.0) - faraday-patron (~> 1.0) + faraday-net_http (< 3) + faraday-net_http_persistent (< 3) + faraday-patron (< 3) faraday-rack (~> 1.0) faraday-retry (~> 1.0) ruby2_keywords (>= 0.0.4) faraday-em_http (1.0.0) faraday-em_synchrony (1.0.0) faraday-excon (1.1.0) - faraday-httpclient (1.0.1) + faraday-httpclient (2.0.1) + httpclient (>= 2.2) faraday-multipart (1.0.4) multipart-post (~> 2) - faraday-net_http (1.0.1) - faraday-net_http_persistent (1.2.0) - faraday-patron (1.0.0) + faraday-net_http (2.1.0) + faraday-net_http_persistent (2.0.2) + faraday-net_http (< 3) + net-http-persistent (~> 4.0) + faraday-patron (2.0.1) + patron (>= 0.4.2) faraday-rack (1.0.0) faraday-retry (1.0.3) faraday_middleware (1.0.0) @@ -244,14 +265,6 @@ GEM ffi-compiler (1.0.1) ffi (>= 1.0.0) rake - geo_combine (0.6.0) - activesupport - json-schema - net-http-persistent (~> 2.0) - nokogiri - rsolr - sanitize - thor geoblacklight (3.7.0) blacklight (~> 7.8) coderay @@ -264,6 +277,9 @@ GEM mime-types rails (>= 5.2.4, < 7.1) rgeo-geojson + git (1.18.0) + addressable (~> 2.8) + rchardet (~> 1.8) globalid (1.1.0) activesupport (>= 5.0) handlebars_assets (0.23.9) @@ -280,8 +296,12 @@ GEM http-cookie (1.0.5) domain_name (~> 0.5) http-form_data (2.3.0) + httpclient (2.8.3) i18n (1.12.0) concurrent-ruby (~> 1.0) + io-console (0.6.0) + irb (1.6.3) + reline (>= 0.3.0) jbuilder (2.11.5) actionview (>= 5.0.0) activesupport (>= 5.0.0) @@ -322,12 +342,12 @@ GEM mime-types-data (~> 3.2015) mime-types-data (3.2023.0218.1) mini_mime (1.1.2) - mini_portile2 (2.8.1) minitar (0.9) minitest (5.18.0) msgpack (1.6.1) multipart-post (2.3.0) - net-http-persistent (2.9.4) + net-http-persistent (4.0.1) + connection_pool (~> 2.2) net-imap (0.3.4) date net-protocol @@ -342,8 +362,7 @@ GEM net-ssh (7.1.0) newrelic_rpm (9.0.0) nio4r (2.5.8) - nokogiri (1.14.2) - mini_portile2 (~> 2.8.0) + nokogiri (1.14.2-x86_64-darwin) racc (~> 1.4) nokogiri (1.14.2-x86_64-linux) racc (~> 1.4) @@ -353,6 +372,7 @@ GEM parallel (1.22.1) parser (3.2.1.1) ast (~> 2.4.1) + patron (0.13.3) pg (1.4.6) popper_js (1.16.1) public_suffix (5.0.1) @@ -394,6 +414,7 @@ GEM zeitwerk (~> 2.5) rainbow (3.1.1) rake (13.0.6) + rchardet (1.8.0) recaptcha (5.12.3) json redis (5.0.6) @@ -401,6 +422,8 @@ GEM redis-client (0.14.0) connection_pool regexp_parser (2.7.0) + reline (0.3.2) + io-console (~> 0.5) responders (3.1.0) actionpack (>= 5.2) railties (>= 5.2) @@ -494,8 +517,7 @@ GEM actionpack (>= 5.2) activesupport (>= 5.2) sprockets (>= 3.0.0) - sqlite3 (1.6.2) - mini_portile2 (~> 2.8.0) + sqlite3 (1.6.2-x86_64-darwin) sqlite3 (1.6.2-x86_64-linux) sshkit (1.21.4) net-scp (>= 1.1.2) @@ -539,7 +561,7 @@ GEM zeitwerk (2.6.7) PLATFORMS - ruby + x86_64-darwin-20 x86_64-linux DEPENDENCIES @@ -556,12 +578,13 @@ DEPENDENCIES capistrano-shared_configs capybara database_cleaner + debug devise devise-guests (>= 0.3.3) devise-remote-user dlss-capistrano factory_bot_rails - geo_combine + geo_combine! geo_monitor (~> 0.7)! geoblacklight (~> 3.7) honeybadger @@ -596,4 +619,4 @@ DEPENDENCIES whenever BUNDLED WITH - 2.3.20 + 2.4.7 diff --git a/config/settings.yml b/config/settings.yml index 4d5d5d26..601e623a 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -202,3 +202,63 @@ GEO_BLACKLIGHT_HARVEST_SITES: f: dct_provenance_s: - MIT + +# Repositories listed here will be harvested and indexed by rake tasks. See: +# https://github.com/OpenGeoMetadata +# If supplied, "provenance" value overrides the dct_provenance_s in the record +# This creates a more consistent display in facets, etc. +OGM_REPOS: + edu.nyu: # includes records from both NYU and Baruch CUNY + shared-repository: # multi-institutional + edu.princeton.arks: + provenance: Princeton + edu.mit: + provenance: MIT + edu.harvard: + provenance: Harvard + edu.columbia: + provenance: Columbia + edu.tufts: + provenance: Tufts + edu.virginia: + provenance: Virginia + edu.umich: + provenance: Michigan + edu.wisc: + provenance: Wisconsin + edu.umn: + provenance: Minnesota + edu.berkeley: + provenance: Berkeley + edu.cornell: + provenance: Cornell + edu.uiowa: + provenance: Iowa + edu.indiana: + provenance: Indiana + edu.purdue: + provenance: Purdue + edu.illinois: + provenance: Illinois + edu.msu: + provenance: Michigan State + edu.umd: + provenance: Maryland + edu.rutgers: + provenance: Rutgers + edu.psu: + provenance: Penn State + edu.osu: + provenance: Ohio State + edu.uchicago: + provenance: Chicago + edu.unl: + provenance: Nebraska + edu.colostate: + provenance: Colorado State + edu.gmu: + provenance: George Mason + edu.uarizona: + provenance: Arizona + edu.vt: + provenance: Virginia Tech diff --git a/lib/earthworks/harvester.rb b/lib/earthworks/harvester.rb new file mode 100644 index 00000000..a336d48f --- /dev/null +++ b/lib/earthworks/harvester.rb @@ -0,0 +1,55 @@ +require 'geo_combine/harvester' + +# A custom OpenGeoMetadata harvester that lets us limit repositories and transform metadata +module Earthworks + class Harvester < GeoCombine::Harvester + attr_reader :ogm_repos + + # Support passing in a configured list of repositories to harvest + def initialize(ogm_repos: ENV.fetch('OGM_REPOS'), **kwargs) + super(**kwargs) + + @ogm_repos = ogm_repos.transform_keys(&:to_s) + end + + # Support skipping and transforming arbitrary records prior to indexing + def docs_to_index + return to_enum(:docs_to_index) unless block_given? + + super do |record, path| + yield transform_record(record, path), path unless skip_record?(record, path) + end + end + + private + + # Some records have placeholder data or are otherwise problematic, but we + # can't denylist them at the institution/repository level. + def skip_record?(_record, path) + # Skip PolicyMap records in shared-repository; they have placeholder data + # See https://github.com/OpenGeoMetadata/shared-repository/tree/master/gbl-policymap + record_repo(path) == 'shared-repository' && path.include?('gbl-policymap') + end + + # We transform some records in order to get more consistent metadata display + # in Earthworks, especially for facets. + def transform_record(record, path) + # Transform provenance to a shorter, consistent value based on the repository + if (transformed_provenance = @ogm_repos.dig(record_repo(path), :provenance)) + record.update({ 'dct_provenance_s' => transformed_provenance }) + end + + record + end + + # Get the name of the repository the record came from + def record_repo(path) + path.split(@ogm_path).last.split('/')[1] + end + + # Only harvest configured repositories, if configuration was provided + def repositories + @repositories ||= @ogm_repos ? super.compact.select { |repo| @ogm_repos.key?(repo) } : super + end + end +end diff --git a/lib/tasks/earthworks.rake b/lib/tasks/earthworks.rake index 520933fb..f333c147 100644 --- a/lib/tasks/earthworks.rake +++ b/lib/tasks/earthworks.rake @@ -1,3 +1,5 @@ +require 'earthworks/harvester' + namespace :earthworks do desc 'Install EarthWorks' task install: [:environment] do @@ -106,44 +108,32 @@ namespace :earthworks do end end end - namespace :opengeometadata do - task setup: [:environment] do - ENV['OGM_PATH'] = 'tmp/opengeometadata' - ENV['OGM_PATH'] = '/var/tmp/opengeometadata' if File.directory?('/var/tmp/opengeometadata') - ENV['SOLR_URL'] = Blacklight.default_index.connection.uri.to_s - puts "Using OGM_PATH=#{ENV.fetch('OGM_PATH', nil)} SOLR_URL=#{ENV.fetch('SOLR_URL', nil)}" - end - desc 'Clone specific OpenGeoMetadata repositories for indexing' - task clone: ['earthworks:opengeometadata:setup'] do - %w[ - edu.berkeley - edu.columbia - edu.nyu - edu.princeton.arks - edu.cornell - big-ten - edu.virginia - ].each do |repo| - system "rake geocombine:clone[#{repo}]" # need `system` to pick up ENV vars - end + # Customized tasks for OpenGeoMetadata records + namespace :opengeometadata do + desc 'Initialize OpenGeoMetadata repositories' + task :clone do + harvester = Earthworks::Harvester.new(ogm_repos: Settings.OGM_REPOS) + total = harvester.clone + puts "Cloned #{total} repositories" end - desc 'Update OpenGeoMetadata repositories via git pull' - task pull: ['earthworks:opengeometadata:setup'] do - system 'rake geocombine:pull' # need `system` to pick up ENV vars + desc 'Fetch updated OpenGeoMetadata records for indexing' + task :pull do + harvester = Earthworks::Harvester.new(ogm_repos: Settings.OGM_REPOS) + total = harvester.pull + puts "Updated #{total} repositories" end desc 'Index OpenGeoMetadata repositories' - task index: ['earthworks:opengeometadata:setup'] do - system 'rake geocombine:index' # need `system` to pick up ENV vars + task :index do + harvester = Earthworks::Harvester.new(ogm_repos: Settings.OGM_REPOS) + indexer = GeoCombine::Indexer.new + puts "Indexing #{harvester.ogm_path} into #{indexer.solr_url}" + total = indexer.index(harvester.docs_to_index) + puts "Indexed #{total} documents" end - desc 'Run full OpenGeoMetadata indexing pipeline' - task pipeline: ['earthworks:opengeometadata:clone', - 'earthworks:opengeometadata:pull', - 'earthworks:opengeometadata:index'] - desc 'Index content from GeoBlacklight sites' task :harvest_geo_blacklight do GeoCombine::GeoBlacklightHarvester.configure do diff --git a/spec/lib/earthworks/harvester_spec.rb b/spec/lib/earthworks/harvester_spec.rb new file mode 100644 index 00000000..f7943ec0 --- /dev/null +++ b/spec/lib/earthworks/harvester_spec.rb @@ -0,0 +1,77 @@ +# frozen_string_literal: true + +require 'git' +require 'net/http' +require 'earthworks/harvester' +require 'spec_helper' + +RSpec.describe Earthworks::Harvester do + subject(:harvester) { described_class.new(ogm_repos: ogm_repos, ogm_path: ogm_path) } + + let(:ogm_path) { 'tmp/ogm' } + let(:ogm_repos) do + { + 'edu.princeton.arks' => { provenance: 'Princeton' }, + 'edu.psu' => { provenance: 'Penn State' } + } + end + + let(:stub_repo) { instance_double(Git::Base) } + let(:stub_gh_api) do + [ + { name: 'edu.princeton.arks', size: 100 }, + { name: 'edu.psu', size: 100 }, + { name: 'edu.stanford', size: 100 } # not on allowlist (we don't harvest ourselves) + ].to_json + end + + before do + allow(Net::HTTP).to receive(:get).with(described_class.ogm_api_uri).and_return(stub_gh_api) + allow(Git).to receive(:open).and_return(stub_repo) + allow(Git).to receive(:clone).and_return(stub_repo) + allow(stub_repo).to receive(:pull).and_return(stub_repo) + end + + describe '#clone' do + it 'clones only repositories configured in settings' do + expect(Git).to receive(:clone).twice + expect(Git).not_to receive(:clone).with('https://github.com/OpenGeoMetadata/edu.stanford.git') + harvester.clone + end + end + + describe '#pull' do + it 'pulls only repositories configured in settings' do + expect(stub_repo).to receive(:pull).twice + expect(stub_repo).not_to receive(:pull).with('edu.stanford') + harvester.pull + end + end + + describe '#docs_to_index' do + # Provenance value will be transformed by our ogm_repos config + let(:psu_doc) { { dct_provenance_s: 'Pennsylvania State University', geoblacklight_version: '1.0' }.to_json } + let(:psu_path) { "#{ogm_path}/edu.psu/metadata-1.0/Maps/08d-01/geoblacklight.json" } + + # PolicyMap records have placeholder data and should be skipped + let(:policymap_doc) { { dct_provenance_s: 'Geoblacklight', geoblacklight_version: '1.0' }.to_json } + let(:policymap_path) { "#{ogm_path}/shared-repository/gbl-policymap/records/geoblacklight.json" } + + before do + allow(Find).to receive(:find).and_yield(psu_path).and_yield(policymap_path) + allow(File).to receive(:read).with(psu_path).and_return(psu_doc) + allow(File).to receive(:read).with(policymap_path).and_return(policymap_doc) + end + + it 'supports skipping arbitrary records' do + docs = harvester.docs_to_index.to_a + expect(docs.length).to eq(1) + expect(docs.first.last).to eq(psu_path) + end + + it 'supports transforming arbitrary records' do + docs = harvester.docs_to_index.to_a + expect(docs.first.first['dct_provenance_s']).to eq('Penn State') + end + end +end