Skip to content

Commit

Permalink
Merge pull request #956 from sul-dlss/custom-harvester
Browse files Browse the repository at this point in the history
Implement custom harvester for OpenGeoMetadata records
  • Loading branch information
thatbudakguy authored Mar 29, 2023
2 parents 7dc9ea6 + a209139 commit df4c1b9
Show file tree
Hide file tree
Showing 8 changed files with 272 additions and 58 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/ruby.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ jobs:
ruby: ['3.1', '3.2']
steps:
- uses: actions/checkout@v3
# required to avoid https://github.com/actions/runner-images/issues/37
# because faraday depends on patron, which requires curl headers to build
- name: Install cURL Headers
run: sudo apt-get update && sudo apt-get install libcurl4-openssl-dev
- name: Set up Ruby
uses: ruby/setup-ruby@v1
with:
Expand Down
6 changes: 5 additions & 1 deletion .rubocop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ RSpec/DescribeClass:
- 'spec/features/**/*'
- 'spec/integration/**/*'

RSpec/MultipleMemoizedHelpers:
Exclude:
- 'spec/lib/earthworks/harvester_spec.rb'

Gemspec/DeprecatedAttributeAssignment: # new in 1.30
Enabled: true
Gemspec/RequireMFA: # new in 1.23
Expand Down Expand Up @@ -338,4 +342,4 @@ Rails/WhereMissing: # new in 2.16
Rails/WhereNot: # new in 2.8
Enabled: true
Rails/WhereNotWithMultipleConditions: # new in 2.17
Enabled: true
Enabled: true
3 changes: 2 additions & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ group :development do
# Display performance information such as SQL time and flame graphs for each request in your browser.
# Can be configured to work on production as well see: https://github.com/MiniProfiler/rack-mini-profiler/blob/master/README.md
gem 'rack-mini-profiler', '~> 2.0'
gem 'debug'
end

group :development, :test do
Expand Down Expand Up @@ -74,7 +75,7 @@ gem 'newrelic_rpm'
gem 'twitter-typeahead-rails'
gem 'blacklight_range_limit', '~> 7.0'
gem 'redis', '~> 5.0'
gem 'geo_combine'
gem 'geo_combine', github: 'OpenGeoMetadata/GeoCombine'
gem 'geo_monitor', '~> 0.7', github: 'geoblacklight/geo_monitor'
gem 'sidekiq', '~> 7.0'
gem 'whenever', require: false
Expand Down
75 changes: 49 additions & 26 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
GIT
remote: https://github.com/OpenGeoMetadata/GeoCombine.git
revision: c2c54279da4be9ef6105d5a8bf3fd30d552aa8ad
specs:
geo_combine (0.7.0)
activesupport
faraday-net_http_persistent (~> 2.0)
git
json-schema
nokogiri
rsolr
sanitize
thor

GIT
remote: https://github.com/geoblacklight/geo_monitor.git
revision: b4cad62d8bdd43c42e5f31f56fe62e666c5f03d2
Expand Down Expand Up @@ -154,6 +168,9 @@ GEM
database_cleaner-core (~> 2.0.0)
database_cleaner-core (2.0.1)
date (3.3.3)
debug (1.7.1)
irb (>= 1.5.0)
reline (>= 0.3.1)
deep_merge (1.2.2)
deprecation (1.1.0)
activesupport
Expand Down Expand Up @@ -215,27 +232,31 @@ GEM
factory_bot_rails (6.2.0)
factory_bot (~> 6.2.0)
railties (>= 5.0.0)
faraday (1.10.3)
faraday (1.9.0)
faraday-em_http (~> 1.0)
faraday-em_synchrony (~> 1.0)
faraday-excon (~> 1.1)
faraday-httpclient (~> 1.0)
faraday-httpclient (< 3)
faraday-multipart (~> 1.0)
faraday-net_http (~> 1.0)
faraday-net_http_persistent (~> 1.0)
faraday-patron (~> 1.0)
faraday-net_http (< 3)
faraday-net_http_persistent (< 3)
faraday-patron (< 3)
faraday-rack (~> 1.0)
faraday-retry (~> 1.0)
ruby2_keywords (>= 0.0.4)
faraday-em_http (1.0.0)
faraday-em_synchrony (1.0.0)
faraday-excon (1.1.0)
faraday-httpclient (1.0.1)
faraday-httpclient (2.0.1)
httpclient (>= 2.2)
faraday-multipart (1.0.4)
multipart-post (~> 2)
faraday-net_http (1.0.1)
faraday-net_http_persistent (1.2.0)
faraday-patron (1.0.0)
faraday-net_http (2.1.0)
faraday-net_http_persistent (2.0.2)
faraday-net_http (< 3)
net-http-persistent (~> 4.0)
faraday-patron (2.0.1)
patron (>= 0.4.2)
faraday-rack (1.0.0)
faraday-retry (1.0.3)
faraday_middleware (1.0.0)
Expand All @@ -244,14 +265,6 @@ GEM
ffi-compiler (1.0.1)
ffi (>= 1.0.0)
rake
geo_combine (0.6.0)
activesupport
json-schema
net-http-persistent (~> 2.0)
nokogiri
rsolr
sanitize
thor
geoblacklight (3.7.0)
blacklight (~> 7.8)
coderay
Expand All @@ -264,6 +277,9 @@ GEM
mime-types
rails (>= 5.2.4, < 7.1)
rgeo-geojson
git (1.18.0)
addressable (~> 2.8)
rchardet (~> 1.8)
globalid (1.1.0)
activesupport (>= 5.0)
handlebars_assets (0.23.9)
Expand All @@ -280,8 +296,12 @@ GEM
http-cookie (1.0.5)
domain_name (~> 0.5)
http-form_data (2.3.0)
httpclient (2.8.3)
i18n (1.12.0)
concurrent-ruby (~> 1.0)
io-console (0.6.0)
irb (1.6.3)
reline (>= 0.3.0)
jbuilder (2.11.5)
actionview (>= 5.0.0)
activesupport (>= 5.0.0)
Expand Down Expand Up @@ -322,12 +342,12 @@ GEM
mime-types-data (~> 3.2015)
mime-types-data (3.2023.0218.1)
mini_mime (1.1.2)
mini_portile2 (2.8.1)
minitar (0.9)
minitest (5.18.0)
msgpack (1.6.1)
multipart-post (2.3.0)
net-http-persistent (2.9.4)
net-http-persistent (4.0.1)
connection_pool (~> 2.2)
net-imap (0.3.4)
date
net-protocol
Expand All @@ -342,8 +362,7 @@ GEM
net-ssh (7.1.0)
newrelic_rpm (9.0.0)
nio4r (2.5.8)
nokogiri (1.14.2)
mini_portile2 (~> 2.8.0)
nokogiri (1.14.2-x86_64-darwin)
racc (~> 1.4)
nokogiri (1.14.2-x86_64-linux)
racc (~> 1.4)
Expand All @@ -353,6 +372,7 @@ GEM
parallel (1.22.1)
parser (3.2.1.1)
ast (~> 2.4.1)
patron (0.13.3)
pg (1.4.6)
popper_js (1.16.1)
public_suffix (5.0.1)
Expand Down Expand Up @@ -394,13 +414,16 @@ GEM
zeitwerk (~> 2.5)
rainbow (3.1.1)
rake (13.0.6)
rchardet (1.8.0)
recaptcha (5.12.3)
json
redis (5.0.6)
redis-client (>= 0.9.0)
redis-client (0.14.0)
connection_pool
regexp_parser (2.7.0)
reline (0.3.2)
io-console (~> 0.5)
responders (3.1.0)
actionpack (>= 5.2)
railties (>= 5.2)
Expand Down Expand Up @@ -494,8 +517,7 @@ GEM
actionpack (>= 5.2)
activesupport (>= 5.2)
sprockets (>= 3.0.0)
sqlite3 (1.6.2)
mini_portile2 (~> 2.8.0)
sqlite3 (1.6.2-x86_64-darwin)
sqlite3 (1.6.2-x86_64-linux)
sshkit (1.21.4)
net-scp (>= 1.1.2)
Expand Down Expand Up @@ -539,7 +561,7 @@ GEM
zeitwerk (2.6.7)

PLATFORMS
ruby
x86_64-darwin-20
x86_64-linux

DEPENDENCIES
Expand All @@ -556,12 +578,13 @@ DEPENDENCIES
capistrano-shared_configs
capybara
database_cleaner
debug
devise
devise-guests (>= 0.3.3)
devise-remote-user
dlss-capistrano
factory_bot_rails
geo_combine
geo_combine!
geo_monitor (~> 0.7)!
geoblacklight (~> 3.7)
honeybadger
Expand Down Expand Up @@ -596,4 +619,4 @@ DEPENDENCIES
whenever

BUNDLED WITH
2.3.20
2.4.7
60 changes: 60 additions & 0 deletions config/settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -202,3 +202,63 @@ GEO_BLACKLIGHT_HARVEST_SITES:
f:
dct_provenance_s:
- MIT

# Repositories listed here will be harvested and indexed by rake tasks. See:
# https://github.com/OpenGeoMetadata
# If supplied, "provenance" value overrides the dct_provenance_s in the record
# This creates a more consistent display in facets, etc.
OGM_REPOS:
edu.nyu: # includes records from both NYU and Baruch CUNY
shared-repository: # multi-institutional
edu.princeton.arks:
provenance: Princeton
edu.mit:
provenance: MIT
edu.harvard:
provenance: Harvard
edu.columbia:
provenance: Columbia
edu.tufts:
provenance: Tufts
edu.virginia:
provenance: Virginia
edu.umich:
provenance: Michigan
edu.wisc:
provenance: Wisconsin
edu.umn:
provenance: Minnesota
edu.berkeley:
provenance: Berkeley
edu.cornell:
provenance: Cornell
edu.uiowa:
provenance: Iowa
edu.indiana:
provenance: Indiana
edu.purdue:
provenance: Purdue
edu.illinois:
provenance: Illinois
edu.msu:
provenance: Michigan State
edu.umd:
provenance: Maryland
edu.rutgers:
provenance: Rutgers
edu.psu:
provenance: Penn State
edu.osu:
provenance: Ohio State
edu.uchicago:
provenance: Chicago
edu.unl:
provenance: Nebraska
edu.colostate:
provenance: Colorado State
edu.gmu:
provenance: George Mason
edu.uarizona:
provenance: Arizona
edu.vt:
provenance: Virginia Tech
55 changes: 55 additions & 0 deletions lib/earthworks/harvester.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
require 'geo_combine/harvester'

# A custom OpenGeoMetadata harvester that lets us limit repositories and transform metadata
module Earthworks
  # Extends GeoCombine::Harvester so that only configured repositories are
  # harvested, known-bad records are skipped, and selected fields are
  # normalized before records are handed to the indexer.
  class Harvester < GeoCombine::Harvester
    attr_reader :ogm_repos

    # Support passing in a configured list of repositories to harvest.
    #
    # @param ogm_repos [Hash, nil] repository name => options. Options may be
    #   nil or a hash with a :provenance (or 'provenance') entry. When nil, no
    #   repository filtering or provenance rewriting is applied.
    # @param kwargs remaining keyword arguments, forwarded unchanged to
    #   GeoCombine::Harvester#initialize
    #
    # The default used to be ENV.fetch('OGM_REPOS'), which could never work:
    # it raised KeyError when the variable was unset and returned a String
    # (which has no #transform_keys) when it was set.
    def initialize(ogm_repos: nil, **kwargs)
      super(**kwargs)

      # Repository names are compared as strings, so normalize symbol keys.
      @ogm_repos = ogm_repos&.transform_keys(&:to_s)
    end

    # Support skipping and transforming arbitrary records prior to indexing.
    #
    # Yields [record, path] for every record the superclass yields, except
    # those rejected by #skip_record?; each yielded record has been passed
    # through #transform_record. Returns an Enumerator when no block is given.
    def docs_to_index
      return to_enum(:docs_to_index) unless block_given?

      super do |record, path|
        next if skip_record?(record, path)

        yield transform_record(record, path), path
      end
    end

    private

    # Some records have placeholder data or are otherwise problematic, but we
    # can't denylist them at the institution/repository level.
    def skip_record?(_record, path)
      # Skip PolicyMap records in shared-repository; they have placeholder data
      # See https://github.com/OpenGeoMetadata/shared-repository/tree/master/gbl-policymap
      record_repo(path) == 'shared-repository' && path.include?('gbl-policymap')
    end

    # We transform some records in order to get more consistent metadata display
    # in Earthworks, especially for facets.
    #
    # Mutates and returns the given record.
    def transform_record(record, path)
      # Transform provenance to a shorter, consistent value based on the repository
      provenance = configured_provenance(record_repo(path))
      record.update({ 'dct_provenance_s' => provenance }) if provenance

      record
    end

    # Look up the provenance override configured for a repository, if any.
    # Accepts both symbol and string option keys so that the configuration
    # works regardless of how it was loaded. Returns nil when there is no
    # configuration, no entry for the repository, or no provenance option.
    def configured_provenance(repo)
      return unless @ogm_repos

      @ogm_repos.dig(repo, :provenance) || @ogm_repos.dig(repo, 'provenance')
    end

    # Get the name of the repository the record came from, i.e. the first
    # path segment that follows @ogm_path. Returns nil if there is none.
    #
    # NOTE(review): @ogm_path is not assigned in this class; presumably the
    # superclass sets it -- confirm against GeoCombine::Harvester.
    def record_repo(path)
      relative = path.split(@ogm_path).last.to_s

      # Ignore empty segments so the result does not depend on whether
      # @ogm_path was configured with a trailing slash.
      relative.split('/').find { |segment| !segment.empty? }
    end

    # Only harvest configured repositories, if configuration was provided
    def repositories
      @repositories ||= @ogm_repos ? super.compact.select { |repo| @ogm_repos.key?(repo) } : super
    end
  end
end
Loading

0 comments on commit df4c1b9

Please sign in to comment.