Skip to content

Commit

Permalink
Permit passing of options to override regex
Browse files Browse the repository at this point in the history
  • Loading branch information
dshorthouse committed Nov 8, 2024
1 parent adf71a8 commit 0e5a6ff
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 35 deletions.
2 changes: 1 addition & 1 deletion dwc_agent.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Gem::Specification.new do |s|
s.name = 'dwc_agent'
s.version = DwcAgent::Version.version
s.license = 'MIT'
s.date = '2024-11-07'
s.date = '2024-11-08'
s.summary = "Parse Darwin Core agent terms such as recordedBy and identifiedBy"
s.description = "Parses the typically messy content in Darwin Core terms that contain people names"
s.authors = ["David P. Shorthouse"]
Expand Down
30 changes: 19 additions & 11 deletions lib/dwc_agent/cleaner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,25 @@ module DwcAgent

class Cleaner

# Class-level default options for Cleaner. Each key is read through
# options[...] by the cleaning code visible in this diff:
#   :blacklist        - matched against a name's display_order with =~
#   :given_blacklist  - compared case-insensitively (casecmp) to the given name
#   :family_blacklist - compared case-insensitively (casecmp) to the family name
#   :particles        - checked with include? against the downcased particle
# The constants themselves are defined outside this view.
@defaults = {
blacklist: BLACKLIST,
given_blacklist: GIVEN_BLACKLIST,
family_blacklist: FAMILY_BLACKLIST,
particles: PARTICLES
}

class << self
# Reader for the class-level @defaults hash; returns the hash itself, not a copy.
attr_reader :defaults

# Returns the Cleaner stored under Thread.current[:dwc_agent_cleaner],
# creating it on first use. The instance is built with `new` and no
# arguments, so it carries no caller-supplied overrides.
# NOTE(review): Thread#[] storage is fiber-local in Ruby; confirm that is
# the intended caching scope.
def instance
Thread.current[:dwc_agent_cleaner] ||= new
end
end

def initialize
@blacklist = BLACKLIST
@given_blacklist = GIVEN_BLACKLIST
@family_blacklist = FAMILY_BLACKLIST
@particles = PARTICLES
# The merged option hash this cleaner was constructed with.
attr_reader :options

# Builds a cleaner whose options are the class-level defaults with the
# caller's entries merged on top (caller keys win).
#
# @param options [Hash] overrides for any key present in the class defaults
def initialize(options = {})
# Hash#merge returns a new hash, so the class-level defaults hash is not
# modified here; the values it holds are still shared, not copied.
@options = self.class.defaults.merge(options)
end

def default
Expand All @@ -35,7 +43,7 @@ def clean(parsed_namae)
end

if parsed_namae.given &&
@given_blacklist.any?{ |s| s.casecmp(parsed_namae.given) == 0 }
options[:given_blacklist].any?{ |s| s.casecmp(parsed_namae.given) == 0 }
return
end

Expand All @@ -55,7 +63,7 @@ def clean(parsed_namae)
return default
end

if parsed_namae.display_order =~ @blacklist
if parsed_namae.display_order =~ options[:blacklist]
return default
end

Expand Down Expand Up @@ -113,7 +121,7 @@ def clean(parsed_namae)
end

if parsed_namae.family &&
@family_blacklist.any?{ |s| s.casecmp(parsed_namae.family) == 0 }
options[:family_blacklist].any?{ |s| s.casecmp(parsed_namae.family) == 0 }
return default
end

Expand All @@ -140,7 +148,7 @@ def clean(parsed_namae)
if !family.nil? &&
given.nil? &&
!particle.nil? &&
!@particles.include?(particle.downcase)
!options[:particles].include?(particle.downcase)
given = particle.sub(/[a-z]\./, &:upcase).sub(/^(.)/) { $1.capitalize }
particle = nil
end
Expand All @@ -161,11 +169,11 @@ def clean(parsed_namae)
return default
end

if !family.nil? && @family_blacklist.any?{ |s| s.casecmp(family) == 0 }
if !family.nil? && options[:family_blacklist].any?{ |s| s.casecmp(family) == 0 }
return default
end

if !given.nil? && @given_blacklist.any?{ |s| s.casecmp(given) == 0 }
if !given.nil? && options[:given_blacklist].any?{ |s| s.casecmp(given) == 0 }
return default
end

Expand Down
48 changes: 27 additions & 21 deletions lib/dwc_agent/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,33 @@ module DwcAgent

class Parser

# Class-level default options for Parser. The regex and separator entries are
# built once, when the class body is evaluated, from constants defined outside
# this view.
@defaults = {
prefer_comma_as_separator: true,
separator: SPLIT_BY,
title: TITLE,
appellation: APPELLATION,
suffix: SUFFIX,
strip_out_regex: Regexp.new(STRIP_OUT.to_s),
tidy_remains_regex: Regexp.new(POST_STRIP_TIDY.to_s),
# NOTE(review): Array#to_s is Array#inspect, so the pattern source is the
# inspected array (e.g. ["abc"]) -- a character class that also contains the
# double-quote character and any escapes inspect adds. Confirm this is intended
# before simplifying it.
char_subs_regex: Regexp.new([CHAR_SUBS.keys.join].to_s),
# Alternation of the literal (escaped) PHRASE_SUBS keys.
phrase_subs_regex: Regexp.new(PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s),
# SPLIT_BY followed by optional whitespace anchored at the end of the string.
residual_terminators_regex: Regexp.new(SPLIT_BY.to_s + %r{\s*\z}.to_s),
# Array of [Regexp, replacement] pairs built from SEPARATORS.
separators: SEPARATORS.map{|k,v| [ Regexp.new(k), v] }
}

class << self
# Reader for the class-level @defaults hash; returns the hash itself, not a copy.
attr_reader :defaults

# Returns the Parser stored under Thread.current[:dwc_agent_parser],
# creating it on first use. The instance is built with `new` and no
# arguments, so it carries no caller-supplied overrides.
# NOTE(review): Thread#[] storage is fiber-local in Ruby; confirm that is
# the intended caching scope.
def instance
Thread.current[:dwc_agent_parser] ||= new
end
end

def initialize
options = {
prefer_comma_as_separator: true,
separator: SPLIT_BY,
title: TITLE,
appellation: APPELLATION,
suffix: SUFFIX
}
@namae = Namae::Parser.new(options)
@strip_out_regex = Regexp.new STRIP_OUT.to_s
@tidy_remains_regex = Regexp.new POST_STRIP_TIDY.to_s
@char_subs_regex = Regexp.new [CHAR_SUBS.keys.join].to_s
@phrase_subs_regex = Regexp.new PHRASE_SUBS.keys.map{|a| Regexp.escape a }.join('|').to_s
@residual_terminators_regex = Regexp.new SPLIT_BY.to_s + %r{\s*\z}.to_s
@separators = SEPARATORS.map{|k,v| [ Regexp.new(k), v] }
# options: the merged option hash; namae: the underlying Namae::Parser.
attr_reader :options, :namae

# Builds a parser whose options are the class-level defaults with the
# caller's entries merged on top (caller keys win).
#
# @param options [Hash] overrides for any key present in the class defaults
def initialize(options = {})
@options = self.class.defaults.merge(options)
# The entire merged hash, including the *_regex and :separators entries,
# is handed to Namae.
# NOTE(review): confirm Namae::Parser tolerates keys it does not recognise.
@namae = Namae::Parser.new(@options)
end

# Parses the passed-in string and returns a list of names.
Expand All @@ -31,14 +37,14 @@ def initialize
# @return [Array] the list of parsed names
def parse(name)
# nil and "" return an empty list before any substitution runs.
return [] if name.nil? || name == ""
# NOTE(review): this hunk appears to show both sides of the diff without +/-
# markers -- the @ivar-based lines and the options[...]-based lines look like
# alternatives (old vs. new), not sequential steps.
# All gsub!/squeeze!/strip! calls below modify the caller's string in place;
# NOTE(review): a frozen argument would raise FrozenError -- confirm callers
# never pass one.
name.gsub!(@strip_out_regex, ' ')
name.gsub!(@tidy_remains_regex, '')
name.gsub!(Regexp.union(@char_subs_regex, @phrase_subs_regex), CHAR_SUBS.merge(PHRASE_SUBS))
@separators.each{|k| name.gsub!(k[0], k[1])}
name.gsub!(@residual_terminators_regex, '')
name.gsub!(options[:strip_out_regex], ' ')
name.gsub!(options[:tidy_remains_regex], '')
# Replacement text is looked up in the CHAR_SUBS/PHRASE_SUBS constants even when
# the two regexes are overridden via options; a match with no key in the merged
# hash is replaced with an empty string.
name.gsub!(Regexp.union(options[:char_subs_regex], options[:phrase_subs_regex]), CHAR_SUBS.merge(PHRASE_SUBS))
# Each separator entry is a [pattern, replacement] pair.
options[:separators].each{|k| name.gsub!(k[0], k[1])}
name.gsub!(options[:residual_terminators_regex], '')
# Collapse runs of spaces and trim the ends before handing off to Namae.
name.squeeze!(' ')
name.strip!
@namae.parse(name)
namae.parse(name)
end

end
Expand Down
4 changes: 2 additions & 2 deletions lib/dwc_agent/version.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ module DwcAgent
class Version

MAJOR = 3
MINOR = 2
PATCH = 1
MINOR = 3
PATCH = 0
BUILD = 0

def self.version
Expand Down

0 comments on commit 0e5a6ff

Please sign in to comment.