From dfa8215910b6e1798898c1240767fc2d7d614f6c Mon Sep 17 00:00:00 2001 From: David Shorthouse Date: Sat, 29 Jun 2024 09:56:08 -0400 Subject: [PATCH] Strip out more characters --- dwc_agent.gemspec | 2 +- lib/dwc_agent/constants.rb | 4 +++- lib/dwc_agent/version.rb | 2 +- spec/dwc_agent/parser_spec.rb | 20 ++++++++++++++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/dwc_agent.gemspec b/dwc_agent.gemspec index fd2742e..3902d5f 100644 --- a/dwc_agent.gemspec +++ b/dwc_agent.gemspec @@ -7,7 +7,7 @@ Gem::Specification.new do |s| s.name = 'dwc_agent' s.version = DwcAgent::Version.version s.license = 'MIT' - s.date = '2024-06-17' + s.date = '2024-06-30' s.summary = "Parse Darwin Core agent terms such as recordedBy and identifiedBy" s.description = "Parses the typically messy content in Darwin Core terms that contain people names" s.authors = ["David P. Shorthouse"] diff --git a/lib/dwc_agent/constants.rb b/lib/dwc_agent/constants.rb index f865ca7..1d339e1 100644 --- a/lib/dwc_agent/constants.rb +++ b/lib/dwc_agent/constants.rb @@ -1,6 +1,8 @@ module DwcAgent STRIP_OUT = %r{ (?i:acc\s?\#)| + ["'-]{2,}| + \-\.\s| [,;]?\s*(?i:1st|2nd|3rd|[4-9]th)| \s*?\d+\.\d+| \b\d+\(?(?i:[[:alpha:]])\)?\b| @@ -103,7 +105,7 @@ module DwcAgent (?i:no\s+coll\.?(ector)?)| (?i:not?)\s+(?i:name|date|details?|specific)?\s*?(?i:given|name|date|noted)| (?i:non?)\s+(?i:specificato)| - \b[,;]\s+\d+\z| + \b[,;]\s+\d+\.?\z| [!@?]| [,]?\d+| \s+\d+?(\/|\.)?(?i:i|ii|iii|iv|v|vi|vii|viii|ix|x)(\/|\.)\d+| diff --git a/lib/dwc_agent/version.rb b/lib/dwc_agent/version.rb index 4870105..0f79c78 100644 --- a/lib/dwc_agent/version.rb +++ b/lib/dwc_agent/version.rb @@ -4,7 +4,7 @@ class Version MAJOR = 3 MINOR = 1 - PATCH = 2 + PATCH = 3 BUILD = 0 def self.version diff --git a/spec/dwc_agent/parser_spec.rb b/spec/dwc_agent/parser_spec.rb index fcb0a23..a443e5c 100644 --- a/spec/dwc_agent/parser_spec.rb +++ b/spec/dwc_agent/parser_spec.rb @@ -2143,5 +2143,25 @@ module DwcAgent expect(parsed[2].values_at(:given, :family)).to eq(["West", nil]) end + it "should ignore exclamation marks and period at end" do + input = '! L.P. Kvist, 1996.' + parsed = parser.parse(input) + expect(parsed.size).to eq(1) + expect(parsed[0].values_at(:given, :family)).to eq(["L.P.", "Kvist"]) + end + + it "should strip out multiple quotes" do + input = '"""Antonio Bausa"""' + parsed = parser.parse(input) + expect(parsed.size).to eq(1) + expect(parsed[0].values_at(:given, :family)).to eq(["Antonio", "Bausa"]) + end + + it "should strip out double dashes and extra punctuation" do + input = "-- Luke & -. Robertson" + parsed = parser.parse(input) + expect(parsed[0].values_at(:given, :family)).to eq(["Luke", nil]) + end + end end \ No newline at end of file