Skip to content

Commit

Permalink
More work on issues uncovered in #18
Browse files Browse the repository at this point in the history
  • Loading branch information
dshorthouse committed Oct 13, 2023
1 parent a24c2bb commit 99a6595
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 12 deletions.
2 changes: 1 addition & 1 deletion dwc_agent.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Gem::Specification.new do |s|
s.name = 'dwc_agent'
s.version = DwcAgent::Version.version
s.license = 'MIT'
s.date = '2023-10-09'
s.date = '2023-10-12'
s.summary = "Parse Darwin Core agent terms such as recordedBy and identifiedBy"
s.description = "Parses the typically messy content in Darwin Core terms that contain people names"
s.authors = ["David P. Shorthouse"]
Expand Down
9 changes: 4 additions & 5 deletions lib/dwc_agent/constants.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
module DwcAgent
STRIP_OUT = %r{
^[\[{(]|
[\]})]\??$|
(?i:acc\s?\#)|
[,;]?\s*(?i:1st|2nd|3rd|[4-9]th)|
\s*?\d+\.\d+|
Expand All @@ -14,7 +12,7 @@ module DwcAgent
\b[,;]?\s*(?i:etc)\.?|
\b[,;]?\s*(?i:exp)\.?\s*(\b|\z)|
\b[,;]?\s*(?i:aboard)[^$]+|
\b[,;]?\s*(?i:on)\b|
\b[,;]?\s+(?i:on)\b|
\b[,;]?\s*(?i:unkn?own)\b|
\b[,;]?\s*(?i:n/a)\b|
\b[,;]?\s*(?i:ann?onymous)\b|
Expand Down Expand Up @@ -113,8 +111,9 @@ module DwcAgent
(?i:annot)\.?\s*?\b|
\s+(?i:stet)\s*!?\s*\z|
\s+(?i:prep)\.?\s*\z|
\W([({\[].*?[)}\]])|
\W[\(\[\{][A-Za-z]{1,3}$|
([({].*?[)}])|
\s+\[([[:word:]]|[[:space:]]|[-\?\.]){10,}\]|
[\(\{][A-Za-z]{1,3}$|
\b(?i:leg)[\.:]?\s*\b|
(?:[Dd](ed|on))[\.:]|
\d*[A-Za-z]*\d*-\d*\z|
Expand Down
2 changes: 1 addition & 1 deletion lib/dwc_agent/version.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ class Version

MAJOR = 3
MINOR = 0
PATCH = 14
PATCH = 15
BUILD = 0

def self.version
Expand Down
15 changes: 11 additions & 4 deletions spec/dwc_agent/parser_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ module DwcAgent
it "should remove extraneous material" do
input = "Unknown [J. S. Erskine?]"
parsed = parser.parse(input)
expect(parsed.size).to eq(0)
expect(parsed).to eq([])
end

it "should parse name with many given initials" do
Expand All @@ -491,7 +491,7 @@ module DwcAgent
input = "Jack [John] Smith12345"
parsed = parser.parse(input)
expect(parsed.size).to eq(1)
expect(parsed[0].values_at(:given, :family)).to eq(['Jack', 'Smith'])
expect(parsed[0].values_at(:given, :family)).to eq(['Jack John', 'Smith'])
end

it "should explode names with '/'" do
Expand Down Expand Up @@ -1348,8 +1348,8 @@ module DwcAgent
input = "Holm, E (operator).; Ng, J.(netter); Litwiller, S. (netter); Lee, C. (data recorder)"
parsed = parser.parse(input)
expect(parsed.size).to eq(4)
expect(parsed[0].values_at(:given, :family)).to eq(["E.", "Holm"])
expect(parsed[1].values_at(:given, :family)).to eq(["J", "Ng"])
expect(parsed[0].values_at(:given, :family)).to eq(["E", "Holm"])
expect(parsed[1].values_at(:given, :family)).to eq(["J.", "Ng"])
expect(parsed[2].values_at(:given, :family)).to eq(["S.", "Litwiller"])
expect(parsed[3].values_at(:given, :family)).to eq(["C.", "Lee"])
end
Expand Down Expand Up @@ -2073,5 +2073,12 @@ module DwcAgent
expect(parsed[0].values_at(:given, :family)).to eq(["A.", "Breuckner"])
end

# Square brackets that wrap fragments *inside* a single name token
# (editorial insertions such as "Meste[r]há[zy]") are expected to be
# dropped while the enclosed letters are kept, yielding one parsed name.
it "should remove multiple square brackets within parts of name" do
  input = "Attila Meste[r]há[zy]"
  parsed = parser.parse(input)
  expect(parsed.size).to eq(1)
  expect(parsed[0].values_at(:given, :family)).to eq(["Attila", "Mesterházy"])
end

end
end
2 changes: 1 addition & 1 deletion spec/resources/test_data.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ FAH Sperling || [{"family":"Sperling", "given":"F.A.H.", "particle":null, "appel

Chris MacQuarrie || [{"family":"MacQuarrie", "given":"Chris", "particle":null, "appellation": null, "suffix": null, "title": null, "dropping_particle": null, "nick": null}]

Jack [John] Smith12345 || [{"family":"Smith", "given":"Jack", "particle":null, "appellation": null, "suffix": null, "title": null, "dropping_particle": null, "nick": null}]
Jack [John] Smith12345 || [{"family":"Smith", "given":"Jack John", "particle":null, "appellation": null, "suffix": null, "title": null, "dropping_particle": null, "nick": null}]

O.Bennedict/G.J. Spencer || [{"family":"Bennedict", "given":"O.", "particle":null, "appellation": null, "suffix": null, "title": null, "dropping_particle": null, "nick": null}, {"family":"Spencer", "given":"G.J.", "particle":null, "appellation": null, "suffix": null, "title": null, "dropping_particle": null, "nick": null}]

Expand Down

0 comments on commit 99a6595

Please sign in to comment.