Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
nooraangelva committed Jul 26, 2022
1 parent a5d3e2a commit c00e5ce
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 48 deletions.
58 changes: 23 additions & 35 deletions inspire_utils/grobid_authors_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,18 @@
# or submit itself to any jurisdiction.

from __future__ import absolute_import, division, print_function

from inspire_schemas.builders import LiteratureBuilder
from parsel import Selector


class GrobidAuthors(object):
def __init__(self, xml_text):
    """Parse a GROBID XML response and select its usable authors.

    Args:
        xml_text: the XML document, either as text or as UTF-8 encoded
            bytes (bytes are decoded before parsing).
    """
    # Callers may hand over the raw bytes payload; the selector below is
    # built from text, so decode first.
    if isinstance(xml_text, bytes):
        xml_text = xml_text.decode('utf-8')

    self._xml = Selector(text=xml_text, type="xml")
    # Drop namespaces so the XPath below can use bare element names.
    self._xml.remove_namespaces()
    # Keep only <author> nodes whose surname is non-empty after whitespace
    # normalisation.
    self._parsed_authors = self._xml.xpath(
        "//author[persName/surname[string-length(normalize-space()) > 0]]"
    )
    # Created on demand by parse_one().
    self._builder = None

def __getitem__(self, item):
Expand All @@ -50,12 +46,12 @@ def parse_one(self):
self._builder = LiteratureBuilder()
for author in self:
yield {
"author": self._builder.make_author(
'author': self._builder.make_author(
full_name=author.fullname,
raw_affiliations=author.raw_affiliations,
emails=author.emails,
),
"parsed_affiliations": author.processed_affiliations,
'parsed_affiliations': author.processed_affiliations
}

def parse_all(self):
Expand All @@ -78,7 +74,7 @@ def _extract(source, path, type=None, text=False):
return source.xpath(path)

@classmethod
def _extract_string(cls, source, path, type=None, join_char=u' '):
    """Return the text found at ``path`` under ``source`` as one string.

    Every extracted text fragment is stripped of surrounding whitespace
    and the fragments are joined with ``join_char``.

    Args:
        source: selector to search in.
        path: XPath expression, forwarded to ``_extract``.
        type: optional type filter, forwarded to ``_extract``.
        join_char: separator placed between the extracted fragments.
    """
    fragments = cls._extract(source, path, type, text=True).getall()
    return join_char.join(fragment.strip() for fragment in fragments)
Expand All @@ -90,16 +86,14 @@ def _extract_strings_list(cls, source, path, type=None):

@staticmethod
def _build_address(street, city, post_code, country):
address_list = [
element for element in [street, city, post_code, country] if element
]
address = {"postal_address": ", ".join(address_list)} if address_list else {}
address_list = [element for element in [street, city, post_code, country] if element]
address = {"postal_address": ', '.join(address_list)} if address_list else {}
if city:
address["cities"] = [city]
address['cities'] = [city]
if post_code:
address["postal_code"] = post_code
address['postal_code'] = post_code
if country:
address["country"] = country
address['country'] = country
return address

@property
Expand All @@ -112,13 +106,11 @@ def lastname(self):

@property
def fullname(self):
    """Last name and names joined by a comma (no space after it)."""
    return u",".join([self.lastname, self.names])

@property
def raw_affiliations(self):
    """Strings extracted from ``affiliation/note`` with type ``raw_affiliation``."""
    return self._extract_strings_list(
        self._author, "affiliation/note", type="raw_affiliation"
    )

@property
def emails(self):
Expand All @@ -129,25 +121,21 @@ def processed_affiliations(self):
affiliations = []
for affiliation in self._extract(self._author, "affiliation"):
affiliation_obj = {}
name = self._extract_string(
affiliation, "orgName", type="institution", join_char=", "
)
department = self._extract_strings_list(
affiliation, "orgName", type="department"
)

street = self._extract_string(affiliation, "address/addrLine")
settlement = self._extract_string(affiliation, "address/settlement")
post_code = self._extract_string(affiliation, "address/post_code")
country = self._extract_string(affiliation, "address/country")
name = self._extract_string(affiliation, "orgName", type="institution", join_char=', ')
department = self._extract_strings_list(affiliation, "orgName", type="department")

street = self._extract_string(affiliation, 'address/addrLine')
settlement = self._extract_string(affiliation, 'address/settlement')
post_code = self._extract_string(affiliation, 'address/post_code')
country = self._extract_string(affiliation, 'address/country')

address = self._build_address(street, settlement, post_code, country)

if name:
affiliation_obj["name"] = name
affiliation_obj['name'] = name
if department:
affiliation_obj["department"] = department
affiliation_obj['department'] = department
if address:
affiliation_obj["address"] = address
affiliation_obj['address'] = address
affiliations.append(affiliation_obj)
return affiliations or None
2 changes: 1 addition & 1 deletion tests/fixtures/grobid_empty_author_doc.xml
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,4 @@
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
</TEI>
2 changes: 1 addition & 1 deletion tests/fixtures/grobid_full_doc.xml
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
</TEI>
2 changes: 1 addition & 1 deletion tests/fixtures/grobid_incomplete_doc.xml
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
</TEI>
2 changes: 1 addition & 1 deletion tests/fixtures/grobid_no_authors_doc.xml
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
</TEI>
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@


def test_process_grobid_authors():
encoding = 'utf-8'

grobid_response = pkg_resources.resource_string(
__name__,
os.path.join(
Expand Down Expand Up @@ -107,20 +107,21 @@ def test_process_grobid_authors():

expected_authors_count = len(expected_authors)

authors = GrobidAuthors(str(grobid_response, encoding))
authors = GrobidAuthors(grobid_response)
assert len(authors) == expected_authors_count
assert authors.parse_all() == expected_authors


def test_grobid_incomplete_authors():
encoding = 'utf-8'

grobid_response = pkg_resources.resource_string(
__name__,
os.path.join(
'fixtures',
'grobid_incomplete_doc.xml'
)
)

expected_authors = [
{"parsed_affiliations": None, "author": {"full_name": u"Nandi"}},
{
Expand Down Expand Up @@ -161,38 +162,40 @@ def test_grobid_incomplete_authors():
]

expected_authors_count = len(expected_authors)
authors = GrobidAuthors(str(grobid_response, encoding))
authors = GrobidAuthors(grobid_response)
assert len(authors) == expected_authors_count
assert authors.parse_all() == expected_authors


def test_grobid_no_authors():
    """A document without authors yields an empty parse result."""
    # The fixture is handed to GrobidAuthors exactly as loaded, with no
    # decoding step in the test.
    grobid_response = pkg_resources.resource_string(
        __name__,
        os.path.join(
            'fixtures',
            'grobid_no_authors_doc.xml'
        )
    )

    expected_authors = []
    expected_authors_count = 0
    authors = GrobidAuthors(grobid_response)
    assert len(authors) == expected_authors_count
    assert authors.parse_all() == expected_authors


def test_grobid_empty_author():
    """Only the two authors listed in ``expected_authors`` are parsed."""
    # Load the fixture dedicated to this scenario (not the "no authors"
    # one) and pass it to GrobidAuthors exactly as loaded.
    grobid_response = pkg_resources.resource_string(
        __name__,
        os.path.join(
            'fixtures',
            'grobid_empty_author_doc.xml'
        )
    )

    # NOTE(review): '[email protected]' looks like an e-mail-obfuscation
    # placeholder rather than a real address -- confirm against the fixture.
    expected_authors = [
        {'parsed_affiliations': None, 'author': {'full_name': u'Abc, Xyz'}},
        {
            'parsed_affiliations': None,
            'author': {'emails': [u'[email protected]'], 'full_name': u'Yzc'},
        },
    ]
    expected_authors_count = 2
    authors = GrobidAuthors(grobid_response)
    assert len(authors) == expected_authors_count
    assert authors.parse_all() == expected_authors

0 comments on commit c00e5ce

Please sign in to comment.