diff --git a/inspire_utils/grobid_authors_parser.py b/inspire_utils/grobid_authors_parser.py
index dc030ca..66f8ee8 100644
--- a/inspire_utils/grobid_authors_parser.py
+++ b/inspire_utils/grobid_authors_parser.py
@@ -21,22 +21,18 @@
# or submit itself to any jurisdiction.
from __future__ import absolute_import, division, print_function
-
from inspire_schemas.builders import LiteratureBuilder
from parsel import Selector
class GrobidAuthors(object):
def __init__(self, xml_text):
- if isinstance(xml_text, str):
+ if isinstance(xml_text, bytes):
xml_text = xml_text.decode('utf-8')
- xml_text.t
- self._xml = Selector(text=xml_text, type="xml")
+ self._xml = Selector(text=xml_text, type="xml")
self._xml.remove_namespaces()
- self._parsed_authors = self._xml.xpath(
- "//author[persName/surname[string-length(normalize-space()) > 0]]"
- )
+ self._parsed_authors = self._xml.xpath("//author[persName/surname[string-length(normalize-space()) > 0]]")
self._builder = None
def __getitem__(self, item):
@@ -50,12 +46,12 @@ def parse_one(self):
self._builder = LiteratureBuilder()
for author in self:
yield {
- "author": self._builder.make_author(
+ 'author': self._builder.make_author(
full_name=author.fullname,
raw_affiliations=author.raw_affiliations,
emails=author.emails,
),
- "parsed_affiliations": author.processed_affiliations,
+ 'parsed_affiliations': author.processed_affiliations
}
def parse_all(self):
@@ -78,7 +74,7 @@ def _extract(source, path, type=None, text=False):
return source.xpath(path)
@classmethod
- def _extract_string(cls, source, path, type=None, join_char=" "):
+ def _extract_string(cls, source, path, type=None, join_char=u' '):
data = cls._extract(source, path, type, text=True).getall()
data = [text.strip() for text in data]
return join_char.join(data)
@@ -90,16 +86,14 @@ def _extract_strings_list(cls, source, path, type=None):
@staticmethod
def _build_address(street, city, post_code, country):
- address_list = [
- element for element in [street, city, post_code, country] if element
- ]
- address = {"postal_address": ", ".join(address_list)} if address_list else {}
+ address_list = [element for element in [street, city, post_code, country] if element]
+ address = {"postal_address": ', '.join(address_list)} if address_list else {}
if city:
- address["cities"] = [city]
+ address['cities'] = [city]
if post_code:
- address["postal_code"] = post_code
+ address['postal_code'] = post_code
if country:
- address["country"] = country
+ address['country'] = country
return address
@property
@@ -112,13 +106,11 @@ def lastname(self):
@property
def fullname(self):
- return ",".join([self.lastname, self.names])
+ return u",".join([self.lastname, self.names])
@property
def raw_affiliations(self):
- return self._extract_strings_list(
- self._author, "affiliation/note", type="raw_affiliation"
- )
+ return self._extract_strings_list(self._author, "affiliation/note", type="raw_affiliation")
@property
def emails(self):
@@ -129,25 +121,21 @@ def processed_affiliations(self):
affiliations = []
for affiliation in self._extract(self._author, "affiliation"):
affiliation_obj = {}
- name = self._extract_string(
- affiliation, "orgName", type="institution", join_char=", "
- )
- department = self._extract_strings_list(
- affiliation, "orgName", type="department"
- )
-
- street = self._extract_string(affiliation, "address/addrLine")
- settlement = self._extract_string(affiliation, "address/settlement")
- post_code = self._extract_string(affiliation, "address/post_code")
- country = self._extract_string(affiliation, "address/country")
+ name = self._extract_string(affiliation, "orgName", type="institution", join_char=', ')
+ department = self._extract_strings_list(affiliation, "orgName", type="department")
+
+ street = self._extract_string(affiliation, 'address/addrLine')
+ settlement = self._extract_string(affiliation, 'address/settlement')
+ post_code = self._extract_string(affiliation, 'address/post_code')
+ country = self._extract_string(affiliation, 'address/country')
address = self._build_address(street, settlement, post_code, country)
if name:
- affiliation_obj["name"] = name
+ affiliation_obj['name'] = name
if department:
- affiliation_obj["department"] = department
+ affiliation_obj['department'] = department
if address:
- affiliation_obj["address"] = address
+ affiliation_obj['address'] = address
affiliations.append(affiliation_obj)
return affiliations or None
diff --git a/tests/fixtures/grobid_empty_author_doc.xml b/tests/fixtures/grobid_empty_author_doc.xml
index 5656a97..3b07898 100644
--- a/tests/fixtures/grobid_empty_author_doc.xml
+++ b/tests/fixtures/grobid_empty_author_doc.xml
@@ -66,4 +66,4 @@
-
\ No newline at end of file
+
diff --git a/tests/fixtures/grobid_full_doc.xml b/tests/fixtures/grobid_full_doc.xml
index d8f6eec..a01bc70 100644
--- a/tests/fixtures/grobid_full_doc.xml
+++ b/tests/fixtures/grobid_full_doc.xml
@@ -106,4 +106,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/
-
\ No newline at end of file
+
diff --git a/tests/fixtures/grobid_incomplete_doc.xml b/tests/fixtures/grobid_incomplete_doc.xml
index 0dd89ba..e66fc2f 100644
--- a/tests/fixtures/grobid_incomplete_doc.xml
+++ b/tests/fixtures/grobid_incomplete_doc.xml
@@ -82,4 +82,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/
-
\ No newline at end of file
+
diff --git a/tests/fixtures/grobid_no_authors_doc.xml b/tests/fixtures/grobid_no_authors_doc.xml
index 3d3540d..e26e06c 100644
--- a/tests/fixtures/grobid_no_authors_doc.xml
+++ b/tests/fixtures/grobid_no_authors_doc.xml
@@ -49,4 +49,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/
-
\ No newline at end of file
+
diff --git a/tests/test_grobid_authors_paerser.py b/tests/test_grobid_authors_parser.py
similarity index 94%
rename from tests/test_grobid_authors_paerser.py
rename to tests/test_grobid_authors_parser.py
index b2432c9..6b026f6 100644
--- a/tests/test_grobid_authors_paerser.py
+++ b/tests/test_grobid_authors_parser.py
@@ -27,7 +27,7 @@
def test_process_grobid_authors():
- encoding = 'utf-8'
+
grobid_response = pkg_resources.resource_string(
__name__,
os.path.join(
@@ -107,13 +107,13 @@ def test_process_grobid_authors():
expected_authors_count = len(expected_authors)
- authors = GrobidAuthors(str(grobid_response, encoding))
+ authors = GrobidAuthors(grobid_response)
assert len(authors) == expected_authors_count
assert authors.parse_all() == expected_authors
def test_grobid_incomplete_authors():
- encoding = 'utf-8'
+
grobid_response = pkg_resources.resource_string(
__name__,
os.path.join(
@@ -121,6 +121,7 @@ def test_grobid_incomplete_authors():
'grobid_incomplete_doc.xml'
)
)
+
expected_authors = [
{"parsed_affiliations": None, "author": {"full_name": u"Nandi"}},
{
@@ -161,13 +162,13 @@ def test_grobid_incomplete_authors():
]
expected_authors_count = len(expected_authors)
- authors = GrobidAuthors(str(grobid_response, encoding))
+ authors = GrobidAuthors(grobid_response)
assert len(authors) == expected_authors_count
assert authors.parse_all() == expected_authors
def test_grobid_no_authors():
- encoding = 'utf-8'
+
grobid_response = pkg_resources.resource_string(
__name__,
os.path.join(
@@ -175,24 +176,26 @@ def test_grobid_no_authors():
'grobid_no_authors_doc.xml'
)
)
+
expected_authors = []
expected_authors_count = 0
- authors = GrobidAuthors(str(grobid_response, encoding))
+ authors = GrobidAuthors(grobid_response)
assert len(authors) == expected_authors_count
assert authors.parse_all() == expected_authors
def test_grobid_empty_author():
- encoding = 'utf-8'
+
grobid_response = pkg_resources.resource_string(
__name__,
os.path.join(
'fixtures',
- 'grobid_no_authors_doc.xml'
+ 'grobid_empty_author_doc.xml'
)
)
+
expected_authors = [{'parsed_affiliations': None, 'author': {'full_name': u'Abc, Xyz'}}, {'parsed_affiliations': None, 'author': {'emails': [u'some@email.cern'], 'full_name': u'Yzc'}}]
expected_authors_count = 2
- authors = GrobidAuthors(str(grobid_response, encoding))
+ authors = GrobidAuthors(grobid_response)
assert len(authors) == expected_authors_count
assert authors.parse_all() == expected_authors