diff --git a/inspire_utils/grobid_authors_parser.py b/inspire_utils/grobid_authors_parser.py index dc030ca..66f8ee8 100644 --- a/inspire_utils/grobid_authors_parser.py +++ b/inspire_utils/grobid_authors_parser.py @@ -21,22 +21,18 @@ # or submit itself to any jurisdiction. from __future__ import absolute_import, division, print_function - from inspire_schemas.builders import LiteratureBuilder from parsel import Selector class GrobidAuthors(object): def __init__(self, xml_text): - if isinstance(xml_text, str): + if isinstance(xml_text, bytes): xml_text = xml_text.decode('utf-8') - xml_text.t - self._xml = Selector(text=xml_text, type="xml") + self._xml = Selector(text=xml_text, type="xml") self._xml.remove_namespaces() - self._parsed_authors = self._xml.xpath( - "//author[persName/surname[string-length(normalize-space()) > 0]]" - ) + self._parsed_authors = self._xml.xpath("//author[persName/surname[string-length(normalize-space()) > 0]]") self._builder = None def __getitem__(self, item): @@ -50,12 +46,12 @@ def parse_one(self): self._builder = LiteratureBuilder() for author in self: yield { - "author": self._builder.make_author( + 'author': self._builder.make_author( full_name=author.fullname, raw_affiliations=author.raw_affiliations, emails=author.emails, ), - "parsed_affiliations": author.processed_affiliations, + 'parsed_affiliations': author.processed_affiliations } def parse_all(self): @@ -78,7 +74,7 @@ def _extract(source, path, type=None, text=False): return source.xpath(path) @classmethod - def _extract_string(cls, source, path, type=None, join_char=" "): + def _extract_string(cls, source, path, type=None, join_char=u' '): data = cls._extract(source, path, type, text=True).getall() data = [text.strip() for text in data] return join_char.join(data) @@ -90,16 +86,14 @@ def _extract_strings_list(cls, source, path, type=None): @staticmethod def _build_address(street, city, post_code, country): - address_list = [ - element for element in [street, city, post_code, country] if element - ] - address = {"postal_address": ", ".join(address_list)} if address_list else {} + address_list = [element for element in [street, city, post_code, country] if element] + address = {"postal_address": ', '.join(address_list)} if address_list else {} if city: - address["cities"] = [city] + address['cities'] = [city] if post_code: - address["postal_code"] = post_code + address['postal_code'] = post_code if country: - address["country"] = country + address['country'] = country return address @property @@ -112,13 +106,11 @@ def lastname(self): @property def fullname(self): - return ",".join([self.lastname, self.names]) + return u",".join([self.lastname, self.names]) @property def raw_affiliations(self): - return self._extract_strings_list( - self._author, "affiliation/note", type="raw_affiliation" - ) + return self._extract_strings_list(self._author, "affiliation/note", type="raw_affiliation") @property def emails(self): @@ -129,25 +121,21 @@ def processed_affiliations(self): affiliations = [] for affiliation in self._extract(self._author, "affiliation"): affiliation_obj = {} - name = self._extract_string( - affiliation, "orgName", type="institution", join_char=", " - ) - department = self._extract_strings_list( - affiliation, "orgName", type="department" - ) - - street = self._extract_string(affiliation, "address/addrLine") - settlement = self._extract_string(affiliation, "address/settlement") - post_code = self._extract_string(affiliation, "address/post_code") - country = self._extract_string(affiliation, "address/country") + name = self._extract_string(affiliation, "orgName", type="institution", join_char=', ') + department = self._extract_strings_list(affiliation, "orgName", type="department") + + street = self._extract_string(affiliation, 'address/addrLine') + settlement = self._extract_string(affiliation, 'address/settlement') + post_code = self._extract_string(affiliation, 'address/post_code') + country = self._extract_string(affiliation, 'address/country') address = self._build_address(street, settlement, post_code, country) if name: - affiliation_obj["name"] = name + affiliation_obj['name'] = name if department: - affiliation_obj["department"] = department + affiliation_obj['department'] = department if address: - affiliation_obj["address"] = address + affiliation_obj['address'] = address affiliations.append(affiliation_obj) return affiliations or None diff --git a/tests/fixtures/grobid_empty_author_doc.xml b/tests/fixtures/grobid_empty_author_doc.xml index 5656a97..3b07898 100644 --- a/tests/fixtures/grobid_empty_author_doc.xml +++ b/tests/fixtures/grobid_empty_author_doc.xml @@ -66,4 +66,4 @@ - \ No newline at end of file + diff --git a/tests/fixtures/grobid_full_doc.xml b/tests/fixtures/grobid_full_doc.xml index d8f6eec..a01bc70 100644 --- a/tests/fixtures/grobid_full_doc.xml +++ b/tests/fixtures/grobid_full_doc.xml @@ -106,4 +106,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/ - \ No newline at end of file + diff --git a/tests/fixtures/grobid_incomplete_doc.xml b/tests/fixtures/grobid_incomplete_doc.xml index 0dd89ba..e66fc2f 100644 --- a/tests/fixtures/grobid_incomplete_doc.xml +++ b/tests/fixtures/grobid_incomplete_doc.xml @@ -82,4 +82,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/ - \ No newline at end of file + diff --git a/tests/fixtures/grobid_no_authors_doc.xml b/tests/fixtures/grobid_no_authors_doc.xml index 3d3540d..e26e06c 100644 --- a/tests/fixtures/grobid_no_authors_doc.xml +++ b/tests/fixtures/grobid_no_authors_doc.xml @@ -49,4 +49,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/ - \ No newline at end of file + diff --git a/tests/test_grobid_authors_paerser.py b/tests/test_grobid_authors_parser.py similarity index 94% rename from tests/test_grobid_authors_paerser.py rename to tests/test_grobid_authors_parser.py index b2432c9..6b026f6 100644 --- a/tests/test_grobid_authors_paerser.py +++ b/tests/test_grobid_authors_parser.py @@ -27,7 +27,7 @@ def test_process_grobid_authors(): - encoding = 'utf-8' + grobid_response = pkg_resources.resource_string( __name__, os.path.join( @@ -107,13 +107,13 @@ def test_process_grobid_authors(): expected_authors_count = len(expected_authors) - authors = GrobidAuthors(str(grobid_response, encoding)) + authors = GrobidAuthors(grobid_response) assert len(authors) == expected_authors_count assert authors.parse_all() == expected_authors def test_grobid_incomplete_authors(): - encoding = 'utf-8' + grobid_response = pkg_resources.resource_string( __name__, os.path.join( @@ -121,6 +121,7 @@ def test_grobid_incomplete_authors(): 'grobid_incomplete_doc.xml' ) ) + expected_authors = [ {"parsed_affiliations": None, "author": {"full_name": u"Nandi"}}, { @@ -161,13 +162,13 @@ def test_grobid_incomplete_authors(): ] expected_authors_count = len(expected_authors) - authors = GrobidAuthors(str(grobid_response, encoding)) + authors = GrobidAuthors(grobid_response) assert len(authors) == expected_authors_count assert authors.parse_all() == expected_authors def test_grobid_no_authors(): - encoding = 'utf-8' + grobid_response = pkg_resources.resource_string( __name__, os.path.join( @@ -175,24 +176,26 @@ def test_grobid_no_authors(): 'grobid_no_authors_doc.xml' ) ) + expected_authors = [] expected_authors_count = 0 - authors = GrobidAuthors(str(grobid_response, encoding)) + authors = GrobidAuthors(grobid_response) assert len(authors) == expected_authors_count assert authors.parse_all() == expected_authors def test_grobid_empty_author(): - encoding = 'utf-8' + grobid_response = pkg_resources.resource_string( __name__, os.path.join( 'fixtures', - 'grobid_no_authors_doc.xml' + 'grobid_empty_author_doc.xml' ) ) + expected_authors = [{'parsed_affiliations': None, 'author': {'full_name': u'Abc, Xyz'}}, {'parsed_affiliations': None, 'author': {'emails': [u'some@email.cern'], 'full_name': u'Yzc'}}] expected_authors_count = 2 - authors = GrobidAuthors(str(grobid_response, encoding)) + authors = GrobidAuthors(grobid_response) assert len(authors) == expected_authors_count assert authors.parse_all() == expected_authors