ref: cern-sis/issues-inspire#49

inspirehep · Jul 26, 2022 · commit c00e5ce
1 parent a5d3e2a
Show file tree

Hide file tree

Showing 6 changed files with 39 additions and 48 deletions.
diff --git a/inspire_utils/grobid_authors_parser.py b/inspire_utils/grobid_authors_parser.py
@@ -21,22 +21,18 @@
 # or submit itself to any jurisdiction.
 
 from __future__ import absolute_import, division, print_function
-
 from inspire_schemas.builders import LiteratureBuilder
 from parsel import Selector
 
 
 class GrobidAuthors(object):
     def __init__(self, xml_text):
-        if isinstance(xml_text, str):
+        if isinstance(xml_text, bytes):
             xml_text = xml_text.decode('utf-8')
-        xml_text.t
-        self._xml = Selector(text=xml_text, type="xml")
 
+        self._xml = Selector(text=xml_text, type="xml")
         self._xml.remove_namespaces()
-        self._parsed_authors = self._xml.xpath(
-            "//author[persName/surname[string-length(normalize-space()) > 0]]"
-        )
+        self._parsed_authors = self._xml.xpath("//author[persName/surname[string-length(normalize-space()) > 0]]")
         self._builder = None
 
     def __getitem__(self, item):
@@ -50,12 +46,12 @@ def parse_one(self):
         self._builder = LiteratureBuilder()
         for author in self:
             yield {
-                "author": self._builder.make_author(
+                'author': self._builder.make_author(
                     full_name=author.fullname,
                     raw_affiliations=author.raw_affiliations,
                     emails=author.emails,
                 ),
-                "parsed_affiliations": author.processed_affiliations,
+                'parsed_affiliations': author.processed_affiliations
             }
 
     def parse_all(self):
@@ -78,7 +74,7 @@ def _extract(source, path, type=None, text=False):
         return source.xpath(path)
 
     @classmethod
-    def _extract_string(cls, source, path, type=None, join_char=" "):
+    def _extract_string(cls, source, path, type=None, join_char=u' '):
         data = cls._extract(source, path, type, text=True).getall()
         data = [text.strip() for text in data]
         return join_char.join(data)
@@ -90,16 +86,14 @@ def _extract_strings_list(cls, source, path, type=None):
 
     @staticmethod
     def _build_address(street, city, post_code, country):
-        address_list = [
-            element for element in [street, city, post_code, country] if element
-        ]
-        address = {"postal_address": ", ".join(address_list)} if address_list else {}
+        address_list = [element for element in [street, city, post_code, country] if element]
+        address = {"postal_address": ', '.join(address_list)} if address_list else {}
         if city:
-            address["cities"] = [city]
+            address['cities'] = [city]
         if post_code:
-            address["postal_code"] = post_code
+            address['postal_code'] = post_code
         if country:
-            address["country"] = country
+            address['country'] = country
         return address
 
     @property
@@ -112,13 +106,11 @@ def lastname(self):
 
     @property
     def fullname(self):
-        return ",".join([self.lastname, self.names])
+        return u",".join([self.lastname, self.names])
 
     @property
     def raw_affiliations(self):
-        return self._extract_strings_list(
-            self._author, "affiliation/note", type="raw_affiliation"
-        )
+        return self._extract_strings_list(self._author, "affiliation/note", type="raw_affiliation")
 
     @property
     def emails(self):
@@ -129,25 +121,21 @@ def processed_affiliations(self):
         affiliations = []
         for affiliation in self._extract(self._author, "affiliation"):
             affiliation_obj = {}
-            name = self._extract_string(
-                affiliation, "orgName", type="institution", join_char=", "
-            )
-            department = self._extract_strings_list(
-                affiliation, "orgName", type="department"
-            )
-
-            street = self._extract_string(affiliation, "address/addrLine")
-            settlement = self._extract_string(affiliation, "address/settlement")
-            post_code = self._extract_string(affiliation, "address/post_code")
-            country = self._extract_string(affiliation, "address/country")
+            name = self._extract_string(affiliation, "orgName", type="institution", join_char=', ')
+            department = self._extract_strings_list(affiliation, "orgName", type="department")
+
+            street = self._extract_string(affiliation, 'address/addrLine')
+            settlement = self._extract_string(affiliation, 'address/settlement')
+            post_code = self._extract_string(affiliation, 'address/post_code')
+            country = self._extract_string(affiliation, 'address/country')
 
             address = self._build_address(street, settlement, post_code, country)
 
             if name:
-                affiliation_obj["name"] = name
+                affiliation_obj['name'] = name
             if department:
-                affiliation_obj["department"] = department
+                affiliation_obj['department'] = department
             if address:
-                affiliation_obj["address"] = address
+                affiliation_obj['address'] = address
             affiliations.append(affiliation_obj)
         return affiliations or None
diff --git a/tests/fixtures/grobid_empty_author_doc.xml b/tests/fixtures/grobid_empty_author_doc.xml
@@ -66,4 +66,4 @@
                 </profileDesc>
             </teiHeader>
             <text xml:lang="en"></text>
-        </TEI>
+        </TEI>
diff --git a/tests/fixtures/grobid_full_doc.xml b/tests/fixtures/grobid_full_doc.xml
@@ -106,4 +106,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/
         </profileDesc>
     </teiHeader>
     <text xml:lang="en"></text>
-</TEI>
+</TEI>
diff --git a/tests/fixtures/grobid_incomplete_doc.xml b/tests/fixtures/grobid_incomplete_doc.xml
@@ -82,4 +82,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/
         </profileDesc>
     </teiHeader>
     <text xml:lang="en"></text>
-</TEI>
+</TEI>
diff --git a/tests/fixtures/grobid_no_authors_doc.xml b/tests/fixtures/grobid_no_authors_doc.xml
@@ -49,4 +49,4 @@ xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/
         </profileDesc>
     </teiHeader>
     <text xml:lang="en"></text>
-</TEI>
+</TEI>
diff --git a/tests/test_grobid_authors_paerser.py b/tests/test_grobid_authors_parser.py
rename from tests/test_grobid_authors_paerser.py
rename to tests/test_grobid_authors_parser.py
@@ -27,7 +27,7 @@
 
 
 def test_process_grobid_authors():
-    encoding = 'utf-8'
+
     grobid_response = pkg_resources.resource_string(
         __name__,
         os.path.join(
@@ -107,20 +107,21 @@ def test_process_grobid_authors():
 
     expected_authors_count = len(expected_authors)
 
-    authors = GrobidAuthors(str(grobid_response, encoding))
+    authors = GrobidAuthors(grobid_response)
     assert len(authors) == expected_authors_count
     assert authors.parse_all() == expected_authors
 
 
 def test_grobid_incomplete_authors():
-    encoding = 'utf-8'
+
     grobid_response = pkg_resources.resource_string(
         __name__,
         os.path.join(
             'fixtures',
             'grobid_incomplete_doc.xml'
         )
     )
+
     expected_authors = [
         {"parsed_affiliations": None, "author": {"full_name": u"Nandi"}},
         {
@@ -161,38 +162,40 @@ def test_grobid_incomplete_authors():
     ]
 
     expected_authors_count = len(expected_authors)
-    authors = GrobidAuthors(str(grobid_response, encoding))
+    authors = GrobidAuthors(grobid_response)
     assert len(authors) == expected_authors_count
     assert authors.parse_all() == expected_authors
 
 
 def test_grobid_no_authors():
-    encoding = 'utf-8'
+
     grobid_response = pkg_resources.resource_string(
         __name__,
         os.path.join(
             'fixtures',
             'grobid_no_authors_doc.xml'
         )
     )
+
     expected_authors = []
     expected_authors_count = 0
-    authors = GrobidAuthors(str(grobid_response, encoding))
+    authors = GrobidAuthors(grobid_response)
     assert len(authors) == expected_authors_count
     assert authors.parse_all() == expected_authors
 
 
 def test_grobid_empty_author():
-    encoding = 'utf-8'
+
     grobid_response = pkg_resources.resource_string(
         __name__,
         os.path.join(
             'fixtures',
-            'grobid_no_authors_doc.xml'
+            'grobid_empty_author_doc.xml'
         )
     )
+
     expected_authors = [{'parsed_affiliations': None, 'author': {'full_name': u'Abc, Xyz'}}, {'parsed_affiliations': None, 'author': {'emails': [u'[email protected]'], 'full_name': u'Yzc'}}]
     expected_authors_count = 2
-    authors = GrobidAuthors(str(grobid_response, encoding))
+    authors = GrobidAuthors(grobid_response)
     assert len(authors) == expected_authors_count
     assert authors.parse_all() == expected_authors