diff --git a/inspire_utils/grobid_authors_parser.py b/inspire_utils/grobid_authors_parser.py
new file mode 100644
index 0000000..d0bc945
--- /dev/null
+++ b/inspire_utils/grobid_authors_parser.py
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of INSPIRE.
+# Copyright (C) 2020 CERN.
+#
+# INSPIRE is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# INSPIRE is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
+#
+# In applying this license, CERN does not waive the privileges and immunities
+# granted to it by virtue of its status as an Intergovernmental Organization
+# or submit itself to any jurisdiction.
+
+from __future__ import absolute_import, division, print_function
+
+from inspire_schemas.builders import LiteratureBuilder
+from parsel import Selector
+
+
+class GrobidAuthors(object):
+    """Sequence-like view over the authors of a GROBID XML response.
+
+    Only ``<author>`` elements holding a non-blank ``persName/surname`` are
+    kept; each of them is exposed as a :class:`GrobidAuthor`.
+    """
+
+    def __init__(self, xml_text):
+        """Parse ``xml_text`` and select the usable author elements.
+
+        Args:
+            xml_text: the XML document, either as text or as encoded bytes.
+        """
+        # ``Selector(text=...)`` only accepts text, while callers may hold
+        # the raw (bytes) payload, e.g. a file read in binary mode.
+        # NOTE(review): assumes such payloads are UTF-8 encoded -- confirm.
+        if isinstance(xml_text, bytes):
+            xml_text = xml_text.decode("utf-8")
+
+        self._xml = Selector(text=xml_text, type="xml")
+        # Drop namespaces so the XPath expressions can use bare tag names.
+        self._xml.remove_namespaces()
+        self._parsed_authors = self._xml.xpath(
+            "//author[persName/surname[string-length(normalize-space()) > 0]]"
+        )
+        self._builder = None
+
+    def __getitem__(self, item):
+        """Return the author at index ``item`` (a list of authors for a slice)."""
+        selected = self._parsed_authors[item]
+        if isinstance(item, slice):
+            # A slice selects several elements: wrap each one separately
+            # instead of handing the whole selection to a single author.
+            return [GrobidAuthor(author) for author in selected]
+        return GrobidAuthor(selected)
+
+    def __len__(self):
+        """Return the number of selected author elements."""
+        return len(self._parsed_authors)
+
+    def parse_one(self):
+        """Yield parsed authors one by one.
+
+        Each item is a dict with two keys: ``"author"``, the result of
+        ``LiteratureBuilder.make_author`` for the author's full name, raw
+        affiliations and emails, and ``"parsed_affiliations"``, the author's
+        ``processed_affiliations``.
+        """
+        self._builder = LiteratureBuilder()
+        for author in self:
+            yield {
+                "author": self._builder.make_author(
+                    full_name=author.fullname,
+                    raw_affiliations=author.raw_affiliations,
+                    emails=author.emails,
+                ),
+                "parsed_affiliations": author.processed_affiliations,
+            }
+
+    def parse_all(self):
+        """Return all parsed authors at once, as a list."""
+        return list(self.parse_one())
+
+
+class GrobidAuthor(object):
+    """Read-only accessors over one ``<author>`` element of a GROBID document."""
+
+    def __init__(self, author_selector):
+        """Keep ``author_selector``, the selector positioned on the author."""
+        self._author = author_selector
+
+    @staticmethod
+    def _extract(source, path, type=None, text=False):
+        """Run ``path`` on ``source``, keeping only non-blank elements.
+
+        Args:
+            source: object exposing an ``xpath`` method (a selector).
+            path: relative XPath of the elements to select.
+            type: if given, also require this value in the ``type`` attribute.
+            text: if true, select the text nodes of the matched elements.
+
+        Returns:
+            Whatever ``source.xpath`` returns for the assembled expression.
+        """
+        path += "[string-length(normalize-space()) > 0]"
+        if type:
+            path += "[@type='{type}']".format(type=type)
+        if text:
+            path += "/text()"
+        return source.xpath(path)
+
+    @classmethod
+    def _extract_strings_list(cls, source, path, type=None):
+        """Return the stripped text nodes selected by ``path``, as a list."""
+        matches = cls._extract(source, path, type, text=True).getall()
+        return [match.strip() for match in matches]
+
+    @classmethod
+    def _extract_string(cls, source, path, type=None, join_char=" "):
+        """Return the stripped text nodes of ``path`` joined by ``join_char``."""
+        return join_char.join(cls._extract_strings_list(source, path, type))
+
+    @staticmethod
+    def _build_address(street, city, post_code, country):
+        """Assemble an address dict out of the given parts.
+
+        Empty parts are left out. ``postal_address`` joins the non-empty
+        parts with ``", "``; ``cities`` (one-element list), ``postal_code``
+        and ``country`` are set only for the parts that are present.
+
+        Returns:
+            The address dict, empty when every part is empty.
+        """
+        parts = [part for part in (street, city, post_code, country) if part]
+        address = {"postal_address": ", ".join(parts)} if parts else {}
+        if city:
+            address["cities"] = [city]
+        if post_code:
+            address["postal_code"] = post_code
+        if country:
+            address["country"] = country
+        return address
+
+    @property
+    def names(self):
+        """Forenames of the author, joined by a space."""
+        return self._extract_string(self._author, "persName/forename")
+
+    @property
+    def lastname(self):
+        """Surnames of the author, joined by a space."""
+        return self._extract_string(self._author, "persName/surname")
+
+    @property
+    def fullname(self):
+        """Full name in ``"<surnames>,<forenames>"`` form.
+
+        A missing part is skipped, so no dangling comma is produced when
+        the author has no forename (or no surname).
+        """
+        return ",".join(part for part in (self.lastname, self.names) if part)
+
+    @property
+    def raw_affiliations(self):
+        """Texts of the ``affiliation/note`` elements typed ``raw_affiliation``."""
+        return self._extract_strings_list(
+            self._author, "affiliation/note", type="raw_affiliation"
+        )
+
+    @property
+    def emails(self):
+        """Texts of the author's ``email`` elements."""
+        return self._extract_strings_list(self._author, "email")
+
+    @property
+    def processed_affiliations(self):
+        """Structured affiliations of the author.
+
+        Returns:
+            A list with one dict per non-blank ``affiliation`` element, or
+            ``None`` when there is none. A dict may hold ``name`` (the
+            ``institution`` org names joined by ``", "``), ``department``
+            (list of ``department`` org names) and ``address`` (see
+            ``_build_address``); keys without data are omitted.
+        """
+        affiliations = []
+        for affiliation in self._extract(self._author, "affiliation"):
+            name = self._extract_string(
+                affiliation, "orgName", type="institution", join_char=", "
+            )
+            department = self._extract_strings_list(
+                affiliation, "orgName", type="department"
+            )
+
+            street = self._extract_string(affiliation, "address/addrLine")
+            settlement = self._extract_string(affiliation, "address/settlement")
+            # NOTE(review): TEI spells this element ``postCode``; confirm
+            # that ``post_code`` ever matches real GROBID output.
+            post_code = self._extract_string(affiliation, "address/post_code")
+            country = self._extract_string(affiliation, "address/country")
+            address = self._build_address(street, settlement, post_code, country)
+
+            affiliation_obj = {}
+            if name:
+                affiliation_obj["name"] = name
+            if department:
+                affiliation_obj["department"] = department
+            if address:
+                affiliation_obj["address"] = address
+            # Appended even when empty, so there is one entry per element.
+            affiliations.append(affiliation_obj)
+        return affiliations or None
diff --git a/setup.py b/setup.py
index dbc2a73..298df0d 100644
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,7 @@
'six~=1.0,>=1.10.0',
'elasticsearch==7.1.0',
-    'elasticsearch-dsl~=7.1'
+    'elasticsearch-dsl~=7.1',
+    'inspire-schemas==61.4.12',
]
docs_require = []
diff --git a/tests/test_grobid_authors_parser.py b/tests/test_grobid_authors_parser.py
new file mode 100644
index 0000000..bc28a93
--- /dev/null
+++ b/tests/test_grobid_authors_parser.py
@@ -0,0 +1,298 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of INSPIRE.
+# Copyright (C) 2014-2017 CERN.
+#
+# INSPIRE is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# INSPIRE is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
+#
+# In applying this license, CERN does not waive the privileges and immunities
+# granted to it by virtue of its status as an Intergovernmental Organization
+# or submit itself to any jurisdiction.
+
+from __future__ import absolute_import, division, print_function
+import os
+import pkg_resources
+from inspire_utils.grobid_authors_parser import GrobidAuthors
+
+
+def test_process_grobid_authors():
+    """Every author of the full GROBID fixture is parsed with affiliations."""
+    fixture = os.path.join('fixtures', 'grobid_full_doc.xml')
+    grobid_response = pkg_resources.resource_string(__name__, fixture)
+
+    # The first and the third expected authors share the same affiliation.
+    bose_parsed_affiliations = [
+        {
+            "department": [u"S. N"],
+            "name": u"Bose National Centre for Basic Sciences, JD Block",
+            "address": {
+                "country": u"India",
+                "cities": [u"Kolkata-700106"],
+                "postal_address": u"Sector III, Salt Lake, Kolkata-700106, India",
+            },
+        }
+    ]
+    bose_raw_affiliations = [
+        {
+            "value": u"S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India."
+        }
+    ]
+
+    expected_authors = [
+        {
+            "parsed_affiliations": bose_parsed_affiliations,
+            "author": {
+                "raw_affiliations": bose_raw_affiliations,
+                "emails": [u"parthanandi@bose.res.in"],
+                "full_name": u"Nandi, Partha",
+            },
+        },
+        {
+            "parsed_affiliations": [
+                {
+                    "department": [
+                        u"Indian Institute of Engineering Science and Technology"
+                    ],
+                    "address": {
+                        "country": u"India",
+                        "cities": [u"Shibpur, Howrah"],
+                        "postal_address": u"Shibpur, Howrah, India",
+                    },
+                }
+            ],
+            "author": {
+                "raw_affiliations": [
+                    {
+                        "value": u"Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India."
+                    }
+                ],
+                "emails": [u"sankarshan.sahu2000@gmail.com"],
+                "full_name": u"Sahu, Sankarshan",
+            },
+        },
+        {
+            "parsed_affiliations": bose_parsed_affiliations,
+            "author": {
+                "raw_affiliations": bose_raw_affiliations,
+                "emails": [u"sayankpal@bose.res.in"],
+                "full_name": u"Pal, Sayan Kumar",
+            },
+        },
+    ]
+
+    authors = GrobidAuthors(grobid_response)
+
+    assert len(authors) == len(expected_authors)
+    assert authors.parse_all() == expected_authors
+
+
+def test_grobid_incomplete_authors():
+    """Authors with missing name parts, emails or address data still parse."""
+    fixture = os.path.join('fixtures', 'grobid_incomplete_doc.xml')
+    grobid_response = pkg_resources.resource_string(__name__, fixture)
+
+    surname_only_author = {
+        "parsed_affiliations": None,
+        "author": {"full_name": u"Nandi"},
+    }
+    address_only_author = {
+        "parsed_affiliations": [
+            {
+                "address": {
+                    "cities": [u"Shibpur, Howrah"],
+                    "postal_address": u"Shibpur, Howrah",
+                }
+            }
+        ],
+        "author": {
+            "raw_affiliations": [
+                {
+                    "value": u"Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India."
+                }
+            ],
+            "full_name": u"Sahu, Sankarshan",
+        },
+    }
+    no_address_author = {
+        "parsed_affiliations": [
+            {
+                "department": [u"S. N"],
+                "name": u"Bose National Centre for Basic Sciences, JD Block",
+            }
+        ],
+        "author": {
+            "raw_affiliations": [
+                {
+                    "value": u"S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India."
+                }
+            ],
+            "emails": [u"sayankpal@bose.res.in"],
+            "full_name": u"Pal, Sayan Kumar",
+        },
+    }
+    expected_authors = [
+        surname_only_author,
+        address_only_author,
+        no_address_author,
+    ]
+
+    authors = GrobidAuthors(grobid_response)
+
+    assert len(authors) == len(expected_authors)
+    assert authors.parse_all() == expected_authors
+
+
+def test_grobid_no_authors():
+    """A GROBID header without any ``<author>`` element yields no authors."""
+    # NOTE(review): the markup of this fixture was rebuilt around the
+    # element texts that survived; confirm it against the original.
+    input_xml = u"""
+    <TEI xmlns="http://www.tei-c.org/ns/1.0">
+        <teiHeader xml:lang="en">
+            <fileDesc>
+                <titleStmt>
+                    <title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
+                </titleStmt>
+                <publicationStmt>
+                    <publisher/>
+                    <availability status="unknown">
+                        <licence/>
+                    </availability>
+                    <date type="published" when="2021-01-21">January 21, 2021</date>
+                </publicationStmt>
+                <sourceDesc>
+                    <biblStruct>
+                        <analytic>
+                        </analytic>
+                        <monogr>
+                            <imprint>
+                                <date type="published" when="2021-01-21">January 21, 2021</date>
+                            </imprint>
+                        </monogr>
+                        <idno type="arXiv">arXiv:2101.07076v2[hep-th]</idno>
+                    </biblStruct>
+                </sourceDesc>
+            </fileDesc>
+            <encodingDesc>
+                <appInfo>
+                    <application ident="GROBID">
+                        <desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
+                        <ref target="https://github.com/kermitt2/grobid"/>
+                    </application>
+                </appInfo>
+            </encodingDesc>
+            <profileDesc>
+                <abstract/>
+            </profileDesc>
+        </teiHeader>
+        <text xml:lang="en"></text>
+    </TEI>
+    """
+    expected_authors = []
+    expected_authors_count = 0
+    authors = GrobidAuthors(input_xml)
+    assert len(authors) == expected_authors_count
+    assert authors.parse_all() == expected_authors
+
+
+def test_grobid_empty_author():
+    """Authors lacking a surname are skipped; empty fields are left out.
+
+    The fixture holds three authors: one with a forename and an email but
+    an empty surname (skipped), one with forename and surname only, and
+    one with a surname and an email only.
+    """
+    # NOTE(review): the markup of this fixture was rebuilt around the
+    # element texts that survived; confirm it against the original.
+    input_xml = u"""
+    <TEI xmlns="http://www.tei-c.org/ns/1.0">
+        <teiHeader xml:lang="en">
+            <fileDesc>
+                <titleStmt>
+                    <title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
+                </titleStmt>
+                <publicationStmt>
+                    <publisher/>
+                    <availability status="unknown">
+                        <licence/>
+                    </availability>
+                    <date type="published" when="2021-01-21">January 21, 2021</date>
+                </publicationStmt>
+                <sourceDesc>
+                    <biblStruct>
+                        <analytic>
+                            <author>
+                                <persName>
+                                    <forename type="first">FIRST</forename>
+                                    <surname></surname>
+                                </persName>
+                                <email>email@cern.io</email>
+                            </author>
+                            <author>
+                                <persName>
+                                    <forename type="first">XYZ</forename>
+                                    <surname>ABC</surname>
+                                </persName>
+                                <email></email>
+                            </author>
+                            <author>
+                                <persName>
+                                    <surname>YZC</surname>
+                                </persName>
+                                <email>some@email.cern</email>
+                            </author>
+                        </analytic>
+                        <monogr>
+                            <imprint>
+                                <date type="published" when="2021-01-21">January 21, 2021</date>
+                            </imprint>
+                        </monogr>
+                        <idno type="arXiv">arXiv:2101.07076v2[hep-th]</idno>
+                    </biblStruct>
+                </sourceDesc>
+            </fileDesc>
+            <encodingDesc>
+                <appInfo>
+                    <application ident="GROBID">
+                        <desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
+                        <ref target="https://github.com/kermitt2/grobid"/>
+                    </application>
+                </appInfo>
+            </encodingDesc>
+            <profileDesc>
+                <abstract/>
+            </profileDesc>
+        </teiHeader>
+        <text xml:lang="en"></text>
+    </TEI>
+    """
+    expected_authors = [{'parsed_affiliations': None, 'author': {'full_name': u'Abc, Xyz'}}, {'parsed_affiliations': None, 'author': {'emails': [u'some@email.cern'], 'full_name': u'Yzc'}}]
+    expected_authors_count = 2
+    authors = GrobidAuthors(input_xml)
+    assert len(authors) == expected_authors_count
+    assert authors.parse_all() == expected_authors