diff --git a/inspire_utils/grobid_authors_parser.py b/inspire_utils/grobid_authors_parser.py
new file mode 100644
index 0000000..66f8ee8
--- /dev/null
+++ b/inspire_utils/grobid_authors_parser.py
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of INSPIRE.
+# Copyright (C) 2020 CERN.
+#
+# INSPIRE is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# INSPIRE is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
+#
+# In applying this license, CERN does not waive the privileges and immunities
+# granted to it by virtue of its status as an Intergovernmental Organization
+# or submit itself to any jurisdiction.
+
+from __future__ import absolute_import, division, print_function
+from inspire_schemas.builders import LiteratureBuilder
+from parsel import Selector
+
+
+class GrobidAuthors(object):
+    """Sequence-like access to the authors found in an XML document.
+
+    Only ``author`` elements that carry a non-blank ``persName/surname`` are
+    kept; each of them is exposed as a :class:`GrobidAuthor`.
+    """
+
+    def __init__(self, xml_text):
+        """Parse ``xml_text`` and select the usable ``author`` elements.
+
+        Args:
+            xml_text (bytes or str): the XML document; bytes are decoded as
+                UTF-8 before parsing.
+        """
+        if isinstance(xml_text, bytes):
+            xml_text = xml_text.decode('utf-8')
+
+        self._xml = Selector(text=xml_text, type="xml")
+        # Namespaces are dropped so the XPath queries can use bare tag names.
+        self._xml.remove_namespaces()
+        self._parsed_authors = self._xml.xpath(
+            "//author[persName/surname[string-length(normalize-space()) > 0]]"
+        )
+        # Created lazily by ``parse_one``.
+        self._builder = None
+
+    def __getitem__(self, item):
+        """Return the author at index ``item``, or a list of authors for a slice."""
+        selected = self._parsed_authors[item]
+        if isinstance(item, slice):
+            # A slice selects several <author> nodes: wrap each one on its own
+            # instead of handing the whole selection to a single GrobidAuthor,
+            # which would merge the data of different people.
+            return [GrobidAuthor(node) for node in selected]
+        return GrobidAuthor(selected)
+
+    def __iter__(self):
+        """Iterate over the selected authors as :class:`GrobidAuthor` objects."""
+        for node in self._parsed_authors:
+            yield GrobidAuthor(node)
+
+    def __len__(self):
+        """Return the number of selected authors."""
+        return len(self._parsed_authors)
+
+    def parse_one(self):
+        """Yield parsed authors one by one.
+
+        Each yielded item is a dict with two keys: ``author``, the record
+        produced by ``LiteratureBuilder.make_author``, and
+        ``parsed_affiliations``, the author's ``processed_affiliations``.
+        A fresh ``LiteratureBuilder`` is stored on the instance every time
+        the generator is started.
+        """
+        self._builder = LiteratureBuilder()
+        for author in self:
+            yield {
+                'author': self._builder.make_author(
+                    full_name=author.fullname,
+                    raw_affiliations=author.raw_affiliations,
+                    emails=author.emails,
+                ),
+                'parsed_affiliations': author.processed_affiliations
+            }
+
+    def parse_all(self):
+        """Return all parsed authors at once as a list."""
+        return list(self.parse_one())
+
+
+class GrobidAuthor(object):
+    """Read-only accessors for a single ``author`` node of the parsed XML."""
+
+    def __init__(self, author_selector):
+        """Keep the selector that points at this author's XML node."""
+        self._author = author_selector
+
+    @staticmethod
+    def _extract(source, path, type=None, text=False):
+        """Query ``source`` for ``path`` nodes that have non-blank content.
+
+        Args:
+            source: selector the XPath query is run against.
+            path (str): relative XPath of the wanted nodes.
+            type (str): when given, only nodes whose ``type`` attribute equals
+                this value are matched.
+            text (bool): when true, select the text children of the matched
+                nodes instead of the nodes themselves.
+
+        Returns:
+            whatever ``source.xpath`` returns for the assembled query.
+        """
+        query = path + "[string-length(normalize-space()) > 0]"
+        if type:
+            query = query + u"[@type='{}']".format(type)
+        return source.xpath(query + "/text()" if text else query)
+
+    @classmethod
+    def _extract_string(cls, source, path, type=None, join_char=u' '):
+        """Return the stripped text matches of ``path`` joined by ``join_char``."""
+        pieces = cls._extract_strings_list(source, path, type)
+        return join_char.join(pieces)
+
+    @classmethod
+    def _extract_strings_list(cls, source, path, type=None):
+        """Return the text matches of ``path`` as a list of stripped strings."""
+        matches = cls._extract(source, path, type, text=True)
+        return [value.strip() for value in matches.getall()]
+
+    @staticmethod
+    def _build_address(street, city, post_code, country):
+        """Assemble an address dict from the non-empty components.
+
+        ``postal_address`` is the comma-separated concatenation of every
+        non-empty component; ``cities``, ``postal_code`` and ``country`` are
+        added only when their component is non-empty. An empty dict is
+        returned when all components are empty.
+        """
+        parts = [part for part in (street, city, post_code, country) if part]
+        address = {}
+        if parts:
+            address["postal_address"] = ', '.join(parts)
+        if city:
+            address['cities'] = [city]
+        if post_code:
+            address['postal_code'] = post_code
+        if country:
+            address['country'] = country
+        return address
+
+    @property
+    def names(self):
+        """Forenames of the author joined by single spaces."""
+        return self._extract_string(self._author, "persName/forename")
+
+    @property
+    def lastname(self):
+        """Surname(s) of the author joined by single spaces."""
+        return self._extract_string(self._author, "persName/surname")
+
+    @property
+    def fullname(self):
+        """Name in ``lastname,forenames`` form (comma kept even without forenames)."""
+        return u"{},{}".format(self.lastname, self.names)
+
+    @property
+    def raw_affiliations(self):
+        """Texts of the ``affiliation/note`` nodes typed ``raw_affiliation``."""
+        return self._extract_strings_list(self._author, "affiliation/note", type="raw_affiliation")
+
+    @property
+    def emails(self):
+        """Texts of the author's ``email`` nodes."""
+        return self._extract_strings_list(self._author, "email")
+
+    @property
+    def processed_affiliations(self):
+        """Structured affiliations of the author, or ``None`` if there are none.
+
+        Every non-blank ``affiliation`` node yields one dict that may hold
+        ``name`` (institution names joined by ``', '``), ``department`` (list
+        of department names) and ``address`` (see ``_build_address``); keys
+        with empty values are left out.
+        """
+        parsed = []
+        for node in self._extract(self._author, "affiliation"):
+            # NOTE(review): TEI documents usually name this element "postCode";
+            # if that holds for the input, the 'address/post_code' query below
+            # never matches and no postal code is ever reported -- confirm
+            # against a real response before changing it.
+            address = self._build_address(
+                self._extract_string(node, 'address/addrLine'),
+                self._extract_string(node, 'address/settlement'),
+                self._extract_string(node, 'address/post_code'),
+                self._extract_string(node, 'address/country'),
+            )
+            candidates = (
+                ('name', self._extract_string(node, "orgName", type="institution", join_char=', ')),
+                ('department', self._extract_strings_list(node, "orgName", type="department")),
+                ('address', address),
+            )
+            parsed.append({key: value for key, value in candidates if value})
+        return parsed or None
diff --git a/setup.py b/setup.py
index dbc2a73..2bd361f 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,9 @@
'python-dateutil~=2.0,>=2.6.1',
'six~=1.0,>=1.10.0',
'elasticsearch==7.1.0',
- 'elasticsearch-dsl~=7.1'
+ 'elasticsearch-dsl~=7.1',
+ 'inspire-schemas==61.4.12',
+ 'parsel>=1.5'
]
docs_require = []
diff --git a/tests/fixtures/grobid_empty_author_doc.xml b/tests/fixtures/grobid_empty_author_doc.xml
new file mode 100644
index 0000000..3b07898
--- /dev/null
+++ b/tests/fixtures/grobid_empty_author_doc.xml
@@ -0,0 +1,69 @@
+
+
+
+
+
+ Remarks on noncommutativity and scale anomaly in planar quantum mechanics
+
+
+
+
+
+
+ January 21, 2021
+
+
+
+
+
+
+ FIRST
+
+
+ email@cern.io
+
+
+
+ XYZ
+ ABC
+
+
+
+
+
+
+ YZC
+
+ some@email.cern
+
+
+
+
+ January 21, 2021
+
+
+ arXiv:2101.07076v2[hep-th]
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/fixtures/grobid_full_doc.xml b/tests/fixtures/grobid_full_doc.xml
new file mode 100644
index 0000000..a01bc70
--- /dev/null
+++ b/tests/fixtures/grobid_full_doc.xml
@@ -0,0 +1,109 @@
+
+
+
+
+
+ Remarks on noncommutativity and scale anomaly in planar quantum mechanics
+
+
+
+
+
+
+ January 21, 2021
+
+
+
+
+
+
+ Partha
+ Nandi
+
+ parthanandi@bose.res.in
+
+
+ S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India.
+
+ S. N
+ Bose National Centre for Basic Sciences
+ JD Block
+
+ Sector III, Salt Lake
+ Kolkata-700106
+ India
+
+
+
+
+
+ Sankarshan
+ Sahu
+
+ sankarshan.sahu2000@gmail.com
+
+
+ Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India.
+
+ Indian Institute of Engineering Science and Technology
+
+ Bengal-711103
+ Shibpur, Howrah
+ West
+ India
+
+
+
+
+
+ Sayan
+ Kumar
+ Pal
+
+ sayankpal@bose.res.in
+
+
+ S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India.
+
+ S. N
+ Bose National Centre for Basic Sciences
+ JD Block
+
+ Sector III, Salt Lake
+ Kolkata-700106
+ India
+
+
+
+ Remarks on noncommutativity and scale anomaly in planar quantum mechanics
+
+
+
+ January 21, 2021
+
+
+ arXiv:2101.07076v2[hep-th]
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/fixtures/grobid_incomplete_doc.xml b/tests/fixtures/grobid_incomplete_doc.xml
new file mode 100644
index 0000000..e66fc2f
--- /dev/null
+++ b/tests/fixtures/grobid_incomplete_doc.xml
@@ -0,0 +1,85 @@
+
+
+
+
+
+ Remarks on noncommutativity and scale anomaly in planar quantum mechanics
+
+
+
+
+
+
+ January 21, 2021
+
+
+
+
+
+
+ Nandi
+
+
+
+
+ Sankarshan
+ Sahu
+
+
+
+ Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India.
+
+
+ Bengal-711103
+ Shibpur, Howrah
+
+
+
+
+
+ Sayan
+ Kumar
+ Pal
+
+ sayankpal@bose.res.in
+
+
+ S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India.
+
+ S. N
+ Bose National Centre for Basic Sciences
+ JD Block
+
+
+ Remarks on noncommutativity and scale anomaly in planar quantum mechanics
+
+
+
+ January 21, 2021
+
+
+ arXiv:2101.07076v2[hep-th]
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/fixtures/grobid_no_authors_doc.xml b/tests/fixtures/grobid_no_authors_doc.xml
new file mode 100644
index 0000000..e26e06c
--- /dev/null
+++ b/tests/fixtures/grobid_no_authors_doc.xml
@@ -0,0 +1,52 @@
+
+
+
+
+
+ Remarks on noncommutativity and scale anomaly in planar quantum mechanics
+
+
+
+
+
+
+ January 21, 2021
+
+
+
+
+
+
+
+
+
+ Remarks on noncommutativity and scale anomaly in planar quantum mechanics
+
+
+
+ January 21, 2021
+
+
+ arXiv:2101.07076v2[hep-th]
+
+
+
+
+
+
+ GROBID - A machine learning software for extracting information from scholarly documents
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/test_grobid_authors_parser.py b/tests/test_grobid_authors_parser.py
new file mode 100644
index 0000000..6b026f6
--- /dev/null
+++ b/tests/test_grobid_authors_parser.py
@@ -0,0 +1,201 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of INSPIRE.
+# Copyright (C) 2014-2017 CERN.
+#
+# INSPIRE is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# INSPIRE is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
+#
+# In applying this license, CERN does not waive the privileges and immunities
+# granted to it by virtue of its status as an Intergovernmental Organization
+# or submit itself to any jurisdiction.
+
+from __future__ import absolute_import, division, print_function
+import os
+import pkg_resources
+from inspire_utils.grobid_authors_parser import GrobidAuthors
+
+
+def test_process_grobid_authors():
+    """All three authors of the full fixture are parsed with their affiliations."""
+    fixture_path = os.path.join('fixtures', 'grobid_full_doc.xml')
+    xml_payload = pkg_resources.resource_string(__name__, fixture_path)
+
+    # Two of the expected authors share the same affiliation.
+    bose_raw_affiliation = u"S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India."
+    bose_parsed_affiliation = {
+        "name": u"Bose National Centre for Basic Sciences, JD Block",
+        "department": [u"S. N"],
+        "address": {
+            "postal_address": u"Sector III, Salt Lake, Kolkata-700106, India",
+            "cities": [u"Kolkata-700106"],
+            "country": u"India",
+        },
+    }
+
+    expected_authors = [
+        {
+            "author": {
+                "full_name": u"Nandi, Partha",
+                "emails": [u"parthanandi@bose.res.in"],
+                "raw_affiliations": [{"value": bose_raw_affiliation}],
+            },
+            "parsed_affiliations": [bose_parsed_affiliation],
+        },
+        {
+            "author": {
+                "full_name": u"Sahu, Sankarshan",
+                "emails": [u"sankarshan.sahu2000@gmail.com"],
+                "raw_affiliations": [
+                    {
+                        "value": u"Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India."
+                    }
+                ],
+            },
+            "parsed_affiliations": [
+                {
+                    "department": [
+                        u"Indian Institute of Engineering Science and Technology"
+                    ],
+                    "address": {
+                        "postal_address": u"Shibpur, Howrah, India",
+                        "cities": [u"Shibpur, Howrah"],
+                        "country": u"India",
+                    },
+                }
+            ],
+        },
+        {
+            "author": {
+                "full_name": u"Pal, Sayan Kumar",
+                "emails": [u"sayankpal@bose.res.in"],
+                "raw_affiliations": [{"value": bose_raw_affiliation}],
+            },
+            "parsed_affiliations": [bose_parsed_affiliation],
+        },
+    ]
+
+    parser = GrobidAuthors(xml_payload)
+    assert len(parser) == len(expected_authors)
+    assert parser.parse_all() == expected_authors
+
+
+def test_grobid_incomplete_authors():
+    """Authors with missing pieces are parsed with only the data that is present."""
+    fixture_path = os.path.join('fixtures', 'grobid_incomplete_doc.xml')
+    xml_payload = pkg_resources.resource_string(__name__, fixture_path)
+
+    surname_only = {
+        "author": {"full_name": u"Nandi"},
+        "parsed_affiliations": None,
+    }
+    without_email = {
+        "author": {
+            "full_name": u"Sahu, Sankarshan",
+            "raw_affiliations": [
+                {
+                    "value": u"Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India."
+                }
+            ],
+        },
+        "parsed_affiliations": [
+            {
+                "address": {
+                    "postal_address": u"Shibpur, Howrah",
+                    "cities": [u"Shibpur, Howrah"],
+                }
+            }
+        ],
+    }
+    without_address = {
+        "author": {
+            "full_name": u"Pal, Sayan Kumar",
+            "emails": [u"sayankpal@bose.res.in"],
+            "raw_affiliations": [
+                {
+                    "value": u"S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India."
+                }
+            ],
+        },
+        "parsed_affiliations": [
+            {
+                "name": u"Bose National Centre for Basic Sciences, JD Block",
+                "department": [u"S. N"],
+            }
+        ],
+    }
+    expected_authors = [surname_only, without_email, without_address]
+
+    parser = GrobidAuthors(xml_payload)
+    assert len(parser) == len(expected_authors)
+    assert parser.parse_all() == expected_authors
+
+
+def test_grobid_no_authors():
+    """A document without authors yields an empty, zero-length result."""
+    fixture_path = os.path.join('fixtures', 'grobid_no_authors_doc.xml')
+    xml_payload = pkg_resources.resource_string(__name__, fixture_path)
+
+    parser = GrobidAuthors(xml_payload)
+
+    assert len(parser) == 0
+    assert parser.parse_all() == []
+
+
+def test_grobid_empty_author():
+    """Exactly two authors are returned for the empty-author fixture."""
+    fixture_path = os.path.join('fixtures', 'grobid_empty_author_doc.xml')
+    xml_payload = pkg_resources.resource_string(__name__, fixture_path)
+
+    expected_authors = [
+        {
+            'author': {'full_name': u'Abc, Xyz'},
+            'parsed_affiliations': None,
+        },
+        {
+            'author': {'full_name': u'Yzc', 'emails': [u'some@email.cern']},
+            'parsed_affiliations': None,
+        },
+    ]
+
+    parser = GrobidAuthors(xml_payload)
+
+    assert len(parser) == 2
+    assert parser.parse_all() == expected_authors