diff --git a/inspire_utils/grobid_authors_parser.py b/inspire_utils/grobid_authors_parser.py new file mode 100644 index 0000000..66f8ee8 --- /dev/null +++ b/inspire_utils/grobid_authors_parser.py @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- +# +# This file is part of INSPIRE. +# Copyright (C) 2020 CERN. +# +# INSPIRE is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# INSPIRE is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>. +# +# In applying this license, CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. 
from __future__ import absolute_import, division, print_function

# Third-party dependencies; the accompanying setup.py change declares them as
# 'inspire-schemas==61.4.12' and 'parsel>=1.5'.
from inspire_schemas.builders import LiteratureBuilder
from parsel import Selector


class GrobidAuthors(object):
    """Sequence-like view over the authors of a GROBID XML document.

    Only ``author`` elements whose ``persName/surname`` contains non-blank
    text are kept; every item is exposed as a :class:`GrobidAuthor`.
    """

    def __init__(self, xml_text):
        """Parse the document and select the usable ``author`` elements.

        :param xml_text: GROBID XML, either as text or as UTF-8 encoded bytes.
        """
        if isinstance(xml_text, bytes):
            xml_text = xml_text.decode('utf-8')

        self._xml = Selector(text=xml_text, type="xml")
        # Drop namespaces so the XPath expressions can use bare tag names.
        self._xml.remove_namespaces()
        self._parsed_authors = self._xml.xpath(
            "//author[persName/surname[string-length(normalize-space()) > 0]]"
        )
        self._builder = None

    def __getitem__(self, item):
        """Return the author at position ``item`` wrapped in a GrobidAuthor."""
        return GrobidAuthor(self._parsed_authors[item])

    def __len__(self):
        """Return the number of authors that have a non-blank surname."""
        return len(self._parsed_authors)

    def __iter__(self):
        """Iterate over the selected authors as GrobidAuthor objects."""
        return (GrobidAuthor(selector) for selector in self._parsed_authors)

    def parse_one(self):
        """Yield parsed authors one by one.

        Each item is a dict with two keys: ``author`` (the result of
        ``LiteratureBuilder.make_author``) and ``parsed_affiliations`` (a list
        of structured affiliation dicts, or ``None`` when nothing was parsed).
        """
        self._builder = LiteratureBuilder()
        for author in self:
            yield {
                'author': self._builder.make_author(
                    full_name=author.fullname,
                    raw_affiliations=author.raw_affiliations,
                    emails=author.emails,
                ),
                'parsed_affiliations': author.processed_affiliations,
            }

    def parse_all(self):
        """Return all parsed authors at once as a list."""
        return list(self.parse_one())


class GrobidAuthor(object):
    """Accessors for a single ``author`` element of a GROBID document."""

    def __init__(self, author_selector):
        """Wrap ``author_selector``, the selector of one ``author`` element."""
        self._author = author_selector

    @staticmethod
    def _extract(source, path, type=None, text=False):
        """Select the elements at ``path`` that contain non-blank text.

        :param source: object exposing ``xpath()`` (a selector).
        :param path: XPath, relative to ``source``, of the wanted elements.
        :param type: when given, keep only elements whose ``type`` attribute
            equals this value.
        :param text: when true, select the elements' direct text nodes
            instead of the elements themselves.
        :return: whatever ``source.xpath()`` returns for the built expression.
        """
        path += "[string-length(normalize-space()) > 0]"
        if type:
            path += u"[@type='{}']".format(type)
        if text:
            path += "/text()"
        return source.xpath(path)

    @classmethod
    def _extract_strings_list(cls, source, path, type=None):
        """Return the stripped, non-empty text nodes found at ``path``.

        Whitespace-only text nodes (possible when an element mixes text with
        child elements) are dropped so that no empty strings are returned.
        """
        raw_texts = cls._extract(source, path, type, text=True).getall()
        stripped_texts = (text.strip() for text in raw_texts)
        return [text for text in stripped_texts if text]

    @classmethod
    def _extract_string(cls, source, path, type=None, join_char=u' '):
        """Return the text nodes found at ``path`` joined with ``join_char``.

        An empty string is returned when nothing matches.
        """
        return join_char.join(cls._extract_strings_list(source, path, type))

    @staticmethod
    def _build_address(street, city, post_code, country):
        """Assemble an address dict from its parts, omitting the empty ones.

        :return: a dict with ``postal_address`` (the non-empty parts joined
            with ``', '``) plus ``cities``, ``postal_code`` and ``country``
            for the parts that are present; an empty dict when every part is
            empty.
        """
        address_list = [
            element for element in [street, city, post_code, country] if element
        ]
        address = {"postal_address": ', '.join(address_list)} if address_list else {}
        if city:
            address['cities'] = [city]
        if post_code:
            address['postal_code'] = post_code
        if country:
            address['country'] = country
        return address

    @property
    def names(self):
        """Forename(s) of the author, joined with a single space."""
        return self._extract_string(self._author, "persName/forename")

    @property
    def lastname(self):
        """Surname(s) of the author, joined with a single space."""
        return self._extract_string(self._author, "persName/surname")

    @property
    def fullname(self):
        """Name in ``surname,forenames`` order.

        The comma is emitted even when there is no forename, so the surname
        always sits before a comma.
        """
        # NOTE(review): presumably the trailing comma lets downstream name
        # normalisation treat a multi-word surname as a surname - confirm
        # before "tidying" it away.
        return u",".join([self.lastname, self.names])

    @property
    def raw_affiliations(self):
        """Texts of the ``affiliation/note`` elements typed ``raw_affiliation``."""
        return self._extract_strings_list(
            self._author, "affiliation/note", type="raw_affiliation"
        )

    @property
    def emails(self):
        """Texts of the author's ``email`` elements."""
        return self._extract_strings_list(self._author, "email")

    @property
    def processed_affiliations(self):
        """Structured affiliations of the author.

        :return: a list of dicts that may carry ``name``, ``department`` and
            ``address`` keys, or ``None`` when no affiliation produced any
            structured data.
        """
        affiliations = []
        for affiliation in self._extract(self._author, "affiliation"):
            name = self._extract_string(
                affiliation, "orgName", type="institution", join_char=', '
            )
            department = self._extract_strings_list(
                affiliation, "orgName", type="department"
            )

            street = self._extract_string(affiliation, 'address/addrLine')
            settlement = self._extract_string(affiliation, 'address/settlement')
            # NOTE(review): this looks like it should be 'address/postCode'
            # (the TEI element name); as written it may never match. The test
            # expectations shipped with this change contain no postal code, so
            # confirm and update them together with any fix.
            post_code = self._extract_string(affiliation, 'address/post_code')
            country = self._extract_string(affiliation, 'address/country')
            address = self._build_address(street, settlement, post_code, country)

            candidate_fields = (
                ('name', name),
                ('department', department),
                ('address', address),
            )
            affiliation_obj = {
                key: value for key, value in candidate_fields if value
            }
            # Skip affiliations that yielded nothing, so the result is either
            # meaningful entries or None - never a list of empty dicts.
            if affiliation_obj:
                affiliations.append(affiliation_obj)
        return affiliations or None
a/tests/fixtures/grobid_empty_author_doc.xml b/tests/fixtures/grobid_empty_author_doc.xml new file mode 100644 index 0000000..3b07898 --- /dev/null +++ b/tests/fixtures/grobid_empty_author_doc.xml @@ -0,0 +1,69 @@ + + + + + + Remarks on noncommutativity and scale anomaly in planar quantum mechanics + + + + + + + January 21, 2021 + + + + + + + FIRST + + + email@cern.io + + + + XYZ + ABC + + + + + + + YZC + + some@email.cern + + + + + January 21, 2021 + + + arXiv:2101.07076v2[hep-th] + + + + + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + + + + diff --git a/tests/fixtures/grobid_full_doc.xml b/tests/fixtures/grobid_full_doc.xml new file mode 100644 index 0000000..a01bc70 --- /dev/null +++ b/tests/fixtures/grobid_full_doc.xml @@ -0,0 +1,109 @@ + + + + + + Remarks on noncommutativity and scale anomaly in planar quantum mechanics + + + + + + + January 21, 2021 + + + + + + + Partha + Nandi + + parthanandi@bose.res.in + + + S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India. + + S. N + Bose National Centre for Basic Sciences + JD Block +
+ Sector III, Salt Lake + Kolkata-700106 + India +
+
+
+ + + Sankarshan + Sahu + + sankarshan.sahu2000@gmail.com + + + Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India. + + Indian Institute of Engineering Science and Technology +
+ Bengal-711103 + Shibpur, Howrah + West + India +
+
+
+ + + Sayan + Kumar + Pal + + sayankpal@bose.res.in + + + S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India. + + S. N + Bose National Centre for Basic Sciences + JD Block +
+ Sector III, Salt Lake + Kolkata-700106 + India +
+
+
+ Remarks on noncommutativity and scale anomaly in planar quantum mechanics +
+ + + January 21, 2021 + + + arXiv:2101.07076v2[hep-th] +
+
+
+ + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + +
+ +
diff --git a/tests/fixtures/grobid_incomplete_doc.xml b/tests/fixtures/grobid_incomplete_doc.xml new file mode 100644 index 0000000..e66fc2f --- /dev/null +++ b/tests/fixtures/grobid_incomplete_doc.xml @@ -0,0 +1,85 @@ + + + + + + Remarks on noncommutativity and scale anomaly in planar quantum mechanics + + + + + + + January 21, 2021 + + + + + + + Nandi + + + + + Sankarshan + Sahu + + + + Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India. + +
+ Bengal-711103 + Shibpur, Howrah +
+
+
+ + + Sayan + Kumar + Pal + + sayankpal@bose.res.in + + + S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India. + + S. N + Bose National Centre for Basic Sciences + JD Block + + + Remarks on noncommutativity and scale anomaly in planar quantum mechanics +
+ + + January 21, 2021 + + + arXiv:2101.07076v2[hep-th] +
+
+
+ + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + +
+ +
diff --git a/tests/fixtures/grobid_no_authors_doc.xml b/tests/fixtures/grobid_no_authors_doc.xml new file mode 100644 index 0000000..e26e06c --- /dev/null +++ b/tests/fixtures/grobid_no_authors_doc.xml @@ -0,0 +1,52 @@ + + + + + + Remarks on noncommutativity and scale anomaly in planar quantum mechanics + + + + + + + January 21, 2021 + + + + + + + + + + Remarks on noncommutativity and scale anomaly in planar quantum mechanics + + + + January 21, 2021 + + + arXiv:2101.07076v2[hep-th] + + + + + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + + + + diff --git a/tests/test_grobid_authors_parser.py b/tests/test_grobid_authors_parser.py new file mode 100644 index 0000000..6b026f6 --- /dev/null +++ b/tests/test_grobid_authors_parser.py @@ -0,0 +1,201 @@ +# -*- coding: utf-8 -*- +# +# This file is part of INSPIRE. +# Copyright (C) 2014-2017 CERN. +# +# INSPIRE is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# INSPIRE is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>. +# +# In applying this license, CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. 
+ +from __future__ import absolute_import, division, print_function +import os +import pkg_resources +from inspire_utils.grobid_authors_parser import GrobidAuthors + + +def test_process_grobid_authors(): + + grobid_response = pkg_resources.resource_string( + __name__, + os.path.join( + 'fixtures', + 'grobid_full_doc.xml' + ) + ) + expected_authors = [ + { + "parsed_affiliations": [ + { + "department": [u"S. N"], + "name": u"Bose National Centre for Basic Sciences, JD Block", + "address": { + "country": u"India", + "cities": [u"Kolkata-700106"], + "postal_address": u"Sector III, Salt Lake, Kolkata-700106, India", + }, + } + ], + "author": { + "raw_affiliations": [ + { + "value": u"S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India." + } + ], + "emails": [u"parthanandi@bose.res.in"], + "full_name": u"Nandi, Partha", + }, + }, + { + "parsed_affiliations": [ + { + "department": [ + u"Indian Institute of Engineering Science and Technology" + ], + "address": { + "country": u"India", + "cities": [u"Shibpur, Howrah"], + "postal_address": u"Shibpur, Howrah, India", + }, + } + ], + "author": { + "raw_affiliations": [ + { + "value": u"Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India." + } + ], + "emails": [u"sankarshan.sahu2000@gmail.com"], + "full_name": u"Sahu, Sankarshan", + }, + }, + { + "parsed_affiliations": [ + { + "department": [u"S. N"], + "name": u"Bose National Centre for Basic Sciences, JD Block", + "address": { + "country": u"India", + "cities": [u"Kolkata-700106"], + "postal_address": u"Sector III, Salt Lake, Kolkata-700106, India", + }, + } + ], + "author": { + "raw_affiliations": [ + { + "value": u"S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India." 
+ } + ], + "emails": [u"sayankpal@bose.res.in"], + "full_name": u"Pal, Sayan Kumar", + }, + }, + ] + + expected_authors_count = len(expected_authors) + + authors = GrobidAuthors(grobid_response) + assert len(authors) == expected_authors_count + assert authors.parse_all() == expected_authors + + +def test_grobid_incomplete_authors(): + + grobid_response = pkg_resources.resource_string( + __name__, + os.path.join( + 'fixtures', + 'grobid_incomplete_doc.xml' + ) + ) + + expected_authors = [ + {"parsed_affiliations": None, "author": {"full_name": u"Nandi"}}, + { + "parsed_affiliations": [ + { + "address": { + "cities": [u"Shibpur, Howrah"], + "postal_address": u"Shibpur, Howrah", + } + } + ], + "author": { + "raw_affiliations": [ + { + "value": u"Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India." + } + ], + "full_name": u"Sahu, Sankarshan", + }, + }, + { + "parsed_affiliations": [ + { + "department": [u"S. N"], + "name": u"Bose National Centre for Basic Sciences, JD Block", + } + ], + "author": { + "raw_affiliations": [ + { + "value": u"S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India." 
+ } + ], + "emails": [u"sayankpal@bose.res.in"], + "full_name": u"Pal, Sayan Kumar", + }, + }, + ] + + expected_authors_count = len(expected_authors) + authors = GrobidAuthors(grobid_response) + assert len(authors) == expected_authors_count + assert authors.parse_all() == expected_authors + + +def test_grobid_no_authors(): + + grobid_response = pkg_resources.resource_string( + __name__, + os.path.join( + 'fixtures', + 'grobid_no_authors_doc.xml' + ) + ) + + expected_authors = [] + expected_authors_count = 0 + authors = GrobidAuthors(grobid_response) + assert len(authors) == expected_authors_count + assert authors.parse_all() == expected_authors + + +def test_grobid_empty_author(): + + grobid_response = pkg_resources.resource_string( + __name__, + os.path.join( + 'fixtures', + 'grobid_empty_author_doc.xml' + ) + ) + + expected_authors = [{'parsed_affiliations': None, 'author': {'full_name': u'Abc, Xyz'}}, {'parsed_affiliations': None, 'author': {'emails': [u'some@email.cern'], 'full_name': u'Yzc'}}] + expected_authors_count = 2 + authors = GrobidAuthors(grobid_response) + assert len(authors) == expected_authors_count + assert authors.parse_all() == expected_authors