diff --git a/inspire_utils/grobid_authors_parser.py b/inspire_utils/grobid_authors_parser.py new file mode 100644 index 0000000..d0bc945 --- /dev/null +++ b/inspire_utils/grobid_authors_parser.py @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- +# +# This file is part of INSPIRE. +# Copyright (C) 2020 CERN. +# +# INSPIRE is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# INSPIRE is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>. +# +# In applying this license, CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. 
from __future__ import absolute_import, division, print_function


class GrobidAuthors(object):
    """Sequence-like access to the authors found in a GROBID XML document.

    Only ``<author>`` elements that carry a non-blank ``persName/surname``
    are kept. Indexing and iterating yield :class:`GrobidAuthor` wrappers;
    ``len()`` is the number of kept authors.
    """

    def __init__(self, xml_text):
        """Parse ``xml_text`` and select the usable ``<author>`` elements.

        :param xml_text: the XML document to parse.
        """
        # Imported lazily so that importing this module does not itself
        # require ``parsel``; it is only needed once a document is parsed.
        from parsel import Selector

        self._xml = Selector(text=xml_text, type="xml")
        # Strip namespaces so the plain element names used in the XPath
        # expressions of this module match.
        self._xml.remove_namespaces()
        self._parsed_authors = self._xml.xpath(
            "//author[persName/surname[string-length(normalize-space()) > 0]]"
        )
        # Created on demand by ``parse_one``.
        self._builder = None

    def __getitem__(self, item):
        """Return the author selected by ``item`` wrapped in GrobidAuthor.

        ``item`` is passed unchanged to the underlying selector list.
        """
        return GrobidAuthor(self._parsed_authors[item])

    def __iter__(self):
        """Iterate over the kept authors as :class:`GrobidAuthor` objects."""
        for author_selector in self._parsed_authors:
            yield GrobidAuthor(author_selector)

    def __len__(self):
        """Return the number of kept ``<author>`` elements."""
        return len(self._parsed_authors)

    def parse_one(self):
        """Yield the parsed authors one by one.

        Each item is a dict with two keys:

        * ``author``: the result of ``LiteratureBuilder.make_author`` called
          with the author's full name, raw affiliations and emails;
        * ``parsed_affiliations``: the structured affiliations, or ``None``
          when the author has none.

        A fresh ``LiteratureBuilder`` is stored on the instance every time
        the generator is started.
        """
        # Imported lazily for the same reason as ``parsel`` in ``__init__``.
        from inspire_schemas.builders import LiteratureBuilder

        self._builder = LiteratureBuilder()
        for author in self:
            yield {
                "author": self._builder.make_author(
                    full_name=author.fullname,
                    raw_affiliations=author.raw_affiliations,
                    emails=author.emails,
                ),
                "parsed_affiliations": author.processed_affiliations,
            }

    def parse_all(self):
        """Return all parsed authors at once as a list."""
        return list(self.parse_one())


class GrobidAuthor(object):
    """Accessors for the data carried by a single ``<author>`` selector."""

    # Candidate locations of the postal code inside an ``<affiliation>``.
    # TEI names the element ``postCode``; the previously used ``post_code``
    # is kept as a fallback so documents that matched before still match.
    _POST_CODE_PATHS = ("address/postCode", "address/post_code")

    def __init__(self, author_selector):
        """Wrap ``author_selector``, the selector of one ``<author>``."""
        self._author = author_selector

    @staticmethod
    def _extract(source, path, type=None, text=False):
        """Run ``path`` against ``source``, keeping only non-blank elements.

        :param source: object exposing ``xpath()`` (a selector).
        :param path: relative XPath of the wanted elements.
        :param type: when given, also require ``@type`` to equal this value.
            (The name shadows the builtin but is kept for keyword callers.)
        :param text: when true, select the elements' direct text nodes
            instead of the elements themselves.
        :return: whatever ``source.xpath()`` returns for the final path.
        """
        path += "[string-length(normalize-space()) > 0]"
        if type:
            path += "[@type='{type}']".format(type=type)
        if text:
            path += "/text()"
        return source.xpath(path)

    @staticmethod
    def _clean_texts(texts):
        """Strip every string in ``texts`` and drop the ones left empty.

        The non-blank filter of ``_extract`` applies to elements, not to
        their individual text nodes, so whitespace-only nodes can still
        show up here; they must not become empty list entries or doubled
        separators.
        """
        stripped = (text.strip() for text in texts)
        return [text for text in stripped if text]

    @classmethod
    def _extract_strings_list(cls, source, path, type=None):
        """Return the cleaned text nodes matched by ``path`` as a list."""
        matches = cls._extract(source, path, type, text=True)
        return cls._clean_texts(matches.getall())

    @classmethod
    def _extract_string(cls, source, path, type=None, join_char=" "):
        """Return the cleaned text nodes matched by ``path`` as one string.

        The pieces are joined with ``join_char``; the result is ``""`` when
        nothing matches.
        """
        return join_char.join(cls._extract_strings_list(source, path, type))

    @staticmethod
    def _build_address(street, city, post_code, country):
        """Assemble an address dict from its (possibly empty) parts.

        ``postal_address`` is the ", "-joined list of the non-empty parts in
        the order street, city, post code, country. ``cities``,
        ``postal_code`` and ``country`` are only set for non-empty values.
        Returns ``{}`` when every part is empty.
        """
        parts = [part for part in (street, city, post_code, country) if part]
        address = {}
        if parts:
            address["postal_address"] = ", ".join(parts)
        if city:
            address["cities"] = [city]
        if post_code:
            address["postal_code"] = post_code
        if country:
            address["country"] = country
        return address

    @classmethod
    def _parse_affiliation(cls, affiliation):
        """Turn one ``<affiliation>`` selector into a dict.

        Only the keys with data are present (``name``, ``department``,
        ``address``), so the result may be an empty dict.
        """
        name = cls._extract_string(
            affiliation, "orgName", type="institution", join_char=", "
        )
        department = cls._extract_strings_list(
            affiliation, "orgName", type="department"
        )

        street = cls._extract_string(affiliation, "address/addrLine")
        settlement = cls._extract_string(affiliation, "address/settlement")
        country = cls._extract_string(affiliation, "address/country")
        post_code = ""
        for post_code_path in cls._POST_CODE_PATHS:
            post_code = cls._extract_string(affiliation, post_code_path)
            if post_code:
                break

        address = cls._build_address(street, settlement, post_code, country)

        parsed = {}
        if name:
            parsed["name"] = name
        if department:
            parsed["department"] = department
        if address:
            parsed["address"] = address
        return parsed

    @property
    def names(self):
        """Forenames of the author, space-joined (``""`` when absent)."""
        return self._extract_string(self._author, "persName/forename")

    @property
    def lastname(self):
        """Surname of the author, space-joined (``""`` when absent)."""
        return self._extract_string(self._author, "persName/surname")

    @property
    def fullname(self):
        """``"<lastname>,<names>"``; the comma is present even without names.

        NOTE(review): the trailing comma for surname-only authors looks
        deliberate (it marks the text as a surname for whatever normalises
        the name downstream) - confirm before changing it.
        """
        return ",".join([self.lastname, self.names])

    @property
    def raw_affiliations(self):
        """Texts of the ``affiliation/note`` elements typed raw_affiliation."""
        return self._extract_strings_list(
            self._author, "affiliation/note", type="raw_affiliation"
        )

    @property
    def emails(self):
        """Texts of the author's ``email`` elements."""
        return self._extract_strings_list(self._author, "email")

    @property
    def processed_affiliations(self):
        """Structured affiliations of the author, or ``None`` if there are none.

        One dict is produced per non-blank ``<affiliation>`` element, in
        document order, even when nothing could be extracted from it.
        """
        affiliations = [
            self._parse_affiliation(affiliation)
            for affiliation in self._extract(self._author, "affiliation")
        ]
        return affiliations or None


# NOTE(review): the patch this module comes from also adds a requirement to
# ``setup.py``. As written there, the preceding entry has no trailing comma,
# so Python concatenates the two adjacent string literals into the single
# invalid requirement 'elasticsearch-dsl~=7.1inspire-schemas==61.4.12'.
# The intended entries are:
#
#     'elasticsearch-dsl~=7.1',
#     'inspire-schemas==61.4.12',
# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

# Tests for ``inspire_utils.grobid_authors_parser.GrobidAuthors``.
# NOTE(review): this file is added as tests/test_grobid_authors_paerser.py;
# "paerser" looks like a typo of "parser" - confirm and rename.

from __future__ import absolute_import, division, print_function
import os
import pkg_resources
from inspire_utils.grobid_authors_parser import GrobidAuthors


def test_process_grobid_authors():
    """Parse the ``grobid_full_doc.xml`` fixture and check all three authors."""
    grobid_response = pkg_resources.resource_string(
        __name__,
        os.path.join(
            'fixtures',
            'grobid_full_doc.xml'
        )
    )
    expected_authors = [
        {
            "parsed_affiliations": [
                {
                    "department": [u"S. N"],
                    "name": u"Bose National Centre for Basic Sciences, JD Block",
                    "address": {
                        "country": u"India",
                        "cities": [u"Kolkata-700106"],
                        "postal_address": u"Sector III, Salt Lake, Kolkata-700106, India",
                    },
                }
            ],
            "author": {
                "raw_affiliations": [
                    {
                        "value": u"S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India."
                    }
                ],
                "emails": [u"parthanandi@bose.res.in"],
                "full_name": u"Nandi, Partha",
            },
        },
        {
            "parsed_affiliations": [
                {
                    "department": [
                        u"Indian Institute of Engineering Science and Technology"
                    ],
                    "address": {
                        "country": u"India",
                        "cities": [u"Shibpur, Howrah"],
                        "postal_address": u"Shibpur, Howrah, India",
                    },
                }
            ],
            "author": {
                "raw_affiliations": [
                    {
                        "value": u"Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India."
                    }
                ],
                "emails": [u"sankarshan.sahu2000@gmail.com"],
                "full_name": u"Sahu, Sankarshan",
            },
        },
        {
            "parsed_affiliations": [
                {
                    "department": [u"S. N"],
                    "name": u"Bose National Centre for Basic Sciences, JD Block",
                    "address": {
                        "country": u"India",
                        "cities": [u"Kolkata-700106"],
                        "postal_address": u"Sector III, Salt Lake, Kolkata-700106, India",
                    },
                }
            ],
            "author": {
                "raw_affiliations": [
                    {
                        "value": u"S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India."
                    }
                ],
                "emails": [u"sayankpal@bose.res.in"],
                "full_name": u"Pal, Sayan Kumar",
            },
        },
    ]

    expected_authors_count = len(expected_authors)

    authors = GrobidAuthors(grobid_response)
    # Both the sequence length and the fully parsed output must match.
    assert len(authors) == expected_authors_count
    assert authors.parse_all() == expected_authors


def test_grobid_incomplete_authors():
    """Parse ``grobid_incomplete_doc.xml``; missing parts yield missing keys.

    The expected output has an author with only a full name, one whose parsed
    affiliation has only an address, and one without any address.
    """
    grobid_response = pkg_resources.resource_string(
        __name__,
        os.path.join(
            'fixtures',
            'grobid_incomplete_doc.xml'
        )
    )
    expected_authors = [
        {"parsed_affiliations": None, "author": {"full_name": u"Nandi"}},
        {
            "parsed_affiliations": [
                {
                    "address": {
                        "cities": [u"Shibpur, Howrah"],
                        "postal_address": u"Shibpur, Howrah",
                    }
                }
            ],
            "author": {
                "raw_affiliations": [
                    {
                        "value": u"Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India."
                    }
                ],
                "full_name": u"Sahu, Sankarshan",
            },
        },
        {
            "parsed_affiliations": [
                {
                    "department": [u"S. N"],
                    "name": u"Bose National Centre for Basic Sciences, JD Block",
                }
            ],
            "author": {
                "raw_affiliations": [
                    {
                        "value": u"S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India."
                    }
                ],
                "emails": [u"sayankpal@bose.res.in"],
                "full_name": u"Pal, Sayan Kumar",
            },
        },
    ]

    expected_authors_count = len(expected_authors)
    authors = GrobidAuthors(grobid_response)
    assert len(authors) == expected_authors_count
    assert authors.parse_all() == expected_authors


def test_grobid_no_authors():
    """A document without any author gives an empty sequence and empty list."""
    # NOTE(review): in this copy the literal below has lost its XML markup
    # (only text content and patch "+" markers are left). It is reproduced
    # exactly as found; restore the original document before relying on it.
    input_xml = """ + + + + + + Remarks on noncommutativity and scale anomaly in planar quantum mechanics + + + + + + + January 21, 2021 + + + + + + + + January 21, 2021 + + + arXiv:2101.07076v2[hep-th] + + + + + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + + + + + """
    expected_authors = []
    expected_authors_count = 0
    authors = GrobidAuthors(input_xml)
    assert len(authors) == expected_authors_count
    assert authors.parse_all() == expected_authors


def test_grobid_empty_author():
    """Three author-like entries appear in the input, but two are expected.

    The expected output keeps "Abc, Xyz" and "Yzc" (the latter with an
    email); the entry showing only "FIRST" and an email is not expected.
    """
    # NOTE(review): same damage as in test_grobid_no_authors - the XML markup
    # of this literal is missing in this copy; it is reproduced exactly as
    # found and must be restored from the original file.
    input_xml = """ + + + + + + Remarks on noncommutativity and scale anomaly in planar quantum mechanics + + + + + + + January 21, 2021 + + + + + + + FIRST + + + email@cern.io + + + + XYZ + ABC + + + + + + + YZC + + some@email.cern + + + + + January 21, 2021 + + + arXiv:2101.07076v2[hep-th] + + + + + + + GROBID - A machine learning software for extracting information from scholarly documents + + + + + + + + + + + """
    expected_authors = [{'parsed_affiliations': None, 'author': {'full_name': u'Abc, Xyz'}}, {'parsed_affiliations': None, 'author': {'emails': [u'some@email.cern'], 'full_name': u'Yzc'}}]
    expected_authors_count = 2
    authors = GrobidAuthors(input_xml)
    assert len(authors) == expected_authors_count
    assert authors.parse_all() == expected_authors