Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add grobidAuthors #79

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions inspire_utils/grobid_authors_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2020 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

from __future__ import absolute_import, division, print_function
from inspire_schemas.builders import LiteratureBuilder
from parsel import Selector


class GrobidAuthors(object):
    """Sequence-like view over the authors found in a GROBID XML document.

    Only ``<author>`` elements whose ``persName/surname`` contains
    non-whitespace text are kept.  Indexing wraps the selected node in a
    ``GrobidAuthor``; ``len()`` gives the number of kept authors.
    """

    def __init__(self, xml_text):
        """Parse ``xml_text`` (text, or UTF-8 encoded bytes) and select authors."""
        text = xml_text.decode('utf-8') if isinstance(xml_text, bytes) else xml_text

        selector = Selector(text=text, type="xml")
        # Namespaces are dropped so the XPath below can use bare tag names.
        selector.remove_namespaces()

        self._xml = selector
        self._parsed_authors = selector.xpath(
            "//author[persName/surname[string-length(normalize-space()) > 0]]"
        )
        self._builder = None

    def __getitem__(self, item):
        """Return the author at position ``item`` wrapped as a ``GrobidAuthor``."""
        selected = self._parsed_authors[item]
        return GrobidAuthor(selected)

    def __len__(self):
        """Return how many authors were selected from the document."""
        return len(self._parsed_authors)

    def parse_one(self):
        """Lazily yield one dict per author.

        Each dict has the key ``'author'`` (the result of
        ``LiteratureBuilder.make_author``) and the key
        ``'parsed_affiliations'`` (the author's ``processed_affiliations``).
        A new builder is created every time iteration starts.
        """
        self._builder = LiteratureBuilder()
        # Iteration goes through ``__getitem__`` and stops on IndexError.
        for author in self:
            built_author = self._builder.make_author(
                full_name=author.fullname,
                raw_affiliations=author.raw_affiliations,
                emails=author.emails,
            )
            affiliations = author.processed_affiliations
            yield {
                'author': built_author,
                'parsed_affiliations': affiliations,
            }

    def parse_all(self):
        """Return the output of ``parse_one`` collected into a list."""
        return [entry for entry in self.parse_one()]


class GrobidAuthor(object):
    """Accessor for the data stored under a single GROBID ``<author>`` node.

    The wrapped object only needs an ``xpath(path)`` method whose result
    supports iteration and ``getall()`` (e.g. a ``parsel`` selector).
    """

    def __init__(self, author_selector):
        """Keep a reference to the selector of the ``<author>`` element."""
        self._author = author_selector

    @staticmethod
    def _extract(source, path, type=None, text=False):
        """Query ``source`` for the non-blank elements matching ``path``.

        :param source: object exposing ``xpath(path)``.
        :param path: relative XPath of the wanted elements.
        :param type: when given, keep only elements whose ``type`` attribute
            equals this value.
        :param text: when true, select the elements' direct text nodes
            instead of the elements themselves.
        :return: whatever ``source.xpath`` returns for the built expression.
        """
        # Elements whose normalized text content is empty are skipped.
        path += "[string-length(normalize-space()) > 0]"
        if type:
            path += u"[@type='{}']".format(type)
        if text:
            path += "/text()"
        return source.xpath(path)

    @classmethod
    def _extract_texts(cls, source, path, type=None):
        """Return the stripped, non-empty text nodes selected by ``path``.

        An element with mixed content (for instance a ``<note>`` holding a
        ``<label>`` child) also exposes whitespace-only text nodes; those are
        dropped here so they never surface as empty strings.
        """
        raw_texts = cls._extract(source, path, type, text=True).getall()
        stripped = (raw.strip() for raw in raw_texts)
        return [value for value in stripped if value]

    @classmethod
    def _extract_string(cls, source, path, type=None, join_char=u' '):
        """Return all matching texts joined with ``join_char`` ('' if none)."""
        return join_char.join(cls._extract_texts(source, path, type))

    @classmethod
    def _extract_strings_list(cls, source, path, type=None):
        """Return all matching texts as a list of stripped, non-empty strings."""
        return cls._extract_texts(source, path, type)

    @staticmethod
    def _build_address(street, city, post_code, country):
        """Assemble an address dict from the given (possibly empty) parts.

        ``postal_address`` is the comma-joined list of the non-empty parts;
        ``cities``, ``postal_code`` and ``country`` are only set when the
        corresponding part is non-empty.  Returns ``{}`` when every part is
        empty.
        """
        address_list = [element for element in [street, city, post_code, country] if element]
        address = {"postal_address": ', '.join(address_list)} if address_list else {}
        if city:
            address['cities'] = [city]
        if post_code:
            address['postal_code'] = post_code
        if country:
            address['country'] = country
        return address

    @property
    def names(self):
        """All ``persName/forename`` texts joined with a space."""
        return self._extract_string(self._author, "persName/forename")

    @property
    def lastname(self):
        """All ``persName/surname`` texts joined with a space."""
        return self._extract_string(self._author, "persName/surname")

    @property
    def fullname(self):
        """``lastname`` and ``names`` joined with a comma.

        When no forename is present the result ends with the comma.
        """
        return u",".join([self.lastname, self.names])

    @property
    def raw_affiliations(self):
        """Texts of every ``affiliation/note`` typed ``raw_affiliation``."""
        return self._extract_strings_list(self._author, "affiliation/note", type="raw_affiliation")

    @property
    def emails(self):
        """Texts of every non-blank ``email`` element of the author."""
        return self._extract_strings_list(self._author, "email")

    @property
    def processed_affiliations(self):
        """Structured affiliations of the author, or ``None`` when there are none.

        One dict is produced per non-blank ``<affiliation>`` element, with
        the optional keys ``name`` (institution names joined with ', '),
        ``department`` (list of department names) and ``address`` (see
        ``_build_address``).
        """
        affiliations = []
        for affiliation in self._extract(self._author, "affiliation"):
            affiliation_obj = {}
            name = self._extract_string(affiliation, "orgName", type="institution", join_char=', ')
            department = self._extract_strings_list(affiliation, "orgName", type="department")

            street = self._extract_string(affiliation, 'address/addrLine')
            settlement = self._extract_string(affiliation, 'address/settlement')
            # The TEI element is spelled ``postCode``; querying ``post_code``
            # never matches anything and silently loses the postal code.
            post_code = self._extract_string(affiliation, 'address/postCode')
            country = self._extract_string(affiliation, 'address/country')

            address = self._build_address(street, settlement, post_code, country)

            if name:
                affiliation_obj['name'] = name
            if department:
                affiliation_obj['department'] = department
            if address:
                affiliation_obj['address'] = address
            affiliations.append(affiliation_obj)
        return affiliations or None
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@
'python-dateutil~=2.0,>=2.6.1',
'six~=1.0,>=1.10.0',
'elasticsearch==7.1.0',
'elasticsearch-dsl~=7.1'
'elasticsearch-dsl~=7.1',
'inspire-schemas==61.4.12',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That might be problematic, because inspire-schemas requires inspire-utils too, so this would create a circular dependency :/

'parsel>=1.5'
]

docs_require = []
Expand Down
69 changes: 69 additions & 0 deletions tests/fixtures/grobid_empty_author_doc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve"
xmlns="http://www.tei-c.org/ns/1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/xsd/Grobid.xsd"
xmlns:xlink="http://www.w3.org/1999/xlink">
<teiHeader xml:lang="en">
<fileDesc>
<titleStmt>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</titleStmt>
<publicationStmt>
<publisher/>
<availability status="unknown">
<licence/>
</availability>
<date type="published" when="2021-01-21">January 21, 2021</date>
</publicationStmt>
<sourceDesc>
<biblStruct>
<analytic>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first"> FIRST </forename>
<surname></surname>
</persName>
<email> [email protected] </email>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">XYZ</forename>
<surname>ABC</surname>
</persName>
<email> </email>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first"> </forename>
<surname>YZC</surname>
</persName>
<email> [email protected] </email>
</author>
</analytic>
<monogr>
<imprint>
<date type="published" when="2021-01-21">January 21, 2021</date>
</imprint>
</monogr>
<idno type="arXiv">arXiv:2101.07076v2[hep-th]</idno>
</biblStruct>
</sourceDesc>
</fileDesc>
<encodingDesc>
<appInfo>
<application version="0.6.1" ident="GROBID" when="2021-02-09T09:29+0000">
<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
<ref target="https://github.com/kermitt2/grobid"/>
</application>
</appInfo>
</encodingDesc>
<profileDesc>
<abstract/>
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
109 changes: 109 additions & 0 deletions tests/fixtures/grobid_full_doc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve"
xmlns="http://www.tei-c.org/ns/1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/xsd/Grobid.xsd"
xmlns:xlink="http://www.w3.org/1999/xlink">
<teiHeader xml:lang="en">
<fileDesc>
<titleStmt>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</titleStmt>
<publicationStmt>
<publisher/>
<availability status="unknown">
<licence/>
</availability>
<date type="published" when="2021-01-21">January 21, 2021</date>
</publicationStmt>
<sourceDesc>
<biblStruct>
<analytic>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">Partha</forename>
<surname>Nandi</surname>
</persName>
<email>[email protected]</email>
<affiliation key="aff0">
<note type="raw_affiliation">
<label>1</label> S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India.
</note>
<orgName type="department">S. N</orgName>
<orgName type="institution" key="instit1">Bose National Centre for Basic Sciences</orgName>
<orgName type="institution" key="instit2">JD Block</orgName>
<address>
<addrLine>Sector III, Salt Lake</addrLine>
<settlement>Kolkata-700106</settlement>
<country key="IN">India</country>
</address>
</affiliation>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">Sankarshan</forename>
<surname>Sahu</surname>
</persName>
<email>[email protected]</email>
<affiliation key="aff1">
<note type="raw_affiliation">
<label>2</label> Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India.
</note>
<orgName type="department">Indian Institute of Engineering Science and Technology</orgName>
<address>
<postCode>Bengal-711103</postCode>
<settlement>Shibpur, Howrah</settlement>
<region>West</region>
<country key="IN">India</country>
</address>
</affiliation>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">Sayan</forename>
<forename type="middle">Kumar</forename>
<surname>Pal</surname>
</persName>
<email>[email protected]</email>
<affiliation key="aff0">
<note type="raw_affiliation">
<label>1</label> S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India.
</note>
<orgName type="department">S. N</orgName>
<orgName type="institution" key="instit1">Bose National Centre for Basic Sciences</orgName>
<orgName type="institution" key="instit2">JD Block</orgName>
<address>
<addrLine>Sector III, Salt Lake</addrLine>
<settlement>Kolkata-700106</settlement>
<country key="IN">India</country>
</address>
</affiliation>
</author>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</analytic>
<monogr>
<imprint>
<date type="published" when="2021-01-21">January 21, 2021</date>
</imprint>
</monogr>
<idno type="arXiv">arXiv:2101.07076v2[hep-th]</idno>
</biblStruct>
</sourceDesc>
</fileDesc>
<encodingDesc>
<appInfo>
<application version="0.6.1" ident="GROBID" when="2021-02-09T09:29+0000">
<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
<ref target="https://github.com/kermitt2/grobid"/>
</application>
</appInfo>
</encodingDesc>
<profileDesc>
<abstract/>
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
Loading