Skip to content

Commit

Permalink
add grobid authors to utils
Browse files Browse the repository at this point in the history
  • Loading branch information
nooraangelva committed Jul 25, 2022
1 parent 857a80e commit a5d3e2a
Show file tree
Hide file tree
Showing 7 changed files with 344 additions and 126 deletions.
6 changes: 4 additions & 2 deletions inspire_utils/grobid_authors_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@

class GrobidAuthors(object):
def __init__(self, xml_text):

if isinstance(xml_text, str):
xml_text = xml_text.decode('utf-8')
xml_text.t
self._xml = Selector(text=xml_text, type="xml")

self._xml.remove_namespaces()
Expand Down Expand Up @@ -69,7 +71,7 @@ def __init__(self, author_selector):
def _extract(source, path, type=None, text=False):
path += "[string-length(normalize-space()) > 0]"
if type:
path += "[@type='{type}']".format(type=type)
path += u"[@type='{}']".format(type)
if text:
path += "/text()"
return source.xpath(path)
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,9 @@
'python-dateutil~=2.0,>=2.6.1',
'six~=1.0,>=1.10.0',
'elasticsearch==7.1.0',
'elasticsearch-dsl~=7.1'
'inspire-schemas==61.4.12'
'elasticsearch-dsl~=7.1',
'inspire-schemas==61.4.12',
'parsel>=1.5'
]

docs_require = []
Expand Down
69 changes: 69 additions & 0 deletions tests/fixtures/grobid_empty_author_doc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve"
xmlns="http://www.tei-c.org/ns/1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/xsd/Grobid.xsd"
xmlns:xlink="http://www.w3.org/1999/xlink">
<teiHeader xml:lang="en">
<fileDesc>
<titleStmt>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</titleStmt>
<publicationStmt>
<publisher/>
<availability status="unknown">
<licence/>
</availability>
<date type="published" when="2021-01-21">January 21, 2021</date>
</publicationStmt>
<sourceDesc>
<biblStruct>
<analytic>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first"> FIRST </forename>
<surname></surname>
</persName>
<email> [email protected] </email>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">XYZ</forename>
<surname>ABC</surname>
</persName>
<email> </email>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first"> </forename>
<surname>YZC</surname>
</persName>
<email> [email protected] </email>
</author>
</analytic>
<monogr>
<imprint>
<date type="published" when="2021-01-21">January 21, 2021</date>
</imprint>
</monogr>
<idno type="arXiv">arXiv:2101.07076v2[hep-th]</idno>
</biblStruct>
</sourceDesc>
</fileDesc>
<encodingDesc>
<appInfo>
<application version="0.6.1" ident="GROBID" when="2021-02-09T09:29+0000">
<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
<ref target="https://github.com/kermitt2/grobid"/>
</application>
</appInfo>
</encodingDesc>
<profileDesc>
<abstract/>
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
109 changes: 109 additions & 0 deletions tests/fixtures/grobid_full_doc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve"
xmlns="http://www.tei-c.org/ns/1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/xsd/Grobid.xsd"
xmlns:xlink="http://www.w3.org/1999/xlink">
<teiHeader xml:lang="en">
<fileDesc>
<titleStmt>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</titleStmt>
<publicationStmt>
<publisher/>
<availability status="unknown">
<licence/>
</availability>
<date type="published" when="2021-01-21">January 21, 2021</date>
</publicationStmt>
<sourceDesc>
<biblStruct>
<analytic>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">Partha</forename>
<surname>Nandi</surname>
</persName>
<email>[email protected]</email>
<affiliation key="aff0">
<note type="raw_affiliation">
<label>1</label> S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India.
</note>
<orgName type="department">S. N</orgName>
<orgName type="institution" key="instit1">Bose National Centre for Basic Sciences</orgName>
<orgName type="institution" key="instit2">JD Block</orgName>
<address>
<addrLine>Sector III, Salt Lake</addrLine>
<settlement>Kolkata-700106</settlement>
<country key="IN">India</country>
</address>
</affiliation>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">Sankarshan</forename>
<surname>Sahu</surname>
</persName>
<email>[email protected]</email>
<affiliation key="aff1">
<note type="raw_affiliation">
<label>2</label> Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India.
</note>
<orgName type="department">Indian Institute of Engineering Science and Technology</orgName>
<address>
<postCode>Bengal-711103</postCode>
<settlement>Shibpur, Howrah</settlement>
<region>West</region>
<country key="IN">India</country>
</address>
</affiliation>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">Sayan</forename>
<forename type="middle">Kumar</forename>
<surname>Pal</surname>
</persName>
<email>[email protected]</email>
<affiliation key="aff0">
<note type="raw_affiliation">
<label>1</label> S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India.
</note>
<orgName type="department">S. N</orgName>
<orgName type="institution" key="instit1">Bose National Centre for Basic Sciences</orgName>
<orgName type="institution" key="instit2">JD Block</orgName>
<address>
<addrLine>Sector III, Salt Lake</addrLine>
<settlement>Kolkata-700106</settlement>
<country key="IN">India</country>
</address>
</affiliation>
</author>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</analytic>
<monogr>
<imprint>
<date type="published" when="2021-01-21">January 21, 2021</date>
</imprint>
</monogr>
<idno type="arXiv">arXiv:2101.07076v2[hep-th]</idno>
</biblStruct>
</sourceDesc>
</fileDesc>
<encodingDesc>
<appInfo>
<application version="0.6.1" ident="GROBID" when="2021-02-09T09:29+0000">
<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
<ref target="https://github.com/kermitt2/grobid"/>
</application>
</appInfo>
</encodingDesc>
<profileDesc>
<abstract/>
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
85 changes: 85 additions & 0 deletions tests/fixtures/grobid_incomplete_doc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve"
xmlns="http://www.tei-c.org/ns/1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/xsd/Grobid.xsd"
xmlns:xlink="http://www.w3.org/1999/xlink">
<teiHeader xml:lang="en">
<fileDesc>
<titleStmt>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</titleStmt>
<publicationStmt>
<publisher/>
<availability status="unknown">
<licence/>
</availability>
<date type="published" when="2021-01-21">January 21, 2021</date>
</publicationStmt>
<sourceDesc>
<biblStruct>
<analytic>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<surname>Nandi</surname>
</persName>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">Sankarshan</forename>
<surname>Sahu</surname>
</persName>
<affiliation key="aff1">
<note type="raw_affiliation">
<label>2</label> Indian Institute of Engineering Science and Technology, Shibpur, Howrah, West Bengal-711103, India.
</note>
<address>
<postCode>Bengal-711103</postCode>
<settlement>Shibpur, Howrah</settlement>
</address>
</affiliation>
</author>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<forename type="first">Sayan</forename>
<forename type="middle">Kumar</forename>
<surname>Pal</surname>
</persName>
<email>[email protected]</email>
<affiliation key="aff0">
<note type="raw_affiliation">
<label>1</label> S. N. Bose National Centre for Basic Sciences, JD Block, Sector III, Salt Lake, Kolkata-700106, India.
</note>
<orgName type="department">S. N</orgName>
<orgName type="institution" key="instit1">Bose National Centre for Basic Sciences</orgName>
<orgName type="institution" key="instit2">JD Block</orgName>
</affiliation>
</author>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</analytic>
<monogr>
<imprint>
<date type="published" when="2021-01-21">January 21, 2021</date>
</imprint>
</monogr>
<idno type="arXiv">arXiv:2101.07076v2[hep-th]</idno>
</biblStruct>
</sourceDesc>
</fileDesc>
<encodingDesc>
<appInfo>
<application version="0.6.1" ident="GROBID" when="2021-02-09T09:29+0000">
<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
<ref target="https://github.com/kermitt2/grobid"/>
</application>
</appInfo>
</encodingDesc>
<profileDesc>
<abstract/>
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
52 changes: 52 additions & 0 deletions tests/fixtures/grobid_no_authors_doc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve"
xmlns="http://www.tei-c.org/ns/1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /opt/grobid/grobid-home/schemas/xsd/Grobid.xsd"
xmlns:xlink="http://www.w3.org/1999/xlink">
<teiHeader xml:lang="en">
<fileDesc>
<titleStmt>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</titleStmt>
<publicationStmt>
<publisher/>
<availability status="unknown">
<licence/>
</availability>
<date type="published" when="2021-01-21">January 21, 2021</date>
</publicationStmt>
<sourceDesc>
<biblStruct>
<analytic>
<author>
<persName
xmlns="http://www.tei-c.org/ns/1.0">
<surname> </surname>
</persName>
</author>
<title level="a" type="main">Remarks on noncommutativity and scale anomaly in planar quantum mechanics</title>
</analytic>
<monogr>
<imprint>
<date type="published" when="2021-01-21">January 21, 2021</date>
</imprint>
</monogr>
<idno type="arXiv">arXiv:2101.07076v2[hep-th]</idno>
</biblStruct>
</sourceDesc>
</fileDesc>
<encodingDesc>
<appInfo>
<application version="0.6.1" ident="GROBID" when="2021-02-09T09:29+0000">
<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
<ref target="https://github.com/kermitt2/grobid"/>
</application>
</appInfo>
</encodingDesc>
<profileDesc>
<abstract/>
</profileDesc>
</teiHeader>
<text xml:lang="en"></text>
</TEI>
Loading

0 comments on commit a5d3e2a

Please sign in to comment.