diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..df97a7d
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,17 @@
+repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.6.0
+ hooks:
+ - id: check-yaml
+ - id: end-of-file-fixer
+ - id: trailing-whitespace
+ - id: fix-byte-order-marker
+ - id: mixed-line-ending
+ - id: name-tests-test
+ args: [--pytest-test-first]
+ exclude: '^(?!factories/)'
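+ # negative lookahead: paths outside factories/ are excluded, so the naming check only runs there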
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.5.6
+ hooks:
+ - id: ruff
+ args: [--fix, --unsafe-fixes]
diff --git a/Dockerfile b/Dockerfile
index 574e1a9..9cb40f5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,6 +2,6 @@ FROM python:3.8
RUN apt update && apt install poppler-utils -y
COPY setup.py setup.cfg README.rst ./
COPY refextract refextract/
-RUN python setup.py install
+RUN python setup.py install
ENV PROMETHEUS_MULTIPROC_DIR='/tmp'
ENTRYPOINT exec gunicorn -b :5000 --access-logfile - --error-logfile - refextract.app:app --timeout 650
diff --git a/refextract/__init__.py b/refextract/__init__.py
index 3669b7a..92df279 100644
--- a/refextract/__init__.py
+++ b/refextract/__init__.py
@@ -23,7 +23,7 @@
"""Refextract."""
-from .references.api import (
+from refextract.references.api import (
extract_journal_reference,
extract_references_from_file,
extract_references_from_string,
diff --git a/refextract/app.py b/refextract/app.py
index c5e1a1a..6b244e1 100644
--- a/refextract/app.py
+++ b/refextract/app.py
@@ -1,14 +1,15 @@
import logging
from flask import Flask, jsonify, make_response
-from prometheus_flask_exporter.multiprocess import \
- GunicornInternalPrometheusMetrics
+from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics
from webargs import fields
from webargs.flaskparser import FlaskParser
-from refextract.references.api import (extract_journal_reference,
- extract_references_from_string,
- extract_references_from_url)
+from refextract.references.api import (
+ extract_journal_reference,
+ extract_references_from_string,
+ extract_references_from_url,
+)
parser = FlaskParser()
diff --git a/refextract/authors/regexs.py b/refextract/authors/regexs.py
index b169d12..b60e5e8 100644
--- a/refextract/authors/regexs.py
+++ b/refextract/authors/regexs.py
@@ -24,7 +24,7 @@
import logging
import re
-from ..references.config import CFG_REFEXTRACT_KBS
+from refextract.references.config import CFG_REFEXTRACT_KBS
LOGGER = logging.getLogger(__name__)
@@ -42,10 +42,7 @@ def get_author_affiliation_numeration_str(punct=None):
re_number = r'(?:\d\d?)'
re_chained_numbers = r"(?:(?:[,;]\s*%s\.?\s*))*" % re_number
# Punctuation surrounding the number, either general or specific again
- if punct is None:
- re_punct = r"(?:[\{\(\[]?)"
- else:
- re_punct = re.escape(punct)
+ re_punct = r"(?:[\{\(\[]?)" if punct is None else re.escape(punct)
# Generic number finder (MUST NOT INCLUDE NAMED GROUPS!!!)
numeration_str = r"""
@@ -86,10 +83,7 @@ def get_initial_surname_author_pattern(incl_numeration=False):
@return (string): The 'Initials Surname' author pattern."""
# Possible inclusion of superscript numeration at the end of author names
# Will match the empty string
- if incl_numeration:
- append_num_re = get_author_affiliation_numeration_str() + '?'
- else:
- append_num_re = ""
+ append_num_re = get_author_affiliation_numeration_str() + '?' if incl_numeration else ''
return r"""
(?:
@@ -137,10 +131,7 @@ def get_surname_initial_author_pattern(incl_numeration=False):
@return (string): The 'Surname Initials' author pattern."""
# Possible inclusion of superscript numeration at the end of author names
# Will match the empty string
- if incl_numeration:
- append_num_re = get_author_affiliation_numeration_str() + '?'
- else:
- append_num_re = ""
+ append_num_re = get_author_affiliation_numeration_str() + '?' if incl_numeration else ''
return r"""
(?:
@@ -410,27 +401,27 @@ def add_to_auth_list(s):
fpath = CFG_REFEXTRACT_KBS['collaborations']
try:
- fh = open(fpath, "r")
+ with open(fpath, 'rb') as fh:
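+ # read as bytes so the explicit per-line utf-8 decode below still applies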
+ for line_num, rawline in enumerate(fh):
+ try:
+ rawline = rawline.decode("utf-8")
+ except UnicodeError:
+ LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
+ raise UnicodeError(
+ "Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
+ if rawline.strip() and rawline[0].strip() != '#':
+ add_to_auth_list(rawline)
+ # Shorten collaboration to 'coll'
+ if rawline.lower().endswith('collaboration\n'):
+ coll_version = rawline[:rawline.lower().find(
+ u'collaboration\n')] + r"coll[\.\,]"
+ add_to_auth_list(
+ coll_version.strip().replace(' ', r'\s') + u"s?")
except IOError:
# problem opening KB for reading, or problem while reading from it:
LOGGER.debug(u"Error: Could not build knowledge base containing author patterns - failed to read from KB %s s.\n", fpath)
raise IOError("Error: Unable to open collaborations kb '%s'" % fpath)
- for line_num, rawline in enumerate(fh):
- try:
- rawline = rawline.decode("utf-8")
- except UnicodeError:
- LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
- raise UnicodeError(
- "Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
- if rawline.strip() and rawline[0].strip() != '#':
- add_to_auth_list(rawline)
- # Shorten collaboration to 'coll'
- if rawline.lower().endswith('collaboration\n'):
- coll_version = rawline[:rawline.lower().find(
- u'collaboration\n')] + r"coll[\.\,]"
- add_to_auth_list(
- coll_version.strip().replace(' ', r'\s') + u"s?")
author_match_re = ""
if len(auths) > 0:
diff --git a/refextract/documents/pdf.py b/refextract/documents/pdf.py
index 038ef32..e193911 100644
--- a/refextract/documents/pdf.py
+++ b/refextract/documents/pdf.py
@@ -39,7 +39,7 @@
import re
import subprocess
-from ..references.config import CFG_PATH_PDFTOTEXT
+from refextract.references.config import CFG_PATH_PDFTOTEXT
LOGGER = logging.getLogger(__name__)
@@ -56,10 +56,7 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
if not os.path.isfile(CFG_PATH_PDFTOTEXT):
raise IOError('Missing pdftotext executable')
- if keep_layout:
- layout_option = "-layout"
- else:
- layout_option = "-raw"
+ layout_option = '-layout' if keep_layout else '-raw'
doclines = []
# Pattern to check for lines with a leading page-break character.
# If this pattern is matched, we want to split the page-break into
diff --git a/refextract/documents/text.py b/refextract/documents/text.py
index c8022fe..df120b7 100644
--- a/refextract/documents/text.py
+++ b/refextract/documents/text.py
@@ -25,7 +25,6 @@
import re
-
re_space_comma = re.compile(r'\s,', re.UNICODE)
re_space_semicolon = re.compile(r'\s;', re.UNICODE)
re_space_period = re.compile(r'\s\.', re.UNICODE)
@@ -264,12 +263,8 @@ def get_number_header_lines(docbody, page_break_posns):
# pattern to search for a word in a line:
p_wordSearch = re.compile(r'([A-Za-z0-9-]+)', re.UNICODE)
if remaining_breaks > 2:
- if remaining_breaks > 3:
- # Only check odd page headers
- next_head = 2
- else:
- # Check headers on each page
- next_head = 1
+ # With more than 3 breaks remaining, check only odd page headers; otherwise check every page
+ next_head = 2 if remaining_breaks > 3 else 1
keep_checking = 1
while keep_checking:
cur_break = 1
@@ -406,7 +401,7 @@ def strip_headers_footers_pagebreaks(docbody,
for i in range(0, len(page_break_posns)):
# Unless this is the last page break, chop headers
if not first:
- for dummy in range(1, num_head_lines + 1):
+ for _dummy in range(1, num_head_lines + 1):
docbody[page_break_posns[i] +
1:page_break_posns[i] + 2] = []
else:
@@ -415,7 +410,7 @@ def strip_headers_footers_pagebreaks(docbody,
docbody[page_break_posns[i]:page_break_posns[i] + 1] = []
# Chop footers (unless this is the first page break)
if i != len(page_break_posns) - 1:
- for dummy in range(1, num_foot_lines + 1):
+ for _dummy in range(1, num_foot_lines + 1):
docbody[page_break_posns[i] -
num_foot_lines:page_break_posns[i] -
num_foot_lines + 1] = []
@@ -429,7 +424,7 @@ def check_boundary_lines_similar(l_1, l_2):
@return: (int) 1/0.
"""
num_matches = 0
- if (type(l_1) != list) or (type(l_2) != list) or (len(l_1) != len(l_2)):
+ if not isinstance(l_1, list) or not isinstance(l_2, list) or len(l_1) != len(l_2):
# these 'boundaries' are not similar
return 0
diff --git a/refextract/references/api.py b/refextract/references/api.py
index 3f0fcce..ffaaec2 100644
--- a/refextract/references/api.py
+++ b/refextract/references/api.py
@@ -29,25 +29,29 @@
"""
import os
-import requests
-import magic
-
from tempfile import mkstemp
+import magic
+import requests
from inspire_utils.dedupers import dedupe_list
-from .engine import (
+from refextract.references.engine import (
get_kbs,
get_plaintext_document_body,
parse_reference_line,
parse_references,
)
-from .errors import FullTextNotAvailableError
-from .find import (find_numeration_in_body,
- get_reference_section_beginning)
-from .pdf import extract_texkeys_and_urls_from_pdf
-from .text import extract_references_from_fulltext, rebuild_reference_lines
-from .record import update_reference_with_urls
+from refextract.references.errors import FullTextNotAvailableError
+from refextract.references.find import (
+ find_numeration_in_body,
+ get_reference_section_beginning,
+)
+from refextract.references.pdf import extract_texkeys_and_urls_from_pdf
+from refextract.references.record import update_reference_with_urls
+from refextract.references.text import (
+ extract_references_from_fulltext,
+ rebuild_reference_lines,
+)
def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
@@ -146,7 +150,7 @@ def extract_references_from_file(path,
extracted_texkeys_urls = extract_texkeys_and_urls_from_pdf(path)
if len(extracted_texkeys_urls) == len(parsed_refs):
parsed_refs_updated = []
- for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls):
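+ # explicit strict=False keeps zip's old truncating behaviour (flagged by ruff B905)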
+ for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls, strict=False):
update_reference_with_urls(ref, ref_texkey_urls.get('urls', []))
if ref.get('url'):
ref['url'] = dedupe_list(ref['url'])
diff --git a/refextract/references/engine.py b/refextract/references/engine.py
index 9626b5b..ed1cdb3 100644
--- a/refextract/references/engine.py
+++ b/refextract/references/engine.py
@@ -26,49 +26,47 @@
import logging
import mmap
import re
-
from datetime import datetime
import magic
-from .config import (
- CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM,
+from refextract.documents.pdf import convert_PDF_to_plaintext
+from refextract.references.config import (
CFG_REFEXTRACT_MARKER_CLOSING_ARXIV,
+ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL,
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL,
CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND,
- CFG_REFEXTRACT_MARKER_CLOSING_VOLUME,
- CFG_REFEXTRACT_MARKER_CLOSING_YEAR,
CFG_REFEXTRACT_MARKER_CLOSING_PAGE,
- CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID,
- CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL,
- CFG_REFEXTRACT_MARKER_CLOSING_TITLE,
+ CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM,
CFG_REFEXTRACT_MARKER_CLOSING_SERIES,
+ CFG_REFEXTRACT_MARKER_CLOSING_TITLE,
+ CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID,
+ CFG_REFEXTRACT_MARKER_CLOSING_VOLUME,
+ CFG_REFEXTRACT_MARKER_CLOSING_YEAR,
)
-
-from .errors import UnknownDocumentTypeError
-
-from .tag import (
- tag_reference_line,
- sum_2_dictionaries,
- identify_and_tag_DOI,
- identify_and_tag_URLs,
- find_numeration,
- extract_series_from_volume
-)
-from .text import wash_and_repair_reference_line
-from .record import build_references
-from ..documents.pdf import convert_PDF_to_plaintext
-from .kbs import get_kbs
-from .regexs import (
+from refextract.references.errors import UnknownDocumentTypeError
+from refextract.references.kbs import get_kbs
+from refextract.references.record import build_references
+from refextract.references.regexs import (
get_reference_line_numeration_marker_patterns,
- regex_match_list,
- re_tagged_citation,
+ re_hdl,
re_numeration_no_ibid_txt,
- re_roman_numbers,
re_recognised_numeration_for_title_plus_series,
- remove_year,
+ re_roman_numbers,
+ re_tagged_citation,
re_year_in_misc_txt,
- re_hdl)
+ regex_match_list,
+ remove_year,
+)
+from refextract.references.tag import (
+ extract_series_from_volume,
+ find_numeration,
+ identify_and_tag_DOI,
+ identify_and_tag_URLs,
+ sum_2_dictionaries,
+ tag_reference_line,
+)
+from refextract.references.text import wash_and_repair_reference_line
LOGGER = logging.getLogger(__name__)
@@ -307,10 +305,7 @@ def postpone_last_auth(current_citation, num_auth):
if num_auth == 0:
return None
- if num_auth == 1:
- func = current_citation.__getitem__
- else:
- func = current_citation.pop
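+ # read the author in place when it is the only one; pop (remove) it otherwise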
+ func = current_citation.__getitem__ if num_auth == 1 else current_citation.pop
for idx, el in enumerate(reversed(current_citation), 1):
if el["type"] == "AUTH":
@@ -375,10 +370,7 @@ def split_citations_iter(citation_elements):
def valid_citation(citation):
els_to_remove = ('MISC', )
- for el in citation:
- if el['type'] not in els_to_remove:
- return True
- return False
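+ # a citation is valid once it contains anything more specific than MISC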
+ return any(el['type'] not in els_to_remove for el in citation)
def remove_invalid_references(splitted_citations):
@@ -429,11 +421,10 @@ def add_misc(el, txt):
previous_citation_valid = True
for citation in splitted_citations:
current_citation_valid = valid_citation(citation)
- if not current_citation_valid:
+ if not current_citation_valid and not previous_citation_valid:
# Merge to previous one misc txt
- if not previous_citation_valid and not current_citation_valid:
- for el in citation:
- add_misc(previous_citation[-1], el['misc_txt'])
+ for el in citation:
+ add_misc(previous_citation[-1], el['misc_txt'])
previous_citation = citation
previous_citation_valid = current_citation_valid
@@ -475,10 +466,7 @@ def add_year_elements(splitted_citations):
def look_for_implied_ibids(splitted_citations):
def look_for_journal(els):
- for el in els:
- if el['type'] == 'JOURNAL':
- return True
- return False
+ return any(el['type'] == 'JOURNAL' for el in els)
current_journal = None
for citation in splitted_citations:
@@ -616,13 +604,15 @@ def print_citations(splitted_citations, line_marker):
LOGGER.debug('%s %s', el['type'], repr(el))
-def parse_reference_line(ref_line, kbs, bad_titles_count={}, linker_callback=None):
+def parse_reference_line(ref_line, kbs, bad_titles_count=None, linker_callback=None):
"""Parse one reference line
@input a string representing a single reference bullet
@output parsed references (a list of elements objects)
"""
+ if bad_titles_count is None:
+ bad_titles_count = {}
# Strip the 'marker' (e.g. [1]) from this reference line:
line_marker, ref_line = remove_reference_line_marker(ref_line)
# Find DOI sections in citation
ref_line, identified_dois = identify_and_tag_DOI(ref_line)
@@ -776,11 +766,12 @@ def find_substring_ignore_special_chars(s, substr):
i = 0
real_index = 0
re_alphanum = re.compile('[A-Z0-9]')
- for real_index, char in enumerate(s):
+ for char in s:
if re_alphanum.match(char):
i += 1
if i > startIndex:
break
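+ # maintain the raw index by hand now that enumerate() is gone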
+ real_index += 1
return real_index
else:
@@ -801,7 +792,7 @@ def cut_substring_with_special_chars(s, sub, startIndex):
if subPosition >= len(clean_sub):
# include everything till a space, open bracket or a normal
# character
- counter += len(re.split('[ [{(a-zA-Z0-9]', s[startIndex + counter:], 1)[0])
+ counter += len(re.split('[ [{(a-zA-Z0-9]', s[startIndex + counter:], maxsplit=1)[0])
return s[0:startIndex].strip() + ' ' + s[startIndex + counter:].strip()
@@ -810,10 +801,7 @@ def is_unknown_citation(citation):
"""Checks if the citation got recognized as one of the known types.
"""
knownTypes = ['BOOK', 'JOURNAL', 'DOI', 'ISBN', 'RECID']
- for citation_element in citation:
- if citation_element['type'] in knownTypes:
- return False
- return True
+ return all(citation_element['type'] not in knownTypes for citation_element in citation)
def parse_references_elements(ref_sect, kbs, linker_callback=None):
diff --git a/refextract/references/find.py b/refextract/references/find.py
index a88005e..f8c8a13 100644
--- a/refextract/references/find.py
+++ b/refextract/references/find.py
@@ -23,19 +23,21 @@
"""Finding the reference section from the fulltext"""
+import contextlib
import logging
import re
-from .regexs import \
- get_reference_section_title_patterns, \
- get_reference_line_numeration_marker_patterns, \
- regex_match_list, \
- get_post_reference_section_title_patterns, \
- get_post_reference_section_keyword_patterns, \
- re_reference_line_bracket_markers, \
- re_reference_line_dot_markers, \
- re_reference_line_number_markers, \
- re_num
+from refextract.references.regexs import (
+ get_post_reference_section_keyword_patterns,
+ get_post_reference_section_title_patterns,
+ get_reference_line_numeration_marker_patterns,
+ get_reference_section_title_patterns,
+ re_num,
+ re_reference_line_bracket_markers,
+ re_reference_line_dot_markers,
+ re_reference_line_number_markers,
+ regex_match_list,
+)
LOGGER = logging.getLogger(__name__)
@@ -393,11 +395,9 @@ def find_end_of_reference_section(docbody,
# save the reference count
num_match = regex_match_list(docbody[x].strip(), mk_patterns)
if num_match:
- try:
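+ # non-numerical reference markers are simply skipped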
+ with contextlib.suppress(ValueError, IndexError):
current_reference_count = int(num_match.group('marknum'))
- except (ValueError, IndexError):
- # non numerical references marking
- pass
+
# look for a likely section title that would follow a reference
# section:
end_match = regex_match_list(docbody[x].strip(), t_patterns)
diff --git a/refextract/references/kbs.py b/refextract/references/kbs.py
index a1d8184..5a77dbf 100644
--- a/refextract/references/kbs.py
+++ b/refextract/references/kbs.py
@@ -21,21 +21,22 @@
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
-import re
-import six
-import csv
import codecs
import contextlib
+import csv
+import re
-from .config import CFG_REFEXTRACT_KBS
-from .regexs import (
- re_kb_line,
- re_regexp_character_class,
- re_extract_quoted_text,
+import six
+
+from refextract.documents.text import re_group_captured_multiple_space
+from refextract.references.config import CFG_REFEXTRACT_KBS
+from refextract.references.regexs import (
re_extract_char_class,
+ re_extract_quoted_text,
+ re_kb_line,
re_punctuation,
+ re_regexp_character_class,
)
-from ..documents.text import re_group_captured_multiple_space
@contextlib.contextmanager
@@ -91,12 +92,9 @@ def load_kb_by_type(kb_type, kb):
def load_kb(path, builder):
if isinstance(path, dict):
return load_kb_from_iterable(path.items(), builder)
- try:
- path.startswith
- except AttributeError:
- return load_kb_from_iterable(path, builder)
- else:
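+ # string-like paths (anything with .startswith) load from file; other iterables are inline KB entries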
+ elif hasattr(path, 'startswith'):
return load_kb_from_file(path, builder)
+ return load_kb_from_iterable(path, builder)
def order_reportnum_patterns_bylen(numeration_patterns):
@@ -342,25 +340,18 @@ def _add_institute_preprint_patterns(preprint_classifications,
if m_preprint_classification:
# This KB line contains a preprint classification for
# the current institute
- try:
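+ # lines that do not match cleanly are skipped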
+ with contextlib.suppress(AttributeError, NameError):
current_institute_preprint_classifications.append((m_preprint_classification.group(1),
m_preprint_classification.group(2)))
- except (AttributeError, NameError):
- # didn't match this line correctly - skip it
- pass
- # move on to the next line
continue
m_numeration_pattern = re_numeration_pattern.search(rawline)
if m_numeration_pattern:
# This KB line contains a preprint item numeration pattern
# for the current institute
- try:
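+ # numeration patterns that do not match cleanly are skipped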
+ with contextlib.suppress(AttributeError, NameError):
current_institute_numerations.append(
m_numeration_pattern.group(1))
- except (AttributeError, NameError):
- # didn't match the numeration pattern correctly - skip it
- pass
continue
_add_institute_preprint_patterns(current_institute_preprint_classifications,
@@ -538,7 +529,7 @@ def build_journals_kb(knowledgebase):
# Now, for every 'replacement term' found in the KB, if it is
# not already in the KB as a "search term", add it:
- for repl_term in repl_terms.keys():
+ for repl_term in repl_terms:
raw_repl_phrase = repl_term.upper()
raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase)
raw_repl_phrase = \
diff --git a/refextract/references/pdf.py b/refextract/references/pdf.py
index a30992c..a3304f4 100644
--- a/refextract/references/pdf.py
+++ b/refextract/references/pdf.py
@@ -26,7 +26,7 @@
from PyPDF2 import PdfFileReader
from PyPDF2.generic import ByteStringObject
-from .regexs import re_reference_in_dest
+from refextract.references.regexs import re_reference_in_dest
LOGGER = logging.getLogger(__name__)
@@ -173,10 +173,7 @@ def _match_urls_with_reference(
if not two_column_layout or (two_column_layout and url_col == ref_column):
urls_for_reference.add(url[0])
continue
- elif is_last_reference_in_page or is_last_reference_in_page_two_col_layout:
- urls_for_reference.add(url[0])
- continue
- elif is_in_new_column:
+ elif is_last_reference_in_page or is_last_reference_in_page_two_col_layout or is_in_new_column:
urls_for_reference.add(url[0])
continue
elif is_url_unrelated_to_references:
diff --git a/refextract/references/regexs.py b/refextract/references/regexs.py
index d7e10de..330fbc6 100644
--- a/refextract/references/regexs.py
+++ b/refextract/references/regexs.py
@@ -22,7 +22,6 @@
# or submit itself to any jurisdiction.
import re
-
from datetime import datetime
# Sep
@@ -193,7 +192,6 @@ def compute_pos_patterns(patterns):
r"quan-ph": "quant-ph",
r"nlin-cd": "nlin.cd",
r"math-sp": "math.sp",
- r"atro-ph": "astro-ph",
r"ast-ph": "astro-ph",
r"asyro-ph": "astro-ph",
r"aastro-ph": "astro-ph",
@@ -927,10 +925,7 @@ def regex_match_list(line, patterns):
def remove_year(s, year=None):
- if year:
- year_pattern = re.escape(year)
- else:
- year_pattern = r"(?:19|20)\d{2}"
+ year_pattern = re.escape(year) if year else r"(?:19|20)\d{2}"
s = re.sub(r'\[\s*%s\s*\]' % year_pattern, '', s)
s = re.sub(r'\(\s*%s\s*\)' % year_pattern, '', s)
s = re.sub(r'\s*%s\s*' % year_pattern, '', s)
diff --git a/refextract/references/tag.py b/refextract/references/tag.py
index b0156d2..7fba15b 100644
--- a/refextract/references/tag.py
+++ b/refextract/references/tag.py
@@ -22,66 +22,67 @@
# or submit itself to any jurisdiction.
import re
-
from urllib.parse import unquote
from unidecode import unidecode
-from .config import \
- CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL, \
- CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL, \
- CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \
- CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID, \
- CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID, \
- CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION, \
- CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION
-
-from ..documents.text import remove_and_record_multiple_spaces_in_line
-
-from .regexs import \
- re_ibid, \
- re_doi, \
- re_raw_url, \
- re_series_from_numeration, \
- re_punctuation, \
- re_correct_numeration_2nd_try_ptn1, \
- re_correct_numeration_2nd_try_ptn2, \
- re_correct_numeration_2nd_try_ptn3, \
- re_correct_numeration_2nd_try_ptn4, \
- re_numeration_nucphys_vol_page_yr, \
- re_numeration_vol_subvol_nucphys_yr_page, \
- re_numeration_nucphys_vol_yr_page, \
- re_multiple_hyphens, \
- re_numeration_vol_page_yr, \
- re_numeration_vol_yr_page, \
- re_numeration_vol_nucphys_series_yr_page, \
- re_numeration_vol_series_nucphys_page_yr, \
- re_numeration_vol_nucphys_series_page_yr, \
- re_html_tagged_url, \
- re_numeration_yr_vol_page, \
- re_numeration_vol_nucphys_page_yr, \
- re_wash_volume_tag, \
- re_numeration_vol_nucphys_yr_subvol_page, \
- re_quoted, \
- re_isbn, \
- re_arxiv, \
- re_arxiv_5digits, \
- re_new_arxiv, \
- re_new_arxiv_5digits, \
- re_pos, \
- re_pos_year_num, \
- re_series_from_numeration_after_volume, \
- RE_OLD_ARXIV, \
- RE_ARXIV_CATCHUP, \
- RE_ATLAS_CONF_PRE_2010, \
- RE_ATLAS_CONF_POST_2010
-
-from ..authors.regexs import (
- get_author_regexps,
+from refextract.authors.regexs import (
etal_matches,
+ get_author_regexps,
re_ed_notation,
- re_etal)
-from ..documents.text import wash_line
+ re_etal,
+)
+from refextract.documents.text import (
+ remove_and_record_multiple_spaces_in_line,
+ wash_line,
+)
+from refextract.references.config import (
+ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL,
+ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL,
+ CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND,
+ CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION,
+ CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID,
+ CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION,
+ CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID,
+)
+from refextract.references.regexs import (
+ RE_ARXIV_CATCHUP,
+ RE_ATLAS_CONF_POST_2010,
+ RE_ATLAS_CONF_PRE_2010,
+ RE_OLD_ARXIV,
+ re_arxiv,
+ re_arxiv_5digits,
+ re_correct_numeration_2nd_try_ptn1,
+ re_correct_numeration_2nd_try_ptn2,
+ re_correct_numeration_2nd_try_ptn3,
+ re_correct_numeration_2nd_try_ptn4,
+ re_doi,
+ re_html_tagged_url,
+ re_ibid,
+ re_isbn,
+ re_multiple_hyphens,
+ re_new_arxiv,
+ re_new_arxiv_5digits,
+ re_numeration_nucphys_vol_page_yr,
+ re_numeration_nucphys_vol_yr_page,
+ re_numeration_vol_nucphys_page_yr,
+ re_numeration_vol_nucphys_series_page_yr,
+ re_numeration_vol_nucphys_series_yr_page,
+ re_numeration_vol_nucphys_yr_subvol_page,
+ re_numeration_vol_page_yr,
+ re_numeration_vol_series_nucphys_page_yr,
+ re_numeration_vol_subvol_nucphys_yr_page,
+ re_numeration_vol_yr_page,
+ re_numeration_yr_vol_page,
+ re_pos,
+ re_pos_year_num,
+ re_punctuation,
+ re_quoted,
+ re_raw_url,
+ re_series_from_numeration,
+ re_series_from_numeration_after_volume,
+ re_wash_volume_tag,
+)
def tag_reference_line(line, kbs, record_titles_count):
@@ -439,7 +440,7 @@ def tag_atlas_conf(line):
def identifiy_journals_re(line, kb_journals):
matches = {}
- for pattern, dummy_journal in kb_journals:
+ for pattern, _dummy_journal in kb_journals:
match = re.search(pattern, line)
if match:
matches[match.start()] = match.group(0)
@@ -558,10 +559,7 @@ def extract_series_from_volume(volume):
def create_numeration_tag(info):
- if info['series']:
- series_and_volume = info['series'] + info['volume']
- else:
- series_and_volume = info['volume']
+ series_and_volume = info['series'] + info['volume'] if info['series'] else info['volume']
numeration_tags = u' %s' % series_and_volume
if info.get('year', False):
numeration_tags += u' (%(year)s)' % info
@@ -861,7 +859,7 @@ def identify_and_tag_collaborations(line, collaborations_kb):
which won't influence the reference splitting heuristics
(used when looking at mulitple tags in a line).
"""
- for dummy_collab, re_collab in collaborations_kb.items():
+ for _dummy_collab, re_collab in collaborations_kb.items():
matches = re_collab.finditer(strip_tags(line))
for match in reversed(list(matches)):
@@ -967,16 +965,15 @@ def identify_and_tag_authors(line, authors_kb):
add_to_misc = ""
# If a semi-colon was found at the end of this author group, keep it in misc
# so that it can be looked at for splitting heurisitics
- if len(output_line) > m['end']:
- if output_line[m['end']].strip(" ,.") == ';':
- add_to_misc = ';'
+ if len(output_line) > m['end'] and output_line[m['end']].strip(" ,.") == ';':
+ add_to_misc = ';'
# Standardize eds. notation
tmp_output_line = re.sub(re_ed_notation, '(ed.)',
- output_line[start:end], re.IGNORECASE)
+ output_line[start:end], flags=re.IGNORECASE)
# Standardize et al. notation
tmp_output_line = re.sub(re_etal, 'et al.',
- tmp_output_line, re.IGNORECASE)
+ tmp_output_line, flags=re.IGNORECASE)
# Strip
tmp_output_line = tmp_output_line.lstrip('.').strip(",:;- [](")
if not tmp_output_line.endswith('(ed.)'):
@@ -1006,7 +1003,7 @@ def identify_and_tag_authors(line, authors_kb):
ed_notation = " (eds.)"
# Standardize et al. notation
tmp_output_line = re.sub(re_etal, 'et al.',
- m['author_names'], re.IGNORECASE)
+ m['author_names'], flags=re.IGNORECASE)
# remove any characters which denote this author group
# to be editors, just take the
# author names, and append '(ed.)'
@@ -1032,7 +1029,7 @@ def sum_2_dictionaries(dicta, dictb):
@return: (dictionary) - the sum of the 2 dictionaries
"""
dict_out = dicta.copy()
- for key in dictb.keys():
+ for key in dictb:
if 'key' in dict_out:
# Add the sum for key in dictb to that of dict_out:
dict_out[key] += dictb[key]
diff --git a/refextract/references/text.py b/refextract/references/text.py
index 18c6f56..4cfc643 100644
--- a/refextract/references/text.py
+++ b/refextract/references/text.py
@@ -25,15 +25,18 @@
import re
from inspire_utils.record import replace_undesirable_characters
-from ..documents.text import (
+
+from refextract.documents.text import (
join_lines,
- repair_broken_urls,
re_multiple_space,
- remove_page_boundary_lines
+ remove_page_boundary_lines,
+ repair_broken_urls,
+)
+from refextract.references.config import CFG_REFEXTRACT_MAX_LINES
+from refextract.references.find import (
+ find_end_of_reference_section,
+ get_reference_section_beginning,
)
-
-from .config import CFG_REFEXTRACT_MAX_LINES
-from .find import find_end_of_reference_section, get_reference_section_beginning
LOGGER = logging.getLogger(__name__)
diff --git a/ruff.toml b/ruff.toml
new file mode 100644
index 0000000..d347e32
--- /dev/null
+++ b/ruff.toml
@@ -0,0 +1,28 @@
+target-version = "py311"
+[lint.flake8-tidy-imports]
+ban-relative-imports = "all"
+
+[lint]
+select = [
+ # pycodestyle
+ "E",
+ # Pyflakes
+ "F",
+ # flake8-bugbear
+ "B",
+ # flake8-simplify
+ "SIM",
+ # isort
+ "I",
+ # flake8-tidy-imports
+ "TID",
+ # flake8-pytest-style
+ "PT",
+]
+ignore = ["B904","E501"]
+
+[lint.pycodestyle]
+ignore-overlong-task-comments = true
+
+[lint.pydocstyle]
+convention = "google"
diff --git a/setup.py b/setup.py
index a57faca..a75a1e0 100644
--- a/setup.py
+++ b/setup.py
@@ -25,10 +25,10 @@
from setuptools import find_packages, setup
-
url = 'https://github.com/inspirehep/refextract'
-readme = open('README.rst').read()
+with open('README.rst') as file:
+ readme = file.read()
install_requires = [
'PyPDF2~=1.0,>=1.26.0',
@@ -57,7 +57,7 @@
}
extras_require['all'] = []
-for name, reqs in extras_require.items():
+for _name, reqs in extras_require.items():
extras_require['all'].extend(reqs)
packages = find_packages()
diff --git a/tests/conftest.py b/tests/conftest.py
index 0860cab..cd19010 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -26,7 +26,7 @@
import pytest
-@pytest.fixture
+@pytest.fixture()
def pdf_files():
path_to_pdfs = os.path.join(os.path.dirname(__file__), 'data')
pdfs = os.listdir(path_to_pdfs)
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 882022f..fc6bde9 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -6,7 +6,7 @@
@pytest.fixture(autouse=True, scope="session")
def app():
app = create_app()
- yield app
+ return app
@pytest.fixture()
diff --git a/tests/integration/test_views.py b/tests/integration/test_views.py
index 562cd60..e382ffe 100644
--- a/tests/integration/test_views.py
+++ b/tests/integration/test_views.py
@@ -75,7 +75,7 @@ def test_extract_journal_info_when_timeout_from_refextract(
data=json.dumps(payload),
)
assert response.status_code == 500
- assert {'message': "Can not extract publication info data. Reason: 'test message'"} == response.json
+ assert response.json == {'message': "Can not extract publication info data. Reason: 'test message'"}
def test_extract_journal_info_for_multiple_pubinfos(app_client):
@@ -141,7 +141,8 @@ def test_extract_extract_references_from_text(app_client):
assert "year" in response.json["extracted_references"][0]
-@mock.patch("refextract.app.extract_references_from_string", side_effect=KeyError("test message"))
+@mock.patch("refextract.app.extract_references_from_string",
+ side_effect=KeyError("test message"))
def test_extract_references_from_text_when_timeout_from_refextract(
mock_extract_refs, app_client
):
@@ -159,7 +160,7 @@ def test_extract_references_from_text_when_timeout_from_refextract(
"/extract_references_from_text", headers=headers, data=json.dumps(payload)
)
assert response.status_code == 500
- assert {'message': "Can not extract references. Reason: 'test message'"} == response.json
+ assert response.json == {'message': "Can not extract references. Reason: 'test message'"}
@pytest.mark.vcr()
diff --git a/tests/test_api.py b/tests/test_api.py
index 5f5c17e..4948ac2 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -26,22 +26,22 @@
from refextract.references.api import (
extract_journal_reference,
+ extract_references_from_file,
extract_references_from_string,
extract_references_from_url,
- extract_references_from_file,
)
-
from refextract.references.errors import FullTextNotAvailableError
-@pytest.fixture
+@pytest.fixture()
def kbs_override():
return {
"books": [
('Griffiths, David', 'Introduction to elementary particles', '2008')
],
"journals": [
- ("PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS", "Phys.Rev.ST Accel.Beams"),
+ ("PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS",
+ "Phys.Rev.ST Accel.Beams"),
("PHYS REV D", "Phys.Rev.;D"),
("PHYS REV", "Phys.Rev."),
("PHYS REV LETT", "Phys.Rev.Lett."),
@@ -148,15 +148,15 @@ def test_extract_references_from_url(pdf_files):
assert len(r) == 36
assert 'url' in r[0]
+ url = "http://www.example.com"
+ responses.add(
+ responses.GET,
+ url,
+ body="File not found!",
+ status=404,
+ content_type='text/plain',
+ )
with pytest.raises(FullTextNotAvailableError):
- url = "http://www.example.com"
- responses.add(
- responses.GET,
- url,
- body="File not found!",
- status=404,
- content_type='text/plain',
- )
extract_references_from_url(url)
diff --git a/tests/test_engine.py b/tests/test_engine.py
index 0eeb5d4..4486462 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -27,7 +27,6 @@
get_plaintext_document_body,
parse_references,
)
-
from refextract.references.errors import UnknownDocumentTypeError
@@ -329,10 +328,10 @@ def test_get_plaintext_document_body(tmpdir):
f.write("".join(input))
assert input == get_plaintext_document_body(str(f))
+ html = "Some page"
+ f = tmpdir.join("page.html")
+ f.write(html)
with pytest.raises(UnknownDocumentTypeError) as excinfo:
- html = "Some page"
- f = tmpdir.join("page.html")
- f.write(html)
get_plaintext_document_body(str(f))
assert 'text/html' in excinfo.value.args
diff --git a/tests/test_kbs.py b/tests/test_kbs.py
index 7d9c573..c3968a5 100644
--- a/tests/test_kbs.py
+++ b/tests/test_kbs.py
@@ -35,14 +35,14 @@ def test_get_kbs_caches_journal_dict():
first_cache = get_kbs(custom_kbs={"journals": journals}).copy()
assert len(first_cache["journals"]) == 3
- assert ["JOURNAL OF TESTING", "J TESTING"] == first_cache["journals"][-1]
+ assert first_cache["journals"][-1] == ["JOURNAL OF TESTING", "J TESTING"]
journals = journals.copy()
second_cache = get_kbs(custom_kbs={"journals": journals})
# the cache is reused, so identity of the cache elements doesn't change
assert all(
cached_first is cached_second for (cached_first, cached_second)
- in zip(first_cache["journals"], second_cache["journals"])
+ in zip(first_cache["journals"], second_cache["journals"], strict=False)
)
@@ -55,7 +55,7 @@ def test_get_kbs_invalidates_cache_if_input_changes():
# the cache is invalidated, so identity of the cache elements changes
assert all(
cached_first is not cached_second for (cached_first, cached_second)
- in zip(first_cache["journals"], second_cache["journals"])
+ in zip(first_cache["journals"], second_cache["journals"], strict=False)
)
assert len(second_cache["journals"]) == 3
- assert ["JOURNAL OF TESTING", "J TEST"] == second_cache["journals"][-1]
+ assert second_cache["journals"][-1] == ["JOURNAL OF TESTING", "J TEST"]
diff --git a/tests/test_pdf.py b/tests/test_pdf.py
index 61f6b01..3ce6923 100644
--- a/tests/test_pdf.py
+++ b/tests/test_pdf.py
@@ -84,7 +84,6 @@ def test_extract_texkeys_and_urls_from_pdf(pdf_files):
"urls": {
"http://dx.doi.org/10.1016/j.physletb.2013.08.026",
"http://www.arXiv.org/abs/1307.1432",
- "http://www.arXiv.org/abs/1307.1432",
},
},
{"texkey": "CMS_combination", "urls": {"http://www.arXiv.org/abs/1412.8662"}},
@@ -111,7 +110,6 @@ def test_extract_texkeys_and_urls_from_pdf(pdf_files):
"urls": {
"http://dx.doi.org/10.1103/PhysRevD.89.092007",
"http://www.arXiv.org/abs/1312.5353",
- "http://www.arXiv.org/abs/1312.5353",
},
},
{
diff --git a/tests/test_tag.py b/tests/test_tag.py
index e52ca38..ab14c8e 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -22,10 +22,10 @@
# or submit itself to any jurisdiction.
from refextract.references.tag import (
- tag_arxiv,
- identify_ibids,
find_numeration,
find_numeration_more,
+ identify_ibids,
+ tag_arxiv,
)