From ce53808a9f12333a43a07998682236ed95f5c828 Mon Sep 17 00:00:00 2001 From: DonHaul Date: Wed, 2 Oct 2024 17:29:40 +0200 Subject: [PATCH] fix test index name --- .pre-commit-config.yaml | 17 +++++ Dockerfile | 2 +- Untitled.ipynb | 106 ++++++++++++++++++++++++++ refextract/__init__.py | 2 +- refextract/app.py | 11 +-- refextract/authors/regexs.py | 49 +++++------- refextract/documents/pdf.py | 7 +- refextract/documents/text.py | 15 ++-- refextract/references/api.py | 26 ++++--- refextract/references/engine.py | 92 ++++++++++------------ refextract/references/find.py | 28 +++---- refextract/references/kbs.py | 39 ++++------ refextract/references/pdf.py | 7 +- refextract/references/regexs.py | 7 +- refextract/references/tag.py | 131 ++++++++++++++++---------------- refextract/references/text.py | 15 ++-- ruff.toml | 28 +++++++ setup.py | 6 +- tests/conftest.py | 2 +- tests/integration/conftest.py | 2 +- tests/integration/test_views.py | 7 +- tests/test_api.py | 24 +++--- tests/test_engine.py | 7 +- tests/test_kbs.py | 8 +- tests/test_pdf.py | 2 - tests/test_tag.py | 4 +- 26 files changed, 376 insertions(+), 268 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 Untitled.ipynb create mode 100644 ruff.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..df97a7d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: fix-byte-order-marker + - id: mixed-line-ending + - id: name-tests-test + args: [ --pytest-test-first ] + exclude: '^(?!factories/)' + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.5.6 + hooks: + - id: ruff + args: [ --fix , --unsafe-fixes] diff --git a/Dockerfile b/Dockerfile index 574e1a9..9cb40f5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,6 @@ FROM python:3.8 RUN apt update && apt install poppler-utils -y COPY setup.py setup.cfg README.rst ./ COPY refextract refextract/ -RUN python setup.py install +RUN python setup.py install ENV PROMETHEUS_MULTIPROC_DIR='/tmp' ENTRYPOINT exec gunicorn -b :5000 --access-logfile - --error-logfile - refextract.app:app --timeout 650 diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..dc7b752 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,106 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "163771b1-17d9-4648-875c-63f1a54c9201", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "1\n", + "2\n", + "3\n", + "4\n", + "5\n", + "6\n", + "6\n" + ] + } + ], + "source": [ + "real_index = 0\n", + "s = \"sdasdas\"\n", + "\n", + "for real_index, char in enumerate(s):\n", + " print(real_index)\n", + "\n", + "print(real_index)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6132dad4-7fce-4719-beea-693eb32eed16", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'asdsad'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"asdsad\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d72da078-f2c3-4879-a1a1-7557688ee727", + "metadata": {}, + "outputs": [], + "source": [ + "path = \"adsad\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4497ac16-b4fd-407a-b567-2b5a67ec5d55", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "wow\n" + ] + } + ], + "source": [ + "if path.startswith:\n", + " print(\"wow\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/refextract/__init__.py b/refextract/__init__.py index 3669b7a..92df279 100644 --- a/refextract/__init__.py +++ b/refextract/__init__.py @@ -23,7 +23,7 @@ """Refextract.""" -from .references.api import ( +from refextract.references.api import ( extract_journal_reference, extract_references_from_file, extract_references_from_string, diff --git a/refextract/app.py b/refextract/app.py index c5e1a1a..6b244e1 100644 --- a/refextract/app.py +++ b/refextract/app.py @@ -1,14 +1,15 @@ import logging from flask import Flask, jsonify, make_response -from prometheus_flask_exporter.multiprocess import \ - GunicornInternalPrometheusMetrics +from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics from webargs import fields from webargs.flaskparser import FlaskParser -from refextract.references.api import (extract_journal_reference, - extract_references_from_string, - extract_references_from_url) +from refextract.references.api import ( + extract_journal_reference, + extract_references_from_string, + extract_references_from_url, +) parser = FlaskParser() diff --git a/refextract/authors/regexs.py b/refextract/authors/regexs.py index b169d12..b60e5e8 100644 --- a/refextract/authors/regexs.py +++ b/refextract/authors/regexs.py @@ -24,7 +24,7 @@ import logging import re -from ..references.config import CFG_REFEXTRACT_KBS +from refextract.references.config import CFG_REFEXTRACT_KBS LOGGER = logging.getLogger(__name__) @@ -42,10 +42,7 @@ def get_author_affiliation_numeration_str(punct=None): re_number = r'(?:\d\d?)' re_chained_numbers = r"(?:(?:[,;]\s*%s\.?\s*))*" % re_number # Punctuation surrounding the number, either general or specific again - if punct is None: - re_punct = r"(?:[\{\(\[]?)" - else: - re_punct = re.escape(punct) + re_punct = '(?:[\\{\\(\\[]?)' if punct is None else re.escape(punct) # Generic number finder (MUST NOT INCLUDE NAMED GROUPS!!!) numeration_str = r""" @@ -86,10 +83,7 @@ def get_initial_surname_author_pattern(incl_numeration=False): @return (string): The 'Initials Surname' author pattern.""" # Possible inclusion of superscript numeration at the end of author names # Will match the empty string - if incl_numeration: - append_num_re = get_author_affiliation_numeration_str() + '?' - else: - append_num_re = "" + append_num_re = get_author_affiliation_numeration_str() + '?' if incl_numeration else '' return r""" (?: @@ -137,10 +131,7 @@ def get_surname_initial_author_pattern(incl_numeration=False): @return (string): The 'Surname Initials' author pattern.""" # Possible inclusion of superscript numeration at the end of author names # Will match the empty string - if incl_numeration: - append_num_re = get_author_affiliation_numeration_str() + '?' - else: - append_num_re = "" + append_num_re = get_author_affiliation_numeration_str() + '?' 
if incl_numeration else ''

    return r"""
    (?:
@@ -410,27 +401,27 @@ def add_to_auth_list(s):
     fpath = CFG_REFEXTRACT_KBS['collaborations']
     try:
-        fh = open(fpath, "r")
+        with open(fpath, 'rb') as fh:
+            for line_num, rawline in enumerate(fh):
+                try:
+                    rawline = rawline.decode("utf-8")
+                except UnicodeError:
+                    LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
+                    raise UnicodeError(
+                        "Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
+                if rawline.strip() and rawline[0].strip() != '#':
+                    add_to_auth_list(rawline)
+                    # Shorten collaboration to 'coll'
+                    if rawline.lower().endswith('collaboration\n'):
+                        coll_version = rawline[:rawline.lower().find(
+                            u'collaboration\n')] + r"coll[\.\,]"
+                        add_to_auth_list(
+                            coll_version.strip().replace(' ', r'\s') + u"s?")
     except IOError:
         # problem opening KB for reading, or problem while reading from it:
         LOGGER.debug(u"Error: Could not build knowledge base containing author patterns - failed to read from KB %s s.\n", fpath)
         raise IOError("Error: Unable to open collaborations kb '%s'" % fpath)
-    for line_num, rawline in enumerate(fh):
-        try:
-            rawline = rawline.decode("utf-8")
-        except UnicodeError:
-            LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
-            raise UnicodeError(
-                "Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
-        if rawline.strip() and rawline[0].strip() != '#':
-            add_to_auth_list(rawline)
-            # Shorten collaboration to 'coll'
-            if rawline.lower().endswith('collaboration\n'):
-                coll_version = rawline[:rawline.lower().find(
-                    u'collaboration\n')] + r"coll[\.\,]"
-                add_to_auth_list(
-                    coll_version.strip().replace(' ', r'\s') + u"s?")

     author_match_re = ""
     if len(auths) > 0:
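Note on the collaborations KB loop above: on Python 3 a text-mode handle yields str lines, so rawline.decode("utf-8") only works if the file is opened in binary mode ('rb'); on a text-mode handle the call raises AttributeError and the UnicodeError branch that reports the offending line number can never fire. A minimal sketch of the pattern, using io.BytesIO as a stand-in for the KB file:

    import io

    # Two hypothetical KB lines: valid UTF-8, then an undecodable byte sequence.
    kb = io.BytesIO(b"ATLAS Collaboration\n\xff\xfe broken\n")

    for line_num, rawline in enumerate(kb):
        try:
            line = rawline.decode("utf-8")
        except UnicodeError:  # UnicodeDecodeError is a subclass
            print("cannot parse collaboration kb (line: %d)" % line_num)
            continue
        print(line.strip())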
diff --git a/refextract/documents/pdf.py b/refextract/documents/pdf.py
index 038ef32..e193911 100644
--- a/refextract/documents/pdf.py
+++ b/refextract/documents/pdf.py
@@ -39,7 +39,7 @@
 import re
 import subprocess
-from ..references.config import CFG_PATH_PDFTOTEXT
+from refextract.references.config import CFG_PATH_PDFTOTEXT

 LOGGER = logging.getLogger(__name__)
@@ -56,10 +56,7 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
     if not os.path.isfile(CFG_PATH_PDFTOTEXT):
         raise IOError('Missing pdftotext executable')
-    if keep_layout:
-        layout_option = "-layout"
-    else:
-        layout_option = "-raw"
+    layout_option = '-layout' if keep_layout else '-raw'
     doclines = []
     # Pattern to check for lines with a leading page-break character.
     # If this pattern is matched, we want to split the page-break into
diff --git a/refextract/documents/text.py b/refextract/documents/text.py
index c8022fe..df120b7 100644
--- a/refextract/documents/text.py
+++ b/refextract/documents/text.py
@@ -25,7 +25,6 @@
 import re
-
 re_space_comma = re.compile(r'\s,', re.UNICODE)
 re_space_semicolon = re.compile(r'\s;', re.UNICODE)
 re_space_period = re.compile(r'\s\.', re.UNICODE)
@@ -264,12 +263,8 @@ def get_number_header_lines(docbody, page_break_posns):
     # pattern to search for a word in a line:
     p_wordSearch = re.compile(r'([A-Za-z0-9-]+)', re.UNICODE)
     if remaining_breaks > 2:
-        if remaining_breaks > 3:
-            # Only check odd page headers
-            next_head = 2
-        else:
-            # Check headers on each page
-            next_head = 1
+        # With more than 3 page breaks, check only odd page headers;
+        # otherwise check the headers on every page
+        next_head = 2 if remaining_breaks > 3 else 1
         keep_checking = 1
         while keep_checking:
             cur_break = 1
@@ -406,7 +401,7 @@ def strip_headers_footers_pagebreaks(docbody,
     for i in range(0, len(page_break_posns)):
         # Unless this is the last page break, chop headers
         if not first:
-            for dummy in range(1, num_head_lines + 1):
+            for _dummy in range(1, num_head_lines + 1):
                 docbody[page_break_posns[i] + 1:page_break_posns[i] + 2] = []
         else:
@@ -415,7 +410,7 @@
             docbody[page_break_posns[i]:page_break_posns[i] + 1] = []
         # Chop footers (unless this is the first page break)
         if i != len(page_break_posns) - 1:
-            for dummy in range(1, num_foot_lines + 1):
+            for _dummy in range(1, num_foot_lines + 1):
                 docbody[page_break_posns[i] - num_foot_lines:page_break_posns[i] - num_foot_lines + 1] = []
@@ -429,7 +424,7 @@ def check_boundary_lines_similar(l_1, l_2):
     @return: (int) 1/0.
     """
     num_matches = 0
-    if (type(l_1) != list) or (type(l_2) != list) or (len(l_1) != len(l_2)):
+    if not isinstance(l_1, list) or not isinstance(l_2, list) or len(l_1) != len(l_2):
         # these 'boundaries' are not similar
         return 0
diff --git a/refextract/references/api.py b/refextract/references/api.py
index 3f0fcce..ffaaec2 100644
--- a/refextract/references/api.py
+++ b/refextract/references/api.py
@@ -29,25 +29,29 @@
 """
 import os
-import requests
-import magic
-
 from tempfile import mkstemp
+import magic
+import requests
 from inspire_utils.dedupers import dedupe_list
-from .engine import (
+from refextract.references.engine import (
     get_kbs,
     get_plaintext_document_body,
     parse_reference_line,
     parse_references,
 )
-from .errors import FullTextNotAvailableError
-from .find import (find_numeration_in_body,
-                   get_reference_section_beginning)
-from .pdf import extract_texkeys_and_urls_from_pdf
-from .text import extract_references_from_fulltext, rebuild_reference_lines
-from .record import update_reference_with_urls
+from refextract.references.errors import FullTextNotAvailableError
+from refextract.references.find import (
+    find_numeration_in_body,
+    get_reference_section_beginning,
+)
+from refextract.references.pdf import extract_texkeys_and_urls_from_pdf
+from refextract.references.record import update_reference_with_urls
+from refextract.references.text import (
+    extract_references_from_fulltext,
+    rebuild_reference_lines,
+)

 def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
@@ -146,7 +150,7 @@ def extract_references_from_file(path,
     extracted_texkeys_urls = extract_texkeys_and_urls_from_pdf(path)
     if len(extracted_texkeys_urls) == len(parsed_refs):
         parsed_refs_updated = []
-        for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls):
+        for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls, strict=False):
             update_reference_with_urls(ref, ref_texkey_urls.get('urls', []))
             if ref.get('url'):
                 ref['url'] = dedupe_list(ref['url'])
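A note on the zip(..., strict=False) added here (and in tests/test_kbs.py below): strict=False is the default truncating behavior, so the flag only spells out intent for ruff's zip-without-strict check (B905); the len() comparison above already guarantees equal lengths. The keyword requires Python >= 3.10, while the Dockerfile still builds from python:3.8, where it raises TypeError. A minimal sketch of the semantics, assuming CPython >= 3.10:

    refs = [{"texkey": "a"}, {"texkey": "b"}]
    urls = [["http://example.org/only-one"]]

    # strict=False (default behavior): silently truncates to the shorter input.
    assert len(list(zip(refs, urls, strict=False))) == 1

    # strict=True: raises instead of silently dropping the unmatched reference.
    try:
        list(zip(refs, urls, strict=True))
    except ValueError as exc:
        print(exc)  # "zip() argument 2 is shorter than argument 1"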
diff --git a/refextract/references/engine.py b/refextract/references/engine.py
index 9626b5b..ed1cdb3 100644
--- a/refextract/references/engine.py
+++ b/refextract/references/engine.py
@@ -26,49 +26,47 @@
 import logging
 import mmap
 import re
-
 from datetime import datetime
 import magic
-from .config import (
-    CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM,
+from refextract.documents.pdf import convert_PDF_to_plaintext
+from refextract.references.config import (
     CFG_REFEXTRACT_MARKER_CLOSING_ARXIV,
+    CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL,
     CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL,
     CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND,
-    CFG_REFEXTRACT_MARKER_CLOSING_VOLUME,
-    CFG_REFEXTRACT_MARKER_CLOSING_YEAR,
     CFG_REFEXTRACT_MARKER_CLOSING_PAGE,
-    CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID,
-    CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL,
-    CFG_REFEXTRACT_MARKER_CLOSING_TITLE,
+    CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM,
     CFG_REFEXTRACT_MARKER_CLOSING_SERIES,
+    CFG_REFEXTRACT_MARKER_CLOSING_TITLE,
+    CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID,
+    CFG_REFEXTRACT_MARKER_CLOSING_VOLUME,
+    CFG_REFEXTRACT_MARKER_CLOSING_YEAR,
 )
-
-from .errors import UnknownDocumentTypeError
-
-from .tag import (
-    tag_reference_line,
-    sum_2_dictionaries,
-    identify_and_tag_DOI,
-    identify_and_tag_URLs,
-    find_numeration,
-    extract_series_from_volume
-)
-from .text import wash_and_repair_reference_line
-from .record import build_references
-from ..documents.pdf import convert_PDF_to_plaintext
-from .kbs import get_kbs
-from .regexs import (
+from refextract.references.errors import UnknownDocumentTypeError
+from refextract.references.kbs import get_kbs
+from refextract.references.record import build_references
+from refextract.references.regexs import (
     get_reference_line_numeration_marker_patterns,
-    regex_match_list,
-    re_tagged_citation,
+    re_hdl,
     re_numeration_no_ibid_txt,
-    re_roman_numbers,
     re_recognised_numeration_for_title_plus_series,
-    remove_year,
+    re_roman_numbers,
+    re_tagged_citation,
     re_year_in_misc_txt,
-    re_hdl)
+    regex_match_list,
+    remove_year,
+)
+from refextract.references.tag import (
+    extract_series_from_volume,
+    find_numeration,
+    identify_and_tag_DOI,
+    identify_and_tag_URLs,
+    sum_2_dictionaries,
+    tag_reference_line,
+)
+from refextract.references.text import wash_and_repair_reference_line

 LOGGER = logging.getLogger(__name__)
@@ -307,10 +305,7 @@ def postpone_last_auth(current_citation, num_auth):
     if num_auth == 0:
         return None
-    if num_auth == 1:
-        func = current_citation.__getitem__
-    else:
-        func = current_citation.pop
+    func = current_citation.__getitem__ if num_auth == 1 else current_citation.pop
     for idx, el in enumerate(reversed(current_citation), 1):
         if el["type"] == "AUTH":
@@ -375,10 +370,7 @@ def split_citations_iter(citation_elements):
 def valid_citation(citation):
     els_to_remove = ('MISC', )
-    for el in citation:
-        if el['type'] not in els_to_remove:
-            return True
-    return False
+    return any(el['type'] not in els_to_remove for el in citation)

 def remove_invalid_references(splitted_citations):
@@ -429,11 +421,10 @@ def add_misc(el, txt):
     previous_citation_valid = True
     for citation in splitted_citations:
         current_citation_valid = valid_citation(citation)
-        if not current_citation_valid:
+        if not current_citation_valid and not previous_citation_valid:
             # Merge to previous one misc txt
-            if not previous_citation_valid and not current_citation_valid:
-                for el in citation:
-                    add_misc(previous_citation[-1], el['misc_txt'])
+            for el in citation:
+                add_misc(previous_citation[-1], el['misc_txt'])
         previous_citation = citation
         previous_citation_valid = current_citation_valid
@@ -475,10 +466,7 @@ def add_year_elements(splitted_citations):
 def look_for_implied_ibids(splitted_citations):
     def look_for_journal(els):
-        for el in els:
-            if el['type'] == 'JOURNAL':
-                return True
-        return False
+        return any(el['type'] == 'JOURNAL' for el in els)
     current_journal = None
     for citation in splitted_citations:
@@ -616,13 +604,15 @@ def print_citations(splitted_citations, line_marker):
             LOGGER.debug('%s %s', el['type'], repr(el))

-def parse_reference_line(ref_line, kbs, bad_titles_count={}, linker_callback=None):
+def parse_reference_line(ref_line, kbs, bad_titles_count=None, linker_callback=None):
     """Parse one reference line
     @input a string representing a single reference bullet
     @output parsed references (a list of elements objects)
     """
     # Strip the 'marker' (e.g. [1]) from this reference line:
+    if bad_titles_count is None:
+        bad_titles_count = {}
     line_marker, ref_line = remove_reference_line_marker(ref_line)
     # Find DOI sections in citation
     ref_line, identified_dois = identify_and_tag_DOI(ref_line)
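The bad_titles_count change above fixes Python's mutable-default-argument pitfall: a default {} is created once, at function definition time, and then shared by every call that omits the argument, so counts would leak between unrelated documents. A minimal repro, with a hypothetical counter (not refextract API) standing in for parse_reference_line:

    def count_title(title, seen={}):  # buggy: one dict shared across calls
        seen[title] = seen.get(title, 0) + 1
        return seen[title]

    assert count_title("Phys.Rev.") == 1
    assert count_title("Phys.Rev.") == 2  # state leaked from the first call

    def count_title_fixed(title, seen=None):  # fixed: fresh dict per call
        if seen is None:
            seen = {}
        seen[title] = seen.get(title, 0) + 1
        return seen[title]

    assert count_title_fixed("Phys.Rev.") == 1
    assert count_title_fixed("Phys.Rev.") == 1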
@@ -801,7 +792,7 @@ def cut_substring_with_special_chars(s, sub, startIndex):
         if subPosition >= len(clean_sub):
             # include everything till a space, open bracket or a normal
             # character
-            counter += len(re.split('[ [{(a-zA-Z0-9]', s[startIndex + counter:], 1)[0])
+            counter += len(re.split('[ [{(a-zA-Z0-9]', s[startIndex + counter:], maxsplit=1)[0])
     return s[0:startIndex].strip() + ' ' + s[startIndex + counter:].strip()
@@ -810,10 +801,7 @@ def is_unknown_citation(citation):
     """Checks if the citation got recognized as one of the known types.
     """
""" knownTypes = ['BOOK', 'JOURNAL', 'DOI', 'ISBN', 'RECID'] - for citation_element in citation: - if citation_element['type'] in knownTypes: - return False - return True + return all(citation_element['type'] not in knownTypes for citation_element in citation) def parse_references_elements(ref_sect, kbs, linker_callback=None): diff --git a/refextract/references/find.py b/refextract/references/find.py index a88005e..f8c8a13 100644 --- a/refextract/references/find.py +++ b/refextract/references/find.py @@ -23,19 +23,21 @@ """Finding the reference section from the fulltext""" +import contextlib import logging import re -from .regexs import \ - get_reference_section_title_patterns, \ - get_reference_line_numeration_marker_patterns, \ - regex_match_list, \ - get_post_reference_section_title_patterns, \ - get_post_reference_section_keyword_patterns, \ - re_reference_line_bracket_markers, \ - re_reference_line_dot_markers, \ - re_reference_line_number_markers, \ - re_num +from refextract.references.regexs import ( + get_post_reference_section_keyword_patterns, + get_post_reference_section_title_patterns, + get_reference_line_numeration_marker_patterns, + get_reference_section_title_patterns, + re_num, + re_reference_line_bracket_markers, + re_reference_line_dot_markers, + re_reference_line_number_markers, + regex_match_list, +) LOGGER = logging.getLogger(__name__) @@ -393,11 +395,9 @@ def find_end_of_reference_section(docbody, # save the reference count num_match = regex_match_list(docbody[x].strip(), mk_patterns) if num_match: - try: + with contextlib.suppress(ValueError, IndexError): current_reference_count = int(num_match.group('marknum')) - except (ValueError, IndexError): - # non numerical references marking - pass + # look for a likely section title that would follow a reference # section: end_match = regex_match_list(docbody[x].strip(), t_patterns) diff --git a/refextract/references/kbs.py b/refextract/references/kbs.py index a1d8184..5a77dbf 100644 --- a/refextract/references/kbs.py +++ b/refextract/references/kbs.py @@ -21,21 +21,22 @@ # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. 
-import re -import six -import csv import codecs import contextlib +import csv +import re -from .config import CFG_REFEXTRACT_KBS -from .regexs import ( - re_kb_line, - re_regexp_character_class, - re_extract_quoted_text, +import six + +from refextract.documents.text import re_group_captured_multiple_space +from refextract.references.config import CFG_REFEXTRACT_KBS +from refextract.references.regexs import ( re_extract_char_class, + re_extract_quoted_text, + re_kb_line, re_punctuation, + re_regexp_character_class, ) -from ..documents.text import re_group_captured_multiple_space @contextlib.contextmanager @@ -91,12 +92,9 @@ def load_kb_by_type(kb_type, kb): def load_kb(path, builder): if isinstance(path, dict): return load_kb_from_iterable(path.items(), builder) - try: - path.startswith - except AttributeError: - return load_kb_from_iterable(path, builder) - else: + elif hasattr(path, 'startswith'): return load_kb_from_file(path, builder) + return load_kb_from_iterable(path, builder) def order_reportnum_patterns_bylen(numeration_patterns): @@ -342,25 +340,18 @@ def _add_institute_preprint_patterns(preprint_classifications, if m_preprint_classification: # This KB line contains a preprint classification for # the current institute - try: + with contextlib.suppress(AttributeError, NameError): current_institute_preprint_classifications.append((m_preprint_classification.group(1), m_preprint_classification.group(2))) - except (AttributeError, NameError): - # didn't match this line correctly - skip it - pass - # move on to the next line continue m_numeration_pattern = re_numeration_pattern.search(rawline) if m_numeration_pattern: # This KB line contains a preprint item numeration pattern # for the current institute - try: + with contextlib.suppress(AttributeError, NameError): current_institute_numerations.append( m_numeration_pattern.group(1)) - except (AttributeError, NameError): - # didn't match the numeration pattern correctly - skip it - pass continue _add_institute_preprint_patterns(current_institute_preprint_classifications, @@ -538,7 +529,7 @@ def build_journals_kb(knowledgebase): # Now, for every 'replacement term' found in the KB, if it is # not already in the KB as a "search term", add it: - for repl_term in repl_terms.keys(): + for repl_term in repl_terms: raw_repl_phrase = repl_term.upper() raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase) raw_repl_phrase = \ diff --git a/refextract/references/pdf.py b/refextract/references/pdf.py index a30992c..a3304f4 100644 --- a/refextract/references/pdf.py +++ b/refextract/references/pdf.py @@ -26,7 +26,7 @@ from PyPDF2 import PdfFileReader from PyPDF2.generic import ByteStringObject -from .regexs import re_reference_in_dest +from refextract.references.regexs import re_reference_in_dest LOGGER = logging.getLogger(__name__) @@ -173,10 +173,7 @@ def _match_urls_with_reference( if not two_column_layout or (two_column_layout and url_col == ref_column): urls_for_reference.add(url[0]) continue - elif is_last_reference_in_page or is_last_reference_in_page_two_col_layout: - urls_for_reference.add(url[0]) - continue - elif is_in_new_column: + elif is_last_reference_in_page or is_last_reference_in_page_two_col_layout or is_in_new_column: urls_for_reference.add(url[0]) continue elif is_url_unrelated_to_references: diff --git a/refextract/references/regexs.py b/refextract/references/regexs.py index d7e10de..330fbc6 100644 --- a/refextract/references/regexs.py +++ b/refextract/references/regexs.py @@ -22,7 +22,6 @@ # or submit itself to any 
jurisdiction. import re - from datetime import datetime # Sep @@ -193,7 +192,6 @@ def compute_pos_patterns(patterns): r"quan-ph": "quant-ph", r"nlin-cd": "nlin.cd", r"math-sp": "math.sp", - r"atro-ph": "astro-ph", r"ast-ph": "astro-ph", r"asyro-ph": "astro-ph", r"aastro-ph": "astro-ph", @@ -927,10 +925,7 @@ def regex_match_list(line, patterns): def remove_year(s, year=None): - if year: - year_pattern = re.escape(year) - else: - year_pattern = r"(?:19|20)\d{2}" + year_pattern = re.escape(year) if year else "(?:19|20)\\d{2}" s = re.sub(r'\[\s*%s\s*\]' % year_pattern, '', s) s = re.sub(r'\(\s*%s\s*\)' % year_pattern, '', s) s = re.sub(r'\s*%s\s*' % year_pattern, '', s) diff --git a/refextract/references/tag.py b/refextract/references/tag.py index b0156d2..7fba15b 100644 --- a/refextract/references/tag.py +++ b/refextract/references/tag.py @@ -22,66 +22,67 @@ # or submit itself to any jurisdiction. import re - from urllib.parse import unquote from unidecode import unidecode -from .config import \ - CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL, \ - CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL, \ - CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \ - CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID, \ - CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID, \ - CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION, \ - CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION - -from ..documents.text import remove_and_record_multiple_spaces_in_line - -from .regexs import \ - re_ibid, \ - re_doi, \ - re_raw_url, \ - re_series_from_numeration, \ - re_punctuation, \ - re_correct_numeration_2nd_try_ptn1, \ - re_correct_numeration_2nd_try_ptn2, \ - re_correct_numeration_2nd_try_ptn3, \ - re_correct_numeration_2nd_try_ptn4, \ - re_numeration_nucphys_vol_page_yr, \ - re_numeration_vol_subvol_nucphys_yr_page, \ - re_numeration_nucphys_vol_yr_page, \ - re_multiple_hyphens, \ - re_numeration_vol_page_yr, \ - re_numeration_vol_yr_page, \ - re_numeration_vol_nucphys_series_yr_page, \ - re_numeration_vol_series_nucphys_page_yr, \ - re_numeration_vol_nucphys_series_page_yr, \ - re_html_tagged_url, \ - re_numeration_yr_vol_page, \ - re_numeration_vol_nucphys_page_yr, \ - re_wash_volume_tag, \ - re_numeration_vol_nucphys_yr_subvol_page, \ - re_quoted, \ - re_isbn, \ - re_arxiv, \ - re_arxiv_5digits, \ - re_new_arxiv, \ - re_new_arxiv_5digits, \ - re_pos, \ - re_pos_year_num, \ - re_series_from_numeration_after_volume, \ - RE_OLD_ARXIV, \ - RE_ARXIV_CATCHUP, \ - RE_ATLAS_CONF_PRE_2010, \ - RE_ATLAS_CONF_POST_2010 - -from ..authors.regexs import ( - get_author_regexps, +from refextract.authors.regexs import ( etal_matches, + get_author_regexps, re_ed_notation, - re_etal) -from ..documents.text import wash_line + re_etal, +) +from refextract.documents.text import ( + remove_and_record_multiple_spaces_in_line, + wash_line, +) +from refextract.references.config import ( + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL, + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL, + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, + CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION, + CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID, + CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION, + CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID, +) +from refextract.references.regexs import ( + RE_ARXIV_CATCHUP, + RE_ATLAS_CONF_POST_2010, + RE_ATLAS_CONF_PRE_2010, + RE_OLD_ARXIV, + re_arxiv, + re_arxiv_5digits, + re_correct_numeration_2nd_try_ptn1, + re_correct_numeration_2nd_try_ptn2, + re_correct_numeration_2nd_try_ptn3, + re_correct_numeration_2nd_try_ptn4, + re_doi, + re_html_tagged_url, + re_ibid, + re_isbn, + re_multiple_hyphens, + 
re_new_arxiv, + re_new_arxiv_5digits, + re_numeration_nucphys_vol_page_yr, + re_numeration_nucphys_vol_yr_page, + re_numeration_vol_nucphys_page_yr, + re_numeration_vol_nucphys_series_page_yr, + re_numeration_vol_nucphys_series_yr_page, + re_numeration_vol_nucphys_yr_subvol_page, + re_numeration_vol_page_yr, + re_numeration_vol_series_nucphys_page_yr, + re_numeration_vol_subvol_nucphys_yr_page, + re_numeration_vol_yr_page, + re_numeration_yr_vol_page, + re_pos, + re_pos_year_num, + re_punctuation, + re_quoted, + re_raw_url, + re_series_from_numeration, + re_series_from_numeration_after_volume, + re_wash_volume_tag, +) def tag_reference_line(line, kbs, record_titles_count): @@ -439,7 +440,7 @@ def tag_atlas_conf(line): def identifiy_journals_re(line, kb_journals): matches = {} - for pattern, dummy_journal in kb_journals: + for pattern, _dummy_journal in kb_journals: match = re.search(pattern, line) if match: matches[match.start()] = match.group(0) @@ -558,10 +559,7 @@ def extract_series_from_volume(volume): def create_numeration_tag(info): - if info['series']: - series_and_volume = info['series'] + info['volume'] - else: - series_and_volume = info['volume'] + series_and_volume = info['series'] + info['volume'] if info['series'] else info['volume'] numeration_tags = u' %s' % series_and_volume if info.get('year', False): numeration_tags += u' (%(year)s)' % info @@ -861,7 +859,7 @@ def identify_and_tag_collaborations(line, collaborations_kb): which won't influence the reference splitting heuristics (used when looking at mulitple tags in a line). """ - for dummy_collab, re_collab in collaborations_kb.items(): + for _dummy_collab, re_collab in collaborations_kb.items(): matches = re_collab.finditer(strip_tags(line)) for match in reversed(list(matches)): @@ -967,16 +965,15 @@ def identify_and_tag_authors(line, authors_kb): add_to_misc = "" # If a semi-colon was found at the end of this author group, keep it in misc # so that it can be looked at for splitting heurisitics - if len(output_line) > m['end']: - if output_line[m['end']].strip(" ,.") == ';': - add_to_misc = ';' + if len(output_line) > m['end'] and output_line[m['end']].strip(" ,.") == ';': + add_to_misc = ';' # Standardize eds. notation tmp_output_line = re.sub(re_ed_notation, '(ed.)', - output_line[start:end], re.IGNORECASE) + output_line[start:end], flags=re.IGNORECASE) # Standardize et al. notation tmp_output_line = re.sub(re_etal, 'et al.', - tmp_output_line, re.IGNORECASE) + tmp_output_line, flags=re.IGNORECASE) # Strip tmp_output_line = tmp_output_line.lstrip('.').strip(",:;- [](") if not tmp_output_line.endswith('(ed.)'): @@ -1006,7 +1003,7 @@ def identify_and_tag_authors(line, authors_kb): ed_notation = " (eds.)" # Standardize et al. 
notation tmp_output_line = re.sub(re_etal, 'et al.', - m['author_names'], re.IGNORECASE) + m['author_names'], flags=re.IGNORECASE) # remove any characters which denote this author group # to be editors, just take the # author names, and append '(ed.)' @@ -1032,7 +1029,7 @@ def sum_2_dictionaries(dicta, dictb): @return: (dictionary) - the sum of the 2 dictionaries """ dict_out = dicta.copy() - for key in dictb.keys(): + for key in dictb: if 'key' in dict_out: # Add the sum for key in dictb to that of dict_out: dict_out[key] += dictb[key] diff --git a/refextract/references/text.py b/refextract/references/text.py index 18c6f56..4cfc643 100644 --- a/refextract/references/text.py +++ b/refextract/references/text.py @@ -25,15 +25,18 @@ import re from inspire_utils.record import replace_undesirable_characters -from ..documents.text import ( + +from refextract.documents.text import ( join_lines, - repair_broken_urls, re_multiple_space, - remove_page_boundary_lines + remove_page_boundary_lines, + repair_broken_urls, +) +from refextract.references.config import CFG_REFEXTRACT_MAX_LINES +from refextract.references.find import ( + find_end_of_reference_section, + get_reference_section_beginning, ) - -from .config import CFG_REFEXTRACT_MAX_LINES -from .find import find_end_of_reference_section, get_reference_section_beginning LOGGER = logging.getLogger(__name__) diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..d347e32 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,28 @@ +target-version = "py311" +[lint.flake8-tidy-imports] +ban-relative-imports = "all" + +[lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", + # flake8-tidy-imports + "TID", + # flake8-pytest-style + "PT", +] +ignore = ["B904","E501"] + +[lint.pycodestyle] +ignore-overlong-task-comments = true + +[lint.pydocstyle] +convention = "google" diff --git a/setup.py b/setup.py index a57faca..a75a1e0 100644 --- a/setup.py +++ b/setup.py @@ -25,10 +25,10 @@ from setuptools import find_packages, setup - url = 'https://github.com/inspirehep/refextract' -readme = open('README.rst').read() +with open('README.rst') as file: + readme = file.read() install_requires = [ 'PyPDF2~=1.0,>=1.26.0', @@ -57,7 +57,7 @@ } extras_require['all'] = [] -for name, reqs in extras_require.items(): +for _name, reqs in extras_require.items(): extras_require['all'].extend(reqs) packages = find_packages() diff --git a/tests/conftest.py b/tests/conftest.py index 0860cab..cd19010 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,7 +26,7 @@ import pytest -@pytest.fixture +@pytest.fixture() def pdf_files(): path_to_pdfs = os.path.join(os.path.dirname(__file__), 'data') pdfs = os.listdir(path_to_pdfs) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 882022f..fc6bde9 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -6,7 +6,7 @@ @pytest.fixture(autouse=True, scope="session") def app(): app = create_app() - yield app + return app @pytest.fixture() diff --git a/tests/integration/test_views.py b/tests/integration/test_views.py index 562cd60..e382ffe 100644 --- a/tests/integration/test_views.py +++ b/tests/integration/test_views.py @@ -75,7 +75,7 @@ def test_extract_journal_info_when_timeout_from_refextract( data=json.dumps(payload), ) assert response.status_code == 500 - assert {'message': "Can not extract publication info data. 
Reason: 'test message'"} == response.json + assert response.json == {'message': "Can not extract publication info data. Reason: 'test message'"} def test_extract_journal_info_for_multiple_pubinfos(app_client): @@ -141,7 +141,8 @@ def test_extract_extract_references_from_text(app_client): assert "year" in response.json["extracted_references"][0] -@mock.patch("refextract.app.extract_references_from_string", side_effect=KeyError("test message")) +@mock.patch("refextract.app.extract_references_from_string", + side_effect=KeyError("test message")) def test_extract_references_from_text_when_timeout_from_refextract( mock_extract_refs, app_client ): @@ -159,7 +160,7 @@ def test_extract_references_from_text_when_timeout_from_refextract( "/extract_references_from_text", headers=headers, data=json.dumps(payload) ) assert response.status_code == 500 - assert {'message': "Can not extract references. Reason: 'test message'"} == response.json + assert response.json == {'message': "Can not extract references. Reason: 'test message'"} @pytest.mark.vcr() diff --git a/tests/test_api.py b/tests/test_api.py index 5f5c17e..4948ac2 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -26,22 +26,22 @@ from refextract.references.api import ( extract_journal_reference, + extract_references_from_file, extract_references_from_string, extract_references_from_url, - extract_references_from_file, ) - from refextract.references.errors import FullTextNotAvailableError -@pytest.fixture +@pytest.fixture() def kbs_override(): return { "books": [ ('Griffiths, David', 'Introduction to elementary particles', '2008') ], "journals": [ - ("PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS", "Phys.Rev.ST Accel.Beams"), + ("PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS", + "Phys.Rev.ST Accel.Beams"), ("PHYS REV D", "Phys.Rev.;D"), ("PHYS REV", "Phys.Rev."), ("PHYS REV LETT", "Phys.Rev.Lett."), @@ -148,15 +148,15 @@ def test_extract_references_from_url(pdf_files): assert len(r) == 36 assert 'url' in r[0] + url = "http://www.example.com" + responses.add( + responses.GET, + url, + body="File not found!", + status=404, + content_type='text/plain', + ) with pytest.raises(FullTextNotAvailableError): - url = "http://www.example.com" - responses.add( - responses.GET, - url, - body="File not found!", - status=404, - content_type='text/plain', - ) extract_references_from_url(url) diff --git a/tests/test_engine.py b/tests/test_engine.py index 0eeb5d4..4486462 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -27,7 +27,6 @@ get_plaintext_document_body, parse_references, ) - from refextract.references.errors import UnknownDocumentTypeError @@ -329,10 +328,10 @@ def test_get_plaintext_document_body(tmpdir): f.write("".join(input)) assert input == get_plaintext_document_body(str(f)) + html = "Some page" + f = tmpdir.join("page.html") + f.write(html) with pytest.raises(UnknownDocumentTypeError) as excinfo: - html = "Some page" - f = tmpdir.join("page.html") - f.write(html) get_plaintext_document_body(str(f)) assert 'text/html' in excinfo.value.args diff --git a/tests/test_kbs.py b/tests/test_kbs.py index 7d9c573..c3968a5 100644 --- a/tests/test_kbs.py +++ b/tests/test_kbs.py @@ -35,14 +35,14 @@ def test_get_kbs_caches_journal_dict(): first_cache = get_kbs(custom_kbs={"journals": journals}).copy() assert len(first_cache["journals"]) == 3 - assert ["JOURNAL OF TESTING", "J TESTING"] == first_cache["journals"][-1] + assert first_cache["journals"][-1] == ["JOURNAL OF TESTING", "J TESTING"] journals = journals.copy() 
second_cache = get_kbs(custom_kbs={"journals": journals}) # the cache is reused, so identity of the cache elements doesn't change assert all( cached_first is cached_second for (cached_first, cached_second) - in zip(first_cache["journals"], second_cache["journals"]) + in zip(first_cache["journals"], second_cache["journals"], strict=False) ) @@ -55,7 +55,7 @@ def test_get_kbs_invalidates_cache_if_input_changes(): # the cache is invalidated, so identity of the cache elements changes assert all( cached_first is not cached_second for (cached_first, cached_second) - in zip(first_cache["journals"], second_cache["journals"]) + in zip(first_cache["journals"], second_cache["journals"], strict=False) ) assert len(second_cache["journals"]) == 3 - assert ["JOURNAL OF TESTING", "J TEST"] == second_cache["journals"][-1] + assert second_cache["journals"][-1] == ["JOURNAL OF TESTING", "J TEST"] diff --git a/tests/test_pdf.py b/tests/test_pdf.py index 61f6b01..3ce6923 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -84,7 +84,6 @@ def test_extract_texkeys_and_urls_from_pdf(pdf_files): "urls": { "http://dx.doi.org/10.1016/j.physletb.2013.08.026", "http://www.arXiv.org/abs/1307.1432", - "http://www.arXiv.org/abs/1307.1432", }, }, {"texkey": "CMS_combination", "urls": {"http://www.arXiv.org/abs/1412.8662"}}, @@ -111,7 +110,6 @@ def test_extract_texkeys_and_urls_from_pdf(pdf_files): "urls": { "http://dx.doi.org/10.1103/PhysRevD.89.092007", "http://www.arXiv.org/abs/1312.5353", - "http://www.arXiv.org/abs/1312.5353", }, }, { diff --git a/tests/test_tag.py b/tests/test_tag.py index e52ca38..ab14c8e 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -22,10 +22,10 @@ # or submit itself to any jurisdiction. from refextract.references.tag import ( - tag_arxiv, - identify_ibids, find_numeration, find_numeration_more, + identify_ibids, + tag_arxiv, )
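Postscript: the scratch notebook (Untitled.ipynb) probes the exact semantics that find_substring_ignore_special_chars in engine.py relies on — after a for loop ends, whether by break or by exhaustion, the variable bound by enumerate() keeps its last value, so returning real_index after the loop is safe. This is why the loop keeps its enumerate() form rather than a hand-maintained counter. A minimal repro, assuming only standard CPython behavior:

    s = "ab-cd"
    i = 0
    real_index = 0
    for real_index, char in enumerate(s):
        if char.isalnum():
            i += 1
        if i > 2:  # stop once the third alphanumeric character is seen
            break
    assert real_index == 3  # index of 'c'; the '-' at index 2 did not count

    for real_index, char in enumerate("abc"):
        pass
    assert real_index == 2  # after exhaustion the last index (len - 1) sticks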