precommit: add precommit and ruff

DonHaul committed Oct 8, 2024
1 parent ef676a2 commit 0ff1464
Showing 26 changed files with 718 additions and 510 deletions.
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,17 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: fix-byte-order-marker
      - id: mixed-line-ending
      - id: name-tests-test
        args: [ --pytest-test-first ]
        exclude: '^(?!factories/)'
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.6.9
    hooks:
      - id: ruff
        args: [ --fix]
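
Once this config is committed, the hooks run automatically on every commit after a one-time local setup (standard pre-commit CLI usage, not part of this diff):

    pip install pre-commit
    pre-commit install
    pre-commit run --all-files

Note the exclude pattern on name-tests-test: the negative lookahead '^(?!factories/)' excludes every path that does not start with factories/, so the hook effectively checks test naming only under factories/.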
2 changes: 1 addition & 1 deletion Dockerfile
@@ -2,6 +2,6 @@ FROM python:3.8
RUN apt update && apt install poppler-utils -y
COPY setup.py setup.cfg README.rst ./
COPY refextract refextract/
-RUN python setup.py install
+RUN python setup.py install
ENV PROMETHEUS_MULTIPROC_DIR='/tmp'
ENTRYPOINT exec gunicorn -b :5000 --access-logfile - --error-logfile - refextract.app:app --timeout 650
2 changes: 1 addition & 1 deletion refextract/__init__.py
@@ -23,7 +23,7 @@

"""Refextract."""

-from .references.api import (
+from refextract.references.api import (
     extract_journal_reference,
     extract_references_from_file,
     extract_references_from_string,
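
The relative-to-absolute import rewrites throughout this commit match ruff's flake8-tidy-imports rule TID252, and the re-sorted import blocks below match its isort rules (I). The repository's actual ruff configuration is not shown in this diff; a hypothetical pyproject.toml stanza that would enforce both looks like:

    [tool.ruff.lint]
    extend-select = ["I", "TID252"]

    [tool.ruff.lint.flake8-tidy-imports]
    ban-relative-imports = "all"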
14 changes: 8 additions & 6 deletions refextract/app.py
@@ -1,14 +1,15 @@
import logging

from flask import Flask, jsonify, make_response
-from prometheus_flask_exporter.multiprocess import \
-    GunicornInternalPrometheusMetrics
+from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics
from webargs import fields
from webargs.flaskparser import FlaskParser

-from refextract.references.api import (extract_journal_reference,
-                                       extract_references_from_string,
-                                       extract_references_from_url)
+from refextract.references.api import (
+    extract_journal_reference,
+    extract_references_from_string,
+    extract_references_from_url,
+)

parser = FlaskParser()

@@ -46,7 +47,8 @@ def extract_journal_info(args):
        return make_response(
            jsonify(
                {
-                    "message": f"Can not extract publication info data. Reason: {str(e)}"
+                    "message":
+                    f"Can not extract publication info data. Reason: {str(e)}"
                }
            ),
            500,
223 changes: 124 additions & 99 deletions refextract/authors/regexs.py

Large diffs are not rendered by default.

7 changes: 2 additions & 5 deletions refextract/documents/pdf.py
@@ -39,7 +39,7 @@
import re
import subprocess

-from ..references.config import CFG_PATH_PDFTOTEXT
+from refextract.references.config import CFG_PATH_PDFTOTEXT

LOGGER = logging.getLogger(__name__)

@@ -56,10 +56,7 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
    if not os.path.isfile(CFG_PATH_PDFTOTEXT):
        raise IOError('Missing pdftotext executable')

-    if keep_layout:
-        layout_option = "-layout"
-    else:
-        layout_option = "-raw"
+    layout_option = '-layout' if keep_layout else '-raw'
    doclines = []
    # Pattern to check for lines with a leading page-break character.
    # If this pattern is matched, we want to split the page-break into
17 changes: 6 additions & 11 deletions refextract/documents/text.py
@@ -25,7 +25,6 @@

import re

-
re_space_comma = re.compile(r'\s,', re.UNICODE)
re_space_semicolon = re.compile(r'\s;', re.UNICODE)
re_space_period = re.compile(r'\s\.', re.UNICODE)
@@ -34,7 +33,8 @@
re_space_closing_square_bracket = re.compile(r'\s\]', re.UNICODE)
re_opening_square_bracket_space = re.compile(r'\[\s', re.UNICODE)
re_hyphens = re.compile(
-    br'(\\255|\u02D7|\u0335|\u0336|\u2212|\u002D|\uFE63|\uFF0D)'.decode('raw_unicode_escape'), re.UNICODE)
+    br'(\\255|\u02D7|\u0335|\u0336|\u2212|\u002D|\uFE63|\uFF0D)'
+    .decode('raw_unicode_escape'), re.UNICODE)
re_multiple_space = re.compile(r'\s{2,}', re.UNICODE)

re_group_captured_multiple_space = re.compile(r'(\s{2,})', re.UNICODE)
@@ -264,12 +264,7 @@ def get_number_header_lines(docbody, page_break_posns):
    # pattern to search for a word in a line:
    p_wordSearch = re.compile(r'([A-Za-z0-9-]+)', re.UNICODE)
    if remaining_breaks > 2:
-        if remaining_breaks > 3:
-            # Only check odd page headers
-            next_head = 2
-        else:
-            # Check headers on each page
-            next_head = 1
+        next_head = 2 if remaining_breaks > 3 else 1
        keep_checking = 1
        while keep_checking:
            cur_break = 1
@@ -406,7 +401,7 @@ def strip_headers_footers_pagebreaks(docbody,
    for i in range(0, len(page_break_posns)):
        # Unless this is the last page break, chop headers
        if not first:
-            for dummy in range(1, num_head_lines + 1):
+            for _dummy in range(1, num_head_lines + 1):
                docbody[page_break_posns[i] +
                        1:page_break_posns[i] + 2] = []
        else:
@@ -415,7 +410,7 @@
            docbody[page_break_posns[i]:page_break_posns[i] + 1] = []
        # Chop footers (unless this is the first page break)
        if i != len(page_break_posns) - 1:
-            for dummy in range(1, num_foot_lines + 1):
+            for _dummy in range(1, num_foot_lines + 1):
                docbody[page_break_posns[i] -
                        num_foot_lines:page_break_posns[i] -
                        num_foot_lines + 1] = []
@@ -429,7 +424,7 @@ def check_boundary_lines_similar(l_1, l_2):
    @return: (int) 1/0.
    """
    num_matches = 0
-    if (type(l_1) != list) or (type(l_2) != list) or (len(l_1) != len(l_2)):
+    if not isinstance(l_1, list) or not isinstance(l_2, list) or (len(l_1) != len(l_2)):
        # these 'boundaries' are not similar
        return 0

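
The type(...) != list comparisons rewritten above are what pycodestyle's E721 ("do not compare types") flags, a rule ruff also implements; isinstance is the idiomatic check because it also accepts subclasses. A standalone illustration, not taken from this repository:

    # type() equality is False for subclasses; isinstance() accepts them.
    class TaggedList(list):
        pass

    items = TaggedList([1, 2, 3])
    print(type(items) == list)      # False
    print(isinstance(items, list))  # True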
36 changes: 22 additions & 14 deletions refextract/references/api.py
@@ -29,25 +29,29 @@
"""

import os
import requests
import magic

from tempfile import mkstemp

import magic
import requests
from inspire_utils.dedupers import dedupe_list

from .engine import (
from refextract.references.engine import (
get_kbs,
get_plaintext_document_body,
parse_reference_line,
parse_references,
)
from .errors import FullTextNotAvailableError
from .find import (find_numeration_in_body,
get_reference_section_beginning)
from .pdf import extract_texkeys_and_urls_from_pdf
from .text import extract_references_from_fulltext, rebuild_reference_lines
from .record import update_reference_with_urls
from refextract.references.errors import FullTextNotAvailableError
from refextract.references.find import (
find_numeration_in_body,
get_reference_section_beginning,
)
from refextract.references.pdf import extract_texkeys_and_urls_from_pdf
from refextract.references.record import update_reference_with_urls
from refextract.references.text import (
extract_references_from_fulltext,
rebuild_reference_lines,
)


def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
@@ -71,7 +75,8 @@ def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
    To override KBs for journal names etc., use ``override_kbs_files``:
-    >>> extract_references_from_url(path, override_kbs_files={'journals': 'my/path/to.kb'})
+    >>> extract_references_from_url(path,
+        override_kbs_files={'journals': 'my/path/to.kb'})
    """
    # Get temporary filepath to download to
@@ -122,7 +127,8 @@ def extract_references_from_file(path,
    To override KBs for journal names etc., use ``override_kbs_files``:
-    >>> extract_references_from_file(path, override_kbs_files={'journals': 'my/path/to.kb'})
+    >>> extract_references_from_file(path,
+        override_kbs_files={'journals': 'my/path/to.kb'})
    """
    if not os.path.isfile(path):
@@ -150,7 +156,8 @@
            update_reference_with_urls(ref, ref_texkey_urls.get('urls', []))
            if ref.get('url'):
                ref['url'] = dedupe_list(ref['url'])
-            parsed_refs_updated.append(dict(ref, texkey=[ref_texkey_urls['texkey']]))
+            parsed_refs_updated.append(dict(ref,
+                                            texkey=[ref_texkey_urls['texkey']]))

        return parsed_refs_updated
    return parsed_refs
@@ -182,7 +189,8 @@ def extract_references_from_string(source,
    To override KBs for journal names etc., use ``override_kbs_files``:
-    >>> extract_references_from_string(path, override_kbs_files={'journals': 'my/path/to.kb'})
+    >>> extract_references_from_string(path,
+        override_kbs_files={'journals': 'my/path/to.kb'})
    """
    docbody = source.split('\n')
    if not is_only_references:
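
For orientation after all the import churn, the public API itself is unchanged. A minimal usage sketch (the reference string is invented for illustration, and the exact keys in each returned dict depend on what refextract can parse):

    from refextract import extract_references_from_string

    raw = "[1] J. Maldacena, Adv. Theor. Math. Phys. 2 (1998) 231."
    for reference in extract_references_from_string(raw):
        print(reference)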
