From ce53808a9f12333a43a07998682236ed95f5c828 Mon Sep 17 00:00:00 2001 From: DonHaul Date: Wed, 2 Oct 2024 17:29:40 +0200 Subject: [PATCH] fix test index name --- .pre-commit-config.yaml | 17 +++++ Dockerfile | 2 +- Untitled.ipynb | 106 ++++++++++++++++++++++++++ refextract/__init__.py | 2 +- refextract/app.py | 11 +-- refextract/authors/regexs.py | 49 +++++------- refextract/documents/pdf.py | 7 +- refextract/documents/text.py | 15 ++-- refextract/references/api.py | 26 ++++--- refextract/references/engine.py | 92 ++++++++++------------ refextract/references/find.py | 28 +++---- refextract/references/kbs.py | 39 ++++------ refextract/references/pdf.py | 7 +- refextract/references/regexs.py | 7 +- refextract/references/tag.py | 131 ++++++++++++++++---------------- refextract/references/text.py | 15 ++-- ruff.toml | 28 +++++++ setup.py | 6 +- tests/conftest.py | 2 +- tests/integration/conftest.py | 2 +- tests/integration/test_views.py | 7 +- tests/test_api.py | 24 +++--- tests/test_engine.py | 7 +- tests/test_kbs.py | 8 +- tests/test_pdf.py | 2 - tests/test_tag.py | 4 +- 26 files changed, 376 insertions(+), 268 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 Untitled.ipynb create mode 100644 ruff.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..df97a7d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: fix-byte-order-marker + - id: mixed-line-ending + - id: name-tests-test + args: [ --pytest-test-first ] + exclude: '^(?!factories/)' + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.5.6 + hooks: + - id: ruff + args: [ --fix , --unsafe-fixes] diff --git a/Dockerfile b/Dockerfile index 574e1a9..9cb40f5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,6 @@ FROM python:3.8 RUN apt update && apt install poppler-utils -y COPY setup.py setup.cfg README.rst ./ COPY refextract refextract/ -RUN python setup.py install +RUN python setup.py install ENV PROMETHEUS_MULTIPROC_DIR='/tmp' ENTRYPOINT exec gunicorn -b :5000 --access-logfile - --error-logfile - refextract.app:app --timeout 650 diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..dc7b752 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,106 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "163771b1-17d9-4648-875c-63f1a54c9201", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "1\n", + "2\n", + "3\n", + "4\n", + "5\n", + "6\n", + "6\n" + ] + } + ], + "source": [ + "real_index = 0\n", + "s = \"sdasdas\"\n", + "\n", + "for real_index, char in enumerate(s):\n", + " print(real_index)\n", + "\n", + "print(real_index)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6132dad4-7fce-4719-beea-693eb32eed16", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'asdsad'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"asdsad\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d72da078-f2c3-4879-a1a1-7557688ee727", + "metadata": {}, + "outputs": [], + "source": [ + "path = \"adsad\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4497ac16-b4fd-407a-b567-2b5a67ec5d55", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "wow\n" + ] + } + ], + "source": [ + "if path.startswith:\n", + " print(\"wow\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/refextract/__init__.py b/refextract/__init__.py index 3669b7a..92df279 100644 --- a/refextract/__init__.py +++ b/refextract/__init__.py @@ -23,7 +23,7 @@ """Refextract.""" -from .references.api import ( +from refextract.references.api import ( extract_journal_reference, extract_references_from_file, extract_references_from_string, diff --git a/refextract/app.py b/refextract/app.py index c5e1a1a..6b244e1 100644 --- a/refextract/app.py +++ b/refextract/app.py @@ -1,14 +1,15 @@ import logging from flask import Flask, jsonify, make_response -from prometheus_flask_exporter.multiprocess import \ - GunicornInternalPrometheusMetrics +from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics from webargs import fields from webargs.flaskparser import FlaskParser -from refextract.references.api import (extract_journal_reference, - extract_references_from_string, - extract_references_from_url) +from refextract.references.api import ( + extract_journal_reference, + extract_references_from_string, + extract_references_from_url, +) parser = FlaskParser() diff --git a/refextract/authors/regexs.py b/refextract/authors/regexs.py index b169d12..b60e5e8 100644 --- a/refextract/authors/regexs.py +++ b/refextract/authors/regexs.py @@ -24,7 +24,7 @@ import logging import re -from ..references.config import CFG_REFEXTRACT_KBS +from refextract.references.config import CFG_REFEXTRACT_KBS LOGGER = logging.getLogger(__name__) @@ -42,10 +42,7 @@ def get_author_affiliation_numeration_str(punct=None): re_number = r'(?:\d\d?)' re_chained_numbers = r"(?:(?:[,;]\s*%s\.?\s*))*" % re_number # Punctuation surrounding the number, either general or specific again - if punct is None: - re_punct = r"(?:[\{\(\[]?)" - else: - re_punct = re.escape(punct) + re_punct = '(?:[\\{\\(\\[]?)' if punct is None else re.escape(punct) # Generic number finder (MUST NOT INCLUDE NAMED GROUPS!!!) numeration_str = r""" @@ -86,10 +83,7 @@ def get_initial_surname_author_pattern(incl_numeration=False): @return (string): The 'Initials Surname' author pattern.""" # Possible inclusion of superscript numeration at the end of author names # Will match the empty string - if incl_numeration: - append_num_re = get_author_affiliation_numeration_str() + '?' - else: - append_num_re = "" + append_num_re = get_author_affiliation_numeration_str() + '?' if incl_numeration else '' return r""" (?: @@ -137,10 +131,7 @@ def get_surname_initial_author_pattern(incl_numeration=False): @return (string): The 'Surname Initials' author pattern.""" # Possible inclusion of superscript numeration at the end of author names # Will match the empty string - if incl_numeration: - append_num_re = get_author_affiliation_numeration_str() + '?' - else: - append_num_re = "" + append_num_re = get_author_affiliation_numeration_str() + '?' 
if incl_numeration else ''

    return r"""
    (?:
@@ -410,27 +401,27 @@ def add_to_auth_list(s):
     fpath = CFG_REFEXTRACT_KBS['collaborations']
     try:
-        fh = open(fpath, "r")
+        with open(fpath, 'rb') as fh:
+            for line_num, rawline in enumerate(fh):
+                try:
+                    rawline = rawline.decode("utf-8")
+                except UnicodeError:
+                    LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
+                    raise UnicodeError(
+                        "Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
+                if rawline.strip() and rawline[0].strip() != '#':
+                    add_to_auth_list(rawline)
+                    # Shorten collaboration to 'coll'
+                    if rawline.lower().endswith('collaboration\n'):
+                        coll_version = rawline[:rawline.lower().find(
+                            u'collaboration\n')] + r"coll[\.\,]"
+                        add_to_auth_list(
+                            coll_version.strip().replace(' ', r'\s') + u"s?")
     except IOError:
         # problem opening KB for reading, or problem while reading from it:
         LOGGER.debug(u"Error: Could not build knowledge base containing author patterns - failed to read from KB %s s.\n", fpath)
         raise IOError("Error: Unable to open collaborations kb '%s'" % fpath)
-    for line_num, rawline in enumerate(fh):
-        try:
-            rawline = rawline.decode("utf-8")
-        except UnicodeError:
-            LOGGER.debug(u"Unicode problems in %s for line %d", fpath, line_num)
-            raise UnicodeError(
-                "Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
-        if rawline.strip() and rawline[0].strip() != '#':
-            add_to_auth_list(rawline)
-            # Shorten collaboration to 'coll'
-            if rawline.lower().endswith('collaboration\n'):
-                coll_version = rawline[:rawline.lower().find(
-                    u'collaboration\n')] + r"coll[\.\,]"
-                add_to_auth_list(
-                    coll_version.strip().replace(' ', r'\s') + u"s?")

     author_match_re = ""
     if len(auths) > 0:
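Note on the collaborations KB loop above: on Python 3 a text-mode handle yields str lines, so rawline.decode("utf-8") only works if the file is opened in binary mode ('rb'); on a text-mode handle the call raises AttributeError and the UnicodeError branch that reports the offending line number can never fire. A minimal sketch of the pattern, using io.BytesIO as a stand-in for the KB file:

    import io

    # Two hypothetical KB lines: valid UTF-8, then an undecodable byte sequence.
    kb = io.BytesIO(b"ATLAS Collaboration\n\xff\xfe broken\n")

    for line_num, rawline in enumerate(kb):
        try:
            line = rawline.decode("utf-8")
        except UnicodeError:  # UnicodeDecodeError is a subclass
            print("cannot parse collaboration kb (line: %d)" % line_num)
            continue
        print(line.strip())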
diff --git a/refextract/documents/pdf.py b/refextract/documents/pdf.py
index 038ef32..e193911 100644
--- a/refextract/documents/pdf.py
+++ b/refextract/documents/pdf.py
@@ -39,7 +39,7 @@
 import re
 import subprocess
-from ..references.config import CFG_PATH_PDFTOTEXT
+from refextract.references.config import CFG_PATH_PDFTOTEXT

 LOGGER = logging.getLogger(__name__)
@@ -56,10 +56,7 @@ def convert_PDF_to_plaintext(fpath, keep_layout=False):
     if not os.path.isfile(CFG_PATH_PDFTOTEXT):
         raise IOError('Missing pdftotext executable')
-    if keep_layout:
-        layout_option = "-layout"
-    else:
-        layout_option = "-raw"
+    layout_option = '-layout' if keep_layout else '-raw'
     doclines = []
     # Pattern to check for lines with a leading page-break character.
     # If this pattern is matched, we want to split the page-break into
diff --git a/refextract/documents/text.py b/refextract/documents/text.py
index c8022fe..df120b7 100644
--- a/refextract/documents/text.py
+++ b/refextract/documents/text.py
@@ -25,7 +25,6 @@
 import re
-
 re_space_comma = re.compile(r'\s,', re.UNICODE)
 re_space_semicolon = re.compile(r'\s;', re.UNICODE)
 re_space_period = re.compile(r'\s\.', re.UNICODE)
@@ -264,12 +263,8 @@ def get_number_header_lines(docbody, page_break_posns):
     # pattern to search for a word in a line:
     p_wordSearch = re.compile(r'([A-Za-z0-9-]+)', re.UNICODE)
     if remaining_breaks > 2:
-        if remaining_breaks > 3:
-            # Only check odd page headers
-            next_head = 2
-        else:
-            # Check headers on each page
-            next_head = 1
+        # With more than 3 page breaks, check only odd page headers;
+        # otherwise check the headers on every page
+        next_head = 2 if remaining_breaks > 3 else 1
         keep_checking = 1
         while keep_checking:
             cur_break = 1
@@ -406,7 +401,7 @@ def strip_headers_footers_pagebreaks(docbody,
     for i in range(0, len(page_break_posns)):
         # Unless this is the last page break, chop headers
         if not first:
-            for dummy in range(1, num_head_lines + 1):
+            for _dummy in range(1, num_head_lines + 1):
                 docbody[page_break_posns[i] + 1:page_break_posns[i] + 2] = []
         else:
@@ -415,7 +410,7 @@
             docbody[page_break_posns[i]:page_break_posns[i] + 1] = []
         # Chop footers (unless this is the first page break)
         if i != len(page_break_posns) - 1:
-            for dummy in range(1, num_foot_lines + 1):
+            for _dummy in range(1, num_foot_lines + 1):
                 docbody[page_break_posns[i] - num_foot_lines:page_break_posns[i] - num_foot_lines + 1] = []
@@ -429,7 +424,7 @@ def check_boundary_lines_similar(l_1, l_2):
     @return: (int) 1/0.
     """
     num_matches = 0
-    if (type(l_1) != list) or (type(l_2) != list) or (len(l_1) != len(l_2)):
+    if not isinstance(l_1, list) or not isinstance(l_2, list) or len(l_1) != len(l_2):
         # these 'boundaries' are not similar
         return 0
diff --git a/refextract/references/api.py b/refextract/references/api.py
index 3f0fcce..ffaaec2 100644
--- a/refextract/references/api.py
+++ b/refextract/references/api.py
@@ -29,25 +29,29 @@
 """
 import os
-import requests
-import magic
-
 from tempfile import mkstemp
+import magic
+import requests
 from inspire_utils.dedupers import dedupe_list
-from .engine import (
+from refextract.references.engine import (
     get_kbs,
     get_plaintext_document_body,
     parse_reference_line,
     parse_references,
 )
-from .errors import FullTextNotAvailableError
-from .find import (find_numeration_in_body,
-                   get_reference_section_beginning)
-from .pdf import extract_texkeys_and_urls_from_pdf
-from .text import extract_references_from_fulltext, rebuild_reference_lines
-from .record import update_reference_with_urls
+from refextract.references.errors import FullTextNotAvailableError
+from refextract.references.find import (
+    find_numeration_in_body,
+    get_reference_section_beginning,
+)
+from refextract.references.pdf import extract_texkeys_and_urls_from_pdf
+from refextract.references.record import update_reference_with_urls
+from refextract.references.text import (
+    extract_references_from_fulltext,
+    rebuild_reference_lines,
+)

 def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
@@ -146,7 +150,7 @@ def extract_references_from_file(path,
     extracted_texkeys_urls = extract_texkeys_and_urls_from_pdf(path)
     if len(extracted_texkeys_urls) == len(parsed_refs):
         parsed_refs_updated = []
-        for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls):
+        for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls, strict=False):
             update_reference_with_urls(ref, ref_texkey_urls.get('urls', []))
             if ref.get('url'):
                 ref['url'] = dedupe_list(ref['url'])
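A note on the zip(..., strict=False) added here (and in tests/test_kbs.py below): strict=False is the default truncating behavior, so the flag only spells out intent for ruff's zip-without-strict check (B905); the len() comparison above already guarantees equal lengths. The keyword requires Python >= 3.10, while the Dockerfile still builds from python:3.8, where it raises TypeError. A minimal sketch of the semantics, assuming CPython >= 3.10:

    refs = [{"texkey": "a"}, {"texkey": "b"}]
    urls = [["http://example.org/only-one"]]

    # strict=False (default behavior): silently truncates to the shorter input.
    assert len(list(zip(refs, urls, strict=False))) == 1

    # strict=True: raises instead of silently dropping the unmatched reference.
    try:
        list(zip(refs, urls, strict=True))
    except ValueError as exc:
        print(exc)  # "zip() argument 2 is shorter than argument 1"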
diff --git a/refextract/references/engine.py b/refextract/references/engine.py
index 9626b5b..ed1cdb3 100644
--- a/refextract/references/engine.py
+++ b/refextract/references/engine.py
@@ -26,49 +26,47 @@
 import logging
 import mmap
 import re
-
 from datetime import datetime
 import magic
-from .config import (
-    CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM,
+from refextract.documents.pdf import convert_PDF_to_plaintext
+from refextract.references.config import (
     CFG_REFEXTRACT_MARKER_CLOSING_ARXIV,
+    CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL,
     CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL,
     CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND,
-    CFG_REFEXTRACT_MARKER_CLOSING_VOLUME,
-    CFG_REFEXTRACT_MARKER_CLOSING_YEAR,
     CFG_REFEXTRACT_MARKER_CLOSING_PAGE,
-    CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID,
-    CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL,
-    CFG_REFEXTRACT_MARKER_CLOSING_TITLE,
+    CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM,
     CFG_REFEXTRACT_MARKER_CLOSING_SERIES,
+    CFG_REFEXTRACT_MARKER_CLOSING_TITLE,
+    CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID,
+    CFG_REFEXTRACT_MARKER_CLOSING_VOLUME,
+    CFG_REFEXTRACT_MARKER_CLOSING_YEAR,
 )
-
-from .errors import UnknownDocumentTypeError
-
-from .tag import (
-    tag_reference_line,
-    sum_2_dictionaries,
-    identify_and_tag_DOI,
-    identify_and_tag_URLs,
-    find_numeration,
-    extract_series_from_volume
-)
-from .text import wash_and_repair_reference_line
-from .record import build_references
-from ..documents.pdf import convert_PDF_to_plaintext
-from .kbs import get_kbs
-from .regexs import (
+from refextract.references.errors import UnknownDocumentTypeError
+from refextract.references.kbs import get_kbs
+from refextract.references.record import build_references
+from refextract.references.regexs import (
     get_reference_line_numeration_marker_patterns,
-    regex_match_list,
-    re_tagged_citation,
+    re_hdl,
     re_numeration_no_ibid_txt,
-    re_roman_numbers,
     re_recognised_numeration_for_title_plus_series,
-    remove_year,
+    re_roman_numbers,
+    re_tagged_citation,
     re_year_in_misc_txt,
-    re_hdl)
+    regex_match_list,
+    remove_year,
+)
+from refextract.references.tag import (
+    extract_series_from_volume,
+    find_numeration,
+    identify_and_tag_DOI,
+    identify_and_tag_URLs,
+    sum_2_dictionaries,
+    tag_reference_line,
+)
+from refextract.references.text import wash_and_repair_reference_line

 LOGGER = logging.getLogger(__name__)
@@ -307,10 +305,7 @@ def postpone_last_auth(current_citation, num_auth):
     if num_auth == 0:
         return None
-    if num_auth == 1:
-        func = current_citation.__getitem__
-    else:
-        func = current_citation.pop
+    func = current_citation.__getitem__ if num_auth == 1 else current_citation.pop
     for idx, el in enumerate(reversed(current_citation), 1):
         if el["type"] == "AUTH":
@@ -375,10 +370,7 @@ def split_citations_iter(citation_elements):
 def valid_citation(citation):
     els_to_remove = ('MISC', )
-    for el in citation:
-        if el['type'] not in els_to_remove:
-            return True
-    return False
+    return any(el['type'] not in els_to_remove for el in citation)

 def remove_invalid_references(splitted_citations):
@@ -429,11 +421,10 @@ def add_misc(el, txt):
     previous_citation_valid = True
     for citation in splitted_citations:
         current_citation_valid = valid_citation(citation)
-        if not current_citation_valid:
+        if not current_citation_valid and not previous_citation_valid:
             # Merge to previous one misc txt
-            if not previous_citation_valid and not current_citation_valid:
-                for el in citation:
-                    add_misc(previous_citation[-1], el['misc_txt'])
+            for el in citation:
+                add_misc(previous_citation[-1], el['misc_txt'])
         previous_citation = citation
         previous_citation_valid = current_citation_valid
@@ -475,10 +466,7 @@ def add_year_elements(splitted_citations):
 def look_for_implied_ibids(splitted_citations):
     def look_for_journal(els):
-        for el in els:
-            if el['type'] == 'JOURNAL':
-                return True
-        return False
+        return any(el['type'] == 'JOURNAL' for el in els)
     current_journal = None
     for citation in splitted_citations:
@@ -616,13 +604,15 @@ def print_citations(splitted_citations, line_marker):
             LOGGER.debug('%s %s', el['type'], repr(el))

-def parse_reference_line(ref_line, kbs, bad_titles_count={}, linker_callback=None):
+def parse_reference_line(ref_line, kbs, bad_titles_count=None, linker_callback=None):
     """Parse one reference line
     @input a string representing a single reference bullet
     @output parsed references (a list of elements objects)
     """
     # Strip the 'marker' (e.g. [1]) from this reference line:
+    if bad_titles_count is None:
+        bad_titles_count = {}
     line_marker, ref_line = remove_reference_line_marker(ref_line)
     # Find DOI sections in citation
     ref_line, identified_dois = identify_and_tag_DOI(ref_line)
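The bad_titles_count change above fixes Python's mutable-default-argument pitfall: a default {} is created once, at function definition time, and then shared by every call that omits the argument, so counts would leak between unrelated documents. A minimal repro, with a hypothetical counter (not refextract API) standing in for parse_reference_line:

    def count_title(title, seen={}):  # buggy: one dict shared across calls
        seen[title] = seen.get(title, 0) + 1
        return seen[title]

    assert count_title("Phys.Rev.") == 1
    assert count_title("Phys.Rev.") == 2  # state leaked from the first call

    def count_title_fixed(title, seen=None):  # fixed: fresh dict per call
        if seen is None:
            seen = {}
        seen[title] = seen.get(title, 0) + 1
        return seen[title]

    assert count_title_fixed("Phys.Rev.") == 1
    assert count_title_fixed("Phys.Rev.") == 1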
@@ -801,7 +792,7 @@ def cut_substring_with_special_chars(s, sub, startIndex):
         if subPosition >= len(clean_sub):
             # include everything till a space, open bracket or a normal
             # character
-            counter += len(re.split('[ [{(a-zA-Z0-9]', s[startIndex + counter:], 1)[0])
+            counter += len(re.split('[ [{(a-zA-Z0-9]', s[startIndex + counter:], maxsplit=1)[0])
     return s[0:startIndex].strip() + ' ' + s[startIndex + counter:].strip()
@@ -810,10 +801,7 @@ def is_unknown_citation(citation):
     """Checks if the citation got recognized as one of the known types.
     """
""" knownTypes = ['BOOK', 'JOURNAL', 'DOI', 'ISBN', 'RECID'] - for citation_element in citation: - if citation_element['type'] in knownTypes: - return False - return True + return all(citation_element['type'] not in knownTypes for citation_element in citation) def parse_references_elements(ref_sect, kbs, linker_callback=None): diff --git a/refextract/references/find.py b/refextract/references/find.py index a88005e..f8c8a13 100644 --- a/refextract/references/find.py +++ b/refextract/references/find.py @@ -23,19 +23,21 @@ """Finding the reference section from the fulltext""" +import contextlib import logging import re -from .regexs import \ - get_reference_section_title_patterns, \ - get_reference_line_numeration_marker_patterns, \ - regex_match_list, \ - get_post_reference_section_title_patterns, \ - get_post_reference_section_keyword_patterns, \ - re_reference_line_bracket_markers, \ - re_reference_line_dot_markers, \ - re_reference_line_number_markers, \ - re_num +from refextract.references.regexs import ( + get_post_reference_section_keyword_patterns, + get_post_reference_section_title_patterns, + get_reference_line_numeration_marker_patterns, + get_reference_section_title_patterns, + re_num, + re_reference_line_bracket_markers, + re_reference_line_dot_markers, + re_reference_line_number_markers, + regex_match_list, +) LOGGER = logging.getLogger(__name__) @@ -393,11 +395,9 @@ def find_end_of_reference_section(docbody, # save the reference count num_match = regex_match_list(docbody[x].strip(), mk_patterns) if num_match: - try: + with contextlib.suppress(ValueError, IndexError): current_reference_count = int(num_match.group('marknum')) - except (ValueError, IndexError): - # non numerical references marking - pass + # look for a likely section title that would follow a reference # section: end_match = regex_match_list(docbody[x].strip(), t_patterns) diff --git a/refextract/references/kbs.py b/refextract/references/kbs.py index a1d8184..5a77dbf 100644 --- a/refextract/references/kbs.py +++ b/refextract/references/kbs.py @@ -21,21 +21,22 @@ # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. 
-import re -import six -import csv import codecs import contextlib +import csv +import re -from .config import CFG_REFEXTRACT_KBS -from .regexs import ( - re_kb_line, - re_regexp_character_class, - re_extract_quoted_text, +import six + +from refextract.documents.text import re_group_captured_multiple_space +from refextract.references.config import CFG_REFEXTRACT_KBS +from refextract.references.regexs import ( re_extract_char_class, + re_extract_quoted_text, + re_kb_line, re_punctuation, + re_regexp_character_class, ) -from ..documents.text import re_group_captured_multiple_space @contextlib.contextmanager @@ -91,12 +92,9 @@ def load_kb_by_type(kb_type, kb): def load_kb(path, builder): if isinstance(path, dict): return load_kb_from_iterable(path.items(), builder) - try: - path.startswith - except AttributeError: - return load_kb_from_iterable(path, builder) - else: + elif hasattr(path, 'startswith'): return load_kb_from_file(path, builder) + return load_kb_from_iterable(path, builder) def order_reportnum_patterns_bylen(numeration_patterns): @@ -342,25 +340,18 @@ def _add_institute_preprint_patterns(preprint_classifications, if m_preprint_classification: # This KB line contains a preprint classification for # the current institute - try: + with contextlib.suppress(AttributeError, NameError): current_institute_preprint_classifications.append((m_preprint_classification.group(1), m_preprint_classification.group(2))) - except (AttributeError, NameError): - # didn't match this line correctly - skip it - pass - # move on to the next line continue m_numeration_pattern = re_numeration_pattern.search(rawline) if m_numeration_pattern: # This KB line contains a preprint item numeration pattern # for the current institute - try: + with contextlib.suppress(AttributeError, NameError): current_institute_numerations.append( m_numeration_pattern.group(1)) - except (AttributeError, NameError): - # didn't match the numeration pattern correctly - skip it - pass continue _add_institute_preprint_patterns(current_institute_preprint_classifications, @@ -538,7 +529,7 @@ def build_journals_kb(knowledgebase): # Now, for every 'replacement term' found in the KB, if it is # not already in the KB as a "search term", add it: - for repl_term in repl_terms.keys(): + for repl_term in repl_terms: raw_repl_phrase = repl_term.upper() raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase) raw_repl_phrase = \ diff --git a/refextract/references/pdf.py b/refextract/references/pdf.py index a30992c..a3304f4 100644 --- a/refextract/references/pdf.py +++ b/refextract/references/pdf.py @@ -26,7 +26,7 @@ from PyPDF2 import PdfFileReader from PyPDF2.generic import ByteStringObject -from .regexs import re_reference_in_dest +from refextract.references.regexs import re_reference_in_dest LOGGER = logging.getLogger(__name__) @@ -173,10 +173,7 @@ def _match_urls_with_reference( if not two_column_layout or (two_column_layout and url_col == ref_column): urls_for_reference.add(url[0]) continue - elif is_last_reference_in_page or is_last_reference_in_page_two_col_layout: - urls_for_reference.add(url[0]) - continue - elif is_in_new_column: + elif is_last_reference_in_page or is_last_reference_in_page_two_col_layout or is_in_new_column: urls_for_reference.add(url[0]) continue elif is_url_unrelated_to_references: diff --git a/refextract/references/regexs.py b/refextract/references/regexs.py index d7e10de..330fbc6 100644 --- a/refextract/references/regexs.py +++ b/refextract/references/regexs.py @@ -22,7 +22,6 @@ # or submit itself to any 
jurisdiction. import re - from datetime import datetime # Sep @@ -193,7 +192,6 @@ def compute_pos_patterns(patterns): r"quan-ph": "quant-ph", r"nlin-cd": "nlin.cd", r"math-sp": "math.sp", - r"atro-ph": "astro-ph", r"ast-ph": "astro-ph", r"asyro-ph": "astro-ph", r"aastro-ph": "astro-ph", @@ -927,10 +925,7 @@ def regex_match_list(line, patterns): def remove_year(s, year=None): - if year: - year_pattern = re.escape(year) - else: - year_pattern = r"(?:19|20)\d{2}" + year_pattern = re.escape(year) if year else "(?:19|20)\\d{2}" s = re.sub(r'\[\s*%s\s*\]' % year_pattern, '', s) s = re.sub(r'\(\s*%s\s*\)' % year_pattern, '', s) s = re.sub(r'\s*%s\s*' % year_pattern, '', s) diff --git a/refextract/references/tag.py b/refextract/references/tag.py index b0156d2..7fba15b 100644 --- a/refextract/references/tag.py +++ b/refextract/references/tag.py @@ -22,66 +22,67 @@ # or submit itself to any jurisdiction. import re - from urllib.parse import unquote from unidecode import unidecode -from .config import \ - CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL, \ - CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL, \ - CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, \ - CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID, \ - CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID, \ - CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION, \ - CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION - -from ..documents.text import remove_and_record_multiple_spaces_in_line - -from .regexs import \ - re_ibid, \ - re_doi, \ - re_raw_url, \ - re_series_from_numeration, \ - re_punctuation, \ - re_correct_numeration_2nd_try_ptn1, \ - re_correct_numeration_2nd_try_ptn2, \ - re_correct_numeration_2nd_try_ptn3, \ - re_correct_numeration_2nd_try_ptn4, \ - re_numeration_nucphys_vol_page_yr, \ - re_numeration_vol_subvol_nucphys_yr_page, \ - re_numeration_nucphys_vol_yr_page, \ - re_multiple_hyphens, \ - re_numeration_vol_page_yr, \ - re_numeration_vol_yr_page, \ - re_numeration_vol_nucphys_series_yr_page, \ - re_numeration_vol_series_nucphys_page_yr, \ - re_numeration_vol_nucphys_series_page_yr, \ - re_html_tagged_url, \ - re_numeration_yr_vol_page, \ - re_numeration_vol_nucphys_page_yr, \ - re_wash_volume_tag, \ - re_numeration_vol_nucphys_yr_subvol_page, \ - re_quoted, \ - re_isbn, \ - re_arxiv, \ - re_arxiv_5digits, \ - re_new_arxiv, \ - re_new_arxiv_5digits, \ - re_pos, \ - re_pos_year_num, \ - re_series_from_numeration_after_volume, \ - RE_OLD_ARXIV, \ - RE_ARXIV_CATCHUP, \ - RE_ATLAS_CONF_PRE_2010, \ - RE_ATLAS_CONF_POST_2010 - -from ..authors.regexs import ( - get_author_regexps, +from refextract.authors.regexs import ( etal_matches, + get_author_regexps, re_ed_notation, - re_etal) -from ..documents.text import wash_line + re_etal, +) +from refextract.documents.text import ( + remove_and_record_multiple_spaces_in_line, + wash_line, +) +from refextract.references.config import ( + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL, + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL, + CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND, + CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION, + CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID, + CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION, + CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID, +) +from refextract.references.regexs import ( + RE_ARXIV_CATCHUP, + RE_ATLAS_CONF_POST_2010, + RE_ATLAS_CONF_PRE_2010, + RE_OLD_ARXIV, + re_arxiv, + re_arxiv_5digits, + re_correct_numeration_2nd_try_ptn1, + re_correct_numeration_2nd_try_ptn2, + re_correct_numeration_2nd_try_ptn3, + re_correct_numeration_2nd_try_ptn4, + re_doi, + re_html_tagged_url, + re_ibid, + re_isbn, + re_multiple_hyphens, + 
re_new_arxiv, + re_new_arxiv_5digits, + re_numeration_nucphys_vol_page_yr, + re_numeration_nucphys_vol_yr_page, + re_numeration_vol_nucphys_page_yr, + re_numeration_vol_nucphys_series_page_yr, + re_numeration_vol_nucphys_series_yr_page, + re_numeration_vol_nucphys_yr_subvol_page, + re_numeration_vol_page_yr, + re_numeration_vol_series_nucphys_page_yr, + re_numeration_vol_subvol_nucphys_yr_page, + re_numeration_vol_yr_page, + re_numeration_yr_vol_page, + re_pos, + re_pos_year_num, + re_punctuation, + re_quoted, + re_raw_url, + re_series_from_numeration, + re_series_from_numeration_after_volume, + re_wash_volume_tag, +) def tag_reference_line(line, kbs, record_titles_count): @@ -439,7 +440,7 @@ def tag_atlas_conf(line): def identifiy_journals_re(line, kb_journals): matches = {} - for pattern, dummy_journal in kb_journals: + for pattern, _dummy_journal in kb_journals: match = re.search(pattern, line) if match: matches[match.start()] = match.group(0) @@ -558,10 +559,7 @@ def extract_series_from_volume(volume): def create_numeration_tag(info): - if info['series']: - series_and_volume = info['series'] + info['volume'] - else: - series_and_volume = info['volume'] + series_and_volume = info['series'] + info['volume'] if info['series'] else info['volume'] numeration_tags = u' %s' % series_and_volume if info.get('year', False): numeration_tags += u' (%(year)s)' % info @@ -861,7 +859,7 @@ def identify_and_tag_collaborations(line, collaborations_kb): which won't influence the reference splitting heuristics (used when looking at mulitple tags in a line). """ - for dummy_collab, re_collab in collaborations_kb.items(): + for _dummy_collab, re_collab in collaborations_kb.items(): matches = re_collab.finditer(strip_tags(line)) for match in reversed(list(matches)): @@ -967,16 +965,15 @@ def identify_and_tag_authors(line, authors_kb): add_to_misc = "" # If a semi-colon was found at the end of this author group, keep it in misc # so that it can be looked at for splitting heurisitics - if len(output_line) > m['end']: - if output_line[m['end']].strip(" ,.") == ';': - add_to_misc = ';' + if len(output_line) > m['end'] and output_line[m['end']].strip(" ,.") == ';': + add_to_misc = ';' # Standardize eds. notation tmp_output_line = re.sub(re_ed_notation, '(ed.)', - output_line[start:end], re.IGNORECASE) + output_line[start:end], flags=re.IGNORECASE) # Standardize et al. notation tmp_output_line = re.sub(re_etal, 'et al.', - tmp_output_line, re.IGNORECASE) + tmp_output_line, flags=re.IGNORECASE) # Strip tmp_output_line = tmp_output_line.lstrip('.').strip(",:;- [](") if not tmp_output_line.endswith('(ed.)'): @@ -1006,7 +1003,7 @@ def identify_and_tag_authors(line, authors_kb): ed_notation = " (eds.)" # Standardize et al. 
notation tmp_output_line = re.sub(re_etal, 'et al.', - m['author_names'], re.IGNORECASE) + m['author_names'], flags=re.IGNORECASE) # remove any characters which denote this author group # to be editors, just take the # author names, and append '(ed.)' @@ -1032,7 +1029,7 @@ def sum_2_dictionaries(dicta, dictb): @return: (dictionary) - the sum of the 2 dictionaries """ dict_out = dicta.copy() - for key in dictb.keys(): + for key in dictb: if 'key' in dict_out: # Add the sum for key in dictb to that of dict_out: dict_out[key] += dictb[key] diff --git a/refextract/references/text.py b/refextract/references/text.py index 18c6f56..4cfc643 100644 --- a/refextract/references/text.py +++ b/refextract/references/text.py @@ -25,15 +25,18 @@ import re from inspire_utils.record import replace_undesirable_characters -from ..documents.text import ( + +from refextract.documents.text import ( join_lines, - repair_broken_urls, re_multiple_space, - remove_page_boundary_lines + remove_page_boundary_lines, + repair_broken_urls, +) +from refextract.references.config import CFG_REFEXTRACT_MAX_LINES +from refextract.references.find import ( + find_end_of_reference_section, + get_reference_section_beginning, ) - -from .config import CFG_REFEXTRACT_MAX_LINES -from .find import find_end_of_reference_section, get_reference_section_beginning LOGGER = logging.getLogger(__name__) diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..d347e32 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,28 @@ +target-version = "py311" +[lint.flake8-tidy-imports] +ban-relative-imports = "all" + +[lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", + # flake8-tidy-imports + "TID", + # flake8-pytest-style + "PT", +] +ignore = ["B904","E501"] + +[lint.pycodestyle] +ignore-overlong-task-comments = true + +[lint.pydocstyle] +convention = "google" diff --git a/setup.py b/setup.py index a57faca..a75a1e0 100644 --- a/setup.py +++ b/setup.py @@ -25,10 +25,10 @@ from setuptools import find_packages, setup - url = 'https://github.com/inspirehep/refextract' -readme = open('README.rst').read() +with open('README.rst') as file: + readme = file.read() install_requires = [ 'PyPDF2~=1.0,>=1.26.0', @@ -57,7 +57,7 @@ } extras_require['all'] = [] -for name, reqs in extras_require.items(): +for _name, reqs in extras_require.items(): extras_require['all'].extend(reqs) packages = find_packages() diff --git a/tests/conftest.py b/tests/conftest.py index 0860cab..cd19010 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,7 +26,7 @@ import pytest -@pytest.fixture +@pytest.fixture() def pdf_files(): path_to_pdfs = os.path.join(os.path.dirname(__file__), 'data') pdfs = os.listdir(path_to_pdfs) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 882022f..fc6bde9 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -6,7 +6,7 @@ @pytest.fixture(autouse=True, scope="session") def app(): app = create_app() - yield app + return app @pytest.fixture() diff --git a/tests/integration/test_views.py b/tests/integration/test_views.py index 562cd60..e382ffe 100644 --- a/tests/integration/test_views.py +++ b/tests/integration/test_views.py @@ -75,7 +75,7 @@ def test_extract_journal_info_when_timeout_from_refextract( data=json.dumps(payload), ) assert response.status_code == 500 - assert {'message': "Can not extract publication info data. 
Reason: 'test message'"} == response.json + assert response.json == {'message': "Can not extract publication info data. Reason: 'test message'"} def test_extract_journal_info_for_multiple_pubinfos(app_client): @@ -141,7 +141,8 @@ def test_extract_extract_references_from_text(app_client): assert "year" in response.json["extracted_references"][0] -@mock.patch("refextract.app.extract_references_from_string", side_effect=KeyError("test message")) +@mock.patch("refextract.app.extract_references_from_string", + side_effect=KeyError("test message")) def test_extract_references_from_text_when_timeout_from_refextract( mock_extract_refs, app_client ): @@ -159,7 +160,7 @@ def test_extract_references_from_text_when_timeout_from_refextract( "/extract_references_from_text", headers=headers, data=json.dumps(payload) ) assert response.status_code == 500 - assert {'message': "Can not extract references. Reason: 'test message'"} == response.json + assert response.json == {'message': "Can not extract references. Reason: 'test message'"} @pytest.mark.vcr() diff --git a/tests/test_api.py b/tests/test_api.py index 5f5c17e..4948ac2 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -26,22 +26,22 @@ from refextract.references.api import ( extract_journal_reference, + extract_references_from_file, extract_references_from_string, extract_references_from_url, - extract_references_from_file, ) - from refextract.references.errors import FullTextNotAvailableError -@pytest.fixture +@pytest.fixture() def kbs_override(): return { "books": [ ('Griffiths, David', 'Introduction to elementary particles', '2008') ], "journals": [ - ("PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS", "Phys.Rev.ST Accel.Beams"), + ("PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS", + "Phys.Rev.ST Accel.Beams"), ("PHYS REV D", "Phys.Rev.;D"), ("PHYS REV", "Phys.Rev."), ("PHYS REV LETT", "Phys.Rev.Lett."), @@ -148,15 +148,15 @@ def test_extract_references_from_url(pdf_files): assert len(r) == 36 assert 'url' in r[0] + url = "http://www.example.com" + responses.add( + responses.GET, + url, + body="File not found!", + status=404, + content_type='text/plain', + ) with pytest.raises(FullTextNotAvailableError): - url = "http://www.example.com" - responses.add( - responses.GET, - url, - body="File not found!", - status=404, - content_type='text/plain', - ) extract_references_from_url(url) diff --git a/tests/test_engine.py b/tests/test_engine.py index 0eeb5d4..4486462 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -27,7 +27,6 @@ get_plaintext_document_body, parse_references, ) - from refextract.references.errors import UnknownDocumentTypeError @@ -329,10 +328,10 @@ def test_get_plaintext_document_body(tmpdir): f.write("".join(input)) assert input == get_plaintext_document_body(str(f)) + html = "Some page" + f = tmpdir.join("page.html") + f.write(html) with pytest.raises(UnknownDocumentTypeError) as excinfo: - html = "Some page" - f = tmpdir.join("page.html") - f.write(html) get_plaintext_document_body(str(f)) assert 'text/html' in excinfo.value.args diff --git a/tests/test_kbs.py b/tests/test_kbs.py index 7d9c573..c3968a5 100644 --- a/tests/test_kbs.py +++ b/tests/test_kbs.py @@ -35,14 +35,14 @@ def test_get_kbs_caches_journal_dict(): first_cache = get_kbs(custom_kbs={"journals": journals}).copy() assert len(first_cache["journals"]) == 3 - assert ["JOURNAL OF TESTING", "J TESTING"] == first_cache["journals"][-1] + assert first_cache["journals"][-1] == ["JOURNAL OF TESTING", "J TESTING"] journals = journals.copy() 
second_cache = get_kbs(custom_kbs={"journals": journals}) # the cache is reused, so identity of the cache elements doesn't change assert all( cached_first is cached_second for (cached_first, cached_second) - in zip(first_cache["journals"], second_cache["journals"]) + in zip(first_cache["journals"], second_cache["journals"], strict=False) ) @@ -55,7 +55,7 @@ def test_get_kbs_invalidates_cache_if_input_changes(): # the cache is invalidated, so identity of the cache elements changes assert all( cached_first is not cached_second for (cached_first, cached_second) - in zip(first_cache["journals"], second_cache["journals"]) + in zip(first_cache["journals"], second_cache["journals"], strict=False) ) assert len(second_cache["journals"]) == 3 - assert ["JOURNAL OF TESTING", "J TEST"] == second_cache["journals"][-1] + assert second_cache["journals"][-1] == ["JOURNAL OF TESTING", "J TEST"] diff --git a/tests/test_pdf.py b/tests/test_pdf.py index 61f6b01..3ce6923 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -84,7 +84,6 @@ def test_extract_texkeys_and_urls_from_pdf(pdf_files): "urls": { "http://dx.doi.org/10.1016/j.physletb.2013.08.026", "http://www.arXiv.org/abs/1307.1432", - "http://www.arXiv.org/abs/1307.1432", }, }, {"texkey": "CMS_combination", "urls": {"http://www.arXiv.org/abs/1412.8662"}}, @@ -111,7 +110,6 @@ def test_extract_texkeys_and_urls_from_pdf(pdf_files): "urls": { "http://dx.doi.org/10.1103/PhysRevD.89.092007", "http://www.arXiv.org/abs/1312.5353", - "http://www.arXiv.org/abs/1312.5353", }, }, { diff --git a/tests/test_tag.py b/tests/test_tag.py index e52ca38..ab14c8e 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -22,10 +22,10 @@ # or submit itself to any jurisdiction. from refextract.references.tag import ( - tag_arxiv, - identify_ibids, find_numeration, find_numeration_more, + identify_ibids, + tag_arxiv, )
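Postscript: the scratch notebook (Untitled.ipynb) probes the exact semantics that find_substring_ignore_special_chars in engine.py relies on — after a for loop ends, whether by break or by exhaustion, the variable bound by enumerate() keeps its last value, so returning real_index after the loop is safe. This is why the loop keeps its enumerate() form rather than a hand-maintained counter. A minimal repro, assuming only standard CPython behavior:

    s = "ab-cd"
    i = 0
    real_index = 0
    for real_index, char in enumerate(s):
        if char.isalnum():
            i += 1
        if i > 2:  # stop once the third alphanumeric character is seen
            break
    assert real_index == 3  # index of 'c'; the '-' at index 2 did not count

    for real_index, char in enumerate("abc"):
        pass
    assert real_index == 2  # after exhaustion the last index (len - 1) sticks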