STY: Fix warnings from static analysis

medkit-lib · Jul 1, 2024 · b1103dd · b1103dd
1 parent 2d4becb
commit b1103dd
Showing 1 changed file with 8 additions and 9 deletions.
diff --git a/medkit/text/ner/_base_simstring_matcher.py b/medkit/text/ner/_base_simstring_matcher.py
@@ -59,8 +59,7 @@ class BaseSimstringMatcherRule:
         Whether to use ASCII-only versions of the rule term and input texts when
         looking for matches (non-ASCII chars replaced by closest ASCII chars).
     normalizations : list of BaseSimstringMatcherNormalization, optional
-        Optional list of normalization attributes that should be attached to the
-        entities created
+        List of normalization attributes that should be attached to the entities created.
     """
 
     term: str
@@ -157,7 +156,7 @@ class BaseSimstringMatcher(NEROperation):
     similarity : str, default="jaccard"
         Similarity metric to use.
     spacy_tokenization_language : str, optional
-        2-letter code (ex: "fr", "en", etc) designating the language of the
+        2-letter code (ex: "fr", "en", etc.) designating the language of the
         spacy model to use for tokenization. If provided, spacy will be used
         to tokenize input segments and filter out some tokens based on their
         part-of-speech tags, such as determiners, conjunctions and
@@ -174,7 +173,7 @@ class BaseSimstringMatcher(NEROperation):
     attrs_to_copy : list of str, optional
         Labels of the attributes that should be copied from the source
         segment to the created entity. Useful for propagating context
-        attributes (negation, antecedent, etc).
+        attributes (negation, antecedent, etc.).
     name : str, optional
         Name describing the matcher (defaults to the class name).
     uid : str, optional
@@ -387,7 +386,7 @@ def build_simstring_matcher_databases(
         Rules to add to databases
     """
     # the params passed to simstring.writer are copy/pasted from QuickUMLS
-    # cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/a3ba0b3559da2574a907f4d41aa0f2c1c0d5ce0a/quickumls/toolbox.py#L173
+    # cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/1.4.0/quickumls/toolbox.py#L173
     simstring_db_writer = simstring.writer(
         str(simstring_db_file),
         3,  # unit of character n-grams
@@ -417,7 +416,7 @@ def build_simstring_matcher_databases(
     rules_db.close()
 
 
-_TOKENIZATION_PATTERN = re.compile(r"[\w]+|[^\w ]")
+_TOKENIZATION_PATTERN = re.compile(r"\w+|[^\w ]")
 
 
 def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length: int) -> Iterator[tuple[int, int]]:
@@ -476,7 +475,7 @@ def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length:
             # candidate is too long, stop appending tokens
             if length > max_length:
                 break
-            yield (start, end)
+            yield start, end
 
 
 def _build_candidate_ranges_with_spacy(
@@ -517,7 +516,7 @@ def _build_candidate_ranges_with_spacy(
     ['I have', 'have', 'have type', 'type', 'type 2', '2 diabetes', 'diabetes']
     """
 
-    # don't allow candidates to start or end with pre/post positions,
+    # don't allow candidates to start or end with adpositions,
     # determiners or conjunctions (punctuation and whitespace tokens are rejected too)
     def is_invalid_boundary_token(token):
         return token.is_punct or token.is_space or token.pos_ in ("ADP", "DET", "SCONJ", "CCONJ", "CONJ")
@@ -544,7 +543,7 @@ def is_invalid_boundary_token(token):
             # candidate is too long, stop appending tokens
             if length > max_length:
                 break
-            yield (span.start_char, span.end_char)
+            yield span.start_char, span.end_char
 
 
 def _get_similarity_score(text_1, text_2, similarity_name, ngram_size=3):