From 9e547d649ab913da02e2f8f09694c957f1ab458a Mon Sep 17 00:00:00 2001
From: Ghislain Vaillant <ghislain.vaillant@inria.fr>
Date: Mon, 1 Jul 2024 14:56:07 +0200
Subject: [PATCH] STY: Fix warnings from static analysis

---
 medkit/audio/preprocessing/resampler.py         |  2 +-
 medkit/audio/transcription/hf_transcriber.py    |  2 +-
 medkit/audio/transcription/sb_transcriber.py    |  2 +-
 .../transcription/transcribed_text_document.py  |  8 +++-----
 medkit/text/ner/_base_simstring_matcher.py      | 17 ++++++++---------
 5 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/medkit/audio/preprocessing/resampler.py b/medkit/audio/preprocessing/resampler.py
index a8d6e9e4..44264032 100644
--- a/medkit/audio/preprocessing/resampler.py
+++ b/medkit/audio/preprocessing/resampler.py
@@ -19,7 +19,7 @@ class Resampler(PreprocessingOperation):
     sample_rate : int
         Target sample rate to resample to, in samples per second.
     fast : bool, default=False
-        If `True`, prefer speed over quality and use resampy's "kaiser_fast" filter
+        If `True`, prefer speed to quality and use resampy's "kaiser_fast" filter
         instead of "kaiser_best".
     uid : str, optional
         Identifier of the resampler.
diff --git a/medkit/audio/transcription/hf_transcriber.py b/medkit/audio/transcription/hf_transcriber.py
index 97b57059..0406c3f6 100644
--- a/medkit/audio/transcription/hf_transcriber.py
+++ b/medkit/audio/transcription/hf_transcriber.py
@@ -21,7 +21,7 @@ class HFTranscriber(Operation):
 
     For each segment given as input, a transcription attribute will be created
     with the transcribed text as value. If needed, a text document can later be
-    created from all the transcriptions of a audio document using
+    created from all the transcriptions of an audio document using
     :func:`~medkit.audio.transcription.TranscribedTextDocument.from_audio_doc
     <TranscribedTextDocument.from_audio_doc>`
 
diff --git a/medkit/audio/transcription/sb_transcriber.py b/medkit/audio/transcription/sb_transcriber.py
index ea666e4c..2ad5a73e 100644
--- a/medkit/audio/transcription/sb_transcriber.py
+++ b/medkit/audio/transcription/sb_transcriber.py
@@ -20,7 +20,7 @@ class SBTranscriber(Operation):
 
     For each segment given as input, a transcription attribute will be created
     with the transcribed text as value. If needed, a text document can later be
-    created from all the transcriptions of a audio document using
+    created from all the transcriptions of an audio document using
     :func:`~medkit.audio.transcription.TranscribedTextDocument.from_audio_doc
     <TranscribedTextDocument.from_audio_doc>`
 
diff --git a/medkit/audio/transcription/transcribed_text_document.py b/medkit/audio/transcription/transcribed_text_document.py
index 7defbd58..3635163d 100644
--- a/medkit/audio/transcription/transcribed_text_document.py
+++ b/medkit/audio/transcription/transcribed_text_document.py
@@ -28,9 +28,8 @@ class TranscribedTextDocument(TextDocument):
         Mapping between text characters spans in this document and
         corresponding audio spans in the original audio.
     audio_doc_id: str, optional
-        Id of the original
-        :class:`~medkit.core.audio.document.AudioDocument` that was
-        transcribed, if known.
+        Identifier for the original :class:`~medkit.core.audio.document.AudioDocument`
+        that was transcribed, if known.
     anns: sequence of TextAnnotation, optional
         Annotations of the document.
     attrs: sequence of Attribute, optional
@@ -75,8 +74,7 @@ def get_containing_audio_spans(self, text_ann_spans: list[AnyTextSpan]) -> list[
         the span 15 to 25, then the containing audio span will be the one ranging
         from 1.0 to 20.0 seconds.
 
-        Note that some text annotations maybe be contained in more that one
-        audio spans.
+        Note that some text annotations may be contained in more than one audio span.
 
         Parameters
         ----------
diff --git a/medkit/text/ner/_base_simstring_matcher.py b/medkit/text/ner/_base_simstring_matcher.py
index 87a54aac..aeff4e49 100644
--- a/medkit/text/ner/_base_simstring_matcher.py
+++ b/medkit/text/ner/_base_simstring_matcher.py
@@ -59,8 +59,7 @@ class BaseSimstringMatcherRule:
         Whether to use ASCII-only versions of the rule term and input texts when
         looking for matches (non-ASCII chars replaced by closest ASCII chars).
     normalizations : list of BaseSimstringMatcherNormalization, optional
-        Optional list of normalization attributes that should be attached to the
-        entities created
+        List of normalization attributes that should be attached to the entities created.
     """
 
     term: str
@@ -157,7 +156,7 @@ class BaseSimstringMatcher(NEROperation):
     similarity : str, default="jaccard"
         Similarity metric to use.
     spacy_tokenization_language : str, optional
-        2-letter code (ex: "fr", "en", etc) designating the language of the
+        2-letter code (ex: "fr", "en", etc.) designating the language of the
         spacy model to use for tokenization. If provided, spacy will be used
         to tokenize input segments and filter out some tokens based on their
         part-of-speech tags, such as determinants, conjunctions and
@@ -174,7 +173,7 @@ class BaseSimstringMatcher(NEROperation):
     attrs_to_copy : list of str, optional
         Labels of the attributes that should be copied from the source
         segment to the created entity. Useful for propagating context
-        attributes (negation, antecedent, etc).
+        attributes (negation, antecedent, etc.).
     name : str, optional
         Name describing the matcher (defaults to the class name).
     uid : str, optional
@@ -387,7 +386,7 @@ def build_simstring_matcher_databases(
         Rules to add to databases
     """
     # the params passed to simstring.writer are copy/pasted from QuickUMLS
-    # cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/a3ba0b3559da2574a907f4d41aa0f2c1c0d5ce0a/quickumls/toolbox.py#L173
+    # cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/1.4.0/quickumls/toolbox.py#L173
     simstring_db_writer = simstring.writer(
         str(simstring_db_file),
         3,  # unit of character n-grams
@@ -417,7 +416,7 @@ def build_simstring_matcher_databases(
     rules_db.close()
 
 
-_TOKENIZATION_PATTERN = re.compile(r"[\w]+|[^\w ]")
+_TOKENIZATION_PATTERN = re.compile(r"\w+|[^\w ]")
 
 
 def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length: int) -> Iterator[tuple[int, int]]:
@@ -476,7 +475,7 @@ def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length:
             # candidate is too long, stop appending tokens
             if length > max_length:
                 break
-            yield (start, end)
+            yield start, end
 
 
 def _build_candidate_ranges_with_spacy(
@@ -517,7 +516,7 @@ def _build_candidate_ranges_with_spacy(
     ['I have', 'have', 'have type', 'type', 'type 2', '2 diabetes', 'diabetes']
     """
 
-    # don't allow candidates to start or end with pre/post positions,
+    # don't allow candidates to start or end with adpositions,
     # determinants or conjunctions
     def is_invalid_boundary_token(token):
         return token.is_punct or token.is_space or token.pos_ in ("ADP", "DET", "SCONJ", "CCONJ", "CONJ")
@@ -544,7 +543,7 @@ def is_invalid_boundary_token(token):
             # candidate is too long, stop appending tokens
             if length > max_length:
                 break
-            yield (span.start_char, span.end_char)
+            yield span.start_char, span.end_char
 
 
 def _get_similarity_score(text_1, text_2, similarity_name, ngram_size=3):