Skip to content

Commit

Permalink
STY: Fix warnings from static analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
ghisvail committed Jul 1, 2024
1 parent 2d4becb commit 9e547d6
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 17 deletions.
2 changes: 1 addition & 1 deletion medkit/audio/preprocessing/resampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class Resampler(PreprocessingOperation):
sample_rate : int
Target sample rate to resample to, in samples per second.
fast : bool, default=False
If `True`, prefer speed over quality and use resampy's "kaiser_fast" filter
If `True`, prefer speed to quality and use resampy's "kaiser_fast" filter
instead of "kaiser_best".
uid : str, optional
Identifier of the resampler.
Expand Down
2 changes: 1 addition & 1 deletion medkit/audio/transcription/hf_transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class HFTranscriber(Operation):
For each segment given as input, a transcription attribute will be created
with the transcribed text as value. If needed, a text document can later be
created from all the transcriptions of a audio document using
created from all the transcriptions of an audio document using
:func:`~medkit.audio.transcription.TranscribedTextDocument.from_audio_doc
<TranscribedTextDocument.from_audio_doc>`
Expand Down
2 changes: 1 addition & 1 deletion medkit/audio/transcription/sb_transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class SBTranscriber(Operation):
For each segment given as input, a transcription attribute will be created
with the transcribed text as value. If needed, a text document can later be
created from all the transcriptions of a audio document using
created from all the transcriptions of an audio document using
:func:`~medkit.audio.transcription.TranscribedTextDocument.from_audio_doc
<TranscribedTextDocument.from_audio_doc>`
Expand Down
8 changes: 3 additions & 5 deletions medkit/audio/transcription/transcribed_text_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,8 @@ class TranscribedTextDocument(TextDocument):
Mapping between text characters spans in this document and
corresponding audio spans in the original audio.
audio_doc_id: str, optional
Id of the original
:class:`~medkit.core.audio.document.AudioDocument` that was
transcribed, if known.
Identifier for the original :class:`~medkit.core.audio.document.AudioDocument`
that was transcribed, if known.
anns: sequence of TextAnnotation, optional
Annotations of the document.
attrs: sequence of Attribute, optional
Expand Down Expand Up @@ -75,8 +74,7 @@ def get_containing_audio_spans(self, text_ann_spans: list[AnyTextSpan]) -> list[
the span 15 to 25, then the containing audio span will be the one ranging
from 1.0 to 20.0 seconds.
Note that some text annotations maybe be contained in more that one
audio spans.
Note that some text annotations may be contained in more than one audio span.
Parameters
----------
Expand Down
17 changes: 8 additions & 9 deletions medkit/text/ner/_base_simstring_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,7 @@ class BaseSimstringMatcherRule:
Whether to use ASCII-only versions of the rule term and input texts when
looking for matches (non-ASCII chars replaced by closest ASCII chars).
normalizations : list of BaseSimstringMatcherNormalization, optional
Optional list of normalization attributes that should be attached to the
entities created
List of normalization attributes that should be attached to the entities created.
"""

term: str
Expand Down Expand Up @@ -157,7 +156,7 @@ class BaseSimstringMatcher(NEROperation):
similarity : str, default="jaccard"
Similarity metric to use.
spacy_tokenization_language : str, optional
2-letter code (ex: "fr", "en", etc) designating the language of the
2-letter code (e.g. "fr" or "en") designating the language of the
spacy model to use for tokenization. If provided, spacy will be used
to tokenize input segments and filter out some tokens based on their
part-of-speech tags, such as determiners, conjunctions and
Expand All @@ -174,7 +173,7 @@ class BaseSimstringMatcher(NEROperation):
attrs_to_copy : list of str, optional
Labels of the attributes that should be copied from the source
segment to the created entity. Useful for propagating context
attributes (negation, antecedent, etc).
attributes (negation, antecedent, etc.).
name : str, optional
Name describing the matcher (defaults to the class name).
uid : str, optional
Expand Down Expand Up @@ -387,7 +386,7 @@ def build_simstring_matcher_databases(
Rules to add to databases
"""
# the params passed to simstring.writer are copy/pasted from QuickUMLS
# cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/a3ba0b3559da2574a907f4d41aa0f2c1c0d5ce0a/quickumls/toolbox.py#L173
# cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/1.4.0/quickumls/toolbox.py#L173
simstring_db_writer = simstring.writer(
str(simstring_db_file),
3, # unit of character n-grams
Expand Down Expand Up @@ -417,7 +416,7 @@ def build_simstring_matcher_databases(
rules_db.close()


_TOKENIZATION_PATTERN = re.compile(r"[\w]+|[^\w ]")
_TOKENIZATION_PATTERN = re.compile(r"\w+|[^\w ]")


def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length: int) -> Iterator[tuple[int, int]]:
Expand Down Expand Up @@ -476,7 +475,7 @@ def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length:
# candidate is too long, stop appending tokens
if length > max_length:
break
yield (start, end)
yield start, end


def _build_candidate_ranges_with_spacy(
Expand Down Expand Up @@ -517,7 +516,7 @@ def _build_candidate_ranges_with_spacy(
['I have', 'have', 'have type', 'type', 'type 2', '2 diabetes', 'diabetes']
"""

# don't allow candidates to start or end with pre/post positions,
# don't allow candidates to start or end with adpositions,
# determiners or conjunctions
def is_invalid_boundary_token(token):
return token.is_punct or token.is_space or token.pos_ in ("ADP", "DET", "SCONJ", "CCONJ", "CONJ")
Expand All @@ -544,7 +543,7 @@ def is_invalid_boundary_token(token):
# candidate is too long, stop appending tokens
if length > max_length:
break
yield (span.start_char, span.end_char)
yield span.start_char, span.end_char


def _get_similarity_score(text_1, text_2, similarity_name, ngram_size=3):
Expand Down

0 comments on commit 9e547d6

Please sign in to comment.