Skip to content

Commit

Permalink
STY: Fix warnings from static analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
ghisvail committed Jul 1, 2024
1 parent 2d4becb commit b1103dd
Showing 1 changed file with 8 additions and 9 deletions.
17 changes: 8 additions & 9 deletions medkit/text/ner/_base_simstring_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,7 @@ class BaseSimstringMatcherRule:
Whether to use ASCII-only versions of the rule term and input texts when
looking for matches (non-ASCII chars replaced by closest ASCII chars).
normalizations : list of BaseSimstringMatcherNormalization, optional
Optional list of normalization attributes that should be attached to the
entities created
List of normalization attributes that should be attached to the entities created.
"""

term: str
Expand Down Expand Up @@ -157,7 +156,7 @@ class BaseSimstringMatcher(NEROperation):
similarity : str, default="jaccard"
Similarity metric to use.
spacy_tokenization_language : str, optional
2-letter code (ex: "fr", "en", etc) designating the language of the
2-letter code (ex: "fr", "en", etc.) designating the language of the
spacy model to use for tokenization. If provided, spacy will be used
to tokenize input segments and filter out some tokens based on their
part-of-speech tags, such as determiners, conjunctions and
Expand All @@ -174,7 +173,7 @@ class BaseSimstringMatcher(NEROperation):
attrs_to_copy : list of str, optional
Labels of the attributes that should be copied from the source
segment to the created entity. Useful for propagating context
attributes (negation, antecedent, etc).
attributes (negation, antecedent, etc.).
name : str, optional
Name describing the matcher (defaults to the class name).
uid : str, optional
Expand Down Expand Up @@ -387,7 +386,7 @@ def build_simstring_matcher_databases(
Rules to add to databases
"""
# the params passed to simstring.writer are copy/pasted from QuickUMLS
# cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/a3ba0b3559da2574a907f4d41aa0f2c1c0d5ce0a/quickumls/toolbox.py#L173
# cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/1.4.0/quickumls/toolbox.py#L173
simstring_db_writer = simstring.writer(
str(simstring_db_file),
3, # unit of character n-grams
Expand Down Expand Up @@ -417,7 +416,7 @@ def build_simstring_matcher_databases(
rules_db.close()


_TOKENIZATION_PATTERN = re.compile(r"[\w]+|[^\w ]")
_TOKENIZATION_PATTERN = re.compile(r"\w+|[^\w ]")


def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length: int) -> Iterator[tuple[int, int]]:
Expand Down Expand Up @@ -476,7 +475,7 @@ def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length:
# candidate is too long, stop appending tokens
if length > max_length:
break
yield (start, end)
yield start, end


def _build_candidate_ranges_with_spacy(
Expand Down Expand Up @@ -517,7 +516,7 @@ def _build_candidate_ranges_with_spacy(
['I have', 'have', 'have type', 'type', 'type 2', '2 diabetes', 'diabetes']
"""

# don't allow candidates to start or end with pre/post positions,
# don't allow candidates to start or end with adpositions,
# determiners or conjunctions
def is_invalid_boundary_token(token):
return token.is_punct or token.is_space or token.pos_ in ("ADP", "DET", "SCONJ", "CCONJ", "CONJ")
Expand All @@ -544,7 +543,7 @@ def is_invalid_boundary_token(token):
# candidate is too long, stop appending tokens
if length > max_length:
break
yield (span.start_char, span.end_char)
yield span.start_char, span.end_char


def _get_similarity_score(text_1, text_2, similarity_name, ngram_size=3):
Expand Down

0 comments on commit b1103dd

Please sign in to comment.