diff --git a/medkit/text/ner/_base_simstring_matcher.py b/medkit/text/ner/_base_simstring_matcher.py index 87a54aac..aeff4e49 100644 --- a/medkit/text/ner/_base_simstring_matcher.py +++ b/medkit/text/ner/_base_simstring_matcher.py @@ -59,8 +59,7 @@ class BaseSimstringMatcherRule: Whether to use ASCII-only versions of the rule term and input texts when looking for matches (non-ASCII chars replaced by closest ASCII chars). normalizations : list of BaseSimstringMatcherNormalization, optional - Optional list of normalization attributes that should be attached to the - entities created + List of normalization attributes that should be attached to the entities created. """ term: str @@ -157,7 +156,7 @@ class BaseSimstringMatcher(NEROperation): similarity : str, default="jaccard" Similarity metric to use. spacy_tokenization_language : str, optional - 2-letter code (ex: "fr", "en", etc) designating the language of the + 2-letter code (ex: "fr", "en", etc.) designating the language of the spacy model to use for tokenization. If provided, spacy will be used to tokenize input segments and filter out some tokens based on their part-of-speech tags, such as determinants, conjunctions and @@ -174,7 +173,7 @@ class BaseSimstringMatcher(NEROperation): attrs_to_copy : list of str, optional Labels of the attributes that should be copied from the source segment to the created entity. Useful for propagating context - attributes (negation, antecedent, etc). + attributes (negation, antecedent, etc.). name : str, optional Name describing the matcher (defaults to the class name). uid : str, optional @@ -387,7 +386,7 @@ def build_simstring_matcher_databases( Rules to add to databases """ # the params passed to simstring.writer are copy/pasted from QuickUMLS - # cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/a3ba0b3559da2574a907f4d41aa0f2c1c0d5ce0a/quickumls/toolbox.py#L173 + # cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/1.4.0/quickumls/toolbox.py#L173 simstring_db_writer = simstring.writer( str(simstring_db_file), 3, # unit of character n-grams @@ -417,7 +416,7 @@ def build_simstring_matcher_databases( rules_db.close() -_TOKENIZATION_PATTERN = re.compile(r"[\w]+|[^\w ]") +_TOKENIZATION_PATTERN = re.compile(r"\w+|[^\w ]") def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length: int) -> Iterator[tuple[int, int]]: @@ -476,7 +475,7 @@ def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length: # candidate is too long, stop appending tokens if length > max_length: break - yield (start, end) + yield start, end def _build_candidate_ranges_with_spacy( @@ -517,7 +516,7 @@ def _build_candidate_ranges_with_spacy( ['I have', 'have', 'have type', 'type', 'type 2', '2 diabetes', 'diabetes'] """ - # don't allow candidates to start or end with pre/post positions, + # don't allow candidates to start or end with adpositions, # determinants or conjunctions def is_invalid_boundary_token(token): return token.is_punct or token.is_space or token.pos_ in ("ADP", "DET", "SCONJ", "CCONJ", "CONJ") @@ -544,7 +543,7 @@ def is_invalid_boundary_token(token): # candidate is too long, stop appending tokens if length > max_length: break - yield (span.start_char, span.end_char) + yield span.start_char, span.end_char def _get_similarity_score(text_1, text_2, similarity_name, ngram_size=3):