diff --git a/medkit/audio/preprocessing/resampler.py b/medkit/audio/preprocessing/resampler.py index a8d6e9e4..44264032 100644 --- a/medkit/audio/preprocessing/resampler.py +++ b/medkit/audio/preprocessing/resampler.py @@ -19,7 +19,7 @@ class Resampler(PreprocessingOperation): sample_rate : int Target sample rate to resample to, in samples per second. fast : bool, default=False - If `True`, prefer speed over quality and use resampy's "kaiser_fast" filter + If `True`, prefer speed to quality and use resampy's "kaiser_fast" filter instead of "kaiser_best". uid : str, optional Identifier of the resampler. diff --git a/medkit/audio/transcription/hf_transcriber.py b/medkit/audio/transcription/hf_transcriber.py index 97b57059..0406c3f6 100644 --- a/medkit/audio/transcription/hf_transcriber.py +++ b/medkit/audio/transcription/hf_transcriber.py @@ -21,7 +21,7 @@ class HFTranscriber(Operation): For each segment given as input, a transcription attribute will be created with the transcribed text as value. If needed, a text document can later be - created from all the transcriptions of a audio document using + created from all the transcriptions of an audio document using :func:`~medkit.audio.transcription.TranscribedTextDocument.from_audio_doc ` diff --git a/medkit/audio/transcription/sb_transcriber.py b/medkit/audio/transcription/sb_transcriber.py index ea666e4c..2ad5a73e 100644 --- a/medkit/audio/transcription/sb_transcriber.py +++ b/medkit/audio/transcription/sb_transcriber.py @@ -20,7 +20,7 @@ class SBTranscriber(Operation): For each segment given as input, a transcription attribute will be created with the transcribed text as value. 
If needed, a text document can later be - created from all the transcriptions of a audio document using + created from all the transcriptions of an audio document using :func:`~medkit.audio.transcription.TranscribedTextDocument.from_audio_doc ` diff --git a/medkit/audio/transcription/transcribed_text_document.py b/medkit/audio/transcription/transcribed_text_document.py index 7defbd58..3635163d 100644 --- a/medkit/audio/transcription/transcribed_text_document.py +++ b/medkit/audio/transcription/transcribed_text_document.py @@ -28,9 +28,8 @@ class TranscribedTextDocument(TextDocument): Mapping between text characters spans in this document and corresponding audio spans in the original audio. audio_doc_id: str, optional - Id of the original - :class:`~medkit.core.audio.document.AudioDocument` that was - transcribed, if known. + Identifier for the original :class:`~medkit.core.audio.document.AudioDocument` + that was transcribed, if known. anns: sequence of TextAnnotation, optional Annotations of the document. attrs: sequence of Attribute, optional @@ -75,8 +74,7 @@ def get_containing_audio_spans(self, text_ann_spans: list[AnyTextSpan]) -> list[ the span 15 to 25, then the containing audio span will be the one ranging from 1.0 to 20.0 seconds. - Note that some text annotations maybe be contained in more that one - audio spans. + Note that some text annotations may be contained in more than one audio span. Parameters ---------- diff --git a/medkit/text/ner/_base_simstring_matcher.py b/medkit/text/ner/_base_simstring_matcher.py index 87a54aac..aeff4e49 100644 --- a/medkit/text/ner/_base_simstring_matcher.py +++ b/medkit/text/ner/_base_simstring_matcher.py @@ -59,8 +59,7 @@ class BaseSimstringMatcherRule: Whether to use ASCII-only versions of the rule term and input texts when looking for matches (non-ASCII chars replaced by closest ASCII chars). 
normalizations : list of BaseSimstringMatcherNormalization, optional - Optional list of normalization attributes that should be attached to the - entities created + List of normalization attributes that should be attached to the entities created. """ term: str @@ -157,7 +156,7 @@ class BaseSimstringMatcher(NEROperation): similarity : str, default="jaccard" Similarity metric to use. spacy_tokenization_language : str, optional - 2-letter code (ex: "fr", "en", etc) designating the language of the + 2-letter code (ex: "fr", "en", etc.) designating the language of the spacy model to use for tokenization. If provided, spacy will be used to tokenize input segments and filter out some tokens based on their part-of-speech tags, such as determinants, conjunctions and @@ -174,7 +173,7 @@ class BaseSimstringMatcher(NEROperation): attrs_to_copy : list of str, optional Labels of the attributes that should be copied from the source segment to the created entity. Useful for propagating context - attributes (negation, antecedent, etc). + attributes (negation, antecedent, etc.). name : str, optional Name describing the matcher (defaults to the class name). 
uid : str, optional @@ -387,7 +386,7 @@ def build_simstring_matcher_databases( Rules to add to databases """ # the params passed to simstring.writer are copy/pasted from QuickUMLS - # cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/a3ba0b3559da2574a907f4d41aa0f2c1c0d5ce0a/quickumls/toolbox.py#L173 + # cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/1.4.0/quickumls/toolbox.py#L173 simstring_db_writer = simstring.writer( str(simstring_db_file), 3, # unit of character n-grams @@ -417,7 +416,7 @@ def build_simstring_matcher_databases( rules_db.close() -_TOKENIZATION_PATTERN = re.compile(r"[\w]+|[^\w ]") +_TOKENIZATION_PATTERN = re.compile(r"\w+|[^\w ]") def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length: int) -> Iterator[tuple[int, int]]: @@ -476,7 +475,7 @@ def _build_candidate_ranges_with_regexp(text: str, min_length: int, max_length: # candidate is too long, stop appending tokens if length > max_length: break - yield (start, end) + yield start, end def _build_candidate_ranges_with_spacy( @@ -517,7 +516,7 @@ def _build_candidate_ranges_with_spacy( ['I have', 'have', 'have type', 'type', 'type 2', '2 diabetes', 'diabetes'] """ - # don't allow candidates to start or end with pre/post positions, + # don't allow candidates to start or end with adpositions, # determinants or conjunctions def is_invalid_boundary_token(token): return token.is_punct or token.is_space or token.pos_ in ("ADP", "DET", "SCONJ", "CCONJ", "CONJ") @@ -544,7 +543,7 @@ def is_invalid_boundary_token(token): # candidate is too long, stop appending tokens if length > max_length: break - yield (span.start_char, span.end_char) + yield span.start_char, span.end_char def _get_similarity_score(text_1, text_2, similarity_name, ngram_size=3):