From c264710c3efc06bf71832a7824c4e15eb7f2134f Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Fri, 22 Dec 2023 20:33:40 +0000 Subject: [PATCH] deploy: 7948aaea4c2105a355bbae6b2204d987e58b0a70 --- _modules/mltb2/text.html | 112 ++++++++++++++++++++++++++++++++++++++- api-reference/text.html | 100 ++++++++++++++++++++++++++++++++++ genindex.html | 16 +++++- objects.inv | Bin 1332 -> 1388 bytes searchindex.js | 2 +- 5 files changed, 226 insertions(+), 4 deletions(-) diff --git a/_modules/mltb2/text.html b/_modules/mltb2/text.html index 2c88228..cbf1661 100644 --- a/_modules/mltb2/text.html +++ b/_modules/mltb2/text.html @@ -92,7 +92,12 @@

Source code for mltb2.text

 """
 
 import re
-from typing import Dict, Final, Pattern, Tuple
+from collections import Counter, defaultdict
+from dataclasses import dataclass, field
+from typing import Dict, Final, Iterable, Optional, Pattern, Set, Tuple, Union
+
+from scipy.spatial.distance import cityblock
+from tqdm import tqdm
 
 INVISIBLE_CHARACTERS: Final[Tuple[str, ...]] = (
     "\u200b",  # Zero Width Space (ZWSP) https://www.compart.com/en/unicode/U+200b
@@ -218,6 +223,111 @@ 

Source code for mltb2.text

     text = replace_multiple_whitespaces(text)
     text = text.strip()
     return text
+ + +
def _normalize_counter_to_defaultdict(counter: Counter, max_dimensions: int) -> defaultdict:
    """Normalize a counter to at most ``max_dimensions`` dimensions.

    Only the ``max_dimensions`` most common characters are kept.
    The counter values are normalized by dividing them by the total count
    of *all* entries in ``counter`` (not only the kept ones), so the
    returned values may sum to less than 1.

    Args:
        counter: The counter to normalize.
        max_dimensions: The maximum number of dimensions to use for the normalization.
            Must be greater than 0.
    Returns:
        The normalized counter with a maximum of ``max_dimensions`` dimensions.
        Missing keys default to ``0.0``. An empty mapping is returned when the
        total count is 0.
    """
    normalized_counter: defaultdict = defaultdict(float)
    total_count = sum(counter.values())
    # A Counter may hold zero or negative counts, so the total can be 0 even
    # when the counter is not empty; avoid dividing by zero in that case.
    if total_count == 0:
        return normalized_counter
    for char, count in counter.most_common(max_dimensions):
        normalized_counter[char] = count / total_count
    return normalized_counter
+ + +
@dataclass
class TextDistance:
    """Calculate the distance between two texts.

    One text (or multiple texts) must first be fitted with :func:`~TextDistance.fit`.
    After that the distance to other given texts can be calculated with :func:`~TextDistance.distance`.
    After the distance was calculated the first time, the class can
    not be fitted again.

    Args:
        show_progress_bar: Show a progressbar during processing.
        max_dimensions: The maximum number of dimensions to use for the distance calculation.
            Must be greater than 0.
    Raises:
        ValueError: If ``max_dimensions`` is not greater than 0.
    """

    show_progress_bar: bool = False
    max_dimensions: int = 100

    # counter for the text we fit; set to None once it has been normalized
    _char_counter: Optional[Counter] = field(default_factory=Counter, init=False)

    # normalized counter for the text we fit - see _normalize_char_counter
    _normalized_char_counts: Optional[defaultdict] = field(default=None, init=False)

    # set of all counted characters - see _normalize_char_counter
    _counted_char_set: Optional[Set[str]] = field(default=None, init=False)

    def __post_init__(self) -> None:
        """Validate the init arguments."""
        if not self.max_dimensions > 0:
            raise ValueError("'max_dimensions' must be > 0!")

    def fit(self, text: Union[str, Iterable[str]]) -> None:
        """Fit the text.

        May be called several times; the character counts accumulate.

        Args:
            text: The text to fit. Either a single string or an iterable of strings.
        Raises:
            ValueError: If :func:`~TextDistance.fit` is called after
                :func:`~TextDistance.distance`.
        """
        # The counter is dropped (set to None) by the first distance calculation.
        if self._char_counter is None:
            raise ValueError("Fit mut not be called after distance calculation!")

        if isinstance(text, str):
            self._char_counter.update(text)
        else:
            for t in tqdm(text, disable=not self.show_progress_bar):
                self._char_counter.update(t)

    def _normalize_char_counter(self) -> None:
        """Normalize the char counter to a defaultdict.

        This supports lazy postprocessing of the char counter: the work is
        done on the first call only, later calls are no-ops.
        """
        if self._char_counter is not None:
            self._normalized_char_counts = _normalize_counter_to_defaultdict(self._char_counter, self.max_dimensions)
            self._char_counter = None
            self._counted_char_set = set(self._normalized_char_counts)

    def distance(self, text: str) -> float:
        """Calculate the distance between the fitted text and the given text.

        This implementation uses the Manhattan distance (:func:`scipy.spatial.distance.cityblock`).
        The distance is only calculated for the ``max_dimensions`` most common characters.

        Args:
            text: The text to calculate the Manhattan distance to.
        Returns:
            The Manhattan distance between the normalized character counts
            of the fitted text and of ``text``.
        """
        self._normalize_char_counter()
        fitted_counts = self._normalized_char_counts
        text_counts = _normalize_counter_to_defaultdict(Counter(text), self.max_dimensions)
        # Freeze the union once so both vectors are built in the same order.
        chars = tuple(self._counted_char_set.union(text_counts))  # type: ignore
        # Use ``.get`` instead of indexing: indexing a defaultdict inserts the
        # missing key, which would grow the fitted state on every call.
        all_vector = [fitted_counts.get(c, 0.0) for c in chars]  # type: ignore
        text_vector = [text_counts.get(c, 0.0) for c in chars]
        return cityblock(all_vector, text_vector)
diff --git a/api-reference/text.html b/api-reference/text.html index fe8ed89..e84d3bb 100644 --- a/api-reference/text.html +++ b/api-reference/text.html @@ -60,6 +60,13 @@
  • somajo
  • somajo_transformers
  • text +
    +
    +class mltb2.text.TextDistance(show_progress_bar: bool = False, max_dimensions: int = 100)[source]
    +

    Bases: object

    +

    Calculate the distance between two texts.

    +

    One text (or multiple texts) must first be fitted with fit(). +After that the distance to other given texts can be calculated with distance(). +After the distance was calculated the first time, the class can +not be fitted again.

    +
    +
    Parameters:
    +
      +
    • show_progress_bar (bool) – Show a progressbar during processing.

    • +
    • max_dimensions (int) – The maximum number of dimensions to use for the distance calculation. +Must be greater than 0.

    • +
    +
    +
    Raises:
    +

    ValueError – If max_dimensions is not greater than 0.

    +
    +
    +
    +
    +_normalize_char_counter() None[source]
    +

    Normalize the char counter to a defaultdict.

    +

    This supports lazy postprocessing of the char counter.

    +
    +
    Return type:
    +

    None

    +
    +
    +
    + +
    +
    +distance(text) float[source]
    +

    Calculate the distance between the fitted text and the given text.

    +

    This implementation uses the Manhattan distance (scipy.spatial.distance.cityblock()). +The distance is only calculated for the max_dimensions most common characters.

    +
    +
    Parameters:
    +

    text – The text to calculate the Manhattan distance to.

    +
    +
    Return type:
    +

    float

    +
    +
    +
    + +
    +
    +fit(text: str | Iterable[str]) None[source]
    +

    Fit the text.

    +
    +
    Parameters:
    +

    text (str | Iterable[str]) – The text to fit.

    +
    +
    Raises:
    +

    ValueError – If fit() is called after + distance().

    +
    +
    Return type:
    +

    None

    +
    +
    +
    + +
    + +
    +
    +mltb2.text._normalize_counter_to_defaultdict(counter: Counter, max_dimensions: int) defaultdict[source]
    +

    Normalize a counter to max_dimensions.

    +

    The number of dimensions is limited to max_dimensions +of the most common characters. +The counter values are normalized by dividing them by the total count.

    +
    +
    Parameters:
    +
      +
    • counter (Counter) – The counter to normalize.

    • +
    • max_dimensions (int) – The maximum number of dimensions to use for the normalization. +Must be greater than 0.

    • +
    +
    +
    Returns:
    +

    The normalized counter with a maximum of max_dimensions dimensions.

    +
    +
    Return type:
    +

    defaultdict

    +
    +
    +
    +
    mltb2.text.clean_all_invisible_chars_and_whitespaces(text: str) str[source]
    diff --git a/genindex.html b/genindex.html index 63651c1..3b1111d 100644 --- a/genindex.html +++ b/genindex.html @@ -139,6 +139,10 @@

    _

  • _load_colon_data() (in module mltb2.data)
  • _load_colon_label() (in module mltb2.data) +
  • +
  • _normalize_char_counter() (mltb2.text.TextDistance method) +
  • +
  • _normalize_counter_to_defaultdict() (in module mltb2.text)
  • @@ -171,6 +175,10 @@

    D

    +
    @@ -196,6 +204,8 @@

    F