Skip to content

Commit

Permalink
deploy: 7948aae
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilipMay committed Dec 22, 2023
1 parent 32492aa commit c264710
Show file tree
Hide file tree
Showing 5 changed files with 226 additions and 4 deletions.
112 changes: 111 additions & 1 deletion _modules/mltb2/text.html
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,12 @@ <h1>Source code for mltb2.text</h1><div class="highlight"><pre>
<span class="sd">&quot;&quot;&quot;</span>

<span class="kn">import</span> <span class="nn">re</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">Final</span><span class="p">,</span> <span class="n">Pattern</span><span class="p">,</span> <span class="n">Tuple</span>
<span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">Counter</span><span class="p">,</span> <span class="n">defaultdict</span>
<span class="kn">from</span> <span class="nn">dataclasses</span> <span class="kn">import</span> <span class="n">dataclass</span><span class="p">,</span> <span class="n">field</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">Final</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Pattern</span><span class="p">,</span> <span class="n">Set</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">,</span> <span class="n">Union</span>

<span class="kn">from</span> <span class="nn">scipy.spatial.distance</span> <span class="kn">import</span> <span class="n">cityblock</span>
<span class="kn">from</span> <span class="nn">tqdm</span> <span class="kn">import</span> <span class="n">tqdm</span>

<span class="n">INVISIBLE_CHARACTERS</span><span class="p">:</span> <span class="n">Final</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">]]</span> <span class="o">=</span> <span class="p">(</span>
<span class="s2">&quot;</span><span class="se">\u200b</span><span class="s2">&quot;</span><span class="p">,</span> <span class="c1"># Zero Width Space (ZWSP) https://www.compart.com/en/unicode/U+200b</span>
Expand Down Expand Up @@ -218,6 +223,111 @@ <h1>Source code for mltb2.text</h1><div class="highlight"><pre>
<span class="n">text</span> <span class="o">=</span> <span class="n">replace_multiple_whitespaces</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="n">text</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
<span class="k">return</span> <span class="n">text</span></div>


<div class="viewcode-block" id="_normalize_counter_to_defaultdict"><a class="viewcode-back" href="../../api-reference/text.html#mltb2.text._normalize_counter_to_defaultdict">[docs]</a><span class="k">def</span> <span class="nf">_normalize_counter_to_defaultdict</span><span class="p">(</span><span class="n">counter</span><span class="p">:</span> <span class="n">Counter</span><span class="p">,</span> <span class="n">max_dimensions</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">defaultdict</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Normalize a counter to ``max_dimensions``.</span>

<span class="sd"> The number of dimensions is limited to ``max_dimensions``</span>
<span class="sd"> of the most common characters.</span>
<span class="sd"> The counter values are normalized by dividing them by the total count.</span>

<span class="sd"> Args:</span>
<span class="sd"> counter: The counter to normalize.</span>
<span class="sd"> max_dimensions: The maximum number of dimensions to use for the normalization.</span>
<span class="sd"> Must be greater than 0.</span>
<span class="sd"> Returns:</span>
<span class="sd"> The normalized counter with a maximum of ``max_dimensions`` dimensions.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">total_count</span> <span class="o">=</span> <span class="nb">sum</span><span class="p">(</span><span class="n">counter</span><span class="o">.</span><span class="n">values</span><span class="p">())</span>
<span class="n">normalized_counter</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">float</span><span class="p">)</span>
<span class="k">for</span> <span class="n">char</span><span class="p">,</span> <span class="n">count</span> <span class="ow">in</span> <span class="n">counter</span><span class="o">.</span><span class="n">most_common</span><span class="p">(</span><span class="n">max_dimensions</span><span class="p">):</span>
<span class="n">normalized_counter</span><span class="p">[</span><span class="n">char</span><span class="p">]</span> <span class="o">=</span> <span class="n">count</span> <span class="o">/</span> <span class="n">total_count</span>
<span class="k">return</span> <span class="n">normalized_counter</span></div>


<div class="viewcode-block" id="TextDistance"><a class="viewcode-back" href="../../api-reference/text.html#mltb2.text.TextDistance">[docs]</a><span class="nd">@dataclass</span>
<span class="k">class</span> <span class="nc">TextDistance</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Calculate the distance between two texts.</span>

<span class="sd"> One text (or multiple texts) must first be fitted with :func:`~TextDistance.fit`.</span>
<span class="sd"> After that the distance to other given texts can be calculated with :func:`~TextDistance.distance`.</span>
<span class="sd"> After the distance was calculated the first time, the class can</span>
<span class="sd"> not be fitted again.</span>

<span class="sd"> Args:</span>
<span class="sd"> show_progress_bar: Show a progressbar during processing.</span>
<span class="sd"> max_dimensions: The maximum number of dimensions to use for the distance calculation.</span>
<span class="sd"> Must be greater than 0.</span>
<span class="sd"> Raises:</span>
<span class="sd"> ValueError: If ``max_dimensions`` is not greater than 0.</span>
<span class="sd"> &quot;&quot;&quot;</span>

<span class="n">show_progress_bar</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">max_dimensions</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span>

<span class="c1"># counter for the text we fit</span>
<span class="n">_char_counter</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Counter</span><span class="p">]</span> <span class="o">=</span> <span class="n">field</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="n">Counter</span><span class="p">,</span> <span class="n">init</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>

<span class="c1"># normalized counter for the text we fit - see _normalize_char_counter</span>
<span class="n">_normalized_char_counts</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">defaultdict</span><span class="p">]</span> <span class="o">=</span> <span class="n">field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">init</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>

<span class="c1"># set of all counted characters - see _normalize_char_counter</span>
<span class="n">_counted_char_set</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Set</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">init</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>

<span class="k">def</span> <span class="nf">__post_init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Do post init.&quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_dimensions</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;&#39;max_dimensions&#39; must be &gt; 0!&quot;</span><span class="p">)</span>

<div class="viewcode-block" id="TextDistance.fit"><a class="viewcode-back" href="../../api-reference/text.html#mltb2.text.TextDistance.fit">[docs]</a> <span class="k">def</span> <span class="nf">fit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Fit the text.</span>

<span class="sd"> Args:</span>
<span class="sd"> text: The text to fit.</span>
<span class="sd"> Raises:</span>
<span class="sd"> ValueError: If :func:`~TextDistance.fit` is called after</span>
<span class="sd"> :func:`~TextDistance.distance`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_char_counter</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Fit mut not be called after distance calculation!&quot;</span><span class="p">)</span>

<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">text</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_char_counter</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">tqdm</span><span class="p">(</span><span class="n">text</span><span class="p">,</span> <span class="n">disable</span><span class="o">=</span><span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">show_progress_bar</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_char_counter</span><span class="o">.</span><span class="n">update</span><span class="p">(</span><span class="n">t</span><span class="p">)</span></div>

<div class="viewcode-block" id="TextDistance._normalize_char_counter"><a class="viewcode-back" href="../../api-reference/text.html#mltb2.text.TextDistance._normalize_char_counter">[docs]</a> <span class="k">def</span> <span class="nf">_normalize_char_counter</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Normalize the char counter to a defaultdict.</span>

<span class="sd"> This supports lazy postprocessing of the char counter.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_char_counter</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_normalized_char_counts</span> <span class="o">=</span> <span class="n">_normalize_counter_to_defaultdict</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_char_counter</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_dimensions</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_char_counter</span> <span class="o">=</span> <span class="kc">None</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_counted_char_set</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_normalized_char_counts</span><span class="p">)</span></div>

<div class="viewcode-block" id="TextDistance.distance"><a class="viewcode-back" href="../../api-reference/text.html#mltb2.text.TextDistance.distance">[docs]</a> <span class="k">def</span> <span class="nf">distance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Calculate the distance between the fitted text and the given text.</span>

<span class="sd"> This implementation uses the Manhattan distance (:func:`scipy.spatial.distance.cityblock`).</span>
<span class="sd"> The distance is only calculated for ``max_dimensions`` most common characters.</span>

<span class="sd"> Args:</span>
<span class="sd"> text: The text to calculate the Manhattan distance to.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_normalize_char_counter</span><span class="p">()</span>
<span class="n">all_vector</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">text_vector</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">text_count</span> <span class="o">=</span> <span class="n">Counter</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="n">text_count_defaultdict</span> <span class="o">=</span> <span class="n">_normalize_counter_to_defaultdict</span><span class="p">(</span><span class="n">text_count</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_dimensions</span><span class="p">)</span>
<span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_counted_char_set</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">text_count_defaultdict</span><span class="p">):</span> <span class="c1"># type: ignore</span>
<span class="n">all_vector</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_normalized_char_counts</span><span class="p">[</span><span class="n">c</span><span class="p">]</span> <span class="c1"># type: ignore</span>
<span class="p">)</span> <span class="c1"># if c is not in defaultdict, it will return 0</span>
<span class="n">text_vector</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">text_count_defaultdict</span><span class="p">[</span><span class="n">c</span><span class="p">])</span> <span class="c1"># if c is not in defaultdict, it will return 0</span>
<span class="k">return</span> <span class="n">cityblock</span><span class="p">(</span><span class="n">all_vector</span><span class="p">,</span> <span class="n">text_vector</span><span class="p">)</span></div></div>
</pre></div>

</div>
Expand Down
Loading

0 comments on commit c264710

Please sign in to comment.