Skip to content

Commit

Permalink
deploy: db18e32
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilipMay committed Sep 4, 2023
1 parent e4100a6 commit 664b749
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 4 deletions.
45 changes: 44 additions & 1 deletion _modules/mltb2/somajo.html
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ <h1>Source code for mltb2.somajo</h1><div class="highlight"><pre>

<span class="kn">from</span> <span class="nn">abc</span> <span class="kn">import</span> <span class="n">ABC</span>
<span class="kn">from</span> <span class="nn">dataclasses</span> <span class="kn">import</span> <span class="n">dataclass</span><span class="p">,</span> <span class="n">field</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Container</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Set</span><span class="p">,</span> <span class="n">Union</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Container</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Set</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">,</span> <span class="n">Union</span>

<span class="kn">from</span> <span class="nn">somajo</span> <span class="kn">import</span> <span class="n">SoMaJo</span>
<span class="kn">from</span> <span class="nn">tqdm</span> <span class="kn">import</span> <span class="n">tqdm</span>
Expand Down Expand Up @@ -266,6 +266,49 @@ <h1>Source code for mltb2.somajo</h1><div class="highlight"><pre>
<span class="n">sentences</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">somajo</span><span class="o">.</span><span class="n">tokenize_text</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">extract_token_class_set</span><span class="p">(</span><span class="n">sentences</span><span class="p">,</span> <span class="n">keep_token_classes</span><span class="o">=</span><span class="s2">&quot;URL&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">result</span></div></div>


<div class="viewcode-block" id="UrlSwapper"><a class="viewcode-back" href="../../api-reference/somajo.html#mltb2.somajo.UrlSwapper">[docs]</a><span class="nd">@dataclass</span>
<span class="k">class</span> <span class="nc">UrlSwapper</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Tool to swap (and reverse swap) links with a numbered replacement link.</span>

<span class="sd"> Args:</span>
<span class="sd"> token_extractor: The sentence token extractor to be used.</span>
<span class="sd"> url_pattern: The pattern to use for replacement. One ``{}`` marks the place where to put the number.</span>
<span class="sd"> &quot;&quot;&quot;</span>

<span class="n">token_extractor</span><span class="p">:</span> <span class="n">TokenExtractor</span>
<span class="n">url_pattern</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;https://link-</span><span class="si">{}</span><span class="s2">.com&quot;</span>
<span class="n">_url_map</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">field</span><span class="p">(</span><span class="n">init</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="nb">repr</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="c1"># map from real url to swapped url</span>

<span class="k">def</span> <span class="nf">__post_init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Do post init.&quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_url_map</span> <span class="o">=</span> <span class="p">{}</span>

<div class="viewcode-block" id="UrlSwapper.swap_urls"><a class="viewcode-back" href="../../api-reference/somajo.html#mltb2.somajo.UrlSwapper.swap_urls">[docs]</a> <span class="k">def</span> <span class="nf">swap_urls</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Swap the urls of the text.&quot;&quot;&quot;</span>
<span class="n">url_set</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">token_extractor</span><span class="o">.</span><span class="n">extract_url_set</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="k">for</span> <span class="n">url</span> <span class="ow">in</span> <span class="n">url_set</span><span class="p">:</span>
<span class="k">if</span> <span class="n">url</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_url_map</span><span class="p">:</span> <span class="c1"># if url is unknown: add it</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_url_map</span><span class="p">[</span><span class="n">url</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">url_pattern</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_url_map</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span>
<span class="n">text</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="n">url</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_url_map</span><span class="p">[</span><span class="n">url</span><span class="p">])</span> <span class="c1"># replace</span>
<span class="k">return</span> <span class="n">text</span></div>

<div class="viewcode-block" id="UrlSwapper.reverse_swap_urls"><a class="viewcode-back" href="../../api-reference/somajo.html#mltb2.somajo.UrlSwapper.reverse_swap_urls">[docs]</a> <span class="k">def</span> <span class="nf">reverse_swap_urls</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Set</span><span class="p">[</span><span class="nb">str</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Revert the url swap.</span>

<span class="sd"> Returns:</span>
<span class="sd"> The reverted text and a ``set`` of URLs that were unknown to the ``UrlSwapper``.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">reverse_url_map</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">k</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_url_map</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span> <span class="c1"># map from swapped url to real url</span>
<span class="n">url_set</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">token_extractor</span><span class="o">.</span><span class="n">extract_url_set</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
<span class="n">no_reverse_swap_urls</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">for</span> <span class="n">url</span> <span class="ow">in</span> <span class="n">url_set</span><span class="p">:</span>
<span class="k">if</span> <span class="n">url</span> <span class="ow">in</span> <span class="n">reverse_url_map</span><span class="p">:</span>
<span class="n">text</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="n">url</span><span class="p">,</span> <span class="n">reverse_url_map</span><span class="p">[</span><span class="n">url</span><span class="p">])</span> <span class="c1"># replace</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">no_reverse_swap_urls</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">url</span><span class="p">)</span>
<span class="k">return</span> <span class="n">text</span><span class="p">,</span> <span class="n">no_reverse_swap_urls</span></div></div>
</pre></div>

</div>
Expand Down
51 changes: 51 additions & 0 deletions api-reference/somajo.html
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@
<li class="toctree-l4"><a class="reference internal" href="#mltb2.somajo.TokenExtractor.extract_url_set"><code class="docutils literal notranslate"><span class="pre">TokenExtractor.extract_url_set()</span></code></a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="#mltb2.somajo.UrlSwapper"><code class="docutils literal notranslate"><span class="pre">UrlSwapper</span></code></a><ul>
<li class="toctree-l4"><a class="reference internal" href="#mltb2.somajo.UrlSwapper.reverse_swap_urls"><code class="docutils literal notranslate"><span class="pre">UrlSwapper.reverse_swap_urls()</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="#mltb2.somajo.UrlSwapper.swap_urls"><code class="docutils literal notranslate"><span class="pre">UrlSwapper.swap_urls()</span></code></a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="#mltb2.somajo.detokenize"><code class="docutils literal notranslate"><span class="pre">detokenize()</span></code></a></li>
<li class="toctree-l3"><a class="reference internal" href="#mltb2.somajo.extract_token_class_set"><code class="docutils literal notranslate"><span class="pre">extract_token_class_set()</span></code></a></li>
</ul>
Expand Down Expand Up @@ -252,6 +257,52 @@

</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="mltb2.somajo.UrlSwapper">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">mltb2.somajo.</span></span><span class="sig-name descname"><span class="pre">UrlSwapper</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">token_extractor</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#mltb2.somajo.TokenExtractor" title="mltb2.somajo.TokenExtractor"><span class="pre">TokenExtractor</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">url_pattern</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><span class="pre">str</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'https://link-{}.com'</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/mltb2/somajo.html#UrlSwapper"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mltb2.somajo.UrlSwapper" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
<p>Tool to swap (and reverse swap) links with a numbered replacement link.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>token_extractor</strong> (<a class="reference internal" href="#mltb2.somajo.TokenExtractor" title="mltb2.somajo.TokenExtractor"><em>TokenExtractor</em></a>) – The sentence token extractor to be used.</p></li>
<li><p><strong>url_pattern</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – The pattern to use for replacement. One <code class="docutils literal notranslate"><span class="pre">{}</span></code> marks the place where to put the number.</p></li>
</ul>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="mltb2.somajo.UrlSwapper.reverse_swap_urls">
<span class="sig-name descname"><span class="pre">reverse_swap_urls</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">text</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><span class="pre">str</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.11)"><span class="pre">Tuple</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><span class="pre">str</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Set" title="(in Python v3.11)"><span class="pre">Set</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><span class="pre">str</span></a><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></span><a class="reference internal" href="../_modules/mltb2/somajo.html#UrlSwapper.reverse_swap_urls"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mltb2.somajo.UrlSwapper.reverse_swap_urls" title="Permalink to this definition"></a></dt>
<dd><p>Revert the url swap.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>The reverted text and a <code class="docutils literal notranslate"><span class="pre">set</span></code> of URLs that were unknown to the <code class="docutils literal notranslate"><span class="pre">UrlSwapper</span></code>.</p>
</dd>
<dt class="field-even">Parameters<span class="colon">:</span></dt>
<dd class="field-even"><p><strong>text</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – The text in which replacement URLs are swapped back to the original URLs.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Tuple" title="(in Python v3.11)"><em>Tuple</em></a>[<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)">str</a>, <a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Set" title="(in Python v3.11)"><em>Set</em></a>[<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)">str</a>]]</p>
</dd>
</dl>
</dd></dl>

<dl class="py method">
<dt class="sig sig-object py" id="mltb2.somajo.UrlSwapper.swap_urls">
<span class="sig-name descname"><span class="pre">swap_urls</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">text</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><span class="pre">str</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><span class="pre">str</span></a></span></span><a class="reference internal" href="../_modules/mltb2/somajo.html#UrlSwapper.swap_urls"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mltb2.somajo.UrlSwapper.swap_urls" title="Permalink to this definition"></a></dt>
<dd><p>Swap the urls of the text.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>text</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – The text whose URLs are replaced with numbered replacement URLs.</p>
</dd>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)">str</a></p>
</dd>
</dl>
</dd></dl>

</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="mltb2.somajo.detokenize">
<span class="sig-prename descclassname"><span class="pre">mltb2.somajo.</span></span><span class="sig-name descname"><span class="pre">detokenize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tokens</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><span class="pre">str</span></a></span></span><a class="reference internal" href="../_modules/mltb2/somajo.html#detokenize"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mltb2.somajo.detokenize" title="Permalink to this definition"></a></dt>
Expand Down
Loading

0 comments on commit 664b749

Please sign in to comment.