diff --git a/_modules/mltb2/arangodb.html b/_modules/mltb2/arangodb.html
index 2ec9b25..1673437 100644
--- a/_modules/mltb2/arangodb.html
+++ b/_modules/mltb2/arangodb.html
@@ -90,17 +90,31 @@ <h1>Source code for mltb2.arangodb</h1><div class="highlight"><pre>
 <span class="sd">&quot;&quot;&quot;</span>
 
 
+<span class="kn">import</span> <span class="nn">gzip</span>
+<span class="kn">from</span> <span class="nn">argparse</span> <span class="kn">import</span> <span class="n">ArgumentParser</span>
 <span class="kn">from</span> <span class="nn">contextlib</span> <span class="kn">import</span> <span class="n">closing</span>
 <span class="kn">from</span> <span class="nn">dataclasses</span> <span class="kn">import</span> <span class="n">dataclass</span>
-<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">,</span> <span class="n">Union</span>
+<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">,</span> <span class="n">Union</span>
 
+<span class="kn">import</span> <span class="nn">jsonlines</span>
 <span class="kn">from</span> <span class="nn">arango</span> <span class="kn">import</span> <span class="n">ArangoClient</span>
 <span class="kn">from</span> <span class="nn">arango.database</span> <span class="kn">import</span> <span class="n">StandardDatabase</span>
 <span class="kn">from</span> <span class="nn">dotenv</span> <span class="kn">import</span> <span class="n">dotenv_values</span>
+<span class="kn">from</span> <span class="nn">tqdm</span> <span class="kn">import</span> <span class="n">tqdm</span>
 
 <span class="kn">from</span> <span class="nn">mltb2.db</span> <span class="kn">import</span> <span class="n">AbstractBatchDataManager</span>
 
 
+<div class="viewcode-block" id="_check_config_keys"><a class="viewcode-back" href="../../api-reference/arangodb.html#mltb2.arangodb._check_config_keys">[docs]</a><span class="k">def</span> <span class="nf">_check_config_keys</span><span class="p">(</span><span class="n">config</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> <span class="n">expected_config_keys</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;Check if all expected keys are in config.</span>
+
+<span class="sd">    This is useful to check if a config file contains all necessary keys.</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+    <span class="k">for</span> <span class="n">expected_config_key</span> <span class="ow">in</span> <span class="n">expected_config_keys</span><span class="p">:</span>
+        <span class="k">if</span> <span class="n">expected_config_key</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">config</span><span class="p">:</span>
+            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Config file must contain &#39;</span><span class="si">{</span><span class="n">expected_config_key</span><span class="si">}</span><span class="s2">&#39;!&quot;</span><span class="p">)</span></div>
+
+
 <div class="viewcode-block" id="ArangoBatchDataManager"><a class="viewcode-back" href="../../api-reference/arangodb.html#mltb2.arangodb.ArangoBatchDataManager">[docs]</a><span class="nd">@dataclass</span>
 <span class="k">class</span> <span class="nc">ArangoBatchDataManager</span><span class="p">(</span><span class="n">AbstractBatchDataManager</span><span class="p">):</span>
 <span class="w">    </span><span class="sd">&quot;&quot;&quot;ArangoDB implementation of the ``AbstractBatchDataManager``.</span>
@@ -170,9 +184,7 @@ <h1>Source code for mltb2.arangodb</h1><div class="highlight"><pre>
             <span class="s2">&quot;attribute_name&quot;</span><span class="p">,</span>
             <span class="s2">&quot;batch_size&quot;</span><span class="p">,</span>
         <span class="p">]</span>
-        <span class="k">for</span> <span class="n">expected_config_file_key</span> <span class="ow">in</span> <span class="n">expected_config_file_keys</span><span class="p">:</span>
-            <span class="k">if</span> <span class="n">expected_config_file_key</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">arango_config</span><span class="p">:</span>
-                <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Config file must contain &#39;</span><span class="si">{</span><span class="n">expected_config_file_key</span><span class="si">}</span><span class="s2">&#39;!&quot;</span><span class="p">)</span>
+        <span class="n">_check_config_keys</span><span class="p">(</span><span class="n">arango_config</span><span class="p">,</span> <span class="n">expected_config_file_keys</span><span class="p">)</span>
 
         <span class="k">return</span> <span class="bp">cls</span><span class="p">(</span>
             <span class="n">hosts</span><span class="o">=</span><span class="n">arango_config</span><span class="p">[</span><span class="s2">&quot;hosts&quot;</span><span class="p">],</span>  <span class="c1"># type: ignore</span>
@@ -235,6 +247,55 @@ <h1>Source code for mltb2.arangodb</h1><div class="highlight"><pre>
             <span class="n">connection</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_connection_factory</span><span class="p">(</span><span class="n">arango_client</span><span class="p">)</span>
             <span class="n">collection</span> <span class="o">=</span> <span class="n">connection</span><span class="o">.</span><span class="n">collection</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">collection_name</span><span class="p">)</span>
             <span class="n">collection</span><span class="o">.</span><span class="n">import_bulk</span><span class="p">(</span><span class="n">batch</span><span class="p">,</span> <span class="n">on_duplicate</span><span class="o">=</span><span class="s2">&quot;update&quot;</span><span class="p">)</span></div></div>
+
+
+<div class="viewcode-block" id="arango_collection_backup"><a class="viewcode-back" href="../../api-reference/arangodb.html#mltb2.arangodb.arango_collection_backup">[docs]</a><span class="k">def</span> <span class="nf">arango_collection_backup</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;Commandline tool to do an ArangoDB backup of a collection.</span>
+
+<span class="sd">    The backup is written to a gzip compressed JSONL file in the current working directory.</span>
+<span class="sd">    Run ``arango-col-backup -h`` to get command line help.</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+    <span class="c1"># argument parsing</span>
+    <span class="n">description</span> <span class="o">=</span> <span class="p">(</span>
+        <span class="s2">&quot;ArangoDB backup of a collection. &quot;</span>
+        <span class="s2">&quot;The backup is written to a gzip compressed JSONL file in the current working directory.&quot;</span>
+    <span class="p">)</span>
+    <span class="n">argument_parser</span> <span class="o">=</span> <span class="n">ArgumentParser</span><span class="p">(</span><span class="n">description</span><span class="o">=</span><span class="n">description</span><span class="p">)</span>
+    <span class="n">argument_parser</span><span class="o">.</span><span class="n">add_argument</span><span class="p">(</span>
+        <span class="s2">&quot;--conf&quot;</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="nb">str</span><span class="p">,</span> <span class="n">required</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">help</span><span class="o">=</span><span class="s2">&quot;Config file containing &#39;hosts&#39;, &#39;db_name&#39;, &#39;username&#39; and &#39;password&#39;.&quot;</span>
+    <span class="p">)</span>
+    <span class="n">argument_parser</span><span class="o">.</span><span class="n">add_argument</span><span class="p">(</span><span class="s2">&quot;--col&quot;</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="nb">str</span><span class="p">,</span> <span class="n">required</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">help</span><span class="o">=</span><span class="s2">&quot;Collection name to backup.&quot;</span><span class="p">)</span>
+    <span class="n">args</span> <span class="o">=</span> <span class="n">argument_parser</span><span class="o">.</span><span class="n">parse_args</span><span class="p">()</span>
+
+    <span class="c1"># load and check config file</span>
+    <span class="n">arango_config</span> <span class="o">=</span> <span class="n">dotenv_values</span><span class="p">(</span><span class="n">args</span><span class="o">.</span><span class="n">conf</span><span class="p">)</span>
+    <span class="n">expected_config_file_keys</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;hosts&quot;</span><span class="p">,</span> <span class="s2">&quot;db_name&quot;</span><span class="p">,</span> <span class="s2">&quot;username&quot;</span><span class="p">,</span> <span class="s2">&quot;password&quot;</span><span class="p">]</span>
+    <span class="n">_check_config_keys</span><span class="p">(</span><span class="n">arango_config</span><span class="p">,</span> <span class="n">expected_config_file_keys</span><span class="p">)</span>
+
+    <span class="n">output_file_name</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;./</span><span class="si">{</span><span class="n">args</span><span class="o">.</span><span class="n">col</span><span class="si">}</span><span class="s2">_backup.jsonl.gz&quot;</span>
+    <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Writing backup to &#39;</span><span class="si">{</span><span class="n">output_file_name</span><span class="si">}</span><span class="s2">&#39;...&quot;</span><span class="p">)</span>
+
+    <span class="k">with</span> <span class="n">closing</span><span class="p">(</span><span class="n">ArangoClient</span><span class="p">(</span><span class="n">hosts</span><span class="o">=</span><span class="n">arango_config</span><span class="p">[</span><span class="s2">&quot;hosts&quot;</span><span class="p">]))</span> <span class="k">as</span> <span class="n">arango_client</span><span class="p">,</span> <span class="n">gzip</span><span class="o">.</span><span class="n">open</span><span class="p">(</span>  <span class="c1"># type: ignore</span>
+        <span class="n">output_file_name</span><span class="p">,</span> <span class="s2">&quot;w&quot;</span>
+    <span class="p">)</span> <span class="k">as</span> <span class="n">gzip_out</span><span class="p">:</span>
+        <span class="n">connection</span> <span class="o">=</span> <span class="n">arango_client</span><span class="o">.</span><span class="n">db</span><span class="p">(</span>
+            <span class="n">arango_config</span><span class="p">[</span><span class="s2">&quot;db_name&quot;</span><span class="p">],</span>  <span class="c1"># type: ignore</span>
+            <span class="n">arango_config</span><span class="p">[</span><span class="s2">&quot;username&quot;</span><span class="p">],</span>  <span class="c1"># type: ignore</span>
+            <span class="n">arango_config</span><span class="p">[</span><span class="s2">&quot;password&quot;</span><span class="p">],</span>  <span class="c1"># type: ignore</span>
+        <span class="p">)</span>
+        <span class="n">jsonlines_writer</span> <span class="o">=</span> <span class="n">jsonlines</span><span class="o">.</span><span class="n">Writer</span><span class="p">(</span><span class="n">gzip_out</span><span class="p">)</span>  <span class="c1"># type: ignore</span>
+        <span class="k">try</span><span class="p">:</span>
+            <span class="n">cursor</span> <span class="o">=</span> <span class="n">connection</span><span class="o">.</span><span class="n">aql</span><span class="o">.</span><span class="n">execute</span><span class="p">(</span>
+                <span class="s2">&quot;FOR doc IN @@coll RETURN doc&quot;</span><span class="p">,</span>
+                <span class="n">bind_vars</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;@coll&quot;</span><span class="p">:</span> <span class="n">args</span><span class="o">.</span><span class="n">col</span><span class="p">},</span>
+                <span class="n">batch_size</span><span class="o">=</span><span class="mi">100</span><span class="p">,</span>
+                <span class="n">max_runtime</span><span class="o">=</span><span class="mi">60</span> <span class="o">*</span> <span class="mi">60</span><span class="p">,</span>  <span class="c1"># type: ignore # 1 hour</span>
+                <span class="n">stream</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+            <span class="p">)</span>
+            <span class="k">for</span> <span class="n">doc</span> <span class="ow">in</span> <span class="n">tqdm</span><span class="p">(</span><span class="n">cursor</span><span class="p">):</span>
+                <span class="n">jsonlines_writer</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">doc</span><span class="p">)</span>
+        <span class="k">finally</span><span class="p">:</span>
+            <span class="n">cursor</span><span class="o">.</span><span class="n">close</span><span class="p">(</span><span class="n">ignore_missing</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>  <span class="c1"># type: ignore</span></div>
 </pre></div>
 
            </div>
diff --git a/api-reference/arangodb.html b/api-reference/arangodb.html
index 585ad47..07f1ad2 100644
--- a/api-reference/arangodb.html
+++ b/api-reference/arangodb.html
@@ -59,6 +59,8 @@
 <li class="toctree-l4"><a class="reference internal" href="#mltb2.arangodb.ArangoBatchDataManager.save_batch"><code class="docutils literal notranslate"><span class="pre">ArangoBatchDataManager.save_batch()</span></code></a></li>
 </ul>
 </li>
+<li class="toctree-l3"><a class="reference internal" href="#mltb2.arangodb._check_config_keys"><code class="docutils literal notranslate"><span class="pre">_check_config_keys()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#mltb2.arangodb.arango_collection_backup"><code class="docutils literal notranslate"><span class="pre">arango_collection_backup()</span></code></a></li>
 </ul>
 </li>
 <li class="toctree-l2"><a class="reference internal" href="data.html"><code class="xref py py-mod docutils literal notranslate"><span class="pre">data</span></code></a></li>
@@ -224,6 +226,37 @@
 
 </dd></dl>
 
+<dl class="py function">
+<dt class="sig sig-object py" id="mltb2.arangodb._check_config_keys">
+<span class="sig-prename descclassname"><span class="pre">mltb2.arangodb.</span></span><span class="sig-name descname"><span class="pre">_check_config_keys</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Dict" title="(in Python v3.12)"><span class="pre">Dict</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><span class="pre">str</span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><span class="pre">str</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)"><span class="pre">None</span></a><span class="p"><span class="pre">]</span></span></span></em>, <em class="sig-param"><span class="n"><span class="pre">expected_config_keys</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Sequence" title="(in Python v3.12)"><span class="pre">Sequence</span></a><span class="p"><span class="pre">[</span></span><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><span class="pre">str</span></a><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)"><span class="pre">None</span></a></span></span><a class="reference internal" href="../_modules/mltb2/arangodb.html#_check_config_keys"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mltb2.arangodb._check_config_keys" title="Permalink to this definition"></a></dt>
+<dd><p>Check if all expected keys are in config.</p>
+<p>This is useful to check if a config file contains all necessary keys.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>config</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Dict" title="(in Python v3.12)"><em>Dict</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> | </em><em>None</em><em>]</em>) – </p></li>
+<li><p><strong>expected_config_keys</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Sequence" title="(in Python v3.12)"><em>Sequence</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>]</em>) – </p></li>
+</ul>
+</dd>
+<dt class="field-even">Return type<span class="colon">:</span></dt>
+<dd class="field-even"><p>None</p>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
+<dt class="sig sig-object py" id="mltb2.arangodb.arango_collection_backup">
+<span class="sig-prename descclassname"><span class="pre">mltb2.arangodb.</span></span><span class="sig-name descname"><span class="pre">arango_collection_backup</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)"><span class="pre">None</span></a></span></span><a class="reference internal" href="../_modules/mltb2/arangodb.html#arango_collection_backup"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#mltb2.arangodb.arango_collection_backup" title="Permalink to this definition"></a></dt>
+<dd><p>Commandline tool to do an ArangoDB backup of a collection.</p>
+<p>The backup is written to a gzip compressed JSONL file in the current working directory.
+Run <code class="docutils literal notranslate"><span class="pre">arango-col-backup</span> <span class="pre">-h</span></code> to get command line help.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>None</p>
+</dd>
+</dl>
+</dd></dl>
+
 </section>
 
 
diff --git a/genindex.html b/genindex.html
index 2d8a0a4..eabddd7 100644
--- a/genindex.html
+++ b/genindex.html
@@ -124,6 +124,8 @@ <h2 id="_">_</h2>
 </li>
       </ul></li>
       <li><a href="api-reference/arangodb.html#mltb2.arangodb.ArangoBatchDataManager._arango_client_factory">_arango_client_factory() (mltb2.arangodb.ArangoBatchDataManager method)</a>
+</li>
+      <li><a href="api-reference/arangodb.html#mltb2.arangodb._check_config_keys">_check_config_keys() (in module mltb2.arangodb)</a>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
@@ -159,6 +161,8 @@ <h2 id="A">A</h2>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="api-reference/arangodb.html#mltb2.arangodb.arango_collection_backup">arango_collection_backup() (in module mltb2.arangodb)</a>
+</li>
       <li><a href="api-reference/arangodb.html#mltb2.arangodb.ArangoBatchDataManager">ArangoBatchDataManager (class in mltb2.arangodb)</a>
 </li>
   </ul></td>
diff --git a/objects.inv b/objects.inv
index 60b6f88..307668a 100644
Binary files a/objects.inv and b/objects.inv differ
diff --git a/searchindex.js b/searchindex.js
index e1f8e1d..c770b77 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["api-reference", "api-reference/arangodb", "api-reference/data", "api-reference/db", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/arangodb.rst", "api-reference/data.rst", "api-reference/db.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">arangodb</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">data</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">db</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">fasttext</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">files</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">md</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">openai</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">optuna</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">plot</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">somajo</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">somajo_transformers</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">text</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">transformers</span></code>", "MLTB2 Documentation"], "terms": {"arangodb": [0, 14], "data": [0, 1, 3, 5, 8, 13, 14], "db": [0, 8, 14], "fasttext": [0, 14], "file": [0, 1, 2, 4, 7, 14], "md": [0, 14], "openai": [0, 14], "optuna": [0, 14], "plot": [0, 14], "somajo": [0, 11, 14], "somajo_transform": [0, 14], "text": [0, 4, 6, 7, 10, 11, 13, 14], "transform": [0, 11, 14], "util": [1, 3, 5, 13], "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "pip": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "instal": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13], "necessari": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "depend": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "class": [1, 3, 4, 6, 7, 8, 10, 11, 12, 13], "arangobatchdatamanag": 1, "host": [1, 13], "str": [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 13], "sequenc": [1, 3], "db_name": 1, "usernam": 1, "password": 1, "collection_nam": 1, "attribute_nam": 1, "batch_siz": 1, "int": [1, 4, 6, 7, 8, 9, 11, 12, 13], "20": [1, 8], "aql_overwrit": 1, "none": [1, 2, 3, 5, 7, 9, 10, 12, 13], "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "base": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "abstractbatchdatamanag": [1, 3], "implement": [1, 3, 8, 12], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "databas": [1, 3], "name": [1, 7], "document": 1, "from": [1, 2, 3, 4, 5, 7, 8, 10, 12], "collect": 1, "ar": [1, 2, 4, 6, 7, 8, 12, 13], "process": [1, 3, 6, 7, 10, 11, 12, 13], "attribut": 1, "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "check": [1, 7, 8, 12], "alreadi": [1, 6], "If": [1, 2, 5, 7, 9, 10, 11, 12, 13], "present": 1, "avail": [1, 14], "consid": 1, "The": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "batch": [1, 3], "size": 1, "aql": 1, "string": [1, 7, 10], "overwrit": [1, 7], "default": [1, 2, 5, 9], "_arango_client_factori": 1, "arangocli": 1, "creat": [1, 5, 7, 9], "an": [1, 3, 4, 7, 8, 10, 11, 13], "client": 1, "return": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "type": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "_connection_factori": 1, "arango_cli": 1, "standarddatabas": 1, "connect": 1, "classmethod": [1, 7], "from_config_fil": 1, "config_file_nam": 1, "construct": [1, 7], "config": 1, "must": [1, 7, 8, 12], "contain": [1, 2, 4, 10, 12, 13], "valu": [1, 8, 9, 12], "exampl": [1, 7, 8, 10, 12], "http": [1, 2, 10], "com": [1, 10], "my_ml_databas": 1, "my_usernam": 1, "secret": 1, "my_ml_data_collect": 1, "processing_metadata": 1, "100": [1, 8, 12], "path": [1, 4, 5, 13], "load_batch": [1, 3], "load": [1, 2, 3], "save_batch": [1, 3], "save": [1, 3, 5, 9], "offer": [2, 4, 8, 10, 11, 12, 13], "tool": [2, 4, 8, 9, 10, 11, 12, 13, 14], "follow": [2, 7, 12], "tabular": 2, "set": [2, 8, 10, 13], "biolog": 2, "medic": 2, "domain": 2, "support": [2, 12], "colon": 2, "genom": 2, "pub": 2, "princeton": 2, "edu": 2, "oncologi": 2, "affydata": 2, "index": [2, 14], "html": [2, 12], "prostat": 2, "web": 2, "stanford": 2, "hasti": 2, "casi_fil": 2, "leukemia_big": 2, "leukemia": 2, "after": [2, 12], "internet": 2, "pars": 2, "convert": [2, 10], "cach": 2, "directori": [2, 5, 13], "determin": [2, 5], "get_and_create_mltb2_data_dir": [2, 5], "_load_colon_data": 2, "datafram": 2, "label": [2, 9, 13], "also": [2, 7], "see": [2, 8], "panda": 2, "_load_colon_label": 2, "seri": 2, "load_colon": 2, "mltb2_base_data_dir": [2, 5], "tupl": [2, 10], "user": [2, 5, 8], "platformdir": [2, 5], "user_data_dir": [2, 5], "load_leukemia_big": 2, "big": 2, "load_prost": 2, "abc": [3, 7, 10], "abstract": [3, 7, 10], "respect": 3, "intend": 3, "conjunct": 3, "batchdataprocessor": 3, "data_manag": 3, "process_batch_callback": 3, "callabl": 3, "object": [3, 4, 6, 7, 8, 10, 11, 12, 13], "manag": 3, "A": [3, 4, 8, 14], "callback": 3, "function": [3, 5, 7, 8, 9, 12], "one": [3, 9, 10], "run": [3, 7], "done": [3, 8], "until": [3, 8], "empti": 3, "For": [3, 8, 9, 12], "each": 3, "call": [3, 7, 8, 9, 12], "fasttextlanguageidentif": 4, "identifi": 4, "languag": [4, 10], "__call__": [4, 6, 7, 10, 11, 13], "num_lang": 4, "10": [4, 8], "given": [4, 5, 8, 12], "which": [4, 5, 7, 8, 10, 12, 13], "recogn": 4, "number": [4, 6, 7, 8, 9, 10, 11, 12, 13], "dict": [4, 7], "probabl": 4, "more": [4, 8, 9, 12], "than": [4, 11, 12], "element": 4, "so": 4, "guarante": 4, "you": [4, 7, 8, 14], "want": 4, "includ": 4, "case": [4, 7], "when": [4, 7], "veri": 4, "low": 4, "possibl": 4, "af": 4, "al": 4, "am": 4, "arz": 4, "ast": 4, "av": 4, "az": 4, "azb": 4, "ba": 4, "bar": 4, "bcl": 4, "bg": 4, "bh": 4, "bn": 4, "bo": 4, "bpy": 4, "br": 4, "b": 4, "bxr": 4, "ca": 4, "cbk": 4, "ce": 4, "ceb": 4, "ckb": 4, "co": [4, 13], "c": 4, "cv": [4, 8], "cy": 4, "da": [4, 10], "de": [4, 10], "diq": 4, "dsb": 4, "dty": 4, "dv": 4, "el": 4, "eml": 4, "en": 4, "eo": 4, "e": 4, "et": 4, "eu": 4, "fa": 4, "fi": 4, "fr": 4, "frr": 4, "fy": 4, "ga": 4, "gd": 4, "gl": 4, "gn": 4, "gom": 4, "gu": 4, "gv": 4, "he": 4, "hi": 4, "hif": 4, "hr": 4, "hsb": 4, "ht": 4, "hu": 4, "hy": 4, "ia": 4, "id": [4, 13], "ie": 4, "ilo": 4, "io": 4, "ja": 4, "jbo": 4, "jv": 4, "ka": 4, "kk": 4, "km": 4, "kn": 4, "ko": 4, "krc": 4, "ku": 4, "kv": 4, "kw": 4, "ky": 4, "la": 4, "lb": 4, "lez": 4, "li": 4, "lmo": 4, "lo": 4, "lrc": 4, "lt": 4, "lv": 4, "mai": 4, "mg": 4, "mhr": 4, "min": 4, "mk": 4, "ml": 4, "mn": 4, "mr": 4, "mrj": 4, "m": 4, "mt": 4, "mwl": 4, "my": 4, "myv": 4, "mzn": 4, "nah": 4, "nap": 4, "nd": 4, "ne": 4, "new": 4, "nl": 4, "nn": 4, "oc": 4, "o": 4, "pa": 4, "pam": 4, "pfl": 4, "pl": 4, "pm": 4, "pnb": 4, "p": 4, "pt": 4, "qu": 4, "rm": 4, "ro": 4, "ru": 4, "rue": 4, "sa": 4, "sah": 4, "sc": 4, "scn": 4, "sco": 4, "sd": 4, "sh": 4, "si": 4, "sk": 4, "sl": 4, "sq": 4, "sr": 4, "su": 4, "sv": 4, "sw": 4, "ta": 4, "te": 4, "tg": 4, "th": 4, "tk": 4, "tl": 4, "tr": 4, "tt": 4, "tyv": 4, "ug": 4, "uk": 4, "ur": 4, "uz": 4, "vec": 4, "vep": 4, "vi": 4, "vl": 4, "vo": 4, "wa": [4, 7, 12, 13], "war": 4, "wuu": 4, "xal": 4, "xmf": 4, "yi": 4, "yo": 4, "yue": 4, "zh": 4, "static": 4, "get_model_path_and_download": 4, "get": [4, 7, 10], "model": [4, 7, 8, 13], "download": 4, "need": 4, "full": [4, 5, 7], "provid": [5, 8], "other": [5, 12], "fetch_remote_fil": 5, "dirnam": 5, "filenam": [5, 9], "url": [5, 10], "sha256_checksum": 5, "fetch": 5, "remot": 5, "where": [5, 10], "under": 5, "sha256": 5, "checksum": 5, "rais": [5, 11, 12], "ioerror": 5, "wrong": 5, "dir": 5, "exact": 5, "folder": 5, "append": [5, 8], "markdown": 6, "specif": [6, 7, 10, 11, 12, 13], "mdtextsplitt": 6, "max_token": [6, 11], "transformers_token_count": [6, 11], "transformerstokencount": [6, 11, 13], "show_progress_bar": [6, 7, 10, 11, 12, 13], "bool": [6, 7, 8, 9, 10, 11, 12, 13], "fals": [6, 7, 9, 10, 11, 12, 13], "split": [6, 8, 10, 11, 13], "section": [6, 11], "specifi": [6, 7, 11], "maximum": [6, 11, 12], "token": [6, 7, 10, 11, 13], "doe": [6, 8, 9, 11], "divid": [6, 11], "head": 6, "correspond": 6, "paragraph": 6, "per": [6, 8, 11], "can": [6, 7, 9, 12, 14], "onli": [6, 7, 8, 12], "exceed": 6, "singl": [6, 9, 12], "chunk": 6, "larger": [6, 8], "counter": [6, 11, 12], "show": [6, 7, 10, 11, 12, 13], "progressbar": [6, 7, 10, 11, 12, 13], "dure": [6, 7, 10, 11, 12, 13], "md_text": 6, "list": [6, 7, 10, 11, 13], "_chunk_md_by_headlin": 6, "headlin": 6, "chunk_md": 6, "merg": 6, "isol": 6, "subsequ": 6, "end": 6, "without": [6, 7], "content": 6, "remov": [6, 12], "openaiazurechatcomplet": 7, "completion_kwarg": 7, "ani": 7, "openaichatcomplet": 7, "azur": 7, "chat": 7, "complet": 7, "openaibasecomplet": 7, "from_yaml": 7, "kwarg": 7, "properti": 7, "api_typ": 7, "api_vers": 7, "api_bas": 7, "engin": 7, "quickstart": 7, "start": 7, "gpt": 7, "35": 7, "turbo": 7, "4": [7, 8], "servic": 7, "openaiazurecomplet": 7, "openaicomplet": 7, "non": 7, "gener": [7, 13], "prompt": 7, "map": 7, "openaicompletionansw": 7, "llm": 7, "In": [7, 8], "allow": 7, "chang": 7, "temperatur": 7, "_complet": 7, "completion_kwargs_for_this_cal": 7, "openaiobject": 7, "method": [7, 8, 12], "yaml_fil": 7, "yaml": 7, "prompt_token": 7, "completion_token": 7, "total_token": 7, "finish_reason": 7, "answer": 7, "result": [7, 8], "ha": [7, 9], "been": 7, "total": [7, 12], "reason": [7, 8], "why": 7, "stop": 7, "mean": [7, 8], "api": [7, 14], "limit": [7, 12], "length": 7, "becaus": 7, "function_cal": 7, "from_open_ai_object": 7, "open_ai_object": 7, "openaitokencount": 7, "model_nam": 7, "count": [7, 12, 13], "some": [7, 14], "3": [7, 8], "5": 7, "davinci": 7, "003": 7, "embed": 7, "ada": 7, "002": 7, "iter": [7, 10, 12, 13], "just": [7, 13], "_check_mandatory_azure_completion_kwarg": 7, "mandatori": 7, "significancerepeatedtrainingprun": 8, "alpha": 8, "float": [8, 10, 12], "0": [8, 9, 12], "1": [8, 13], "n_warmup_step": 8, "baseprun": 8, "pruner": 8, "statist": 8, "signific": 8, "heurist": 8, "decis": 8, "make": [8, 9], "It": [8, 10, 12, 14], "prune": 8, "repeat": 8, "train": [8, 13], "like": 8, "cross": [8, 13], "valid": [8, 13], "As": 8, "test": [8, 13], "t": 8, "our": 8, "experi": 8, "have": 8, "shown": 8, "aplha": 8, "between": [8, 12], "": [8, 9], "standard": 8, "assum": 8, "adjust": 8, "onc": [8, 12], "hyperparamet": 8, "those": 8, "work": 8, "basi": 8, "intermedi": 8, "epoch": 8, "contrast": 8, "precis": 8, "individu": 8, "fold": [8, 13], "below": 8, "minimalist": 8, "import": [8, 10], "log": 8, "numpi": 8, "np": 8, "sklearn": 8, "dataset": [8, 13], "load_iri": 8, "model_select": 8, "stratifiedkfold": 8, "ensembl": 8, "randomforestclassifi": 8, "metric": 8, "accuracy_scor": 8, "configur": 8, "logger": 8, "debug": 8, "output": [8, 10], "getlogg": 8, "addhandl": 8, "streamhandl": 8, "setlevel": 8, "x": [8, 9], "y": [8, 9], "target": 8, "def": 8, "trial": 8, "min_samples_split": 8, "suggest_int": 8, "2": 8, "n_estim": 8, "validation_result_list": 8, "skf": 8, "n_split": [8, 13], "fold_index": 8, "train_index": 8, "val_index": 8, "enumer": 8, "x_train": 8, "x_val": 8, "y_train": 8, "y_val": 8, "rf": 8, "fit": [8, 12], "y_pred": 8, "predict": 8, "acc": 8, "report": 8, "we": 8, "should": [8, 10], "should_prun": 8, "here": 8, "break": 8, "studi": 8, "create_studi": 8, "storag": 8, "sqlite": 8, "memori": 8, "study_nam": 8, "iris_cv": 8, "direct": 8, "maxim": 8, "load_if_exist": 8, "true": [8, 9, 11, 12], "sampler": 8, "tpesampl": 8, "multivari": 8, "add": 8, "optim": 8, "n_trial": 8, "level": 8, "aggress": 8, "smaller": 8, "stronger": 8, "differ": [8, 9, 12], "two": [8, 9, 10, 12], "distribut": 8, "disabl": 8, "reach": 8, "exce": 8, "step": [8, 9], "frozentri": 8, "judg": 8, "whether": 8, "note": 8, "suppos": 8, "librari": 8, "instead": 8, "interfac": 8, "mechan": 8, "take": 8, "copi": 8, "befor": [8, 12], "modifi": 8, "boolean": 8, "repres": 8, "matplotlib": 9, "boxplot": 9, "titl": 9, "xlabel": 9, "ylabel": 9, "vert": 9, "print": [9, 10], "diagram": 9, "pyplot": 9, "axi": 9, "box": [9, 14], "vertic": 9, "horizont": 9, "boxplot_dict": 9, "values_dict": 9, "form": [9, 12], "dictionari": 9, "save_last_figur": 9, "last": 9, "made": 9, "jupyt": 9, "notebook": 9, "same": 9, "cell": 9, "twin_axes_timeseries_plot": 9, "values_1": 9, "label_1": 9, "values_2": 9, "label_2": 9, "start_timestep_numb": 9, "shift_1": 9, "shift_2": 9, "label_x": 9, "color_1": 9, "tab": 9, "red": 9, "color_2": 9, "blue": 9, "twin": 9, "ax": 9, "timeseri": 9, "curv": 9, "array_lik": 9, "first": [9, 12], "second": 9, "point": 9, "time": [9, 12], "timestep": 9, "shift": 9, "posit": 9, "neg": 9, "color": 9, "jaccardsimilar": 10, "liter": 10, "de_cmc": 10, "en_ptb": 10, "somajobaseclass": 10, "calcul": [10, 12], "jaccard": 10, "similar": 10, "german": 10, "english": 10, "text1": 10, "text2": 10, "get_token_set": 10, "word": [10, 11], "directli": 10, "somajosentencesplitt": [10, 11], "sentenc": [10, 11], "tokenextractor": 10, "extract": 10, "extract_url_set": 10, "token_extractor": 10, "url_set": 10, "ist": 10, "ein": 10, "link": 10, "github": [10, 14], "urlswapp": 10, "url_pattern": 10, "swap": 10, "revers": 10, "replac": [10, 12], "extractor": 10, "pattern": 10, "One": [10, 12], "mark": 10, "place": 10, "put": 10, "reverse_swap_url": 10, "revert": 10, "were": 10, "unknown": 10, "swap_url": 10, "detoken": 10, "how": 10, "do": [10, 13], "extract_token_class_set": 10, "keep_token_class": 10, "keep": 10, "all": [10, 14], "kept": 10, "hug": [11, 13], "face": [11, 13], "textsplitt": 11, "somajo_sentence_splitt": 11, "ignore_overly_long_sent": 11, "alwai": 11, "whole": 11, "splitter": 11, "valueerror": [11, 12], "except": 11, "longer": 11, "simpli": 11, "ignor": 11, "detect": 12, "clean": 12, "invis": 12, "charact": 12, "special": 12, "whitespac": 12, "duplic": 12, "distanc": 12, "find": 12, "anomali": 12, "textdist": 12, "max_dimens": 12, "markup": 12, "unusu": 12, "multipl": 12, "again": 12, "dimens": 12, "greater": 12, "_normalize_char_count": 12, "normal": 12, "char": 12, "defaultdict": 12, "lazi": 12, "postprocess": 12, "manhattan": 12, "scipi": 12, "spatial": 12, "cityblock": 12, "most": 12, "commen": 12, "higher": 12, "least": 12, "_normalize_counter_to_defaultdict": 12, "devid": 12, "them": [12, 14], "clean_all_invisible_chars_and_whitespac": 12, "lead": 12, "trail": 12, "defin": 12, "constant": 12, "invisible_charact": 12, "special_whitespac": 12, "rteturn": 12, "has_invisible_charact": 12, "otherwis": 12, "has_special_whitespac": 12, "remove_invisible_charact": 12, "replace_multiple_whitespac": 12, "replace_special_whitespac": 12, "kfoldlabeleddataset": 13, "7": 13, "n_repeat": 13, "random_st": 13, "k": 13, "labeleddataset": 13, "labeled_dataset": 13, "stratification_label": 13, "encod": 13, "labe": 13, "pretrained_model_name_or_path": 13, "pathlik": 13, "insid": 13, "repo": 13, "huggingfac": 13, "machin": 14, "learn": 14, "python": 14, "packag": 14, "pypi": 14, "option": 14, "might": 14, "refer": 14, "repositori": 14, "licens": 14, "imprint": 14}, "objects": {"mltb2": [[1, 0, 0, "-", "arangodb"], [2, 0, 0, "-", "data"], [3, 0, 0, "-", "db"], [4, 0, 0, "-", "fasttext"], [5, 0, 0, "-", "files"], [6, 0, 0, "-", "md"], [7, 0, 0, "-", "openai"], [8, 0, 0, "-", "optuna"], [9, 0, 0, "-", "plot"], [10, 0, 0, "-", "somajo"], [11, 0, 0, "-", "somajo_transformers"], [12, 0, 0, "-", "text"], [13, 0, 0, "-", "transformers"]], "mltb2.arangodb": [[1, 1, 1, "", "ArangoBatchDataManager"]], "mltb2.arangodb.ArangoBatchDataManager": [[1, 2, 1, "", "_arango_client_factory"], [1, 2, 1, "", "_connection_factory"], [1, 2, 1, "", "from_config_file"], [1, 2, 1, "", "load_batch"], [1, 2, 1, "", "save_batch"]], "mltb2.data": [[2, 3, 1, "", "_load_colon_data"], [2, 3, 1, "", "_load_colon_label"], [2, 3, 1, "", "load_colon"], [2, 3, 1, "", "load_leukemia_big"], [2, 3, 1, "", "load_prostate"]], "mltb2.db": [[3, 1, 1, "", "AbstractBatchDataManager"], [3, 1, 1, "", "BatchDataProcessor"]], "mltb2.db.AbstractBatchDataManager": [[3, 2, 1, "", "load_batch"], [3, 2, 1, "", "save_batch"]], "mltb2.db.BatchDataProcessor": [[3, 2, 1, "", "run"]], "mltb2.fasttext": [[4, 1, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[4, 2, 1, "", "__call__"], [4, 2, 1, "", "get_model_path_and_download"]], "mltb2.files": [[5, 3, 1, "", "fetch_remote_file"], [5, 3, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[6, 1, 1, "", "MdTextSplitter"], [6, 3, 1, "", "_chunk_md_by_headline"], [6, 3, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[6, 2, 1, "", "__call__"]], "mltb2.openai": [[7, 1, 1, "", "OpenAiAzureChatCompletion"], [7, 1, 1, "", "OpenAiAzureCompletion"], [7, 1, 1, "", "OpenAiBaseCompletion"], [7, 1, 1, "", "OpenAiChatCompletion"], [7, 1, 1, "", "OpenAiCompletion"], [7, 1, 1, "", "OpenAiCompletionAnswer"], [7, 1, 1, "", "OpenAiTokenCounter"], [7, 3, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[7, 2, 1, "", "__call__"], [7, 2, 1, "", "_completion"], [7, 2, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[7, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[7, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[7, 2, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[7, 2, 1, "", "__call__"]], "mltb2.optuna": [[8, 1, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[8, 2, 1, "", "prune"]], "mltb2.plot": [[9, 3, 1, "", "boxplot"], [9, 3, 1, "", "boxplot_dict"], [9, 3, 1, "", "save_last_figure"], [9, 3, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[10, 1, 1, "", "JaccardSimilarity"], [10, 1, 1, "", "SoMaJoBaseClass"], [10, 1, 1, "", "SoMaJoSentenceSplitter"], [10, 1, 1, "", "TokenExtractor"], [10, 1, 1, "", "UrlSwapper"], [10, 3, 1, "", "detokenize"], [10, 3, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[10, 2, 1, "", "__call__"], [10, 2, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[10, 2, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[10, 2, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[10, 2, 1, "", "reverse_swap_urls"], [10, 2, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[11, 1, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[11, 2, 1, "", "__call__"]], "mltb2.text": [[12, 1, 1, "", "TextDistance"], [12, 3, 1, "", "_normalize_counter_to_defaultdict"], [12, 3, 1, "", "clean_all_invisible_chars_and_whitespaces"], [12, 3, 1, "", "has_invisible_characters"], [12, 3, 1, "", "has_special_whitespaces"], [12, 3, 1, "", "remove_invisible_characters"], [12, 3, 1, "", "replace_multiple_whitespaces"], [12, 3, 1, "", "replace_special_whitespaces"]], "mltb2.text.TextDistance": [[12, 2, 1, "", "_normalize_char_counter"], [12, 2, 1, "", "distance"], [12, 2, 1, "", "fit"]], "mltb2.transformers": [[13, 1, 1, "", "KFoldLabeledDataset"], [13, 1, 1, "", "LabeledDataset"], [13, 1, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[13, 2, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[13, 2, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "refer": 0, "arangodb": 1, "data": 2, "db": 3, "fasttext": 4, "file": 5, "md": 6, "openai": 7, "optuna": 8, "plot": 9, "somajo": 10, "somajo_transform": 11, "text": 12, "transform": 13, "mltb2": 14, "document": 14, "instal": 14, "content": 14}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "arangodb": [[1, "module-mltb2.arangodb"]], "data": [[2, "module-mltb2.data"]], "db": [[3, "module-mltb2.db"]], "fasttext": [[4, "module-mltb2.fasttext"]], "files": [[5, "module-mltb2.files"]], "md": [[6, "module-mltb2.md"]], "openai": [[7, "module-mltb2.openai"]], "optuna": [[8, "module-mltb2.optuna"]], "plot": [[9, "module-mltb2.plot"]], "somajo": [[10, "module-mltb2.somajo"]], "somajo_transformers": [[11, "module-mltb2.somajo_transformers"]], "text": [[12, "module-mltb2.text"]], "transformers": [[13, "module-mltb2.transformers"]], "MLTB2 Documentation": [[14, "mltb2-documentation"]], "Installation": [[14, "installation"]], "Content": [[14, "content"]]}, "indexentries": {"arangobatchdatamanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoBatchDataManager"]], "_arango_client_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._arango_client_factory"]], "_connection_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._connection_factory"]], "from_config_file() (mltb2.arangodb.arangobatchdatamanager class method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.from_config_file"]], "load_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.load_batch"]], "mltb2.arangodb": [[1, "module-mltb2.arangodb"]], "module": [[1, "module-mltb2.arangodb"], [2, "module-mltb2.data"], [3, "module-mltb2.db"], [4, "module-mltb2.fasttext"], [5, "module-mltb2.files"], [6, "module-mltb2.md"], [7, "module-mltb2.openai"], [8, "module-mltb2.optuna"], [9, "module-mltb2.plot"], [10, "module-mltb2.somajo"], [11, "module-mltb2.somajo_transformers"], [12, "module-mltb2.text"], [13, "module-mltb2.transformers"]], "save_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.save_batch"]], "_load_colon_data() (in module mltb2.data)": [[2, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[2, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[2, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[2, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[2, "mltb2.data.load_prostate"]], "mltb2.data": [[2, "module-mltb2.data"]], "abstractbatchdatamanager (class in mltb2.db)": [[3, "mltb2.db.AbstractBatchDataManager"]], "batchdataprocessor (class in mltb2.db)": [[3, "mltb2.db.BatchDataProcessor"]], "load_batch() (mltb2.db.abstractbatchdatamanager method)": [[3, "mltb2.db.AbstractBatchDataManager.load_batch"]], "mltb2.db": [[3, "module-mltb2.db"]], "run() (mltb2.db.batchdataprocessor method)": [[3, "mltb2.db.BatchDataProcessor.run"]], "save_batch() (mltb2.db.abstractbatchdatamanager method)": [[3, "mltb2.db.AbstractBatchDataManager.save_batch"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[4, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[4, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[4, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[4, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[5, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[5, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[5, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[6, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[6, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[6, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[6, "mltb2.md.chunk_md"]], "mltb2.md": [[6, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[7, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[7, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[7, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[7, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[7, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[7, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[7, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[7, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[7, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[8, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[8, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[8, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[9, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[9, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[9, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[9, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[9, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[10, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[10, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[10, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[10, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[10, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[10, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[10, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[10, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[10, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[10, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[10, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[10, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[10, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[10, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[11, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[11, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[11, "module-mltb2.somajo_transformers"]], "textdistance (class in mltb2.text)": [[12, "mltb2.text.TextDistance"]], "_normalize_char_counter() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance._normalize_char_counter"]], "_normalize_counter_to_defaultdict() (in module mltb2.text)": [[12, "mltb2.text._normalize_counter_to_defaultdict"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "distance() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance.distance"]], "fit() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance.fit"]], "has_invisible_characters() (in module mltb2.text)": [[12, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[12, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[12, "mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[13, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[13, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[13, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[13, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[13, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[13, "mltb2.transformers.KFoldLabeledDataset.split"]]}})
\ No newline at end of file
+Search.setIndex({"docnames": ["api-reference", "api-reference/arangodb", "api-reference/data", "api-reference/db", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/arangodb.rst", "api-reference/data.rst", "api-reference/db.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">arangodb</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">data</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">db</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">fasttext</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">files</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">md</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">openai</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">optuna</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">plot</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">somajo</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">somajo_transformers</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">text</span></code>", "<code class=\"xref py py-mod docutils literal notranslate\"><span class=\"pre\">transformers</span></code>", "MLTB2 Documentation"], "terms": {"arangodb": [0, 14], "data": [0, 1, 3, 5, 8, 13, 14], "db": [0, 8, 14], "fasttext": [0, 14], "file": [0, 1, 2, 4, 7, 14], "md": [0, 14], "openai": [0, 14], "optuna": [0, 14], "plot": [0, 14], "somajo": [0, 11, 14], "somajo_transform": [0, 14], "text": [0, 4, 6, 7, 10, 11, 13, 14], "transform": [0, 11, 14], "util": [1, 3, 5, 13], "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "pip": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "instal": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13], "necessari": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "depend": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "class": [1, 3, 4, 6, 7, 8, 10, 11, 12, 13], "arangobatchdatamanag": 1, "host": [1, 13], "str": [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 13], "sequenc": [1, 3], "db_name": 1, "usernam": 1, "password": 1, "collection_nam": 1, "attribute_nam": 1, "batch_siz": 1, "int": [1, 4, 6, 7, 8, 9, 11, 12, 13], "20": [1, 8], "aql_overwrit": 1, "none": [1, 2, 3, 5, 7, 9, 10, 12, 13], "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "base": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "abstractbatchdatamanag": [1, 3], "implement": [1, 3, 8, 12], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "databas": [1, 3], "name": [1, 7], "document": 1, "from": [1, 2, 3, 4, 5, 7, 8, 10, 12], "collect": 1, "ar": [1, 2, 4, 6, 7, 8, 12, 13], "process": [1, 3, 6, 7, 10, 11, 12, 13], "attribut": 1, "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "check": [1, 7, 8, 12], "alreadi": [1, 6], "If": [1, 2, 5, 7, 9, 10, 11, 12, 13], "present": 1, "avail": [1, 14], "consid": 1, "The": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "batch": [1, 3], "size": 1, "aql": 1, "string": [1, 7, 10], "overwrit": [1, 7], "default": [1, 2, 5, 9], "_arango_client_factori": 1, "arangocli": 1, "creat": [1, 5, 7, 9], "an": [1, 3, 4, 7, 8, 10, 11, 13], "client": 1, "return": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "type": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "_connection_factori": 1, "arango_cli": 1, "standarddatabas": 1, "connect": 1, "classmethod": [1, 7], "from_config_fil": 1, "config_file_nam": 1, "construct": [1, 7], "config": 1, "must": [1, 7, 8, 12], "contain": [1, 2, 4, 10, 12, 13], "valu": [1, 8, 9, 12], "exampl": [1, 7, 8, 10, 12], "http": [1, 2, 10], "com": [1, 10], "my_ml_databas": 1, "my_usernam": 1, "secret": 1, "my_ml_data_collect": 1, "processing_metadata": 1, "100": [1, 8, 12], "path": [1, 4, 5, 13], "load_batch": [1, 3], "load": [1, 2, 3], "save_batch": [1, 3], "save": [1, 3, 5, 9], "_check_config_kei": 1, "dict": [1, 4, 7], "expected_config_kei": 1, "all": [1, 10, 14], "expect": 1, "kei": 1, "arango_collection_backup": 1, "commandlin": 1, "tool": [1, 2, 4, 8, 9, 10, 11, 12, 13, 14], "do": [1, 10, 13], "backup": 1, "written": 1, "gzip": 1, "compress": 1, "jsonl": 1, "current": 1, "work": [1, 8], "directori": [1, 2, 5, 13], "run": [1, 3, 7], "arango": 1, "col": 1, "h": 1, "get": [1, 4, 7, 10], "command": 1, "line": 1, "help": 1, "offer": [2, 4, 8, 10, 11, 12, 13], "follow": [2, 7, 12], "tabular": 2, "set": [2, 8, 10, 13], "biolog": 2, "medic": 2, "domain": 2, "support": [2, 12], "colon": 2, "genom": 2, "pub": 2, "princeton": 2, "edu": 2, "oncologi": 2, "affydata": 2, "index": [2, 14], "html": [2, 12], "prostat": 2, "web": 2, "stanford": 2, "hasti": 2, "casi_fil": 2, "leukemia_big": 2, "leukemia": 2, "after": [2, 12], "internet": 2, "pars": 2, "convert": [2, 10], "cach": 2, "determin": [2, 5], "get_and_create_mltb2_data_dir": [2, 5], "_load_colon_data": 2, "datafram": 2, "label": [2, 9, 13], "also": [2, 7], "see": [2, 8], "panda": 2, "_load_colon_label": 2, "seri": 2, "load_colon": 2, "mltb2_base_data_dir": [2, 5], "tupl": [2, 10], "user": [2, 5, 8], "platformdir": [2, 5], "user_data_dir": [2, 5], "load_leukemia_big": 2, "big": 2, "load_prost": 2, "abc": [3, 7, 10], "abstract": [3, 7, 10], "respect": 3, "intend": 3, "conjunct": 3, "batchdataprocessor": 3, "data_manag": 3, "process_batch_callback": 3, "callabl": 3, "object": [3, 4, 6, 7, 8, 10, 11, 12, 13], "manag": 3, "A": [3, 4, 8, 14], "callback": 3, "function": [3, 5, 7, 8, 9, 12], "one": [3, 9, 10], "done": [3, 8], "until": [3, 8], "empti": 3, "For": [3, 8, 9, 12], "each": 3, "call": [3, 7, 8, 9, 12], "fasttextlanguageidentif": 4, "identifi": 4, "languag": [4, 10], "__call__": [4, 6, 7, 10, 11, 13], "num_lang": 4, "10": [4, 8], "given": [4, 5, 8, 12], "which": [4, 5, 7, 8, 10, 12, 13], "recogn": 4, "number": [4, 6, 7, 8, 9, 10, 11, 12, 13], "probabl": 4, "more": [4, 8, 9, 12], "than": [4, 11, 12], "element": 4, "so": 4, "guarante": 4, "you": [4, 7, 8, 14], "want": 4, "includ": 4, "case": [4, 7], "when": [4, 7], "veri": 4, "low": 4, "possibl": 4, "af": 4, "al": 4, "am": 4, "arz": 4, "ast": 4, "av": 4, "az": 4, "azb": 4, "ba": 4, "bar": 4, "bcl": 4, "bg": 4, "bh": 4, "bn": 4, "bo": 4, "bpy": 4, "br": 4, "b": 4, "bxr": 4, "ca": 4, "cbk": 4, "ce": 4, "ceb": 4, "ckb": 4, "co": [4, 13], "c": 4, "cv": [4, 8], "cy": 4, "da": [4, 10], "de": [4, 10], "diq": 4, "dsb": 4, "dty": 4, "dv": 4, "el": 4, "eml": 4, "en": 4, "eo": 4, "e": 4, "et": 4, "eu": 4, "fa": 4, "fi": 4, "fr": 4, "frr": 4, "fy": 4, "ga": 4, "gd": 4, "gl": 4, "gn": 4, "gom": 4, "gu": 4, "gv": 4, "he": 4, "hi": 4, "hif": 4, "hr": 4, "hsb": 4, "ht": 4, "hu": 4, "hy": 4, "ia": 4, "id": [4, 13], "ie": 4, "ilo": 4, "io": 4, "ja": 4, "jbo": 4, "jv": 4, "ka": 4, "kk": 4, "km": 4, "kn": 4, "ko": 4, "krc": 4, "ku": 4, "kv": 4, "kw": 4, "ky": 4, "la": 4, "lb": 4, "lez": 4, "li": 4, "lmo": 4, "lo": 4, "lrc": 4, "lt": 4, "lv": 4, "mai": 4, "mg": 4, "mhr": 4, "min": 4, "mk": 4, "ml": 4, "mn": 4, "mr": 4, "mrj": 4, "m": 4, "mt": 4, "mwl": 4, "my": 4, "myv": 4, "mzn": 4, "nah": 4, "nap": 4, "nd": 4, "ne": 4, "new": 4, "nl": 4, "nn": 4, "oc": 4, "o": 4, "pa": 4, "pam": 4, "pfl": 4, "pl": 4, "pm": 4, "pnb": 4, "p": 4, "pt": 4, "qu": 4, "rm": 4, "ro": 4, "ru": 4, "rue": 4, "sa": 4, "sah": 4, "sc": 4, "scn": 4, "sco": 4, "sd": 4, "sh": 4, "si": 4, "sk": 4, "sl": 4, "sq": 4, "sr": 4, "su": 4, "sv": 4, "sw": 4, "ta": 4, "te": 4, "tg": 4, "th": 4, "tk": 4, "tl": 4, "tr": 4, "tt": 4, "tyv": 4, "ug": 4, "uk": 4, "ur": 4, "uz": 4, "vec": 4, "vep": 4, "vi": 4, "vl": 4, "vo": 4, "wa": [4, 7, 12, 13], "war": 4, "wuu": 4, "xal": 4, "xmf": 4, "yi": 4, "yo": 4, "yue": 4, "zh": 4, "static": 4, "get_model_path_and_download": 4, "model": [4, 7, 8, 13], "download": 4, "need": 4, "full": [4, 5, 7], "provid": [5, 8], "other": [5, 12], "fetch_remote_fil": 5, "dirnam": 5, "filenam": [5, 9], "url": [5, 10], "sha256_checksum": 5, "fetch": 5, "remot": 5, "where": [5, 10], "under": 5, "sha256": 5, "checksum": 5, "rais": [5, 11, 12], "ioerror": 5, "wrong": 5, "dir": 5, "exact": 5, "folder": 5, "append": [5, 8], "markdown": 6, "specif": [6, 7, 10, 11, 12, 13], "mdtextsplitt": 6, "max_token": [6, 11], "transformers_token_count": [6, 11], "transformerstokencount": [6, 11, 13], "show_progress_bar": [6, 7, 10, 11, 12, 13], "bool": [6, 7, 8, 9, 10, 11, 12, 13], "fals": [6, 7, 9, 10, 11, 12, 13], "split": [6, 8, 10, 11, 13], "section": [6, 11], "specifi": [6, 7, 11], "maximum": [6, 11, 12], "token": [6, 7, 10, 11, 13], "doe": [6, 8, 9, 11], "divid": [6, 11], "head": 6, "correspond": 6, "paragraph": 6, "per": [6, 8, 11], "can": [6, 7, 9, 12, 14], "onli": [6, 7, 8, 12], "exceed": 6, "singl": [6, 9, 12], "chunk": 6, "larger": [6, 8], "counter": [6, 11, 12], "show": [6, 7, 10, 11, 12, 13], "progressbar": [6, 7, 10, 11, 12, 13], "dure": [6, 7, 10, 11, 12, 13], "md_text": 6, "list": [6, 7, 10, 11, 13], "_chunk_md_by_headlin": 6, "headlin": 6, "chunk_md": 6, "merg": 6, "isol": 6, "subsequ": 6, "end": 6, "without": [6, 7], "content": 6, "remov": [6, 12], "openaiazurechatcomplet": 7, "completion_kwarg": 7, "ani": 7, "openaichatcomplet": 7, "azur": 7, "chat": 7, "complet": 7, "openaibasecomplet": 7, "from_yaml": 7, "kwarg": 7, "properti": 7, "api_typ": 7, "api_vers": 7, "api_bas": 7, "engin": 7, "quickstart": 7, "start": 7, "gpt": 7, "35": 7, "turbo": 7, "4": [7, 8], "servic": 7, "openaiazurecomplet": 7, "openaicomplet": 7, "non": 7, "gener": [7, 13], "prompt": 7, "map": 7, "openaicompletionansw": 7, "llm": 7, "In": [7, 8], "allow": 7, "chang": 7, "temperatur": 7, "_complet": 7, "completion_kwargs_for_this_cal": 7, "openaiobject": 7, "method": [7, 8, 12], "yaml_fil": 7, "yaml": 7, "prompt_token": 7, "completion_token": 7, "total_token": 7, "finish_reason": 7, "answer": 7, "result": [7, 8], "ha": [7, 9], "been": 7, "total": [7, 12], "reason": [7, 8], "why": 7, "stop": 7, "mean": [7, 8], "api": [7, 14], "limit": [7, 12], "length": 7, "becaus": 7, "function_cal": 7, "from_open_ai_object": 7, "open_ai_object": 7, "openaitokencount": 7, "model_nam": 7, "count": [7, 12, 13], "some": [7, 14], "3": [7, 8], "5": 7, "davinci": 7, "003": 7, "embed": 7, "ada": 7, "002": 7, "iter": [7, 10, 12, 13], "just": [7, 13], "_check_mandatory_azure_completion_kwarg": 7, "mandatori": 7, "significancerepeatedtrainingprun": 8, "alpha": 8, "float": [8, 10, 12], "0": [8, 9, 12], "1": [8, 13], "n_warmup_step": 8, "baseprun": 8, "pruner": 8, "statist": 8, "signific": 8, "heurist": 8, "decis": 8, "make": [8, 9], "It": [8, 10, 12, 14], "prune": 8, "repeat": 8, "train": [8, 13], "like": 8, "cross": [8, 13], "valid": [8, 13], "As": 8, "test": [8, 13], "t": 8, "our": 8, "experi": 8, "have": 8, "shown": 8, "aplha": 8, "between": [8, 12], "": [8, 9], "standard": 8, "assum": 8, "adjust": 8, "onc": [8, 12], "hyperparamet": 8, "those": 8, "basi": 8, "intermedi": 8, "epoch": 8, "contrast": 8, "precis": 8, "individu": 8, "fold": [8, 13], "below": 8, "minimalist": 8, "import": [8, 10], "log": 8, "numpi": 8, "np": 8, "sklearn": 8, "dataset": [8, 13], "load_iri": 8, "model_select": 8, "stratifiedkfold": 8, "ensembl": 8, "randomforestclassifi": 8, "metric": 8, "accuracy_scor": 8, "configur": 8, "logger": 8, "debug": 8, "output": [8, 10], "getlogg": 8, "addhandl": 8, "streamhandl": 8, "setlevel": 8, "x": [8, 9], "y": [8, 9], "target": 8, "def": 8, "trial": 8, "min_samples_split": 8, "suggest_int": 8, "2": 8, "n_estim": 8, "validation_result_list": 8, "skf": 8, "n_split": [8, 13], "fold_index": 8, "train_index": 8, "val_index": 8, "enumer": 8, "x_train": 8, "x_val": 8, "y_train": 8, "y_val": 8, "rf": 8, "fit": [8, 12], "y_pred": 8, "predict": 8, "acc": 8, "report": 8, "we": 8, "should": [8, 10], "should_prun": 8, "here": 8, "break": 8, "studi": 8, "create_studi": 8, "storag": 8, "sqlite": 8, "memori": 8, "study_nam": 8, "iris_cv": 8, "direct": 8, "maxim": 8, "load_if_exist": 8, "true": [8, 9, 11, 12], "sampler": 8, "tpesampl": 8, "multivari": 8, "add": 8, "optim": 8, "n_trial": 8, "level": 8, "aggress": 8, "smaller": 8, "stronger": 8, "differ": [8, 9, 12], "two": [8, 9, 10, 12], "distribut": 8, "disabl": 8, "reach": 8, "exce": 8, "step": [8, 9], "frozentri": 8, "judg": 8, "whether": 8, "note": 8, "suppos": 8, "librari": 8, "instead": 8, "interfac": 8, "mechan": 8, "take": 8, "copi": 8, "befor": [8, 12], "modifi": 8, "boolean": 8, "repres": 8, "matplotlib": 9, "boxplot": 9, "titl": 9, "xlabel": 9, "ylabel": 9, "vert": 9, "print": [9, 10], "diagram": 9, "pyplot": 9, "axi": 9, "box": [9, 14], "vertic": 9, "horizont": 9, "boxplot_dict": 9, "values_dict": 9, "form": [9, 12], "dictionari": 9, "save_last_figur": 9, "last": 9, "made": 9, "jupyt": 9, "notebook": 9, "same": 9, "cell": 9, "twin_axes_timeseries_plot": 9, "values_1": 9, "label_1": 9, "values_2": 9, "label_2": 9, "start_timestep_numb": 9, "shift_1": 9, "shift_2": 9, "label_x": 9, "color_1": 9, "tab": 9, "red": 9, "color_2": 9, "blue": 9, "twin": 9, "ax": 9, "timeseri": 9, "curv": 9, "array_lik": 9, "first": [9, 12], "second": 9, "point": 9, "time": [9, 12], "timestep": 9, "shift": 9, "posit": 9, "neg": 9, "color": 9, "jaccardsimilar": 10, "liter": 10, "de_cmc": 10, "en_ptb": 10, "somajobaseclass": 10, "calcul": [10, 12], "jaccard": 10, "similar": 10, "german": 10, "english": 10, "text1": 10, "text2": 10, "get_token_set": 10, "word": [10, 11], "directli": 10, "somajosentencesplitt": [10, 11], "sentenc": [10, 11], "tokenextractor": 10, "extract": 10, "extract_url_set": 10, "token_extractor": 10, "url_set": 10, "ist": 10, "ein": 10, "link": 10, "github": [10, 14], "urlswapp": 10, "url_pattern": 10, "swap": 10, "revers": 10, "replac": [10, 12], "extractor": 10, "pattern": 10, "One": [10, 12], "mark": 10, "place": 10, "put": 10, "reverse_swap_url": 10, "revert": 10, "were": 10, "unknown": 10, "swap_url": 10, "detoken": 10, "how": 10, "extract_token_class_set": 10, "keep_token_class": 10, "keep": 10, "kept": 10, "hug": [11, 13], "face": [11, 13], "textsplitt": 11, "somajo_sentence_splitt": 11, "ignore_overly_long_sent": 11, "alwai": 11, "whole": 11, "splitter": 11, "valueerror": [11, 12], "except": 11, "longer": 11, "simpli": 11, "ignor": 11, "detect": 12, "clean": 12, "invis": 12, "charact": 12, "special": 12, "whitespac": 12, "duplic": 12, "distanc": 12, "find": 12, "anomali": 12, "textdist": 12, "max_dimens": 12, "markup": 12, "unusu": 12, "multipl": 12, "again": 12, "dimens": 12, "greater": 12, "_normalize_char_count": 12, "normal": 12, "char": 12, "defaultdict": 12, "lazi": 12, "postprocess": 12, "manhattan": 12, "scipi": 12, "spatial": 12, "cityblock": 12, "most": 12, "commen": 12, "higher": 12, "least": 12, "_normalize_counter_to_defaultdict": 12, "devid": 12, "them": [12, 14], "clean_all_invisible_chars_and_whitespac": 12, "lead": 12, "trail": 12, "defin": 12, "constant": 12, "invisible_charact": 12, "special_whitespac": 12, "rteturn": 12, "has_invisible_charact": 12, "otherwis": 12, "has_special_whitespac": 12, "remove_invisible_charact": 12, "replace_multiple_whitespac": 12, "replace_special_whitespac": 12, "kfoldlabeleddataset": 13, "7": 13, "n_repeat": 13, "random_st": 13, "k": 13, "labeleddataset": 13, "labeled_dataset": 13, "stratification_label": 13, "encod": 13, "labe": 13, "pretrained_model_name_or_path": 13, "pathlik": 13, "insid": 13, "repo": 13, "huggingfac": 13, "machin": 14, "learn": 14, "python": 14, "packag": 14, "pypi": 14, "option": 14, "might": 14, "refer": 14, "repositori": 14, "licens": 14, "imprint": 14}, "objects": {"mltb2": [[1, 0, 0, "-", "arangodb"], [2, 0, 0, "-", "data"], [3, 0, 0, "-", "db"], [4, 0, 0, "-", "fasttext"], [5, 0, 0, "-", "files"], [6, 0, 0, "-", "md"], [7, 0, 0, "-", "openai"], [8, 0, 0, "-", "optuna"], [9, 0, 0, "-", "plot"], [10, 0, 0, "-", "somajo"], [11, 0, 0, "-", "somajo_transformers"], [12, 0, 0, "-", "text"], [13, 0, 0, "-", "transformers"]], "mltb2.arangodb": [[1, 1, 1, "", "ArangoBatchDataManager"], [1, 3, 1, "", "_check_config_keys"], [1, 3, 1, "", "arango_collection_backup"]], "mltb2.arangodb.ArangoBatchDataManager": [[1, 2, 1, "", "_arango_client_factory"], [1, 2, 1, "", "_connection_factory"], [1, 2, 1, "", "from_config_file"], [1, 2, 1, "", "load_batch"], [1, 2, 1, "", "save_batch"]], "mltb2.data": [[2, 3, 1, "", "_load_colon_data"], [2, 3, 1, "", "_load_colon_label"], [2, 3, 1, "", "load_colon"], [2, 3, 1, "", "load_leukemia_big"], [2, 3, 1, "", "load_prostate"]], "mltb2.db": [[3, 1, 1, "", "AbstractBatchDataManager"], [3, 1, 1, "", "BatchDataProcessor"]], "mltb2.db.AbstractBatchDataManager": [[3, 2, 1, "", "load_batch"], [3, 2, 1, "", "save_batch"]], "mltb2.db.BatchDataProcessor": [[3, 2, 1, "", "run"]], "mltb2.fasttext": [[4, 1, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[4, 2, 1, "", "__call__"], [4, 2, 1, "", "get_model_path_and_download"]], "mltb2.files": [[5, 3, 1, "", "fetch_remote_file"], [5, 3, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[6, 1, 1, "", "MdTextSplitter"], [6, 3, 1, "", "_chunk_md_by_headline"], [6, 3, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[6, 2, 1, "", "__call__"]], "mltb2.openai": [[7, 1, 1, "", "OpenAiAzureChatCompletion"], [7, 1, 1, "", "OpenAiAzureCompletion"], [7, 1, 1, "", "OpenAiBaseCompletion"], [7, 1, 1, "", "OpenAiChatCompletion"], [7, 1, 1, "", "OpenAiCompletion"], [7, 1, 1, "", "OpenAiCompletionAnswer"], [7, 1, 1, "", "OpenAiTokenCounter"], [7, 3, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[7, 2, 1, "", "__call__"], [7, 2, 1, "", "_completion"], [7, 2, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[7, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[7, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[7, 2, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[7, 2, 1, "", "__call__"]], "mltb2.optuna": [[8, 1, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[8, 2, 1, "", "prune"]], "mltb2.plot": [[9, 3, 1, "", "boxplot"], [9, 3, 1, "", "boxplot_dict"], [9, 3, 1, "", "save_last_figure"], [9, 3, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[10, 1, 1, "", "JaccardSimilarity"], [10, 1, 1, "", "SoMaJoBaseClass"], [10, 1, 1, "", "SoMaJoSentenceSplitter"], [10, 1, 1, "", "TokenExtractor"], [10, 1, 1, "", "UrlSwapper"], [10, 3, 1, "", "detokenize"], [10, 3, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[10, 2, 1, "", "__call__"], [10, 2, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[10, 2, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[10, 2, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[10, 2, 1, "", "reverse_swap_urls"], [10, 2, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[11, 1, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[11, 2, 1, "", "__call__"]], "mltb2.text": [[12, 1, 1, "", "TextDistance"], [12, 3, 1, "", "_normalize_counter_to_defaultdict"], [12, 3, 1, "", "clean_all_invisible_chars_and_whitespaces"], [12, 3, 1, "", "has_invisible_characters"], [12, 3, 1, "", "has_special_whitespaces"], [12, 3, 1, "", "remove_invisible_characters"], [12, 3, 1, "", "replace_multiple_whitespaces"], [12, 3, 1, "", "replace_special_whitespaces"]], "mltb2.text.TextDistance": [[12, 2, 1, "", "_normalize_char_counter"], [12, 2, 1, "", "distance"], [12, 2, 1, "", "fit"]], "mltb2.transformers": [[13, 1, 1, "", "KFoldLabeledDataset"], [13, 1, 1, "", "LabeledDataset"], [13, 1, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[13, 2, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[13, 2, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "refer": 0, "arangodb": 1, "data": 2, "db": 3, "fasttext": 4, "file": 5, "md": 6, "openai": 7, "optuna": 8, "plot": 9, "somajo": 10, "somajo_transform": 11, "text": 12, "transform": 13, "mltb2": 14, "document": 14, "instal": 14, "content": 14}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "arangodb": [[1, "module-mltb2.arangodb"]], "data": [[2, "module-mltb2.data"]], "db": [[3, "module-mltb2.db"]], "fasttext": [[4, "module-mltb2.fasttext"]], "files": [[5, "module-mltb2.files"]], "md": [[6, "module-mltb2.md"]], "openai": [[7, "module-mltb2.openai"]], "optuna": [[8, "module-mltb2.optuna"]], "plot": [[9, "module-mltb2.plot"]], "somajo": [[10, "module-mltb2.somajo"]], "somajo_transformers": [[11, "module-mltb2.somajo_transformers"]], "text": [[12, "module-mltb2.text"]], "transformers": [[13, "module-mltb2.transformers"]], "MLTB2 Documentation": [[14, "mltb2-documentation"]], "Installation": [[14, "installation"]], "Content": [[14, "content"]]}, "indexentries": {"arangobatchdatamanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoBatchDataManager"]], "_arango_client_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._arango_client_factory"]], "_check_config_keys() (in module mltb2.arangodb)": [[1, "mltb2.arangodb._check_config_keys"]], "_connection_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._connection_factory"]], "arango_collection_backup() (in module mltb2.arangodb)": [[1, "mltb2.arangodb.arango_collection_backup"]], "from_config_file() (mltb2.arangodb.arangobatchdatamanager class method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.from_config_file"]], "load_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.load_batch"]], "mltb2.arangodb": [[1, "module-mltb2.arangodb"]], "module": [[1, "module-mltb2.arangodb"], [2, "module-mltb2.data"], [3, "module-mltb2.db"], [4, "module-mltb2.fasttext"], [5, "module-mltb2.files"], [6, "module-mltb2.md"], [7, "module-mltb2.openai"], [8, "module-mltb2.optuna"], [9, "module-mltb2.plot"], [10, "module-mltb2.somajo"], [11, "module-mltb2.somajo_transformers"], [12, "module-mltb2.text"], [13, "module-mltb2.transformers"]], "save_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.save_batch"]], "_load_colon_data() (in module mltb2.data)": [[2, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[2, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[2, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[2, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[2, "mltb2.data.load_prostate"]], "mltb2.data": [[2, "module-mltb2.data"]], "abstractbatchdatamanager (class in mltb2.db)": [[3, "mltb2.db.AbstractBatchDataManager"]], "batchdataprocessor (class in mltb2.db)": [[3, "mltb2.db.BatchDataProcessor"]], "load_batch() (mltb2.db.abstractbatchdatamanager method)": [[3, "mltb2.db.AbstractBatchDataManager.load_batch"]], "mltb2.db": [[3, "module-mltb2.db"]], "run() (mltb2.db.batchdataprocessor method)": [[3, "mltb2.db.BatchDataProcessor.run"]], "save_batch() (mltb2.db.abstractbatchdatamanager method)": [[3, "mltb2.db.AbstractBatchDataManager.save_batch"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[4, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[4, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[4, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[4, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[5, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[5, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[5, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[6, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[6, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[6, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[6, "mltb2.md.chunk_md"]], "mltb2.md": [[6, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[7, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[7, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[7, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[7, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[7, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[7, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[7, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[7, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[7, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[8, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[8, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[8, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[9, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[9, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[9, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[9, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[9, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[10, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[10, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[10, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[10, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[10, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[10, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[10, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[10, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[10, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[10, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[10, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[10, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[10, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[10, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[11, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[11, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[11, "module-mltb2.somajo_transformers"]], "textdistance (class in mltb2.text)": [[12, "mltb2.text.TextDistance"]], "_normalize_char_counter() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance._normalize_char_counter"]], "_normalize_counter_to_defaultdict() (in module mltb2.text)": [[12, "mltb2.text._normalize_counter_to_defaultdict"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "distance() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance.distance"]], "fit() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance.fit"]], "has_invisible_characters() (in module mltb2.text)": [[12, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[12, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[12, "mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[13, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[13, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[13, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[13, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[13, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[13, "mltb2.transformers.KFoldLabeledDataset.split"]]}})
\ No newline at end of file