diff --git a/_modules/mltb2/data.html b/_modules/mltb2/data.html index a2047a7..8d6e1db 100644 --- a/_modules/mltb2/data.html +++ b/_modules/mltb2/data.html @@ -84,10 +84,21 @@

Source code for mltb2.data

 # This software is distributed under the terms of the MIT license
 # which is available at https://opensource.org/licenses/MIT
 
-"""Data loading module.
+"""This module offers tools for loading data.
 
-Use pip to install the necessary dependencies for this module:
-``pip install mltb2[data]``
+The following tabular data sets from the biological and medical domain are supported:
+
+- colon: `<http://genomics-pubs.princeton.edu/oncology/affydata/index.html>`_
+- prostate: `<https://web.stanford.edu/~hastie/CASI_files/DATA/prostate.html>`_
+- leukemia_big: `<https://web.stanford.edu/~hastie/CASI_files/DATA/leukemia.html>`_
+
+After loading the data from the internet it is parsed, converted and
+cached in the mltb2 data directory.
+This data directory is determined by :func:`mltb2.files.get_and_create_mltb2_data_dir`.
+
+Hint:
+    Use pip to install the necessary dependencies for this module:
+    ``pip install mltb2[data]``
 """
 
 import os
diff --git a/_modules/mltb2/fasttext.html b/_modules/mltb2/fasttext.html
index 3d23451..b1da2b5 100644
--- a/_modules/mltb2/fasttext.html
+++ b/_modules/mltb2/fasttext.html
@@ -82,11 +82,11 @@ 

Source code for mltb2.fasttext

 # This software is distributed under the terms of the MIT license
 # which is available at https://opensource.org/licenses/MIT
 
-"""fastText specific module.
+"""This module offers tools for `fastText <https://fasttext.cc/docs/en/support.html>`_.
 
-This module is based on `fastText <https://fasttext.cc/docs/en/support.html>`_.
-Use pip to install the necessary dependencies for this module:
-``pip install mltb2[fasttext]``
+Hint:
+    Use pip to install the necessary dependencies for this module:
+    ``pip install mltb2[fasttext]``
 """
 
 import os
diff --git a/_modules/mltb2/files.html b/_modules/mltb2/files.html
index 622f41e..e0f52a6 100644
--- a/_modules/mltb2/files.html
+++ b/_modules/mltb2/files.html
@@ -84,8 +84,11 @@ 

Source code for mltb2.files

 
 """File utils module.
 
-Use pip to install the necessary dependencies for this module:
-``pip install mltb2[files]``
+This module provides utility functions for other modules.
+
+Hint:
+    Use pip to install the necessary dependencies for this module:
+    ``pip install mltb2[files]``
 """
 
 
@@ -98,11 +101,15 @@ 

Source code for mltb2.files

 
 
 
[docs]def get_and_create_mltb2_data_dir(mltb2_base_data_dir: Optional[str] = None) -> str: - """Return and create mltb data dir. + """Return and create a data dir for mltb2. + + The exact directory is given by the ``mltb2_base_data_dir`` as the base folder + and then the folder ``mltb2`` is appended. Args: mltb2_base_data_dir: The base data directory. If ``None`` the default - user data directory is used. + user data directory is used. The default user data directory is + determined by :func:`platformdirs.user_data_dir`. Returns: The directory path. diff --git a/_modules/mltb2/md.html b/_modules/mltb2/md.html index d5c25d0..1c2b611 100644 --- a/_modules/mltb2/md.html +++ b/_modules/mltb2/md.html @@ -85,8 +85,9 @@

Source code for mltb2.md

 
 """Markdown specific module.
 
-Use pip to install the necessary dependencies for this module:
-``pip install mltb2[md]``
+Hint:
+    Use pip to install the necessary dependencies for this module:
+    ``pip install mltb2[md]``
 """
 
 import re
diff --git a/_modules/mltb2/openai.html b/_modules/mltb2/openai.html
index 56f671e..394b15c 100644
--- a/_modules/mltb2/openai.html
+++ b/_modules/mltb2/openai.html
@@ -84,8 +84,9 @@ 

Source code for mltb2.openai

 
 """OpenAI specific module.
 
-Use pip to install the necessary dependencies for this module:
-``pip install mltb2[openai]``
+Hint:
+    Use pip to install the necessary dependencies for this module:
+    ``pip install mltb2[openai]``
 """
 
 
diff --git a/_modules/mltb2/optuna.html b/_modules/mltb2/optuna.html
index 42caf70..85c77e6 100644
--- a/_modules/mltb2/optuna.html
+++ b/_modules/mltb2/optuna.html
@@ -82,11 +82,11 @@ 

Source code for mltb2.optuna

 # This software is distributed under the terms of the MIT license
 # which is available at https://opensource.org/licenses/MIT
 
-"""Optuna specific module.
+"""This module offers tools for `Optuna <https://optuna.readthedocs.io/en/stable/>`_.
 
-This module is based on `Optuna <https://optuna.readthedocs.io/en/stable/>`_.
-Use pip to install the necessary dependencies for this module:
-``pip install mltb2[optuna]``
+Hint:
+    Use pip to install the necessary dependencies for this module:
+    ``pip install mltb2[optuna]``
 """
 
 
diff --git a/_modules/mltb2/plot.html b/_modules/mltb2/plot.html
index 79f1805..a89fe43 100644
--- a/_modules/mltb2/plot.html
+++ b/_modules/mltb2/plot.html
@@ -85,8 +85,10 @@ 

Source code for mltb2.plot

 """Plot tools module.
 
 This module is based on `Matplotlib <https://matplotlib.org/>`_.
-Use pip to install the necessary dependencies for this module:
-``pip install mltb2[plot]``
+
+Hint:
+    Use pip to install the necessary dependencies for this module:
+    ``pip install mltb2[plot]``
 """
 
 from typing import Optional
@@ -111,7 +113,9 @@ 

Source code for mltb2.plot

     """Create twin axes timeseries plot.
 
     Plots two different timeseries curves in one diagram but two different y-axes.
-    This function does not call `matplotlib.pyplot.plot()`.
+
+    Hint:
+        This function does not use `matplotlib.pyplot.plot`.
 
     Args:
         values_1: (``array_like``) Values for the first timeseries curve.
@@ -168,7 +172,8 @@ 

Source code for mltb2.plot

 ):
     """Prints one or more boxplots in a single diagram.
 
-    This function does not call `matplotlib.pyplot.plot()`.
+    Hint:
+        This function does not use `matplotlib.pyplot.plot`.
 
     Args:
         values: Values for the boxplot(s).
@@ -211,7 +216,8 @@ 

Source code for mltb2.plot

 ):
     """Create boxplot from dictionary.
 
-    This function does not call `matplotlib.pyplot.plot()`.
+    Hint:
+        This function does not use `matplotlib.pyplot.plot`.
 
     Args:
         values_dict: Dictionary with values for the boxplot(s).
@@ -232,7 +238,7 @@ 

Source code for mltb2.plot

 
 
 
[docs]def save_last_figure(filename): - """Saves the last plot. + """Saves the last plot made by Matplotlib. For jupyter notebooks this has to be called in the same cell that created the plot. """ diff --git a/_modules/mltb2/somajo.html b/_modules/mltb2/somajo.html index 6529726..e3bbf80 100644 --- a/_modules/mltb2/somajo.html +++ b/_modules/mltb2/somajo.html @@ -82,11 +82,11 @@

Source code for mltb2.somajo

 # This software is distributed under the terms of the MIT license
 # which is available at https://opensource.org/licenses/MIT
 
-"""SoMaJo specific module.
+"""This module offers `SoMaJo <https://github.com/tsproisl/SoMaJo>`_ specific tools.
 
-This module is based on `SoMaJo <https://github.com/tsproisl/SoMaJo>`_.
-Use pip to install the necessary dependencies for this module:
-``pip install mltb2[somajo]``
+Hint:
+    Use pip to install the necessary dependencies for this module:
+    ``pip install mltb2[somajo]``
 """
 
 
diff --git a/_modules/mltb2/somajo_transformers.html b/_modules/mltb2/somajo_transformers.html
index 95550e6..291d4ae 100644
--- a/_modules/mltb2/somajo_transformers.html
+++ b/_modules/mltb2/somajo_transformers.html
@@ -83,13 +83,15 @@ 

Source code for mltb2.somajo_transformers

 # This software is distributed under the terms of the MIT license
 # which is available at https://opensource.org/licenses/MIT
 
-"""Hugging Face Transformers and SoMaJo specific module.
+"""This module offers Hugging Face Transformers and SoMaJo specific tools.
 
 This module is based on
 `Hugging Face Transformers <https://huggingface.co/docs/transformers/index>`_ and
 `SoMaJo <https://github.com/tsproisl/SoMaJo>`_.
-Use pip to install the necessary dependencies for this module:
-``pip install mltb2[somajo_transformers]``
+
+Hint:
+    Use pip to install the necessary dependencies for this module:
+    ``pip install mltb2[somajo_transformers]``
 """
 
 
diff --git a/_modules/mltb2/text.html b/_modules/mltb2/text.html
index f5bf5fc..2c88228 100644
--- a/_modules/mltb2/text.html
+++ b/_modules/mltb2/text.html
@@ -82,7 +82,14 @@ 

Source code for mltb2.text

 # This software is distributed under the terms of the MIT license
 # which is available at https://opensource.org/licenses/MIT
 
-"""Text specific module."""
+"""This module offers text specific tools.
+
+It offers the following functionality:
+
+- detect or clean invisible characters
+- detect or replace special whitespaces
+- remove duplicate whitespaces
+"""
 
 import re
 from typing import Dict, Final, Pattern, Tuple
diff --git a/_modules/mltb2/transformers.html b/_modules/mltb2/transformers.html
index f7f6342..acd194b 100644
--- a/_modules/mltb2/transformers.html
+++ b/_modules/mltb2/transformers.html
@@ -82,12 +82,11 @@ 

Source code for mltb2.transformers

 # This software is distributed under the terms of the MIT license
 # which is available at https://opensource.org/licenses/MIT
 
-"""Hugging Face Transformers specific module.
+"""This module offers `Hugging Face Transformers <https://huggingface.co/docs/transformers/index>`_ specific tools.
 
-This module is based on
-`Hugging Face Transformers <https://huggingface.co/docs/transformers/index>`_.
-Use pip to install the necessary dependencies for this module:
-``pip install mltb2[transformers]``
+Hint:
+    Use pip to install the necessary dependencies for this module:
+    ``pip install mltb2[transformers]``
 """
 
 import os
diff --git a/api-reference/data.html b/api-reference/data.html
index 8566f58..35ba794 100644
--- a/api-reference/data.html
+++ b/api-reference/data.html
@@ -103,9 +103,21 @@
              
   

data

-

Data loading module.

+

This module offers tools for loading data.

+

The following tabular data sets from the biological and medical domain are supported:

+ +

After loading the data from the internet it is parsed, converted and +cached in the mltb2 data directory. +This data directory is determined by mltb2.files.get_and_create_mltb2_data_dir().

+
+

Hint

Use pip to install the necessary dependencies for this module: pip install mltb2[data]

+
mltb2.data._load_colon_data() DataFrame[source]
diff --git a/api-reference/fasttext.html b/api-reference/fasttext.html index 3c5b2f9..9625231 100644 --- a/api-reference/fasttext.html +++ b/api-reference/fasttext.html @@ -103,10 +103,12 @@

fasttext

-

fastText specific module.

-

This module is based on fastText. -Use pip to install the necessary dependencies for this module: +

This module offers tools for fastText.

+
+

Hint

+

Use pip to install the necessary dependencies for this module: pip install mltb2[fasttext]

+
class mltb2.fasttext.FastTextLanguageIdentification[source]
diff --git a/api-reference/files.html b/api-reference/files.html index ac3ec9f..0860fa5 100644 --- a/api-reference/files.html +++ b/api-reference/files.html @@ -101,8 +101,12 @@

files

File utils module.

+

This module provides utility functions for other modules.

+
+

Hint

Use pip to install the necessary dependencies for this module: pip install mltb2[files]

+
mltb2.files.fetch_remote_file(dirname, filename, url: str, sha256_checksum: str) str[source]
@@ -131,11 +135,14 @@
mltb2.files.get_and_create_mltb2_data_dir(mltb2_base_data_dir: str | None = None) str[source]
-

Return and create mltb data dir.

+

Return and create a data dir for mltb2.

+

The exact directory is given by the mltb2_base_data_dir as the base folder +and then the folder mltb2 is appended.

Parameters:

mltb2_base_data_dir (str | None) – The base data directory. If None the default -user data directory is used.

+user data directory is used. The default user data directory is +determined by platformdirs.user_data_dir().

Returns:

The directory path.

diff --git a/api-reference/md.html b/api-reference/md.html index 7d939f4..c2c9be0 100644 --- a/api-reference/md.html +++ b/api-reference/md.html @@ -105,8 +105,11 @@

md

Markdown specific module.

+
+

Hint

Use pip to install the necessary dependencies for this module: pip install mltb2[md]

+
class mltb2.md.MdTextSplitter(max_token: int, transformers_token_counter: TransformersTokenCounter, show_progress_bar: bool = False)[source]
diff --git a/api-reference/openai.html b/api-reference/openai.html index 60310f6..abc114e 100644 --- a/api-reference/openai.html +++ b/api-reference/openai.html @@ -124,8 +124,11 @@

openai

OpenAI specific module.

+
+

Hint

Use pip to install the necessary dependencies for this module: pip install mltb2[openai]

+
class mltb2.openai.OpenAiAzureChatCompletion(completion_kwargs: Dict[str, Any])[source]
diff --git a/api-reference/optuna.html b/api-reference/optuna.html index c18d1ee..f236dac 100644 --- a/api-reference/optuna.html +++ b/api-reference/optuna.html @@ -102,10 +102,12 @@

optuna

-

Optuna specific module.

-

This module is based on Optuna. -Use pip to install the necessary dependencies for this module: +

This module offers tools for Optuna.

+
+

Hint

+

Use pip to install the necessary dependencies for this module: pip install mltb2[optuna]

+
class mltb2.optuna.SignificanceRepeatedTrainingPruner(alpha: float = 0.1, n_warmup_steps: int = 4)[source]
diff --git a/api-reference/plot.html b/api-reference/plot.html index f21eaf0..89a69ef 100644 --- a/api-reference/plot.html +++ b/api-reference/plot.html @@ -103,14 +103,20 @@

plot

Plot tools module.

-

This module is based on Matplotlib. -Use pip to install the necessary dependencies for this module: +

This module is based on Matplotlib.

+
+

Hint

+

Use pip to install the necessary dependencies for this module: pip install mltb2[plot]

+
mltb2.plot.boxplot(values, labels=None, title: str | None = None, xlabel: str | None = None, ylabel: str | None = None, vert: bool = True)[source]

Prints one or more boxplots in a single diagram.

-

This function does not call matplotlib.pyplot.plot().

+
+

Hint

+

This function does not use matplotlib.pyplot.plot.

+
Parameters:
    @@ -130,7 +136,10 @@
    mltb2.plot.boxplot_dict(values_dict, title: str | None = None, xlabel: str | None = None, ylabel: str | None = None, vert: bool = True)[source]

    Create boxplot from dictionary.

    -

    This function does not call matplotlib.pyplot.plot().

    +
    +

    Hint

    +

    This function does not use matplotlib.pyplot.plot.

    +
    Parameters:
      @@ -148,7 +157,7 @@
      mltb2.plot.save_last_figure(filename)[source]
      -

      Saves the last plot.

      +

      Saves the last plot made by Matplotlib.

      For jupyter notebooks this has to be called in the same cell that created the plot.

      @@ -156,8 +165,11 @@
      mltb2.plot.twin_axes_timeseries_plot(values_1, label_1: str, values_2, label_2: str, start_timestep_number: int = 0, shift_1: int = 0, shift_2: int = 0, title: str | None = None, label_x: str = 'Step', color_1: str = 'tab:red', color_2: str = 'tab:blue')[source]

      Create twin axes timeseries plot.

      -

      Plots two different timeseries curves in one diagram but two different y-axes. -This function does not call matplotlib.pyplot.plot().

      +

      Plots two different timeseries curves in one diagram but two different y-axes.

      +
      +

      Hint

      +

      This function does not use matplotlib.pyplot.plot.

      +
      Parameters:
        diff --git a/api-reference/somajo.html b/api-reference/somajo.html index bece500..898f94c 100644 --- a/api-reference/somajo.html +++ b/api-reference/somajo.html @@ -119,10 +119,12 @@

        somajo

        -

        SoMaJo specific module.

        -

        This module is based on SoMaJo. -Use pip to install the necessary dependencies for this module: +

        This module offers SoMaJo specific tools.

        +
        +

        Hint

        +

        Use pip to install the necessary dependencies for this module: pip install mltb2[somajo]

        +
        class mltb2.somajo.JaccardSimilarity(language: Literal['de_CMC', 'en_PTB'])[source]
        diff --git a/api-reference/somajo_transformers.html b/api-reference/somajo_transformers.html index 8fb3135..86ec20f 100644 --- a/api-reference/somajo_transformers.html +++ b/api-reference/somajo_transformers.html @@ -102,12 +102,15 @@

        somajo_transformers

        -

        Hugging Face Transformers and SoMaJo specific module.

        +

        This module offers Hugging Face Transformers and SoMaJo specific tools.

        This module is based on Hugging Face Transformers and -SoMaJo. -Use pip to install the necessary dependencies for this module: +SoMaJo.

        +
        +

        Hint

        +

        Use pip to install the necessary dependencies for this module: pip install mltb2[somajo_transformers]

        +
        class mltb2.somajo_transformers.TextSplitter(max_token: int, somajo_sentence_splitter: SoMaJoSentenceSplitter, transformers_token_counter: TransformersTokenCounter, show_progress_bar: bool = False, ignore_overly_long_sentences: bool = False)[source]
        diff --git a/api-reference/text.html b/api-reference/text.html index ce28338..fe8ed89 100644 --- a/api-reference/text.html +++ b/api-reference/text.html @@ -104,7 +104,13 @@

        text

        -

        Text specific module.

        +

        This module offers text specific tools.

        +

        It offers the following functionality:

        +
          +
        • detect or clean invisible characters

        • +
        • detect or replace special whitespaces

        • +
        • remove duplicate whitespaces

        • +
        mltb2.text.clean_all_invisible_chars_and_whitespaces(text: str) str[source]
        diff --git a/api-reference/transformers.html b/api-reference/transformers.html index e6f2097..cfac6ca 100644 --- a/api-reference/transformers.html +++ b/api-reference/transformers.html @@ -106,11 +106,12 @@

        transformers

        -

        Hugging Face Transformers specific module.

        -

        This module is based on -Hugging Face Transformers. -Use pip to install the necessary dependencies for this module: +

        This module offers Hugging Face Transformers specific tools.

        +
        +

        Hint

        +

        Use pip to install the necessary dependencies for this module: pip install mltb2[transformers]

        +
        class mltb2.transformers.KFoldLabeledDataset(n_splits=7, n_repeats=1, random_state=None)[source]
        diff --git a/searchindex.js b/searchindex.js index ccd3fdc..1e87ba2 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["api-reference", "api-reference/data", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/data.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "data", "fasttext", "files", "md", "openai", "optuna", "plot", "somajo", "somajo_transformers", "text", "transformers", "MLTB2 Documentation"], "terms": {"data": [0, 3, 6, 11, 12], "fasttext": [0, 12], "file": [0, 2, 5, 12], "md": [0, 12], "openai": [0, 12], "optuna": [0, 12], "plot": [0, 12], "somajo": [0, 9, 12], "somajo_transform": [0, 12], "text": [0, 2, 4, 5, 8, 9, 11, 12], "transform": [0, 9, 12], "load": 1, "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "pip": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "instal": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "necessari": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "depend": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "_load_colon_data": 1, "datafram": 1, "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "colon": 1, "label": [1, 7, 11], "The": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "pars": 1, "from": [1, 2, 3, 5, 6, 8, 10], "internet": 1, "also": [1, 5], "see": [1, 6], "http": [1, 8], "genom": 1, "pub": 1, "princeton": 1, "edu": 1, 
"oncologi": 1, "affydata": 1, "index": [1, 12], "html": 1, "return": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "panda": 1, "type": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "_load_colon_label": 1, "seri": 1, "load_colon": 1, "tupl": [1, 8], "contain": [1, 2, 8, 10, 11], "load_leukemia_big": 1, "leukemia": 1, "big": 1, "web": 1, "stanford": 1, "hasti": 1, "casi_fil": 1, "load_prost": 1, "prostat": 1, "specif": [2, 4, 5, 6, 8, 9, 10, 11], "base": [2, 3, 4, 5, 6, 7, 8, 9, 11], "class": [2, 4, 5, 6, 8, 9, 11], "fasttextlanguageidentif": 2, "object": [2, 4, 5, 6, 8, 9, 11], "identifi": 2, "languag": [2, 8], "__call__": [2, 4, 5, 8, 9, 11], "str": [2, 3, 4, 5, 7, 8, 9, 10, 11], "num_lang": 2, "int": [2, 4, 5, 6, 7, 9, 11], "10": [2, 6], "given": [2, 6], "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "which": [2, 3, 5, 6, 8, 10, 11], "recogn": 2, "number": [2, 4, 5, 6, 7, 8, 9, 11], "A": [2, 6, 12], "dict": [2, 5], "probabl": 2, "more": [2, 6, 7], "than": [2, 9], "element": 2, "so": 2, "guarante": 2, "you": [2, 5, 6, 12], "want": 2, "includ": 2, "case": [2, 5], "when": [2, 5], "veri": 2, "low": 2, "possibl": 2, "ar": [2, 4, 5, 6, 10, 11], "af": 2, "al": 2, "am": 2, "an": [2, 5, 6, 8, 9, 11], "arz": 2, "ast": 2, "av": 2, "az": 2, "azb": 2, "ba": 2, "bar": 2, "bcl": 2, "bg": 2, "bh": 2, "bn": 2, "bo": 2, "bpy": 2, "br": 2, "b": 2, "bxr": 2, "ca": 2, "cbk": 2, "ce": 2, "ceb": 2, "ckb": 2, "co": [2, 11], "c": 2, "cv": [2, 6], "cy": 2, "da": [2, 8], "de": [2, 8], "diq": 2, "dsb": 2, "dty": 2, "dv": 2, "el": 2, "eml": 2, "en": 2, "eo": 2, "e": 2, "et": 2, "eu": 2, "fa": 2, "fi": 2, "fr": 2, "frr": 2, "fy": 2, "ga": 2, "gd": 2, "gl": 2, "gn": 2, "gom": 2, "gu": 2, "gv": 2, "he": 2, "hi": 2, "hif": 2, "hr": 2, "hsb": 2, "ht": 2, "hu": 2, "hy": 2, "ia": 2, "id": [2, 11], "ie": 2, "ilo": 2, "io": 2, "ja": 2, "jbo": 2, "jv": 2, "ka": 2, "kk": 2, "km": 2, "kn": 2, "ko": 2, "krc": 2, "ku": 2, "kv": 2, "kw": 2, "ky": 2, "la": 2, "lb": 2, "lez": 2, "li": 2, "lmo": 2, "lo": 2, "lrc": 2, "lt": 2, "lv": 2, 
"mai": 2, "mg": 2, "mhr": 2, "min": 2, "mk": 2, "ml": 2, "mn": 2, "mr": 2, "mrj": 2, "m": 2, "mt": 2, "mwl": 2, "my": 2, "myv": 2, "mzn": 2, "nah": 2, "nap": 2, "nd": 2, "ne": 2, "new": 2, "nl": 2, "nn": 2, "oc": 2, "o": 2, "pa": 2, "pam": 2, "pfl": 2, "pl": 2, "pm": 2, "pnb": 2, "p": 2, "pt": 2, "qu": 2, "rm": 2, "ro": 2, "ru": 2, "rue": 2, "sa": 2, "sah": 2, "sc": 2, "scn": 2, "sco": 2, "sd": 2, "sh": 2, "si": 2, "sk": 2, "sl": 2, "sq": 2, "sr": 2, "su": 2, "sv": 2, "sw": 2, "ta": 2, "te": 2, "tg": 2, "th": 2, "tk": 2, "tl": 2, "tr": 2, "tt": 2, "tyv": 2, "ug": 2, "uk": 2, "ur": 2, "uz": 2, "vec": 2, "vep": 2, "vi": 2, "vl": 2, "vo": 2, "wa": [2, 5, 11], "war": 2, "wuu": 2, "xal": 2, "xmf": 2, "yi": 2, "yo": 2, "yue": 2, "zh": 2, "static": 2, "get_model_path_and_download": 2, "get": [2, 5, 8], "model": [2, 5, 6, 11], "path": [2, 3, 11], "download": 2, "need": 2, "full": [2, 3, 5], "util": [3, 11], "fetch_remote_fil": 3, "dirnam": 3, "filenam": [3, 7], "url": [3, 8], "sha256_checksum": 3, "fetch": 3, "remot": 3, "directori": [3, 11], "where": [3, 8], "save": [3, 7], "under": 3, "sha256": 3, "checksum": 3, "creat": [3, 5, 7], "rais": [3, 9], "ioerror": 3, "wrong": 3, "get_and_create_mltb2_data_dir": 3, "mltb2_base_data_dir": 3, "none": [3, 5, 7, 8, 11], "mltb": 3, "dir": 3, "If": [3, 5, 7, 8, 9, 11], "default": [3, 7], "user": [3, 6], "markdown": 4, "mdtextsplitt": 4, "max_token": [4, 9], "transformers_token_count": [4, 9], "transformerstokencount": [4, 9, 11], "show_progress_bar": [4, 5, 8, 9, 11], "bool": [4, 5, 6, 7, 8, 9, 10, 11], "fals": [4, 5, 7, 8, 9, 10, 11], "split": [4, 6, 8, 9, 11], "section": [4, 9], "specifi": [4, 5, 9], "maximum": [4, 9], "token": [4, 5, 8, 9, 11], "doe": [4, 6, 7, 9], "divid": [4, 9], "head": 4, "correspond": 4, "paragraph": 4, "per": [4, 6, 9], "can": [4, 5, 7, 12], "onli": [4, 5, 6], "exceed": 4, "singl": [4, 7, 10], "chunk": 4, "alreadi": 4, "larger": [4, 6], "counter": [4, 9], "show": [4, 5, 8, 9, 11], "progressbar": [4, 5, 8, 9, 
11], "dure": [4, 5, 8, 9, 11], "process": [4, 5, 8, 9, 11], "md_text": 4, "list": [4, 5, 8, 9, 11], "_chunk_md_by_headlin": 4, "headlin": 4, "chunk_md": 4, "merg": 4, "isol": 4, "subsequ": 4, "end": 4, "without": [4, 5], "content": 4, "remov": [4, 10], "openaiazurechatcomplet": 5, "completion_kwarg": 5, "ani": 5, "openaichatcomplet": 5, "azur": 5, "chat": 5, "complet": 5, "construct": 5, "openaibasecomplet": 5, "from_yaml": 5, "kwarg": 5, "function": [5, 6, 7], "follow": 5, "properti": 5, "must": [5, 6], "api_typ": 5, "api_vers": 5, "api_bas": 5, "engin": 5, "quickstart": 5, "start": 5, "gpt": 5, "35": 5, "turbo": 5, "4": [5, 6], "servic": 5, "openaiazurecomplet": 5, "openaicomplet": 5, "non": 5, "gener": [5, 11], "abc": [5, 8], "abstract": [5, 8], "prompt": 5, "map": 5, "openaicompletionansw": 5, "call": [5, 6, 7], "llm": 5, "In": [5, 6], "string": [5, 8], "allow": 5, "overwrit": 5, "exampl": [5, 6, 8], "chang": 5, "temperatur": 5, "_complet": 5, "completion_kwargs_for_this_cal": 5, "openaiobject": 5, "method": [5, 6], "classmethod": 5, "yaml_fil": 5, "yaml": 5, "prompt_token": 5, "completion_token": 5, "total_token": 5, "finish_reason": 5, "answer": 5, "result": [5, 6], "name": 5, "ha": [5, 7], "been": 5, "total": 5, "reason": [5, 6], "why": 5, "stop": 5, "mean": [5, 6], "api": [5, 12], "run": 5, "limit": 5, "length": 5, "becaus": 5, "function_cal": 5, "from_open_ai_object": 5, "open_ai_object": 5, "openaitokencount": 5, "model_nam": 5, "count": [5, 11], "some": [5, 12], "3": [5, 6], "5": 5, "davinci": 5, "003": 5, "embed": 5, "ada": 5, "002": 5, "iter": [5, 8, 11], "just": [5, 11], "_check_mandatory_azure_completion_kwarg": 5, "check": [5, 6, 10], "mandatori": 5, "significancerepeatedtrainingprun": 6, "alpha": 6, "float": [6, 8], "0": [6, 7], "1": [6, 11], "n_warmup_step": 6, "baseprun": 6, "pruner": 6, "statist": 6, "signific": 6, "heurist": 6, "decis": 6, "make": [6, 7], "It": [6, 8, 12], "prune": 6, "repeat": 6, "train": [6, 11], "like": 6, "cross": [6, 11], 
"valid": [6, 11], "As": 6, "test": [6, 11], "t": 6, "our": 6, "experi": 6, "have": 6, "shown": 6, "aplha": 6, "valu": [6, 7], "between": 6, "": [6, 7], "standard": 6, "assum": 6, "adjust": 6, "onc": 6, "hyperparamet": 6, "set": [6, 8, 11], "those": 6, "work": 6, "basi": 6, "intermedi": 6, "For": [6, 7], "epoch": 6, "contrast": 6, "precis": 6, "individu": 6, "fold": [6, 11], "below": 6, "minimalist": 6, "import": [6, 8], "log": 6, "numpi": 6, "np": 6, "sklearn": 6, "dataset": [6, 11], "load_iri": 6, "model_select": 6, "stratifiedkfold": 6, "ensembl": 6, "randomforestclassifi": 6, "metric": 6, "accuracy_scor": 6, "configur": 6, "logger": 6, "debug": 6, "output": [6, 8], "getlogg": 6, "addhandl": 6, "streamhandl": 6, "setlevel": 6, "x": [6, 7], "y": [6, 7], "target": 6, "def": 6, "trial": 6, "min_samples_split": 6, "suggest_int": 6, "2": 6, "20": 6, "n_estim": 6, "100": 6, "validation_result_list": 6, "skf": 6, "n_split": [6, 11], "fold_index": 6, "train_index": 6, "val_index": 6, "enumer": 6, "x_train": 6, "x_val": 6, "y_train": 6, "y_val": 6, "rf": 6, "fit": 6, "y_pred": 6, "predict": 6, "acc": 6, "append": 6, "report": 6, "we": 6, "should": [6, 8], "should_prun": 6, "here": 6, "done": 6, "break": 6, "studi": 6, "create_studi": 6, "storag": 6, "sqlite": 6, "db": 6, "memori": 6, "study_nam": 6, "iris_cv": 6, "direct": 6, "maxim": 6, "load_if_exist": 6, "true": [6, 7, 9, 10], "sampler": 6, "tpesampl": 6, "multivari": 6, "add": 6, "optim": 6, "n_trial": 6, "level": 6, "aggress": 6, "smaller": 6, "stronger": 6, "differ": [6, 7], "two": [6, 7, 8], "distribut": 6, "disabl": 6, "until": 6, "reach": 6, "exce": 6, "step": [6, 7], "frozentri": 6, "judg": 6, "whether": 6, "note": 6, "suppos": 6, "librari": 6, "instead": 6, "provid": 6, "interfac": 6, "implement": 6, "mechan": 6, "take": 6, "copi": 6, "befor": 6, "modifi": 6, "boolean": 6, "repres": 6, "tool": [7, 8, 12], "matplotlib": 7, "boxplot": 7, "titl": 7, "xlabel": 7, "ylabel": 7, "vert": 7, "print": [7, 8], "one": [7, 
8], "diagram": 7, "pyplot": 7, "axi": 7, "box": [7, 12], "vertic": 7, "horizont": 7, "boxplot_dict": 7, "values_dict": 7, "form": [7, 10], "dictionari": 7, "save_last_figur": 7, "last": 7, "jupyt": 7, "notebook": 7, "same": 7, "cell": 7, "twin_axes_timeseries_plot": 7, "values_1": 7, "label_1": 7, "values_2": 7, "label_2": 7, "start_timestep_numb": 7, "shift_1": 7, "shift_2": 7, "label_x": 7, "color_1": 7, "tab": 7, "red": 7, "color_2": 7, "blue": 7, "twin": 7, "ax": 7, "timeseri": 7, "curv": 7, "array_lik": 7, "first": 7, "second": 7, "point": 7, "time": 7, "timestep": 7, "shift": 7, "posit": 7, "neg": 7, "color": 7, "jaccardsimilar": 8, "liter": 8, "de_cmc": 8, "en_ptb": 8, "somajobaseclass": 8, "calcul": 8, "jaccard": 8, "similar": 8, "german": 8, "english": 8, "text1": 8, "text2": 8, "get_token_set": 8, "word": [8, 9], "directli": 8, "somajosentencesplitt": [8, 9], "sentenc": [8, 9], "tokenextractor": 8, "extract": 8, "extract_url_set": 8, "token_extractor": 8, "url_set": 8, "ist": 8, "ein": 8, "link": 8, "github": [8, 12], "com": 8, "urlswapp": 8, "url_pattern": 8, "swap": 8, "revers": 8, "replac": [8, 10], "extractor": 8, "pattern": 8, "One": 8, "mark": 8, "place": 8, "put": 8, "reverse_swap_url": 8, "revert": 8, "were": 8, "unknown": 8, "swap_url": 8, "detoken": 8, "convert": 8, "how": 8, "do": [8, 11], "extract_token_class_set": 8, "keep_token_class": 8, "keep": 8, "all": [8, 12], "kept": 8, "hug": [9, 11], "face": [9, 11], "textsplitt": 9, "somajo_sentence_splitt": 9, "ignore_overly_long_sent": 9, "alwai": 9, "whole": 9, "splitter": 9, "valueerror": 9, "except": 9, "longer": 9, "simpli": 9, "ignor": 9, "clean_all_invisible_chars_and_whitespac": 10, "clean": 10, "invis": 10, "charact": 10, "whitespac": 10, "special": 10, "normal": 10, "multipl": 10, "lead": 10, "trail": 10, "defin": 10, "constant": 10, "invisible_charact": 10, "special_whitespac": 10, "rteturn": 10, "has_invisible_charact": 10, "otherwis": 10, "has_special_whitespac": 10, 
"remove_invisible_charact": 10, "replace_multiple_whitespac": 10, "replace_special_whitespac": 10, "kfoldlabeleddataset": 11, "7": 11, "n_repeat": 11, "random_st": 11, "k": 11, "labeleddataset": 11, "labeled_dataset": 11, "stratification_label": 11, "encod": 11, "labe": 11, "pretrained_model_name_or_path": 11, "pathlik": 11, "host": 11, "insid": 11, "repo": 11, "huggingfac": 11, "machin": 12, "learn": 12, "avail": 12, "python": 12, "packag": 12, "pypi": 12, "option": 12, "might": 12, "them": 12, "refer": 12, "repositori": 12, "licens": 12, "imprint": 12}, "objects": {"mltb2": [[1, 0, 0, "-", "data"], [2, 0, 0, "-", "fasttext"], [3, 0, 0, "-", "files"], [4, 0, 0, "-", "md"], [5, 0, 0, "-", "openai"], [6, 0, 0, "-", "optuna"], [7, 0, 0, "-", "plot"], [8, 0, 0, "-", "somajo"], [9, 0, 0, "-", "somajo_transformers"], [10, 0, 0, "-", "text"], [11, 0, 0, "-", "transformers"]], "mltb2.data": [[1, 1, 1, "", "_load_colon_data"], [1, 1, 1, "", "_load_colon_label"], [1, 1, 1, "", "load_colon"], [1, 1, 1, "", "load_leukemia_big"], [1, 1, 1, "", "load_prostate"]], "mltb2.fasttext": [[2, 2, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[2, 3, 1, "", "__call__"], [2, 3, 1, "", "get_model_path_and_download"]], "mltb2.files": [[3, 1, 1, "", "fetch_remote_file"], [3, 1, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[4, 2, 1, "", "MdTextSplitter"], [4, 1, 1, "", "_chunk_md_by_headline"], [4, 1, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[4, 3, 1, "", "__call__"]], "mltb2.openai": [[5, 2, 1, "", "OpenAiAzureChatCompletion"], [5, 2, 1, "", "OpenAiAzureCompletion"], [5, 2, 1, "", "OpenAiBaseCompletion"], [5, 2, 1, "", "OpenAiChatCompletion"], [5, 2, 1, "", "OpenAiCompletion"], [5, 2, 1, "", "OpenAiCompletionAnswer"], [5, 2, 1, "", "OpenAiTokenCounter"], [5, 1, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[5, 3, 1, "", "__call__"], [5, 3, 1, "", "_completion"], [5, 3, 1, "", 
"from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[5, 3, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[5, 3, 1, "", "__call__"]], "mltb2.optuna": [[6, 2, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[6, 3, 1, "", "prune"]], "mltb2.plot": [[7, 1, 1, "", "boxplot"], [7, 1, 1, "", "boxplot_dict"], [7, 1, 1, "", "save_last_figure"], [7, 1, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[8, 2, 1, "", "JaccardSimilarity"], [8, 2, 1, "", "SoMaJoBaseClass"], [8, 2, 1, "", "SoMaJoSentenceSplitter"], [8, 2, 1, "", "TokenExtractor"], [8, 2, 1, "", "UrlSwapper"], [8, 1, 1, "", "detokenize"], [8, 1, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[8, 3, 1, "", "__call__"], [8, 3, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[8, 3, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[8, 3, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[8, 3, 1, "", "reverse_swap_urls"], [8, 3, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[9, 2, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[9, 3, 1, "", "__call__"]], "mltb2.text": [[10, 1, 1, "", "clean_all_invisible_chars_and_whitespaces"], [10, 1, 1, "", "has_invisible_characters"], [10, 1, 1, "", "has_special_whitespaces"], [10, 1, 1, "", "remove_invisible_characters"], [10, 1, 1, "", "replace_multiple_whitespaces"], [10, 1, 1, "", "replace_special_whitespaces"]], "mltb2.transformers": [[11, 2, 1, "", "KFoldLabeledDataset"], [11, 2, 1, "", "LabeledDataset"], [11, 2, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[11, 3, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[11, 3, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:function", "2": "py:class", "3": "py:method"}, 
"objnames": {"0": ["py", "module", "Python module"], "1": ["py", "function", "Python function"], "2": ["py", "class", "Python class"], "3": ["py", "method", "Python method"]}, "titleterms": {"api": 0, "refer": 0, "data": 1, "fasttext": 2, "file": 3, "md": 4, "openai": 5, "optuna": 6, "plot": 7, "somajo": 8, "somajo_transform": 9, "text": 10, "transform": 11, "mltb2": 12, "document": 12, "instal": 12, "content": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "data": [[1, "module-mltb2.data"]], "fasttext": [[2, "module-mltb2.fasttext"]], "files": [[3, "module-mltb2.files"]], "md": [[4, "module-mltb2.md"]], "openai": [[5, "module-mltb2.openai"]], "optuna": [[6, "module-mltb2.optuna"]], "plot": [[7, "module-mltb2.plot"]], "somajo": [[8, "module-mltb2.somajo"]], "somajo_transformers": [[9, "module-mltb2.somajo_transformers"]], "text": [[10, "module-mltb2.text"]], "transformers": [[11, "module-mltb2.transformers"]], "MLTB2 Documentation": [[12, "mltb2-documentation"]], "Installation": [[12, "installation"]], "Content": [[12, "content"]]}, "indexentries": {"_load_colon_data() (in module mltb2.data)": [[1, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[1, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[1, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[1, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[1, "mltb2.data.load_prostate"]], "mltb2.data": [[1, "module-mltb2.data"]], "module": [[1, "module-mltb2.data"], [2, "module-mltb2.fasttext"], [3, "module-mltb2.files"], [4, 
"module-mltb2.md"], [5, "module-mltb2.openai"], [6, "module-mltb2.optuna"], [7, "module-mltb2.plot"], [8, "module-mltb2.somajo"], [9, "module-mltb2.somajo_transformers"], [10, "module-mltb2.text"], [11, "module-mltb2.transformers"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[2, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[2, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[3, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[3, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[3, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[4, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[4, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[4, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[4, "mltb2.md.chunk_md"]], "mltb2.md": [[4, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiTokenCounter"]], "__call__() 
(mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[5, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[5, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[5, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[5, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[5, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[5, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[5, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[6, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[7, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[7, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[7, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[8, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[8, "mltb2.somajo.TokenExtractor"]], 
"urlswapper (class in mltb2.somajo)": [[8, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[8, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[8, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[8, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[8, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[9, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[9, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[9, "module-mltb2.somajo_transformers"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "has_invisible_characters() (in module mltb2.text)": [[10, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[10, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[10, "mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[11, 
"mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[11, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[11, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[11, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[11, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[11, "mltb2.transformers.KFoldLabeledDataset.split"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["api-reference", "api-reference/data", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/data.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "data", "fasttext", "files", "md", "openai", "optuna", "plot", "somajo", "somajo_transformers", "text", "transformers", "MLTB2 Documentation"], "terms": {"data": [0, 3, 6, 11, 12], "fasttext": [0, 12], "file": [0, 1, 2, 5, 12], "md": [0, 12], "openai": [0, 12], "optuna": [0, 12], "plot": [0, 12], "somajo": [0, 9, 12], "somajo_transform": [0, 12], "text": [0, 2, 4, 5, 8, 9, 11, 12], "transform": [0, 9, 12], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "offer": [1, 2, 6, 8, 9, 10, 11], "tool": [1, 2, 6, 7, 8, 9, 10, 11, 12], "load": 1, "The": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "follow": [1, 5, 10], "tabular": 1, "set": [1, 6, 8, 11], 
"from": [1, 2, 3, 5, 6, 8, 10], "biolog": 1, "medic": 1, "domain": 1, "ar": [1, 2, 4, 5, 6, 10, 11], "support": 1, "colon": 1, "http": [1, 8], "genom": 1, "pub": 1, "princeton": 1, "edu": 1, "oncologi": 1, "affydata": 1, "index": [1, 12], "html": 1, "prostat": 1, "web": 1, "stanford": 1, "hasti": 1, "casi_fil": 1, "leukemia_big": 1, "leukemia": 1, "after": 1, "internet": 1, "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "pars": 1, "convert": [1, 8], "cach": 1, "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "directori": [1, 3, 11], "determin": [1, 3], "get_and_create_mltb2_data_dir": [1, 3], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "pip": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "instal": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "necessari": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "depend": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "_load_colon_data": 1, "datafram": 1, "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "label": [1, 7, 11], "also": [1, 5], "see": [1, 6], "return": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "panda": 1, "type": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "_load_colon_label": 1, "seri": 1, "load_colon": 1, "tupl": [1, 8], "contain": [1, 2, 8, 10, 11], "load_leukemia_big": 1, "big": 1, "load_prost": 1, "class": [2, 4, 5, 6, 8, 9, 11], "fasttextlanguageidentif": 2, "base": [2, 3, 4, 5, 6, 7, 8, 9, 11], "object": [2, 4, 5, 6, 8, 9, 11], "identifi": 2, "languag": [2, 8], "__call__": [2, 4, 5, 8, 9, 11], "str": [2, 3, 4, 5, 7, 8, 9, 10, 11], "num_lang": 2, "int": [2, 4, 5, 6, 7, 9, 11], "10": [2, 6], "given": [2, 3, 6], "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "which": [2, 3, 5, 6, 8, 10, 11], "recogn": 2, "number": [2, 4, 5, 6, 7, 8, 9, 11], "A": [2, 6, 12], "dict": [2, 5], "probabl": 2, "more": [2, 6, 7], "than": [2, 9], "element": 2, "so": 2, "guarante": 2, "you": [2, 5, 6, 12], "want": 2, "includ": 2, "case": [2, 5], "when": [2, 5], "veri": 2, "low": 2, "possibl": 2, "af": 2, "al": 2, "am": 2, "an": [2, 5, 6, 8, 9, 11], "arz": 2, "ast": 2, "av": 2, "az": 2, "azb": 2, "ba": 2, "bar": 
2, "bcl": 2, "bg": 2, "bh": 2, "bn": 2, "bo": 2, "bpy": 2, "br": 2, "b": 2, "bxr": 2, "ca": 2, "cbk": 2, "ce": 2, "ceb": 2, "ckb": 2, "co": [2, 11], "c": 2, "cv": [2, 6], "cy": 2, "da": [2, 8], "de": [2, 8], "diq": 2, "dsb": 2, "dty": 2, "dv": 2, "el": 2, "eml": 2, "en": 2, "eo": 2, "e": 2, "et": 2, "eu": 2, "fa": 2, "fi": 2, "fr": 2, "frr": 2, "fy": 2, "ga": 2, "gd": 2, "gl": 2, "gn": 2, "gom": 2, "gu": 2, "gv": 2, "he": 2, "hi": 2, "hif": 2, "hr": 2, "hsb": 2, "ht": 2, "hu": 2, "hy": 2, "ia": 2, "id": [2, 11], "ie": 2, "ilo": 2, "io": 2, "ja": 2, "jbo": 2, "jv": 2, "ka": 2, "kk": 2, "km": 2, "kn": 2, "ko": 2, "krc": 2, "ku": 2, "kv": 2, "kw": 2, "ky": 2, "la": 2, "lb": 2, "lez": 2, "li": 2, "lmo": 2, "lo": 2, "lrc": 2, "lt": 2, "lv": 2, "mai": 2, "mg": 2, "mhr": 2, "min": 2, "mk": 2, "ml": 2, "mn": 2, "mr": 2, "mrj": 2, "m": 2, "mt": 2, "mwl": 2, "my": 2, "myv": 2, "mzn": 2, "nah": 2, "nap": 2, "nd": 2, "ne": 2, "new": 2, "nl": 2, "nn": 2, "oc": 2, "o": 2, "pa": 2, "pam": 2, "pfl": 2, "pl": 2, "pm": 2, "pnb": 2, "p": 2, "pt": 2, "qu": 2, "rm": 2, "ro": 2, "ru": 2, "rue": 2, "sa": 2, "sah": 2, "sc": 2, "scn": 2, "sco": 2, "sd": 2, "sh": 2, "si": 2, "sk": 2, "sl": 2, "sq": 2, "sr": 2, "su": 2, "sv": 2, "sw": 2, "ta": 2, "te": 2, "tg": 2, "th": 2, "tk": 2, "tl": 2, "tr": 2, "tt": 2, "tyv": 2, "ug": 2, "uk": 2, "ur": 2, "uz": 2, "vec": 2, "vep": 2, "vi": 2, "vl": 2, "vo": 2, "wa": [2, 5, 11], "war": 2, "wuu": 2, "xal": 2, "xmf": 2, "yi": 2, "yo": 2, "yue": 2, "zh": 2, "static": 2, "get_model_path_and_download": 2, "get": [2, 5, 8], "model": [2, 5, 6, 11], "path": [2, 3, 11], "download": 2, "need": 2, "full": [2, 3, 5], "util": [3, 11], "provid": [3, 6], "function": [3, 5, 6, 7, 10], "other": 3, "fetch_remote_fil": 3, "dirnam": 3, "filenam": [3, 7], "url": [3, 8], "sha256_checksum": 3, "fetch": 3, "remot": 3, "where": [3, 8], "save": [3, 7], "under": 3, "sha256": 3, "checksum": 3, "creat": [3, 5, 7], "rais": [3, 9], "ioerror": 3, "wrong": 3, "mltb2_base_data_dir": 3, 
"none": [3, 5, 7, 8, 11], "dir": 3, "exact": 3, "folder": 3, "append": [3, 6], "If": [3, 5, 7, 8, 9, 11], "default": [3, 7], "user": [3, 6], "platformdir": 3, "user_data_dir": 3, "markdown": 4, "specif": [4, 5, 8, 9, 10, 11], "mdtextsplitt": 4, "max_token": [4, 9], "transformers_token_count": [4, 9], "transformerstokencount": [4, 9, 11], "show_progress_bar": [4, 5, 8, 9, 11], "bool": [4, 5, 6, 7, 8, 9, 10, 11], "fals": [4, 5, 7, 8, 9, 10, 11], "split": [4, 6, 8, 9, 11], "section": [4, 9], "specifi": [4, 5, 9], "maximum": [4, 9], "token": [4, 5, 8, 9, 11], "doe": [4, 6, 7, 9], "divid": [4, 9], "head": 4, "correspond": 4, "paragraph": 4, "per": [4, 6, 9], "can": [4, 5, 7, 12], "onli": [4, 5, 6], "exceed": 4, "singl": [4, 7, 10], "chunk": 4, "alreadi": 4, "larger": [4, 6], "counter": [4, 9], "show": [4, 5, 8, 9, 11], "progressbar": [4, 5, 8, 9, 11], "dure": [4, 5, 8, 9, 11], "process": [4, 5, 8, 9, 11], "md_text": 4, "list": [4, 5, 8, 9, 11], "_chunk_md_by_headlin": 4, "headlin": 4, "chunk_md": 4, "merg": 4, "isol": 4, "subsequ": 4, "end": 4, "without": [4, 5], "content": 4, "remov": [4, 10], "openaiazurechatcomplet": 5, "completion_kwarg": 5, "ani": 5, "openaichatcomplet": 5, "azur": 5, "chat": 5, "complet": 5, "construct": 5, "openaibasecomplet": 5, "from_yaml": 5, "kwarg": 5, "properti": 5, "must": [5, 6], "api_typ": 5, "api_vers": 5, "api_bas": 5, "engin": 5, "quickstart": 5, "start": 5, "gpt": 5, "35": 5, "turbo": 5, "4": [5, 6], "servic": 5, "openaiazurecomplet": 5, "openaicomplet": 5, "non": 5, "gener": [5, 11], "abc": [5, 8], "abstract": [5, 8], "prompt": 5, "map": 5, "openaicompletionansw": 5, "call": [5, 6, 7], "llm": 5, "In": [5, 6], "string": [5, 8], "allow": 5, "overwrit": 5, "exampl": [5, 6, 8], "chang": 5, "temperatur": 5, "_complet": 5, "completion_kwargs_for_this_cal": 5, "openaiobject": 5, "method": [5, 6], "classmethod": 5, "yaml_fil": 5, "yaml": 5, "prompt_token": 5, "completion_token": 5, "total_token": 5, "finish_reason": 5, "answer": 5, 
"result": [5, 6], "name": 5, "ha": [5, 7], "been": 5, "total": 5, "reason": [5, 6], "why": 5, "stop": 5, "mean": [5, 6], "api": [5, 12], "run": 5, "limit": 5, "length": 5, "becaus": 5, "function_cal": 5, "from_open_ai_object": 5, "open_ai_object": 5, "openaitokencount": 5, "model_nam": 5, "count": [5, 11], "some": [5, 12], "3": [5, 6], "5": 5, "davinci": 5, "003": 5, "embed": 5, "ada": 5, "002": 5, "iter": [5, 8, 11], "just": [5, 11], "_check_mandatory_azure_completion_kwarg": 5, "check": [5, 6, 10], "mandatori": 5, "significancerepeatedtrainingprun": 6, "alpha": 6, "float": [6, 8], "0": [6, 7], "1": [6, 11], "n_warmup_step": 6, "baseprun": 6, "pruner": 6, "statist": 6, "signific": 6, "heurist": 6, "decis": 6, "make": [6, 7], "It": [6, 8, 10, 12], "prune": 6, "repeat": 6, "train": [6, 11], "like": 6, "cross": [6, 11], "valid": [6, 11], "As": 6, "test": [6, 11], "t": 6, "our": 6, "experi": 6, "have": 6, "shown": 6, "aplha": 6, "valu": [6, 7], "between": 6, "": [6, 7], "standard": 6, "assum": 6, "adjust": 6, "onc": 6, "hyperparamet": 6, "those": 6, "work": 6, "basi": 6, "intermedi": 6, "For": [6, 7], "epoch": 6, "contrast": 6, "precis": 6, "individu": 6, "fold": [6, 11], "below": 6, "minimalist": 6, "import": [6, 8], "log": 6, "numpi": 6, "np": 6, "sklearn": 6, "dataset": [6, 11], "load_iri": 6, "model_select": 6, "stratifiedkfold": 6, "ensembl": 6, "randomforestclassifi": 6, "metric": 6, "accuracy_scor": 6, "configur": 6, "logger": 6, "debug": 6, "output": [6, 8], "getlogg": 6, "addhandl": 6, "streamhandl": 6, "setlevel": 6, "x": [6, 7], "y": [6, 7], "target": 6, "def": 6, "trial": 6, "min_samples_split": 6, "suggest_int": 6, "2": 6, "20": 6, "n_estim": 6, "100": 6, "validation_result_list": 6, "skf": 6, "n_split": [6, 11], "fold_index": 6, "train_index": 6, "val_index": 6, "enumer": 6, "x_train": 6, "x_val": 6, "y_train": 6, "y_val": 6, "rf": 6, "fit": 6, "y_pred": 6, "predict": 6, "acc": 6, "report": 6, "we": 6, "should": [6, 8], "should_prun": 6, "here": 6, 
"done": 6, "break": 6, "studi": 6, "create_studi": 6, "storag": 6, "sqlite": 6, "db": 6, "memori": 6, "study_nam": 6, "iris_cv": 6, "direct": 6, "maxim": 6, "load_if_exist": 6, "true": [6, 7, 9, 10], "sampler": 6, "tpesampl": 6, "multivari": 6, "add": 6, "optim": 6, "n_trial": 6, "level": 6, "aggress": 6, "smaller": 6, "stronger": 6, "differ": [6, 7], "two": [6, 7, 8], "distribut": 6, "disabl": 6, "until": 6, "reach": 6, "exce": 6, "step": [6, 7], "frozentri": 6, "judg": 6, "whether": 6, "note": 6, "suppos": 6, "librari": 6, "instead": 6, "interfac": 6, "implement": 6, "mechan": 6, "take": 6, "copi": 6, "befor": 6, "modifi": 6, "boolean": 6, "repres": 6, "matplotlib": 7, "boxplot": 7, "titl": 7, "xlabel": 7, "ylabel": 7, "vert": 7, "print": [7, 8], "one": [7, 8], "diagram": 7, "pyplot": 7, "axi": 7, "box": [7, 12], "vertic": 7, "horizont": 7, "boxplot_dict": 7, "values_dict": 7, "form": [7, 10], "dictionari": 7, "save_last_figur": 7, "last": 7, "made": 7, "jupyt": 7, "notebook": 7, "same": 7, "cell": 7, "twin_axes_timeseries_plot": 7, "values_1": 7, "label_1": 7, "values_2": 7, "label_2": 7, "start_timestep_numb": 7, "shift_1": 7, "shift_2": 7, "label_x": 7, "color_1": 7, "tab": 7, "red": 7, "color_2": 7, "blue": 7, "twin": 7, "ax": 7, "timeseri": 7, "curv": 7, "array_lik": 7, "first": 7, "second": 7, "point": 7, "time": 7, "timestep": 7, "shift": 7, "posit": 7, "neg": 7, "color": 7, "jaccardsimilar": 8, "liter": 8, "de_cmc": 8, "en_ptb": 8, "somajobaseclass": 8, "calcul": 8, "jaccard": 8, "similar": 8, "german": 8, "english": 8, "text1": 8, "text2": 8, "get_token_set": 8, "word": [8, 9], "directli": 8, "somajosentencesplitt": [8, 9], "sentenc": [8, 9], "tokenextractor": 8, "extract": 8, "extract_url_set": 8, "token_extractor": 8, "url_set": 8, "ist": 8, "ein": 8, "link": 8, "github": [8, 12], "com": 8, "urlswapp": 8, "url_pattern": 8, "swap": 8, "revers": 8, "replac": [8, 10], "extractor": 8, "pattern": 8, "One": 8, "mark": 8, "place": 8, "put": 8, 
"reverse_swap_url": 8, "revert": 8, "were": 8, "unknown": 8, "swap_url": 8, "detoken": 8, "how": 8, "do": [8, 11], "extract_token_class_set": 8, "keep_token_class": 8, "keep": 8, "all": [8, 12], "kept": 8, "hug": [9, 11], "face": [9, 11], "textsplitt": 9, "somajo_sentence_splitt": 9, "ignore_overly_long_sent": 9, "alwai": 9, "whole": 9, "splitter": 9, "valueerror": 9, "except": 9, "longer": 9, "simpli": 9, "ignor": 9, "detect": 10, "clean": 10, "invis": 10, "charact": 10, "special": 10, "whitespac": 10, "duplic": 10, "clean_all_invisible_chars_and_whitespac": 10, "normal": 10, "multipl": 10, "lead": 10, "trail": 10, "defin": 10, "constant": 10, "invisible_charact": 10, "special_whitespac": 10, "rteturn": 10, "has_invisible_charact": 10, "otherwis": 10, "has_special_whitespac": 10, "remove_invisible_charact": 10, "replace_multiple_whitespac": 10, "replace_special_whitespac": 10, "kfoldlabeleddataset": 11, "7": 11, "n_repeat": 11, "random_st": 11, "k": 11, "labeleddataset": 11, "labeled_dataset": 11, "stratification_label": 11, "encod": 11, "labe": 11, "pretrained_model_name_or_path": 11, "pathlik": 11, "host": 11, "insid": 11, "repo": 11, "huggingfac": 11, "machin": 12, "learn": 12, "avail": 12, "python": 12, "packag": 12, "pypi": 12, "option": 12, "might": 12, "them": 12, "refer": 12, "repositori": 12, "licens": 12, "imprint": 12}, "objects": {"mltb2": [[1, 0, 0, "-", "data"], [2, 0, 0, "-", "fasttext"], [3, 0, 0, "-", "files"], [4, 0, 0, "-", "md"], [5, 0, 0, "-", "openai"], [6, 0, 0, "-", "optuna"], [7, 0, 0, "-", "plot"], [8, 0, 0, "-", "somajo"], [9, 0, 0, "-", "somajo_transformers"], [10, 0, 0, "-", "text"], [11, 0, 0, "-", "transformers"]], "mltb2.data": [[1, 1, 1, "", "_load_colon_data"], [1, 1, 1, "", "_load_colon_label"], [1, 1, 1, "", "load_colon"], [1, 1, 1, "", "load_leukemia_big"], [1, 1, 1, "", "load_prostate"]], "mltb2.fasttext": [[2, 2, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[2, 3, 1, "", 
"__call__"], [2, 3, 1, "", "get_model_path_and_download"]], "mltb2.files": [[3, 1, 1, "", "fetch_remote_file"], [3, 1, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[4, 2, 1, "", "MdTextSplitter"], [4, 1, 1, "", "_chunk_md_by_headline"], [4, 1, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[4, 3, 1, "", "__call__"]], "mltb2.openai": [[5, 2, 1, "", "OpenAiAzureChatCompletion"], [5, 2, 1, "", "OpenAiAzureCompletion"], [5, 2, 1, "", "OpenAiBaseCompletion"], [5, 2, 1, "", "OpenAiChatCompletion"], [5, 2, 1, "", "OpenAiCompletion"], [5, 2, 1, "", "OpenAiCompletionAnswer"], [5, 2, 1, "", "OpenAiTokenCounter"], [5, 1, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[5, 3, 1, "", "__call__"], [5, 3, 1, "", "_completion"], [5, 3, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[5, 3, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[5, 3, 1, "", "__call__"]], "mltb2.optuna": [[6, 2, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[6, 3, 1, "", "prune"]], "mltb2.plot": [[7, 1, 1, "", "boxplot"], [7, 1, 1, "", "boxplot_dict"], [7, 1, 1, "", "save_last_figure"], [7, 1, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[8, 2, 1, "", "JaccardSimilarity"], [8, 2, 1, "", "SoMaJoBaseClass"], [8, 2, 1, "", "SoMaJoSentenceSplitter"], [8, 2, 1, "", "TokenExtractor"], [8, 2, 1, "", "UrlSwapper"], [8, 1, 1, "", "detokenize"], [8, 1, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[8, 3, 1, "", "__call__"], [8, 3, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[8, 3, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[8, 3, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[8, 3, 1, "", "reverse_swap_urls"], [8, 3, 1, "", "swap_urls"]], "mltb2.somajo_transformers": 
[[9, 2, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[9, 3, 1, "", "__call__"]], "mltb2.text": [[10, 1, 1, "", "clean_all_invisible_chars_and_whitespaces"], [10, 1, 1, "", "has_invisible_characters"], [10, 1, 1, "", "has_special_whitespaces"], [10, 1, 1, "", "remove_invisible_characters"], [10, 1, 1, "", "replace_multiple_whitespaces"], [10, 1, 1, "", "replace_special_whitespaces"]], "mltb2.transformers": [[11, 2, 1, "", "KFoldLabeledDataset"], [11, 2, 1, "", "LabeledDataset"], [11, 2, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[11, 3, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[11, 3, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:function", "2": "py:class", "3": "py:method"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "function", "Python function"], "2": ["py", "class", "Python class"], "3": ["py", "method", "Python method"]}, "titleterms": {"api": 0, "refer": 0, "data": 1, "fasttext": 2, "file": 3, "md": 4, "openai": 5, "optuna": 6, "plot": 7, "somajo": 8, "somajo_transform": 9, "text": 10, "transform": 11, "mltb2": 12, "document": 12, "instal": 12, "content": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "data": [[1, "module-mltb2.data"]], "fasttext": [[2, "module-mltb2.fasttext"]], "files": [[3, "module-mltb2.files"]], "md": [[4, "module-mltb2.md"]], "openai": [[5, "module-mltb2.openai"]], "optuna": [[6, "module-mltb2.optuna"]], "plot": [[7, "module-mltb2.plot"]], "somajo": [[8, "module-mltb2.somajo"]], "somajo_transformers": [[9, 
"module-mltb2.somajo_transformers"]], "text": [[10, "module-mltb2.text"]], "transformers": [[11, "module-mltb2.transformers"]], "MLTB2 Documentation": [[12, "mltb2-documentation"]], "Installation": [[12, "installation"]], "Content": [[12, "content"]]}, "indexentries": {"_load_colon_data() (in module mltb2.data)": [[1, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[1, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[1, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[1, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[1, "mltb2.data.load_prostate"]], "mltb2.data": [[1, "module-mltb2.data"]], "module": [[1, "module-mltb2.data"], [2, "module-mltb2.fasttext"], [3, "module-mltb2.files"], [4, "module-mltb2.md"], [5, "module-mltb2.openai"], [6, "module-mltb2.optuna"], [7, "module-mltb2.plot"], [8, "module-mltb2.somajo"], [9, "module-mltb2.somajo_transformers"], [10, "module-mltb2.text"], [11, "module-mltb2.transformers"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[2, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[2, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[3, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[3, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[3, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[4, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[4, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[4, 
"mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[4, "mltb2.md.chunk_md"]], "mltb2.md": [[4, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[5, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[5, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[5, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[5, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[5, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[5, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[5, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[6, "module-mltb2.optuna"]], "prune() 
(mltb2.optuna.significancerepeatedtrainingpruner method)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[7, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[7, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[7, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[8, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[8, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[8, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[8, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[8, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[8, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[8, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[9, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[9, "mltb2.somajo_transformers.TextSplitter.__call__"]], 
"mltb2.somajo_transformers": [[9, "module-mltb2.somajo_transformers"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "has_invisible_characters() (in module mltb2.text)": [[10, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[10, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[10, "mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[11, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[11, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[11, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[11, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[11, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[11, "mltb2.transformers.KFoldLabeledDataset.split"]]}}) \ No newline at end of file