From 34cee65395d5c75a52d403854938a76114751e7f Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sun, 10 Dec 2023 20:11:36 +0000 Subject: [PATCH] deploy: a3d7bec85f7bd44ebb576d930c9f15e4e482f148 --- _modules/mltb2/text.html | 50 ++++++++++++++++++++++++++++++++---- api-reference/text.html | 53 ++++++++++++++++++++++++++++++++++++--- genindex.html | 6 +++++ objects.inv | Bin 1307 -> 1332 bytes searchindex.js | 2 +- 5 files changed, 101 insertions(+), 10 deletions(-) diff --git a/_modules/mltb2/text.html b/_modules/mltb2/text.html index 6cea366..f5bf5fc 100644 --- a/_modules/mltb2/text.html +++ b/_modules/mltb2/text.html @@ -84,7 +84,8 @@

Source code for mltb2.text

 
 """Text specific module."""
 
-from typing import Dict, Final, Tuple
+import re
+from typing import Dict, Final, Pattern, Tuple
 
 INVISIBLE_CHARACTERS: Final[Tuple[str, ...]] = (
     "\u200b",  # Zero Width Space (ZWSP) https://www.compart.com/en/unicode/U+200b
@@ -116,11 +117,15 @@ 

Source code for mltb2.text

 
 SPECIAL_WHITESPACES_TRANS: Final[Dict[int, str]] = str.maketrans({char: " " for char in SPECIAL_WHITESPACES})
 
+INVISIBLE_CHARACTERS_AND_SPECIAL_WHITESPACES_TRANS = {**SPECIAL_WHITESPACES_TRANS, **INVISIBLE_CHARACTERS_TRANS}
+
+MULTI_SPACE_PATTERN: Pattern = re.compile(r" {2,}")
+
 
 
[docs]def remove_invisible_characters(text: str) -> str: """Remove invisible characters from text. - The invisible characters are defined in the constant `INVISIBLE_CHARACTERS`. + The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``. Args: text: The text from which the invisible characters are to be removed. @@ -134,7 +139,7 @@

Source code for mltb2.text

 
[docs]def has_invisible_characters(text: str) -> bool: """Check if text contains invisible characters. - The invisible characters are defined in the constant `INVISIBLE_CHARACTERS`. + The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``. Args: text: The text to check. @@ -148,7 +153,7 @@

Source code for mltb2.text

 
[docs]def replace_special_whitespaces(text: str) -> str: """Replace special whitespaces with normal whitespaces. - The special whitespaces are defined in the constant `SPECIAL_WHITESPACES`. + The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``. Args: text: The text from which the special whitespaces are to be replaced. @@ -162,7 +167,7 @@

Source code for mltb2.text

 
[docs]def has_special_whitespaces(text: str) -> bool: """Check if text contains special whitespaces. - The special whitespaces are defined in the constant `SPECIAL_WHITESPACES`. + The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``. Args: text: The text to check. @@ -171,6 +176,41 @@

Source code for mltb2.text

         ``True`` if the text contains special whitespaces, ``False`` otherwise.
     """
     return any(char in text for char in SPECIAL_WHITESPACES)
+ + +
[docs]def replace_multiple_whitespaces(text: str) -> str: + """Replace multiple whitespaces with single whitespace. + + Args: + text: The text from which the multiple whitespaces are to be replaced. + + Returns: + The cleaned text. + """ + return MULTI_SPACE_PATTERN.sub(" ", text)
+ + +
[docs]def clean_all_invisible_chars_and_whitespaces(text: str) -> str: + """Clean text from invisible characters and whitespaces. + + - Remove invisible characters from text. + - Replace special whitespaces with normal whitespaces. + - Replace multiple whitespaces with single whitespace. + - Remove leading and trailing whitespaces. + + The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``. + The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``. + + Args: + text: The text to clean. + + Returns: + The cleaned text. + """ + text = text.translate(INVISIBLE_CHARACTERS_AND_SPECIAL_WHITESPACES_TRANS) + text = replace_multiple_whitespaces(text) + text = text.strip() + return text
diff --git a/api-reference/text.html b/api-reference/text.html index ac2d756..ce28338 100644 --- a/api-reference/text.html +++ b/api-reference/text.html @@ -60,9 +60,11 @@
  • somajo
  • somajo_transformers
  • text
  • @@ -103,11 +105,37 @@

    text

    Text specific module.

    +
    +
    +mltb2.text.clean_all_invisible_chars_and_whitespaces(text: str) str[source]
    +

    Clean text from invisible characters and whitespaces.

    +
      +
    • Remove invisible characters from text.

    • +
    • Replace special whitespaces with normal whitespaces.

    • +
    • Replace multiple whitespaces with single whitespace.

    • +
    • Remove leading and trailing whitespaces.

    • +
    +

    The invisible characters are defined in the constant INVISIBLE_CHARACTERS. +The special whitespaces are defined in the constant SPECIAL_WHITESPACES.

    +
    +
    Parameters:
    +

    text (str) – The text to clean.

    +
    +
    Return type:
    +

    str

    +
    +
    +
    +
    Returns:

    The cleaned text.

    +
    +
    +
    +
    mltb2.text.has_invisible_characters(text: str) bool[source]

    Check if text contains invisible characters.

    -

    The invisible characters are defined in the constant INVISIBLE_CHARACTERS.

    +

    The invisible characters are defined in the constant INVISIBLE_CHARACTERS.

    Parameters:

    text (str) – The text to check.

    @@ -125,7 +153,7 @@
    mltb2.text.has_special_whitespaces(text: str) bool[source]

    Check if text contains special whitespaces.

    -

    The special whitespaces are defined in the constant SPECIAL_WHITESPACES.

    +

    The special whitespaces are defined in the constant SPECIAL_WHITESPACES.

    Parameters:

    text (str) – The text to check.

    @@ -143,7 +171,7 @@
    mltb2.text.remove_invisible_characters(text: str) str[source]

    Remove invisible characters from text.

    -

    The invisible characters are defined in the constant INVISIBLE_CHARACTERS.

    +

    The invisible characters are defined in the constant INVISIBLE_CHARACTERS.

    Parameters:

    text (str) – The text from which the invisible characters are to be removed.

    @@ -157,11 +185,28 @@
    +
    +
    +mltb2.text.replace_multiple_whitespaces(text: str) str[source]
    +

    Replace multiple whitespaces with single whitespace.

    +
    +
    Parameters:
    +

    text (str) – The text from which the multiple whitespaces are to be replaced.

    +
    +
    Returns:
    +

    The cleaned text.

    +
    +
    Return type:
    +

    str

    +
    +
    +
    +
    mltb2.text.replace_special_whitespaces(text: str) str[source]

    Replace special whitespaces with normal whitespaces.

    -

    The special whitespaces are defined in the constant SPECIAL_WHITESPACES.

    +

    The special whitespaces are defined in the constant SPECIAL_WHITESPACES.

    Parameters:

    text (str) – The text from which the special whitespaces are to be replaced.

    diff --git a/genindex.html b/genindex.html index beab39b..63651c1 100644 --- a/genindex.html +++ b/genindex.html @@ -159,6 +159,10 @@

    C

    +
    @@ -405,6 +409,8 @@

    R

      diff --git a/objects.inv b/objects.inv index d637bf1c94e27809c313e43590d9bec1dffbb5a7..0b7cd492c44bb49fa054e42c48985e1cc08c1317 100644 GIT binary patch delta 1232 zcmV;>1TXuW3bYE4bbpy&bK58oz~A*LG}C*v@2-!1OVf5Hy<{#^=bjl1Ynujv22dRL z(;t5%OR@!)CHE<7!Twklc7aX7&HCe#VuSPY@Trir;*kH9Oc9Y>3e=!Bl^}*mk^dcqB`6%PfltG48nMCJ70O45cGqd{qtHDcz z2<99iKLs#5NpC*H77Vfz2aZZ?b_ffKsN4(Z`YEF_4m#y<*BZ`*(&JEWkH29W!d zT=x#|6yzr6W`9Ez*xiGClaDM!nYtcY(8RO~8kv)Ac%shxtCBP8F%{n{4I`mEWINawgJ_9mF)0D>Y>a0<(nsLWhV8>&AcWpF4KO(EC){nE(q* z?7AS#kX-)+V&<9qB40t=%SxD;rglNI!^UP!AU5CGQv@5S4hXw!(ODz(y2om(lj$%v zf~f>frbO2vni75Eifx5I$gRQ;DBUqfUleA7iS4thMB0}ve*$<}x<5(B4)IGaQ`a%q z9GS9jT7S#;h7k34%NSO+r$3oU(D)7Pty&v{FTnJ(@!XH58ANpx2_<02esFjKKB`Gmhqe`xtx zPMdC^tn5|V^}T$%n*f{bK%Wf1z8i&!OwiO*5F}J7K4%gBQ+&0@WjWbpEnunO z4U`6??Qdt7o^RB0Hj9GasO(odOC%YMufvx2PHKEU35YCC=#5l`_1DF;(d(VeA^o1? zUOUk-#>2C(2^p+0a zuzx-Q+Z|Ca-!z&ofiTSHo95o=_CUOybqY#u%|S)Q4YOCQTbLM#H<|Oe;29A$y)OTJ zlbk-fQXSAQ*x)+AhodIBk1tS{+Qm&GRz|~bZtY9w$GE4!Z1kAqF+51GPp%F9L?y`= zC(6Eme)s&4|7bpC=OxRidRx%UosK_JFWA?RqGm1 zUeIjlH56Cuyq5AHXG8Cb6yW&o$b+1nU!>#e$7^=(q-^Lgi7R)`-aN?J#nb2#?fmG< z2O6^T^c4rU`(j{EjI-_BsGc>x%1R^!?#k7rlB4Ty{YzVxOw6;zVH{W`kZo}!I{oFD-2HW`S$DWV7t0!=7`LM}7TmJ{cg|%xONoqO( delta 1207 zcmV;o1W5a|3Y!X$bbpx5lG`W{fbV(=Rb{XC?&g@=WRg@(cBXbJo;_0-+BO*kDnMg9 zPrv*~mNXVvmYkEZpg)?XyMZpaS$|woY;ay4KNXTz9P+<%T(X6N4JZ%<{&d~TohkT- zFktaVzQ~G`;KseU=OsziuYVZ;v zf;mUXF9FO>(wk4Q1%vFufujnHuth9Ii#bn zi%s%w!}^T%??H{(_6bnZ)AeTDjgkdA;)2hk-f)xqsDIln{8sAyR@Lr=-lx*e1X$Q& zHw0mZI6wwF{aZHa2SlvH8xPBG^cEMA&7E&K{xHJyu(tOoy=< zOeJtKCAtC8l;}HGY%BahZUufo>5e)2qA(LoY@byn(za~*6Tr*T{Ui-L#4ou_UB_H^ zWXZm3Eq@OUA!_fIF)VCPe=?Dv@f+A%wKn!%vbCNtjbir9#?1jVGOPd%irMC*WocR9 zpE6<3Fd$;ly7N0G1S`5?1>;yTbBy~;7k8O6Cj71>x+|S=5soyNso0f#!r!GowEQZk zEjLhB_A2ehUcTK;fXz0bPljI)Mqy&p*F@d=kbfnF7OE7VvkCtxzS`rmoNTfduvG8{ zN(0jNztc<4S86$%MZr5N`<2cTNk(Jq;PT!`jm;+kk;MtUk*cuvx|lY4qmwzLKXTk_ zCpyP?`1trpPII9|kSGs?F02s~Z%nf_w|9dbc3W24H@ZthPv;X&mp~ZiXuo3ybcqQ} 
zgnzk5v^}wIr=5b5TXk-u;W%F$;e@>*|9q32KDx>m&@c9}U&qU^6DHl_0(GgK8zx+D zH2mh;zI6VKyIIdhkBP)ZRjT|Nxpa|?8oPK&ky;J=2LcF0gQ^b^}*b=vf*M@ zT)bEA%A0@LK-nn4I8~K$-^|{ZP2!?;Nq;3TXg2h?hzoWeA9-kJL)Y{a;B<}8LpwXa zm&V0Ux7OUXvZ34@7w#Nod1z-Bllvvw`ALuubY$o0D-Le`V!$WH>2|JEPa9vfB$5Jh z(vA{J`*AP-;93hOvT!2a=NqpbMe5ggn}03w6awCr$06IQs&95+J{-38wj?MV`hSe( zYcR`$Yi8}R+M1maPmeF3KjvQ~sS10Y_9gDDpraH#6>({+nvFPn-VEqf9v4kVJ#jd{ zf`%d1M$Ou-4}^9*dmXYN@J7vkLvscnet_pJO!cUNC+`yVbhRAOq2U1$y8jSta4A+{{d12bGHuZS-AiJ diff --git a/searchindex.js b/searchindex.js index bba8727..faff599 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["api-reference", "api-reference/data", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/data.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "data", "fasttext", "files", "md", "openai", "optuna", "plot", "somajo", "somajo_transformers", "text", "transformers", "MLTB2 Documentation"], "terms": {"data": [0, 3, 6, 11, 12], "fasttext": [0, 12], "file": [0, 2, 5, 12], "md": [0, 12], "openai": [0, 12], "optuna": [0, 12], "plot": [0, 12], "somajo": [0, 9, 12], "somajo_transform": [0, 12], "text": [0, 2, 4, 5, 8, 9, 11, 12], "transform": [0, 9, 12], "load": 1, "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "pip": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "instal": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "necessari": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "depend": [1, 2, 3, 4, 5, 
6, 7, 8, 9, 11, 12], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "_load_colon_data": 1, "datafram": 1, "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "colon": 1, "label": [1, 7, 11], "The": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "pars": 1, "from": [1, 2, 3, 5, 6, 8, 10], "internet": 1, "also": [1, 5], "see": [1, 6], "http": [1, 8], "genom": 1, "pub": 1, "princeton": 1, "edu": 1, "oncologi": 1, "affydata": 1, "index": [1, 12], "html": 1, "return": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "panda": 1, "type": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "_load_colon_label": 1, "seri": 1, "load_colon": 1, "tupl": [1, 8], "contain": [1, 2, 8, 10, 11], "load_leukemia_big": 1, "leukemia": 1, "big": 1, "web": 1, "stanford": 1, "hasti": 1, "casi_fil": 1, "load_prost": 1, "prostat": 1, "specif": [2, 4, 5, 6, 8, 9, 10, 11], "base": [2, 3, 4, 5, 6, 7, 8, 9, 11], "class": [2, 4, 5, 6, 8, 9, 11], "fasttextlanguageidentif": 2, "object": [2, 4, 5, 6, 8, 9, 11], "identifi": 2, "languag": [2, 8], "__call__": [2, 4, 5, 8, 9, 11], "str": [2, 3, 4, 5, 7, 8, 9, 10, 11], "num_lang": 2, "int": [2, 4, 5, 6, 7, 9, 11], "10": [2, 6], "given": [2, 6], "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "which": [2, 3, 5, 6, 8, 10, 11], "recogn": 2, "number": [2, 4, 5, 6, 7, 8, 9, 11], "A": [2, 6, 12], "dict": [2, 5], "probabl": 2, "more": [2, 6, 7], "than": [2, 9], "element": 2, "so": 2, "guarante": 2, "you": [2, 5, 6, 12], "want": 2, "includ": 2, "case": [2, 5], "when": [2, 5], "veri": 2, "low": 2, "possibl": 2, "ar": [2, 4, 5, 6, 10, 11], "af": 2, "al": 2, "am": 2, "an": [2, 5, 6, 8, 9, 11], "arz": 2, "ast": 2, "av": 2, "az": 2, "azb": 2, "ba": 2, "bar": 2, "bcl": 2, "bg": 2, "bh": 2, "bn": 2, "bo": 2, "bpy": 2, "br": 2, "b": 2, "bxr": 2, "ca": 2, "cbk": 2, "ce": 2, "ceb": 2, "ckb": 2, "co": [2, 11], "c": 2, "cv": [2, 6], "cy": 2, "da": [2, 8], "de": [2, 8], "diq": 2, "dsb": 2, "dty": 2, "dv": 2, "el": 2, "eml": 2, "en": 2, "eo": 2, "e": 
2, "et": 2, "eu": 2, "fa": 2, "fi": 2, "fr": 2, "frr": 2, "fy": 2, "ga": 2, "gd": 2, "gl": 2, "gn": 2, "gom": 2, "gu": 2, "gv": 2, "he": 2, "hi": 2, "hif": 2, "hr": 2, "hsb": 2, "ht": 2, "hu": 2, "hy": 2, "ia": 2, "id": [2, 11], "ie": 2, "ilo": 2, "io": 2, "ja": 2, "jbo": 2, "jv": 2, "ka": 2, "kk": 2, "km": 2, "kn": 2, "ko": 2, "krc": 2, "ku": 2, "kv": 2, "kw": 2, "ky": 2, "la": 2, "lb": 2, "lez": 2, "li": 2, "lmo": 2, "lo": 2, "lrc": 2, "lt": 2, "lv": 2, "mai": 2, "mg": 2, "mhr": 2, "min": 2, "mk": 2, "ml": 2, "mn": 2, "mr": 2, "mrj": 2, "m": 2, "mt": 2, "mwl": 2, "my": 2, "myv": 2, "mzn": 2, "nah": 2, "nap": 2, "nd": 2, "ne": 2, "new": 2, "nl": 2, "nn": 2, "oc": 2, "o": 2, "pa": 2, "pam": 2, "pfl": 2, "pl": 2, "pm": 2, "pnb": 2, "p": 2, "pt": 2, "qu": 2, "rm": 2, "ro": 2, "ru": 2, "rue": 2, "sa": 2, "sah": 2, "sc": 2, "scn": 2, "sco": 2, "sd": 2, "sh": 2, "si": 2, "sk": 2, "sl": 2, "sq": 2, "sr": 2, "su": 2, "sv": 2, "sw": 2, "ta": 2, "te": 2, "tg": 2, "th": 2, "tk": 2, "tl": 2, "tr": 2, "tt": 2, "tyv": 2, "ug": 2, "uk": 2, "ur": 2, "uz": 2, "vec": 2, "vep": 2, "vi": 2, "vl": 2, "vo": 2, "wa": [2, 5, 11], "war": 2, "wuu": 2, "xal": 2, "xmf": 2, "yi": 2, "yo": 2, "yue": 2, "zh": 2, "static": 2, "get_model_path_and_download": 2, "get": [2, 5, 8], "model": [2, 5, 6, 11], "path": [2, 3, 11], "download": 2, "need": 2, "full": [2, 3, 5], "util": [3, 11], "fetch_remote_fil": 3, "dirnam": 3, "filenam": [3, 7], "url": [3, 8], "sha256_checksum": 3, "fetch": 3, "remot": 3, "directori": [3, 11], "where": [3, 8], "save": [3, 7], "under": 3, "sha256": 3, "checksum": 3, "creat": [3, 5, 7], "rais": [3, 9], "ioerror": 3, "wrong": 3, "get_and_create_mltb2_data_dir": 3, "mltb2_base_data_dir": 3, "none": [3, 5, 7, 8, 11], "mltb": 3, "dir": 3, "If": [3, 5, 7, 8, 9, 11], "default": [3, 7], "user": [3, 6], "markdown": 4, "mdtextsplitt": 4, "max_token": [4, 9], "transformers_token_count": [4, 9], "transformerstokencount": [4, 9, 11], "show_progress_bar": [4, 5, 8, 9, 11], "bool": [4, 5, 
6, 7, 8, 9, 10, 11], "fals": [4, 5, 7, 8, 9, 10, 11], "split": [4, 6, 8, 9, 11], "section": [4, 9], "specifi": [4, 5, 9], "maximum": [4, 9], "token": [4, 5, 8, 9, 11], "doe": [4, 6, 7, 9], "divid": [4, 9], "head": 4, "correspond": 4, "paragraph": 4, "per": [4, 6, 9], "can": [4, 5, 7, 12], "onli": [4, 5, 6], "exceed": 4, "singl": [4, 7], "chunk": 4, "alreadi": 4, "larger": [4, 6], "counter": [4, 9], "show": [4, 5, 8, 9, 11], "progressbar": [4, 5, 8, 9, 11], "dure": [4, 5, 8, 9, 11], "process": [4, 5, 8, 9, 11], "md_text": 4, "list": [4, 5, 8, 9, 11], "_chunk_md_by_headlin": 4, "headlin": 4, "chunk_md": 4, "merg": 4, "isol": 4, "subsequ": 4, "end": 4, "without": [4, 5], "content": 4, "remov": [4, 10], "openaiazurechatcomplet": 5, "completion_kwarg": 5, "ani": 5, "openaichatcomplet": 5, "azur": 5, "chat": 5, "complet": 5, "construct": 5, "openaibasecomplet": 5, "from_yaml": 5, "kwarg": 5, "function": [5, 6, 7], "follow": 5, "properti": 5, "must": [5, 6], "api_typ": 5, "api_vers": 5, "api_bas": 5, "engin": 5, "quickstart": 5, "start": 5, "gpt": 5, "35": 5, "turbo": 5, "4": [5, 6], "servic": 5, "openaiazurecomplet": 5, "openaicomplet": 5, "non": 5, "gener": [5, 11], "abc": [5, 8], "abstract": [5, 8], "prompt": 5, "map": 5, "openaicompletionansw": 5, "call": [5, 6, 7], "llm": 5, "In": [5, 6], "string": [5, 8], "allow": 5, "overwrit": 5, "exampl": [5, 6, 8], "chang": 5, "temperatur": 5, "_complet": 5, "completion_kwargs_for_this_cal": 5, "openaiobject": 5, "method": [5, 6], "classmethod": 5, "yaml_fil": 5, "yaml": 5, "prompt_token": 5, "completion_token": 5, "total_token": 5, "finish_reason": 5, "answer": 5, "result": [5, 6], "name": 5, "ha": [5, 7], "been": 5, "total": 5, "reason": [5, 6], "why": 5, "stop": 5, "mean": [5, 6], "api": [5, 12], "run": 5, "limit": 5, "length": 5, "becaus": 5, "function_cal": 5, "from_open_ai_object": 5, "open_ai_object": 5, "openaitokencount": 5, "model_nam": 5, "count": [5, 11], "some": [5, 12], "3": [5, 6], "5": 5, "davinci": 5, "003": 5, 
"embed": 5, "ada": 5, "002": 5, "iter": [5, 8, 11], "just": [5, 11], "_check_mandatory_azure_completion_kwarg": 5, "check": [5, 6, 10], "mandatori": 5, "significancerepeatedtrainingprun": 6, "alpha": 6, "float": [6, 8], "0": [6, 7], "1": [6, 11], "n_warmup_step": 6, "baseprun": 6, "pruner": 6, "statist": 6, "signific": 6, "heurist": 6, "decis": 6, "make": [6, 7], "It": [6, 8, 12], "prune": 6, "repeat": 6, "train": [6, 11], "like": 6, "cross": [6, 11], "valid": [6, 11], "As": 6, "test": [6, 11], "t": 6, "our": 6, "experi": 6, "have": 6, "shown": 6, "aplha": 6, "valu": [6, 7], "between": 6, "": [6, 7], "standard": 6, "assum": 6, "adjust": 6, "onc": 6, "hyperparamet": 6, "set": [6, 8, 11], "those": 6, "work": 6, "basi": 6, "intermedi": 6, "For": [6, 7], "epoch": 6, "contrast": 6, "precis": 6, "individu": 6, "fold": [6, 11], "below": 6, "minimalist": 6, "import": [6, 8], "log": 6, "numpi": 6, "np": 6, "sklearn": 6, "dataset": [6, 11], "load_iri": 6, "model_select": 6, "stratifiedkfold": 6, "ensembl": 6, "randomforestclassifi": 6, "metric": 6, "accuracy_scor": 6, "configur": 6, "logger": 6, "debug": 6, "output": [6, 8], "getlogg": 6, "addhandl": 6, "streamhandl": 6, "setlevel": 6, "x": [6, 7], "y": [6, 7], "target": 6, "def": 6, "trial": 6, "min_samples_split": 6, "suggest_int": 6, "2": 6, "20": 6, "n_estim": 6, "100": 6, "validation_result_list": 6, "skf": 6, "n_split": [6, 11], "fold_index": 6, "train_index": 6, "val_index": 6, "enumer": 6, "x_train": 6, "x_val": 6, "y_train": 6, "y_val": 6, "rf": 6, "fit": 6, "y_pred": 6, "predict": 6, "acc": 6, "append": 6, "report": 6, "we": 6, "should": [6, 8], "should_prun": 6, "here": 6, "done": 6, "break": 6, "studi": 6, "create_studi": 6, "storag": 6, "sqlite": 6, "db": 6, "memori": 6, "study_nam": 6, "iris_cv": 6, "direct": 6, "maxim": 6, "load_if_exist": 6, "true": [6, 7, 9, 10], "sampler": 6, "tpesampl": 6, "multivari": 6, "add": 6, "optim": 6, "n_trial": 6, "level": 6, "aggress": 6, "smaller": 6, "stronger": 6, "differ": 
[6, 7], "two": [6, 7, 8], "distribut": 6, "disabl": 6, "until": 6, "reach": 6, "exce": 6, "step": [6, 7], "frozentri": 6, "judg": 6, "whether": 6, "note": 6, "suppos": 6, "librari": 6, "instead": 6, "provid": 6, "interfac": 6, "implement": 6, "mechan": 6, "take": 6, "copi": 6, "befor": 6, "modifi": 6, "boolean": 6, "repres": 6, "tool": [7, 8, 12], "matplotlib": 7, "boxplot": 7, "titl": 7, "xlabel": 7, "ylabel": 7, "vert": 7, "print": [7, 8], "one": [7, 8], "diagram": 7, "pyplot": 7, "axi": 7, "box": [7, 12], "vertic": 7, "horizont": 7, "boxplot_dict": 7, "values_dict": 7, "form": 7, "dictionari": 7, "save_last_figur": 7, "last": 7, "jupyt": 7, "notebook": 7, "same": 7, "cell": 7, "twin_axes_timeseries_plot": 7, "values_1": 7, "label_1": 7, "values_2": 7, "label_2": 7, "start_timestep_numb": 7, "shift_1": 7, "shift_2": 7, "label_x": 7, "color_1": 7, "tab": 7, "red": 7, "color_2": 7, "blue": 7, "twin": 7, "ax": 7, "timeseri": 7, "curv": 7, "array_lik": 7, "first": 7, "second": 7, "point": 7, "time": 7, "timestep": 7, "shift": 7, "posit": 7, "neg": 7, "color": 7, "jaccardsimilar": 8, "somajobaseclass": 8, "calcul": 8, "jaccard": 8, "similar": 8, "de_cmc": 8, "german": 8, "en_ptb": 8, "english": 8, "text1": 8, "text2": 8, "get_token_set": 8, "word": [8, 9], "directli": 8, "somajosentencesplitt": [8, 9], "sentenc": [8, 9], "tokenextractor": 8, "extract": 8, "extract_url_set": 8, "token_extractor": 8, "url_set": 8, "ist": 8, "ein": 8, "link": 8, "github": [8, 12], "com": 8, "urlswapp": 8, "url_pattern": 8, "swap": 8, "revers": 8, "replac": [8, 10], "extractor": 8, "pattern": 8, "One": 8, "mark": 8, "place": 8, "put": 8, "reverse_swap_url": 8, "revert": 8, "were": 8, "unknown": 8, "swap_url": 8, "detoken": 8, "convert": 8, "how": 8, "do": [8, 11], "extract_token_class_set": 8, "keep_token_class": 8, "keep": 8, "all": [8, 12], "kept": 8, "hug": [9, 11], "face": [9, 11], "textsplitt": 9, "somajo_sentence_splitt": 9, "ignore_overly_long_sent": 9, "alwai": 9, "whole": 9, 
"splitter": 9, "valueerror": 9, "except": 9, "longer": 9, "simpli": 9, "ignor": 9, "has_invisible_charact": 10, "invis": 10, "charact": 10, "defin": 10, "constant": 10, "invisible_charact": 10, "otherwis": 10, "has_special_whitespac": 10, "special": 10, "whitespac": 10, "special_whitespac": 10, "remove_invisible_charact": 10, "clean": 10, "replace_special_whitespac": 10, "normal": 10, "kfoldlabeleddataset": 11, "7": 11, "n_repeat": 11, "random_st": 11, "k": 11, "labeleddataset": 11, "labeled_dataset": 11, "stratification_label": 11, "encod": 11, "labe": 11, "pretrained_model_name_or_path": 11, "pathlik": 11, "host": 11, "insid": 11, "repo": 11, "huggingfac": 11, "machin": 12, "learn": 12, "avail": 12, "python": 12, "packag": 12, "pypi": 12, "option": 12, "might": 12, "them": 12, "refer": 12, "repositori": 12, "licens": 12, "imprint": 12}, "objects": {"mltb2": [[1, 0, 0, "-", "data"], [2, 0, 0, "-", "fasttext"], [3, 0, 0, "-", "files"], [4, 0, 0, "-", "md"], [5, 0, 0, "-", "openai"], [6, 0, 0, "-", "optuna"], [7, 0, 0, "-", "plot"], [8, 0, 0, "-", "somajo"], [9, 0, 0, "-", "somajo_transformers"], [10, 0, 0, "-", "text"], [11, 0, 0, "-", "transformers"]], "mltb2.data": [[1, 1, 1, "", "_load_colon_data"], [1, 1, 1, "", "_load_colon_label"], [1, 1, 1, "", "load_colon"], [1, 1, 1, "", "load_leukemia_big"], [1, 1, 1, "", "load_prostate"]], "mltb2.fasttext": [[2, 2, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[2, 3, 1, "", "__call__"], [2, 3, 1, "", "get_model_path_and_download"]], "mltb2.files": [[3, 1, 1, "", "fetch_remote_file"], [3, 1, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[4, 2, 1, "", "MdTextSplitter"], [4, 1, 1, "", "_chunk_md_by_headline"], [4, 1, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[4, 3, 1, "", "__call__"]], "mltb2.openai": [[5, 2, 1, "", "OpenAiAzureChatCompletion"], [5, 2, 1, "", "OpenAiAzureCompletion"], [5, 2, 1, "", "OpenAiBaseCompletion"], [5, 2, 1, "", "OpenAiChatCompletion"], 
[5, 2, 1, "", "OpenAiCompletion"], [5, 2, 1, "", "OpenAiCompletionAnswer"], [5, 2, 1, "", "OpenAiTokenCounter"], [5, 1, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[5, 3, 1, "", "__call__"], [5, 3, 1, "", "_completion"], [5, 3, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[5, 3, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[5, 3, 1, "", "__call__"]], "mltb2.optuna": [[6, 2, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[6, 3, 1, "", "prune"]], "mltb2.plot": [[7, 1, 1, "", "boxplot"], [7, 1, 1, "", "boxplot_dict"], [7, 1, 1, "", "save_last_figure"], [7, 1, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[8, 2, 1, "", "JaccardSimilarity"], [8, 2, 1, "", "SoMaJoBaseClass"], [8, 2, 1, "", "SoMaJoSentenceSplitter"], [8, 2, 1, "", "TokenExtractor"], [8, 2, 1, "", "UrlSwapper"], [8, 1, 1, "", "detokenize"], [8, 1, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[8, 3, 1, "", "__call__"], [8, 3, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[8, 3, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[8, 3, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[8, 3, 1, "", "reverse_swap_urls"], [8, 3, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[9, 2, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[9, 3, 1, "", "__call__"]], "mltb2.text": [[10, 1, 1, "", "has_invisible_characters"], [10, 1, 1, "", "has_special_whitespaces"], [10, 1, 1, "", "remove_invisible_characters"], [10, 1, 1, "", "replace_special_whitespaces"]], "mltb2.transformers": [[11, 2, 1, "", "KFoldLabeledDataset"], [11, 2, 1, "", "LabeledDataset"], [11, 2, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[11, 3, 1, "", "split"]], 
"mltb2.transformers.TransformersTokenCounter": [[11, 3, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:function", "2": "py:class", "3": "py:method"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "function", "Python function"], "2": ["py", "class", "Python class"], "3": ["py", "method", "Python method"]}, "titleterms": {"api": 0, "refer": 0, "data": 1, "fasttext": 2, "file": 3, "md": 4, "openai": 5, "optuna": 6, "plot": 7, "somajo": 8, "somajo_transform": 9, "text": 10, "transform": 11, "mltb2": 12, "document": 12, "instal": 12, "content": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "data": [[1, "module-mltb2.data"]], "fasttext": [[2, "module-mltb2.fasttext"]], "files": [[3, "module-mltb2.files"]], "md": [[4, "module-mltb2.md"]], "openai": [[5, "module-mltb2.openai"]], "optuna": [[6, "module-mltb2.optuna"]], "plot": [[7, "module-mltb2.plot"]], "somajo": [[8, "module-mltb2.somajo"]], "somajo_transformers": [[9, "module-mltb2.somajo_transformers"]], "text": [[10, "module-mltb2.text"]], "transformers": [[11, "module-mltb2.transformers"]], "MLTB2 Documentation": [[12, "mltb2-documentation"]], "Installation": [[12, "installation"]], "Content": [[12, "content"]]}, "indexentries": {"_load_colon_data() (in module mltb2.data)": [[1, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[1, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[1, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[1, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[1, 
"mltb2.data.load_prostate"]], "mltb2.data": [[1, "module-mltb2.data"]], "module": [[1, "module-mltb2.data"], [2, "module-mltb2.fasttext"], [3, "module-mltb2.files"], [4, "module-mltb2.md"], [5, "module-mltb2.openai"], [6, "module-mltb2.optuna"], [7, "module-mltb2.plot"], [8, "module-mltb2.somajo"], [9, "module-mltb2.somajo_transformers"], [10, "module-mltb2.text"], [11, "module-mltb2.transformers"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[2, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[2, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[3, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[3, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[3, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[4, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[4, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[4, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[4, "mltb2.md.chunk_md"]], "mltb2.md": [[4, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[5, 
"mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[5, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[5, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[5, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[5, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[5, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[5, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[5, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[6, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[7, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[7, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[7, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[8, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in 
mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[8, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[8, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[8, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[8, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[8, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[8, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[9, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[9, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[9, "module-mltb2.somajo_transformers"]], "has_invisible_characters() (in module mltb2.text)": [[10, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[10, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[10, "mltb2.text.remove_invisible_characters"]], "replace_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[11, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[11, 
"mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[11, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[11, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[11, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[11, "mltb2.transformers.KFoldLabeledDataset.split"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["api-reference", "api-reference/data", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/data.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "data", "fasttext", "files", "md", "openai", "optuna", "plot", "somajo", "somajo_transformers", "text", "transformers", "MLTB2 Documentation"], "terms": {"data": [0, 3, 6, 11, 12], "fasttext": [0, 12], "file": [0, 2, 5, 12], "md": [0, 12], "openai": [0, 12], "optuna": [0, 12], "plot": [0, 12], "somajo": [0, 9, 12], "somajo_transform": [0, 12], "text": [0, 2, 4, 5, 8, 9, 11, 12], "transform": [0, 9, 12], "load": 1, "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "pip": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "instal": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "necessari": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "depend": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 
"_load_colon_data": 1, "datafram": 1, "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "colon": 1, "label": [1, 7, 11], "The": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "pars": 1, "from": [1, 2, 3, 5, 6, 8, 10], "internet": 1, "also": [1, 5], "see": [1, 6], "http": [1, 8], "genom": 1, "pub": 1, "princeton": 1, "edu": 1, "oncologi": 1, "affydata": 1, "index": [1, 12], "html": 1, "return": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "panda": 1, "type": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "_load_colon_label": 1, "seri": 1, "load_colon": 1, "tupl": [1, 8], "contain": [1, 2, 8, 10, 11], "load_leukemia_big": 1, "leukemia": 1, "big": 1, "web": 1, "stanford": 1, "hasti": 1, "casi_fil": 1, "load_prost": 1, "prostat": 1, "specif": [2, 4, 5, 6, 8, 9, 10, 11], "base": [2, 3, 4, 5, 6, 7, 8, 9, 11], "class": [2, 4, 5, 6, 8, 9, 11], "fasttextlanguageidentif": 2, "object": [2, 4, 5, 6, 8, 9, 11], "identifi": 2, "languag": [2, 8], "__call__": [2, 4, 5, 8, 9, 11], "str": [2, 3, 4, 5, 7, 8, 9, 10, 11], "num_lang": 2, "int": [2, 4, 5, 6, 7, 9, 11], "10": [2, 6], "given": [2, 6], "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "which": [2, 3, 5, 6, 8, 10, 11], "recogn": 2, "number": [2, 4, 5, 6, 7, 8, 9, 11], "A": [2, 6, 12], "dict": [2, 5], "probabl": 2, "more": [2, 6, 7], "than": [2, 9], "element": 2, "so": 2, "guarante": 2, "you": [2, 5, 6, 12], "want": 2, "includ": 2, "case": [2, 5], "when": [2, 5], "veri": 2, "low": 2, "possibl": 2, "ar": [2, 4, 5, 6, 10, 11], "af": 2, "al": 2, "am": 2, "an": [2, 5, 6, 8, 9, 11], "arz": 2, "ast": 2, "av": 2, "az": 2, "azb": 2, "ba": 2, "bar": 2, "bcl": 2, "bg": 2, "bh": 2, "bn": 2, "bo": 2, "bpy": 2, "br": 2, "b": 2, "bxr": 2, "ca": 2, "cbk": 2, "ce": 2, "ceb": 2, "ckb": 2, "co": [2, 11], "c": 2, "cv": [2, 6], "cy": 2, "da": [2, 8], "de": [2, 8], "diq": 2, "dsb": 2, "dty": 2, "dv": 2, "el": 2, "eml": 2, "en": 2, "eo": 2, "e": 2, "et": 2, "eu": 2, "fa": 2, "fi": 2, "fr": 2, "frr": 2, "fy": 2, "ga": 2, "gd": 2, "gl": 2, "gn": 2, 
"gom": 2, "gu": 2, "gv": 2, "he": 2, "hi": 2, "hif": 2, "hr": 2, "hsb": 2, "ht": 2, "hu": 2, "hy": 2, "ia": 2, "id": [2, 11], "ie": 2, "ilo": 2, "io": 2, "ja": 2, "jbo": 2, "jv": 2, "ka": 2, "kk": 2, "km": 2, "kn": 2, "ko": 2, "krc": 2, "ku": 2, "kv": 2, "kw": 2, "ky": 2, "la": 2, "lb": 2, "lez": 2, "li": 2, "lmo": 2, "lo": 2, "lrc": 2, "lt": 2, "lv": 2, "mai": 2, "mg": 2, "mhr": 2, "min": 2, "mk": 2, "ml": 2, "mn": 2, "mr": 2, "mrj": 2, "m": 2, "mt": 2, "mwl": 2, "my": 2, "myv": 2, "mzn": 2, "nah": 2, "nap": 2, "nd": 2, "ne": 2, "new": 2, "nl": 2, "nn": 2, "oc": 2, "o": 2, "pa": 2, "pam": 2, "pfl": 2, "pl": 2, "pm": 2, "pnb": 2, "p": 2, "pt": 2, "qu": 2, "rm": 2, "ro": 2, "ru": 2, "rue": 2, "sa": 2, "sah": 2, "sc": 2, "scn": 2, "sco": 2, "sd": 2, "sh": 2, "si": 2, "sk": 2, "sl": 2, "sq": 2, "sr": 2, "su": 2, "sv": 2, "sw": 2, "ta": 2, "te": 2, "tg": 2, "th": 2, "tk": 2, "tl": 2, "tr": 2, "tt": 2, "tyv": 2, "ug": 2, "uk": 2, "ur": 2, "uz": 2, "vec": 2, "vep": 2, "vi": 2, "vl": 2, "vo": 2, "wa": [2, 5, 11], "war": 2, "wuu": 2, "xal": 2, "xmf": 2, "yi": 2, "yo": 2, "yue": 2, "zh": 2, "static": 2, "get_model_path_and_download": 2, "get": [2, 5, 8], "model": [2, 5, 6, 11], "path": [2, 3, 11], "download": 2, "need": 2, "full": [2, 3, 5], "util": [3, 11], "fetch_remote_fil": 3, "dirnam": 3, "filenam": [3, 7], "url": [3, 8], "sha256_checksum": 3, "fetch": 3, "remot": 3, "directori": [3, 11], "where": [3, 8], "save": [3, 7], "under": 3, "sha256": 3, "checksum": 3, "creat": [3, 5, 7], "rais": [3, 9], "ioerror": 3, "wrong": 3, "get_and_create_mltb2_data_dir": 3, "mltb2_base_data_dir": 3, "none": [3, 5, 7, 8, 11], "mltb": 3, "dir": 3, "If": [3, 5, 7, 8, 9, 11], "default": [3, 7], "user": [3, 6], "markdown": 4, "mdtextsplitt": 4, "max_token": [4, 9], "transformers_token_count": [4, 9], "transformerstokencount": [4, 9, 11], "show_progress_bar": [4, 5, 8, 9, 11], "bool": [4, 5, 6, 7, 8, 9, 10, 11], "fals": [4, 5, 7, 8, 9, 10, 11], "split": [4, 6, 8, 9, 11], "section": [4, 9], 
"specifi": [4, 5, 9], "maximum": [4, 9], "token": [4, 5, 8, 9, 11], "doe": [4, 6, 7, 9], "divid": [4, 9], "head": 4, "correspond": 4, "paragraph": 4, "per": [4, 6, 9], "can": [4, 5, 7, 12], "onli": [4, 5, 6], "exceed": 4, "singl": [4, 7, 10], "chunk": 4, "alreadi": 4, "larger": [4, 6], "counter": [4, 9], "show": [4, 5, 8, 9, 11], "progressbar": [4, 5, 8, 9, 11], "dure": [4, 5, 8, 9, 11], "process": [4, 5, 8, 9, 11], "md_text": 4, "list": [4, 5, 8, 9, 11], "_chunk_md_by_headlin": 4, "headlin": 4, "chunk_md": 4, "merg": 4, "isol": 4, "subsequ": 4, "end": 4, "without": [4, 5], "content": 4, "remov": [4, 10], "openaiazurechatcomplet": 5, "completion_kwarg": 5, "ani": 5, "openaichatcomplet": 5, "azur": 5, "chat": 5, "complet": 5, "construct": 5, "openaibasecomplet": 5, "from_yaml": 5, "kwarg": 5, "function": [5, 6, 7], "follow": 5, "properti": 5, "must": [5, 6], "api_typ": 5, "api_vers": 5, "api_bas": 5, "engin": 5, "quickstart": 5, "start": 5, "gpt": 5, "35": 5, "turbo": 5, "4": [5, 6], "servic": 5, "openaiazurecomplet": 5, "openaicomplet": 5, "non": 5, "gener": [5, 11], "abc": [5, 8], "abstract": [5, 8], "prompt": 5, "map": 5, "openaicompletionansw": 5, "call": [5, 6, 7], "llm": 5, "In": [5, 6], "string": [5, 8], "allow": 5, "overwrit": 5, "exampl": [5, 6, 8], "chang": 5, "temperatur": 5, "_complet": 5, "completion_kwargs_for_this_cal": 5, "openaiobject": 5, "method": [5, 6], "classmethod": 5, "yaml_fil": 5, "yaml": 5, "prompt_token": 5, "completion_token": 5, "total_token": 5, "finish_reason": 5, "answer": 5, "result": [5, 6], "name": 5, "ha": [5, 7], "been": 5, "total": 5, "reason": [5, 6], "why": 5, "stop": 5, "mean": [5, 6], "api": [5, 12], "run": 5, "limit": 5, "length": 5, "becaus": 5, "function_cal": 5, "from_open_ai_object": 5, "open_ai_object": 5, "openaitokencount": 5, "model_nam": 5, "count": [5, 11], "some": [5, 12], "3": [5, 6], "5": 5, "davinci": 5, "003": 5, "embed": 5, "ada": 5, "002": 5, "iter": [5, 8, 11], "just": [5, 11], 
"_check_mandatory_azure_completion_kwarg": 5, "check": [5, 6, 10], "mandatori": 5, "significancerepeatedtrainingprun": 6, "alpha": 6, "float": [6, 8], "0": [6, 7], "1": [6, 11], "n_warmup_step": 6, "baseprun": 6, "pruner": 6, "statist": 6, "signific": 6, "heurist": 6, "decis": 6, "make": [6, 7], "It": [6, 8, 12], "prune": 6, "repeat": 6, "train": [6, 11], "like": 6, "cross": [6, 11], "valid": [6, 11], "As": 6, "test": [6, 11], "t": 6, "our": 6, "experi": 6, "have": 6, "shown": 6, "aplha": 6, "valu": [6, 7], "between": 6, "": [6, 7], "standard": 6, "assum": 6, "adjust": 6, "onc": 6, "hyperparamet": 6, "set": [6, 8, 11], "those": 6, "work": 6, "basi": 6, "intermedi": 6, "For": [6, 7], "epoch": 6, "contrast": 6, "precis": 6, "individu": 6, "fold": [6, 11], "below": 6, "minimalist": 6, "import": [6, 8], "log": 6, "numpi": 6, "np": 6, "sklearn": 6, "dataset": [6, 11], "load_iri": 6, "model_select": 6, "stratifiedkfold": 6, "ensembl": 6, "randomforestclassifi": 6, "metric": 6, "accuracy_scor": 6, "configur": 6, "logger": 6, "debug": 6, "output": [6, 8], "getlogg": 6, "addhandl": 6, "streamhandl": 6, "setlevel": 6, "x": [6, 7], "y": [6, 7], "target": 6, "def": 6, "trial": 6, "min_samples_split": 6, "suggest_int": 6, "2": 6, "20": 6, "n_estim": 6, "100": 6, "validation_result_list": 6, "skf": 6, "n_split": [6, 11], "fold_index": 6, "train_index": 6, "val_index": 6, "enumer": 6, "x_train": 6, "x_val": 6, "y_train": 6, "y_val": 6, "rf": 6, "fit": 6, "y_pred": 6, "predict": 6, "acc": 6, "append": 6, "report": 6, "we": 6, "should": [6, 8], "should_prun": 6, "here": 6, "done": 6, "break": 6, "studi": 6, "create_studi": 6, "storag": 6, "sqlite": 6, "db": 6, "memori": 6, "study_nam": 6, "iris_cv": 6, "direct": 6, "maxim": 6, "load_if_exist": 6, "true": [6, 7, 9, 10], "sampler": 6, "tpesampl": 6, "multivari": 6, "add": 6, "optim": 6, "n_trial": 6, "level": 6, "aggress": 6, "smaller": 6, "stronger": 6, "differ": [6, 7], "two": [6, 7, 8], "distribut": 6, "disabl": 6, "until": 6, 
"reach": 6, "exce": 6, "step": [6, 7], "frozentri": 6, "judg": 6, "whether": 6, "note": 6, "suppos": 6, "librari": 6, "instead": 6, "provid": 6, "interfac": 6, "implement": 6, "mechan": 6, "take": 6, "copi": 6, "befor": 6, "modifi": 6, "boolean": 6, "repres": 6, "tool": [7, 8, 12], "matplotlib": 7, "boxplot": 7, "titl": 7, "xlabel": 7, "ylabel": 7, "vert": 7, "print": [7, 8], "one": [7, 8], "diagram": 7, "pyplot": 7, "axi": 7, "box": [7, 12], "vertic": 7, "horizont": 7, "boxplot_dict": 7, "values_dict": 7, "form": [7, 10], "dictionari": 7, "save_last_figur": 7, "last": 7, "jupyt": 7, "notebook": 7, "same": 7, "cell": 7, "twin_axes_timeseries_plot": 7, "values_1": 7, "label_1": 7, "values_2": 7, "label_2": 7, "start_timestep_numb": 7, "shift_1": 7, "shift_2": 7, "label_x": 7, "color_1": 7, "tab": 7, "red": 7, "color_2": 7, "blue": 7, "twin": 7, "ax": 7, "timeseri": 7, "curv": 7, "array_lik": 7, "first": 7, "second": 7, "point": 7, "time": 7, "timestep": 7, "shift": 7, "posit": 7, "neg": 7, "color": 7, "jaccardsimilar": 8, "somajobaseclass": 8, "calcul": 8, "jaccard": 8, "similar": 8, "de_cmc": 8, "german": 8, "en_ptb": 8, "english": 8, "text1": 8, "text2": 8, "get_token_set": 8, "word": [8, 9], "directli": 8, "somajosentencesplitt": [8, 9], "sentenc": [8, 9], "tokenextractor": 8, "extract": 8, "extract_url_set": 8, "token_extractor": 8, "url_set": 8, "ist": 8, "ein": 8, "link": 8, "github": [8, 12], "com": 8, "urlswapp": 8, "url_pattern": 8, "swap": 8, "revers": 8, "replac": [8, 10], "extractor": 8, "pattern": 8, "One": 8, "mark": 8, "place": 8, "put": 8, "reverse_swap_url": 8, "revert": 8, "were": 8, "unknown": 8, "swap_url": 8, "detoken": 8, "convert": 8, "how": 8, "do": [8, 11], "extract_token_class_set": 8, "keep_token_class": 8, "keep": 8, "all": [8, 12], "kept": 8, "hug": [9, 11], "face": [9, 11], "textsplitt": 9, "somajo_sentence_splitt": 9, "ignore_overly_long_sent": 9, "alwai": 9, "whole": 9, "splitter": 9, "valueerror": 9, "except": 9, "longer": 9, 
"simpli": 9, "ignor": 9, "clean_all_invisible_chars_and_whitespac": 10, "clean": 10, "invis": 10, "charact": 10, "whitespac": 10, "special": 10, "normal": 10, "multipl": 10, "lead": 10, "trail": 10, "defin": 10, "constant": 10, "invisible_charact": 10, "special_whitespac": 10, "rteturn": 10, "has_invisible_charact": 10, "otherwis": 10, "has_special_whitespac": 10, "remove_invisible_charact": 10, "replace_multiple_whitespac": 10, "replace_special_whitespac": 10, "kfoldlabeleddataset": 11, "7": 11, "n_repeat": 11, "random_st": 11, "k": 11, "labeleddataset": 11, "labeled_dataset": 11, "stratification_label": 11, "encod": 11, "labe": 11, "pretrained_model_name_or_path": 11, "pathlik": 11, "host": 11, "insid": 11, "repo": 11, "huggingfac": 11, "machin": 12, "learn": 12, "avail": 12, "python": 12, "packag": 12, "pypi": 12, "option": 12, "might": 12, "them": 12, "refer": 12, "repositori": 12, "licens": 12, "imprint": 12}, "objects": {"mltb2": [[1, 0, 0, "-", "data"], [2, 0, 0, "-", "fasttext"], [3, 0, 0, "-", "files"], [4, 0, 0, "-", "md"], [5, 0, 0, "-", "openai"], [6, 0, 0, "-", "optuna"], [7, 0, 0, "-", "plot"], [8, 0, 0, "-", "somajo"], [9, 0, 0, "-", "somajo_transformers"], [10, 0, 0, "-", "text"], [11, 0, 0, "-", "transformers"]], "mltb2.data": [[1, 1, 1, "", "_load_colon_data"], [1, 1, 1, "", "_load_colon_label"], [1, 1, 1, "", "load_colon"], [1, 1, 1, "", "load_leukemia_big"], [1, 1, 1, "", "load_prostate"]], "mltb2.fasttext": [[2, 2, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[2, 3, 1, "", "__call__"], [2, 3, 1, "", "get_model_path_and_download"]], "mltb2.files": [[3, 1, 1, "", "fetch_remote_file"], [3, 1, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[4, 2, 1, "", "MdTextSplitter"], [4, 1, 1, "", "_chunk_md_by_headline"], [4, 1, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[4, 3, 1, "", "__call__"]], "mltb2.openai": [[5, 2, 1, "", "OpenAiAzureChatCompletion"], [5, 2, 1, "", "OpenAiAzureCompletion"], 
[5, 2, 1, "", "OpenAiBaseCompletion"], [5, 2, 1, "", "OpenAiChatCompletion"], [5, 2, 1, "", "OpenAiCompletion"], [5, 2, 1, "", "OpenAiCompletionAnswer"], [5, 2, 1, "", "OpenAiTokenCounter"], [5, 1, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[5, 3, 1, "", "__call__"], [5, 3, 1, "", "_completion"], [5, 3, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[5, 3, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[5, 3, 1, "", "__call__"]], "mltb2.optuna": [[6, 2, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[6, 3, 1, "", "prune"]], "mltb2.plot": [[7, 1, 1, "", "boxplot"], [7, 1, 1, "", "boxplot_dict"], [7, 1, 1, "", "save_last_figure"], [7, 1, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[8, 2, 1, "", "JaccardSimilarity"], [8, 2, 1, "", "SoMaJoBaseClass"], [8, 2, 1, "", "SoMaJoSentenceSplitter"], [8, 2, 1, "", "TokenExtractor"], [8, 2, 1, "", "UrlSwapper"], [8, 1, 1, "", "detokenize"], [8, 1, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[8, 3, 1, "", "__call__"], [8, 3, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[8, 3, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[8, 3, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[8, 3, 1, "", "reverse_swap_urls"], [8, 3, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[9, 2, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[9, 3, 1, "", "__call__"]], "mltb2.text": [[10, 1, 1, "", "clean_all_invisible_chars_and_whitespaces"], [10, 1, 1, "", "has_invisible_characters"], [10, 1, 1, "", "has_special_whitespaces"], [10, 1, 1, "", "remove_invisible_characters"], [10, 1, 1, "", "replace_multiple_whitespaces"], [10, 1, 1, "", "replace_special_whitespaces"]], "mltb2.transformers": 
[[11, 2, 1, "", "KFoldLabeledDataset"], [11, 2, 1, "", "LabeledDataset"], [11, 2, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[11, 3, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[11, 3, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:function", "2": "py:class", "3": "py:method"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "function", "Python function"], "2": ["py", "class", "Python class"], "3": ["py", "method", "Python method"]}, "titleterms": {"api": 0, "refer": 0, "data": 1, "fasttext": 2, "file": 3, "md": 4, "openai": 5, "optuna": 6, "plot": 7, "somajo": 8, "somajo_transform": 9, "text": 10, "transform": 11, "mltb2": 12, "document": 12, "instal": 12, "content": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "data": [[1, "module-mltb2.data"]], "fasttext": [[2, "module-mltb2.fasttext"]], "files": [[3, "module-mltb2.files"]], "md": [[4, "module-mltb2.md"]], "openai": [[5, "module-mltb2.openai"]], "optuna": [[6, "module-mltb2.optuna"]], "plot": [[7, "module-mltb2.plot"]], "somajo": [[8, "module-mltb2.somajo"]], "somajo_transformers": [[9, "module-mltb2.somajo_transformers"]], "text": [[10, "module-mltb2.text"]], "transformers": [[11, "module-mltb2.transformers"]], "MLTB2 Documentation": [[12, "mltb2-documentation"]], "Installation": [[12, "installation"]], "Content": [[12, "content"]]}, "indexentries": {"_load_colon_data() (in module mltb2.data)": [[1, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[1, "mltb2.data._load_colon_label"]], "load_colon() (in 
module mltb2.data)": [[1, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[1, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[1, "mltb2.data.load_prostate"]], "mltb2.data": [[1, "module-mltb2.data"]], "module": [[1, "module-mltb2.data"], [2, "module-mltb2.fasttext"], [3, "module-mltb2.files"], [4, "module-mltb2.md"], [5, "module-mltb2.openai"], [6, "module-mltb2.optuna"], [7, "module-mltb2.plot"], [8, "module-mltb2.somajo"], [9, "module-mltb2.somajo_transformers"], [10, "module-mltb2.text"], [11, "module-mltb2.transformers"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[2, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[2, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[3, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[3, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[3, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[4, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[4, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[4, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[4, "mltb2.md.chunk_md"]], "mltb2.md": [[4, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": 
[[5, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[5, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[5, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[5, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[5, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[5, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[5, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[5, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[6, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[7, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[7, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[7, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class 
in mltb2.somajo)": [[8, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[8, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[8, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[8, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[8, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[8, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[8, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[9, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[9, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[9, "module-mltb2.somajo_transformers"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "has_invisible_characters() (in module mltb2.text)": [[10, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[10, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[10, 
"mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[11, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[11, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[11, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[11, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[11, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[11, "mltb2.transformers.KFoldLabeledDataset.split"]]}}) \ No newline at end of file