diff --git a/_modules/mltb2/text.html b/_modules/mltb2/text.html index 6cea366..f5bf5fc 100644 --- a/_modules/mltb2/text.html +++ b/_modules/mltb2/text.html @@ -84,7 +84,8 @@
"""Text specific module."""
-from typing import Dict, Final, Tuple
+import re
+from typing import Dict, Final, Pattern, Tuple
INVISIBLE_CHARACTERS: Final[Tuple[str, ...]] = (
"\u200b", # Zero Width Space (ZWSP) https://www.compart.com/en/unicode/U+200b
@@ -116,11 +117,15 @@ Source code for mltb2.text
SPECIAL_WHITESPACES_TRANS: Final[Dict[int, str]] = str.maketrans({char: " " for char in SPECIAL_WHITESPACES})
+INVISIBLE_CHARACTERS_AND_SPECIAL_WHITESPACES_TRANS = {**SPECIAL_WHITESPACES_TRANS, **INVISIBLE_CHARACTERS_TRANS}
+
+MULTI_SPACE_PATTERN: Pattern = re.compile(r" {2,}")
+
[docs]def remove_invisible_characters(text: str) -> str:
"""Remove invisible characters from text.
- The invisible characters are defined in the constant `INVISIBLE_CHARACTERS`.
+ The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``.
Args:
text: The text from which the invisible characters are to be removed.
@@ -134,7 +139,7 @@ Source code for mltb2.text
[docs]def has_invisible_characters(text: str) -> bool:
"""Check if text contains invisible characters.
- The invisible characters are defined in the constant `INVISIBLE_CHARACTERS`.
+ The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``.
Args:
text: The text to check.
@@ -148,7 +153,7 @@ Source code for mltb2.text
[docs]def replace_special_whitespaces(text: str) -> str:
"""Replace special whitespaces with normal whitespaces.
- The special whitespaces are defined in the constant `SPECIAL_WHITESPACES`.
+ The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``.
Args:
text: The text from which the special whitespaces are to be replaced.
@@ -162,7 +167,7 @@ Source code for mltb2.text
[docs]def has_special_whitespaces(text: str) -> bool:
"""Check if text contains special whitespaces.
- The special whitespaces are defined in the constant `SPECIAL_WHITESPACES`.
+ The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``.
Args:
text: The text to check.
@@ -171,6 +176,41 @@ Source code for mltb2.text
``True`` if the text contains special whitespaces, ``False`` otherwise.
"""
return any(char in text for char in SPECIAL_WHITESPACES)
+
+
+[docs]def replace_multiple_whitespaces(text: str) -> str:
+ """Replace multiple whitespaces with single whitespace.
+
+ Args:
+ text: The text from which the multiple whitespaces are to be replaced.
+
+ Returns:
+ The cleaned text.
+ """
+ return MULTI_SPACE_PATTERN.sub(" ", text)
+
+
+[docs]def clean_all_invisible_chars_and_whitespaces(text: str) -> str:
+ """Clean text from invisible characters and whitespaces.
+
+ - Remove invisible characters from text.
+ - Replace special whitespaces with normal whitespaces.
+ - Replace multiple whitespaces with single whitespace.
+ - Remove leading and trailing whitespaces.
+
+ The invisible characters are defined in the constant ``INVISIBLE_CHARACTERS``.
+ The special whitespaces are defined in the constant ``SPECIAL_WHITESPACES``.
+
+ Args:
+ text: The text to clean.
+
+ Returns:
+ The cleaned text.
+ """
+ text = text.translate(INVISIBLE_CHARACTERS_AND_SPECIAL_WHITESPACES_TRANS)
+ text = replace_multiple_whitespaces(text)
+ text = text.strip()
+ return text
diff --git a/api-reference/text.html b/api-reference/text.html
index ac2d756..ce28338 100644
--- a/api-reference/text.html
+++ b/api-reference/text.html
@@ -60,9 +60,11 @@
somajo
somajo_transformers
text
@@ -103,11 +105,37 @@
text
Text specific module.
+
+-
+mltb2.text.clean_all_invisible_chars_and_whitespaces(text: str) str [source]
+Clean text from invisible characters and whitespaces.
+
+Remove invisible characters from text.
+Replace special whitespaces with normal whitespaces.
+Replace multiple whitespaces with single whitespace.
+Remove leading and trailing whitespaces.
+
+The invisible characters are defined in the constant INVISIBLE_CHARACTERS
.
+The special whitespaces are defined in the constant SPECIAL_WHITESPACES
.
+
+
+- Returns:
The cleaned text.
+
+
+
+
-
mltb2.text.has_invisible_characters(text: str) bool [source]
Check if text contains invisible characters.
-The invisible characters are defined in the constant INVISIBLE_CHARACTERS.
+The invisible characters are defined in the constant INVISIBLE_CHARACTERS
.
- Parameters:
text (str) – The text to check.
@@ -125,7 +153,7 @@
-
mltb2.text.has_special_whitespaces(text: str) bool [source]
Check if text contains special whitespaces.
-The special whitespaces are defined in the constant SPECIAL_WHITESPACES.
+The special whitespaces are defined in the constant SPECIAL_WHITESPACES
.
- Parameters:
text (str) – The text to check.
@@ -143,7 +171,7 @@
-
mltb2.text.remove_invisible_characters(text: str) str [source]
Remove invisible characters from text.
-The invisible characters are defined in the constant INVISIBLE_CHARACTERS.
+The invisible characters are defined in the constant INVISIBLE_CHARACTERS
.
- Parameters:
text (str) – The text from which the invisible characters are to be removed.
@@ -157,11 +185,28 @@
+
+-
+mltb2.text.replace_multiple_whitespaces(text: str) str [source]
+Replace multiple whitespaces with single whitespace.
+
+
+
-
mltb2.text.replace_special_whitespaces(text: str) str [source]
Replace special whitespaces with normal whitespaces.
-The special whitespaces are defined in the constant SPECIAL_WHITESPACES.
+The special whitespaces are defined in the constant SPECIAL_WHITESPACES
.
- Parameters:
text (str) – The text from which the special whitespaces are to be replaced.
diff --git a/genindex.html b/genindex.html
index beab39b..63651c1 100644
--- a/genindex.html
+++ b/genindex.html
@@ -159,6 +159,10 @@ C
+
@@ -405,6 +409,8 @@ R
diff --git a/objects.inv b/objects.inv
index d637bf1..0b7cd49 100644
Binary files a/objects.inv and b/objects.inv differ
diff --git a/searchindex.js b/searchindex.js
index bba8727..faff599 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["api-reference", "api-reference/data", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/data.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "data
", "fasttext
", "files
", "md
", "openai
", "optuna
", "plot
", "somajo
", "somajo_transformers
", "text
", "transformers
", "MLTB2 Documentation"], "terms": {"data": [0, 3, 6, 11, 12], "fasttext": [0, 12], "file": [0, 2, 5, 12], "md": [0, 12], "openai": [0, 12], "optuna": [0, 12], "plot": [0, 12], "somajo": [0, 9, 12], "somajo_transform": [0, 12], "text": [0, 2, 4, 5, 8, 9, 11, 12], "transform": [0, 9, 12], "load": 1, "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "pip": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "instal": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "necessari": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "depend": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "_load_colon_data": 1, "datafram": 1, "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "colon": 1, "label": [1, 7, 11], "The": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "pars": 1, "from": [1, 2, 3, 5, 6, 8, 10], "internet": 1, "also": [1, 5], "see": [1, 6], "http": [1, 8], "genom": 1, "pub": 1, "princeton": 1, "edu": 1, "oncologi": 1, "affydata": 1, "index": [1, 12], "html": 1, "return": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "panda": 1, "type": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "_load_colon_label": 1, "seri": 1, "load_colon": 1, "tupl": [1, 8], "contain": [1, 2, 8, 10, 11], "load_leukemia_big": 1, "leukemia": 1, "big": 1, "web": 1, "stanford": 1, "hasti": 1, "casi_fil": 1, "load_prost": 1, "prostat": 1, "specif": [2, 4, 5, 6, 8, 9, 10, 11], "base": [2, 3, 4, 5, 6, 7, 8, 9, 11], "class": [2, 4, 5, 6, 8, 9, 11], "fasttextlanguageidentif": 2, "object": [2, 4, 5, 6, 8, 9, 11], "identifi": 2, "languag": [2, 8], "__call__": [2, 4, 5, 8, 9, 11], "str": [2, 3, 4, 5, 7, 8, 9, 10, 11], "num_lang": 2, "int": [2, 4, 5, 6, 7, 9, 11], "10": [2, 6], "given": [2, 6], "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "which": [2, 3, 5, 6, 8, 10, 11], "recogn": 2, "number": [2, 4, 5, 6, 7, 8, 9, 11], "A": [2, 6, 12], "dict": [2, 5], "probabl": 2, "more": [2, 6, 7], "than": [2, 9], "element": 2, "so": 2, "guarante": 2, 
"you": [2, 5, 6, 12], "want": 2, "includ": 2, "case": [2, 5], "when": [2, 5], "veri": 2, "low": 2, "possibl": 2, "ar": [2, 4, 5, 6, 10, 11], "af": 2, "al": 2, "am": 2, "an": [2, 5, 6, 8, 9, 11], "arz": 2, "ast": 2, "av": 2, "az": 2, "azb": 2, "ba": 2, "bar": 2, "bcl": 2, "bg": 2, "bh": 2, "bn": 2, "bo": 2, "bpy": 2, "br": 2, "b": 2, "bxr": 2, "ca": 2, "cbk": 2, "ce": 2, "ceb": 2, "ckb": 2, "co": [2, 11], "c": 2, "cv": [2, 6], "cy": 2, "da": [2, 8], "de": [2, 8], "diq": 2, "dsb": 2, "dty": 2, "dv": 2, "el": 2, "eml": 2, "en": 2, "eo": 2, "e": 2, "et": 2, "eu": 2, "fa": 2, "fi": 2, "fr": 2, "frr": 2, "fy": 2, "ga": 2, "gd": 2, "gl": 2, "gn": 2, "gom": 2, "gu": 2, "gv": 2, "he": 2, "hi": 2, "hif": 2, "hr": 2, "hsb": 2, "ht": 2, "hu": 2, "hy": 2, "ia": 2, "id": [2, 11], "ie": 2, "ilo": 2, "io": 2, "ja": 2, "jbo": 2, "jv": 2, "ka": 2, "kk": 2, "km": 2, "kn": 2, "ko": 2, "krc": 2, "ku": 2, "kv": 2, "kw": 2, "ky": 2, "la": 2, "lb": 2, "lez": 2, "li": 2, "lmo": 2, "lo": 2, "lrc": 2, "lt": 2, "lv": 2, "mai": 2, "mg": 2, "mhr": 2, "min": 2, "mk": 2, "ml": 2, "mn": 2, "mr": 2, "mrj": 2, "m": 2, "mt": 2, "mwl": 2, "my": 2, "myv": 2, "mzn": 2, "nah": 2, "nap": 2, "nd": 2, "ne": 2, "new": 2, "nl": 2, "nn": 2, "oc": 2, "o": 2, "pa": 2, "pam": 2, "pfl": 2, "pl": 2, "pm": 2, "pnb": 2, "p": 2, "pt": 2, "qu": 2, "rm": 2, "ro": 2, "ru": 2, "rue": 2, "sa": 2, "sah": 2, "sc": 2, "scn": 2, "sco": 2, "sd": 2, "sh": 2, "si": 2, "sk": 2, "sl": 2, "sq": 2, "sr": 2, "su": 2, "sv": 2, "sw": 2, "ta": 2, "te": 2, "tg": 2, "th": 2, "tk": 2, "tl": 2, "tr": 2, "tt": 2, "tyv": 2, "ug": 2, "uk": 2, "ur": 2, "uz": 2, "vec": 2, "vep": 2, "vi": 2, "vl": 2, "vo": 2, "wa": [2, 5, 11], "war": 2, "wuu": 2, "xal": 2, "xmf": 2, "yi": 2, "yo": 2, "yue": 2, "zh": 2, "static": 2, "get_model_path_and_download": 2, "get": [2, 5, 8], "model": [2, 5, 6, 11], "path": [2, 3, 11], "download": 2, "need": 2, "full": [2, 3, 5], "util": [3, 11], "fetch_remote_fil": 3, "dirnam": 3, "filenam": [3, 7], "url": [3, 8], 
"sha256_checksum": 3, "fetch": 3, "remot": 3, "directori": [3, 11], "where": [3, 8], "save": [3, 7], "under": 3, "sha256": 3, "checksum": 3, "creat": [3, 5, 7], "rais": [3, 9], "ioerror": 3, "wrong": 3, "get_and_create_mltb2_data_dir": 3, "mltb2_base_data_dir": 3, "none": [3, 5, 7, 8, 11], "mltb": 3, "dir": 3, "If": [3, 5, 7, 8, 9, 11], "default": [3, 7], "user": [3, 6], "markdown": 4, "mdtextsplitt": 4, "max_token": [4, 9], "transformers_token_count": [4, 9], "transformerstokencount": [4, 9, 11], "show_progress_bar": [4, 5, 8, 9, 11], "bool": [4, 5, 6, 7, 8, 9, 10, 11], "fals": [4, 5, 7, 8, 9, 10, 11], "split": [4, 6, 8, 9, 11], "section": [4, 9], "specifi": [4, 5, 9], "maximum": [4, 9], "token": [4, 5, 8, 9, 11], "doe": [4, 6, 7, 9], "divid": [4, 9], "head": 4, "correspond": 4, "paragraph": 4, "per": [4, 6, 9], "can": [4, 5, 7, 12], "onli": [4, 5, 6], "exceed": 4, "singl": [4, 7], "chunk": 4, "alreadi": 4, "larger": [4, 6], "counter": [4, 9], "show": [4, 5, 8, 9, 11], "progressbar": [4, 5, 8, 9, 11], "dure": [4, 5, 8, 9, 11], "process": [4, 5, 8, 9, 11], "md_text": 4, "list": [4, 5, 8, 9, 11], "_chunk_md_by_headlin": 4, "headlin": 4, "chunk_md": 4, "merg": 4, "isol": 4, "subsequ": 4, "end": 4, "without": [4, 5], "content": 4, "remov": [4, 10], "openaiazurechatcomplet": 5, "completion_kwarg": 5, "ani": 5, "openaichatcomplet": 5, "azur": 5, "chat": 5, "complet": 5, "construct": 5, "openaibasecomplet": 5, "from_yaml": 5, "kwarg": 5, "function": [5, 6, 7], "follow": 5, "properti": 5, "must": [5, 6], "api_typ": 5, "api_vers": 5, "api_bas": 5, "engin": 5, "quickstart": 5, "start": 5, "gpt": 5, "35": 5, "turbo": 5, "4": [5, 6], "servic": 5, "openaiazurecomplet": 5, "openaicomplet": 5, "non": 5, "gener": [5, 11], "abc": [5, 8], "abstract": [5, 8], "prompt": 5, "map": 5, "openaicompletionansw": 5, "call": [5, 6, 7], "llm": 5, "In": [5, 6], "string": [5, 8], "allow": 5, "overwrit": 5, "exampl": [5, 6, 8], "chang": 5, "temperatur": 5, "_complet": 5, 
"completion_kwargs_for_this_cal": 5, "openaiobject": 5, "method": [5, 6], "classmethod": 5, "yaml_fil": 5, "yaml": 5, "prompt_token": 5, "completion_token": 5, "total_token": 5, "finish_reason": 5, "answer": 5, "result": [5, 6], "name": 5, "ha": [5, 7], "been": 5, "total": 5, "reason": [5, 6], "why": 5, "stop": 5, "mean": [5, 6], "api": [5, 12], "run": 5, "limit": 5, "length": 5, "becaus": 5, "function_cal": 5, "from_open_ai_object": 5, "open_ai_object": 5, "openaitokencount": 5, "model_nam": 5, "count": [5, 11], "some": [5, 12], "3": [5, 6], "5": 5, "davinci": 5, "003": 5, "embed": 5, "ada": 5, "002": 5, "iter": [5, 8, 11], "just": [5, 11], "_check_mandatory_azure_completion_kwarg": 5, "check": [5, 6, 10], "mandatori": 5, "significancerepeatedtrainingprun": 6, "alpha": 6, "float": [6, 8], "0": [6, 7], "1": [6, 11], "n_warmup_step": 6, "baseprun": 6, "pruner": 6, "statist": 6, "signific": 6, "heurist": 6, "decis": 6, "make": [6, 7], "It": [6, 8, 12], "prune": 6, "repeat": 6, "train": [6, 11], "like": 6, "cross": [6, 11], "valid": [6, 11], "As": 6, "test": [6, 11], "t": 6, "our": 6, "experi": 6, "have": 6, "shown": 6, "aplha": 6, "valu": [6, 7], "between": 6, "": [6, 7], "standard": 6, "assum": 6, "adjust": 6, "onc": 6, "hyperparamet": 6, "set": [6, 8, 11], "those": 6, "work": 6, "basi": 6, "intermedi": 6, "For": [6, 7], "epoch": 6, "contrast": 6, "precis": 6, "individu": 6, "fold": [6, 11], "below": 6, "minimalist": 6, "import": [6, 8], "log": 6, "numpi": 6, "np": 6, "sklearn": 6, "dataset": [6, 11], "load_iri": 6, "model_select": 6, "stratifiedkfold": 6, "ensembl": 6, "randomforestclassifi": 6, "metric": 6, "accuracy_scor": 6, "configur": 6, "logger": 6, "debug": 6, "output": [6, 8], "getlogg": 6, "addhandl": 6, "streamhandl": 6, "setlevel": 6, "x": [6, 7], "y": [6, 7], "target": 6, "def": 6, "trial": 6, "min_samples_split": 6, "suggest_int": 6, "2": 6, "20": 6, "n_estim": 6, "100": 6, "validation_result_list": 6, "skf": 6, "n_split": [6, 11], "fold_index": 6, 
"train_index": 6, "val_index": 6, "enumer": 6, "x_train": 6, "x_val": 6, "y_train": 6, "y_val": 6, "rf": 6, "fit": 6, "y_pred": 6, "predict": 6, "acc": 6, "append": 6, "report": 6, "we": 6, "should": [6, 8], "should_prun": 6, "here": 6, "done": 6, "break": 6, "studi": 6, "create_studi": 6, "storag": 6, "sqlite": 6, "db": 6, "memori": 6, "study_nam": 6, "iris_cv": 6, "direct": 6, "maxim": 6, "load_if_exist": 6, "true": [6, 7, 9, 10], "sampler": 6, "tpesampl": 6, "multivari": 6, "add": 6, "optim": 6, "n_trial": 6, "level": 6, "aggress": 6, "smaller": 6, "stronger": 6, "differ": [6, 7], "two": [6, 7, 8], "distribut": 6, "disabl": 6, "until": 6, "reach": 6, "exce": 6, "step": [6, 7], "frozentri": 6, "judg": 6, "whether": 6, "note": 6, "suppos": 6, "librari": 6, "instead": 6, "provid": 6, "interfac": 6, "implement": 6, "mechan": 6, "take": 6, "copi": 6, "befor": 6, "modifi": 6, "boolean": 6, "repres": 6, "tool": [7, 8, 12], "matplotlib": 7, "boxplot": 7, "titl": 7, "xlabel": 7, "ylabel": 7, "vert": 7, "print": [7, 8], "one": [7, 8], "diagram": 7, "pyplot": 7, "axi": 7, "box": [7, 12], "vertic": 7, "horizont": 7, "boxplot_dict": 7, "values_dict": 7, "form": 7, "dictionari": 7, "save_last_figur": 7, "last": 7, "jupyt": 7, "notebook": 7, "same": 7, "cell": 7, "twin_axes_timeseries_plot": 7, "values_1": 7, "label_1": 7, "values_2": 7, "label_2": 7, "start_timestep_numb": 7, "shift_1": 7, "shift_2": 7, "label_x": 7, "color_1": 7, "tab": 7, "red": 7, "color_2": 7, "blue": 7, "twin": 7, "ax": 7, "timeseri": 7, "curv": 7, "array_lik": 7, "first": 7, "second": 7, "point": 7, "time": 7, "timestep": 7, "shift": 7, "posit": 7, "neg": 7, "color": 7, "jaccardsimilar": 8, "somajobaseclass": 8, "calcul": 8, "jaccard": 8, "similar": 8, "de_cmc": 8, "german": 8, "en_ptb": 8, "english": 8, "text1": 8, "text2": 8, "get_token_set": 8, "word": [8, 9], "directli": 8, "somajosentencesplitt": [8, 9], "sentenc": [8, 9], "tokenextractor": 8, "extract": 8, "extract_url_set": 8, "token_extractor": 
8, "url_set": 8, "ist": 8, "ein": 8, "link": 8, "github": [8, 12], "com": 8, "urlswapp": 8, "url_pattern": 8, "swap": 8, "revers": 8, "replac": [8, 10], "extractor": 8, "pattern": 8, "One": 8, "mark": 8, "place": 8, "put": 8, "reverse_swap_url": 8, "revert": 8, "were": 8, "unknown": 8, "swap_url": 8, "detoken": 8, "convert": 8, "how": 8, "do": [8, 11], "extract_token_class_set": 8, "keep_token_class": 8, "keep": 8, "all": [8, 12], "kept": 8, "hug": [9, 11], "face": [9, 11], "textsplitt": 9, "somajo_sentence_splitt": 9, "ignore_overly_long_sent": 9, "alwai": 9, "whole": 9, "splitter": 9, "valueerror": 9, "except": 9, "longer": 9, "simpli": 9, "ignor": 9, "has_invisible_charact": 10, "invis": 10, "charact": 10, "defin": 10, "constant": 10, "invisible_charact": 10, "otherwis": 10, "has_special_whitespac": 10, "special": 10, "whitespac": 10, "special_whitespac": 10, "remove_invisible_charact": 10, "clean": 10, "replace_special_whitespac": 10, "normal": 10, "kfoldlabeleddataset": 11, "7": 11, "n_repeat": 11, "random_st": 11, "k": 11, "labeleddataset": 11, "labeled_dataset": 11, "stratification_label": 11, "encod": 11, "labe": 11, "pretrained_model_name_or_path": 11, "pathlik": 11, "host": 11, "insid": 11, "repo": 11, "huggingfac": 11, "machin": 12, "learn": 12, "avail": 12, "python": 12, "packag": 12, "pypi": 12, "option": 12, "might": 12, "them": 12, "refer": 12, "repositori": 12, "licens": 12, "imprint": 12}, "objects": {"mltb2": [[1, 0, 0, "-", "data"], [2, 0, 0, "-", "fasttext"], [3, 0, 0, "-", "files"], [4, 0, 0, "-", "md"], [5, 0, 0, "-", "openai"], [6, 0, 0, "-", "optuna"], [7, 0, 0, "-", "plot"], [8, 0, 0, "-", "somajo"], [9, 0, 0, "-", "somajo_transformers"], [10, 0, 0, "-", "text"], [11, 0, 0, "-", "transformers"]], "mltb2.data": [[1, 1, 1, "", "_load_colon_data"], [1, 1, 1, "", "_load_colon_label"], [1, 1, 1, "", "load_colon"], [1, 1, 1, "", "load_leukemia_big"], [1, 1, 1, "", "load_prostate"]], "mltb2.fasttext": [[2, 2, 1, "", 
"FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[2, 3, 1, "", "__call__"], [2, 3, 1, "", "get_model_path_and_download"]], "mltb2.files": [[3, 1, 1, "", "fetch_remote_file"], [3, 1, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[4, 2, 1, "", "MdTextSplitter"], [4, 1, 1, "", "_chunk_md_by_headline"], [4, 1, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[4, 3, 1, "", "__call__"]], "mltb2.openai": [[5, 2, 1, "", "OpenAiAzureChatCompletion"], [5, 2, 1, "", "OpenAiAzureCompletion"], [5, 2, 1, "", "OpenAiBaseCompletion"], [5, 2, 1, "", "OpenAiChatCompletion"], [5, 2, 1, "", "OpenAiCompletion"], [5, 2, 1, "", "OpenAiCompletionAnswer"], [5, 2, 1, "", "OpenAiTokenCounter"], [5, 1, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[5, 3, 1, "", "__call__"], [5, 3, 1, "", "_completion"], [5, 3, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[5, 3, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[5, 3, 1, "", "__call__"]], "mltb2.optuna": [[6, 2, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[6, 3, 1, "", "prune"]], "mltb2.plot": [[7, 1, 1, "", "boxplot"], [7, 1, 1, "", "boxplot_dict"], [7, 1, 1, "", "save_last_figure"], [7, 1, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[8, 2, 1, "", "JaccardSimilarity"], [8, 2, 1, "", "SoMaJoBaseClass"], [8, 2, 1, "", "SoMaJoSentenceSplitter"], [8, 2, 1, "", "TokenExtractor"], [8, 2, 1, "", "UrlSwapper"], [8, 1, 1, "", "detokenize"], [8, 1, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[8, 3, 1, "", "__call__"], [8, 3, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[8, 3, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[8, 3, 1, "", "extract_url_set"]], 
"mltb2.somajo.UrlSwapper": [[8, 3, 1, "", "reverse_swap_urls"], [8, 3, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[9, 2, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[9, 3, 1, "", "__call__"]], "mltb2.text": [[10, 1, 1, "", "has_invisible_characters"], [10, 1, 1, "", "has_special_whitespaces"], [10, 1, 1, "", "remove_invisible_characters"], [10, 1, 1, "", "replace_special_whitespaces"]], "mltb2.transformers": [[11, 2, 1, "", "KFoldLabeledDataset"], [11, 2, 1, "", "LabeledDataset"], [11, 2, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[11, 3, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[11, 3, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:function", "2": "py:class", "3": "py:method"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "function", "Python function"], "2": ["py", "class", "Python class"], "3": ["py", "method", "Python method"]}, "titleterms": {"api": 0, "refer": 0, "data": 1, "fasttext": 2, "file": 3, "md": 4, "openai": 5, "optuna": 6, "plot": 7, "somajo": 8, "somajo_transform": 9, "text": 10, "transform": 11, "mltb2": 12, "document": 12, "instal": 12, "content": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "data": [[1, "module-mltb2.data"]], "fasttext": [[2, "module-mltb2.fasttext"]], "files": [[3, "module-mltb2.files"]], "md": [[4, "module-mltb2.md"]], "openai": [[5, "module-mltb2.openai"]], "optuna": [[6, "module-mltb2.optuna"]], "plot": [[7, "module-mltb2.plot"]], "somajo": [[8, "module-mltb2.somajo"]], "somajo_transformers": [[9, 
"module-mltb2.somajo_transformers"]], "text": [[10, "module-mltb2.text"]], "transformers": [[11, "module-mltb2.transformers"]], "MLTB2 Documentation": [[12, "mltb2-documentation"]], "Installation": [[12, "installation"]], "Content": [[12, "content"]]}, "indexentries": {"_load_colon_data() (in module mltb2.data)": [[1, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[1, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[1, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[1, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[1, "mltb2.data.load_prostate"]], "mltb2.data": [[1, "module-mltb2.data"]], "module": [[1, "module-mltb2.data"], [2, "module-mltb2.fasttext"], [3, "module-mltb2.files"], [4, "module-mltb2.md"], [5, "module-mltb2.openai"], [6, "module-mltb2.optuna"], [7, "module-mltb2.plot"], [8, "module-mltb2.somajo"], [9, "module-mltb2.somajo_transformers"], [10, "module-mltb2.text"], [11, "module-mltb2.transformers"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[2, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[2, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[3, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[3, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[3, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[4, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[4, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[4, 
"mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[4, "mltb2.md.chunk_md"]], "mltb2.md": [[4, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[5, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[5, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[5, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[5, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[5, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[5, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[5, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[6, "module-mltb2.optuna"]], "prune() 
(mltb2.optuna.significancerepeatedtrainingpruner method)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[7, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[7, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[7, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[8, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[8, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[8, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[8, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[8, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[8, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[8, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[9, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[9, "mltb2.somajo_transformers.TextSplitter.__call__"]], 
"mltb2.somajo_transformers": [[9, "module-mltb2.somajo_transformers"]], "has_invisible_characters() (in module mltb2.text)": [[10, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[10, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[10, "mltb2.text.remove_invisible_characters"]], "replace_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[11, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[11, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[11, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[11, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[11, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[11, "mltb2.transformers.KFoldLabeledDataset.split"]]}})
\ No newline at end of file
+Search.setIndex({"docnames": ["api-reference", "api-reference/data", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/data.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "data
", "fasttext
", "files
", "md
", "openai
", "optuna
", "plot
", "somajo
", "somajo_transformers
", "text
", "transformers
", "MLTB2 Documentation"], "terms": {"data": [0, 3, 6, 11, 12], "fasttext": [0, 12], "file": [0, 2, 5, 12], "md": [0, 12], "openai": [0, 12], "optuna": [0, 12], "plot": [0, 12], "somajo": [0, 9, 12], "somajo_transform": [0, 12], "text": [0, 2, 4, 5, 8, 9, 11, 12], "transform": [0, 9, 12], "load": 1, "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "pip": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "instal": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "necessari": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "depend": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "_load_colon_data": 1, "datafram": 1, "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "colon": 1, "label": [1, 7, 11], "The": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "pars": 1, "from": [1, 2, 3, 5, 6, 8, 10], "internet": 1, "also": [1, 5], "see": [1, 6], "http": [1, 8], "genom": 1, "pub": 1, "princeton": 1, "edu": 1, "oncologi": 1, "affydata": 1, "index": [1, 12], "html": 1, "return": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "panda": 1, "type": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "_load_colon_label": 1, "seri": 1, "load_colon": 1, "tupl": [1, 8], "contain": [1, 2, 8, 10, 11], "load_leukemia_big": 1, "leukemia": 1, "big": 1, "web": 1, "stanford": 1, "hasti": 1, "casi_fil": 1, "load_prost": 1, "prostat": 1, "specif": [2, 4, 5, 6, 8, 9, 10, 11], "base": [2, 3, 4, 5, 6, 7, 8, 9, 11], "class": [2, 4, 5, 6, 8, 9, 11], "fasttextlanguageidentif": 2, "object": [2, 4, 5, 6, 8, 9, 11], "identifi": 2, "languag": [2, 8], "__call__": [2, 4, 5, 8, 9, 11], "str": [2, 3, 4, 5, 7, 8, 9, 10, 11], "num_lang": 2, "int": [2, 4, 5, 6, 7, 9, 11], "10": [2, 6], "given": [2, 6], "paramet": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], "which": [2, 3, 5, 6, 8, 10, 11], "recogn": 2, "number": [2, 4, 5, 6, 7, 8, 9, 11], "A": [2, 6, 12], "dict": [2, 5], "probabl": 2, "more": [2, 6, 7], "than": [2, 9], "element": 2, "so": 2, "guarante": 2, 
"you": [2, 5, 6, 12], "want": 2, "includ": 2, "case": [2, 5], "when": [2, 5], "veri": 2, "low": 2, "possibl": 2, "ar": [2, 4, 5, 6, 10, 11], "af": 2, "al": 2, "am": 2, "an": [2, 5, 6, 8, 9, 11], "arz": 2, "ast": 2, "av": 2, "az": 2, "azb": 2, "ba": 2, "bar": 2, "bcl": 2, "bg": 2, "bh": 2, "bn": 2, "bo": 2, "bpy": 2, "br": 2, "b": 2, "bxr": 2, "ca": 2, "cbk": 2, "ce": 2, "ceb": 2, "ckb": 2, "co": [2, 11], "c": 2, "cv": [2, 6], "cy": 2, "da": [2, 8], "de": [2, 8], "diq": 2, "dsb": 2, "dty": 2, "dv": 2, "el": 2, "eml": 2, "en": 2, "eo": 2, "e": 2, "et": 2, "eu": 2, "fa": 2, "fi": 2, "fr": 2, "frr": 2, "fy": 2, "ga": 2, "gd": 2, "gl": 2, "gn": 2, "gom": 2, "gu": 2, "gv": 2, "he": 2, "hi": 2, "hif": 2, "hr": 2, "hsb": 2, "ht": 2, "hu": 2, "hy": 2, "ia": 2, "id": [2, 11], "ie": 2, "ilo": 2, "io": 2, "ja": 2, "jbo": 2, "jv": 2, "ka": 2, "kk": 2, "km": 2, "kn": 2, "ko": 2, "krc": 2, "ku": 2, "kv": 2, "kw": 2, "ky": 2, "la": 2, "lb": 2, "lez": 2, "li": 2, "lmo": 2, "lo": 2, "lrc": 2, "lt": 2, "lv": 2, "mai": 2, "mg": 2, "mhr": 2, "min": 2, "mk": 2, "ml": 2, "mn": 2, "mr": 2, "mrj": 2, "m": 2, "mt": 2, "mwl": 2, "my": 2, "myv": 2, "mzn": 2, "nah": 2, "nap": 2, "nd": 2, "ne": 2, "new": 2, "nl": 2, "nn": 2, "oc": 2, "o": 2, "pa": 2, "pam": 2, "pfl": 2, "pl": 2, "pm": 2, "pnb": 2, "p": 2, "pt": 2, "qu": 2, "rm": 2, "ro": 2, "ru": 2, "rue": 2, "sa": 2, "sah": 2, "sc": 2, "scn": 2, "sco": 2, "sd": 2, "sh": 2, "si": 2, "sk": 2, "sl": 2, "sq": 2, "sr": 2, "su": 2, "sv": 2, "sw": 2, "ta": 2, "te": 2, "tg": 2, "th": 2, "tk": 2, "tl": 2, "tr": 2, "tt": 2, "tyv": 2, "ug": 2, "uk": 2, "ur": 2, "uz": 2, "vec": 2, "vep": 2, "vi": 2, "vl": 2, "vo": 2, "wa": [2, 5, 11], "war": 2, "wuu": 2, "xal": 2, "xmf": 2, "yi": 2, "yo": 2, "yue": 2, "zh": 2, "static": 2, "get_model_path_and_download": 2, "get": [2, 5, 8], "model": [2, 5, 6, 11], "path": [2, 3, 11], "download": 2, "need": 2, "full": [2, 3, 5], "util": [3, 11], "fetch_remote_fil": 3, "dirnam": 3, "filenam": [3, 7], "url": [3, 8], 
"sha256_checksum": 3, "fetch": 3, "remot": 3, "directori": [3, 11], "where": [3, 8], "save": [3, 7], "under": 3, "sha256": 3, "checksum": 3, "creat": [3, 5, 7], "rais": [3, 9], "ioerror": 3, "wrong": 3, "get_and_create_mltb2_data_dir": 3, "mltb2_base_data_dir": 3, "none": [3, 5, 7, 8, 11], "mltb": 3, "dir": 3, "If": [3, 5, 7, 8, 9, 11], "default": [3, 7], "user": [3, 6], "markdown": 4, "mdtextsplitt": 4, "max_token": [4, 9], "transformers_token_count": [4, 9], "transformerstokencount": [4, 9, 11], "show_progress_bar": [4, 5, 8, 9, 11], "bool": [4, 5, 6, 7, 8, 9, 10, 11], "fals": [4, 5, 7, 8, 9, 10, 11], "split": [4, 6, 8, 9, 11], "section": [4, 9], "specifi": [4, 5, 9], "maximum": [4, 9], "token": [4, 5, 8, 9, 11], "doe": [4, 6, 7, 9], "divid": [4, 9], "head": 4, "correspond": 4, "paragraph": 4, "per": [4, 6, 9], "can": [4, 5, 7, 12], "onli": [4, 5, 6], "exceed": 4, "singl": [4, 7, 10], "chunk": 4, "alreadi": 4, "larger": [4, 6], "counter": [4, 9], "show": [4, 5, 8, 9, 11], "progressbar": [4, 5, 8, 9, 11], "dure": [4, 5, 8, 9, 11], "process": [4, 5, 8, 9, 11], "md_text": 4, "list": [4, 5, 8, 9, 11], "_chunk_md_by_headlin": 4, "headlin": 4, "chunk_md": 4, "merg": 4, "isol": 4, "subsequ": 4, "end": 4, "without": [4, 5], "content": 4, "remov": [4, 10], "openaiazurechatcomplet": 5, "completion_kwarg": 5, "ani": 5, "openaichatcomplet": 5, "azur": 5, "chat": 5, "complet": 5, "construct": 5, "openaibasecomplet": 5, "from_yaml": 5, "kwarg": 5, "function": [5, 6, 7], "follow": 5, "properti": 5, "must": [5, 6], "api_typ": 5, "api_vers": 5, "api_bas": 5, "engin": 5, "quickstart": 5, "start": 5, "gpt": 5, "35": 5, "turbo": 5, "4": [5, 6], "servic": 5, "openaiazurecomplet": 5, "openaicomplet": 5, "non": 5, "gener": [5, 11], "abc": [5, 8], "abstract": [5, 8], "prompt": 5, "map": 5, "openaicompletionansw": 5, "call": [5, 6, 7], "llm": 5, "In": [5, 6], "string": [5, 8], "allow": 5, "overwrit": 5, "exampl": [5, 6, 8], "chang": 5, "temperatur": 5, "_complet": 5, 
"completion_kwargs_for_this_cal": 5, "openaiobject": 5, "method": [5, 6], "classmethod": 5, "yaml_fil": 5, "yaml": 5, "prompt_token": 5, "completion_token": 5, "total_token": 5, "finish_reason": 5, "answer": 5, "result": [5, 6], "name": 5, "ha": [5, 7], "been": 5, "total": 5, "reason": [5, 6], "why": 5, "stop": 5, "mean": [5, 6], "api": [5, 12], "run": 5, "limit": 5, "length": 5, "becaus": 5, "function_cal": 5, "from_open_ai_object": 5, "open_ai_object": 5, "openaitokencount": 5, "model_nam": 5, "count": [5, 11], "some": [5, 12], "3": [5, 6], "5": 5, "davinci": 5, "003": 5, "embed": 5, "ada": 5, "002": 5, "iter": [5, 8, 11], "just": [5, 11], "_check_mandatory_azure_completion_kwarg": 5, "check": [5, 6, 10], "mandatori": 5, "significancerepeatedtrainingprun": 6, "alpha": 6, "float": [6, 8], "0": [6, 7], "1": [6, 11], "n_warmup_step": 6, "baseprun": 6, "pruner": 6, "statist": 6, "signific": 6, "heurist": 6, "decis": 6, "make": [6, 7], "It": [6, 8, 12], "prune": 6, "repeat": 6, "train": [6, 11], "like": 6, "cross": [6, 11], "valid": [6, 11], "As": 6, "test": [6, 11], "t": 6, "our": 6, "experi": 6, "have": 6, "shown": 6, "aplha": 6, "valu": [6, 7], "between": 6, "": [6, 7], "standard": 6, "assum": 6, "adjust": 6, "onc": 6, "hyperparamet": 6, "set": [6, 8, 11], "those": 6, "work": 6, "basi": 6, "intermedi": 6, "For": [6, 7], "epoch": 6, "contrast": 6, "precis": 6, "individu": 6, "fold": [6, 11], "below": 6, "minimalist": 6, "import": [6, 8], "log": 6, "numpi": 6, "np": 6, "sklearn": 6, "dataset": [6, 11], "load_iri": 6, "model_select": 6, "stratifiedkfold": 6, "ensembl": 6, "randomforestclassifi": 6, "metric": 6, "accuracy_scor": 6, "configur": 6, "logger": 6, "debug": 6, "output": [6, 8], "getlogg": 6, "addhandl": 6, "streamhandl": 6, "setlevel": 6, "x": [6, 7], "y": [6, 7], "target": 6, "def": 6, "trial": 6, "min_samples_split": 6, "suggest_int": 6, "2": 6, "20": 6, "n_estim": 6, "100": 6, "validation_result_list": 6, "skf": 6, "n_split": [6, 11], "fold_index": 6, 
"train_index": 6, "val_index": 6, "enumer": 6, "x_train": 6, "x_val": 6, "y_train": 6, "y_val": 6, "rf": 6, "fit": 6, "y_pred": 6, "predict": 6, "acc": 6, "append": 6, "report": 6, "we": 6, "should": [6, 8], "should_prun": 6, "here": 6, "done": 6, "break": 6, "studi": 6, "create_studi": 6, "storag": 6, "sqlite": 6, "db": 6, "memori": 6, "study_nam": 6, "iris_cv": 6, "direct": 6, "maxim": 6, "load_if_exist": 6, "true": [6, 7, 9, 10], "sampler": 6, "tpesampl": 6, "multivari": 6, "add": 6, "optim": 6, "n_trial": 6, "level": 6, "aggress": 6, "smaller": 6, "stronger": 6, "differ": [6, 7], "two": [6, 7, 8], "distribut": 6, "disabl": 6, "until": 6, "reach": 6, "exce": 6, "step": [6, 7], "frozentri": 6, "judg": 6, "whether": 6, "note": 6, "suppos": 6, "librari": 6, "instead": 6, "provid": 6, "interfac": 6, "implement": 6, "mechan": 6, "take": 6, "copi": 6, "befor": 6, "modifi": 6, "boolean": 6, "repres": 6, "tool": [7, 8, 12], "matplotlib": 7, "boxplot": 7, "titl": 7, "xlabel": 7, "ylabel": 7, "vert": 7, "print": [7, 8], "one": [7, 8], "diagram": 7, "pyplot": 7, "axi": 7, "box": [7, 12], "vertic": 7, "horizont": 7, "boxplot_dict": 7, "values_dict": 7, "form": [7, 10], "dictionari": 7, "save_last_figur": 7, "last": 7, "jupyt": 7, "notebook": 7, "same": 7, "cell": 7, "twin_axes_timeseries_plot": 7, "values_1": 7, "label_1": 7, "values_2": 7, "label_2": 7, "start_timestep_numb": 7, "shift_1": 7, "shift_2": 7, "label_x": 7, "color_1": 7, "tab": 7, "red": 7, "color_2": 7, "blue": 7, "twin": 7, "ax": 7, "timeseri": 7, "curv": 7, "array_lik": 7, "first": 7, "second": 7, "point": 7, "time": 7, "timestep": 7, "shift": 7, "posit": 7, "neg": 7, "color": 7, "jaccardsimilar": 8, "somajobaseclass": 8, "calcul": 8, "jaccard": 8, "similar": 8, "de_cmc": 8, "german": 8, "en_ptb": 8, "english": 8, "text1": 8, "text2": 8, "get_token_set": 8, "word": [8, 9], "directli": 8, "somajosentencesplitt": [8, 9], "sentenc": [8, 9], "tokenextractor": 8, "extract": 8, "extract_url_set": 8, 
"token_extractor": 8, "url_set": 8, "ist": 8, "ein": 8, "link": 8, "github": [8, 12], "com": 8, "urlswapp": 8, "url_pattern": 8, "swap": 8, "revers": 8, "replac": [8, 10], "extractor": 8, "pattern": 8, "One": 8, "mark": 8, "place": 8, "put": 8, "reverse_swap_url": 8, "revert": 8, "were": 8, "unknown": 8, "swap_url": 8, "detoken": 8, "convert": 8, "how": 8, "do": [8, 11], "extract_token_class_set": 8, "keep_token_class": 8, "keep": 8, "all": [8, 12], "kept": 8, "hug": [9, 11], "face": [9, 11], "textsplitt": 9, "somajo_sentence_splitt": 9, "ignore_overly_long_sent": 9, "alwai": 9, "whole": 9, "splitter": 9, "valueerror": 9, "except": 9, "longer": 9, "simpli": 9, "ignor": 9, "clean_all_invisible_chars_and_whitespac": 10, "clean": 10, "invis": 10, "charact": 10, "whitespac": 10, "special": 10, "normal": 10, "multipl": 10, "lead": 10, "trail": 10, "defin": 10, "constant": 10, "invisible_charact": 10, "special_whitespac": 10, "rteturn": 10, "has_invisible_charact": 10, "otherwis": 10, "has_special_whitespac": 10, "remove_invisible_charact": 10, "replace_multiple_whitespac": 10, "replace_special_whitespac": 10, "kfoldlabeleddataset": 11, "7": 11, "n_repeat": 11, "random_st": 11, "k": 11, "labeleddataset": 11, "labeled_dataset": 11, "stratification_label": 11, "encod": 11, "labe": 11, "pretrained_model_name_or_path": 11, "pathlik": 11, "host": 11, "insid": 11, "repo": 11, "huggingfac": 11, "machin": 12, "learn": 12, "avail": 12, "python": 12, "packag": 12, "pypi": 12, "option": 12, "might": 12, "them": 12, "refer": 12, "repositori": 12, "licens": 12, "imprint": 12}, "objects": {"mltb2": [[1, 0, 0, "-", "data"], [2, 0, 0, "-", "fasttext"], [3, 0, 0, "-", "files"], [4, 0, 0, "-", "md"], [5, 0, 0, "-", "openai"], [6, 0, 0, "-", "optuna"], [7, 0, 0, "-", "plot"], [8, 0, 0, "-", "somajo"], [9, 0, 0, "-", "somajo_transformers"], [10, 0, 0, "-", "text"], [11, 0, 0, "-", "transformers"]], "mltb2.data": [[1, 1, 1, "", "_load_colon_data"], [1, 1, 1, "", "_load_colon_label"], [1, 1, 
1, "", "load_colon"], [1, 1, 1, "", "load_leukemia_big"], [1, 1, 1, "", "load_prostate"]], "mltb2.fasttext": [[2, 2, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[2, 3, 1, "", "__call__"], [2, 3, 1, "", "get_model_path_and_download"]], "mltb2.files": [[3, 1, 1, "", "fetch_remote_file"], [3, 1, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[4, 2, 1, "", "MdTextSplitter"], [4, 1, 1, "", "_chunk_md_by_headline"], [4, 1, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[4, 3, 1, "", "__call__"]], "mltb2.openai": [[5, 2, 1, "", "OpenAiAzureChatCompletion"], [5, 2, 1, "", "OpenAiAzureCompletion"], [5, 2, 1, "", "OpenAiBaseCompletion"], [5, 2, 1, "", "OpenAiChatCompletion"], [5, 2, 1, "", "OpenAiCompletion"], [5, 2, 1, "", "OpenAiCompletionAnswer"], [5, 2, 1, "", "OpenAiTokenCounter"], [5, 1, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[5, 3, 1, "", "__call__"], [5, 3, 1, "", "_completion"], [5, 3, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[5, 3, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[5, 3, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[5, 3, 1, "", "__call__"]], "mltb2.optuna": [[6, 2, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[6, 3, 1, "", "prune"]], "mltb2.plot": [[7, 1, 1, "", "boxplot"], [7, 1, 1, "", "boxplot_dict"], [7, 1, 1, "", "save_last_figure"], [7, 1, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[8, 2, 1, "", "JaccardSimilarity"], [8, 2, 1, "", "SoMaJoBaseClass"], [8, 2, 1, "", "SoMaJoSentenceSplitter"], [8, 2, 1, "", "TokenExtractor"], [8, 2, 1, "", "UrlSwapper"], [8, 1, 1, "", "detokenize"], [8, 1, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[8, 3, 1, "", "__call__"], [8, 3, 1, "", "get_token_set"]], 
"mltb2.somajo.SoMaJoSentenceSplitter": [[8, 3, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[8, 3, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[8, 3, 1, "", "reverse_swap_urls"], [8, 3, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[9, 2, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[9, 3, 1, "", "__call__"]], "mltb2.text": [[10, 1, 1, "", "clean_all_invisible_chars_and_whitespaces"], [10, 1, 1, "", "has_invisible_characters"], [10, 1, 1, "", "has_special_whitespaces"], [10, 1, 1, "", "remove_invisible_characters"], [10, 1, 1, "", "replace_multiple_whitespaces"], [10, 1, 1, "", "replace_special_whitespaces"]], "mltb2.transformers": [[11, 2, 1, "", "KFoldLabeledDataset"], [11, 2, 1, "", "LabeledDataset"], [11, 2, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[11, 3, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[11, 3, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:function", "2": "py:class", "3": "py:method"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "function", "Python function"], "2": ["py", "class", "Python class"], "3": ["py", "method", "Python method"]}, "titleterms": {"api": 0, "refer": 0, "data": 1, "fasttext": 2, "file": 3, "md": 4, "openai": 5, "optuna": 6, "plot": 7, "somajo": 8, "somajo_transform": 9, "text": 10, "transform": 11, "mltb2": 12, "document": 12, "instal": 12, "content": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "data": [[1, "module-mltb2.data"]], "fasttext": [[2, "module-mltb2.fasttext"]], "files": [[3, 
"module-mltb2.files"]], "md": [[4, "module-mltb2.md"]], "openai": [[5, "module-mltb2.openai"]], "optuna": [[6, "module-mltb2.optuna"]], "plot": [[7, "module-mltb2.plot"]], "somajo": [[8, "module-mltb2.somajo"]], "somajo_transformers": [[9, "module-mltb2.somajo_transformers"]], "text": [[10, "module-mltb2.text"]], "transformers": [[11, "module-mltb2.transformers"]], "MLTB2 Documentation": [[12, "mltb2-documentation"]], "Installation": [[12, "installation"]], "Content": [[12, "content"]]}, "indexentries": {"_load_colon_data() (in module mltb2.data)": [[1, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[1, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[1, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[1, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[1, "mltb2.data.load_prostate"]], "mltb2.data": [[1, "module-mltb2.data"]], "module": [[1, "module-mltb2.data"], [2, "module-mltb2.fasttext"], [3, "module-mltb2.files"], [4, "module-mltb2.md"], [5, "module-mltb2.openai"], [6, "module-mltb2.optuna"], [7, "module-mltb2.plot"], [8, "module-mltb2.somajo"], [9, "module-mltb2.somajo_transformers"], [10, "module-mltb2.text"], [11, "module-mltb2.transformers"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[2, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[2, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[2, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[3, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[3, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[3, 
"module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[4, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[4, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[4, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[4, "mltb2.md.chunk_md"]], "mltb2.md": [[4, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[5, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[5, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[5, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[5, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[5, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[5, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[5, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[5, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[5, 
"module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[6, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[6, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[7, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[7, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[7, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[7, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[8, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[8, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[8, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[8, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[8, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[8, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[8, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[8, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[8, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[8, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in 
mltb2.somajo_transformers)": [[9, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[9, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[9, "module-mltb2.somajo_transformers"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "has_invisible_characters() (in module mltb2.text)": [[10, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[10, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[10, "mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[10, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[11, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[11, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[11, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[11, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[11, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[11, "mltb2.transformers.KFoldLabeledDataset.split"]]}})
\ No newline at end of file