diff --git a/_modules/mltb2/somajo.html b/_modules/mltb2/somajo.html index 51d581f..eebee3e 100644 --- a/_modules/mltb2/somajo.html +++ b/_modules/mltb2/somajo.html @@ -90,7 +90,7 @@
from abc import ABC
from dataclasses import dataclass, field
-from typing import Container, Iterable, List, Optional, Set, Union
+from typing import Container, Dict, Iterable, List, Optional, Set, Tuple, Union
from somajo import SoMaJo
from tqdm import tqdm
@@ -266,6 +266,49 @@ Source code for mltb2.somajo
sentences = self.somajo.tokenize_text(text)
result = extract_token_class_set(sentences, keep_token_classes="URL")
return result
TokenExtractor.extract_url_set()
UrlSwapper
+detokenize()
extract_token_class_set()
Bases: object
Tool to swap (and reverse swap) links with a numbered replacement link.
+token_extractor (TokenExtractor) – The sentence token extractor to be used.
url_pattern (str) – The pattern to use for replacement. One {}
marks the place where to put the number.
Revert the url swap.
+ ++ |
S |
- + |
+ |
fasttext
", "files
", "md
", "openai
", "optuna
", "plot
", "somajo
", "somajo_transformers
", "transformers
", "MLTB2 Documentation"], "terms": {"fasttext": [0, 10], "file": [0, 4, 10], "md": [0, 10], "openai": [0, 10], "optuna": [0, 10], "plot": [0, 10], "somajo": [0, 8, 10], "somajo_transform": [0, 10], "transform": [0, 8, 10], "specif": [1, 3, 4, 5, 7, 8, 9], "function": [1, 3, 4, 5, 6, 7, 8, 9], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9], "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9], "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 6, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9], "pip": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "instal": [1, 2, 3, 4, 5, 6, 7, 8, 9], "necessari": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "depend": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9], "class": [1, 3, 4, 5, 7, 8, 9], "fasttextlanguageidentif": 1, "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9], "object": [1, 3, 4, 5, 8, 9], "identifi": 1, "languag": [1, 7], "text": [1, 3, 4, 7, 8, 9], "__call__": [1, 3, 4, 7, 8, 9], "num_lang": 1, "int": [1, 3, 4, 5, 6, 8, 9], "10": [1, 5], "given": [1, 5], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9], "which": [1, 2, 4, 5, 7, 9], "recogn": 1, "number": [1, 3, 4, 5, 6, 8, 9], "return": [1, 2, 3, 4, 5, 7, 8, 9], "A": [1, 5, 6, 10], "dict": [1, 4], "from": [1, 2, 4, 5, 7], "probabl": 1, "contain": [1, 7, 9], "more": [1, 5, 6], "than": [1, 8], "element": 1, "so": 1, "guarante": 1, "you": [1, 4, 5, 10], "want": 1, "includ": 1, "case": [1, 4], "when": [1, 4], "veri": 1, "low": 1, "possibl": 1, "ar": [1, 4, 5, 9], "af": 1, "al": 1, "am": 1, "an": [1, 4, 5, 7, 8, 9], "arz": 1, "ast": 1, "av": 1, "az": 1, "azb": 1, "ba": 1, "bar": 1, "bcl": 1, "bg": 1, "bh": 1, "bn": 1, "bo": 1, "bpy": 1, "br": 1, "b": 1, "bxr": 1, "ca": 1, "cbk": 1, "ce": 1, "ceb": 1, "ckb": 1, "co": [1, 9], "c": 1, "cv": [1, 5], "cy": 1, "da": [1, 7], "de": [1, 7], "diq": 1, "dsb": 1, "dty": 1, "dv": 1, "el": 1, "eml": 1, "en": 1, "eo": 1, "e": 1, "et": 1, "eu": 1, "fa": 1, "fi": 1, "fr": 1, "frr": 1, "fy": 1, "ga": 1, "gd": 1, "gl": 1, "gn": 1, "gom": 1, "gu": 1, "gv": 1, "he": 1, "hi": 1, "hif": 1, "hr": 1, "hsb": 1, "ht": 1, "hu": 1, "hy": 1, "ia": 1, "id": [1, 9], "ie": 1, "ilo": 1, "io": 1, "ja": 1, "jbo": 1, "jv": 1, "ka": 1, "kk": 1, "km": 1, "kn": 1, "ko": 1, "krc": 1, "ku": 1, "kv": 1, "kw": 1, "ky": 1, "la": 1, "lb": 1, "lez": 1, "li": 1, "lmo": 1, "lo": 1, "lrc": 1, "lt": 1, "lv": 1, "mai": 1, "mg": 1, "mhr": 1, "min": 1, "mk": 1, "ml": 1, "mn": 1, "mr": 1, "mrj": 1, "m": 1, "mt": 1, "mwl": 1, "my": 1, "myv": 1, "mzn": 1, "nah": 1, "nap": 1, "nd": 1, "ne": 1, "new": 1, "nl": 1, "nn": 1, "oc": 1, "o": 1, "pa": 1, "pam": 1, "pfl": 1, "pl": 1, "pm": 1, "pnb": 1, "p": 1, "pt": 1, "qu": 1, "rm": 1, "ro": 1, "ru": 1, "rue": 1, "sa": 1, "sah": 1, "sc": 1, "scn": 1, "sco": 1, "sd": 1, "sh": 1, "si": 1, "sk": 1, "sl": 1, "sq": 1, "sr": 1, "su": 1, "sv": 1, "sw": 1, "ta": 1, "te": 1, "tg": 1, "th": 1, "tk": 1, "tl": 1, "tr": 1, "tt": 1, "tyv": 1, "ug": 1, "uk": 1, "ur": 1, "uz": 1, "vec": 1, "vep": 1, "vi": 1, "vl": 1, "vo": 1, "wa": [1, 4, 9], "war": 1, "wuu": 1, "xal": 1, "xmf": 1, "yi": 1, "yo": 1, "yue": 1, "zh": 1, "static": 1, "get_model_path_and_download": 1, "str": [1, 2, 3, 4, 6, 7, 8, 9], "get": [1, 4, 7], "model": [1, 4, 5, 9], "path": [1, 2, 9], "download": 1, "need": 1, "type": [1, 2, 3, 4, 5, 7, 8, 9], "util": [2, 9], "fetch_remote_fil": 2, "dirnam": 2, "filenam": [2, 6], "url": [2, 7], "sha256_checksum": 2, "fetch": 2, "remot": 2, "directori": [2, 9], "where": 2, "save": [2, 6], "under": 2, "sha256": 2, "checksum": 2, "full": [2, 4], "creat": [2, 4, 6], "rais": [2, 8], "ioerror": 2, "wrong": 2, "get_and_create_mltb2_data_dir": 2, "mltb2_base_data_dir": 2, "none": [2, 4, 6, 7, 9], "mltb": 2, "data": [2, 5, 9], "dir": 2, "The": [2, 3, 4, 5, 7, 8, 9], "markdown": 3, "mdtextsplitt": 3, "max_token": [3, 8], "transformers_token_count": [3, 8], "transformerstokencount": [3, 8, 9], "show_progress_bar": [3, 4, 7, 8, 9], "bool": [3, 4, 5, 7, 8, 9], "fals": [3, 4, 7, 8, 9], "split": [3, 5, 7, 8, 9], "section": [3, 8], "specifi": [3, 4, 8], "maximum": [3, 8], "token": [3, 4, 7, 8, 9], "doe": [3, 5, 6, 8], "divid": [3, 8], "head": 3, "correspond": 3, "paragraph": 3, "per": [3, 5, 8], "can": [3, 4, 6, 10], "onli": [3, 4, 5], "exceed": 3, "singl": [3, 6], "chunk": 3, "alreadi": 3, "larger": [3, 5], "counter": [3, 8], "show": [3, 4, 7, 8, 9], "progressbar": [3, 4, 7, 8, 9], "dure": [3, 4, 7, 8, 9], "process": [3, 4, 7, 8, 9], "md_text": 3, "list": [3, 4, 7, 8, 9], "_chunk_md_by_headlin": 3, "headlin": 3, "chunk_md": 3, "merg": 3, "isol": 3, "openaiazurechatcomplet": 4, "completion_kwarg": 4, "ani": 4, "openaichatcomplet": 4, "azur": 4, "chat": 4, "complet": 4, "also": 4, "construct": 4, "openaibasecomplet": 4, "from_yaml": 4, "kwarg": 4, "follow": 4, "properti": 4, "must": [4, 5], "api_typ": 4, "api_vers": 4, "api_bas": 4, "engin": 4, "quickstart": 4, "start": 4, "gpt": 4, "35": 4, "turbo": 4, "4": [4, 5], "servic": 4, "openaiazurecomplet": 4, "openaicomplet": 4, "non": 4, "gener": [4, 9], "abc": [4, 7], "abstract": [4, 7], "prompt": 4, "map": 4, "openaicompletionansw": 4, "call": [4, 5, 6], "llm": 4, "In": [4, 5], "string": [4, 7], "allow": 4, "overwrit": 4, "exampl": [4, 5, 7], "chang": 4, "temperatur": 4, "_complet": 4, "completion_kwargs_for_this_cal": 4, "openaiobject": 4, "method": [4, 5], "classmethod": 4, "yaml_fil": 4, "yaml": 4, "prompt_token": 4, "completion_token": 4, "total_token": 4, "finish_reason": 4, "answer": 4, "result": [4, 5], "name": 4, "ha": [4, 6], "been": 4, "total": 4, "reason": [4, 5], "why": 4, "stop": 4, "mean": [4, 5], "api": [4, 10], "without": 4, "run": 4, "limit": 4, "length": 4, "becaus": 4, "function_cal": 4, "from_open_ai_object": 4, "open_ai_object": 4, "openaitokencount": 4, "model_nam": 4, "count": [4, 9], "some": [4, 10], "3": [4, 5], "5": 4, "davinci": 4, "003": 4, "embed": 4, "ada": 4, "002": 4, "iter": [4, 7, 9], "just": [4, 9], "If": [4, 7, 8, 9], "_check_mandatory_azure_completion_kwarg": 4, "check": [4, 5], "mandatori": 4, "significancerepeatedtrainingprun": 5, "alpha": 5, "float": [5, 7], "0": [5, 6], "1": [5, 9], "n_warmup_step": 5, "baseprun": 5, "pruner": 5, "statist": 5, "signific": 5, "heurist": 5, "decis": 5, "make": 5, "It": [5, 7, 10], "prune": 5, "repeat": 5, "train": [5, 9], "like": 5, "cross": [5, 9], "valid": [5, 9], "As": 5, "test": [5, 9], "t": 5, "our": 5, "experi": 5, "have": 5, "shown": 5, "aplha": 5, "valu": [5, 6], "between": 5, "": 5, "standard": 5, "assum": 5, "adjust": 5, "onc": 5, "hyperparamet": 5, "set": [5, 7, 9], "those": 5, "work": 5, "basi": 5, "intermedi": 5, "For": [5, 6], "epoch": 5, "contrast": 5, "precis": 5, "individu": 5, "fold": [5, 9], "below": 5, "minimalist": 5, "import": [5, 7], "log": 5, "numpi": 5, "np": 5, "sklearn": 5, "dataset": [5, 9], "load_iri": 5, "model_select": 5, "stratifiedkfold": 5, "ensembl": 5, "randomforestclassifi": 5, "metric": 5, "accuracy_scor": 5, "configur": 5, "logger": 5, "see": 5, "debug": 5, "output": [5, 7], "getlogg": 5, "addhandl": 5, "streamhandl": 5, "setlevel": 5, "x": [5, 6], "y": [5, 6], "target": 5, "def": 5, "trial": 5, "min_samples_split": 5, "suggest_int": 5, "2": 5, "20": 5, "n_estim": 5, "100": 5, "validation_result_list": 5, "skf": 5, "n_split": [5, 9], "fold_index": 5, "train_index": 5, "val_index": 5, "enumer": 5, "x_train": 5, "x_val": 5, "y_train": 5, "y_val": 5, "rf": 5, "fit": 5, "y_pred": 5, "predict": 5, "acc": 5, "append": 5, "report": 5, "we": 5, "should": [5, 7], "should_prun": 5, "here": 5, "done": 5, "break": 5, "studi": 5, "create_studi": 5, "storag": 5, "sqlite": 5, "db": 5, "memori": 5, "study_nam": 5, "iris_cv": 5, "direct": 5, "maxim": 5, "load_if_exist": 5, "true": [5, 6, 8], "sampler": 5, "tpesampl": 5, "multivari": 5, "add": 5, "optim": 5, "n_trial": 5, "level": 5, "aggress": 5, "smaller": 5, "stronger": 5, "differ": [5, 6], "two": [5, 6, 7], "distribut": 5, "disabl": 5, "until": 5, "reach": 5, "exce": 5, "step": [5, 6], "frozentri": 5, "judg": 5, "whether": 5, "note": 5, "suppos": 5, "librari": 5, "user": 5, "instead": 5, "provid": 5, "interfac": 5, "implement": 5, "mechan": 5, "take": 5, "copi": 5, "befor": 5, "modifi": 5, "boolean": 5, "repres": 5, "collect": 6, "tool": [6, 7, 10], "matplotlib": 6, "boxplot": 6, "label": [6, 9], "titl": 6, "xlabel": 6, "ylabel": 6, "vert": 6, "print": [6, 7], "one": [6, 7], "diagram": 6, "pyplot": 6, "boxplot_dict": 6, "values_dict": 6, "form": 6, "dictionari": 6, "save_last_figur": 6, "last": 6, "jupyt": 6, "notebook": 6, "same": 6, "cell": 6, "twin_axes_timeseries_plot": 6, "values_1": 6, "label_1": 6, "values_2": 6, "label_2": 6, "start_timestep_numb": 6, "shift_1": 6, "shift_2": 6, "label_x": 6, "color_1": 6, "tab": 6, "red": 6, "color_2": 6, "blue": 6, "twin": 6, "ax": 6, "timeseri": 6, "curv": 6, "array_lik": 6, "first": 6, "second": 6, "point": 6, "time": 6, "default": 6, "timestep": 6, "shift": 6, "posit": 6, "neg": 6, "axi": 6, "color": 6, "jaccardsimilar": 7, "somajobaseclass": 7, "calcul": 7, "jaccard": 7, "similar": 7, "de_cmc": 7, "german": 7, "en_ptb": 7, "english": 7, "text1": 7, "text2": 7, "get_token_set": 7, "word": [7, 8], "directli": 7, "somajosentencesplitt": [7, 8], "sentenc": [7, 8], "tokenextractor": 7, "extract": 7, "extract_url_set": 7, "token_extractor": 7, "url_set": 7, "ist": 7, "ein": 7, "link": 7, "http": 7, "github": [7, 10], "com": 7, "detoken": 7, "convert": 7, "how": 7, "do": [7, 9], "extract_token_class_set": 7, "keep_token_class": 7, "keep": 7, "all": [7, 10], "kept": 7, "hug": [8, 9], "face": [8, 9], "textsplitt": 8, "somajo_sentence_splitt": 8, "ignore_overly_long_sent": 8, "alwai": 8, "whole": 8, "splitter": 8, "valueerror": 8, "except": 8, "longer": 8, "simpli": 8, "ignor": 8, "kfoldlabeleddataset": 9, "7": 9, "n_repeat": 9, "random_st": 9, "k": 9, "labeleddataset": 9, "labeled_dataset": 9, "stratification_label": 9, "encod": 9, "labe": 9, "pretrained_model_name_or_path": 9, "pathlik": 9, "host": 9, "insid": 9, "repo": 9, "huggingfac": 9, "box": 10, "machin": 10, "learn": 10, "avail": 10, "python": 10, "packag": 10, "index": 10, "pypi": 10, "option": 10, "might": 10, "them": 10, "refer": 10, "repositori": 10, "licens": 10, "imprint": 10}, "objects": {"mltb2": [[1, 0, 0, "-", "fasttext"], [2, 0, 0, "-", "files"], [3, 0, 0, "-", "md"], [4, 0, 0, "-", "openai"], [5, 0, 0, "-", "optuna"], [6, 0, 0, "-", "plot"], [7, 0, 0, "-", "somajo"], [8, 0, 0, "-", "somajo_transformers"], [9, 0, 0, "-", "transformers"]], "mltb2.fasttext": [[1, 1, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[1, 2, 1, "", "__call__"], [1, 2, 1, "", "get_model_path_and_download"]], "mltb2.files": [[2, 3, 1, "", "fetch_remote_file"], [2, 3, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[3, 1, 1, "", "MdTextSplitter"], [3, 3, 1, "", "_chunk_md_by_headline"], [3, 3, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[3, 2, 1, "", "__call__"]], "mltb2.openai": [[4, 1, 1, "", "OpenAiAzureChatCompletion"], [4, 1, 1, "", "OpenAiAzureCompletion"], [4, 1, 1, "", "OpenAiBaseCompletion"], [4, 1, 1, "", "OpenAiChatCompletion"], [4, 1, 1, "", "OpenAiCompletion"], [4, 1, 1, "", "OpenAiCompletionAnswer"], [4, 1, 1, "", "OpenAiTokenCounter"], [4, 3, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[4, 2, 1, "", "__call__"], [4, 2, 1, "", "_completion"], [4, 2, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[4, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[4, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[4, 2, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[4, 2, 1, "", "__call__"]], "mltb2.optuna": [[5, 1, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[5, 2, 1, "", "prune"]], "mltb2.plot": [[6, 3, 1, "", "boxplot"], [6, 3, 1, "", "boxplot_dict"], [6, 3, 1, "", "save_last_figure"], [6, 3, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[7, 1, 1, "", "JaccardSimilarity"], [7, 1, 1, "", "SoMaJoBaseClass"], [7, 1, 1, "", "SoMaJoSentenceSplitter"], [7, 1, 1, "", "TokenExtractor"], [7, 3, 1, "", "detokenize"], [7, 3, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[7, 2, 1, "", "__call__"], [7, 2, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[7, 2, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[7, 2, 1, "", "extract_url_set"]], "mltb2.somajo_transformers": [[8, 1, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[8, 2, 1, "", "__call__"]], "mltb2.transformers": [[9, 1, 1, "", "KFoldLabeledDataset"], [9, 1, 1, "", "LabeledDataset"], [9, 1, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[9, 2, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[9, 2, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "refer": 0, "fasttext": 1, "file": 2, "md": 3, "openai": 4, "optuna": 5, "plot": 6, "somajo": 7, "somajo_transform": 8, "transform": 9, "mltb2": 10, "document": 10, "instal": 10, "content": 10}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "fasttext": [[1, "module-mltb2.fasttext"]], "files": [[2, "module-mltb2.files"]], "md": [[3, "module-mltb2.md"]], "openai": [[4, "module-mltb2.openai"]], "optuna": [[5, "module-mltb2.optuna"]], "plot": [[6, "module-mltb2.plot"]], "somajo": [[7, "module-mltb2.somajo"]], "somajo_transformers": [[8, "module-mltb2.somajo_transformers"]], "transformers": [[9, "module-mltb2.transformers"]], "MLTB2 Documentation": [[10, "mltb2-documentation"]], "Installation": [[10, "installation"]], "Content": [[10, "content"]]}, "indexentries": {"fasttextlanguageidentification (class in mltb2.fasttext)": [[1, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[1, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[1, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[1, "module-mltb2.fasttext"]], "module": [[1, "module-mltb2.fasttext"], [2, "module-mltb2.files"], [3, "module-mltb2.md"], [4, "module-mltb2.openai"], [5, "module-mltb2.optuna"], [6, "module-mltb2.plot"], [7, "module-mltb2.somajo"], [8, "module-mltb2.somajo_transformers"], [9, "module-mltb2.transformers"]], "fetch_remote_file() (in module mltb2.files)": [[2, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[2, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[2, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[3, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[3, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[3, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[3, "mltb2.md.chunk_md"]], "mltb2.md": [[3, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[4, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[4, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[4, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[4, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[4, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[4, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[4, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[4, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[4, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[5, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[5, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[5, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[6, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[6, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[6, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[6, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[6, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[7, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[7, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[7, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[7, "mltb2.somajo.TokenExtractor"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[7, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[7, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[7, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[7, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[7, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[7, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[7, "module-mltb2.somajo"]], "textsplitter (class in mltb2.somajo_transformers)": [[8, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[8, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[8, "module-mltb2.somajo_transformers"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[9, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[9, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[9, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[9, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[9, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[9, "mltb2.transformers.KFoldLabeledDataset.split"]]}})
\ No newline at end of file
+Search.setIndex({"docnames": ["api-reference", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "fasttext
", "files
", "md
", "openai
", "optuna
", "plot
", "somajo
", "somajo_transformers
", "transformers
", "MLTB2 Documentation"], "terms": {"fasttext": [0, 10], "file": [0, 4, 10], "md": [0, 10], "openai": [0, 10], "optuna": [0, 10], "plot": [0, 10], "somajo": [0, 8, 10], "somajo_transform": [0, 10], "transform": [0, 8, 10], "specif": [1, 3, 4, 5, 7, 8, 9], "function": [1, 3, 4, 5, 6, 7, 8, 9], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9], "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9], "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 6, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9], "pip": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "instal": [1, 2, 3, 4, 5, 6, 7, 8, 9], "necessari": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "depend": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9], "class": [1, 3, 4, 5, 7, 8, 9], "fasttextlanguageidentif": 1, "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9], "object": [1, 3, 4, 5, 7, 8, 9], "identifi": 1, "languag": [1, 7], "text": [1, 3, 4, 7, 8, 9], "__call__": [1, 3, 4, 7, 8, 9], "num_lang": 1, "int": [1, 3, 4, 5, 6, 8, 9], "10": [1, 5], "given": [1, 5], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9], "which": [1, 2, 4, 5, 7, 9], "recogn": 1, "number": [1, 3, 4, 5, 6, 7, 8, 9], "return": [1, 2, 3, 4, 5, 7, 8, 9], "A": [1, 5, 6, 10], "dict": [1, 4], "from": [1, 2, 4, 5, 7], "probabl": 1, "contain": [1, 7, 9], "more": [1, 5, 6], "than": [1, 8], "element": 1, "so": 1, "guarante": 1, "you": [1, 4, 5, 10], "want": 1, "includ": 1, "case": [1, 4], "when": [1, 4], "veri": 1, "low": 1, "possibl": 1, "ar": [1, 4, 5, 9], "af": 1, "al": 1, "am": 1, "an": [1, 4, 5, 7, 8, 9], "arz": 1, "ast": 1, "av": 1, "az": 1, "azb": 1, "ba": 1, "bar": 1, "bcl": 1, "bg": 1, "bh": 1, "bn": 1, "bo": 1, "bpy": 1, "br": 1, "b": 1, "bxr": 1, "ca": 1, "cbk": 1, "ce": 1, "ceb": 1, "ckb": 1, "co": [1, 9], "c": 1, "cv": [1, 5], "cy": 1, "da": [1, 7], "de": [1, 7], "diq": 1, "dsb": 1, "dty": 1, "dv": 1, "el": 1, "eml": 1, "en": 1, "eo": 1, "e": 1, "et": 1, "eu": 1, "fa": 1, "fi": 1, "fr": 1, "frr": 1, "fy": 1, "ga": 1, "gd": 1, "gl": 1, "gn": 1, "gom": 1, "gu": 1, "gv": 1, "he": 1, "hi": 1, "hif": 1, "hr": 1, "hsb": 1, "ht": 1, "hu": 1, "hy": 1, "ia": 1, "id": [1, 9], "ie": 1, "ilo": 1, "io": 1, "ja": 1, "jbo": 1, "jv": 1, "ka": 1, "kk": 1, "km": 1, "kn": 1, "ko": 1, "krc": 1, "ku": 1, "kv": 1, "kw": 1, "ky": 1, "la": 1, "lb": 1, "lez": 1, "li": 1, "lmo": 1, "lo": 1, "lrc": 1, "lt": 1, "lv": 1, "mai": 1, "mg": 1, "mhr": 1, "min": 1, "mk": 1, "ml": 1, "mn": 1, "mr": 1, "mrj": 1, "m": 1, "mt": 1, "mwl": 1, "my": 1, "myv": 1, "mzn": 1, "nah": 1, "nap": 1, "nd": 1, "ne": 1, "new": 1, "nl": 1, "nn": 1, "oc": 1, "o": 1, "pa": 1, "pam": 1, "pfl": 1, "pl": 1, "pm": 1, "pnb": 1, "p": 1, "pt": 1, "qu": 1, "rm": 1, "ro": 1, "ru": 1, "rue": 1, "sa": 1, "sah": 1, "sc": 1, "scn": 1, "sco": 1, "sd": 1, "sh": 1, "si": 1, "sk": 1, "sl": 1, "sq": 1, "sr": 1, "su": 1, "sv": 1, "sw": 1, "ta": 1, "te": 1, "tg": 1, "th": 1, "tk": 1, "tl": 1, "tr": 1, "tt": 1, "tyv": 1, "ug": 1, "uk": 1, "ur": 1, "uz": 1, "vec": 1, "vep": 1, "vi": 1, "vl": 1, "vo": 1, "wa": [1, 4, 9], "war": 1, "wuu": 1, "xal": 1, "xmf": 1, "yi": 1, "yo": 1, "yue": 1, "zh": 1, "static": 1, "get_model_path_and_download": 1, "str": [1, 2, 3, 4, 6, 7, 8, 9], "get": [1, 4, 7], "model": [1, 4, 5, 9], "path": [1, 2, 9], "download": 1, "need": 1, "type": [1, 2, 3, 4, 5, 7, 8, 9], "util": [2, 9], "fetch_remote_fil": 2, "dirnam": 2, "filenam": [2, 6], "url": [2, 7], "sha256_checksum": 2, "fetch": 2, "remot": 2, "directori": [2, 9], "where": [2, 7], "save": [2, 6], "under": 2, "sha256": 2, "checksum": 2, "full": [2, 4], "creat": [2, 4, 6], "rais": [2, 8], "ioerror": 2, "wrong": 2, "get_and_create_mltb2_data_dir": 2, "mltb2_base_data_dir": 2, "none": [2, 4, 6, 7, 9], "mltb": 2, "data": [2, 5, 9], "dir": 2, "The": [2, 3, 4, 5, 7, 8, 9], "markdown": 3, "mdtextsplitt": 3, "max_token": [3, 8], "transformers_token_count": [3, 8], "transformerstokencount": [3, 8, 9], "show_progress_bar": [3, 4, 7, 8, 9], "bool": [3, 4, 5, 7, 8, 9], "fals": [3, 4, 7, 8, 9], "split": [3, 5, 7, 8, 9], "section": [3, 8], "specifi": [3, 4, 8], "maximum": [3, 8], "token": [3, 4, 7, 8, 9], "doe": [3, 5, 6, 8], "divid": [3, 8], "head": 3, "correspond": 3, "paragraph": 3, "per": [3, 5, 8], "can": [3, 4, 6, 10], "onli": [3, 4, 5], "exceed": 3, "singl": [3, 6], "chunk": 3, "alreadi": 3, "larger": [3, 5], "counter": [3, 8], "show": [3, 4, 7, 8, 9], "progressbar": [3, 4, 7, 8, 9], "dure": [3, 4, 7, 8, 9], "process": [3, 4, 7, 8, 9], "md_text": 3, "list": [3, 4, 7, 8, 9], "_chunk_md_by_headlin": 3, "headlin": 3, "chunk_md": 3, "merg": 3, "isol": 3, "openaiazurechatcomplet": 4, "completion_kwarg": 4, "ani": 4, "openaichatcomplet": 4, "azur": 4, "chat": 4, "complet": 4, "also": 4, "construct": 4, "openaibasecomplet": 4, "from_yaml": 4, "kwarg": 4, "follow": 4, "properti": 4, "must": [4, 5], "api_typ": 4, "api_vers": 4, "api_bas": 4, "engin": 4, "quickstart": 4, "start": 4, "gpt": 4, "35": 4, "turbo": 4, "4": [4, 5], "servic": 4, "openaiazurecomplet": 4, "openaicomplet": 4, "non": 4, "gener": [4, 9], "abc": [4, 7], "abstract": [4, 7], "prompt": 4, "map": 4, "openaicompletionansw": 4, "call": [4, 5, 6], "llm": 4, "In": [4, 5], "string": [4, 7], "allow": 4, "overwrit": 4, "exampl": [4, 5, 7], "chang": 4, "temperatur": 4, "_complet": 4, "completion_kwargs_for_this_cal": 4, "openaiobject": 4, "method": [4, 5], "classmethod": 4, "yaml_fil": 4, "yaml": 4, "prompt_token": 4, "completion_token": 4, "total_token": 4, "finish_reason": 4, "answer": 4, "result": [4, 5], "name": 4, "ha": [4, 6], "been": 4, "total": 4, "reason": [4, 5], "why": 4, "stop": 4, "mean": [4, 5], "api": [4, 10], "without": 4, "run": 4, "limit": 4, "length": 4, "becaus": 4, "function_cal": 4, "from_open_ai_object": 4, "open_ai_object": 4, "openaitokencount": 4, "model_nam": 4, "count": [4, 9], "some": [4, 10], "3": [4, 5], "5": 4, "davinci": 4, "003": 4, "embed": 4, "ada": 4, "002": 4, "iter": [4, 7, 9], "just": [4, 9], "If": [4, 7, 8, 9], "_check_mandatory_azure_completion_kwarg": 4, "check": [4, 5], "mandatori": 4, "significancerepeatedtrainingprun": 5, "alpha": 5, "float": [5, 7], "0": [5, 6], "1": [5, 9], "n_warmup_step": 5, "baseprun": 5, "pruner": 5, "statist": 5, "signific": 5, "heurist": 5, "decis": 5, "make": 5, "It": [5, 7, 10], "prune": 5, "repeat": 5, "train": [5, 9], "like": 5, "cross": [5, 9], "valid": [5, 9], "As": 5, "test": [5, 9], "t": 5, "our": 5, "experi": 5, "have": 5, "shown": 5, "aplha": 5, "valu": [5, 6], "between": 5, "": 5, "standard": 5, "assum": 5, "adjust": 5, "onc": 5, "hyperparamet": 5, "set": [5, 7, 9], "those": 5, "work": 5, "basi": 5, "intermedi": 5, "For": [5, 6], "epoch": 5, "contrast": 5, "precis": 5, "individu": 5, "fold": [5, 9], "below": 5, "minimalist": 5, "import": [5, 7], "log": 5, "numpi": 5, "np": 5, "sklearn": 5, "dataset": [5, 9], "load_iri": 5, "model_select": 5, "stratifiedkfold": 5, "ensembl": 5, "randomforestclassifi": 5, "metric": 5, "accuracy_scor": 5, "configur": 5, "logger": 5, "see": 5, "debug": 5, "output": [5, 7], "getlogg": 5, "addhandl": 5, "streamhandl": 5, "setlevel": 5, "x": [5, 6], "y": [5, 6], "target": 5, "def": 5, "trial": 5, "min_samples_split": 5, "suggest_int": 5, "2": 5, "20": 5, "n_estim": 5, "100": 5, "validation_result_list": 5, "skf": 5, "n_split": [5, 9], "fold_index": 5, "train_index": 5, "val_index": 5, "enumer": 5, "x_train": 5, "x_val": 5, "y_train": 5, "y_val": 5, "rf": 5, "fit": 5, "y_pred": 5, "predict": 5, "acc": 5, "append": 5, "report": 5, "we": 5, "should": [5, 7], "should_prun": 5, "here": 5, "done": 5, "break": 5, "studi": 5, "create_studi": 5, "storag": 5, "sqlite": 5, "db": 5, "memori": 5, "study_nam": 5, "iris_cv": 5, "direct": 5, "maxim": 5, "load_if_exist": 5, "true": [5, 6, 8], "sampler": 5, "tpesampl": 5, "multivari": 5, "add": 5, "optim": 5, "n_trial": 5, "level": 5, "aggress": 5, "smaller": 5, "stronger": 5, "differ": [5, 6], "two": [5, 6, 7], "distribut": 5, "disabl": 5, "until": 5, "reach": 5, "exce": 5, "step": [5, 6], "frozentri": 5, "judg": 5, "whether": 5, "note": 5, "suppos": 5, "librari": 5, "user": 5, "instead": 5, "provid": 5, "interfac": 5, "implement": 5, "mechan": 5, "take": 5, "copi": 5, "befor": 5, "modifi": 5, "boolean": 5, "repres": 5, "collect": 6, "tool": [6, 7, 10], "matplotlib": 6, "boxplot": 6, "label": [6, 9], "titl": 6, "xlabel": 6, "ylabel": 6, "vert": 6, "print": [6, 7], "one": [6, 7], "diagram": 6, "pyplot": 6, "boxplot_dict": 6, "values_dict": 6, "form": 6, "dictionari": 6, "save_last_figur": 6, "last": 6, "jupyt": 6, "notebook": 6, "same": 6, "cell": 6, "twin_axes_timeseries_plot": 6, "values_1": 6, "label_1": 6, "values_2": 6, "label_2": 6, "start_timestep_numb": 6, "shift_1": 6, "shift_2": 6, "label_x": 6, "color_1": 6, "tab": 6, "red": 6, "color_2": 6, "blue": 6, "twin": 6, "ax": 6, "timeseri": 6, "curv": 6, "array_lik": 6, "first": 6, "second": 6, "point": 6, "time": 6, "default": 6, "timestep": 6, "shift": 6, "posit": 6, "neg": 6, "axi": 6, "color": 6, "jaccardsimilar": 7, "somajobaseclass": 7, "calcul": 7, "jaccard": 7, "similar": 7, "de_cmc": 7, "german": 7, "en_ptb": 7, "english": 7, "text1": 7, "text2": 7, "get_token_set": 7, "word": [7, 8], "directli": 7, "somajosentencesplitt": [7, 8], "sentenc": [7, 8], "tokenextractor": 7, "extract": 7, "extract_url_set": 7, "token_extractor": 7, "url_set": 7, "ist": 7, "ein": 7, "link": 7, "http": 7, "github": [7, 10], "com": 7, "urlswapp": 7, "url_pattern": 7, "swap": 7, "revers": 7, "replac": 7, "extractor": 7, "pattern": 7, "One": 7, "mark": 7, "place": 7, "put": 7, "reverse_swap_url": 7, "tupl": 7, "revert": 7, "were": 7, "unknown": 7, "swap_url": 7, "detoken": 7, "convert": 7, "how": 7, "do": [7, 9], "extract_token_class_set": 7, "keep_token_class": 7, "keep": 7, "all": [7, 10], "kept": 7, "hug": [8, 9], "face": [8, 9], "textsplitt": 8, "somajo_sentence_splitt": 8, "ignore_overly_long_sent": 8, "alwai": 8, "whole": 8, "splitter": 8, "valueerror": 8, "except": 8, "longer": 8, "simpli": 8, "ignor": 8, "kfoldlabeleddataset": 9, "7": 9, "n_repeat": 9, "random_st": 9, "k": 9, "labeleddataset": 9, "labeled_dataset": 9, "stratification_label": 9, "encod": 9, "labe": 9, "pretrained_model_name_or_path": 9, "pathlik": 9, "host": 9, "insid": 9, "repo": 9, "huggingfac": 9, "box": 10, "machin": 10, "learn": 10, "avail": 10, "python": 10, "packag": 10, "index": 10, "pypi": 10, "option": 10, "might": 10, "them": 10, "refer": 10, "repositori": 10, "licens": 10, "imprint": 10}, "objects": {"mltb2": [[1, 0, 0, "-", "fasttext"], [2, 0, 0, "-", "files"], [3, 0, 0, "-", "md"], [4, 0, 0, "-", "openai"], [5, 0, 0, "-", "optuna"], [6, 0, 0, "-", "plot"], [7, 0, 0, "-", "somajo"], [8, 0, 0, "-", "somajo_transformers"], [9, 0, 0, "-", "transformers"]], "mltb2.fasttext": [[1, 1, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[1, 2, 1, "", "__call__"], [1, 2, 1, "", "get_model_path_and_download"]], "mltb2.files": [[2, 3, 1, "", "fetch_remote_file"], [2, 3, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[3, 1, 1, "", "MdTextSplitter"], [3, 3, 1, "", "_chunk_md_by_headline"], [3, 3, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[3, 2, 1, "", "__call__"]], "mltb2.openai": [[4, 1, 1, "", "OpenAiAzureChatCompletion"], [4, 1, 1, "", "OpenAiAzureCompletion"], [4, 1, 1, "", "OpenAiBaseCompletion"], [4, 1, 1, "", "OpenAiChatCompletion"], [4, 1, 1, "", "OpenAiCompletion"], [4, 1, 1, "", "OpenAiCompletionAnswer"], [4, 1, 1, "", "OpenAiTokenCounter"], [4, 3, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[4, 2, 1, "", "__call__"], [4, 2, 1, "", "_completion"], [4, 2, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[4, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[4, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[4, 2, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[4, 2, 1, "", "__call__"]], "mltb2.optuna": [[5, 1, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[5, 2, 1, "", "prune"]], "mltb2.plot": [[6, 3, 1, "", "boxplot"], [6, 3, 1, "", "boxplot_dict"], [6, 3, 1, "", "save_last_figure"], [6, 3, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[7, 1, 1, "", "JaccardSimilarity"], [7, 1, 1, "", "SoMaJoBaseClass"], [7, 1, 1, "", "SoMaJoSentenceSplitter"], [7, 1, 1, "", "TokenExtractor"], [7, 1, 1, "", "UrlSwapper"], [7, 3, 1, "", "detokenize"], [7, 3, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[7, 2, 1, "", "__call__"], [7, 2, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[7, 2, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[7, 2, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[7, 2, 1, "", "reverse_swap_urls"], [7, 2, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[8, 1, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[8, 2, 1, "", "__call__"]], "mltb2.transformers": [[9, 1, 1, "", "KFoldLabeledDataset"], [9, 1, 1, "", "LabeledDataset"], [9, 1, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[9, 2, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[9, 2, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "refer": 0, "fasttext": 1, "file": 2, "md": 3, "openai": 4, "optuna": 5, "plot": 6, "somajo": 7, "somajo_transform": 8, "transform": 9, "mltb2": 10, "document": 10, "instal": 10, "content": 10}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "fasttext": [[1, "module-mltb2.fasttext"]], "files": [[2, "module-mltb2.files"]], "md": [[3, "module-mltb2.md"]], "openai": [[4, "module-mltb2.openai"]], "optuna": [[5, "module-mltb2.optuna"]], "plot": [[6, "module-mltb2.plot"]], "somajo": [[7, "module-mltb2.somajo"]], "somajo_transformers": [[8, "module-mltb2.somajo_transformers"]], "transformers": [[9, "module-mltb2.transformers"]], "MLTB2 Documentation": [[10, "mltb2-documentation"]], "Installation": [[10, "installation"]], "Content": [[10, "content"]]}, "indexentries": {"fasttextlanguageidentification (class in mltb2.fasttext)": [[1, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[1, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[1, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[1, "module-mltb2.fasttext"]], "module": [[1, "module-mltb2.fasttext"], [2, "module-mltb2.files"], [3, "module-mltb2.md"], [4, "module-mltb2.openai"], [5, "module-mltb2.optuna"], [6, "module-mltb2.plot"], [7, "module-mltb2.somajo"], [8, "module-mltb2.somajo_transformers"], [9, "module-mltb2.transformers"]], "fetch_remote_file() (in module mltb2.files)": [[2, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[2, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[2, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[3, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[3, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[3, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[3, "mltb2.md.chunk_md"]], "mltb2.md": [[3, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[4, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[4, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[4, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[4, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[4, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[4, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[4, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[4, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[4, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[4, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[5, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[5, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[5, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[6, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[6, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[6, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[6, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[6, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[7, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[7, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[7, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[7, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[7, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[7, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[7, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[7, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[7, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[7, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[7, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[7, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[7, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[7, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[8, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[8, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[8, "module-mltb2.somajo_transformers"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[9, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[9, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[9, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[9, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[9, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[9, "mltb2.transformers.KFoldLabeledDataset.split"]]}})
\ No newline at end of file