|
diff --git a/searchindex.js b/searchindex.js
index c770b77..9235d64 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["api-reference", "api-reference/arangodb", "api-reference/data", "api-reference/db", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/arangodb.rst", "api-reference/data.rst", "api-reference/db.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "arangodb ", "data ", "db ", "fasttext ", "files ", "md ", "openai ", "optuna ", "plot ", "somajo ", "somajo_transformers ", "text ", "transformers ", "MLTB2 Documentation"], "terms": {"arangodb": [0, 14], "data": [0, 1, 3, 5, 8, 13, 14], "db": [0, 8, 14], "fasttext": [0, 14], "file": [0, 1, 2, 4, 7, 14], "md": [0, 14], "openai": [0, 14], "optuna": [0, 14], "plot": [0, 14], "somajo": [0, 11, 14], "somajo_transform": [0, 14], "text": [0, 4, 6, 7, 10, 11, 13, 14], "transform": [0, 11, 14], "util": [1, 3, 5, 13], "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "pip": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "instal": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13], "necessari": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "depend": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "class": [1, 3, 4, 6, 7, 8, 10, 11, 12, 13], "arangobatchdatamanag": 1, "host": [1, 13], "str": [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 13], "sequenc": [1, 3], "db_name": 1, "usernam": 1, "password": 1, "collection_nam": 1, "attribute_nam": 1, "batch_siz": 1, "int": [1, 4, 6, 7, 8, 9, 11, 12, 13], "20": [1, 8], "aql_overwrit": 1, "none": [1, 2, 3, 5, 7, 9, 10, 12, 13], "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "base": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "abstractbatchdatamanag": [1, 3], "implement": [1, 3, 8, 12], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "databas": [1, 3], "name": [1, 7], "document": 1, "from": [1, 2, 3, 4, 5, 7, 8, 10, 12], "collect": 1, "ar": [1, 2, 4, 6, 7, 8, 12, 13], "process": [1, 3, 6, 7, 10, 11, 12, 13], "attribut": 1, "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "check": [1, 7, 8, 12], "alreadi": [1, 6], "If": [1, 2, 5, 7, 9, 10, 11, 12, 13], "present": 1, "avail": [1, 14], "consid": 1, "The": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "batch": [1, 3], "size": 1, "aql": 1, "string": [1, 7, 10], "overwrit": [1, 7], "default": [1, 2, 5, 9], "_arango_client_factori": 1, "arangocli": 1, "creat": [1, 5, 7, 9], "an": [1, 3, 4, 7, 8, 10, 11, 13], "client": 1, "return": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "type": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "_connection_factori": 1, "arango_cli": 1, "standarddatabas": 1, "connect": 1, "classmethod": [1, 7], "from_config_fil": 1, "config_file_nam": 1, "construct": [1, 7], "config": 1, "must": [1, 7, 8, 12], "contain": [1, 2, 4, 10, 12, 13], "valu": [1, 8, 9, 12], "exampl": [1, 7, 8, 10, 12], "http": [1, 2, 10], "com": [1, 10], "my_ml_databas": 1, "my_usernam": 1, "secret": 1, "my_ml_data_collect": 1, "processing_metadata": 1, "100": [1, 8, 12], "path": [1, 4, 5, 13], "load_batch": [1, 3], "load": [1, 2, 3], "save_batch": [1, 3], "save": [1, 3, 5, 9], "_check_config_kei": 1, "dict": [1, 4, 7], "expected_config_kei": 1, "all": [1, 10, 14], "expect": 1, "kei": 1, "arango_collection_backup": 1, "commandlin": 1, "tool": [1, 2, 4, 8, 9, 10, 11, 12, 13, 14], "do": [1, 10, 13], "backup": 1, "written": 1, "gzip": 1, "compress": 1, "jsonl": 1, "current": 1, "work": [1, 8], "directori": [1, 2, 5, 13], "run": [1, 3, 7], "arango": 1, "col": 1, "h": 1, "get": [1, 4, 7, 10], "command": 1, "line": 1, "help": 1, "offer": [2, 4, 8, 10, 11, 12, 13], "follow": [2, 7, 12], "tabular": 2, "set": [2, 8, 10, 13], "biolog": 2, "medic": 2, "domain": 2, "support": [2, 12], "colon": 2, "genom": 2, "pub": 2, "princeton": 2, "edu": 2, "oncologi": 2, "affydata": 2, "index": [2, 14], "html": [2, 12], "prostat": 2, "web": 2, "stanford": 2, "hasti": 2, "casi_fil": 2, "leukemia_big": 2, "leukemia": 2, "after": [2, 12], "internet": 2, "pars": 2, "convert": [2, 10], "cach": 2, "determin": [2, 5], "get_and_create_mltb2_data_dir": [2, 5], "_load_colon_data": 2, "datafram": 2, "label": [2, 9, 13], "also": [2, 7], "see": [2, 8], "panda": 2, "_load_colon_label": 2, "seri": 2, "load_colon": 2, "mltb2_base_data_dir": [2, 5], "tupl": [2, 10], "user": [2, 5, 8], "platformdir": [2, 5], "user_data_dir": [2, 5], "load_leukemia_big": 2, "big": 2, "load_prost": 2, "abc": [3, 7, 10], "abstract": [3, 7, 10], "respect": 3, "intend": 3, "conjunct": 3, "batchdataprocessor": 3, "data_manag": 3, "process_batch_callback": 3, "callabl": 3, "object": [3, 4, 6, 7, 8, 10, 11, 12, 13], "manag": 3, "A": [3, 4, 8, 14], "callback": 3, "function": [3, 5, 7, 8, 9, 12], "one": [3, 9, 10], "done": [3, 8], "until": [3, 8], "empti": 3, "For": [3, 8, 9, 12], "each": 3, "call": [3, 7, 8, 9, 12], "fasttextlanguageidentif": 4, "identifi": 4, "languag": [4, 10], "__call__": [4, 6, 7, 10, 11, 13], "num_lang": 4, "10": [4, 8], "given": [4, 5, 8, 12], "which": [4, 5, 7, 8, 10, 12, 13], "recogn": 4, "number": [4, 6, 7, 8, 9, 10, 11, 12, 13], "probabl": 4, "more": [4, 8, 9, 12], "than": [4, 11, 12], "element": 4, "so": 4, "guarante": 4, "you": [4, 7, 8, 14], "want": 4, "includ": 4, "case": [4, 7], "when": [4, 7], "veri": 4, "low": 4, "possibl": 4, "af": 4, "al": 4, "am": 4, "arz": 4, "ast": 4, "av": 4, "az": 4, "azb": 4, "ba": 4, "bar": 4, "bcl": 4, "bg": 4, "bh": 4, "bn": 4, "bo": 4, "bpy": 4, "br": 4, "b": 4, "bxr": 4, "ca": 4, "cbk": 4, "ce": 4, "ceb": 4, "ckb": 4, "co": [4, 13], "c": 4, "cv": [4, 8], "cy": 4, "da": [4, 10], "de": [4, 10], "diq": 4, "dsb": 4, "dty": 4, "dv": 4, "el": 4, "eml": 4, "en": 4, "eo": 4, "e": 4, "et": 4, "eu": 4, "fa": 4, "fi": 4, "fr": 4, "frr": 4, "fy": 4, "ga": 4, "gd": 4, "gl": 4, "gn": 4, "gom": 4, "gu": 4, "gv": 4, "he": 4, "hi": 4, "hif": 4, "hr": 4, "hsb": 4, "ht": 4, "hu": 4, "hy": 4, "ia": 4, "id": [4, 13], "ie": 4, "ilo": 4, "io": 4, "ja": 4, "jbo": 4, "jv": 4, "ka": 4, "kk": 4, "km": 4, "kn": 4, "ko": 4, "krc": 4, "ku": 4, "kv": 4, "kw": 4, "ky": 4, "la": 4, "lb": 4, "lez": 4, "li": 4, "lmo": 4, "lo": 4, "lrc": 4, "lt": 4, "lv": 4, "mai": 4, "mg": 4, "mhr": 4, "min": 4, "mk": 4, "ml": 4, "mn": 4, "mr": 4, "mrj": 4, "m": 4, "mt": 4, "mwl": 4, "my": 4, "myv": 4, "mzn": 4, "nah": 4, "nap": 4, "nd": 4, "ne": 4, "new": 4, "nl": 4, "nn": 4, "oc": 4, "o": 4, "pa": 4, "pam": 4, "pfl": 4, "pl": 4, "pm": 4, "pnb": 4, "p": 4, "pt": 4, "qu": 4, "rm": 4, "ro": 4, "ru": 4, "rue": 4, "sa": 4, "sah": 4, "sc": 4, "scn": 4, "sco": 4, "sd": 4, "sh": 4, "si": 4, "sk": 4, "sl": 4, "sq": 4, "sr": 4, "su": 4, "sv": 4, "sw": 4, "ta": 4, "te": 4, "tg": 4, "th": 4, "tk": 4, "tl": 4, "tr": 4, "tt": 4, "tyv": 4, "ug": 4, "uk": 4, "ur": 4, "uz": 4, "vec": 4, "vep": 4, "vi": 4, "vl": 4, "vo": 4, "wa": [4, 7, 12, 13], "war": 4, "wuu": 4, "xal": 4, "xmf": 4, "yi": 4, "yo": 4, "yue": 4, "zh": 4, "static": 4, "get_model_path_and_download": 4, "model": [4, 7, 8, 13], "download": 4, "need": 4, "full": [4, 5, 7], "provid": [5, 8], "other": [5, 12], "fetch_remote_fil": 5, "dirnam": 5, "filenam": [5, 9], "url": [5, 10], "sha256_checksum": 5, "fetch": 5, "remot": 5, "where": [5, 10], "under": 5, "sha256": 5, "checksum": 5, "rais": [5, 11, 12], "ioerror": 5, "wrong": 5, "dir": 5, "exact": 5, "folder": 5, "append": [5, 8], "markdown": 6, "specif": [6, 7, 10, 11, 12, 13], "mdtextsplitt": 6, "max_token": [6, 11], "transformers_token_count": [6, 11], "transformerstokencount": [6, 11, 13], "show_progress_bar": [6, 7, 10, 11, 12, 13], "bool": [6, 7, 8, 9, 10, 11, 12, 13], "fals": [6, 7, 9, 10, 11, 12, 13], "split": [6, 8, 10, 11, 13], "section": [6, 11], "specifi": [6, 7, 11], "maximum": [6, 11, 12], "token": [6, 7, 10, 11, 13], "doe": [6, 8, 9, 11], "divid": [6, 11], "head": 6, "correspond": 6, "paragraph": 6, "per": [6, 8, 11], "can": [6, 7, 9, 12, 14], "onli": [6, 7, 8, 12], "exceed": 6, "singl": [6, 9, 12], "chunk": 6, "larger": [6, 8], "counter": [6, 11, 12], "show": [6, 7, 10, 11, 12, 13], "progressbar": [6, 7, 10, 11, 12, 13], "dure": [6, 7, 10, 11, 12, 13], "md_text": 6, "list": [6, 7, 10, 11, 13], "_chunk_md_by_headlin": 6, "headlin": 6, "chunk_md": 6, "merg": 6, "isol": 6, "subsequ": 6, "end": 6, "without": [6, 7], "content": 6, "remov": [6, 12], "openaiazurechatcomplet": 7, "completion_kwarg": 7, "ani": 7, "openaichatcomplet": 7, "azur": 7, "chat": 7, "complet": 7, "openaibasecomplet": 7, "from_yaml": 7, "kwarg": 7, "properti": 7, "api_typ": 7, "api_vers": 7, "api_bas": 7, "engin": 7, "quickstart": 7, "start": 7, "gpt": 7, "35": 7, "turbo": 7, "4": [7, 8], "servic": 7, "openaiazurecomplet": 7, "openaicomplet": 7, "non": 7, "gener": [7, 13], "prompt": 7, "map": 7, "openaicompletionansw": 7, "llm": 7, "In": [7, 8], "allow": 7, "chang": 7, "temperatur": 7, "_complet": 7, "completion_kwargs_for_this_cal": 7, "openaiobject": 7, "method": [7, 8, 12], "yaml_fil": 7, "yaml": 7, "prompt_token": 7, "completion_token": 7, "total_token": 7, "finish_reason": 7, "answer": 7, "result": [7, 8], "ha": [7, 9], "been": 7, "total": [7, 12], "reason": [7, 8], "why": 7, "stop": 7, "mean": [7, 8], "api": [7, 14], "limit": [7, 12], "length": 7, "becaus": 7, "function_cal": 7, "from_open_ai_object": 7, "open_ai_object": 7, "openaitokencount": 7, "model_nam": 7, "count": [7, 12, 13], "some": [7, 14], "3": [7, 8], "5": 7, "davinci": 7, "003": 7, "embed": 7, "ada": 7, "002": 7, "iter": [7, 10, 12, 13], "just": [7, 13], "_check_mandatory_azure_completion_kwarg": 7, "mandatori": 7, "significancerepeatedtrainingprun": 8, "alpha": 8, "float": [8, 10, 12], "0": [8, 9, 12], "1": [8, 13], "n_warmup_step": 8, "baseprun": 8, "pruner": 8, "statist": 8, "signific": 8, "heurist": 8, "decis": 8, "make": [8, 9], "It": [8, 10, 12, 14], "prune": 8, "repeat": 8, "train": [8, 13], "like": 8, "cross": [8, 13], "valid": [8, 13], "As": 8, "test": [8, 13], "t": 8, "our": 8, "experi": 8, "have": 8, "shown": 8, "aplha": 8, "between": [8, 12], "": [8, 9], "standard": 8, "assum": 8, "adjust": 8, "onc": [8, 12], "hyperparamet": 8, "those": 8, "basi": 8, "intermedi": 8, "epoch": 8, "contrast": 8, "precis": 8, "individu": 8, "fold": [8, 13], "below": 8, "minimalist": 8, "import": [8, 10], "log": 8, "numpi": 8, "np": 8, "sklearn": 8, "dataset": [8, 13], "load_iri": 8, "model_select": 8, "stratifiedkfold": 8, "ensembl": 8, "randomforestclassifi": 8, "metric": 8, "accuracy_scor": 8, "configur": 8, "logger": 8, "debug": 8, "output": [8, 10], "getlogg": 8, "addhandl": 8, "streamhandl": 8, "setlevel": 8, "x": [8, 9], "y": [8, 9], "target": 8, "def": 8, "trial": 8, "min_samples_split": 8, "suggest_int": 8, "2": 8, "n_estim": 8, "validation_result_list": 8, "skf": 8, "n_split": [8, 13], "fold_index": 8, "train_index": 8, "val_index": 8, "enumer": 8, "x_train": 8, "x_val": 8, "y_train": 8, "y_val": 8, "rf": 8, "fit": [8, 12], "y_pred": 8, "predict": 8, "acc": 8, "report": 8, "we": 8, "should": [8, 10], "should_prun": 8, "here": 8, "break": 8, "studi": 8, "create_studi": 8, "storag": 8, "sqlite": 8, "memori": 8, "study_nam": 8, "iris_cv": 8, "direct": 8, "maxim": 8, "load_if_exist": 8, "true": [8, 9, 11, 12], "sampler": 8, "tpesampl": 8, "multivari": 8, "add": 8, "optim": 8, "n_trial": 8, "level": 8, "aggress": 8, "smaller": 8, "stronger": 8, "differ": [8, 9, 12], "two": [8, 9, 10, 12], "distribut": 8, "disabl": 8, "reach": 8, "exce": 8, "step": [8, 9], "frozentri": 8, "judg": 8, "whether": 8, "note": 8, "suppos": 8, "librari": 8, "instead": 8, "interfac": 8, "mechan": 8, "take": 8, "copi": 8, "befor": [8, 12], "modifi": 8, "boolean": 8, "repres": 8, "matplotlib": 9, "boxplot": 9, "titl": 9, "xlabel": 9, "ylabel": 9, "vert": 9, "print": [9, 10], "diagram": 9, "pyplot": 9, "axi": 9, "box": [9, 14], "vertic": 9, "horizont": 9, "boxplot_dict": 9, "values_dict": 9, "form": [9, 12], "dictionari": 9, "save_last_figur": 9, "last": 9, "made": 9, "jupyt": 9, "notebook": 9, "same": 9, "cell": 9, "twin_axes_timeseries_plot": 9, "values_1": 9, "label_1": 9, "values_2": 9, "label_2": 9, "start_timestep_numb": 9, "shift_1": 9, "shift_2": 9, "label_x": 9, "color_1": 9, "tab": 9, "red": 9, "color_2": 9, "blue": 9, "twin": 9, "ax": 9, "timeseri": 9, "curv": 9, "array_lik": 9, "first": [9, 12], "second": 9, "point": 9, "time": [9, 12], "timestep": 9, "shift": 9, "posit": 9, "neg": 9, "color": 9, "jaccardsimilar": 10, "liter": 10, "de_cmc": 10, "en_ptb": 10, "somajobaseclass": 10, "calcul": [10, 12], "jaccard": 10, "similar": 10, "german": 10, "english": 10, "text1": 10, "text2": 10, "get_token_set": 10, "word": [10, 11], "directli": 10, "somajosentencesplitt": [10, 11], "sentenc": [10, 11], "tokenextractor": 10, "extract": 10, "extract_url_set": 10, "token_extractor": 10, "url_set": 10, "ist": 10, "ein": 10, "link": 10, "github": [10, 14], "urlswapp": 10, "url_pattern": 10, "swap": 10, "revers": 10, "replac": [10, 12], "extractor": 10, "pattern": 10, "One": [10, 12], "mark": 10, "place": 10, "put": 10, "reverse_swap_url": 10, "revert": 10, "were": 10, "unknown": 10, "swap_url": 10, "detoken": 10, "how": 10, "extract_token_class_set": 10, "keep_token_class": 10, "keep": 10, "kept": 10, "hug": [11, 13], "face": [11, 13], "textsplitt": 11, "somajo_sentence_splitt": 11, "ignore_overly_long_sent": 11, "alwai": 11, "whole": 11, "splitter": 11, "valueerror": [11, 12], "except": 11, "longer": 11, "simpli": 11, "ignor": 11, "detect": 12, "clean": 12, "invis": 12, "charact": 12, "special": 12, "whitespac": 12, "duplic": 12, "distanc": 12, "find": 12, "anomali": 12, "textdist": 12, "max_dimens": 12, "markup": 12, "unusu": 12, "multipl": 12, "again": 12, "dimens": 12, "greater": 12, "_normalize_char_count": 12, "normal": 12, "char": 12, "defaultdict": 12, "lazi": 12, "postprocess": 12, "manhattan": 12, "scipi": 12, "spatial": 12, "cityblock": 12, "most": 12, "commen": 12, "higher": 12, "least": 12, "_normalize_counter_to_defaultdict": 12, "devid": 12, "them": [12, 14], "clean_all_invisible_chars_and_whitespac": 12, "lead": 12, "trail": 12, "defin": 12, "constant": 12, "invisible_charact": 12, "special_whitespac": 12, "rteturn": 12, "has_invisible_charact": 12, "otherwis": 12, "has_special_whitespac": 12, "remove_invisible_charact": 12, "replace_multiple_whitespac": 12, "replace_special_whitespac": 12, "kfoldlabeleddataset": 13, "7": 13, "n_repeat": 13, "random_st": 13, "k": 13, "labeleddataset": 13, "labeled_dataset": 13, "stratification_label": 13, "encod": 13, "labe": 13, "pretrained_model_name_or_path": 13, "pathlik": 13, "insid": 13, "repo": 13, "huggingfac": 13, "machin": 14, "learn": 14, "python": 14, "packag": 14, "pypi": 14, "option": 14, "might": 14, "refer": 14, "repositori": 14, "licens": 14, "imprint": 14}, "objects": {"mltb2": [[1, 0, 0, "-", "arangodb"], [2, 0, 0, "-", "data"], [3, 0, 0, "-", "db"], [4, 0, 0, "-", "fasttext"], [5, 0, 0, "-", "files"], [6, 0, 0, "-", "md"], [7, 0, 0, "-", "openai"], [8, 0, 0, "-", "optuna"], [9, 0, 0, "-", "plot"], [10, 0, 0, "-", "somajo"], [11, 0, 0, "-", "somajo_transformers"], [12, 0, 0, "-", "text"], [13, 0, 0, "-", "transformers"]], "mltb2.arangodb": [[1, 1, 1, "", "ArangoBatchDataManager"], [1, 3, 1, "", "_check_config_keys"], [1, 3, 1, "", "arango_collection_backup"]], "mltb2.arangodb.ArangoBatchDataManager": [[1, 2, 1, "", "_arango_client_factory"], [1, 2, 1, "", "_connection_factory"], [1, 2, 1, "", "from_config_file"], [1, 2, 1, "", "load_batch"], [1, 2, 1, "", "save_batch"]], "mltb2.data": [[2, 3, 1, "", "_load_colon_data"], [2, 3, 1, "", "_load_colon_label"], [2, 3, 1, "", "load_colon"], [2, 3, 1, "", "load_leukemia_big"], [2, 3, 1, "", "load_prostate"]], "mltb2.db": [[3, 1, 1, "", "AbstractBatchDataManager"], [3, 1, 1, "", "BatchDataProcessor"]], "mltb2.db.AbstractBatchDataManager": [[3, 2, 1, "", "load_batch"], [3, 2, 1, "", "save_batch"]], "mltb2.db.BatchDataProcessor": [[3, 2, 1, "", "run"]], "mltb2.fasttext": [[4, 1, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[4, 2, 1, "", "__call__"], [4, 2, 1, "", "get_model_path_and_download"]], "mltb2.files": [[5, 3, 1, "", "fetch_remote_file"], [5, 3, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[6, 1, 1, "", "MdTextSplitter"], [6, 3, 1, "", "_chunk_md_by_headline"], [6, 3, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[6, 2, 1, "", "__call__"]], "mltb2.openai": [[7, 1, 1, "", "OpenAiAzureChatCompletion"], [7, 1, 1, "", "OpenAiAzureCompletion"], [7, 1, 1, "", "OpenAiBaseCompletion"], [7, 1, 1, "", "OpenAiChatCompletion"], [7, 1, 1, "", "OpenAiCompletion"], [7, 1, 1, "", "OpenAiCompletionAnswer"], [7, 1, 1, "", "OpenAiTokenCounter"], [7, 3, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[7, 2, 1, "", "__call__"], [7, 2, 1, "", "_completion"], [7, 2, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[7, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[7, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[7, 2, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[7, 2, 1, "", "__call__"]], "mltb2.optuna": [[8, 1, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[8, 2, 1, "", "prune"]], "mltb2.plot": [[9, 3, 1, "", "boxplot"], [9, 3, 1, "", "boxplot_dict"], [9, 3, 1, "", "save_last_figure"], [9, 3, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[10, 1, 1, "", "JaccardSimilarity"], [10, 1, 1, "", "SoMaJoBaseClass"], [10, 1, 1, "", "SoMaJoSentenceSplitter"], [10, 1, 1, "", "TokenExtractor"], [10, 1, 1, "", "UrlSwapper"], [10, 3, 1, "", "detokenize"], [10, 3, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[10, 2, 1, "", "__call__"], [10, 2, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[10, 2, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[10, 2, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[10, 2, 1, "", "reverse_swap_urls"], [10, 2, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[11, 1, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[11, 2, 1, "", "__call__"]], "mltb2.text": [[12, 1, 1, "", "TextDistance"], [12, 3, 1, "", "_normalize_counter_to_defaultdict"], [12, 3, 1, "", "clean_all_invisible_chars_and_whitespaces"], [12, 3, 1, "", "has_invisible_characters"], [12, 3, 1, "", "has_special_whitespaces"], [12, 3, 1, "", "remove_invisible_characters"], [12, 3, 1, "", "replace_multiple_whitespaces"], [12, 3, 1, "", "replace_special_whitespaces"]], "mltb2.text.TextDistance": [[12, 2, 1, "", "_normalize_char_counter"], [12, 2, 1, "", "distance"], [12, 2, 1, "", "fit"]], "mltb2.transformers": [[13, 1, 1, "", "KFoldLabeledDataset"], [13, 1, 1, "", "LabeledDataset"], [13, 1, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[13, 2, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[13, 2, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "refer": 0, "arangodb": 1, "data": 2, "db": 3, "fasttext": 4, "file": 5, "md": 6, "openai": 7, "optuna": 8, "plot": 9, "somajo": 10, "somajo_transform": 11, "text": 12, "transform": 13, "mltb2": 14, "document": 14, "instal": 14, "content": 14}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "arangodb": [[1, "module-mltb2.arangodb"]], "data": [[2, "module-mltb2.data"]], "db": [[3, "module-mltb2.db"]], "fasttext": [[4, "module-mltb2.fasttext"]], "files": [[5, "module-mltb2.files"]], "md": [[6, "module-mltb2.md"]], "openai": [[7, "module-mltb2.openai"]], "optuna": [[8, "module-mltb2.optuna"]], "plot": [[9, "module-mltb2.plot"]], "somajo": [[10, "module-mltb2.somajo"]], "somajo_transformers": [[11, "module-mltb2.somajo_transformers"]], "text": [[12, "module-mltb2.text"]], "transformers": [[13, "module-mltb2.transformers"]], "MLTB2 Documentation": [[14, "mltb2-documentation"]], "Installation": [[14, "installation"]], "Content": [[14, "content"]]}, "indexentries": {"arangobatchdatamanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoBatchDataManager"]], "_arango_client_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._arango_client_factory"]], "_check_config_keys() (in module mltb2.arangodb)": [[1, "mltb2.arangodb._check_config_keys"]], "_connection_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._connection_factory"]], "arango_collection_backup() (in module mltb2.arangodb)": [[1, "mltb2.arangodb.arango_collection_backup"]], "from_config_file() (mltb2.arangodb.arangobatchdatamanager class method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.from_config_file"]], "load_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.load_batch"]], "mltb2.arangodb": [[1, "module-mltb2.arangodb"]], "module": [[1, "module-mltb2.arangodb"], [2, "module-mltb2.data"], [3, "module-mltb2.db"], [4, "module-mltb2.fasttext"], [5, "module-mltb2.files"], [6, "module-mltb2.md"], [7, "module-mltb2.openai"], [8, "module-mltb2.optuna"], [9, "module-mltb2.plot"], [10, "module-mltb2.somajo"], [11, "module-mltb2.somajo_transformers"], [12, "module-mltb2.text"], [13, "module-mltb2.transformers"]], "save_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.save_batch"]], "_load_colon_data() (in module mltb2.data)": [[2, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[2, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[2, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[2, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[2, "mltb2.data.load_prostate"]], "mltb2.data": [[2, "module-mltb2.data"]], "abstractbatchdatamanager (class in mltb2.db)": [[3, "mltb2.db.AbstractBatchDataManager"]], "batchdataprocessor (class in mltb2.db)": [[3, "mltb2.db.BatchDataProcessor"]], "load_batch() (mltb2.db.abstractbatchdatamanager method)": [[3, "mltb2.db.AbstractBatchDataManager.load_batch"]], "mltb2.db": [[3, "module-mltb2.db"]], "run() (mltb2.db.batchdataprocessor method)": [[3, "mltb2.db.BatchDataProcessor.run"]], "save_batch() (mltb2.db.abstractbatchdatamanager method)": [[3, "mltb2.db.AbstractBatchDataManager.save_batch"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[4, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[4, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[4, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[4, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[5, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[5, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[5, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[6, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[6, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[6, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[6, "mltb2.md.chunk_md"]], "mltb2.md": [[6, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[7, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[7, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[7, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[7, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[7, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[7, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[7, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[7, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[7, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[8, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[8, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[8, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[9, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[9, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[9, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[9, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[9, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[10, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[10, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[10, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[10, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[10, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[10, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[10, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[10, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[10, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[10, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[10, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[10, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[10, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[10, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[11, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[11, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[11, "module-mltb2.somajo_transformers"]], "textdistance (class in mltb2.text)": [[12, "mltb2.text.TextDistance"]], "_normalize_char_counter() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance._normalize_char_counter"]], "_normalize_counter_to_defaultdict() (in module mltb2.text)": [[12, "mltb2.text._normalize_counter_to_defaultdict"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "distance() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance.distance"]], "fit() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance.fit"]], "has_invisible_characters() (in module mltb2.text)": [[12, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[12, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[12, "mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[13, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[13, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[13, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[13, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[13, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[13, "mltb2.transformers.KFoldLabeledDataset.split"]]}})
\ No newline at end of file
+Search.setIndex({"docnames": ["api-reference", "api-reference/arangodb", "api-reference/bs", "api-reference/data", "api-reference/db", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/arangodb.rst", "api-reference/bs.rst", "api-reference/data.rst", "api-reference/db.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "arangodb ", "bs ", "data ", "db ", "fasttext ", "files ", "md ", "openai ", "optuna ", "plot ", "somajo ", "somajo_transformers ", "text ", "transformers ", "MLTB2 Documentation"], "terms": {"arangodb": [0, 15], "b": [0, 5, 15], "data": [0, 1, 4, 6, 9, 14, 15], "db": [0, 9, 15], "fasttext": [0, 15], "file": [0, 1, 3, 5, 8, 15], "md": [0, 15], "openai": [0, 15], "optuna": [0, 15], "plot": [0, 15], "somajo": [0, 12, 15], "somajo_transform": [0, 15], "text": [0, 2, 5, 7, 8, 11, 12, 14, 15], "transform": [0, 12, 15], "util": [1, 4, 6, 14], "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "pip": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15], "instal": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14], "necessari": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15], "depend": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "class": [1, 4, 5, 7, 8, 9, 11, 12, 13, 14], "arangobatchdatamanag": 1, "host": [1, 14], "str": [1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14], "sequenc": [1, 4], "db_name": 1, "usernam": 1, "password": 1, "collection_nam": 1, "attribute_nam": 1, "batch_siz": 1, "int": [1, 5, 7, 8, 9, 10, 12, 13, 14], "20": [1, 9], "aql_overwrit": 1, "none": [1, 2, 3, 4, 6, 8, 10, 11, 13, 14], "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "base": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "abstractbatchdatamanag": [1, 4], "implement": [1, 4, 9, 13], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "databas": [1, 4], "name": [1, 2, 8], "document": 1, "from": [1, 2, 3, 4, 5, 6, 8, 9, 11, 13], "collect": 1, "ar": [1, 2, 3, 5, 7, 8, 9, 13, 14], "process": [1, 4, 7, 8, 11, 12, 13, 14], "attribut": [1, 2], "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "check": [1, 8, 9, 13], "alreadi": [1, 7], "If": [1, 2, 3, 6, 8, 10, 11, 12, 13, 14], "present": 1, "avail": [1, 15], "consid": 1, "The": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "batch": [1, 4], "size": 1, "aql": 1, "string": [1, 2, 8, 11], "overwrit": [1, 8], "default": [1, 2, 3, 6, 10], "_arango_client_factori": 1, "arangocli": 1, "creat": [1, 6, 8, 10], "an": [1, 4, 5, 8, 9, 11, 12, 14], "client": 1, "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "type": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "_connection_factori": 1, "arango_cli": 1, "standarddatabas": 1, "connect": 1, "classmethod": [1, 8], "from_config_fil": 1, "config_file_nam": 1, "construct": [1, 8], "config": 1, "must": [1, 8, 9, 13], "contain": [1, 3, 5, 11, 13, 14], "valu": [1, 9, 10, 13], "exampl": [1, 8, 9, 11, 13], "http": [1, 3, 11], "com": [1, 11], "my_ml_databas": 1, "my_usernam": 1, "secret": 1, "my_ml_data_collect": 1, "processing_metadata": 1, "100": [1, 9, 13], "path": [1, 5, 6, 14], "load_batch": [1, 4], "load": [1, 3, 4], "save_batch": [1, 4], "save": [1, 4, 6, 10], "_check_config_kei": 1, "dict": [1, 2, 5, 8], "expected_config_kei": 1, "all": [1, 2, 11, 15], "expect": 1, "kei": 1, "arango_collection_backup": 1, "commandlin": 1, "tool": [1, 2, 3, 5, 9, 10, 11, 12, 13, 14, 15], "do": [1, 11, 14], "backup": 1, "written": 1, "gzip": 1, "compress": 1, "jsonl": 1, "current": 1, "work": [1, 9], "directori": [1, 3, 6, 14], "run": [1, 4, 8], "arango": 1, "col": 1, "h": 1, "get": [1, 5, 8, 11], "command": 1, "line": [1, 2], "help": 1, "beauti": 2, "soup": 2, "html": [2, 3, 13], "specif": [2, 7, 8, 11, 12, 13, 14], "extract_al": 2, "beautifulsoup": 2, "attr": 2, "kwarg": [2, 8], "ani": [2, 8], "extract": [2, 11], "specifi": [2, 7, 8, 12], "element": [2, 5], "object": [2, 4, 5, 7, 8, 9, 11, 12, 13, 14], "tag": 2, "addit": 2, "keyword": 2, "argument": 2, "extract_on": 2, "exactli": 2, "one": [2, 4, 10, 11], "function": [2, 4, 6, 8, 9, 10, 13], "expact": 2, "onli": [2, 7, 8, 9, 13], "result": [2, 8, 9], "found": 2, "otherwis": [2, 13], "runtimeerror": 2, "rais": [2, 6, 12, 13], "extract_text": 2, "join_str": 2, "join": 2, "part": 2, "per": [2, 7, 9, 12], "space": 2, "html_to_md": 2, "mdformat_opt": 2, "convert": [2, 3, 11], "markdown": [2, 7], "mdformat": 2, "option": [2, 15], "number": [2, 5, 7, 8, 9, 10, 11, 12, 13, 14], "true": [2, 9, 10, 12, 13], "appli": 2, "consecut": 2, "order": 2, "list": [2, 7, 8, 11, 12, 14], "wrap": 2, "paragraph": [2, 7], "word": [2, 11, 12], "mode": 2, "end": [2, 7], "lf": 2, "remove_al": 2, "remov": [2, 7, 13], "done": [2, 4, 9], "place": [2, 11], "noth": 2, "": [2, 9, 10], "soup_to_md": 2, "offer": [3, 5, 9, 11, 12, 13, 14], "follow": [3, 8, 13], "tabular": 3, "set": [3, 9, 11, 14], "biolog": 3, "medic": 3, "domain": 3, "support": [3, 13], "colon": 3, "genom": 3, "pub": 3, "princeton": 3, "edu": 3, "oncologi": 3, "affydata": 3, "index": [3, 15], "prostat": 3, "web": 3, "stanford": 3, "hasti": 3, "casi_fil": 3, "leukemia_big": 3, "leukemia": 3, "after": [3, 13], "internet": 3, "pars": 3, "cach": 3, "determin": [3, 6], "get_and_create_mltb2_data_dir": [3, 6], "_load_colon_data": 3, "datafram": 3, "label": [3, 10, 14], "also": [3, 8], "see": [3, 9], "panda": 3, "_load_colon_label": 3, "seri": 3, "load_colon": 3, "mltb2_base_data_dir": [3, 6], "tupl": [3, 11], "user": [3, 6, 9], "platformdir": [3, 6], "user_data_dir": [3, 6], "load_leukemia_big": 3, "big": 3, "load_prost": 3, "abc": [4, 8, 11], "abstract": [4, 8, 11], "respect": 4, "intend": 4, "conjunct": 4, "batchdataprocessor": 4, "data_manag": 4, "process_batch_callback": 4, "callabl": 4, "manag": 4, "A": [4, 5, 9, 15], "callback": 4, "until": [4, 9], "empti": 4, "For": [4, 9, 10, 13], "each": 4, "call": [4, 8, 9, 10, 13], "fasttextlanguageidentif": 5, "identifi": 5, "languag": [5, 11], "__call__": [5, 7, 8, 11, 12, 14], "num_lang": 5, "10": [5, 9], "given": [5, 6, 9, 13], "which": [5, 6, 8, 9, 11, 13, 14], "recogn": 5, "probabl": 5, "more": [5, 9, 10, 13], "than": [5, 12, 13], "so": 5, "guarante": 5, "you": [5, 8, 9, 15], "want": 5, "includ": 5, "case": [5, 8], "when": [5, 8], "veri": 5, "low": 5, "possibl": 5, "af": 5, "al": 5, "am": 5, "arz": 5, "ast": 5, "av": 5, "az": 5, "azb": 5, "ba": 5, "bar": 5, "bcl": 5, "bg": 5, "bh": 5, "bn": 5, "bo": 5, "bpy": 5, "br": 5, "bxr": 5, "ca": 5, "cbk": 5, "ce": 5, "ceb": 5, "ckb": 5, "co": [5, 14], "c": 5, "cv": [5, 9], "cy": 5, "da": [5, 11], "de": [5, 11], "diq": 5, "dsb": 5, "dty": 5, "dv": 5, "el": 5, "eml": 5, "en": 5, "eo": 5, "e": 5, "et": 5, "eu": 5, "fa": 5, "fi": 5, "fr": 5, "frr": 5, "fy": 5, "ga": 5, "gd": 5, "gl": 5, "gn": 5, "gom": 5, "gu": 5, "gv": 5, "he": 5, "hi": 5, "hif": 5, "hr": 5, "hsb": 5, "ht": 5, "hu": 5, "hy": 5, "ia": 5, "id": [5, 14], "ie": 5, "ilo": 5, "io": 5, "ja": 5, "jbo": 5, "jv": 5, "ka": 5, "kk": 5, "km": 5, "kn": 5, "ko": 5, "krc": 5, "ku": 5, "kv": 5, "kw": 5, "ky": 5, "la": 5, "lb": 5, "lez": 5, "li": 5, "lmo": 5, "lo": 5, "lrc": 5, "lt": 5, "lv": 5, "mai": 5, "mg": 5, "mhr": 5, "min": 5, "mk": 5, "ml": 5, "mn": 5, "mr": 5, "mrj": 5, "m": 5, "mt": 5, "mwl": 5, "my": 5, "myv": 5, "mzn": 5, "nah": 5, "nap": 5, "nd": 5, "ne": 5, "new": 5, "nl": 5, "nn": 5, "oc": 5, "o": 5, "pa": 5, "pam": 5, "pfl": 5, "pl": 5, "pm": 5, "pnb": 5, "p": 5, "pt": 5, "qu": 5, "rm": 5, "ro": 5, "ru": 5, "rue": 5, "sa": 5, "sah": 5, "sc": 5, "scn": 5, "sco": 5, "sd": 5, "sh": 5, "si": 5, "sk": 5, "sl": 5, "sq": 5, "sr": 5, "su": 5, "sv": 5, "sw": 5, "ta": 5, "te": 5, "tg": 5, "th": 5, "tk": 5, "tl": 5, "tr": 5, "tt": 5, "tyv": 5, "ug": 5, "uk": 5, "ur": 5, "uz": 5, "vec": 5, "vep": 5, "vi": 5, "vl": 5, "vo": 5, "wa": [5, 8, 13, 14], "war": 5, "wuu": 5, "xal": 5, "xmf": 5, "yi": 5, "yo": 5, "yue": 5, "zh": 5, "static": 5, "get_model_path_and_download": 5, "model": [5, 8, 9, 14], "download": 5, "need": 5, "full": [5, 6, 8], "provid": [6, 9], "other": [6, 13], "fetch_remote_fil": 6, "dirnam": 6, "filenam": [6, 10], "url": [6, 11], "sha256_checksum": 6, "fetch": 6, "remot": 6, "where": [6, 11], "under": 6, "sha256": 6, "checksum": 6, "ioerror": 6, "wrong": 6, "dir": 6, "exact": 6, "folder": 6, "append": [6, 9], "mdtextsplitt": 7, "max_token": [7, 12], "transformers_token_count": [7, 12], "transformerstokencount": [7, 12, 14], "show_progress_bar": [7, 8, 11, 12, 13, 14], "bool": [7, 8, 9, 10, 11, 12, 13, 14], "fals": [7, 8, 10, 11, 12, 13, 14], "split": [7, 9, 11, 12, 14], "section": [7, 12], "maximum": [7, 12, 13], "token": [7, 8, 11, 12, 14], "doe": [7, 9, 10, 12], "divid": [7, 12], "head": 7, "correspond": 7, "can": [7, 8, 10, 13, 15], "exceed": 7, "singl": [7, 10, 13], "chunk": 7, "larger": [7, 9], "counter": [7, 12, 13], "show": [7, 8, 11, 12, 13, 14], "progressbar": [7, 8, 11, 12, 13, 14], "dure": [7, 8, 11, 12, 13, 14], "md_text": 7, "_chunk_md_by_headlin": 7, "headlin": 7, "chunk_md": 7, "merg": 7, "isol": 7, "subsequ": 7, "without": [7, 8], "content": 7, "openaiazurechatcomplet": 8, "completion_kwarg": 8, "openaichatcomplet": 8, "azur": 8, "chat": 8, "complet": 8, "openaibasecomplet": 8, "from_yaml": 8, "properti": 8, "api_typ": 8, "api_vers": 8, "api_bas": 8, "engin": 8, "quickstart": 8, "start": 8, "gpt": 8, "35": 8, "turbo": 8, "4": [8, 9], "servic": 8, "openaiazurecomplet": 8, "openaicomplet": 8, "non": 8, "gener": [8, 14], "prompt": 8, "map": 8, "openaicompletionansw": 8, "llm": 8, "In": [8, 9], "allow": 8, "chang": 8, "temperatur": 8, "_complet": 8, "completion_kwargs_for_this_cal": 8, "openaiobject": 8, "method": [8, 9, 13], "yaml_fil": 8, "yaml": 8, "prompt_token": 8, "completion_token": 8, "total_token": 8, "finish_reason": 8, "answer": 8, "ha": [8, 10], "been": 8, "total": [8, 13], "reason": [8, 9], "why": 8, "stop": 8, "mean": [8, 9], "api": [8, 15], "limit": [8, 13], "length": 8, "becaus": 8, "function_cal": 8, "from_open_ai_object": 8, "open_ai_object": 8, "openaitokencount": 8, "model_nam": 8, "count": [8, 13, 14], "some": [8, 15], "3": [8, 9], "5": 8, "davinci": 8, "003": 8, "embed": 8, "ada": 8, "002": 8, "iter": [8, 11, 13, 14], "just": [8, 14], "_check_mandatory_azure_completion_kwarg": 8, "mandatori": 8, "significancerepeatedtrainingprun": 9, "alpha": 9, "float": [9, 11, 13], "0": [9, 10, 13], "1": [9, 14], "n_warmup_step": 9, "baseprun": 9, "pruner": 9, "statist": 9, "signific": 9, "heurist": 9, "decis": 9, "make": [9, 10], "It": [9, 11, 13, 15], "prune": 9, "repeat": 9, "train": [9, 14], "like": 9, "cross": [9, 14], "valid": [9, 14], "As": 9, "test": [9, 14], "t": 9, "our": 9, "experi": 9, "have": 9, "shown": 9, "aplha": 9, "between": [9, 13], "standard": 9, "assum": 9, "adjust": 9, "onc": [9, 13], "hyperparamet": 9, "those": 9, "basi": 9, "intermedi": 9, "epoch": 9, "contrast": 9, "precis": 9, "individu": 9, "fold": [9, 14], "below": 9, "minimalist": 9, "import": [9, 11], "log": 9, "numpi": 9, "np": 9, "sklearn": 9, "dataset": [9, 14], "load_iri": 9, "model_select": 9, "stratifiedkfold": 9, "ensembl": 9, "randomforestclassifi": 9, "metric": 9, "accuracy_scor": 9, "configur": 9, "logger": 9, "debug": 9, "output": [9, 11], "getlogg": 9, "addhandl": 9, "streamhandl": 9, "setlevel": 9, "x": [9, 10], "y": [9, 10], "target": 9, "def": 9, "trial": 9, "min_samples_split": 9, "suggest_int": 9, "2": 9, "n_estim": 9, "validation_result_list": 9, "skf": 9, "n_split": [9, 14], "fold_index": 9, "train_index": 9, "val_index": 9, "enumer": 9, "x_train": 9, "x_val": 9, "y_train": 9, "y_val": 9, "rf": 9, "fit": [9, 13], "y_pred": 9, "predict": 9, "acc": 9, "report": 9, "we": 9, "should": [9, 11], "should_prun": 9, "here": 9, "break": 9, "studi": 9, "create_studi": 9, "storag": 9, "sqlite": 9, "memori": 9, "study_nam": 9, "iris_cv": 9, "direct": 9, "maxim": 9, "load_if_exist": 9, "sampler": 9, "tpesampl": 9, "multivari": 9, "add": 9, "optim": 9, "n_trial": 9, "level": 9, "aggress": 9, "smaller": 9, "stronger": 9, "differ": [9, 10, 13], "two": [9, 10, 11, 13], "distribut": 9, "disabl": 9, "reach": 9, "exce": 9, "step": [9, 10], "frozentri": 9, "judg": 9, "whether": 9, "note": 9, "suppos": 9, "librari": 9, "instead": 9, "interfac": 9, "mechan": 9, "take": 9, "copi": 9, "befor": [9, 13], "modifi": 9, "boolean": 9, "repres": 9, "matplotlib": 10, "boxplot": 10, "titl": 10, "xlabel": 10, "ylabel": 10, "vert": 10, "print": [10, 11], "diagram": 10, "pyplot": 10, "axi": 10, "box": [10, 15], "vertic": 10, "horizont": 10, "boxplot_dict": 10, "values_dict": 10, "form": [10, 13], "dictionari": 10, "save_last_figur": 10, "last": 10, "made": 10, "jupyt": 10, "notebook": 10, "same": 10, "cell": 10, "twin_axes_timeseries_plot": 10, "values_1": 10, "label_1": 10, "values_2": 10, "label_2": 10, "start_timestep_numb": 10, "shift_1": 10, "shift_2": 10, "label_x": 10, "color_1": 10, "tab": 10, "red": 10, "color_2": 10, "blue": 10, "twin": 10, "ax": 10, "timeseri": 10, "curv": 10, "array_lik": 10, "first": [10, 13], "second": 10, "point": 10, "time": [10, 13], "timestep": 10, "shift": 10, "posit": 10, "neg": 10, "color": 10, "jaccardsimilar": 11, "liter": 11, "de_cmc": 11, "en_ptb": 11, "somajobaseclass": 11, "calcul": [11, 13], "jaccard": 11, "similar": 11, "german": 11, "english": 11, "text1": 11, "text2": 11, "get_token_set": 11, "directli": 11, "somajosentencesplitt": [11, 12], "sentenc": [11, 12], "tokenextractor": 11, "extract_url_set": 11, "token_extractor": 11, "url_set": 11, "ist": 11, "ein": 11, "link": 11, "github": [11, 15], "urlswapp": 11, "url_pattern": 11, "swap": 11, "revers": 11, "replac": [11, 13], "extractor": 11, "pattern": 11, "One": [11, 13], "mark": 11, "put": 11, "reverse_swap_url": 11, "revert": 11, "were": 11, "unknown": 11, "swap_url": 11, "detoken": 11, "how": 11, "extract_token_class_set": 11, "keep_token_class": 11, "keep": 11, "kept": 11, "hug": [12, 14], "face": [12, 14], "textsplitt": 12, "somajo_sentence_splitt": 12, "ignore_overly_long_sent": 12, "alwai": 12, "whole": 12, "splitter": 12, "valueerror": [12, 13], "except": 12, "longer": 12, "simpli": 12, "ignor": 12, "detect": 13, "clean": 13, "invis": 13, "charact": 13, "special": 13, "whitespac": 13, "duplic": 13, "distanc": 13, "find": 13, "anomali": 13, "textdist": 13, "max_dimens": 13, "markup": 13, "unusu": 13, "multipl": 13, "again": 13, "dimens": 13, "greater": 13, "_normalize_char_count": 13, "normal": 13, "char": 13, "defaultdict": 13, "lazi": 13, "postprocess": 13, "manhattan": 13, "scipi": 13, "spatial": 13, "cityblock": 13, "most": 13, "commen": 13, "higher": 13, "least": 13, "_normalize_counter_to_defaultdict": 13, "devid": 13, "them": [13, 15], "clean_all_invisible_chars_and_whitespac": 13, "lead": 13, "trail": 13, "defin": 13, "constant": 13, "invisible_charact": 13, "special_whitespac": 13, "rteturn": 13, "has_invisible_charact": 13, "has_special_whitespac": 13, "remove_invisible_charact": 13, "replace_multiple_whitespac": 13, "replace_special_whitespac": 13, "kfoldlabeleddataset": 14, "7": 14, "n_repeat": 14, "random_st": 14, "k": 14, "labeleddataset": 14, "labeled_dataset": 14, "stratification_label": 14, "encod": 14, "labe": 14, "pretrained_model_name_or_path": 14, "pathlik": 14, "insid": 14, "repo": 14, "huggingfac": 14, "machin": 15, "learn": 15, "python": 15, "packag": 15, "pypi": 15, "might": 15, "refer": 15, "repositori": 15, "licens": 15, "imprint": 15}, "objects": {"mltb2": [[1, 0, 0, "-", "arangodb"], [2, 0, 0, "-", "bs"], [3, 0, 0, "-", "data"], [4, 0, 0, "-", "db"], [5, 0, 0, "-", "fasttext"], [6, 0, 0, "-", "files"], [7, 0, 0, "-", "md"], [8, 0, 0, "-", "openai"], [9, 0, 0, "-", "optuna"], [10, 0, 0, "-", "plot"], [11, 0, 0, "-", "somajo"], [12, 0, 0, "-", "somajo_transformers"], [13, 0, 0, "-", "text"], [14, 0, 0, "-", "transformers"]], "mltb2.arangodb": [[1, 1, 1, "", "ArangoBatchDataManager"], [1, 3, 1, "", "_check_config_keys"], [1, 3, 1, "", "arango_collection_backup"]], "mltb2.arangodb.ArangoBatchDataManager": [[1, 2, 1, "", "_arango_client_factory"], [1, 2, 1, "", "_connection_factory"], [1, 2, 1, "", "from_config_file"], [1, 2, 1, "", "load_batch"], [1, 2, 1, "", "save_batch"]], "mltb2.bs": [[2, 3, 1, "", "extract_all"], [2, 3, 1, "", "extract_one"], [2, 3, 1, "", "extract_text"], [2, 3, 1, "", "html_to_md"], [2, 3, 1, "", "remove_all"], [2, 3, 1, "", "soup_to_md"]], "mltb2.data": [[3, 3, 1, "", "_load_colon_data"], [3, 3, 1, "", "_load_colon_label"], [3, 3, 1, "", "load_colon"], [3, 3, 1, "", "load_leukemia_big"], [3, 3, 1, "", "load_prostate"]], "mltb2.db": [[4, 1, 1, "", "AbstractBatchDataManager"], [4, 1, 1, "", "BatchDataProcessor"]], "mltb2.db.AbstractBatchDataManager": [[4, 2, 1, "", "load_batch"], [4, 2, 1, "", "save_batch"]], "mltb2.db.BatchDataProcessor": [[4, 2, 1, "", "run"]], "mltb2.fasttext": [[5, 1, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[5, 2, 1, "", "__call__"], [5, 2, 1, "", "get_model_path_and_download"]], "mltb2.files": [[6, 3, 1, "", "fetch_remote_file"], [6, 3, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[7, 1, 1, "", "MdTextSplitter"], [7, 3, 1, "", "_chunk_md_by_headline"], [7, 3, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[7, 2, 1, "", "__call__"]], "mltb2.openai": [[8, 1, 1, "", "OpenAiAzureChatCompletion"], [8, 1, 1, "", "OpenAiAzureCompletion"], [8, 1, 1, "", "OpenAiBaseCompletion"], [8, 1, 1, "", "OpenAiChatCompletion"], [8, 1, 1, "", "OpenAiCompletion"], [8, 1, 1, "", "OpenAiCompletionAnswer"], [8, 1, 1, "", "OpenAiTokenCounter"], [8, 3, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[8, 2, 1, "", "__call__"], [8, 2, 1, "", "_completion"], [8, 2, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[8, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[8, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[8, 2, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[8, 2, 1, "", "__call__"]], "mltb2.optuna": [[9, 1, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[9, 2, 1, "", "prune"]], "mltb2.plot": [[10, 3, 1, "", "boxplot"], [10, 3, 1, "", "boxplot_dict"], [10, 3, 1, "", "save_last_figure"], [10, 3, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[11, 1, 1, "", "JaccardSimilarity"], [11, 1, 1, "", "SoMaJoBaseClass"], [11, 1, 1, "", "SoMaJoSentenceSplitter"], [11, 1, 1, "", "TokenExtractor"], [11, 1, 1, "", "UrlSwapper"], [11, 3, 1, "", "detokenize"], [11, 3, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[11, 2, 1, "", "__call__"], [11, 2, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[11, 2, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[11, 2, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[11, 2, 1, "", "reverse_swap_urls"], [11, 2, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[12, 1, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[12, 2, 1, "", "__call__"]], "mltb2.text": [[13, 1, 1, "", "TextDistance"], [13, 3, 1, "", "_normalize_counter_to_defaultdict"], [13, 3, 1, "", "clean_all_invisible_chars_and_whitespaces"], [13, 3, 1, "", "has_invisible_characters"], [13, 3, 1, "", "has_special_whitespaces"], [13, 3, 1, "", "remove_invisible_characters"], [13, 3, 1, "", "replace_multiple_whitespaces"], [13, 3, 1, "", "replace_special_whitespaces"]], "mltb2.text.TextDistance": [[13, 2, 1, "", "_normalize_char_counter"], [13, 2, 1, "", "distance"], [13, 2, 1, "", "fit"]], "mltb2.transformers": [[14, 1, 1, "", "KFoldLabeledDataset"], [14, 1, 1, "", "LabeledDataset"], [14, 1, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[14, 2, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[14, 2, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "refer": 0, "arangodb": 1, "b": 2, "data": 3, "db": 4, "fasttext": 5, "file": 6, "md": 7, "openai": 8, "optuna": 9, "plot": 10, "somajo": 11, "somajo_transform": 12, "text": 13, "transform": 14, "mltb2": 15, "document": 15, "instal": 15, "content": 15}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "arangodb": [[1, "module-mltb2.arangodb"]], "bs": [[2, "module-mltb2.bs"]], "data": [[3, "module-mltb2.data"]], "db": [[4, "module-mltb2.db"]], "fasttext": [[5, "module-mltb2.fasttext"]], "files": [[6, "module-mltb2.files"]], "md": [[7, "module-mltb2.md"]], "openai": [[8, "module-mltb2.openai"]], "optuna": [[9, "module-mltb2.optuna"]], "plot": [[10, "module-mltb2.plot"]], "somajo": [[11, "module-mltb2.somajo"]], "somajo_transformers": [[12, "module-mltb2.somajo_transformers"]], "text": [[13, "module-mltb2.text"]], "transformers": [[14, "module-mltb2.transformers"]], "MLTB2 Documentation": [[15, "mltb2-documentation"]], "Installation": [[15, "installation"]], "Content": [[15, "content"]]}, "indexentries": {"arangobatchdatamanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoBatchDataManager"]], "_arango_client_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._arango_client_factory"]], "_check_config_keys() (in module mltb2.arangodb)": [[1, "mltb2.arangodb._check_config_keys"]], "_connection_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._connection_factory"]], "arango_collection_backup() (in module mltb2.arangodb)": [[1, "mltb2.arangodb.arango_collection_backup"]], "from_config_file() (mltb2.arangodb.arangobatchdatamanager class method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.from_config_file"]], "load_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.load_batch"]], "mltb2.arangodb": [[1, "module-mltb2.arangodb"]], "module": [[1, "module-mltb2.arangodb"], [2, "module-mltb2.bs"], [3, "module-mltb2.data"], [4, "module-mltb2.db"], [5, "module-mltb2.fasttext"], [6, "module-mltb2.files"], [7, "module-mltb2.md"], [8, "module-mltb2.openai"], [9, "module-mltb2.optuna"], [10, "module-mltb2.plot"], [11, "module-mltb2.somajo"], [12, "module-mltb2.somajo_transformers"], [13, "module-mltb2.text"], [14, "module-mltb2.transformers"]], "save_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.save_batch"]], "extract_all() (in module mltb2.bs)": [[2, "mltb2.bs.extract_all"]], "extract_one() (in module mltb2.bs)": [[2, "mltb2.bs.extract_one"]], "extract_text() (in module mltb2.bs)": [[2, "mltb2.bs.extract_text"]], "html_to_md() (in module mltb2.bs)": [[2, "mltb2.bs.html_to_md"]], "mltb2.bs": [[2, "module-mltb2.bs"]], "remove_all() (in module mltb2.bs)": [[2, "mltb2.bs.remove_all"]], "soup_to_md() (in module mltb2.bs)": [[2, "mltb2.bs.soup_to_md"]], "_load_colon_data() (in module mltb2.data)": [[3, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[3, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[3, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[3, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[3, "mltb2.data.load_prostate"]], "mltb2.data": [[3, "module-mltb2.data"]], "abstractbatchdatamanager (class in mltb2.db)": [[4, "mltb2.db.AbstractBatchDataManager"]], "batchdataprocessor (class in mltb2.db)": [[4, "mltb2.db.BatchDataProcessor"]], "load_batch() (mltb2.db.abstractbatchdatamanager method)": [[4, "mltb2.db.AbstractBatchDataManager.load_batch"]], "mltb2.db": [[4, "module-mltb2.db"]], "run() (mltb2.db.batchdataprocessor method)": [[4, "mltb2.db.BatchDataProcessor.run"]], "save_batch() (mltb2.db.abstractbatchdatamanager method)": [[4, "mltb2.db.AbstractBatchDataManager.save_batch"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[5, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[5, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[5, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[5, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[6, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[6, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[6, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[7, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[7, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[7, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[7, "mltb2.md.chunk_md"]], "mltb2.md": [[7, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[8, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[8, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[8, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[8, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[8, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[8, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[8, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[8, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[8, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[9, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[9, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[9, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[10, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[10, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[10, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[10, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[10, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[11, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[11, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[11, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[11, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[11, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[11, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[11, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[11, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[11, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[11, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[11, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[11, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[11, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[11, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[12, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[12, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[12, "module-mltb2.somajo_transformers"]], "textdistance (class in mltb2.text)": [[13, "mltb2.text.TextDistance"]], "_normalize_char_counter() (mltb2.text.textdistance method)": [[13, "mltb2.text.TextDistance._normalize_char_counter"]], "_normalize_counter_to_defaultdict() (in module mltb2.text)": [[13, "mltb2.text._normalize_counter_to_defaultdict"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "distance() (mltb2.text.textdistance method)": [[13, "mltb2.text.TextDistance.distance"]], "fit() (mltb2.text.textdistance method)": [[13, "mltb2.text.TextDistance.fit"]], "has_invisible_characters() (in module mltb2.text)": [[13, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[13, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[13, "mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[14, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[14, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[14, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[14, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[14, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[14, "mltb2.transformers.KFoldLabeledDataset.split"]]}})
\ No newline at end of file
|