diff --git a/_modules/mltb2/openai.html b/_modules/mltb2/openai.html index 4ae2513..8254512 100644 --- a/_modules/mltb2/openai.html +++ b/_modules/mltb2/openai.html @@ -220,6 +220,23 @@
return cls(**result) # type: ignore[arg-type]
self,
prompt: Union[str, List[Dict[str, str]]],
completion_kwargs: Optional[Dict[str, Any]] = None,
+ clean_openai_tokens: bool = False,
) -> OpenAiChatResult:
"""Create a model response for the given prompt (chat conversation).
@@ -285,6 +303,7 @@ Source code for mltb2.openai
- ``openai.resources.chat.completions.Completions.create()``
- OpenAI API reference: `Create chat completion <https://platform.openai.com/docs/api-reference/chat/create>`_
+ clean_openai_tokens: Remove OpenAI special tokens from the prompt.
Returns:
The result of the OpenAI completion.
@@ -315,6 +334,8 @@ Source code for mltb2.openai
completion_kwargs = {} # set default value
completion_kwargs["model"] = self.model
messages = [{"role": "user", "content": prompt}] if isinstance(prompt, str) else prompt
+ if clean_openai_tokens:
+ messages = remove_openai_tokens(messages)
chat_completion = self.client.chat.completions.create(
messages=messages, # type: ignore[arg-type]
**completion_kwargs,
@@ -326,6 +347,7 @@ Source code for mltb2.openai
self,
prompt: Union[str, List[Dict[str, str]]],
completion_kwargs: Optional[Dict[str, Any]] = None,
+ clean_openai_tokens: bool = False,
) -> OpenAiChatResult:
"""Create a model response for the given prompt (chat conversation).
@@ -340,6 +362,7 @@ Source code for mltb2.openai
- ``openai.resources.chat.completions.Completions.create()``
- OpenAI API reference: `Create chat completion <https://platform.openai.com/docs/api-reference/chat/create>`_
+ clean_openai_tokens: Remove OpenAI special tokens from the prompt.
Returns:
The result of the OpenAI completion.
@@ -370,6 +393,8 @@ Source code for mltb2.openai
completion_kwargs = {} # set default value
completion_kwargs["model"] = self.model
messages = [{"role": "user", "content": prompt}] if isinstance(prompt, str) else prompt
+ if clean_openai_tokens:
+ messages = remove_openai_tokens(messages)
chat_completion = await self.async_client.chat.completions.create(
messages=messages, # type: ignore[arg-type]
**completion_kwargs,
diff --git a/api-reference/openai.html b/api-reference/openai.html
index a1c4cc0..e56b1f2 100644
--- a/api-reference/openai.html
+++ b/api-reference/openai.html
@@ -73,6 +73,7 @@
OpenAiTokenCounter.__call__()
+remove_openai_tokens()
optuna
@@ -168,7 +169,7 @@
-
-create_completions(prompt: str | List[Dict[str, str]], completion_kwargs: Dict[str, Any] | None = None) OpenAiChatResult [source]
+create_completions(prompt: str | List[Dict[str, str]], completion_kwargs: Dict[str, Any] | None = None, clean_openai_tokens: bool = False) OpenAiChatResult [source]
Create a model response for the given prompt (chat conversation).
- Parameters:
@@ -187,6 +188,7 @@
+
clean_openai_tokens (bool) – Remove OpenAI special tokens from the prompt.
Returns:
@@ -200,7 +202,7 @@
-
-async create_completions_async(prompt: str | List[Dict[str, str]], completion_kwargs: Dict[str, Any] | None = None) OpenAiChatResult [source]
+async create_completions_async(prompt: str | List[Dict[str, str]], completion_kwargs: Dict[str, Any] | None = None, clean_openai_tokens: bool = False) OpenAiChatResult [source]
Create a model response for the given prompt (chat conversation).
- Parameters:
@@ -219,6 +221,7 @@
+
clean_openai_tokens (bool) – Remove OpenAI special tokens from the prompt.
Returns:
@@ -346,6 +349,24 @@
+
+-
+mltb2.openai.remove_openai_tokens(messages: List[Dict[str, str]]) List[Dict[str, str]] [source]
+Remove OpenAI special tokens from the messages.
+These tokens are <|im_start|>
and <|im_end|>
and they can cause problems when passed to the OpenAI API.
+
+
+
diff --git a/genindex.html b/genindex.html
index 3df6ba4..0b67df6 100644
--- a/genindex.html
+++ b/genindex.html
@@ -508,6 +508,8 @@ R
remove_all() (in module mltb2.bs)
remove_invisible_characters() (in module mltb2.text)
+
+ remove_openai_tokens() (in module mltb2.openai)
diff --git a/objects.inv b/objects.inv
index 18745f5..4596bac 100644
Binary files a/objects.inv and b/objects.inv differ
diff --git a/searchindex.js b/searchindex.js
index 4bf751d..4c1132a 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["api-reference", "api-reference/arangodb", "api-reference/bs", "api-reference/data", "api-reference/db", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/arangodb.rst", "api-reference/bs.rst", "api-reference/data.rst", "api-reference/db.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "arangodb
", "bs
", "data
", "db
", "fasttext
", "files
", "md
", "openai
", "optuna
", "plot
", "somajo
", "somajo_transformers
", "text
", "transformers
", "MLTB2 Documentation"], "terms": {"arangodb": [0, 15], "b": [0, 5, 13, 15], "data": [0, 1, 4, 6, 9, 14, 15], "db": [0, 9, 15], "fasttext": [0, 15], "file": [0, 1, 3, 5, 8, 15], "md": [0, 15], "openai": [0, 15], "optuna": [0, 15], "plot": [0, 15], "somajo": [0, 12, 15], "somajo_transform": [0, 15], "text": [0, 2, 5, 7, 8, 11, 12, 14, 15], "transform": [0, 12, 15], "util": [1, 4, 6, 14], "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "pip": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15], "instal": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14], "necessari": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15], "depend": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "class": [1, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "arangobatchdatamanag": 1, "host": [1, 14], "str": [1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14], "sequenc": [1, 4, 6], "db_name": 1, "usernam": 1, "password": 1, "collection_nam": 1, "attribute_nam": 1, "batch_siz": [1, 6], "int": [1, 5, 6, 7, 8, 9, 10, 12, 13, 14], "20": [1, 9], "aql_overwrit": 1, "none": [1, 2, 3, 4, 5, 6, 8, 10, 11, 13, 14], "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "base": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "abstractbatchdatamanag": [1, 4], "arangoconnectionmanag": 1, "implement": [1, 2, 4, 9, 13], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "databas": [1, 4], "name": [1, 2, 6, 8], "document": 1, "from": [1, 2, 3, 4, 5, 6, 8, 9, 11, 13], "collect": 1, "ar": [1, 2, 3, 5, 6, 7, 8, 9, 13, 14, 15], "process": [1, 4, 6, 7, 8, 11, 12, 13, 14], "attribut": [1, 2], "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "check": [1, 9, 13], "alreadi": [1, 7], "If": [1, 2, 3, 5, 6, 8, 10, 11, 12, 13, 14], "present": 1, "avail": [1, 15], "consid": 1, "The": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "batch": [1, 4, 6], "size": [1, 6], "aql": 1, "string": [1, 2, 11], "overwrit": 1, "default": [1, 2, 3, 6, 10], "classmethod": [1, 8], "from_config_fil": 1, "config_file_nam": 1, "construct": [1, 8], "config": 1, "must": [1, 9, 13, 15], "contain": [1, 3, 5, 8, 13, 14], "valu": [1, 8, 9, 10, 13], "exampl": [1, 8, 9, 11, 13], "http": [1, 3, 11], "com": [1, 11], "my_ml_databas": 1, "my_usernam": 1, "secret": 1, "my_ml_data_collect": 1, "processing_metadata": 1, "100": [1, 9, 13], "path": [1, 5, 6, 14], "load_batch": [1, 4], "load": [1, 3, 4, 6, 8], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "type": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "save_batch": [1, 4, 6], "save": [1, 4, 6, 10], "object": [1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "connect": 1, "manag": [1, 4], "creat": [1, 6, 8, 10], "_arango_client_factori": 1, "arangocli": 1, "an": [1, 4, 5, 8, 9, 11, 12, 14], "client": 1, "_connection_factori": 1, "arango_cli": 1, "standarddatabas": 1, "arangoimportdatamanag": 1, "import": [1, 9, 11], "tool": [1, 2, 3, 5, 8, 9, 10, 11, 12, 13, 14, 15], "fill": 1, "least": [1, 13], "import_datafram": 1, "datafram": [1, 3], "create_collect": 1, "bool": [1, 6, 7, 8, 9, 10, 11, 12, 13, 14], "fals": [1, 6, 7, 8, 10, 11, 12, 13, 14], "panda": [1, 3], "true": [1, 2, 9, 10, 12, 13], "doe": [1, 7, 9, 10, 12], "exist": [1, 6], "rais": [1, 2, 6, 12, 13], "arango": 1, "except": [1, 12], "documentinserterror": 1, "fail": 1, "import_dict": 1, "dict": [1, 2, 5, 6, 8], "ani": [1, 2, 6, 8], "_check_config_kei": 1, "expected_config_kei": 1, "all": [1, 2, 6, 11], "expect": 1, "kei": [1, 8], "arango_collection_backup": 1, "commandlin": 1, "do": [1, 11, 13, 14], "backup": 1, "written": 1, "gzip": 1, "compress": 1, "jsonl": 1, "current": 1, "work": [1, 9], "directori": [1, 3, 6, 14], "run": [1, 4, 8], "col": 1, "h": 1, "get": [1, 5, 8, 11], "command": 1, "line": [1, 2], "help": 1, "beauti": 2, "soup": 2, "html": [2, 3, 13], "specif": [2, 7, 8, 11, 12, 13, 14, 15], "extract_al": 2, "beautifulsoup": 2, "attr": 2, "kwarg": 2, "extract": [2, 11], "specifi": [2, 7, 12], "element": [2, 5], "tag": [2, 13], "addit": 2, "keyword": [2, 8], "argument": [2, 8], "extract_on": 2, "exactli": 2, "one": [2, 4, 10, 11, 13], "function": [2, 4, 6, 8, 9, 10, 13], "expact": 2, "onli": [2, 7, 8, 9, 13], "result": [2, 6, 8, 9], "found": 2, "otherwis": [2, 13], "runtimeerror": 2, "extract_text": 2, "join_str": 2, "ha": [2, 8, 10], "known": 2, "issu": 2, "whitespac": [2, 13], "handl": 2, "join": 2, "part": 2, "per": [2, 7, 9, 12], "space": 2, "html_to_md": 2, "mdformat_opt": 2, "convert": [2, 3, 8, 11], "markdown": [2, 7], "mdformat": 2, "option": [2, 15], "number": [2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "appli": 2, "consecut": 2, "order": 2, "list": [2, 5, 6, 7, 8, 11, 12, 14], "wrap": 2, "paragraph": [2, 7], "word": [2, 11, 12], "mode": 2, "end": [2, 7], "lf": 2, "remove_al": 2, "remov": [2, 6, 7, 13], "done": [2, 4, 9], "place": [2, 11], "noth": 2, "": [2, 9, 10], "soup_to_md": 2, "offer": [3, 5, 9, 11, 12, 13, 14], "follow": [3, 13], "tabular": 3, "set": [3, 5, 8, 9, 11, 14], "biolog": 3, "medic": 3, "domain": 3, "support": [3, 6, 13], "colon": 3, "genom": 3, "pub": 3, "princeton": 3, "edu": 3, "oncologi": 3, "affydata": 3, "index": [3, 15], "prostat": 3, "web": 3, "stanford": 3, "hasti": 3, "casi_fil": 3, "leukemia_big": 3, "leukemia": 3, "after": [3, 6, 13], "internet": 3, "pars": 3, "cach": 3, "determin": [3, 6], "get_and_create_mltb2_data_dir": [3, 6], "_load_colon_data": 3, "label": [3, 10, 14], "also": [3, 8], "see": [3, 8, 9, 15], "_load_colon_label": 3, "seri": 3, "load_colon": 3, "mltb2_base_data_dir": [3, 6], "tupl": [3, 11], "user": [3, 6, 9], "platformdir": [3, 6], "user_data_dir": [3, 6], "load_leukemia_big": 3, "big": 3, "load_prost": 3, "abc": [4, 11], "abstract": [4, 11], "respect": 4, "intend": 4, "conjunct": 4, "batchdataprocessor": 4, "data_manag": 4, "process_batch_callback": 4, "callabl": 4, "A": [4, 5, 8, 9, 15], "callback": 4, "until": [4, 9], "empti": 4, "For": [4, 9, 10, 13, 15], "each": 4, "call": [4, 8, 9, 10, 13], "fasttextlanguageidentif": 5, "identifi": 5, "languag": [5, 11], "__call__": [5, 7, 8, 11, 12, 14], "num_lang": 5, "10": [5, 9], "always_detect_lang": 5, "given": [5, 6, 8, 9, 13], "which": [5, 6, 8, 9, 11, 13, 14], "recogn": 5, "probabl": 5, "should": [5, 9, 11], "alwai": [5, 8, 12], "even": 5, "detect": [5, 13], "0": [5, 9, 10, 13], "more": [5, 9, 10, 13], "than": [5, 12, 13], "so": 5, "guarante": 5, "you": [5, 8, 9, 15], "want": [5, 8, 15], "includ": 5, "case": 5, "when": [5, 6, 8, 15], "veri": 5, "low": 5, "possibl": 5, "af": 5, "al": 5, "am": 5, "arz": 5, "ast": 5, "av": 5, "az": 5, "azb": 5, "ba": 5, "bar": 5, "bcl": 5, "bg": 5, "bh": 5, "bn": 5, "bo": 5, "bpy": 5, "br": 5, "bxr": 5, "ca": 5, "cbk": 5, "ce": 5, "ceb": 5, "ckb": 5, "co": [5, 14], "c": 5, "cv": [5, 9], "cy": 5, "da": [5, 11], "de": [5, 11], "diq": 5, "dsb": 5, "dty": 5, "dv": 5, "el": 5, "eml": 5, "en": 5, "eo": 5, "e": 5, "et": 5, "eu": 5, "fa": 5, "fi": 5, "fr": 5, "frr": 5, "fy": 5, "ga": 5, "gd": 5, "gl": 5, "gn": 5, "gom": 5, "gu": 5, "gv": 5, "he": 5, "hi": 5, "hif": 5, "hr": 5, "hsb": 5, "ht": 5, "hu": 5, "hy": 5, "ia": 5, "id": [5, 14], "ie": 5, "ilo": 5, "io": 5, "ja": 5, "jbo": 5, "jv": 5, "ka": 5, "kk": 5, "km": 5, "kn": 5, "ko": 5, "krc": 5, "ku": 5, "kv": 5, "kw": 5, "ky": 5, "la": 5, "lb": 5, "lez": 5, "li": 5, "lmo": 5, "lo": 5, "lrc": 5, "lt": 5, "lv": 5, "mai": [5, 6], "mg": 5, "mhr": 5, "min": 5, "mk": 5, "ml": 5, "mn": 5, "mr": 5, "mrj": 5, "m": 5, "mt": 5, "mwl": 5, "my": 5, "myv": 5, "mzn": 5, "nah": 5, "nap": 5, "nd": 5, "ne": 5, "new": 5, "nl": 5, "nn": 5, "oc": 5, "o": 5, "pa": 5, "pam": 5, "pfl": 5, "pl": 5, "pm": 5, "pnb": 5, "p": 5, "pt": 5, "qu": 5, "rm": 5, "ro": 5, "ru": 5, "rue": 5, "sa": 5, "sah": 5, "sc": 5, "scn": 5, "sco": 5, "sd": 5, "sh": 5, "si": 5, "sk": 5, "sl": 5, "sq": 5, "sr": 5, "su": 5, "sv": 5, "sw": 5, "ta": 5, "te": 5, "tg": 5, "th": 5, "tk": 5, "tl": 5, "tr": 5, "tt": 5, "tyv": 5, "ug": 5, "uk": 5, "ur": 5, "uz": 5, "vec": 5, "vep": 5, "vi": 5, "vl": 5, "vo": 5, "wa": [5, 8, 13, 14], "war": 5, "wuu": 5, "xal": 5, "xmf": 5, "yi": 5, "yo": 5, "yue": 5, "zh": 5, "static": [5, 6], "get_model_path_and_download": 5, "model": [5, 8, 9, 14], "download": 5, "need": 5, "full": [5, 6, 8], "provid": [6, 9], "other": [6, 13], "filebasedrestartablebatchdataprocessor": 6, "uuid_nam": 6, "result_dir": 6, "processor": 6, "restart": 6, "back": 6, "uuid": 6, "field": 6, "where": [6, 11], "store": 6, "__len__": 6, "record": 6, "load_data": 6, "ignore_load_error": 6, "method": [6, 9, 13], "can": [6, 7, 8, 10, 13, 15], "As": [6, 9], "execut": 6, "sever": 6, "time": [6, 10, 13], "parallel": 6, "duplic": [6, 13], "These": [6, 13], "here": [6, 9], "ignor": [6, 12], "error": 6, "just": [6, 8, 14], "print": [6, 10, 11], "them": [6, 13], "read_batch": 6, "read": 6, "next": 6, "fetch_remote_fil": 6, "dirnam": 6, "filenam": [6, 10], "url": [6, 11], "sha256_checksum": 6, "fetch": 6, "remot": 6, "under": 6, "sha256": 6, "checksum": 6, "ioerror": 6, "wrong": 6, "dir": 6, "exact": 6, "folder": 6, "append": [6, 9], "mdtextsplitt": 7, "max_token": [7, 8, 12], "transformers_token_count": [7, 12], "transformerstokencount": [7, 12, 14], "show_progress_bar": [7, 8, 11, 12, 13, 14], "split": [7, 9, 11, 12, 14], "section": [7, 12], "maximum": [7, 12, 13], "token": [7, 8, 11, 12, 14], "divid": [7, 12], "head": 7, "correspond": 7, "exceed": 7, "singl": [7, 10, 13], "chunk": 7, "larger": [7, 9], "counter": [7, 12, 13], "show": [7, 8, 11, 12, 13, 14], "progressbar": [7, 8, 11, 12, 13, 14], "dure": [7, 8, 11, 12, 13, 14], "md_text": 7, "_chunk_md_by_headlin": 7, "headlin": 7, "chunk_md": 7, "merg": 7, "isol": 7, "subsequ": 7, "without": [7, 8], "content": [7, 8], "openaiazurechat": 8, "api_kei": 8, "api_vers": 8, "azure_endpoint": 8, "openaichat": 8, "interact": 8, "azur": 8, "chat": 8, "from_yaml": 8, "api": [8, 15], "refer": [8, 15], "complet": 8, "quickstart": 8, "start": 8, "gener": [8, 14], "servic": 8, "version": 8, "common": 8, "2023": 8, "05": 8, "15": 8, "endpoint": 8, "create_complet": 8, "prompt": 8, "completion_kwarg": 8, "openaichatresult": 8, "respons": 8, "convers": 8, "via": 8, "pleas": 8, "initi": 8, "messag": 8, "resourc": 8, "async": 8, "create_completions_async": 8, "yaml_fil": 8, "yaml": 8, "environ": 8, "variabl": 8, "openai_api_kei": 8, "prompt_token": 8, "completion_token": 8, "total_token": 8, "finish_reason": 8, "completion_arg": 8, "asdict": 8, "open_ai_chat_result": 8, "dataclass": 8, "been": 8, "total": [8, 13], "content_token": 8, "reason": [8, 9], "why": 8, "stop": 8, "mean": [8, 9], "limit": [8, 13], "length": 8, "becaus": 8, "content_filt": 8, "omit": 8, "due": 8, "flag": 8, "filter": 8, "tool_cal": 8, "function_cal": 8, "deprec": 8, "have": [8, 9], "temperatur": 8, "top_p": 8, "from_chat_complet": 8, "chat_complet": 8, "chatcomplet": 8, "openaitokencount": 8, "model_nam": 8, "count": [8, 13, 14], "some": [8, 13], "gpt": 8, "4": [8, 9], "3": [8, 9], "5": 8, "turbo": 8, "davinci": 8, "003": 8, "embed": 8, "ada": 8, "002": 8, "iter": [8, 11, 13, 14], "significancerepeatedtrainingprun": 9, "alpha": 9, "float": [9, 11, 13], "1": [9, 14], "n_warmup_step": 9, "baseprun": 9, "pruner": 9, "statist": 9, "signific": 9, "heurist": 9, "decis": 9, "make": [9, 10], "It": [9, 11, 13, 15], "prune": 9, "repeat": 9, "train": [9, 14], "like": 9, "cross": [9, 14], "valid": [9, 14], "test": [9, 14], "t": 9, "our": 9, "experi": 9, "shown": 9, "aplha": 9, "between": [9, 13], "standard": 9, "assum": 9, "adjust": 9, "onc": [9, 13], "hyperparamet": 9, "those": [9, 15], "basi": 9, "intermedi": 9, "epoch": 9, "In": 9, "contrast": 9, "precis": 9, "individu": [9, 15], "fold": [9, 14], "below": 9, "minimalist": 9, "log": 9, "numpi": 9, "np": 9, "sklearn": 9, "dataset": [9, 14], "load_iri": 9, "model_select": 9, "stratifiedkfold": 9, "ensembl": 9, "randomforestclassifi": 9, "metric": 9, "accuracy_scor": 9, "configur": 9, "logger": 9, "debug": 9, "output": [9, 11], "getlogg": 9, "addhandl": 9, "streamhandl": 9, "setlevel": 9, "x": [9, 10, 13], "y": [9, 10, 13], "target": 9, "def": 9, "trial": 9, "min_samples_split": 9, "suggest_int": 9, "2": 9, "n_estim": 9, "validation_result_list": 9, "skf": 9, "n_split": [9, 14], "fold_index": 9, "train_index": 9, "val_index": 9, "enumer": 9, "x_train": 9, "x_val": 9, "y_train": 9, "y_val": 9, "rf": 9, "fit": [9, 13], "y_pred": 9, "predict": 9, "acc": 9, "report": 9, "we": [9, 13], "should_prun": 9, "break": 9, "studi": 9, "create_studi": 9, "storag": 9, "sqlite": 9, "memori": 9, "study_nam": 9, "iris_cv": 9, "direct": 9, "maxim": 9, "load_if_exist": 9, "sampler": 9, "tpesampl": 9, "multivari": 9, "add": 9, "optim": 9, "n_trial": 9, "level": 9, "aggress": 9, "smaller": 9, "stronger": 9, "differ": [9, 10, 13], "two": [9, 10, 11, 13], "distribut": 9, "disabl": 9, "reach": 9, "exce": 9, "step": [9, 10], "frozentri": 9, "judg": 9, "whether": 9, "note": 9, "suppos": 9, "librari": 9, "instead": 9, "interfac": 9, "mechan": 9, "take": 9, "copi": 9, "befor": [9, 13], "modifi": 9, "boolean": 9, "repres": 9, "matplotlib": 10, "boxplot": 10, "titl": 10, "xlabel": 10, "ylabel": 10, "vert": 10, "diagram": 10, "pyplot": 10, "axi": 10, "box": [10, 15], "vertic": 10, "horizont": 10, "boxplot_dict": 10, "values_dict": 10, "form": [10, 13], "dictionari": 10, "save_last_figur": 10, "last": 10, "made": 10, "jupyt": 10, "notebook": 10, "same": 10, "cell": 10, "twin_axes_timeseries_plot": 10, "values_1": 10, "label_1": 10, "values_2": 10, "label_2": 10, "start_timestep_numb": 10, "shift_1": 10, "shift_2": 10, "label_x": 10, "color_1": 10, "tab": 10, "red": 10, "color_2": 10, "blue": 10, "twin": 10, "ax": 10, "timeseri": 10, "curv": 10, "array_lik": 10, "first": [10, 13], "second": 10, "point": 10, "timestep": 10, "shift": 10, "posit": 10, "neg": 10, "color": 10, "jaccardsimilar": 11, "liter": 11, "de_cmc": 11, "en_ptb": 11, "somajobaseclass": 11, "calcul": [11, 13], "jaccard": 11, "similar": 11, "german": 11, "english": 11, "text1": 11, "text2": 11, "get_token_set": 11, "directli": 11, "somajosentencesplitt": [11, 12], "sentenc": [11, 12], "tokenextractor": 11, "extract_token_set": 11, "keep_token_class": 11, "keep": 11, "kept": 11, "extract_url_set": 11, "token_extractor": 11, "url_set": 11, "ist": 11, "ein": 11, "link": 11, "github": [11, 15], "urlswapp": 11, "url_pattern": 11, "swap": 11, "revers": 11, "replac": [11, 13], "extractor": 11, "pattern": 11, "One": [11, 13], "mark": 11, "put": 11, "reverse_swap_url": 11, "revert": 11, "were": 11, "unknown": 11, "swap_url": 11, "detoken": 11, "how": 11, "extract_token_class_set": 11, "hug": [12, 14], "face": [12, 14], "textsplitt": 12, "somajo_sentence_splitt": 12, "ignore_overly_long_sent": 12, "whole": 12, "splitter": 12, "valueerror": [12, 13], "longer": 12, "simpli": 12, "clean": 13, "invis": 13, "charact": 13, "special": 13, "distanc": 13, "find": 13, "anomali": 13, "textdist": 13, "max_dimens": 13, "markup": 13, "unusu": 13, "multipl": 13, "again": 13, "dimens": 13, "greater": 13, "_normalize_char_count": 13, "normal": 13, "char": 13, "defaultdict": 13, "lazi": 13, "postprocess": 13, "manhattan": 13, "scipi": 13, "spatial": 13, "cityblock": 13, "most": 13, "commen": 13, "higher": 13, "_normalize_counter_to_defaultdict": 13, "devid": 13, "clean_all_invisible_chars_and_strip": 13, "strip": 13, "lead": 13, "trail": 13, "defin": 13, "constant": 13, "invisible_charact": 13, "special_whitespac": 13, "rteturn": 13, "clean_all_invisible_chars_and_whitespac": 13, "has_invisible_charact": 13, "has_special_whitespac": 13, "has_xml_tag": 13, "xml": 13, "xml_tag": 13, "while": 13, "remove_invisible_charact": 13, "replace_multiple_whitespac": 13, "replace_special_whitespac": 13, "kfoldlabeleddataset": 14, "7": 14, "n_repeat": 14, "random_st": 14, "k": 14, "labeleddataset": 14, "labeled_dataset": 14, "stratification_label": 14, "encod": 14, "labe": 14, "pretrained_model_name_or_path": 14, "pathlik": 14, "insid": 14, "repo": 14, "huggingfac": 14, "machin": 15, "learn": 15, "python": 15, "packag": 15, "pypi": 15, "mani": 15, "To": 15, "descript": 15, "repositori": 15, "licens": 15, "imprint": 15}, "objects": {"mltb2": [[1, 0, 0, "-", "arangodb"], [2, 0, 0, "-", "bs"], [3, 0, 0, "-", "data"], [4, 0, 0, "-", "db"], [5, 0, 0, "-", "fasttext"], [6, 0, 0, "-", "files"], [7, 0, 0, "-", "md"], [8, 0, 0, "-", "openai"], [9, 0, 0, "-", "optuna"], [10, 0, 0, "-", "plot"], [11, 0, 0, "-", "somajo"], [12, 0, 0, "-", "somajo_transformers"], [13, 0, 0, "-", "text"], [14, 0, 0, "-", "transformers"]], "mltb2.arangodb": [[1, 1, 1, "", "ArangoBatchDataManager"], [1, 1, 1, "", "ArangoConnectionManager"], [1, 1, 1, "", "ArangoImportDataManager"], [1, 3, 1, "", "_check_config_keys"], [1, 3, 1, "", "arango_collection_backup"]], "mltb2.arangodb.ArangoBatchDataManager": [[1, 2, 1, "", "from_config_file"], [1, 2, 1, "", "load_batch"], [1, 2, 1, "", "save_batch"]], "mltb2.arangodb.ArangoConnectionManager": [[1, 2, 1, "", "_arango_client_factory"], [1, 2, 1, "", "_connection_factory"]], "mltb2.arangodb.ArangoImportDataManager": [[1, 2, 1, "", "from_config_file"], [1, 2, 1, "", "import_dataframe"], [1, 2, 1, "", "import_dicts"]], "mltb2.bs": [[2, 3, 1, "", "extract_all"], [2, 3, 1, "", "extract_one"], [2, 3, 1, "", "extract_text"], [2, 3, 1, "", "html_to_md"], [2, 3, 1, "", "remove_all"], [2, 3, 1, "", "soup_to_md"]], "mltb2.data": [[3, 3, 1, "", "_load_colon_data"], [3, 3, 1, "", "_load_colon_label"], [3, 3, 1, "", "load_colon"], [3, 3, 1, "", "load_leukemia_big"], [3, 3, 1, "", "load_prostate"]], "mltb2.db": [[4, 1, 1, "", "AbstractBatchDataManager"], [4, 1, 1, "", "BatchDataProcessor"]], "mltb2.db.AbstractBatchDataManager": [[4, 2, 1, "", "load_batch"], [4, 2, 1, "", "save_batch"]], "mltb2.db.BatchDataProcessor": [[4, 2, 1, "", "run"]], "mltb2.fasttext": [[5, 1, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[5, 2, 1, "", "__call__"], [5, 2, 1, "", "get_model_path_and_download"]], "mltb2.files": [[6, 1, 1, "", "FileBasedRestartableBatchDataProcessor"], [6, 3, 1, "", "fetch_remote_file"], [6, 3, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.files.FileBasedRestartableBatchDataProcessor": [[6, 2, 1, "", "__len__"], [6, 2, 1, "", "load_data"], [6, 2, 1, "", "read_batch"], [6, 2, 1, "", "save_batch"]], "mltb2.md": [[7, 1, 1, "", "MdTextSplitter"], [7, 3, 1, "", "_chunk_md_by_headline"], [7, 3, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[7, 2, 1, "", "__call__"]], "mltb2.openai": [[8, 1, 1, "", "OpenAiAzureChat"], [8, 1, 1, "", "OpenAiChat"], [8, 1, 1, "", "OpenAiChatResult"], [8, 1, 1, "", "OpenAiTokenCounter"]], "mltb2.openai.OpenAiChat": [[8, 2, 1, "", "create_completions"], [8, 2, 1, "", "create_completions_async"], [8, 2, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatResult": [[8, 2, 1, "", "from_chat_completion"]], "mltb2.openai.OpenAiTokenCounter": [[8, 2, 1, "", "__call__"]], "mltb2.optuna": [[9, 1, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[9, 2, 1, "", "prune"]], "mltb2.plot": [[10, 3, 1, "", "boxplot"], [10, 3, 1, "", "boxplot_dict"], [10, 3, 1, "", "save_last_figure"], [10, 3, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[11, 1, 1, "", "JaccardSimilarity"], [11, 1, 1, "", "SoMaJoBaseClass"], [11, 1, 1, "", "SoMaJoSentenceSplitter"], [11, 1, 1, "", "TokenExtractor"], [11, 1, 1, "", "UrlSwapper"], [11, 3, 1, "", "detokenize"], [11, 3, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[11, 2, 1, "", "__call__"], [11, 2, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[11, 2, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[11, 2, 1, "", "extract_token_set"], [11, 2, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[11, 2, 1, "", "reverse_swap_urls"], [11, 2, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[12, 1, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[12, 2, 1, "", "__call__"]], "mltb2.text": [[13, 1, 1, "", "TextDistance"], [13, 3, 1, "", "_normalize_counter_to_defaultdict"], [13, 3, 1, "", "clean_all_invisible_chars_and_strip"], [13, 3, 1, "", "clean_all_invisible_chars_and_whitespaces"], [13, 3, 1, "", "has_invisible_characters"], [13, 3, 1, "", "has_special_whitespaces"], [13, 3, 1, "", "has_xml_tag"], [13, 3, 1, "", "remove_invisible_characters"], [13, 3, 1, "", "replace_multiple_whitespaces"], [13, 3, 1, "", "replace_special_whitespaces"]], "mltb2.text.TextDistance": [[13, 2, 1, "", "_normalize_char_counter"], [13, 2, 1, "", "distance"], [13, 2, 1, "", "fit"]], "mltb2.transformers": [[14, 1, 1, "", "KFoldLabeledDataset"], [14, 1, 1, "", "LabeledDataset"], [14, 1, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[14, 2, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[14, 2, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "refer": 0, "arangodb": 1, "b": 2, "data": 3, "db": 4, "fasttext": 5, "file": 6, "md": 7, "openai": 8, "optuna": 9, "plot": 10, "somajo": 11, "somajo_transform": 12, "text": 13, "transform": 14, "mltb2": 15, "document": 15, "instal": 15, "content": 15}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "arangodb": [[1, "module-mltb2.arangodb"]], "bs": [[2, "module-mltb2.bs"]], "data": [[3, "module-mltb2.data"]], "db": [[4, "module-mltb2.db"]], "fasttext": [[5, "module-mltb2.fasttext"]], "files": [[6, "module-mltb2.files"]], "md": [[7, "module-mltb2.md"]], "openai": [[8, "module-mltb2.openai"]], "optuna": [[9, "module-mltb2.optuna"]], "plot": [[10, "module-mltb2.plot"]], "somajo": [[11, "module-mltb2.somajo"]], "somajo_transformers": [[12, "module-mltb2.somajo_transformers"]], "text": [[13, "module-mltb2.text"]], "transformers": [[14, "module-mltb2.transformers"]], "MLTB2 Documentation": [[15, "mltb2-documentation"]], "Installation": [[15, "installation"]], "Content": [[15, "content"]]}, "indexentries": {"arangobatchdatamanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoBatchDataManager"]], "arangoconnectionmanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoConnectionManager"]], "arangoimportdatamanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoImportDataManager"]], "_arango_client_factory() (mltb2.arangodb.arangoconnectionmanager method)": [[1, "mltb2.arangodb.ArangoConnectionManager._arango_client_factory"]], "_check_config_keys() (in module mltb2.arangodb)": [[1, "mltb2.arangodb._check_config_keys"]], "_connection_factory() (mltb2.arangodb.arangoconnectionmanager method)": [[1, "mltb2.arangodb.ArangoConnectionManager._connection_factory"]], "arango_collection_backup() (in module mltb2.arangodb)": [[1, "mltb2.arangodb.arango_collection_backup"]], "from_config_file() (mltb2.arangodb.arangobatchdatamanager class method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.from_config_file"]], "from_config_file() (mltb2.arangodb.arangoimportdatamanager class method)": [[1, "mltb2.arangodb.ArangoImportDataManager.from_config_file"]], "import_dataframe() (mltb2.arangodb.arangoimportdatamanager method)": [[1, "mltb2.arangodb.ArangoImportDataManager.import_dataframe"]], "import_dicts() (mltb2.arangodb.arangoimportdatamanager method)": [[1, "mltb2.arangodb.ArangoImportDataManager.import_dicts"]], "load_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.load_batch"]], "mltb2.arangodb": [[1, "module-mltb2.arangodb"]], "module": [[1, "module-mltb2.arangodb"], [2, "module-mltb2.bs"], [3, "module-mltb2.data"], [4, "module-mltb2.db"], [5, "module-mltb2.fasttext"], [6, "module-mltb2.files"], [7, "module-mltb2.md"], [8, "module-mltb2.openai"], [9, "module-mltb2.optuna"], [10, "module-mltb2.plot"], [11, "module-mltb2.somajo"], [12, "module-mltb2.somajo_transformers"], [13, "module-mltb2.text"], [14, "module-mltb2.transformers"]], "save_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.save_batch"]], "extract_all() (in module mltb2.bs)": [[2, "mltb2.bs.extract_all"]], "extract_one() (in module mltb2.bs)": [[2, "mltb2.bs.extract_one"]], "extract_text() (in module mltb2.bs)": [[2, "mltb2.bs.extract_text"]], "html_to_md() (in module mltb2.bs)": [[2, "mltb2.bs.html_to_md"]], "mltb2.bs": [[2, "module-mltb2.bs"]], "remove_all() (in module mltb2.bs)": [[2, "mltb2.bs.remove_all"]], "soup_to_md() (in module mltb2.bs)": [[2, "mltb2.bs.soup_to_md"]], "_load_colon_data() (in module mltb2.data)": [[3, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[3, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[3, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[3, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[3, "mltb2.data.load_prostate"]], "mltb2.data": [[3, "module-mltb2.data"]], "abstractbatchdatamanager (class in mltb2.db)": [[4, "mltb2.db.AbstractBatchDataManager"]], "batchdataprocessor (class in mltb2.db)": [[4, "mltb2.db.BatchDataProcessor"]], "load_batch() (mltb2.db.abstractbatchdatamanager method)": [[4, "mltb2.db.AbstractBatchDataManager.load_batch"]], "mltb2.db": [[4, "module-mltb2.db"]], "run() (mltb2.db.batchdataprocessor method)": [[4, "mltb2.db.BatchDataProcessor.run"]], "save_batch() (mltb2.db.abstractbatchdatamanager method)": [[4, "mltb2.db.AbstractBatchDataManager.save_batch"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[5, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[5, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[5, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[5, "module-mltb2.fasttext"]], "filebasedrestartablebatchdataprocessor (class in mltb2.files)": [[6, "mltb2.files.FileBasedRestartableBatchDataProcessor"]], "__len__() (mltb2.files.filebasedrestartablebatchdataprocessor method)": [[6, "mltb2.files.FileBasedRestartableBatchDataProcessor.__len__"]], "fetch_remote_file() (in module mltb2.files)": [[6, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[6, "mltb2.files.get_and_create_mltb2_data_dir"]], "load_data() (mltb2.files.filebasedrestartablebatchdataprocessor static method)": [[6, "mltb2.files.FileBasedRestartableBatchDataProcessor.load_data"]], "mltb2.files": [[6, "module-mltb2.files"]], "read_batch() (mltb2.files.filebasedrestartablebatchdataprocessor method)": [[6, "mltb2.files.FileBasedRestartableBatchDataProcessor.read_batch"]], "save_batch() (mltb2.files.filebasedrestartablebatchdataprocessor method)": [[6, "mltb2.files.FileBasedRestartableBatchDataProcessor.save_batch"]], "mdtextsplitter (class in mltb2.md)": [[7, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[7, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[7, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[7, "mltb2.md.chunk_md"]], "mltb2.md": [[7, "module-mltb2.md"]], "openaiazurechat (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiAzureChat"]], "openaichat (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiChat"]], "openaichatresult (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiChatResult"]], "openaitokencounter (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaitokencounter method)": [[8, "mltb2.openai.OpenAiTokenCounter.__call__"]], "create_completions() (mltb2.openai.openaichat method)": [[8, "mltb2.openai.OpenAiChat.create_completions"]], "create_completions_async() (mltb2.openai.openaichat method)": [[8, "mltb2.openai.OpenAiChat.create_completions_async"]], "from_chat_completion() (mltb2.openai.openaichatresult class method)": [[8, "mltb2.openai.OpenAiChatResult.from_chat_completion"]], "from_yaml() (mltb2.openai.openaichat class method)": [[8, "mltb2.openai.OpenAiChat.from_yaml"]], "mltb2.openai": [[8, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[9, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[9, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[9, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[10, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[10, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[10, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[10, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[10, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[11, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[11, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[11, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[11, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[11, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[11, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[11, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[11, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[11, "mltb2.somajo.extract_token_class_set"]], "extract_token_set() (mltb2.somajo.tokenextractor method)": [[11, "mltb2.somajo.TokenExtractor.extract_token_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[11, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[11, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[11, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[11, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[11, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[12, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[12, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[12, "module-mltb2.somajo_transformers"]], "textdistance (class in mltb2.text)": [[13, "mltb2.text.TextDistance"]], "_normalize_char_counter() (mltb2.text.textdistance method)": [[13, "mltb2.text.TextDistance._normalize_char_counter"]], "_normalize_counter_to_defaultdict() (in module mltb2.text)": [[13, "mltb2.text._normalize_counter_to_defaultdict"]], "clean_all_invisible_chars_and_strip() (in module mltb2.text)": [[13, "mltb2.text.clean_all_invisible_chars_and_strip"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "distance() (mltb2.text.textdistance method)": [[13, "mltb2.text.TextDistance.distance"]], "fit() (mltb2.text.textdistance method)": [[13, "mltb2.text.TextDistance.fit"]], "has_invisible_characters() (in module mltb2.text)": [[13, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.has_special_whitespaces"]], "has_xml_tag() (in module mltb2.text)": [[13, "mltb2.text.has_xml_tag"]], "mltb2.text": [[13, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[13, "mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[14, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[14, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[14, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[14, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[14, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[14, "mltb2.transformers.KFoldLabeledDataset.split"]]}})
\ No newline at end of file
+Search.setIndex({"docnames": ["api-reference", "api-reference/arangodb", "api-reference/bs", "api-reference/data", "api-reference/db", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/arangodb.rst", "api-reference/bs.rst", "api-reference/data.rst", "api-reference/db.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "arangodb
", "bs
", "data
", "db
", "fasttext
", "files
", "md
", "openai
", "optuna
", "plot
", "somajo
", "somajo_transformers
", "text
", "transformers
", "MLTB2 Documentation"], "terms": {"arangodb": [0, 15], "b": [0, 5, 13, 15], "data": [0, 1, 4, 6, 9, 14, 15], "db": [0, 9, 15], "fasttext": [0, 15], "file": [0, 1, 3, 5, 8, 15], "md": [0, 15], "openai": [0, 15], "optuna": [0, 15], "plot": [0, 15], "somajo": [0, 12, 15], "somajo_transform": [0, 15], "text": [0, 2, 5, 7, 8, 11, 12, 14, 15], "transform": [0, 12, 15], "util": [1, 4, 6, 14], "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "pip": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15], "instal": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14], "necessari": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15], "depend": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "class": [1, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "arangobatchdatamanag": 1, "host": [1, 14], "str": [1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14], "sequenc": [1, 4, 6], "db_name": 1, "usernam": 1, "password": 1, "collection_nam": 1, "attribute_nam": 1, "batch_siz": [1, 6], "int": [1, 5, 6, 7, 8, 9, 10, 12, 13, 14], "20": [1, 9], "aql_overwrit": 1, "none": [1, 2, 3, 4, 5, 6, 8, 10, 11, 13, 14], "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "base": [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "abstractbatchdatamanag": [1, 4], "arangoconnectionmanag": 1, "implement": [1, 2, 4, 9, 13], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "databas": [1, 4], "name": [1, 2, 6, 8], "document": 1, "from": [1, 2, 3, 4, 5, 6, 8, 9, 11, 13], "collect": 1, "ar": [1, 2, 3, 5, 6, 7, 8, 9, 13, 14, 15], "process": [1, 4, 6, 7, 8, 11, 12, 13, 14], "attribut": [1, 2], "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], "check": [1, 9, 13], "alreadi": [1, 7], "If": [1, 2, 3, 5, 6, 8, 10, 11, 12, 13, 14], "present": 1, "avail": [1, 15], "consid": 1, "The": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "batch": [1, 4, 6], "size": [1, 6], "aql": 1, "string": [1, 2, 11], "overwrit": 1, "default": [1, 2, 3, 6, 10], "classmethod": [1, 8], "from_config_fil": 1, "config_file_nam": 1, "construct": [1, 8], "config": 1, "must": [1, 9, 13, 15], "contain": [1, 3, 5, 8, 13, 14], "valu": [1, 8, 9, 10, 13], "exampl": [1, 8, 9, 11, 13], "http": [1, 3, 11], "com": [1, 11], "my_ml_databas": 1, "my_usernam": 1, "secret": 1, "my_ml_data_collect": 1, "processing_metadata": 1, "100": [1, 9, 13], "path": [1, 5, 6, 14], "load_batch": [1, 4], "load": [1, 3, 4, 6, 8], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "type": [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "save_batch": [1, 4, 6], "save": [1, 4, 6, 10], "object": [1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14], "connect": 1, "manag": [1, 4], "creat": [1, 6, 8, 10], "_arango_client_factori": 1, "arangocli": 1, "an": [1, 4, 5, 8, 9, 11, 12, 14], "client": 1, "_connection_factori": 1, "arango_cli": 1, "standarddatabas": 1, "arangoimportdatamanag": 1, "import": [1, 9, 11], "tool": [1, 2, 3, 5, 8, 9, 10, 11, 12, 13, 14, 15], "fill": 1, "least": [1, 13], "import_datafram": 1, "datafram": [1, 3], "create_collect": 1, "bool": [1, 6, 7, 8, 9, 10, 11, 12, 13, 14], "fals": [1, 6, 7, 8, 10, 11, 12, 13, 14], "panda": [1, 3], "true": [1, 2, 9, 10, 12, 13], "doe": [1, 7, 9, 10, 12], "exist": [1, 6], "rais": [1, 2, 6, 12, 13], "arango": 1, "except": [1, 12], "documentinserterror": 1, "fail": 1, "import_dict": 1, "dict": [1, 2, 5, 6, 8], "ani": [1, 2, 6, 8], "_check_config_kei": 1, "expected_config_kei": 1, "all": [1, 2, 6, 11], "expect": 1, "kei": [1, 8], "arango_collection_backup": 1, "commandlin": 1, "do": [1, 11, 13, 14], "backup": 1, "written": 1, "gzip": 1, "compress": 1, "jsonl": 1, "current": 1, "work": [1, 9], "directori": [1, 3, 6, 14], "run": [1, 4, 8], "col": 1, "h": 1, "get": [1, 5, 8, 11], "command": 1, "line": [1, 2], "help": 1, "beauti": 2, "soup": 2, "html": [2, 3, 13], "specif": [2, 7, 8, 11, 12, 13, 14, 15], "extract_al": 2, "beautifulsoup": 2, "attr": 2, "kwarg": 2, "extract": [2, 11], "specifi": [2, 7, 12], "element": [2, 5], "tag": [2, 13], "addit": 2, "keyword": [2, 8], "argument": [2, 8], "extract_on": 2, "exactli": 2, "one": [2, 4, 10, 11, 13], "function": [2, 4, 6, 8, 9, 10, 13], "expact": 2, "onli": [2, 7, 8, 9, 13], "result": [2, 6, 8, 9], "found": 2, "otherwis": [2, 13], "runtimeerror": 2, "extract_text": 2, "join_str": 2, "ha": [2, 8, 10], "known": 2, "issu": 2, "whitespac": [2, 13], "handl": 2, "join": 2, "part": 2, "per": [2, 7, 9, 12], "space": 2, "html_to_md": 2, "mdformat_opt": 2, "convert": [2, 3, 8, 11], "markdown": [2, 7], "mdformat": 2, "option": [2, 15], "number": [2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "appli": 2, "consecut": 2, "order": 2, "list": [2, 5, 6, 7, 8, 11, 12, 14], "wrap": 2, "paragraph": [2, 7], "word": [2, 11, 12], "mode": 2, "end": [2, 7], "lf": 2, "remove_al": 2, "remov": [2, 6, 7, 8, 13], "done": [2, 4, 9], "place": [2, 11], "noth": 2, "": [2, 9, 10], "soup_to_md": 2, "offer": [3, 5, 9, 11, 12, 13, 14], "follow": [3, 13], "tabular": 3, "set": [3, 5, 8, 9, 11, 14], "biolog": 3, "medic": 3, "domain": 3, "support": [3, 6, 13], "colon": 3, "genom": 3, "pub": 3, "princeton": 3, "edu": 3, "oncologi": 3, "affydata": 3, "index": [3, 15], "prostat": 3, "web": 3, "stanford": 3, "hasti": 3, "casi_fil": 3, "leukemia_big": 3, "leukemia": 3, "after": [3, 6, 13], "internet": 3, "pars": 3, "cach": 3, "determin": [3, 6], "get_and_create_mltb2_data_dir": [3, 6], "_load_colon_data": 3, "label": [3, 10, 14], "also": [3, 8], "see": [3, 8, 9, 15], "_load_colon_label": 3, "seri": 3, "load_colon": 3, "mltb2_base_data_dir": [3, 6], "tupl": [3, 11], "user": [3, 6, 9], "platformdir": [3, 6], "user_data_dir": [3, 6], "load_leukemia_big": 3, "big": 3, "load_prost": 3, "abc": [4, 11], "abstract": [4, 11], "respect": 4, "intend": 4, "conjunct": 4, "batchdataprocessor": 4, "data_manag": 4, "process_batch_callback": 4, "callabl": 4, "A": [4, 5, 8, 9, 15], "callback": 4, "until": [4, 9], "empti": 4, "For": [4, 9, 10, 13, 15], "each": 4, "call": [4, 8, 9, 10, 13], "fasttextlanguageidentif": 5, "identifi": 5, "languag": [5, 11], "__call__": [5, 7, 8, 11, 12, 14], "num_lang": 5, "10": [5, 9], "always_detect_lang": 5, "given": [5, 6, 8, 9, 13], "which": [5, 6, 8, 9, 11, 13, 14], "recogn": 5, "probabl": 5, "should": [5, 9, 11], "alwai": [5, 8, 12], "even": 5, "detect": [5, 13], "0": [5, 9, 10, 13], "more": [5, 9, 10, 13], "than": [5, 12, 13], "so": 5, "guarante": 5, "you": [5, 8, 9, 15], "want": [5, 8, 15], "includ": 5, "case": 5, "when": [5, 6, 8, 15], "veri": 5, "low": 5, "possibl": 5, "af": 5, "al": 5, "am": 5, "arz": 5, "ast": 5, "av": 5, "az": 5, "azb": 5, "ba": 5, "bar": 5, "bcl": 5, "bg": 5, "bh": 5, "bn": 5, "bo": 5, "bpy": 5, "br": 5, "bxr": 5, "ca": 5, "cbk": 5, "ce": 5, "ceb": 5, "ckb": 5, "co": [5, 14], "c": 5, "cv": [5, 9], "cy": 5, "da": [5, 11], "de": [5, 11], "diq": 5, "dsb": 5, "dty": 5, "dv": 5, "el": 5, "eml": 5, "en": 5, "eo": 5, "e": 5, "et": 5, "eu": 5, "fa": 5, "fi": 5, "fr": 5, "frr": 5, "fy": 5, "ga": 5, "gd": 5, "gl": 5, "gn": 5, "gom": 5, "gu": 5, "gv": 5, "he": 5, "hi": 5, "hif": 5, "hr": 5, "hsb": 5, "ht": 5, "hu": 5, "hy": 5, "ia": 5, "id": [5, 14], "ie": 5, "ilo": 5, "io": 5, "ja": 5, "jbo": 5, "jv": 5, "ka": 5, "kk": 5, "km": 5, "kn": 5, "ko": 5, "krc": 5, "ku": 5, "kv": 5, "kw": 5, "ky": 5, "la": 5, "lb": 5, "lez": 5, "li": 5, "lmo": 5, "lo": 5, "lrc": 5, "lt": 5, "lv": 5, "mai": [5, 6], "mg": 5, "mhr": 5, "min": 5, "mk": 5, "ml": 5, "mn": 5, "mr": 5, "mrj": 5, "m": 5, "mt": 5, "mwl": 5, "my": 5, "myv": 5, "mzn": 5, "nah": 5, "nap": 5, "nd": 5, "ne": 5, "new": 5, "nl": 5, "nn": 5, "oc": 5, "o": 5, "pa": 5, "pam": 5, "pfl": 5, "pl": 5, "pm": 5, "pnb": 5, "p": 5, "pt": 5, "qu": 5, "rm": 5, "ro": 5, "ru": 5, "rue": 5, "sa": 5, "sah": 5, "sc": 5, "scn": 5, "sco": 5, "sd": 5, "sh": 5, "si": 5, "sk": 5, "sl": 5, "sq": 5, "sr": 5, "su": 5, "sv": 5, "sw": 5, "ta": 5, "te": 5, "tg": 5, "th": 5, "tk": 5, "tl": 5, "tr": 5, "tt": 5, "tyv": 5, "ug": 5, "uk": 5, "ur": 5, "uz": 5, "vec": 5, "vep": 5, "vi": 5, "vl": 5, "vo": 5, "wa": [5, 8, 13, 14], "war": 5, "wuu": 5, "xal": 5, "xmf": 5, "yi": 5, "yo": 5, "yue": 5, "zh": 5, "static": [5, 6], "get_model_path_and_download": 5, "model": [5, 8, 9, 14], "download": 5, "need": 5, "full": [5, 6, 8], "provid": [6, 9], "other": [6, 13], "filebasedrestartablebatchdataprocessor": 6, "uuid_nam": 6, "result_dir": 6, "processor": 6, "restart": 6, "back": 6, "uuid": 6, "field": 6, "where": [6, 11], "store": 6, "__len__": 6, "record": 6, "load_data": 6, "ignore_load_error": 6, "method": [6, 9, 13], "can": [6, 7, 8, 10, 13, 15], "As": [6, 9], "execut": 6, "sever": 6, "time": [6, 10, 13], "parallel": 6, "duplic": [6, 13], "These": [6, 8, 13], "here": [6, 9], "ignor": [6, 12], "error": 6, "just": [6, 8, 14], "print": [6, 10, 11], "them": [6, 13], "read_batch": 6, "read": 6, "next": 6, "fetch_remote_fil": 6, "dirnam": 6, "filenam": [6, 10], "url": [6, 11], "sha256_checksum": 6, "fetch": 6, "remot": 6, "under": 6, "sha256": 6, "checksum": 6, "ioerror": 6, "wrong": 6, "dir": 6, "exact": 6, "folder": 6, "append": [6, 9], "mdtextsplitt": 7, "max_token": [7, 8, 12], "transformers_token_count": [7, 12], "transformerstokencount": [7, 12, 14], "show_progress_bar": [7, 8, 11, 12, 13, 14], "split": [7, 9, 11, 12, 14], "section": [7, 12], "maximum": [7, 12, 13], "token": [7, 8, 11, 12, 14], "divid": [7, 12], "head": 7, "correspond": 7, "exceed": 7, "singl": [7, 10, 13], "chunk": 7, "larger": [7, 9], "counter": [7, 12, 13], "show": [7, 8, 11, 12, 13, 14], "progressbar": [7, 8, 11, 12, 13, 14], "dure": [7, 8, 11, 12, 13, 14], "md_text": 7, "_chunk_md_by_headlin": 7, "headlin": 7, "chunk_md": 7, "merg": 7, "isol": 7, "subsequ": 7, "without": [7, 8], "content": [7, 8], "openaiazurechat": 8, "api_kei": 8, "api_vers": 8, "azure_endpoint": 8, "openaichat": 8, "interact": 8, "azur": 8, "chat": 8, "from_yaml": 8, "api": [8, 15], "refer": [8, 15], "complet": 8, "quickstart": 8, "start": 8, "gener": [8, 14], "servic": 8, "version": 8, "common": 8, "2023": 8, "05": 8, "15": 8, "endpoint": 8, "create_complet": 8, "prompt": 8, "completion_kwarg": 8, "clean_openai_token": 8, "openaichatresult": 8, "respons": 8, "convers": 8, "via": 8, "pleas": 8, "initi": 8, "messag": 8, "resourc": 8, "special": [8, 13], "async": 8, "create_completions_async": 8, "yaml_fil": 8, "yaml": 8, "environ": 8, "variabl": 8, "openai_api_kei": 8, "prompt_token": 8, "completion_token": 8, "total_token": 8, "finish_reason": 8, "completion_arg": 8, "asdict": 8, "open_ai_chat_result": 8, "dataclass": 8, "been": 8, "total": [8, 13], "content_token": 8, "reason": [8, 9], "why": 8, "stop": 8, "mean": [8, 9], "limit": [8, 13], "length": 8, "becaus": 8, "content_filt": 8, "omit": 8, "due": 8, "flag": 8, "filter": 8, "tool_cal": 8, "function_cal": 8, "deprec": 8, "have": [8, 9], "temperatur": 8, "top_p": 8, "from_chat_complet": 8, "chat_complet": 8, "chatcomplet": 8, "openaitokencount": 8, "model_nam": 8, "count": [8, 13, 14], "some": [8, 13], "gpt": 8, "4": [8, 9], "3": [8, 9], "5": 8, "turbo": 8, "davinci": 8, "003": 8, "embed": 8, "ada": 8, "002": 8, "iter": [8, 11, 13, 14], "remove_openai_token": 8, "im_start": 8, "im_end": 8, "thei": 8, "caus": 8, "problem": 8, "pass": 8, "significancerepeatedtrainingprun": 9, "alpha": 9, "float": [9, 11, 13], "1": [9, 14], "n_warmup_step": 9, "baseprun": 9, "pruner": 9, "statist": 9, "signific": 9, "heurist": 9, "decis": 9, "make": [9, 10], "It": [9, 11, 13, 15], "prune": 9, "repeat": 9, "train": [9, 14], "like": 9, "cross": [9, 14], "valid": [9, 14], "test": [9, 14], "t": 9, "our": 9, "experi": 9, "shown": 9, "aplha": 9, "between": [9, 13], "standard": 9, "assum": 9, "adjust": 9, "onc": [9, 13], "hyperparamet": 9, "those": [9, 15], "basi": 9, "intermedi": 9, "epoch": 9, "In": 9, "contrast": 9, "precis": 9, "individu": [9, 15], "fold": [9, 14], "below": 9, "minimalist": 9, "log": 9, "numpi": 9, "np": 9, "sklearn": 9, "dataset": [9, 14], "load_iri": 9, "model_select": 9, "stratifiedkfold": 9, "ensembl": 9, "randomforestclassifi": 9, "metric": 9, "accuracy_scor": 9, "configur": 9, "logger": 9, "debug": 9, "output": [9, 11], "getlogg": 9, "addhandl": 9, "streamhandl": 9, "setlevel": 9, "x": [9, 10, 13], "y": [9, 10, 13], "target": 9, "def": 9, "trial": 9, "min_samples_split": 9, "suggest_int": 9, "2": 9, "n_estim": 9, "validation_result_list": 9, "skf": 9, "n_split": [9, 14], "fold_index": 9, "train_index": 9, "val_index": 9, "enumer": 9, "x_train": 9, "x_val": 9, "y_train": 9, "y_val": 9, "rf": 9, "fit": [9, 13], "y_pred": 9, "predict": 9, "acc": 9, "report": 9, "we": [9, 13], "should_prun": 9, "break": 9, "studi": 9, "create_studi": 9, "storag": 9, "sqlite": 9, "memori": 9, "study_nam": 9, "iris_cv": 9, "direct": 9, "maxim": 9, "load_if_exist": 9, "sampler": 9, "tpesampl": 9, "multivari": 9, "add": 9, "optim": 9, "n_trial": 9, "level": 9, "aggress": 9, "smaller": 9, "stronger": 9, "differ": [9, 10, 13], "two": [9, 10, 11, 13], "distribut": 9, "disabl": 9, "reach": 9, "exce": 9, "step": [9, 10], "frozentri": 9, "judg": 9, "whether": 9, "note": 9, "suppos": 9, "librari": 9, "instead": 9, "interfac": 9, "mechan": 9, "take": 9, "copi": 9, "befor": [9, 13], "modifi": 9, "boolean": 9, "repres": 9, "matplotlib": 10, "boxplot": 10, "titl": 10, "xlabel": 10, "ylabel": 10, "vert": 10, "diagram": 10, "pyplot": 10, "axi": 10, "box": [10, 15], "vertic": 10, "horizont": 10, "boxplot_dict": 10, "values_dict": 10, "form": [10, 13], "dictionari": 10, "save_last_figur": 10, "last": 10, "made": 10, "jupyt": 10, "notebook": 10, "same": 10, "cell": 10, "twin_axes_timeseries_plot": 10, "values_1": 10, "label_1": 10, "values_2": 10, "label_2": 10, "start_timestep_numb": 10, "shift_1": 10, "shift_2": 10, "label_x": 10, "color_1": 10, "tab": 10, "red": 10, "color_2": 10, "blue": 10, "twin": 10, "ax": 10, "timeseri": 10, "curv": 10, "array_lik": 10, "first": [10, 13], "second": 10, "point": 10, "timestep": 10, "shift": 10, "posit": 10, "neg": 10, "color": 10, "jaccardsimilar": 11, "liter": 11, "de_cmc": 11, "en_ptb": 11, "somajobaseclass": 11, "calcul": [11, 13], "jaccard": 11, "similar": 11, "german": 11, "english": 11, "text1": 11, "text2": 11, "get_token_set": 11, "directli": 11, "somajosentencesplitt": [11, 12], "sentenc": [11, 12], "tokenextractor": 11, "extract_token_set": 11, "keep_token_class": 11, "keep": 11, "kept": 11, "extract_url_set": 11, "token_extractor": 11, "url_set": 11, "ist": 11, "ein": 11, "link": 11, "github": [11, 15], "urlswapp": 11, "url_pattern": 11, "swap": 11, "revers": 11, "replac": [11, 13], "extractor": 11, "pattern": 11, "One": [11, 13], "mark": 11, "put": 11, "reverse_swap_url": 11, "revert": 11, "were": 11, "unknown": 11, "swap_url": 11, "detoken": 11, "how": 11, "extract_token_class_set": 11, "hug": [12, 14], "face": [12, 14], "textsplitt": 12, "somajo_sentence_splitt": 12, "ignore_overly_long_sent": 12, "whole": 12, "splitter": 12, "valueerror": [12, 13], "longer": 12, "simpli": 12, "clean": 13, "invis": 13, "charact": 13, "distanc": 13, "find": 13, "anomali": 13, "textdist": 13, "max_dimens": 13, "markup": 13, "unusu": 13, "multipl": 13, "again": 13, "dimens": 13, "greater": 13, "_normalize_char_count": 13, "normal": 13, "char": 13, "defaultdict": 13, "lazi": 13, "postprocess": 13, "manhattan": 13, "scipi": 13, "spatial": 13, "cityblock": 13, "most": 13, "commen": 13, "higher": 13, "_normalize_counter_to_defaultdict": 13, "devid": 13, "clean_all_invisible_chars_and_strip": 13, "strip": 13, "lead": 13, "trail": 13, "defin": 13, "constant": 13, "invisible_charact": 13, "special_whitespac": 13, "rteturn": 13, "clean_all_invisible_chars_and_whitespac": 13, "has_invisible_charact": 13, "has_special_whitespac": 13, "has_xml_tag": 13, "xml": 13, "xml_tag": 13, "while": 13, "remove_invisible_charact": 13, "replace_multiple_whitespac": 13, "replace_special_whitespac": 13, "kfoldlabeleddataset": 14, "7": 14, "n_repeat": 14, "random_st": 14, "k": 14, "labeleddataset": 14, "labeled_dataset": 14, "stratification_label": 14, "encod": 14, "labe": 14, "pretrained_model_name_or_path": 14, "pathlik": 14, "insid": 14, "repo": 14, "huggingfac": 14, "machin": 15, "learn": 15, "python": 15, "packag": 15, "pypi": 15, "mani": 15, "To": 15, "descript": 15, "repositori": 15, "licens": 15, "imprint": 15}, "objects": {"mltb2": [[1, 0, 0, "-", "arangodb"], [2, 0, 0, "-", "bs"], [3, 0, 0, "-", "data"], [4, 0, 0, "-", "db"], [5, 0, 0, "-", "fasttext"], [6, 0, 0, "-", "files"], [7, 0, 0, "-", "md"], [8, 0, 0, "-", "openai"], [9, 0, 0, "-", "optuna"], [10, 0, 0, "-", "plot"], [11, 0, 0, "-", "somajo"], [12, 0, 0, "-", "somajo_transformers"], [13, 0, 0, "-", "text"], [14, 0, 0, "-", "transformers"]], "mltb2.arangodb": [[1, 1, 1, "", "ArangoBatchDataManager"], [1, 1, 1, "", "ArangoConnectionManager"], [1, 1, 1, "", "ArangoImportDataManager"], [1, 3, 1, "", "_check_config_keys"], [1, 3, 1, "", "arango_collection_backup"]], "mltb2.arangodb.ArangoBatchDataManager": [[1, 2, 1, "", "from_config_file"], [1, 2, 1, "", "load_batch"], [1, 2, 1, "", "save_batch"]], "mltb2.arangodb.ArangoConnectionManager": [[1, 2, 1, "", "_arango_client_factory"], [1, 2, 1, "", "_connection_factory"]], "mltb2.arangodb.ArangoImportDataManager": [[1, 2, 1, "", "from_config_file"], [1, 2, 1, "", "import_dataframe"], [1, 2, 1, "", "import_dicts"]], "mltb2.bs": [[2, 3, 1, "", "extract_all"], [2, 3, 1, "", "extract_one"], [2, 3, 1, "", "extract_text"], [2, 3, 1, "", "html_to_md"], [2, 3, 1, "", "remove_all"], [2, 3, 1, "", "soup_to_md"]], "mltb2.data": [[3, 3, 1, "", "_load_colon_data"], [3, 3, 1, "", "_load_colon_label"], [3, 3, 1, "", "load_colon"], [3, 3, 1, "", "load_leukemia_big"], [3, 3, 1, "", "load_prostate"]], "mltb2.db": [[4, 1, 1, "", "AbstractBatchDataManager"], [4, 1, 1, "", "BatchDataProcessor"]], "mltb2.db.AbstractBatchDataManager": [[4, 2, 1, "", "load_batch"], [4, 2, 1, "", "save_batch"]], "mltb2.db.BatchDataProcessor": [[4, 2, 1, "", "run"]], "mltb2.fasttext": [[5, 1, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[5, 2, 1, "", "__call__"], [5, 2, 1, "", "get_model_path_and_download"]], "mltb2.files": [[6, 1, 1, "", "FileBasedRestartableBatchDataProcessor"], [6, 3, 1, "", "fetch_remote_file"], [6, 3, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.files.FileBasedRestartableBatchDataProcessor": [[6, 2, 1, "", "__len__"], [6, 2, 1, "", "load_data"], [6, 2, 1, "", "read_batch"], [6, 2, 1, "", "save_batch"]], "mltb2.md": [[7, 1, 1, "", "MdTextSplitter"], [7, 3, 1, "", "_chunk_md_by_headline"], [7, 3, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[7, 2, 1, "", "__call__"]], "mltb2.openai": [[8, 1, 1, "", "OpenAiAzureChat"], [8, 1, 1, "", "OpenAiChat"], [8, 1, 1, "", "OpenAiChatResult"], [8, 1, 1, "", "OpenAiTokenCounter"], [8, 3, 1, "", "remove_openai_tokens"]], "mltb2.openai.OpenAiChat": [[8, 2, 1, "", "create_completions"], [8, 2, 1, "", "create_completions_async"], [8, 2, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatResult": [[8, 2, 1, "", "from_chat_completion"]], "mltb2.openai.OpenAiTokenCounter": [[8, 2, 1, "", "__call__"]], "mltb2.optuna": [[9, 1, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[9, 2, 1, "", "prune"]], "mltb2.plot": [[10, 3, 1, "", "boxplot"], [10, 3, 1, "", "boxplot_dict"], [10, 3, 1, "", "save_last_figure"], [10, 3, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[11, 1, 1, "", "JaccardSimilarity"], [11, 1, 1, "", "SoMaJoBaseClass"], [11, 1, 1, "", "SoMaJoSentenceSplitter"], [11, 1, 1, "", "TokenExtractor"], [11, 1, 1, "", "UrlSwapper"], [11, 3, 1, "", "detokenize"], [11, 3, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[11, 2, 1, "", "__call__"], [11, 2, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[11, 2, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[11, 2, 1, "", "extract_token_set"], [11, 2, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[11, 2, 1, "", "reverse_swap_urls"], [11, 2, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[12, 1, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[12, 2, 1, "", "__call__"]], "mltb2.text": [[13, 1, 1, "", "TextDistance"], [13, 3, 1, "", "_normalize_counter_to_defaultdict"], [13, 3, 1, "", "clean_all_invisible_chars_and_strip"], [13, 3, 1, "", "clean_all_invisible_chars_and_whitespaces"], [13, 3, 1, "", "has_invisible_characters"], [13, 3, 1, "", "has_special_whitespaces"], [13, 3, 1, "", "has_xml_tag"], [13, 3, 1, "", "remove_invisible_characters"], [13, 3, 1, "", "replace_multiple_whitespaces"], [13, 3, 1, "", "replace_special_whitespaces"]], "mltb2.text.TextDistance": [[13, 2, 1, "", "_normalize_char_counter"], [13, 2, 1, "", "distance"], [13, 2, 1, "", "fit"]], "mltb2.transformers": [[14, 1, 1, "", "KFoldLabeledDataset"], [14, 1, 1, "", "LabeledDataset"], [14, 1, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[14, 2, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[14, 2, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "refer": 0, "arangodb": 1, "b": 2, "data": 3, "db": 4, "fasttext": 5, "file": 6, "md": 7, "openai": 8, "optuna": 9, "plot": 10, "somajo": 11, "somajo_transform": 12, "text": 13, "transform": 14, "mltb2": 15, "document": 15, "instal": 15, "content": 15}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "arangodb": [[1, "module-mltb2.arangodb"]], "bs": [[2, "module-mltb2.bs"]], "data": [[3, "module-mltb2.data"]], "db": [[4, "module-mltb2.db"]], "fasttext": [[5, "module-mltb2.fasttext"]], "files": [[6, "module-mltb2.files"]], "md": [[7, "module-mltb2.md"]], "openai": [[8, "module-mltb2.openai"]], "optuna": [[9, "module-mltb2.optuna"]], "plot": [[10, "module-mltb2.plot"]], "somajo": [[11, "module-mltb2.somajo"]], "somajo_transformers": [[12, "module-mltb2.somajo_transformers"]], "text": [[13, "module-mltb2.text"]], "transformers": [[14, "module-mltb2.transformers"]], "MLTB2 Documentation": [[15, "mltb2-documentation"]], "Installation": [[15, "installation"]], "Content": [[15, "content"]]}, "indexentries": {"arangobatchdatamanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoBatchDataManager"]], "arangoconnectionmanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoConnectionManager"]], "arangoimportdatamanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoImportDataManager"]], "_arango_client_factory() (mltb2.arangodb.arangoconnectionmanager method)": [[1, "mltb2.arangodb.ArangoConnectionManager._arango_client_factory"]], "_check_config_keys() (in module mltb2.arangodb)": [[1, "mltb2.arangodb._check_config_keys"]], "_connection_factory() (mltb2.arangodb.arangoconnectionmanager method)": [[1, "mltb2.arangodb.ArangoConnectionManager._connection_factory"]], "arango_collection_backup() (in module mltb2.arangodb)": [[1, "mltb2.arangodb.arango_collection_backup"]], "from_config_file() (mltb2.arangodb.arangobatchdatamanager class method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.from_config_file"]], "from_config_file() (mltb2.arangodb.arangoimportdatamanager class method)": [[1, "mltb2.arangodb.ArangoImportDataManager.from_config_file"]], "import_dataframe() (mltb2.arangodb.arangoimportdatamanager method)": [[1, "mltb2.arangodb.ArangoImportDataManager.import_dataframe"]], "import_dicts() (mltb2.arangodb.arangoimportdatamanager method)": [[1, "mltb2.arangodb.ArangoImportDataManager.import_dicts"]], "load_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.load_batch"]], "mltb2.arangodb": [[1, "module-mltb2.arangodb"]], "module": [[1, "module-mltb2.arangodb"], [2, "module-mltb2.bs"], [3, "module-mltb2.data"], [4, "module-mltb2.db"], [5, "module-mltb2.fasttext"], [6, "module-mltb2.files"], [7, "module-mltb2.md"], [8, "module-mltb2.openai"], [9, "module-mltb2.optuna"], [10, "module-mltb2.plot"], [11, "module-mltb2.somajo"], [12, "module-mltb2.somajo_transformers"], [13, "module-mltb2.text"], [14, "module-mltb2.transformers"]], "save_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.save_batch"]], "extract_all() (in module mltb2.bs)": [[2, "mltb2.bs.extract_all"]], "extract_one() (in module mltb2.bs)": [[2, "mltb2.bs.extract_one"]], "extract_text() (in module mltb2.bs)": [[2, "mltb2.bs.extract_text"]], "html_to_md() (in module mltb2.bs)": [[2, "mltb2.bs.html_to_md"]], "mltb2.bs": [[2, "module-mltb2.bs"]], "remove_all() (in module mltb2.bs)": [[2, "mltb2.bs.remove_all"]], "soup_to_md() (in module mltb2.bs)": [[2, "mltb2.bs.soup_to_md"]], "_load_colon_data() (in module mltb2.data)": [[3, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[3, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[3, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[3, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[3, "mltb2.data.load_prostate"]], "mltb2.data": [[3, "module-mltb2.data"]], "abstractbatchdatamanager (class in mltb2.db)": [[4, "mltb2.db.AbstractBatchDataManager"]], "batchdataprocessor (class in mltb2.db)": [[4, "mltb2.db.BatchDataProcessor"]], "load_batch() (mltb2.db.abstractbatchdatamanager method)": [[4, "mltb2.db.AbstractBatchDataManager.load_batch"]], "mltb2.db": [[4, "module-mltb2.db"]], "run() (mltb2.db.batchdataprocessor method)": [[4, "mltb2.db.BatchDataProcessor.run"]], "save_batch() (mltb2.db.abstractbatchdatamanager method)": [[4, "mltb2.db.AbstractBatchDataManager.save_batch"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[5, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[5, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[5, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[5, "module-mltb2.fasttext"]], "filebasedrestartablebatchdataprocessor (class in mltb2.files)": [[6, "mltb2.files.FileBasedRestartableBatchDataProcessor"]], "__len__() (mltb2.files.filebasedrestartablebatchdataprocessor method)": [[6, "mltb2.files.FileBasedRestartableBatchDataProcessor.__len__"]], "fetch_remote_file() (in module mltb2.files)": [[6, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[6, "mltb2.files.get_and_create_mltb2_data_dir"]], "load_data() (mltb2.files.filebasedrestartablebatchdataprocessor static method)": [[6, "mltb2.files.FileBasedRestartableBatchDataProcessor.load_data"]], "mltb2.files": [[6, "module-mltb2.files"]], "read_batch() (mltb2.files.filebasedrestartablebatchdataprocessor method)": [[6, "mltb2.files.FileBasedRestartableBatchDataProcessor.read_batch"]], "save_batch() (mltb2.files.filebasedrestartablebatchdataprocessor method)": [[6, "mltb2.files.FileBasedRestartableBatchDataProcessor.save_batch"]], "mdtextsplitter (class in mltb2.md)": [[7, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[7, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[7, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[7, "mltb2.md.chunk_md"]], "mltb2.md": [[7, "module-mltb2.md"]], "openaiazurechat (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiAzureChat"]], "openaichat (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiChat"]], "openaichatresult (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiChatResult"]], "openaitokencounter (class in mltb2.openai)": [[8, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaitokencounter method)": [[8, "mltb2.openai.OpenAiTokenCounter.__call__"]], "create_completions() (mltb2.openai.openaichat method)": [[8, "mltb2.openai.OpenAiChat.create_completions"]], "create_completions_async() (mltb2.openai.openaichat method)": [[8, "mltb2.openai.OpenAiChat.create_completions_async"]], "from_chat_completion() (mltb2.openai.openaichatresult class method)": [[8, "mltb2.openai.OpenAiChatResult.from_chat_completion"]], "from_yaml() (mltb2.openai.openaichat class method)": [[8, "mltb2.openai.OpenAiChat.from_yaml"]], "mltb2.openai": [[8, "module-mltb2.openai"]], "remove_openai_tokens() (in module mltb2.openai)": [[8, "mltb2.openai.remove_openai_tokens"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[9, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[9, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[9, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[10, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[10, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[10, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[10, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[10, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[11, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[11, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[11, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[11, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[11, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[11, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[11, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[11, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[11, "mltb2.somajo.extract_token_class_set"]], "extract_token_set() (mltb2.somajo.tokenextractor method)": [[11, "mltb2.somajo.TokenExtractor.extract_token_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[11, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[11, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[11, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[11, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[11, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[12, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[12, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[12, "module-mltb2.somajo_transformers"]], "textdistance (class in mltb2.text)": [[13, "mltb2.text.TextDistance"]], "_normalize_char_counter() (mltb2.text.textdistance method)": [[13, "mltb2.text.TextDistance._normalize_char_counter"]], "_normalize_counter_to_defaultdict() (in module mltb2.text)": [[13, "mltb2.text._normalize_counter_to_defaultdict"]], "clean_all_invisible_chars_and_strip() (in module mltb2.text)": [[13, "mltb2.text.clean_all_invisible_chars_and_strip"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "distance() (mltb2.text.textdistance method)": [[13, "mltb2.text.TextDistance.distance"]], "fit() (mltb2.text.textdistance method)": [[13, "mltb2.text.TextDistance.fit"]], "has_invisible_characters() (in module mltb2.text)": [[13, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.has_special_whitespaces"]], "has_xml_tag() (in module mltb2.text)": [[13, "mltb2.text.has_xml_tag"]], "mltb2.text": [[13, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[13, "mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[13, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[14, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[14, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[14, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[14, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[14, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[14, "mltb2.transformers.KFoldLabeledDataset.split"]]}})
\ No newline at end of file