From 961ca86cfafb6e38d3aa029a34b78f905a52a366 Mon Sep 17 00:00:00 2001 From: PhilipMay Date: Sun, 31 Dec 2023 22:17:39 +0000 Subject: [PATCH] deploy: 98d657742adb1688c0d1d66b5bf5f1633eeadc47 --- _modules/mltb2/arangodb.html | 69 +++++++++++++++++++++++++++++++++-- api-reference/arangodb.html | 33 +++++++++++++++++ genindex.html | 4 ++ objects.inv | Bin 1558 -> 1581 bytes searchindex.js | 2 +- 5 files changed, 103 insertions(+), 5 deletions(-) diff --git a/_modules/mltb2/arangodb.html b/_modules/mltb2/arangodb.html index 2ec9b25..1673437 100644 --- a/_modules/mltb2/arangodb.html +++ b/_modules/mltb2/arangodb.html @@ -90,17 +90,31 @@

Source code for mltb2.arangodb

 """
 
 
+import gzip
+from argparse import ArgumentParser
 from contextlib import closing
 from dataclasses import dataclass
-from typing import Optional, Sequence, Union
+from typing import Dict, Optional, Sequence, Union
 
+import jsonlines
 from arango import ArangoClient
 from arango.database import StandardDatabase
 from dotenv import dotenv_values
+from tqdm import tqdm
 
 from mltb2.db import AbstractBatchDataManager
 
 
+
[docs]def _check_config_keys(config: Dict[str, Optional[str]], expected_config_keys: Sequence[str]) -> None: + """Check if all expected keys are in config. + + This is useful to check if a config file contains all necessary keys. + """ + for expected_config_key in expected_config_keys: + if expected_config_key not in config: + raise ValueError(f"Config file must contain '{expected_config_key}'!")
+ +
[docs]@dataclass class ArangoBatchDataManager(AbstractBatchDataManager): """ArangoDB implementation of the ``AbstractBatchDataManager``. @@ -170,9 +184,7 @@

Source code for mltb2.arangodb

             "attribute_name",
             "batch_size",
         ]
-        for expected_config_file_key in expected_config_file_keys:
-            if expected_config_file_key not in arango_config:
-                raise ValueError(f"Config file must contain '{expected_config_file_key}'!")
+        _check_config_keys(arango_config, expected_config_file_keys)
 
         return cls(
             hosts=arango_config["hosts"],  # type: ignore
@@ -235,6 +247,55 @@ 

Source code for mltb2.arangodb

             connection = self._connection_factory(arango_client)
             collection = connection.collection(self.collection_name)
             collection.import_bulk(batch, on_duplicate="update")
+ + +
[docs]def arango_collection_backup() -> None: + """Commandline tool to do an ArangoDB backup of a collection. + + The backup is written to a gzip compressed JSONL file in the current working directory. + Run ``arango-col-backup -h`` to get command line help. + """ + # argument parsing + description = ( + "ArangoDB backup of a collection. " + "The backup is written to a gzip compressed JSONL file in the current working directory." + ) + argument_parser = ArgumentParser(description=description) + argument_parser.add_argument( + "--conf", type=str, required=True, help="Config file containing 'hosts', 'db_name', 'username' and 'password'." + ) + argument_parser.add_argument("--col", type=str, required=True, help="Collection name to backup.") + args = argument_parser.parse_args() + + # load and check config file + arango_config = dotenv_values(args.conf) + expected_config_file_keys = ["hosts", "db_name", "username", "password"] + _check_config_keys(arango_config, expected_config_file_keys) + + output_file_name = f"./{args.col}_backup.jsonl.gz" + print(f"Writing backup to '{output_file_name}'...") + + with closing(ArangoClient(hosts=arango_config["hosts"])) as arango_client, gzip.open( # type: ignore + output_file_name, "w" + ) as gzip_out: + connection = arango_client.db( + arango_config["db_name"], # type: ignore + arango_config["username"], # type: ignore + arango_config["password"], # type: ignore + ) + jsonlines_writer = jsonlines.Writer(gzip_out) # type: ignore + try: + cursor = connection.aql.execute( + "FOR doc IN @@coll RETURN doc", + bind_vars={"@coll": args.col}, + batch_size=100, + max_runtime=60 * 60, # type: ignore # 1 hour + stream=True, + ) + for doc in tqdm(cursor): + jsonlines_writer.write(doc) + finally: + cursor.close(ignore_missing=True) # type: ignore
diff --git a/api-reference/arangodb.html b/api-reference/arangodb.html index 585ad47..07f1ad2 100644 --- a/api-reference/arangodb.html +++ b/api-reference/arangodb.html @@ -59,6 +59,8 @@
  • ArangoBatchDataManager.save_batch()
  • +
  • _check_config_keys()
  • +
  • arango_collection_backup()
  • data
  • @@ -224,6 +226,37 @@ +
    +
    +mltb2.arangodb._check_config_keys(config: Dict[str, str | None], expected_config_keys: Sequence[str]) None[source]
    +

    Check if all expected keys are in config.

    +

    This is useful to check if a config file contains all necessary keys.

    +
    +
    Parameters:
    +
    +
    +
    Return type:
    +

    None

    +
    +
    +
    + +
    +
    +mltb2.arangodb.arango_collection_backup() None[source]
    +

    Commandline tool to do an ArangoDB backup of a collection.

    +

    The backup is written to a gzip compressed JSONL file in the current working directory. +Run arango-col-backup -h to get command line help.

    +
    +
    Return type:
    +

    None

    +
    +
    +
    + diff --git a/genindex.html b/genindex.html index 2d8a0a4..eabddd7 100644 --- a/genindex.html +++ b/genindex.html @@ -124,6 +124,8 @@

    _

  • _arango_client_factory() (mltb2.arangodb.ArangoBatchDataManager method) +
  • +
  • _check_config_keys() (in module mltb2.arangodb)
  • diff --git a/objects.inv b/objects.inv index 60b6f88f27c001a15eb75606f760e770124edce7..307668a93844a96284cdaeb225ba8fae8cbd6b0b 100644 GIT binary patch delta 1482 zcmV;*1vUDX46O{1b$?NF+b9r#-}Ngr(|fh=u8)07(sm}jWG++Zo*68#O^rYUD2e;) zZvn}YVu7*bJQ*4I*j?C#g_4`~=OtEHY$aLe)!|DiNzG~gSB@*TP;^5T6$SmXZL-{% zl7I4z7JuZ6tQ-)QH?DAtO|kom4SvJ|Z>h52f@7`ozoXn_;C~YN2B_eS3WGMdFj5_y z0@Umzx!yp96oM9pN%4WBY?LhRNE^0A8wU2j{t#SZg4Xum4}fd@Lg&L#v7^PlwZons zw54ro;mm(IuU#XO^JWIuxY*a#Bq{-QBMxG#1J2>9kh4FC0#dZ7pbQ->Lg9E#`7}&N znbDuqdQVG+(SMq4XJ9HNwZVp7j)=vpV5G8Zc&&{BS5A*oBdZ~dN%OSJkV*bEiaes8 zlq{&$@=`TV0ZUbji^SMqZ47-gk@;$y1ZM46{T&L=PxC{8)5u-PFj2qN-T*wr*;RDak-bj;wkK4fiIx1licNYPRn zdfb$P9@^u*2U*3eHie?#=g3vL7>Ec{mqSYuQL7{YouJ~8SnIDU&cI?3H)t7=&ME6* z6a!maE%vB{0C7M&iV0^TZrD*wOS!P8tf;^&Vm{v>V8W*IU--FUH@|B|?{?VS$+F@! z?tOi1lz(>})-Bc_gG!n93D9C7>(#jKPt?nP6i31b?TcxGg*W1n{!5KT*Su@hdK4$1z(S zP}xs*zzO)l@MkPEt$D691AB zd-?_ei#C-%a8Y1IR;*+kD`pPiHPgqP=cKsbg+zO#GuFeAre-R3B_Ht*={7B=23mSwgFu*{PyN@J2_RhA!JE?3Dt_vnS}ooU%le0nryNTuvGMgDov4w zZ!4FcFVu22i%_>v+3$Fih%#DR`z7y;)Y^O!5CKl;jZ`Jr>ulHt!n8TuhqX$-S(m0-o0{W$KNl(66 zpbaxq7=69GO0cF8p&MLtb9O%tVhc_&Y^AUSn0R5Dt+}_Gc6tBnjv1;eTu?m?zr*?+ z7@V>3oM|*&(Sl(YcUD>A)yVp8f>PXK05~XV8=}Q7Kw4Rj27`~0PUmfr0mVeH@(vaPY zge&%7&PBJ7X$V5!hK%-KKUMd(n03vX8Sb@se0aY7oPU)?T|z4Ao`zCXD!CkVxyUtbDmob1}tTAf>Rn>l$0? zI4->O^gm;mrvvPcVrU;tu|q0$!;bzx-)H6PpYWDuP0UDSDh~8w*UYD delta 1459 zcmV;k1x)&_43-R#b$`v0+b9r#?|KSV?OyHO%`vx0vZ+coQ#%#Uo+&KaHW>sez?pHL zejBhXX)G|7JSVpdd~`Q-^H&SIez&BW%B^DSvN^o37269a|14?47aBI8L00fbTV=Vk zwfHSmTKrxvih4j;-g)5zwbkwuwe+4!x&`gPm7vCyeB(}r)!hNJ&)--S?=k+u8x4d90UgvoG*pv2U3#gS>} zi-@o2b*CSGyAesZ* z>{uJCF-Y>`e}7~)#4&ku_&Q|P%pFCZP!C#FV2rxf%yYm}@8wl$Y^bpoUTk8#+A5Jz zI~9LM;Sq&z7^Hi2%@>A=m2Y@O{k(~qHsoIoUJ_DKAqYv>J-3T3Sb~OwAPu!ULM0+7L4 zOPE!&h)q~<&z$#Hjo|39L>RP;MB|+KFp447ZZDrmjS1pFc0ifnGVRz=OiQ`4rD{M@ zo^W5N5OLwL@L&A7<9EOI8b0o*{iy0jKsx&R*d*^ZtPj+@4Jv2YCqS!#t~cZQ?b?CX zVjlH|gMU0k-Olh^srRO;-AVJDN;?x^VTfH7LOD_EA9w_qr|zSA0{KzxrJbp2A2d5` zFl$1E`3_eR>drQVx@^%KBl7f2_0}ZQVXOzcmo%9YU4>{$^o1+Fm3|?|2H&A^%N%>u zluItR54c-O`?BLt052Q&lXUDDzY!|69dpGIlYf2DTD`P{sJ}bLcw=1stBC}^9Y$~B zk(zIDCQPH)XO4060)|*#g8|J^oMe_!HT|Iy{tOi&9z7O*rBy{WTk)C;s<}PH=S&}W znUfQKR}$Tp&bkOE2JBSqO5M}1$}d{p%4y~X$_lsAuI}aC-2~Wd2l{0A_lrNN`;uheoji-Di0{8u_lBpI!*!<6?< zYJENlNDwE@M(G;;bun!WYA18Zux8$AC)!6oJU>2?-CVdLNL&y2BB#+OZme4?6BewJP=USIFUn;x(J%9<5phv0a_ z-XXaRaxr$COaV^E%`(Wv`S~f{{C{-1DxFk}?IGfgdt0tD$i>A?$R*nOj-?zaD9%%7 z9Nf)|kvWm4+qqIhZ8WW2nBXGMfH;C9g$KiQg#FewUAU$T*Yp%~{>naw*uB;{sU!LD z%hamXr>_a>&_+@gWB4}Ges#Q4So-kFLv}9`UhKhqh;H$uAsM_38STGesDHjVvFNHD zN4Qtx{@de+cjYHl?Q1-W`sbmdW?2fJ9za;98fF1pe@FCc4vV(n*l}3TgN7mgvwH2{ ztp&O3{g_5VGRUNLS5`sp(|k_x7f3m*?z+ZF9ml7ap8Xdb^Kih`Q4alMfa+>DY#9Cv zZC0WDfo`EV=8PPT`v=D$cPv^Tv)V`ox&l3N2s`%EFL>SQFyCdzU>ASG*z<9BwE^dv NkE=Y;^?%`88USV!-ktye diff --git a/searchindex.js b/searchindex.js index e1f8e1d..c770b77 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["api-reference", "api-reference/arangodb", "api-reference/data", "api-reference/db", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/arangodb.rst", "api-reference/data.rst", "api-reference/db.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "arangodb", "data", "db", "fasttext", "files", "md", "openai", "optuna", "plot", "somajo", "somajo_transformers", "text", "transformers", "MLTB2 Documentation"], "terms": {"arangodb": [0, 14], "data": [0, 1, 3, 5, 8, 13, 14], "db": [0, 8, 14], "fasttext": [0, 14], "file": [0, 1, 2, 4, 7, 14], "md": [0, 14], "openai": [0, 14], "optuna": [0, 14], "plot": [0, 14], "somajo": [0, 11, 14], "somajo_transform": [0, 14], "text": [0, 4, 6, 7, 10, 11, 13, 14], "transform": [0, 11, 14], "util": [1, 3, 5, 13], "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "pip": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "instal": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13], "necessari": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "depend": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "class": [1, 3, 4, 6, 7, 8, 10, 11, 12, 13], "arangobatchdatamanag": 1, "host": [1, 13], "str": [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 13], "sequenc": [1, 3], "db_name": 1, "usernam": 1, "password": 1, "collection_nam": 1, "attribute_nam": 1, "batch_siz": 1, "int": [1, 4, 6, 7, 8, 9, 11, 12, 13], "20": [1, 8], "aql_overwrit": 1, "none": [1, 2, 3, 5, 7, 9, 10, 12, 13], "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "base": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "abstractbatchdatamanag": [1, 3], "implement": [1, 3, 8, 12], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "databas": [1, 3], "name": [1, 7], "document": 1, "from": [1, 2, 3, 4, 5, 7, 8, 10, 12], "collect": 1, "ar": [1, 2, 4, 6, 7, 8, 12, 13], "process": [1, 3, 6, 7, 10, 11, 12, 13], "attribut": 1, "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "check": [1, 7, 8, 12], "alreadi": [1, 6], "If": [1, 2, 5, 7, 9, 10, 11, 12, 13], "present": 1, "avail": [1, 14], "consid": 1, "The": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "batch": [1, 3], "size": 1, "aql": 1, "string": [1, 7, 10], "overwrit": [1, 7], "default": [1, 2, 5, 9], "_arango_client_factori": 1, "arangocli": 1, "creat": [1, 5, 7, 9], "an": [1, 3, 4, 7, 8, 10, 11, 13], "client": 1, "return": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "type": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "_connection_factori": 1, "arango_cli": 1, "standarddatabas": 1, "connect": 1, "classmethod": [1, 7], "from_config_fil": 1, "config_file_nam": 1, "construct": [1, 7], "config": 1, "must": [1, 7, 8, 12], "contain": [1, 2, 4, 10, 12, 13], "valu": [1, 8, 9, 12], "exampl": [1, 7, 8, 10, 12], "http": [1, 2, 10], "com": [1, 10], "my_ml_databas": 1, "my_usernam": 1, "secret": 1, "my_ml_data_collect": 1, "processing_metadata": 1, "100": [1, 8, 12], "path": [1, 4, 5, 13], "load_batch": [1, 3], "load": [1, 2, 3], "save_batch": [1, 3], "save": [1, 3, 5, 9], "offer": [2, 4, 8, 10, 11, 12, 13], "tool": [2, 4, 8, 9, 10, 11, 12, 13, 14], "follow": [2, 7, 12], "tabular": 2, "set": [2, 8, 10, 13], "biolog": 2, "medic": 2, "domain": 2, "support": [2, 12], "colon": 2, "genom": 2, "pub": 2, "princeton": 2, "edu": 2, "oncologi": 2, "affydata": 2, "index": [2, 14], "html": [2, 12], "prostat": 2, "web": 2, "stanford": 2, "hasti": 2, "casi_fil": 2, "leukemia_big": 2, "leukemia": 2, "after": [2, 12], "internet": 2, "pars": 2, "convert": [2, 10], "cach": 2, "directori": [2, 5, 13], "determin": [2, 5], "get_and_create_mltb2_data_dir": [2, 5], "_load_colon_data": 2, "datafram": 2, "label": [2, 9, 13], "also": [2, 7], "see": [2, 8], "panda": 2, "_load_colon_label": 2, "seri": 2, "load_colon": 2, "mltb2_base_data_dir": [2, 5], "tupl": [2, 10], "user": [2, 5, 8], "platformdir": [2, 5], "user_data_dir": [2, 5], "load_leukemia_big": 2, "big": 2, "load_prost": 2, "abc": [3, 7, 10], "abstract": [3, 7, 10], "respect": 3, "intend": 3, "conjunct": 3, "batchdataprocessor": 3, "data_manag": 3, "process_batch_callback": 3, "callabl": 3, "object": [3, 4, 6, 7, 8, 10, 11, 12, 13], "manag": 3, "A": [3, 4, 8, 14], "callback": 3, "function": [3, 5, 7, 8, 9, 12], "one": [3, 9, 10], "run": [3, 7], "done": [3, 8], "until": [3, 8], "empti": 3, "For": [3, 8, 9, 12], "each": 3, "call": [3, 7, 8, 9, 12], "fasttextlanguageidentif": 4, "identifi": 4, "languag": [4, 10], "__call__": [4, 6, 7, 10, 11, 13], "num_lang": 4, "10": [4, 8], "given": [4, 5, 8, 12], "which": [4, 5, 7, 8, 10, 12, 13], "recogn": 4, "number": [4, 6, 7, 8, 9, 10, 11, 12, 13], "dict": [4, 7], "probabl": 4, "more": [4, 8, 9, 12], "than": [4, 11, 12], "element": 4, "so": 4, "guarante": 4, "you": [4, 7, 8, 14], "want": 4, "includ": 4, "case": [4, 7], "when": [4, 7], "veri": 4, "low": 4, "possibl": 4, "af": 4, "al": 4, "am": 4, "arz": 4, "ast": 4, "av": 4, "az": 4, "azb": 4, "ba": 4, "bar": 4, "bcl": 4, "bg": 4, "bh": 4, "bn": 4, "bo": 4, "bpy": 4, "br": 4, "b": 4, "bxr": 4, "ca": 4, "cbk": 4, "ce": 4, "ceb": 4, "ckb": 4, "co": [4, 13], "c": 4, "cv": [4, 8], "cy": 4, "da": [4, 10], "de": [4, 10], "diq": 4, "dsb": 4, "dty": 4, "dv": 4, "el": 4, "eml": 4, "en": 4, "eo": 4, "e": 4, "et": 4, "eu": 4, "fa": 4, "fi": 4, "fr": 4, "frr": 4, "fy": 4, "ga": 4, "gd": 4, "gl": 4, "gn": 4, "gom": 4, "gu": 4, "gv": 4, "he": 4, "hi": 4, "hif": 4, "hr": 4, "hsb": 4, "ht": 4, "hu": 4, "hy": 4, "ia": 4, "id": [4, 13], "ie": 4, "ilo": 4, "io": 4, "ja": 4, "jbo": 4, "jv": 4, "ka": 4, "kk": 4, "km": 4, "kn": 4, "ko": 4, "krc": 4, "ku": 4, "kv": 4, "kw": 4, "ky": 4, "la": 4, "lb": 4, "lez": 4, "li": 4, "lmo": 4, "lo": 4, "lrc": 4, "lt": 4, "lv": 4, "mai": 4, "mg": 4, "mhr": 4, "min": 4, "mk": 4, "ml": 4, "mn": 4, "mr": 4, "mrj": 4, "m": 4, "mt": 4, "mwl": 4, "my": 4, "myv": 4, "mzn": 4, "nah": 4, "nap": 4, "nd": 4, "ne": 4, "new": 4, "nl": 4, "nn": 4, "oc": 4, "o": 4, "pa": 4, "pam": 4, "pfl": 4, "pl": 4, "pm": 4, "pnb": 4, "p": 4, "pt": 4, "qu": 4, "rm": 4, "ro": 4, "ru": 4, "rue": 4, "sa": 4, "sah": 4, "sc": 4, "scn": 4, "sco": 4, "sd": 4, "sh": 4, "si": 4, "sk": 4, "sl": 4, "sq": 4, "sr": 4, "su": 4, "sv": 4, "sw": 4, "ta": 4, "te": 4, "tg": 4, "th": 4, "tk": 4, "tl": 4, "tr": 4, "tt": 4, "tyv": 4, "ug": 4, "uk": 4, "ur": 4, "uz": 4, "vec": 4, "vep": 4, "vi": 4, "vl": 4, "vo": 4, "wa": [4, 7, 12, 13], "war": 4, "wuu": 4, "xal": 4, "xmf": 4, "yi": 4, "yo": 4, "yue": 4, "zh": 4, "static": 4, "get_model_path_and_download": 4, "get": [4, 7, 10], "model": [4, 7, 8, 13], "download": 4, "need": 4, "full": [4, 5, 7], "provid": [5, 8], "other": [5, 12], "fetch_remote_fil": 5, "dirnam": 5, "filenam": [5, 9], "url": [5, 10], "sha256_checksum": 5, "fetch": 5, "remot": 5, "where": [5, 10], "under": 5, "sha256": 5, "checksum": 5, "rais": [5, 11, 12], "ioerror": 5, "wrong": 5, "dir": 5, "exact": 5, "folder": 5, "append": [5, 8], "markdown": 6, "specif": [6, 7, 10, 11, 12, 13], "mdtextsplitt": 6, "max_token": [6, 11], "transformers_token_count": [6, 11], "transformerstokencount": [6, 11, 13], "show_progress_bar": [6, 7, 10, 11, 12, 13], "bool": [6, 7, 8, 9, 10, 11, 12, 13], "fals": [6, 7, 9, 10, 11, 12, 13], "split": [6, 8, 10, 11, 13], "section": [6, 11], "specifi": [6, 7, 11], "maximum": [6, 11, 12], "token": [6, 7, 10, 11, 13], "doe": [6, 8, 9, 11], "divid": [6, 11], "head": 6, "correspond": 6, "paragraph": 6, "per": [6, 8, 11], "can": [6, 7, 9, 12, 14], "onli": [6, 7, 8, 12], "exceed": 6, "singl": [6, 9, 12], "chunk": 6, "larger": [6, 8], "counter": [6, 11, 12], "show": [6, 7, 10, 11, 12, 13], "progressbar": [6, 7, 10, 11, 12, 13], "dure": [6, 7, 10, 11, 12, 13], "md_text": 6, "list": [6, 7, 10, 11, 13], "_chunk_md_by_headlin": 6, "headlin": 6, "chunk_md": 6, "merg": 6, "isol": 6, "subsequ": 6, "end": 6, "without": [6, 7], "content": 6, "remov": [6, 12], "openaiazurechatcomplet": 7, "completion_kwarg": 7, "ani": 7, "openaichatcomplet": 7, "azur": 7, "chat": 7, "complet": 7, "openaibasecomplet": 7, "from_yaml": 7, "kwarg": 7, "properti": 7, "api_typ": 7, "api_vers": 7, "api_bas": 7, "engin": 7, "quickstart": 7, "start": 7, "gpt": 7, "35": 7, "turbo": 7, "4": [7, 8], "servic": 7, "openaiazurecomplet": 7, "openaicomplet": 7, "non": 7, "gener": [7, 13], "prompt": 7, "map": 7, "openaicompletionansw": 7, "llm": 7, "In": [7, 8], "allow": 7, "chang": 7, "temperatur": 7, "_complet": 7, "completion_kwargs_for_this_cal": 7, "openaiobject": 7, "method": [7, 8, 12], "yaml_fil": 7, "yaml": 7, "prompt_token": 7, "completion_token": 7, "total_token": 7, "finish_reason": 7, "answer": 7, "result": [7, 8], "ha": [7, 9], "been": 7, "total": [7, 12], "reason": [7, 8], "why": 7, "stop": 7, "mean": [7, 8], "api": [7, 14], "limit": [7, 12], "length": 7, "becaus": 7, "function_cal": 7, "from_open_ai_object": 7, "open_ai_object": 7, "openaitokencount": 7, "model_nam": 7, "count": [7, 12, 13], "some": [7, 14], "3": [7, 8], "5": 7, "davinci": 7, "003": 7, "embed": 7, "ada": 7, "002": 7, "iter": [7, 10, 12, 13], "just": [7, 13], "_check_mandatory_azure_completion_kwarg": 7, "mandatori": 7, "significancerepeatedtrainingprun": 8, "alpha": 8, "float": [8, 10, 12], "0": [8, 9, 12], "1": [8, 13], "n_warmup_step": 8, "baseprun": 8, "pruner": 8, "statist": 8, "signific": 8, "heurist": 8, "decis": 8, "make": [8, 9], "It": [8, 10, 12, 14], "prune": 8, "repeat": 8, "train": [8, 13], "like": 8, "cross": [8, 13], "valid": [8, 13], "As": 8, "test": [8, 13], "t": 8, "our": 8, "experi": 8, "have": 8, "shown": 8, "aplha": 8, "between": [8, 12], "": [8, 9], "standard": 8, "assum": 8, "adjust": 8, "onc": [8, 12], "hyperparamet": 8, "those": 8, "work": 8, "basi": 8, "intermedi": 8, "epoch": 8, "contrast": 8, "precis": 8, "individu": 8, "fold": [8, 13], "below": 8, "minimalist": 8, "import": [8, 10], "log": 8, "numpi": 8, "np": 8, "sklearn": 8, "dataset": [8, 13], "load_iri": 8, "model_select": 8, "stratifiedkfold": 8, "ensembl": 8, "randomforestclassifi": 8, "metric": 8, "accuracy_scor": 8, "configur": 8, "logger": 8, "debug": 8, "output": [8, 10], "getlogg": 8, "addhandl": 8, "streamhandl": 8, "setlevel": 8, "x": [8, 9], "y": [8, 9], "target": 8, "def": 8, "trial": 8, "min_samples_split": 8, "suggest_int": 8, "2": 8, "n_estim": 8, "validation_result_list": 8, "skf": 8, "n_split": [8, 13], "fold_index": 8, "train_index": 8, "val_index": 8, "enumer": 8, "x_train": 8, "x_val": 8, "y_train": 8, "y_val": 8, "rf": 8, "fit": [8, 12], "y_pred": 8, "predict": 8, "acc": 8, "report": 8, "we": 8, "should": [8, 10], "should_prun": 8, "here": 8, "break": 8, "studi": 8, "create_studi": 8, "storag": 8, "sqlite": 8, "memori": 8, "study_nam": 8, "iris_cv": 8, "direct": 8, "maxim": 8, "load_if_exist": 8, "true": [8, 9, 11, 12], "sampler": 8, "tpesampl": 8, "multivari": 8, "add": 8, "optim": 8, "n_trial": 8, "level": 8, "aggress": 8, "smaller": 8, "stronger": 8, "differ": [8, 9, 12], "two": [8, 9, 10, 12], "distribut": 8, "disabl": 8, "reach": 8, "exce": 8, "step": [8, 9], "frozentri": 8, "judg": 8, "whether": 8, "note": 8, "suppos": 8, "librari": 8, "instead": 8, "interfac": 8, "mechan": 8, "take": 8, "copi": 8, "befor": [8, 12], "modifi": 8, "boolean": 8, "repres": 8, "matplotlib": 9, "boxplot": 9, "titl": 9, "xlabel": 9, "ylabel": 9, "vert": 9, "print": [9, 10], "diagram": 9, "pyplot": 9, "axi": 9, "box": [9, 14], "vertic": 9, "horizont": 9, "boxplot_dict": 9, "values_dict": 9, "form": [9, 12], "dictionari": 9, "save_last_figur": 9, "last": 9, "made": 9, "jupyt": 9, "notebook": 9, "same": 9, "cell": 9, "twin_axes_timeseries_plot": 9, "values_1": 9, "label_1": 9, "values_2": 9, "label_2": 9, "start_timestep_numb": 9, "shift_1": 9, "shift_2": 9, "label_x": 9, "color_1": 9, "tab": 9, "red": 9, "color_2": 9, "blue": 9, "twin": 9, "ax": 9, "timeseri": 9, "curv": 9, "array_lik": 9, "first": [9, 12], "second": 9, "point": 9, "time": [9, 12], "timestep": 9, "shift": 9, "posit": 9, "neg": 9, "color": 9, "jaccardsimilar": 10, "liter": 10, "de_cmc": 10, "en_ptb": 10, "somajobaseclass": 10, "calcul": [10, 12], "jaccard": 10, "similar": 10, "german": 10, "english": 10, "text1": 10, "text2": 10, "get_token_set": 10, "word": [10, 11], "directli": 10, "somajosentencesplitt": [10, 11], "sentenc": [10, 11], "tokenextractor": 10, "extract": 10, "extract_url_set": 10, "token_extractor": 10, "url_set": 10, "ist": 10, "ein": 10, "link": 10, "github": [10, 14], "urlswapp": 10, "url_pattern": 10, "swap": 10, "revers": 10, "replac": [10, 12], "extractor": 10, "pattern": 10, "One": [10, 12], "mark": 10, "place": 10, "put": 10, "reverse_swap_url": 10, "revert": 10, "were": 10, "unknown": 10, "swap_url": 10, "detoken": 10, "how": 10, "do": [10, 13], "extract_token_class_set": 10, "keep_token_class": 10, "keep": 10, "all": [10, 14], "kept": 10, "hug": [11, 13], "face": [11, 13], "textsplitt": 11, "somajo_sentence_splitt": 11, "ignore_overly_long_sent": 11, "alwai": 11, "whole": 11, "splitter": 11, "valueerror": [11, 12], "except": 11, "longer": 11, "simpli": 11, "ignor": 11, "detect": 12, "clean": 12, "invis": 12, "charact": 12, "special": 12, "whitespac": 12, "duplic": 12, "distanc": 12, "find": 12, "anomali": 12, "textdist": 12, "max_dimens": 12, "markup": 12, "unusu": 12, "multipl": 12, "again": 12, "dimens": 12, "greater": 12, "_normalize_char_count": 12, "normal": 12, "char": 12, "defaultdict": 12, "lazi": 12, "postprocess": 12, "manhattan": 12, "scipi": 12, "spatial": 12, "cityblock": 12, "most": 12, "commen": 12, "higher": 12, "least": 12, "_normalize_counter_to_defaultdict": 12, "devid": 12, "them": [12, 14], "clean_all_invisible_chars_and_whitespac": 12, "lead": 12, "trail": 12, "defin": 12, "constant": 12, "invisible_charact": 12, "special_whitespac": 12, "rteturn": 12, "has_invisible_charact": 12, "otherwis": 12, "has_special_whitespac": 12, "remove_invisible_charact": 12, "replace_multiple_whitespac": 12, "replace_special_whitespac": 12, "kfoldlabeleddataset": 13, "7": 13, "n_repeat": 13, "random_st": 13, "k": 13, "labeleddataset": 13, "labeled_dataset": 13, "stratification_label": 13, "encod": 13, "labe": 13, "pretrained_model_name_or_path": 13, "pathlik": 13, "insid": 13, "repo": 13, "huggingfac": 13, "machin": 14, "learn": 14, "python": 14, "packag": 14, "pypi": 14, "option": 14, "might": 14, "refer": 14, "repositori": 14, "licens": 14, "imprint": 14}, "objects": {"mltb2": [[1, 0, 0, "-", "arangodb"], [2, 0, 0, "-", "data"], [3, 0, 0, "-", "db"], [4, 0, 0, "-", "fasttext"], [5, 0, 0, "-", "files"], [6, 0, 0, "-", "md"], [7, 0, 0, "-", "openai"], [8, 0, 0, "-", "optuna"], [9, 0, 0, "-", "plot"], [10, 0, 0, "-", "somajo"], [11, 0, 0, "-", "somajo_transformers"], [12, 0, 0, "-", "text"], [13, 0, 0, "-", "transformers"]], "mltb2.arangodb": [[1, 1, 1, "", "ArangoBatchDataManager"]], "mltb2.arangodb.ArangoBatchDataManager": [[1, 2, 1, "", "_arango_client_factory"], [1, 2, 1, "", "_connection_factory"], [1, 2, 1, "", "from_config_file"], [1, 2, 1, "", "load_batch"], [1, 2, 1, "", "save_batch"]], "mltb2.data": [[2, 3, 1, "", "_load_colon_data"], [2, 3, 1, "", "_load_colon_label"], [2, 3, 1, "", "load_colon"], [2, 3, 1, "", "load_leukemia_big"], [2, 3, 1, "", "load_prostate"]], "mltb2.db": [[3, 1, 1, "", "AbstractBatchDataManager"], [3, 1, 1, "", "BatchDataProcessor"]], "mltb2.db.AbstractBatchDataManager": [[3, 2, 1, "", "load_batch"], [3, 2, 1, "", "save_batch"]], "mltb2.db.BatchDataProcessor": [[3, 2, 1, "", "run"]], "mltb2.fasttext": [[4, 1, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[4, 2, 1, "", "__call__"], [4, 2, 1, "", "get_model_path_and_download"]], "mltb2.files": [[5, 3, 1, "", "fetch_remote_file"], [5, 3, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[6, 1, 1, "", "MdTextSplitter"], [6, 3, 1, "", "_chunk_md_by_headline"], [6, 3, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[6, 2, 1, "", "__call__"]], "mltb2.openai": [[7, 1, 1, "", "OpenAiAzureChatCompletion"], [7, 1, 1, "", "OpenAiAzureCompletion"], [7, 1, 1, "", "OpenAiBaseCompletion"], [7, 1, 1, "", "OpenAiChatCompletion"], [7, 1, 1, "", "OpenAiCompletion"], [7, 1, 1, "", "OpenAiCompletionAnswer"], [7, 1, 1, "", "OpenAiTokenCounter"], [7, 3, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[7, 2, 1, "", "__call__"], [7, 2, 1, "", "_completion"], [7, 2, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[7, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[7, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[7, 2, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[7, 2, 1, "", "__call__"]], "mltb2.optuna": [[8, 1, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[8, 2, 1, "", "prune"]], "mltb2.plot": [[9, 3, 1, "", "boxplot"], [9, 3, 1, "", "boxplot_dict"], [9, 3, 1, "", "save_last_figure"], [9, 3, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[10, 1, 1, "", "JaccardSimilarity"], [10, 1, 1, "", "SoMaJoBaseClass"], [10, 1, 1, "", "SoMaJoSentenceSplitter"], [10, 1, 1, "", "TokenExtractor"], [10, 1, 1, "", "UrlSwapper"], [10, 3, 1, "", "detokenize"], [10, 3, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[10, 2, 1, "", "__call__"], [10, 2, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[10, 2, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[10, 2, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[10, 2, 1, "", "reverse_swap_urls"], [10, 2, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[11, 1, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[11, 2, 1, "", "__call__"]], "mltb2.text": [[12, 1, 1, "", "TextDistance"], [12, 3, 1, "", "_normalize_counter_to_defaultdict"], [12, 3, 1, "", "clean_all_invisible_chars_and_whitespaces"], [12, 3, 1, "", "has_invisible_characters"], [12, 3, 1, "", "has_special_whitespaces"], [12, 3, 1, "", "remove_invisible_characters"], [12, 3, 1, "", "replace_multiple_whitespaces"], [12, 3, 1, "", "replace_special_whitespaces"]], "mltb2.text.TextDistance": [[12, 2, 1, "", "_normalize_char_counter"], [12, 2, 1, "", "distance"], [12, 2, 1, "", "fit"]], "mltb2.transformers": [[13, 1, 1, "", "KFoldLabeledDataset"], [13, 1, 1, "", "LabeledDataset"], [13, 1, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[13, 2, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[13, 2, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "refer": 0, "arangodb": 1, "data": 2, "db": 3, "fasttext": 4, "file": 5, "md": 6, "openai": 7, "optuna": 8, "plot": 9, "somajo": 10, "somajo_transform": 11, "text": 12, "transform": 13, "mltb2": 14, "document": 14, "instal": 14, "content": 14}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "arangodb": [[1, "module-mltb2.arangodb"]], "data": [[2, "module-mltb2.data"]], "db": [[3, "module-mltb2.db"]], "fasttext": [[4, "module-mltb2.fasttext"]], "files": [[5, "module-mltb2.files"]], "md": [[6, "module-mltb2.md"]], "openai": [[7, "module-mltb2.openai"]], "optuna": [[8, "module-mltb2.optuna"]], "plot": [[9, "module-mltb2.plot"]], "somajo": [[10, "module-mltb2.somajo"]], "somajo_transformers": [[11, "module-mltb2.somajo_transformers"]], "text": [[12, "module-mltb2.text"]], "transformers": [[13, "module-mltb2.transformers"]], "MLTB2 Documentation": [[14, "mltb2-documentation"]], "Installation": [[14, "installation"]], "Content": [[14, "content"]]}, "indexentries": {"arangobatchdatamanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoBatchDataManager"]], "_arango_client_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._arango_client_factory"]], "_connection_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._connection_factory"]], "from_config_file() (mltb2.arangodb.arangobatchdatamanager class method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.from_config_file"]], "load_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.load_batch"]], "mltb2.arangodb": [[1, "module-mltb2.arangodb"]], "module": [[1, "module-mltb2.arangodb"], [2, "module-mltb2.data"], [3, "module-mltb2.db"], [4, "module-mltb2.fasttext"], [5, "module-mltb2.files"], [6, "module-mltb2.md"], [7, "module-mltb2.openai"], [8, "module-mltb2.optuna"], [9, "module-mltb2.plot"], [10, "module-mltb2.somajo"], [11, "module-mltb2.somajo_transformers"], [12, "module-mltb2.text"], [13, "module-mltb2.transformers"]], "save_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.save_batch"]], "_load_colon_data() (in module mltb2.data)": [[2, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[2, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[2, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[2, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[2, "mltb2.data.load_prostate"]], "mltb2.data": [[2, "module-mltb2.data"]], "abstractbatchdatamanager (class in mltb2.db)": [[3, "mltb2.db.AbstractBatchDataManager"]], "batchdataprocessor (class in mltb2.db)": [[3, "mltb2.db.BatchDataProcessor"]], "load_batch() (mltb2.db.abstractbatchdatamanager method)": [[3, "mltb2.db.AbstractBatchDataManager.load_batch"]], "mltb2.db": [[3, "module-mltb2.db"]], "run() (mltb2.db.batchdataprocessor method)": [[3, "mltb2.db.BatchDataProcessor.run"]], "save_batch() (mltb2.db.abstractbatchdatamanager method)": [[3, "mltb2.db.AbstractBatchDataManager.save_batch"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[4, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[4, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[4, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[4, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[5, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[5, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[5, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[6, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[6, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[6, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[6, "mltb2.md.chunk_md"]], "mltb2.md": [[6, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[7, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[7, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[7, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[7, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[7, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[7, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[7, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[7, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[7, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[8, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[8, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[8, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[9, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[9, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[9, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[9, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[9, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[10, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[10, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[10, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[10, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[10, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[10, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[10, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[10, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[10, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[10, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[10, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[10, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[10, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[10, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[11, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[11, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[11, "module-mltb2.somajo_transformers"]], "textdistance (class in mltb2.text)": [[12, "mltb2.text.TextDistance"]], "_normalize_char_counter() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance._normalize_char_counter"]], "_normalize_counter_to_defaultdict() (in module mltb2.text)": [[12, "mltb2.text._normalize_counter_to_defaultdict"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "distance() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance.distance"]], "fit() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance.fit"]], "has_invisible_characters() (in module mltb2.text)": [[12, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[12, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[12, "mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[13, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[13, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[13, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[13, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[13, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[13, "mltb2.transformers.KFoldLabeledDataset.split"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["api-reference", "api-reference/arangodb", "api-reference/data", "api-reference/db", "api-reference/fasttext", "api-reference/files", "api-reference/md", "api-reference/openai", "api-reference/optuna", "api-reference/plot", "api-reference/somajo", "api-reference/somajo_transformers", "api-reference/text", "api-reference/transformers", "index"], "filenames": ["api-reference.rst", "api-reference/arangodb.rst", "api-reference/data.rst", "api-reference/db.rst", "api-reference/fasttext.rst", "api-reference/files.rst", "api-reference/md.rst", "api-reference/openai.rst", "api-reference/optuna.rst", "api-reference/plot.rst", "api-reference/somajo.rst", "api-reference/somajo_transformers.rst", "api-reference/text.rst", "api-reference/transformers.rst", "index.rst"], "titles": ["API Reference", "arangodb", "data", "db", "fasttext", "files", "md", "openai", "optuna", "plot", "somajo", "somajo_transformers", "text", "transformers", "MLTB2 Documentation"], "terms": {"arangodb": [0, 14], "data": [0, 1, 3, 5, 8, 13, 14], "db": [0, 8, 14], "fasttext": [0, 14], "file": [0, 1, 2, 4, 7, 14], "md": [0, 14], "openai": [0, 14], "optuna": [0, 14], "plot": [0, 14], "somajo": [0, 11, 14], "somajo_transform": [0, 14], "text": [0, 4, 6, 7, 10, 11, 13, 14], "transform": [0, 11, 14], "util": [1, 3, 5, 13], "modul": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "pip": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "instal": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13], "necessari": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "depend": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "mltb2": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "class": [1, 3, 4, 6, 7, 8, 10, 11, 12, 13], "arangobatchdatamanag": 1, "host": [1, 13], "str": [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 13], "sequenc": [1, 3], "db_name": 1, "usernam": 1, "password": 1, "collection_nam": 1, "attribute_nam": 1, "batch_siz": 1, "int": [1, 4, 6, 7, 8, 9, 11, 12, 13], "20": [1, 8], "aql_overwrit": 1, "none": [1, 2, 3, 5, 7, 9, 10, 12, 13], "sourc": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "base": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "abstractbatchdatamanag": [1, 3], "implement": [1, 3, 8, 12], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], "databas": [1, 3], "name": [1, 7], "document": 1, "from": [1, 2, 3, 4, 5, 7, 8, 10, 12], "collect": 1, "ar": [1, 2, 4, 6, 7, 8, 12, 13], "process": [1, 3, 6, 7, 10, 11, 12, 13], "attribut": 1, "i": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], "check": [1, 7, 8, 12], "alreadi": [1, 6], "If": [1, 2, 5, 7, 9, 10, 11, 12, 13], "present": 1, "avail": [1, 14], "consid": 1, "The": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "batch": [1, 3], "size": 1, "aql": 1, "string": [1, 7, 10], "overwrit": [1, 7], "default": [1, 2, 5, 9], "_arango_client_factori": 1, "arangocli": 1, "creat": [1, 5, 7, 9], "an": [1, 3, 4, 7, 8, 10, 11, 13], "client": 1, "return": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "type": [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], "_connection_factori": 1, "arango_cli": 1, "standarddatabas": 1, "connect": 1, "classmethod": [1, 7], "from_config_fil": 1, "config_file_nam": 1, "construct": [1, 7], "config": 1, "must": [1, 7, 8, 12], "contain": [1, 2, 4, 10, 12, 13], "valu": [1, 8, 9, 12], "exampl": [1, 7, 8, 10, 12], "http": [1, 2, 10], "com": [1, 10], "my_ml_databas": 1, "my_usernam": 1, "secret": 1, "my_ml_data_collect": 1, "processing_metadata": 1, "100": [1, 8, 12], "path": [1, 4, 5, 13], "load_batch": [1, 3], "load": [1, 2, 3], "save_batch": [1, 3], "save": [1, 3, 5, 9], "_check_config_kei": 1, "dict": [1, 4, 7], "expected_config_kei": 1, "all": [1, 10, 14], "expect": 1, "kei": 1, "arango_collection_backup": 1, "commandlin": 1, "tool": [1, 2, 4, 8, 9, 10, 11, 12, 13, 14], "do": [1, 10, 13], "backup": 1, "written": 1, "gzip": 1, "compress": 1, "jsonl": 1, "current": 1, "work": [1, 8], "directori": [1, 2, 5, 13], "run": [1, 3, 7], "arango": 1, "col": 1, "h": 1, "get": [1, 4, 7, 10], "command": 1, "line": 1, "help": 1, "offer": [2, 4, 8, 10, 11, 12, 13], "follow": [2, 7, 12], "tabular": 2, "set": [2, 8, 10, 13], "biolog": 2, "medic": 2, "domain": 2, "support": [2, 12], "colon": 2, "genom": 2, "pub": 2, "princeton": 2, "edu": 2, "oncologi": 2, "affydata": 2, "index": [2, 14], "html": [2, 12], "prostat": 2, "web": 2, "stanford": 2, "hasti": 2, "casi_fil": 2, "leukemia_big": 2, "leukemia": 2, "after": [2, 12], "internet": 2, "pars": 2, "convert": [2, 10], "cach": 2, "determin": [2, 5], "get_and_create_mltb2_data_dir": [2, 5], "_load_colon_data": 2, "datafram": 2, "label": [2, 9, 13], "also": [2, 7], "see": [2, 8], "panda": 2, "_load_colon_label": 2, "seri": 2, "load_colon": 2, "mltb2_base_data_dir": [2, 5], "tupl": [2, 10], "user": [2, 5, 8], "platformdir": [2, 5], "user_data_dir": [2, 5], "load_leukemia_big": 2, "big": 2, "load_prost": 2, "abc": [3, 7, 10], "abstract": [3, 7, 10], "respect": 3, "intend": 3, "conjunct": 3, "batchdataprocessor": 3, "data_manag": 3, "process_batch_callback": 3, "callabl": 3, "object": [3, 4, 6, 7, 8, 10, 11, 12, 13], "manag": 3, "A": [3, 4, 8, 14], "callback": 3, "function": [3, 5, 7, 8, 9, 12], "one": [3, 9, 10], "done": [3, 8], "until": [3, 8], "empti": 3, "For": [3, 8, 9, 12], "each": 3, "call": [3, 7, 8, 9, 12], "fasttextlanguageidentif": 4, "identifi": 4, "languag": [4, 10], "__call__": [4, 6, 7, 10, 11, 13], "num_lang": 4, "10": [4, 8], "given": [4, 5, 8, 12], "which": [4, 5, 7, 8, 10, 12, 13], "recogn": 4, "number": [4, 6, 7, 8, 9, 10, 11, 12, 13], "probabl": 4, "more": [4, 8, 9, 12], "than": [4, 11, 12], "element": 4, "so": 4, "guarante": 4, "you": [4, 7, 8, 14], "want": 4, "includ": 4, "case": [4, 7], "when": [4, 7], "veri": 4, "low": 4, "possibl": 4, "af": 4, "al": 4, "am": 4, "arz": 4, "ast": 4, "av": 4, "az": 4, "azb": 4, "ba": 4, "bar": 4, "bcl": 4, "bg": 4, "bh": 4, "bn": 4, "bo": 4, "bpy": 4, "br": 4, "b": 4, "bxr": 4, "ca": 4, "cbk": 4, "ce": 4, "ceb": 4, "ckb": 4, "co": [4, 13], "c": 4, "cv": [4, 8], "cy": 4, "da": [4, 10], "de": [4, 10], "diq": 4, "dsb": 4, "dty": 4, "dv": 4, "el": 4, "eml": 4, "en": 4, "eo": 4, "e": 4, "et": 4, "eu": 4, "fa": 4, "fi": 4, "fr": 4, "frr": 4, "fy": 4, "ga": 4, "gd": 4, "gl": 4, "gn": 4, "gom": 4, "gu": 4, "gv": 4, "he": 4, "hi": 4, "hif": 4, "hr": 4, "hsb": 4, "ht": 4, "hu": 4, "hy": 4, "ia": 4, "id": [4, 13], "ie": 4, "ilo": 4, "io": 4, "ja": 4, "jbo": 4, "jv": 4, "ka": 4, "kk": 4, "km": 4, "kn": 4, "ko": 4, "krc": 4, "ku": 4, "kv": 4, "kw": 4, "ky": 4, "la": 4, "lb": 4, "lez": 4, "li": 4, "lmo": 4, "lo": 4, "lrc": 4, "lt": 4, "lv": 4, "mai": 4, "mg": 4, "mhr": 4, "min": 4, "mk": 4, "ml": 4, "mn": 4, "mr": 4, "mrj": 4, "m": 4, "mt": 4, "mwl": 4, "my": 4, "myv": 4, "mzn": 4, "nah": 4, "nap": 4, "nd": 4, "ne": 4, "new": 4, "nl": 4, "nn": 4, "oc": 4, "o": 4, "pa": 4, "pam": 4, "pfl": 4, "pl": 4, "pm": 4, "pnb": 4, "p": 4, "pt": 4, "qu": 4, "rm": 4, "ro": 4, "ru": 4, "rue": 4, "sa": 4, "sah": 4, "sc": 4, "scn": 4, "sco": 4, "sd": 4, "sh": 4, "si": 4, "sk": 4, "sl": 4, "sq": 4, "sr": 4, "su": 4, "sv": 4, "sw": 4, "ta": 4, "te": 4, "tg": 4, "th": 4, "tk": 4, "tl": 4, "tr": 4, "tt": 4, "tyv": 4, "ug": 4, "uk": 4, "ur": 4, "uz": 4, "vec": 4, "vep": 4, "vi": 4, "vl": 4, "vo": 4, "wa": [4, 7, 12, 13], "war": 4, "wuu": 4, "xal": 4, "xmf": 4, "yi": 4, "yo": 4, "yue": 4, "zh": 4, "static": 4, "get_model_path_and_download": 4, "model": [4, 7, 8, 13], "download": 4, "need": 4, "full": [4, 5, 7], "provid": [5, 8], "other": [5, 12], "fetch_remote_fil": 5, "dirnam": 5, "filenam": [5, 9], "url": [5, 10], "sha256_checksum": 5, "fetch": 5, "remot": 5, "where": [5, 10], "under": 5, "sha256": 5, "checksum": 5, "rais": [5, 11, 12], "ioerror": 5, "wrong": 5, "dir": 5, "exact": 5, "folder": 5, "append": [5, 8], "markdown": 6, "specif": [6, 7, 10, 11, 12, 13], "mdtextsplitt": 6, "max_token": [6, 11], "transformers_token_count": [6, 11], "transformerstokencount": [6, 11, 13], "show_progress_bar": [6, 7, 10, 11, 12, 13], "bool": [6, 7, 8, 9, 10, 11, 12, 13], "fals": [6, 7, 9, 10, 11, 12, 13], "split": [6, 8, 10, 11, 13], "section": [6, 11], "specifi": [6, 7, 11], "maximum": [6, 11, 12], "token": [6, 7, 10, 11, 13], "doe": [6, 8, 9, 11], "divid": [6, 11], "head": 6, "correspond": 6, "paragraph": 6, "per": [6, 8, 11], "can": [6, 7, 9, 12, 14], "onli": [6, 7, 8, 12], "exceed": 6, "singl": [6, 9, 12], "chunk": 6, "larger": [6, 8], "counter": [6, 11, 12], "show": [6, 7, 10, 11, 12, 13], "progressbar": [6, 7, 10, 11, 12, 13], "dure": [6, 7, 10, 11, 12, 13], "md_text": 6, "list": [6, 7, 10, 11, 13], "_chunk_md_by_headlin": 6, "headlin": 6, "chunk_md": 6, "merg": 6, "isol": 6, "subsequ": 6, "end": 6, "without": [6, 7], "content": 6, "remov": [6, 12], "openaiazurechatcomplet": 7, "completion_kwarg": 7, "ani": 7, "openaichatcomplet": 7, "azur": 7, "chat": 7, "complet": 7, "openaibasecomplet": 7, "from_yaml": 7, "kwarg": 7, "properti": 7, "api_typ": 7, "api_vers": 7, "api_bas": 7, "engin": 7, "quickstart": 7, "start": 7, "gpt": 7, "35": 7, "turbo": 7, "4": [7, 8], "servic": 7, "openaiazurecomplet": 7, "openaicomplet": 7, "non": 7, "gener": [7, 13], "prompt": 7, "map": 7, "openaicompletionansw": 7, "llm": 7, "In": [7, 8], "allow": 7, "chang": 7, "temperatur": 7, "_complet": 7, "completion_kwargs_for_this_cal": 7, "openaiobject": 7, "method": [7, 8, 12], "yaml_fil": 7, "yaml": 7, "prompt_token": 7, "completion_token": 7, "total_token": 7, "finish_reason": 7, "answer": 7, "result": [7, 8], "ha": [7, 9], "been": 7, "total": [7, 12], "reason": [7, 8], "why": 7, "stop": 7, "mean": [7, 8], "api": [7, 14], "limit": [7, 12], "length": 7, "becaus": 7, "function_cal": 7, "from_open_ai_object": 7, "open_ai_object": 7, "openaitokencount": 7, "model_nam": 7, "count": [7, 12, 13], "some": [7, 14], "3": [7, 8], "5": 7, "davinci": 7, "003": 7, "embed": 7, "ada": 7, "002": 7, "iter": [7, 10, 12, 13], "just": [7, 13], "_check_mandatory_azure_completion_kwarg": 7, "mandatori": 7, "significancerepeatedtrainingprun": 8, "alpha": 8, "float": [8, 10, 12], "0": [8, 9, 12], "1": [8, 13], "n_warmup_step": 8, "baseprun": 8, "pruner": 8, "statist": 8, "signific": 8, "heurist": 8, "decis": 8, "make": [8, 9], "It": [8, 10, 12, 14], "prune": 8, "repeat": 8, "train": [8, 13], "like": 8, "cross": [8, 13], "valid": [8, 13], "As": 8, "test": [8, 13], "t": 8, "our": 8, "experi": 8, "have": 8, "shown": 8, "aplha": 8, "between": [8, 12], "": [8, 9], "standard": 8, "assum": 8, "adjust": 8, "onc": [8, 12], "hyperparamet": 8, "those": 8, "basi": 8, "intermedi": 8, "epoch": 8, "contrast": 8, "precis": 8, "individu": 8, "fold": [8, 13], "below": 8, "minimalist": 8, "import": [8, 10], "log": 8, "numpi": 8, "np": 8, "sklearn": 8, "dataset": [8, 13], "load_iri": 8, "model_select": 8, "stratifiedkfold": 8, "ensembl": 8, "randomforestclassifi": 8, "metric": 8, "accuracy_scor": 8, "configur": 8, "logger": 8, "debug": 8, "output": [8, 10], "getlogg": 8, "addhandl": 8, "streamhandl": 8, "setlevel": 8, "x": [8, 9], "y": [8, 9], "target": 8, "def": 8, "trial": 8, "min_samples_split": 8, "suggest_int": 8, "2": 8, "n_estim": 8, "validation_result_list": 8, "skf": 8, "n_split": [8, 13], "fold_index": 8, "train_index": 8, "val_index": 8, "enumer": 8, "x_train": 8, "x_val": 8, "y_train": 8, "y_val": 8, "rf": 8, "fit": [8, 12], "y_pred": 8, "predict": 8, "acc": 8, "report": 8, "we": 8, "should": [8, 10], "should_prun": 8, "here": 8, "break": 8, "studi": 8, "create_studi": 8, "storag": 8, "sqlite": 8, "memori": 8, "study_nam": 8, "iris_cv": 8, "direct": 8, "maxim": 8, "load_if_exist": 8, "true": [8, 9, 11, 12], "sampler": 8, "tpesampl": 8, "multivari": 8, "add": 8, "optim": 8, "n_trial": 8, "level": 8, "aggress": 8, "smaller": 8, "stronger": 8, "differ": [8, 9, 12], "two": [8, 9, 10, 12], "distribut": 8, "disabl": 8, "reach": 8, "exce": 8, "step": [8, 9], "frozentri": 8, "judg": 8, "whether": 8, "note": 8, "suppos": 8, "librari": 8, "instead": 8, "interfac": 8, "mechan": 8, "take": 8, "copi": 8, "befor": [8, 12], "modifi": 8, "boolean": 8, "repres": 8, "matplotlib": 9, "boxplot": 9, "titl": 9, "xlabel": 9, "ylabel": 9, "vert": 9, "print": [9, 10], "diagram": 9, "pyplot": 9, "axi": 9, "box": [9, 14], "vertic": 9, "horizont": 9, "boxplot_dict": 9, "values_dict": 9, "form": [9, 12], "dictionari": 9, "save_last_figur": 9, "last": 9, "made": 9, "jupyt": 9, "notebook": 9, "same": 9, "cell": 9, "twin_axes_timeseries_plot": 9, "values_1": 9, "label_1": 9, "values_2": 9, "label_2": 9, "start_timestep_numb": 9, "shift_1": 9, "shift_2": 9, "label_x": 9, "color_1": 9, "tab": 9, "red": 9, "color_2": 9, "blue": 9, "twin": 9, "ax": 9, "timeseri": 9, "curv": 9, "array_lik": 9, "first": [9, 12], "second": 9, "point": 9, "time": [9, 12], "timestep": 9, "shift": 9, "posit": 9, "neg": 9, "color": 9, "jaccardsimilar": 10, "liter": 10, "de_cmc": 10, "en_ptb": 10, "somajobaseclass": 10, "calcul": [10, 12], "jaccard": 10, "similar": 10, "german": 10, "english": 10, "text1": 10, "text2": 10, "get_token_set": 10, "word": [10, 11], "directli": 10, "somajosentencesplitt": [10, 11], "sentenc": [10, 11], "tokenextractor": 10, "extract": 10, "extract_url_set": 10, "token_extractor": 10, "url_set": 10, "ist": 10, "ein": 10, "link": 10, "github": [10, 14], "urlswapp": 10, "url_pattern": 10, "swap": 10, "revers": 10, "replac": [10, 12], "extractor": 10, "pattern": 10, "One": [10, 12], "mark": 10, "place": 10, "put": 10, "reverse_swap_url": 10, "revert": 10, "were": 10, "unknown": 10, "swap_url": 10, "detoken": 10, "how": 10, "extract_token_class_set": 10, "keep_token_class": 10, "keep": 10, "kept": 10, "hug": [11, 13], "face": [11, 13], "textsplitt": 11, "somajo_sentence_splitt": 11, "ignore_overly_long_sent": 11, "alwai": 11, "whole": 11, "splitter": 11, "valueerror": [11, 12], "except": 11, "longer": 11, "simpli": 11, "ignor": 11, "detect": 12, "clean": 12, "invis": 12, "charact": 12, "special": 12, "whitespac": 12, "duplic": 12, "distanc": 12, "find": 12, "anomali": 12, "textdist": 12, "max_dimens": 12, "markup": 12, "unusu": 12, "multipl": 12, "again": 12, "dimens": 12, "greater": 12, "_normalize_char_count": 12, "normal": 12, "char": 12, "defaultdict": 12, "lazi": 12, "postprocess": 12, "manhattan": 12, "scipi": 12, "spatial": 12, "cityblock": 12, "most": 12, "commen": 12, "higher": 12, "least": 12, "_normalize_counter_to_defaultdict": 12, "devid": 12, "them": [12, 14], "clean_all_invisible_chars_and_whitespac": 12, "lead": 12, "trail": 12, "defin": 12, "constant": 12, "invisible_charact": 12, "special_whitespac": 12, "rteturn": 12, "has_invisible_charact": 12, "otherwis": 12, "has_special_whitespac": 12, "remove_invisible_charact": 12, "replace_multiple_whitespac": 12, "replace_special_whitespac": 12, "kfoldlabeleddataset": 13, "7": 13, "n_repeat": 13, "random_st": 13, "k": 13, "labeleddataset": 13, "labeled_dataset": 13, "stratification_label": 13, "encod": 13, "labe": 13, "pretrained_model_name_or_path": 13, "pathlik": 13, "insid": 13, "repo": 13, "huggingfac": 13, "machin": 14, "learn": 14, "python": 14, "packag": 14, "pypi": 14, "option": 14, "might": 14, "refer": 14, "repositori": 14, "licens": 14, "imprint": 14}, "objects": {"mltb2": [[1, 0, 0, "-", "arangodb"], [2, 0, 0, "-", "data"], [3, 0, 0, "-", "db"], [4, 0, 0, "-", "fasttext"], [5, 0, 0, "-", "files"], [6, 0, 0, "-", "md"], [7, 0, 0, "-", "openai"], [8, 0, 0, "-", "optuna"], [9, 0, 0, "-", "plot"], [10, 0, 0, "-", "somajo"], [11, 0, 0, "-", "somajo_transformers"], [12, 0, 0, "-", "text"], [13, 0, 0, "-", "transformers"]], "mltb2.arangodb": [[1, 1, 1, "", "ArangoBatchDataManager"], [1, 3, 1, "", "_check_config_keys"], [1, 3, 1, "", "arango_collection_backup"]], "mltb2.arangodb.ArangoBatchDataManager": [[1, 2, 1, "", "_arango_client_factory"], [1, 2, 1, "", "_connection_factory"], [1, 2, 1, "", "from_config_file"], [1, 2, 1, "", "load_batch"], [1, 2, 1, "", "save_batch"]], "mltb2.data": [[2, 3, 1, "", "_load_colon_data"], [2, 3, 1, "", "_load_colon_label"], [2, 3, 1, "", "load_colon"], [2, 3, 1, "", "load_leukemia_big"], [2, 3, 1, "", "load_prostate"]], "mltb2.db": [[3, 1, 1, "", "AbstractBatchDataManager"], [3, 1, 1, "", "BatchDataProcessor"]], "mltb2.db.AbstractBatchDataManager": [[3, 2, 1, "", "load_batch"], [3, 2, 1, "", "save_batch"]], "mltb2.db.BatchDataProcessor": [[3, 2, 1, "", "run"]], "mltb2.fasttext": [[4, 1, 1, "", "FastTextLanguageIdentification"]], "mltb2.fasttext.FastTextLanguageIdentification": [[4, 2, 1, "", "__call__"], [4, 2, 1, "", "get_model_path_and_download"]], "mltb2.files": [[5, 3, 1, "", "fetch_remote_file"], [5, 3, 1, "", "get_and_create_mltb2_data_dir"]], "mltb2.md": [[6, 1, 1, "", "MdTextSplitter"], [6, 3, 1, "", "_chunk_md_by_headline"], [6, 3, 1, "", "chunk_md"]], "mltb2.md.MdTextSplitter": [[6, 2, 1, "", "__call__"]], "mltb2.openai": [[7, 1, 1, "", "OpenAiAzureChatCompletion"], [7, 1, 1, "", "OpenAiAzureCompletion"], [7, 1, 1, "", "OpenAiBaseCompletion"], [7, 1, 1, "", "OpenAiChatCompletion"], [7, 1, 1, "", "OpenAiCompletion"], [7, 1, 1, "", "OpenAiCompletionAnswer"], [7, 1, 1, "", "OpenAiTokenCounter"], [7, 3, 1, "", "_check_mandatory_azure_completion_kwargs"]], "mltb2.openai.OpenAiBaseCompletion": [[7, 2, 1, "", "__call__"], [7, 2, 1, "", "_completion"], [7, 2, 1, "", "from_yaml"]], "mltb2.openai.OpenAiChatCompletion": [[7, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletion": [[7, 2, 1, "", "_completion"]], "mltb2.openai.OpenAiCompletionAnswer": [[7, 2, 1, "", "from_open_ai_object"]], "mltb2.openai.OpenAiTokenCounter": [[7, 2, 1, "", "__call__"]], "mltb2.optuna": [[8, 1, 1, "", "SignificanceRepeatedTrainingPruner"]], "mltb2.optuna.SignificanceRepeatedTrainingPruner": [[8, 2, 1, "", "prune"]], "mltb2.plot": [[9, 3, 1, "", "boxplot"], [9, 3, 1, "", "boxplot_dict"], [9, 3, 1, "", "save_last_figure"], [9, 3, 1, "", "twin_axes_timeseries_plot"]], "mltb2.somajo": [[10, 1, 1, "", "JaccardSimilarity"], [10, 1, 1, "", "SoMaJoBaseClass"], [10, 1, 1, "", "SoMaJoSentenceSplitter"], [10, 1, 1, "", "TokenExtractor"], [10, 1, 1, "", "UrlSwapper"], [10, 3, 1, "", "detokenize"], [10, 3, 1, "", "extract_token_class_set"]], "mltb2.somajo.JaccardSimilarity": [[10, 2, 1, "", "__call__"], [10, 2, 1, "", "get_token_set"]], "mltb2.somajo.SoMaJoSentenceSplitter": [[10, 2, 1, "", "__call__"]], "mltb2.somajo.TokenExtractor": [[10, 2, 1, "", "extract_url_set"]], "mltb2.somajo.UrlSwapper": [[10, 2, 1, "", "reverse_swap_urls"], [10, 2, 1, "", "swap_urls"]], "mltb2.somajo_transformers": [[11, 1, 1, "", "TextSplitter"]], "mltb2.somajo_transformers.TextSplitter": [[11, 2, 1, "", "__call__"]], "mltb2.text": [[12, 1, 1, "", "TextDistance"], [12, 3, 1, "", "_normalize_counter_to_defaultdict"], [12, 3, 1, "", "clean_all_invisible_chars_and_whitespaces"], [12, 3, 1, "", "has_invisible_characters"], [12, 3, 1, "", "has_special_whitespaces"], [12, 3, 1, "", "remove_invisible_characters"], [12, 3, 1, "", "replace_multiple_whitespaces"], [12, 3, 1, "", "replace_special_whitespaces"]], "mltb2.text.TextDistance": [[12, 2, 1, "", "_normalize_char_counter"], [12, 2, 1, "", "distance"], [12, 2, 1, "", "fit"]], "mltb2.transformers": [[13, 1, 1, "", "KFoldLabeledDataset"], [13, 1, 1, "", "LabeledDataset"], [13, 1, 1, "", "TransformersTokenCounter"]], "mltb2.transformers.KFoldLabeledDataset": [[13, 2, 1, "", "split"]], "mltb2.transformers.TransformersTokenCounter": [[13, 2, 1, "", "__call__"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"]}, "titleterms": {"api": 0, "refer": 0, "arangodb": 1, "data": 2, "db": 3, "fasttext": 4, "file": 5, "md": 6, "openai": 7, "optuna": 8, "plot": 9, "somajo": 10, "somajo_transform": 11, "text": 12, "transform": 13, "mltb2": 14, "document": 14, "instal": 14, "content": 14}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx": 58}, "alltitles": {"API Reference": [[0, "api-reference"]], "arangodb": [[1, "module-mltb2.arangodb"]], "data": [[2, "module-mltb2.data"]], "db": [[3, "module-mltb2.db"]], "fasttext": [[4, "module-mltb2.fasttext"]], "files": [[5, "module-mltb2.files"]], "md": [[6, "module-mltb2.md"]], "openai": [[7, "module-mltb2.openai"]], "optuna": [[8, "module-mltb2.optuna"]], "plot": [[9, "module-mltb2.plot"]], "somajo": [[10, "module-mltb2.somajo"]], "somajo_transformers": [[11, "module-mltb2.somajo_transformers"]], "text": [[12, "module-mltb2.text"]], "transformers": [[13, "module-mltb2.transformers"]], "MLTB2 Documentation": [[14, "mltb2-documentation"]], "Installation": [[14, "installation"]], "Content": [[14, "content"]]}, "indexentries": {"arangobatchdatamanager (class in mltb2.arangodb)": [[1, "mltb2.arangodb.ArangoBatchDataManager"]], "_arango_client_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._arango_client_factory"]], "_check_config_keys() (in module mltb2.arangodb)": [[1, "mltb2.arangodb._check_config_keys"]], "_connection_factory() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager._connection_factory"]], "arango_collection_backup() (in module mltb2.arangodb)": [[1, "mltb2.arangodb.arango_collection_backup"]], "from_config_file() (mltb2.arangodb.arangobatchdatamanager class method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.from_config_file"]], "load_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.load_batch"]], "mltb2.arangodb": [[1, "module-mltb2.arangodb"]], "module": [[1, "module-mltb2.arangodb"], [2, "module-mltb2.data"], [3, "module-mltb2.db"], [4, "module-mltb2.fasttext"], [5, "module-mltb2.files"], [6, "module-mltb2.md"], [7, "module-mltb2.openai"], [8, "module-mltb2.optuna"], [9, "module-mltb2.plot"], [10, "module-mltb2.somajo"], [11, "module-mltb2.somajo_transformers"], [12, "module-mltb2.text"], [13, "module-mltb2.transformers"]], "save_batch() (mltb2.arangodb.arangobatchdatamanager method)": [[1, "mltb2.arangodb.ArangoBatchDataManager.save_batch"]], "_load_colon_data() (in module mltb2.data)": [[2, "mltb2.data._load_colon_data"]], "_load_colon_label() (in module mltb2.data)": [[2, "mltb2.data._load_colon_label"]], "load_colon() (in module mltb2.data)": [[2, "mltb2.data.load_colon"]], "load_leukemia_big() (in module mltb2.data)": [[2, "mltb2.data.load_leukemia_big"]], "load_prostate() (in module mltb2.data)": [[2, "mltb2.data.load_prostate"]], "mltb2.data": [[2, "module-mltb2.data"]], "abstractbatchdatamanager (class in mltb2.db)": [[3, "mltb2.db.AbstractBatchDataManager"]], "batchdataprocessor (class in mltb2.db)": [[3, "mltb2.db.BatchDataProcessor"]], "load_batch() (mltb2.db.abstractbatchdatamanager method)": [[3, "mltb2.db.AbstractBatchDataManager.load_batch"]], "mltb2.db": [[3, "module-mltb2.db"]], "run() (mltb2.db.batchdataprocessor method)": [[3, "mltb2.db.BatchDataProcessor.run"]], "save_batch() (mltb2.db.abstractbatchdatamanager method)": [[3, "mltb2.db.AbstractBatchDataManager.save_batch"]], "fasttextlanguageidentification (class in mltb2.fasttext)": [[4, "mltb2.fasttext.FastTextLanguageIdentification"]], "__call__() (mltb2.fasttext.fasttextlanguageidentification method)": [[4, "mltb2.fasttext.FastTextLanguageIdentification.__call__"]], "get_model_path_and_download() (mltb2.fasttext.fasttextlanguageidentification static method)": [[4, "mltb2.fasttext.FastTextLanguageIdentification.get_model_path_and_download"]], "mltb2.fasttext": [[4, "module-mltb2.fasttext"]], "fetch_remote_file() (in module mltb2.files)": [[5, "mltb2.files.fetch_remote_file"]], "get_and_create_mltb2_data_dir() (in module mltb2.files)": [[5, "mltb2.files.get_and_create_mltb2_data_dir"]], "mltb2.files": [[5, "module-mltb2.files"]], "mdtextsplitter (class in mltb2.md)": [[6, "mltb2.md.MdTextSplitter"]], "__call__() (mltb2.md.mdtextsplitter method)": [[6, "mltb2.md.MdTextSplitter.__call__"]], "_chunk_md_by_headline() (in module mltb2.md)": [[6, "mltb2.md._chunk_md_by_headline"]], "chunk_md() (in module mltb2.md)": [[6, "mltb2.md.chunk_md"]], "mltb2.md": [[6, "module-mltb2.md"]], "openaiazurechatcompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiAzureChatCompletion"]], "openaiazurecompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiAzureCompletion"]], "openaibasecompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiBaseCompletion"]], "openaichatcompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiChatCompletion"]], "openaicompletion (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiCompletion"]], "openaicompletionanswer (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiCompletionAnswer"]], "openaitokencounter (class in mltb2.openai)": [[7, "mltb2.openai.OpenAiTokenCounter"]], "__call__() (mltb2.openai.openaibasecompletion method)": [[7, "mltb2.openai.OpenAiBaseCompletion.__call__"]], "__call__() (mltb2.openai.openaitokencounter method)": [[7, "mltb2.openai.OpenAiTokenCounter.__call__"]], "_check_mandatory_azure_completion_kwargs() (in module mltb2.openai)": [[7, "mltb2.openai._check_mandatory_azure_completion_kwargs"]], "_completion() (mltb2.openai.openaibasecompletion method)": [[7, "mltb2.openai.OpenAiBaseCompletion._completion"]], "_completion() (mltb2.openai.openaichatcompletion method)": [[7, "mltb2.openai.OpenAiChatCompletion._completion"]], "_completion() (mltb2.openai.openaicompletion method)": [[7, "mltb2.openai.OpenAiCompletion._completion"]], "from_open_ai_object() (mltb2.openai.openaicompletionanswer class method)": [[7, "mltb2.openai.OpenAiCompletionAnswer.from_open_ai_object"]], "from_yaml() (mltb2.openai.openaibasecompletion class method)": [[7, "mltb2.openai.OpenAiBaseCompletion.from_yaml"]], "mltb2.openai": [[7, "module-mltb2.openai"]], "significancerepeatedtrainingpruner (class in mltb2.optuna)": [[8, "mltb2.optuna.SignificanceRepeatedTrainingPruner"]], "mltb2.optuna": [[8, "module-mltb2.optuna"]], "prune() (mltb2.optuna.significancerepeatedtrainingpruner method)": [[8, "mltb2.optuna.SignificanceRepeatedTrainingPruner.prune"]], "boxplot() (in module mltb2.plot)": [[9, "mltb2.plot.boxplot"]], "boxplot_dict() (in module mltb2.plot)": [[9, "mltb2.plot.boxplot_dict"]], "mltb2.plot": [[9, "module-mltb2.plot"]], "save_last_figure() (in module mltb2.plot)": [[9, "mltb2.plot.save_last_figure"]], "twin_axes_timeseries_plot() (in module mltb2.plot)": [[9, "mltb2.plot.twin_axes_timeseries_plot"]], "jaccardsimilarity (class in mltb2.somajo)": [[10, "mltb2.somajo.JaccardSimilarity"]], "somajobaseclass (class in mltb2.somajo)": [[10, "mltb2.somajo.SoMaJoBaseClass"]], "somajosentencesplitter (class in mltb2.somajo)": [[10, "mltb2.somajo.SoMaJoSentenceSplitter"]], "tokenextractor (class in mltb2.somajo)": [[10, "mltb2.somajo.TokenExtractor"]], "urlswapper (class in mltb2.somajo)": [[10, "mltb2.somajo.UrlSwapper"]], "__call__() (mltb2.somajo.jaccardsimilarity method)": [[10, "mltb2.somajo.JaccardSimilarity.__call__"]], "__call__() (mltb2.somajo.somajosentencesplitter method)": [[10, "mltb2.somajo.SoMaJoSentenceSplitter.__call__"]], "detokenize() (in module mltb2.somajo)": [[10, "mltb2.somajo.detokenize"]], "extract_token_class_set() (in module mltb2.somajo)": [[10, "mltb2.somajo.extract_token_class_set"]], "extract_url_set() (mltb2.somajo.tokenextractor method)": [[10, "mltb2.somajo.TokenExtractor.extract_url_set"]], "get_token_set() (mltb2.somajo.jaccardsimilarity method)": [[10, "mltb2.somajo.JaccardSimilarity.get_token_set"]], "mltb2.somajo": [[10, "module-mltb2.somajo"]], "reverse_swap_urls() (mltb2.somajo.urlswapper method)": [[10, "mltb2.somajo.UrlSwapper.reverse_swap_urls"]], "swap_urls() (mltb2.somajo.urlswapper method)": [[10, "mltb2.somajo.UrlSwapper.swap_urls"]], "textsplitter (class in mltb2.somajo_transformers)": [[11, "mltb2.somajo_transformers.TextSplitter"]], "__call__() (mltb2.somajo_transformers.textsplitter method)": [[11, "mltb2.somajo_transformers.TextSplitter.__call__"]], "mltb2.somajo_transformers": [[11, "module-mltb2.somajo_transformers"]], "textdistance (class in mltb2.text)": [[12, "mltb2.text.TextDistance"]], "_normalize_char_counter() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance._normalize_char_counter"]], "_normalize_counter_to_defaultdict() (in module mltb2.text)": [[12, "mltb2.text._normalize_counter_to_defaultdict"]], "clean_all_invisible_chars_and_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.clean_all_invisible_chars_and_whitespaces"]], "distance() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance.distance"]], "fit() (mltb2.text.textdistance method)": [[12, "mltb2.text.TextDistance.fit"]], "has_invisible_characters() (in module mltb2.text)": [[12, "mltb2.text.has_invisible_characters"]], "has_special_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.has_special_whitespaces"]], "mltb2.text": [[12, "module-mltb2.text"]], "remove_invisible_characters() (in module mltb2.text)": [[12, "mltb2.text.remove_invisible_characters"]], "replace_multiple_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.replace_multiple_whitespaces"]], "replace_special_whitespaces() (in module mltb2.text)": [[12, "mltb2.text.replace_special_whitespaces"]], "kfoldlabeleddataset (class in mltb2.transformers)": [[13, "mltb2.transformers.KFoldLabeledDataset"]], "labeleddataset (class in mltb2.transformers)": [[13, "mltb2.transformers.LabeledDataset"]], "transformerstokencounter (class in mltb2.transformers)": [[13, "mltb2.transformers.TransformersTokenCounter"]], "__call__() (mltb2.transformers.transformerstokencounter method)": [[13, "mltb2.transformers.TransformersTokenCounter.__call__"]], "mltb2.transformers": [[13, "module-mltb2.transformers"]], "split() (mltb2.transformers.kfoldlabeleddataset method)": [[13, "mltb2.transformers.KFoldLabeledDataset.split"]]}}) \ No newline at end of file