From a253a116349038a4f1592bec11758ab1bd109b37 Mon Sep 17 00:00:00 2001 From: tboquet Date: Tue, 19 Sep 2023 09:53:05 -0400 Subject: [PATCH 01/11] feat: load_pipeline improvements + style --- src/tasknet/utils.py | 169 ++++++++++++++++++++++++++++--------------- 1 file changed, 109 insertions(+), 60 deletions(-) diff --git a/src/tasknet/utils.py b/src/tasknet/utils.py index 9552fe9..1ac3f3c 100755 --- a/src/tasknet/utils.py +++ b/src/tasknet/utils.py @@ -8,15 +8,20 @@ import torch import magicattr + class NoTqdm: def __enter__(self): - tqdm.__init__ = functools.partialmethod(tqdm.__init__, disable=True) + tqdm.__init__ = functools.partialmethod(tqdm.__init__, disable=True) + def __exit__(self, exc_type, exc_value, exc_traceback): tqdm.__init__ = functools.partialmethod(tqdm.__init__, disable=False) + def train_validation_test_split(dataset, train_ratio=0.8, val_test_ratio=0.5, seed=0): train_testvalid = dataset.train_test_split(test_size=1 - train_ratio, seed=seed) - test_valid = train_testvalid["test"].train_test_split(test_size=val_test_ratio, seed=seed) + test_valid = train_testvalid["test"].train_test_split( + test_size=val_test_ratio, seed=seed + ) dataset = DatasetDict( train=train_testvalid["train"], validation=test_valid["test"], @@ -25,32 +30,39 @@ def train_validation_test_split(dataset, train_ratio=0.8, val_test_ratio=0.5, se return dataset -def load_dataset_sample(*args,n=1000): - ds= load_dataset(*args,streaming=True) - return DatasetDict({k: Dataset.from_list(list(ds[k].shuffle().take(n))) for k in ds}) +def load_dataset_sample(*args, n=1000): + ds = load_dataset(*args, streaming=True) + return DatasetDict( + {k: Dataset.from_list(list(ds[k].shuffle().take(n))) for k in ds} + ) def to_dict(x): - if hasattr(x,'items'): + if hasattr(x, "items"): return edict(x) else: - x=edict({a:getattr(x,a) for a in dir(x) if not a.startswith('__')}) + x = edict({a: getattr(x, a) for a in dir(x) if not a.startswith("__")}) return x + def deep_copy_cache(function): - memo = {} - def wrapper(*args, **kwargs): - if args in memo: - return copy.deepcopy(memo[args]) - else: - rv = function(*args, **kwargs) - memo[args] = rv - return rv - return wrapper + memo = {} + + def wrapper(*args, **kwargs): + if args in memo: + return copy.deepcopy(memo[args]) + else: + rv = function(*args, **kwargs) + memo[args] = rv + return rv + + return wrapper + def shallow_copy_A_to_B(A, B): """Shallow copy (=parameter sharing) A into B - https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427""" + https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427 + """ def rsetattr(obj, attr, val): pre, _, post = attr.rpartition(".") @@ -66,39 +78,43 @@ def _getattr(obj, attr): rsetattr(B, nb, rgetattr(A, na)) return A, B + def normalize_label(label): - label=str(label).lower() - label=label.replace('-','_') - label=label.replace(' ','_') - label=label.replace('entailed', 'entailment') - label=label.replace('non_','not_') - label=label.replace('duplicate','equivalent') - label=label.replace('neg','negative') - label=label.replace('pos','positive') + label = str(label).lower() + label = label.replace("-", "_") + label = label.replace(" ", "_") + label = label.replace("entailed", "entailment") + label = label.replace("non_", "not_") + label = label.replace("duplicate", "equivalent") + label = label.replace("neg", "negative") + label = label.replace("pos", "positive") return label -def 
merge_tasks(tasks,names): +def merge_tasks(tasks, names): prev, done, to_delete = dict(), dict(), [] - for i,t in tqdm(enumerate(tasks)): - x=[x for x in names if x in t.name] + for i, t in tqdm(enumerate(tasks)): + x = [x for x in names if x in t.name] if x: - x=x[0] - columns=t.dataset['train'].features.keys() - n_choices = len([c for c in columns if 'choice' in c]) + x = x[0] + columns = t.dataset["train"].features.keys() + n_choices = len([c for c in columns if "choice" in c]) if n_choices: - x=f"{x}-{n_choices}" + x = f"{x}-{n_choices}" if x in prev: - t.dataset=DatasetDict(fc.merge_with(concatenate_datasets, prev[x], t.dataset)) - prev[x]=t.dataset - t.name=x - done[x]=t - to_delete+=[i] - tasks = [task for i, task in enumerate(tasks) if i not in to_delete] + list(done.values()) + t.dataset = DatasetDict( + fc.merge_with(concatenate_datasets, prev[x], t.dataset) + ) + prev[x] = t.dataset + t.name = x + done[x] = t + to_delete += [i] + tasks = [task for i, task in enumerate(tasks) if i not in to_delete] + list( + done.values() + ) return tasks - def nested_children(m: torch.nn.Module): children = dict(m.named_children()) output = {} @@ -107,57 +123,90 @@ def nested_children(m: torch.nn.Module): else: for name, child in children.items(): if name.isnumeric(): - name=f'[{name}]' + name = f"[{name}]" try: output[name] = nested_children(child) except TypeError: output[name] = nested_children(child) return output + def convert(d): for k, v in d.items(): if isinstance(v, dict): - yield from (f'{k}.{x}'.replace('.[','[') for x in convert(v)) + yield from (f"{k}.{x}".replace(".[", "[") for x in convert(v)) else: yield k -def search_module(m,name, mode='attr', lowercase=True): + +def search_module(m, name, mode="attr", lowercase=True): paths = convert(nested_children(m)) - module_name = lambda x: magicattr.get(m,x).__class__.__name__ + module_name = lambda x: magicattr.get(m, x).__class__.__name__ process = lambda x: x.lower() if lowercase else x - name=process(name) - if mode=='attr': + name = process(name) + if mode == "attr": return [x for x in paths if name in process(x)] - if mode=='class': + if mode == "class": return [x for x in paths if name in process(module_name(x))] else: raise ValueError('mode must be "attr" or "class"') -def load_pipeline(model_name, task_name, adapt_task_embedding=True,multilingual=False): - if multilingual or 'mdeberta' in model_name: - multilingual=True +def load_pipeline(model_name, task_name, adapt_task_embedding=True, multilingual=False): + """Load Text Classification Pipeline for a Specified Model. + + Load a text classification pipeline for the specified model and task. If + the model is multilingual or has "mdeberta" in its name, it will handle + the multilingual settings. The pipeline will have a model that's adapted + to the task using an adapter. + + Args: + model_name (str): Name of the model to be loaded. + task_name (str): Name of the task for which the pipeline is loaded. + adapt_task_embedding (bool, optional): Flag to determine if task + embedding should be adapted. Defaults to True. + multilingual (bool, optional): Flag to determine if the model is + multilingual. Defaults to False. + device (int, optional): The device to run the pipeline on (-1 for CPU, + >= 0 for GPU ids). Defaults to -1. + + Returns: + TextClassificationPipeline: Loaded text classification pipeline. 
+ + """ + if multilingual or "mdeberta" in model_name: + multilingual = True - from transformers import AutoModelForSequenceClassification, TextClassificationPipeline, AutoTokenizer + from transformers import ( + AutoModelForSequenceClassification, + TextClassificationPipeline, + AutoTokenizer, + ) from .models import Adapter + try: import tasksource except: - raise ImportError('Requires tasksource.\n pip install tasksource') - task = tasksource.load_task(task_name,multilingual=multilingual) + raise ImportError("Requires tasksource.\n pip install tasksource") + task = tasksource.load_task(task_name, multilingual=multilingual) - model = AutoModelForSequenceClassification.from_pretrained(model_name,ignore_mismatched_sizes=True) - adapter = Adapter.from_pretrained(model_name.replace('-nli','')+'-adapters') + model = AutoModelForSequenceClassification.from_pretrained( + model_name, ignore_mismatched_sizes=True + ) + adapter = Adapter.from_pretrained(model_name.replace("-nli", "") + "-adapters") tokenizer = AutoTokenizer.from_pretrained(model_name) model = adapter.adapt_model_to_task(model, task_name) - model.config.id2label=task['train'].features['labels']._int2str - + model.config.id2label = task["train"].features["labels"]._int2str + task_index = adapter.config.tasks.index(task_name) - + if adapt_task_embedding: with torch.no_grad(): - model.deberta.embeddings.word_embeddings.weight[tokenizer.cls_token_id]+=adapter.Z[task_index] + model.deberta.embeddings.word_embeddings.weight[ + tokenizer.cls_token_id + ] += adapter.Z[task_index] pipe = TextClassificationPipeline( - model=model, tokenizer=tokenizer) - return pipe \ No newline at end of file + model=model, tokenizer=tokenizer, device=device, return_all_scores=True + ) + return pipe From 89aadf4c1ab93047e3442019595dcbded1ea924a Mon Sep 17 00:00:00 2001 From: tboquet Date: Tue, 19 Sep 2023 09:58:26 -0400 Subject: [PATCH 02/11] feat: function signature matches functionalities --- src/tasknet/utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/tasknet/utils.py b/src/tasknet/utils.py index 1ac3f3c..0b44f36 100755 --- a/src/tasknet/utils.py +++ b/src/tasknet/utils.py @@ -152,7 +152,14 @@ def search_module(m, name, mode="attr", lowercase=True): raise ValueError('mode must be "attr" or "class"') -def load_pipeline(model_name, task_name, adapt_task_embedding=True, multilingual=False): +def load_pipeline( + model_name: str, + task_name: str, + adapt_task_embedding: bool = True, + multilingual: bool = False, + device: int = -1, + return_all_scores: bool = False, +) -> TextClassificationPipeline: """Load Text Classification Pipeline for a Specified Model. Load a text classification pipeline for the specified model and task. 
If From 4077d51ebeb4bb619084377233e3468fb23c9d59 Mon Sep 17 00:00:00 2001 From: tboquet Date: Tue, 19 Sep 2023 09:59:24 -0400 Subject: [PATCH 03/11] fix: import module level --- src/tasknet/utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/tasknet/utils.py b/src/tasknet/utils.py index 0b44f36..f6a8214 100755 --- a/src/tasknet/utils.py +++ b/src/tasknet/utils.py @@ -1,12 +1,17 @@ -from datasets import DatasetDict, Dataset, load_dataset -from easydict import EasyDict as edict import copy import functools -from tqdm.auto import tqdm -from datasets import concatenate_datasets + import funcy as fc -import torch import magicattr +import torch +from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset +from easydict import EasyDict as edict +from tqdm.auto import tqdm +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + TextClassificationPipeline, +) class NoTqdm: @@ -184,11 +189,6 @@ def load_pipeline( if multilingual or "mdeberta" in model_name: multilingual = True - from transformers import ( - AutoModelForSequenceClassification, - TextClassificationPipeline, - AutoTokenizer, - ) from .models import Adapter try: From 48d8e1aa41a65d398dfcac4998fd4c02075854af Mon Sep 17 00:00:00 2001 From: tboquet Date: Tue, 19 Sep 2023 10:01:52 -0400 Subject: [PATCH 04/11] fix: add param to constructor --- src/tasknet/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/tasknet/utils.py b/src/tasknet/utils.py index f6a8214..060e708 100755 --- a/src/tasknet/utils.py +++ b/src/tasknet/utils.py @@ -214,6 +214,9 @@ def load_pipeline( ] += adapter.Z[task_index] pipe = TextClassificationPipeline( - model=model, tokenizer=tokenizer, device=device, return_all_scores=True + model=model, + tokenizer=tokenizer, + device=device, + return_all_scores=return_all_scores, ) return pipe From d4ba7646586deef0d14147714b8d804d615dfb25 Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 2 Nov 2023 09:38:01 +0100 Subject: [PATCH 05/11] Update setup.cfg --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 2ecab9d..2a5de02 100755 --- a/setup.cfg +++ b/setup.cfg @@ -26,6 +26,7 @@ install_requires = seqeval magicattr sentencepiece + tasksource [options.packages.find] where = src From a3bb8141ffd53af61449aa8b29957beebd65843f Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 2 Nov 2023 09:53:25 +0100 Subject: [PATCH 06/11] Update setup.cfg --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 2a5de02..396a156 100755 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ packages = find: python_requires = >=3.6 install_requires = torch - transformers + transformers[torch] datasets evaluate lazy_load From 3bd3910d82ba5f56c40782b7d54ccf3d4ce4fc85 Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 2 Nov 2023 09:56:31 +0100 Subject: [PATCH 07/11] Update models.py --- src/tasknet/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tasknet/models.py b/src/tasknet/models.py index f6f39d3..1122d7a 100755 --- a/src/tasknet/models.py +++ b/src/tasknet/models.py @@ -342,6 +342,7 @@ class default: save_steps = 1000000 label_names = ["labels"] include_inputs_for_metrics = True + model_name = "sileod/deberta-v3-base-tasksource-nli" default, hparams = to_dict(default), to_dict(hparams) self.p = hparams.get('p', 1) From f89bbce29ce6fc3dfd14199fe68c2333c379615b Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 2 Nov 2023 10:04:15 
+0100 Subject: [PATCH 08/11] Update models.py --- src/tasknet/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tasknet/models.py b/src/tasknet/models.py index 1122d7a..057c5a9 100755 --- a/src/tasknet/models.py +++ b/src/tasknet/models.py @@ -350,7 +350,7 @@ class default: self.batched = hparams.get('batched',False) trainer_args = transformers.TrainingArguments( - **{**default, **fc.project(hparams, dir(transformers.TrainingArguments))}, + **fc.project({**default,**hparams}, dir(transformers.TrainingArguments)) ) if not tokenizer: tokenizer = AutoTokenizer.from_pretrained(hparams["model_name"]) From 68793e64637a135ebb4db234bc0d27b6fd8de3d8 Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 2 Nov 2023 10:18:17 +0100 Subject: [PATCH 09/11] Update models.py --- src/tasknet/models.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/tasknet/models.py b/src/tasknet/models.py index 057c5a9..b9875ec 100755 --- a/src/tasknet/models.py +++ b/src/tasknet/models.py @@ -342,7 +342,6 @@ class default: save_steps = 1000000 label_names = ["labels"] include_inputs_for_metrics = True - model_name = "sileod/deberta-v3-base-tasksource-nli" default, hparams = to_dict(default), to_dict(hparams) self.p = hparams.get('p', 1) @@ -350,7 +349,7 @@ class default: self.batched = hparams.get('batched',False) trainer_args = transformers.TrainingArguments( - **fc.project({**default,**hparams}, dir(transformers.TrainingArguments)) + **{**default, **fc.project(hparams, dir(transformers.TrainingArguments))}, ) if not tokenizer: tokenizer = AutoTokenizer.from_pretrained(hparams["model_name"]) @@ -383,7 +382,7 @@ class default: task: dataset["test"] for task, dataset in self.processed_tasks.items() } - # We preventstrainer from automatically evaluating on each dataset: + # We prevent Trainer from automatically evaluating on each dataset: # transformers.Trainer recognizes eval_dataset instances of "dict" # But we use a custom "evaluate" function so that we can use different metrics for each task self.eval_dataset = MappingProxyType(self.eval_dataset) From 89832edd7aeab9afc8740590cf32e2e9c5e79af4 Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 18 Jan 2024 22:20:40 +0100 Subject: [PATCH 10/11] Update README.md --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2cba434..430e25c 100755 --- a/README.md +++ b/README.md @@ -38,12 +38,6 @@ p([{'text':x.premise,'text_pair': x.hypothesis}]) # HuggingFace pipeline for inf ``` Tasknet is multitask by design. `model.task_models_list` contains one model per task, with a shared encoder. -## Balancing dataset sizes -```py -tn.Classification(dataset, nrows=5000, nrows_eval=500, oversampling=2) -``` -You can balance multiple datasets with `nrows` and `oversampling`. `nrows` is the maximal number of examples. If a dataset has less than `nrows`, it will be oversampled at most `oversampling` times. - ## AutoTask You can also leverage [tasksource](https://github.com/sileod/tasksource/) with tn.AutoTask and have one-line access to 600+ datasets, see [implemented tasks](https://github.com/sileod/tasksource/blob/main/README.md). ```py @@ -51,6 +45,13 @@ rte = tn.AutoTask("glue/rte", nrows=5000) ``` AutoTask guesses a template based on the dataset structure. It also accepts a dataset as input, if it fits the template (e.g. after tasksource custom preprocessing). 
+## Balancing dataset sizes +```py +tn.Classification(dataset, nrows=5000, nrows_eval=500, oversampling=2) +``` +You can balance multiple datasets with `nrows` and `oversampling`. `nrows` is the maximal number of examples. If a dataset has less than `nrows`, it will be oversampled at most `oversampling` times. + + ## Colab examples Minimal-ish example: From 59a258cdcad353a3bdb21196cc09d9e885954142 Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 18 Jan 2024 22:23:35 +0100 Subject: [PATCH 11/11] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 430e25c..8b6f53e 100755 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ trainer = tn.Trainer(model, tasks, hparams) trainer.train() trainer.evaluate() p = trainer.pipeline() -p([{'text':x.premise,'text_pair': x.hypothesis}]) # HuggingFace pipeline for inference +p([{'text':'premise here','text_pair': 'hypothesis here'}]) # HuggingFace pipeline for inference ``` Tasknet is multitask by design. `model.task_models_list` contains one model per task, with a shared encoder.
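
---

After patches 01–04, `load_pipeline` in `src/tasknet/utils.py` ends up with the signature `load_pipeline(model_name: str, task_name: str, adapt_task_embedding: bool = True, multilingual: bool = False, device: int = -1, return_all_scores: bool = False)`, with the transformers imports moved to module level. Below is a minimal usage sketch of that final signature; it is not part of the patch series. The checkpoint and task names are assumptions drawn from elsewhere in this series (`sileod/deberta-v3-base-tasksource-nli` from PATCH 07, `glue/rte` from the README), and the call assumes a companion `*-adapters` repository exists for `Adapter.from_pretrained`.

```py
# Sketch only, not part of the patches: assumes `pip install tasknet tasksource`
# and that the checkpoint below ships matching "-adapters" weights.
from tasknet.utils import load_pipeline

pipe = load_pipeline(
    model_name="sileod/deberta-v3-base-tasksource-nli",  # "-nli" is stripped to locate the "-adapters" weights
    task_name="glue/rte",              # resolved through tasksource.load_task
    adapt_task_embedding=True,         # adds the task embedding to the [CLS] token embedding
    device=-1,                         # -1 = CPU, >= 0 = CUDA device id (parameter added in PATCH 02/04)
    return_all_scores=True,            # forwarded to TextClassificationPipeline (PATCH 04)
)
print(pipe([{"text": "premise here", "text_pair": "hypothesis here"}]))
```

Note that the function indexes `model.deberta.embeddings` directly when adapting the task embedding, so `adapt_task_embedding=True` only applies cleanly to DeBERTa-style checkpoints.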