From a253a116349038a4f1592bec11758ab1bd109b37 Mon Sep 17 00:00:00 2001 From: tboquet Date: Tue, 19 Sep 2023 09:53:05 -0400 Subject: [PATCH 01/11] feat: load_pipeline improvements + style --- src/tasknet/utils.py | 169 ++++++++++++++++++++++++++++--------------- 1 file changed, 109 insertions(+), 60 deletions(-) diff --git a/src/tasknet/utils.py b/src/tasknet/utils.py index 9552fe9..1ac3f3c 100755 --- a/src/tasknet/utils.py +++ b/src/tasknet/utils.py @@ -8,15 +8,20 @@ import torch import magicattr + class NoTqdm: def __enter__(self): - tqdm.__init__ = functools.partialmethod(tqdm.__init__, disable=True) + tqdm.__init__ = functools.partialmethod(tqdm.__init__, disable=True) + def __exit__(self, exc_type, exc_value, exc_traceback): tqdm.__init__ = functools.partialmethod(tqdm.__init__, disable=False) + def train_validation_test_split(dataset, train_ratio=0.8, val_test_ratio=0.5, seed=0): train_testvalid = dataset.train_test_split(test_size=1 - train_ratio, seed=seed) - test_valid = train_testvalid["test"].train_test_split(test_size=val_test_ratio, seed=seed) + test_valid = train_testvalid["test"].train_test_split( + test_size=val_test_ratio, seed=seed + ) dataset = DatasetDict( train=train_testvalid["train"], validation=test_valid["test"], @@ -25,32 +30,39 @@ def train_validation_test_split(dataset, train_ratio=0.8, val_test_ratio=0.5, se return dataset -def load_dataset_sample(*args,n=1000): - ds= load_dataset(*args,streaming=True) - return DatasetDict({k: Dataset.from_list(list(ds[k].shuffle().take(n))) for k in ds}) +def load_dataset_sample(*args, n=1000): + ds = load_dataset(*args, streaming=True) + return DatasetDict( + {k: Dataset.from_list(list(ds[k].shuffle().take(n))) for k in ds} + ) def to_dict(x): - if hasattr(x,'items'): + if hasattr(x, "items"): return edict(x) else: - x=edict({a:getattr(x,a) for a in dir(x) if not a.startswith('__')}) + x = edict({a: getattr(x, a) for a in dir(x) if not a.startswith("__")}) return x + def deep_copy_cache(function): - memo = {} - def wrapper(*args, **kwargs): - if args in memo: - return copy.deepcopy(memo[args]) - else: - rv = function(*args, **kwargs) - memo[args] = rv - return rv - return wrapper + memo = {} + + def wrapper(*args, **kwargs): + if args in memo: + return copy.deepcopy(memo[args]) + else: + rv = function(*args, **kwargs) + memo[args] = rv + return rv + + return wrapper + def shallow_copy_A_to_B(A, B): """Shallow copy (=parameter sharing) A into B - https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427""" + https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427 + """ def rsetattr(obj, attr, val): pre, _, post = attr.rpartition(".") @@ -66,39 +78,43 @@ def _getattr(obj, attr): rsetattr(B, nb, rgetattr(A, na)) return A, B + def normalize_label(label): - label=str(label).lower() - label=label.replace('-','_') - label=label.replace(' ','_') - label=label.replace('entailed', 'entailment') - label=label.replace('non_','not_') - label=label.replace('duplicate','equivalent') - label=label.replace('neg','negative') - label=label.replace('pos','positive') + label = str(label).lower() + label = label.replace("-", "_") + label = label.replace(" ", "_") + label = label.replace("entailed", "entailment") + label = label.replace("non_", "not_") + label = label.replace("duplicate", "equivalent") + label = label.replace("neg", "negative") + label = label.replace("pos", "positive") return label -def 
merge_tasks(tasks,names): +def merge_tasks(tasks, names): prev, done, to_delete = dict(), dict(), [] - for i,t in tqdm(enumerate(tasks)): - x=[x for x in names if x in t.name] + for i, t in tqdm(enumerate(tasks)): + x = [x for x in names if x in t.name] if x: - x=x[0] - columns=t.dataset['train'].features.keys() - n_choices = len([c for c in columns if 'choice' in c]) + x = x[0] + columns = t.dataset["train"].features.keys() + n_choices = len([c for c in columns if "choice" in c]) if n_choices: - x=f"{x}-{n_choices}" + x = f"{x}-{n_choices}" if x in prev: - t.dataset=DatasetDict(fc.merge_with(concatenate_datasets, prev[x], t.dataset)) - prev[x]=t.dataset - t.name=x - done[x]=t - to_delete+=[i] - tasks = [task for i, task in enumerate(tasks) if i not in to_delete] + list(done.values()) + t.dataset = DatasetDict( + fc.merge_with(concatenate_datasets, prev[x], t.dataset) + ) + prev[x] = t.dataset + t.name = x + done[x] = t + to_delete += [i] + tasks = [task for i, task in enumerate(tasks) if i not in to_delete] + list( + done.values() + ) return tasks - def nested_children(m: torch.nn.Module): children = dict(m.named_children()) output = {} @@ -107,57 +123,90 @@ def nested_children(m: torch.nn.Module): else: for name, child in children.items(): if name.isnumeric(): - name=f'[{name}]' + name = f"[{name}]" try: output[name] = nested_children(child) except TypeError: output[name] = nested_children(child) return output + def convert(d): for k, v in d.items(): if isinstance(v, dict): - yield from (f'{k}.{x}'.replace('.[','[') for x in convert(v)) + yield from (f"{k}.{x}".replace(".[", "[") for x in convert(v)) else: yield k -def search_module(m,name, mode='attr', lowercase=True): + +def search_module(m, name, mode="attr", lowercase=True): paths = convert(nested_children(m)) - module_name = lambda x: magicattr.get(m,x).__class__.__name__ + module_name = lambda x: magicattr.get(m, x).__class__.__name__ process = lambda x: x.lower() if lowercase else x - name=process(name) - if mode=='attr': + name = process(name) + if mode == "attr": return [x for x in paths if name in process(x)] - if mode=='class': + if mode == "class": return [x for x in paths if name in process(module_name(x))] else: raise ValueError('mode must be "attr" or "class"') -def load_pipeline(model_name, task_name, adapt_task_embedding=True,multilingual=False): - if multilingual or 'mdeberta' in model_name: - multilingual=True +def load_pipeline(model_name, task_name, adapt_task_embedding=True, multilingual=False): + """Load Text Classification Pipeline for a Specified Model. + + Load a text classification pipeline for the specified model and task. If + the model is multilingual or has "mdeberta" in its name, it will handle + the multilingual settings. The pipeline will have a model that's adapted + to the task using an adapter. + + Args: + model_name (str): Name of the model to be loaded. + task_name (str): Name of the task for which the pipeline is loaded. + adapt_task_embedding (bool, optional): Flag to determine if task + embedding should be adapted. Defaults to True. + multilingual (bool, optional): Flag to determine if the model is + multilingual. Defaults to False. + device (int, optional): The device to run the pipeline on (-1 for CPU, + >= 0 for GPU ids). Defaults to -1. + + Returns: + TextClassificationPipeline: Loaded text classification pipeline. 
+ + """ + if multilingual or "mdeberta" in model_name: + multilingual = True - from transformers import AutoModelForSequenceClassification, TextClassificationPipeline, AutoTokenizer + from transformers import ( + AutoModelForSequenceClassification, + TextClassificationPipeline, + AutoTokenizer, + ) from .models import Adapter + try: import tasksource except: - raise ImportError('Requires tasksource.\n pip install tasksource') - task = tasksource.load_task(task_name,multilingual=multilingual) + raise ImportError("Requires tasksource.\n pip install tasksource") + task = tasksource.load_task(task_name, multilingual=multilingual) - model = AutoModelForSequenceClassification.from_pretrained(model_name,ignore_mismatched_sizes=True) - adapter = Adapter.from_pretrained(model_name.replace('-nli','')+'-adapters') + model = AutoModelForSequenceClassification.from_pretrained( + model_name, ignore_mismatched_sizes=True + ) + adapter = Adapter.from_pretrained(model_name.replace("-nli", "") + "-adapters") tokenizer = AutoTokenizer.from_pretrained(model_name) model = adapter.adapt_model_to_task(model, task_name) - model.config.id2label=task['train'].features['labels']._int2str - + model.config.id2label = task["train"].features["labels"]._int2str + task_index = adapter.config.tasks.index(task_name) - + if adapt_task_embedding: with torch.no_grad(): - model.deberta.embeddings.word_embeddings.weight[tokenizer.cls_token_id]+=adapter.Z[task_index] + model.deberta.embeddings.word_embeddings.weight[ + tokenizer.cls_token_id + ] += adapter.Z[task_index] pipe = TextClassificationPipeline( - model=model, tokenizer=tokenizer) - return pipe \ No newline at end of file + model=model, tokenizer=tokenizer, device=device, return_all_scores=True + ) + return pipe From 89aadf4c1ab93047e3442019595dcbded1ea924a Mon Sep 17 00:00:00 2001 From: tboquet Date: Tue, 19 Sep 2023 09:58:26 -0400 Subject: [PATCH 02/11] feat: function signature matches functionalities --- src/tasknet/utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/tasknet/utils.py b/src/tasknet/utils.py index 1ac3f3c..0b44f36 100755 --- a/src/tasknet/utils.py +++ b/src/tasknet/utils.py @@ -152,7 +152,14 @@ def search_module(m, name, mode="attr", lowercase=True): raise ValueError('mode must be "attr" or "class"') -def load_pipeline(model_name, task_name, adapt_task_embedding=True, multilingual=False): +def load_pipeline( + model_name: str, + task_name: str, + adapt_task_embedding: bool = True, + multilingual: bool = False, + device: int = -1, + return_all_scores: bool = False, +) -> TextClassificationPipeline: """Load Text Classification Pipeline for a Specified Model. Load a text classification pipeline for the specified model and task. 
If From 4077d51ebeb4bb619084377233e3468fb23c9d59 Mon Sep 17 00:00:00 2001 From: tboquet Date: Tue, 19 Sep 2023 09:59:24 -0400 Subject: [PATCH 03/11] fix: import module level --- src/tasknet/utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/tasknet/utils.py b/src/tasknet/utils.py index 0b44f36..f6a8214 100755 --- a/src/tasknet/utils.py +++ b/src/tasknet/utils.py @@ -1,12 +1,17 @@ -from datasets import DatasetDict, Dataset, load_dataset -from easydict import EasyDict as edict import copy import functools -from tqdm.auto import tqdm -from datasets import concatenate_datasets + import funcy as fc -import torch import magicattr +import torch +from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset +from easydict import EasyDict as edict +from tqdm.auto import tqdm +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + TextClassificationPipeline, +) class NoTqdm: @@ -184,11 +189,6 @@ def load_pipeline( if multilingual or "mdeberta" in model_name: multilingual = True - from transformers import ( - AutoModelForSequenceClassification, - TextClassificationPipeline, - AutoTokenizer, - ) from .models import Adapter try: From 48d8e1aa41a65d398dfcac4998fd4c02075854af Mon Sep 17 00:00:00 2001 From: tboquet Date: Tue, 19 Sep 2023 10:01:52 -0400 Subject: [PATCH 04/11] fix: add param to constructor --- src/tasknet/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/tasknet/utils.py b/src/tasknet/utils.py index f6a8214..060e708 100755 --- a/src/tasknet/utils.py +++ b/src/tasknet/utils.py @@ -214,6 +214,9 @@ def load_pipeline( ] += adapter.Z[task_index] pipe = TextClassificationPipeline( - model=model, tokenizer=tokenizer, device=device, return_all_scores=True + model=model, + tokenizer=tokenizer, + device=device, + return_all_scores=return_all_scores, ) return pipe From d4ba7646586deef0d14147714b8d804d615dfb25 Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 2 Nov 2023 09:38:01 +0100 Subject: [PATCH 05/11] Update setup.cfg --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 2ecab9d..2a5de02 100755 --- a/setup.cfg +++ b/setup.cfg @@ -26,6 +26,7 @@ install_requires = seqeval magicattr sentencepiece + tasksource [options.packages.find] where = src From a3bb8141ffd53af61449aa8b29957beebd65843f Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 2 Nov 2023 09:53:25 +0100 Subject: [PATCH 06/11] Update setup.cfg --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 2a5de02..396a156 100755 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ packages = find: python_requires = >=3.6 install_requires = torch - transformers + transformers[torch] datasets evaluate lazy_load From 3bd3910d82ba5f56c40782b7d54ccf3d4ce4fc85 Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 2 Nov 2023 09:56:31 +0100 Subject: [PATCH 07/11] Update models.py --- src/tasknet/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tasknet/models.py b/src/tasknet/models.py index f6f39d3..1122d7a 100755 --- a/src/tasknet/models.py +++ b/src/tasknet/models.py @@ -342,6 +342,7 @@ class default: save_steps = 1000000 label_names = ["labels"] include_inputs_for_metrics = True + model_name = "sileod/deberta-v3-base-tasksource-nli" default, hparams = to_dict(default), to_dict(hparams) self.p = hparams.get('p', 1) From f89bbce29ce6fc3dfd14199fe68c2333c379615b Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 2 Nov 2023 10:04:15 
+0100 Subject: [PATCH 08/11] Update models.py --- src/tasknet/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tasknet/models.py b/src/tasknet/models.py index 1122d7a..057c5a9 100755 --- a/src/tasknet/models.py +++ b/src/tasknet/models.py @@ -350,7 +350,7 @@ class default: self.batched = hparams.get('batched',False) trainer_args = transformers.TrainingArguments( - **{**default, **fc.project(hparams, dir(transformers.TrainingArguments))}, + **fc.project({**default,**hparams}, dir(transformers.TrainingArguments)) ) if not tokenizer: tokenizer = AutoTokenizer.from_pretrained(hparams["model_name"]) From 68793e64637a135ebb4db234bc0d27b6fd8de3d8 Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 2 Nov 2023 10:18:17 +0100 Subject: [PATCH 09/11] Update models.py --- src/tasknet/models.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/tasknet/models.py b/src/tasknet/models.py index 057c5a9..b9875ec 100755 --- a/src/tasknet/models.py +++ b/src/tasknet/models.py @@ -342,7 +342,6 @@ class default: save_steps = 1000000 label_names = ["labels"] include_inputs_for_metrics = True - model_name = "sileod/deberta-v3-base-tasksource-nli" default, hparams = to_dict(default), to_dict(hparams) self.p = hparams.get('p', 1) @@ -350,7 +349,7 @@ class default: self.batched = hparams.get('batched',False) trainer_args = transformers.TrainingArguments( - **fc.project({**default,**hparams}, dir(transformers.TrainingArguments)) + **{**default, **fc.project(hparams, dir(transformers.TrainingArguments))}, ) if not tokenizer: tokenizer = AutoTokenizer.from_pretrained(hparams["model_name"]) @@ -383,7 +382,7 @@ class default: task: dataset["test"] for task, dataset in self.processed_tasks.items() } - # We preventstrainer from automatically evaluating on each dataset: + # We prevent Trainer from automatically evaluating on each dataset: # transformers.Trainer recognizes eval_dataset instances of "dict" # But we use a custom "evaluate" function so that we can use different metrics for each task self.eval_dataset = MappingProxyType(self.eval_dataset) From 89832edd7aeab9afc8740590cf32e2e9c5e79af4 Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 18 Jan 2024 22:20:40 +0100 Subject: [PATCH 10/11] Update README.md --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2cba434..430e25c 100755 --- a/README.md +++ b/README.md @@ -38,12 +38,6 @@ p([{'text':x.premise,'text_pair': x.hypothesis}]) # HuggingFace pipeline for inf ``` Tasknet is multitask by design. `model.task_models_list` contains one model per task, with a shared encoder. -## Balancing dataset sizes -```py -tn.Classification(dataset, nrows=5000, nrows_eval=500, oversampling=2) -``` -You can balance multiple datasets with `nrows` and `oversampling`. `nrows` is the maximal number of examples. If a dataset has less than `nrows`, it will be oversampled at most `oversampling` times. - ## AutoTask You can also leverage [tasksource](https://github.com/sileod/tasksource/) with tn.AutoTask and have one-line access to 600+ datasets, see [implemented tasks](https://github.com/sileod/tasksource/blob/main/README.md). ```py @@ -51,6 +45,13 @@ rte = tn.AutoTask("glue/rte", nrows=5000) ``` AutoTask guesses a template based on the dataset structure. It also accepts a dataset as input, if it fits the template (e.g. after tasksource custom preprocessing). 
+## Balancing dataset sizes +```py +tn.Classification(dataset, nrows=5000, nrows_eval=500, oversampling=2) +``` +You can balance multiple datasets with `nrows` and `oversampling`. `nrows` is the maximal number of examples. If a dataset has less than `nrows`, it will be oversampled at most `oversampling` times. + + ## Colab examples Minimal-ish example: From 59a258cdcad353a3bdb21196cc09d9e885954142 Mon Sep 17 00:00:00 2001 From: sileod Date: Thu, 18 Jan 2024 22:23:35 +0100 Subject: [PATCH 11/11] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 430e25c..8b6f53e 100755 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ trainer = tn.Trainer(model, tasks, hparams) trainer.train() trainer.evaluate() p = trainer.pipeline() -p([{'text':x.premise,'text_pair': x.hypothesis}]) # HuggingFace pipeline for inference +p([{'text':'premise here','text_pair': 'hypothesis here'}]) # HuggingFace pipeline for inference ``` Tasknet is multitask by design. `model.task_models_list` contains one model per task, with a shared encoder.
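
---

After patches 01–04, `load_pipeline` in `src/tasknet/utils.py` ends up with the signature `load_pipeline(model_name: str, task_name: str, adapt_task_embedding: bool = True, multilingual: bool = False, device: int = -1, return_all_scores: bool = False)`, with the transformers imports moved to module level. Below is a minimal usage sketch of that final signature; it is not part of the patch series. The checkpoint and task names are assumptions drawn from elsewhere in this series (`sileod/deberta-v3-base-tasksource-nli` from PATCH 07, `glue/rte` from the README), and the call assumes a companion `*-adapters` repository exists for `Adapter.from_pretrained`.

```py
# Sketch only, not part of the patches: assumes `pip install tasknet tasksource`
# and that the checkpoint below ships matching "-adapters" weights.
from tasknet.utils import load_pipeline

pipe = load_pipeline(
    model_name="sileod/deberta-v3-base-tasksource-nli",  # "-nli" is stripped to locate the "-adapters" weights
    task_name="glue/rte",              # resolved through tasksource.load_task
    adapt_task_embedding=True,         # adds the task embedding to the [CLS] token embedding
    device=-1,                         # -1 = CPU, >= 0 = CUDA device id (parameter added in PATCH 02/04)
    return_all_scores=True,            # forwarded to TextClassificationPipeline (PATCH 04)
)
print(pipe([{"text": "premise here", "text_pair": "hypothesis here"}]))
```

Note that the function indexes `model.deberta.embeddings` directly when adapting the task embedding, so `adapt_task_embedding=True` only applies cleanly to DeBERTa-style checkpoints.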