Skip to content

Commit

Permalink
allow zero training example, work with newer version of transformers
Browse files Browse the repository at this point in the history
  • Loading branch information
Damien Sileo committed Mar 1, 2024
2 parents bf98292 + 59a258c commit ff952e0
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 81 deletions.
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,24 @@ trainer = tn.Trainer(model, tasks, hparams)
trainer.train()
trainer.evaluate()
p = trainer.pipeline()
p([{'text':x.premise,'text_pair': x.hypothesis}]) # HuggingFace pipeline for inference
p([{'text':'premise here','text_pair': 'hypothesis here'}]) # HuggingFace pipeline for inference
```
Tasknet is multitask by design. `model.task_models_list` contains one model per task, with a shared encoder.

## Balancing dataset sizes
```py
tn.Classification(dataset, nrows=5000, nrows_eval=500, oversampling=2)
```
You can balance multiple datasets with `nrows` and `oversampling`. `nrows` is the maximal number of examples. If a dataset has less than `nrows`, it will be oversampled at most `oversampling` times.

## AutoTask
You can also leverage [tasksource](https://github.com/sileod/tasksource/) with tn.AutoTask and have one-line access to 600+ datasets, see [implemented tasks](https://github.com/sileod/tasksource/blob/main/README.md).
```py
rte = tn.AutoTask("glue/rte", nrows=5000)
```
AutoTask guesses a template based on the dataset structure. It also accepts a dataset as input, if it fits the template (e.g. after tasksource custom preprocessing).

## Balancing dataset sizes
```py
tn.Classification(dataset, nrows=5000, nrows_eval=500, oversampling=2)
```
You can balance multiple datasets with `nrows` and `oversampling`. `nrows` is the maximum number of examples. If a dataset has fewer than `nrows` examples, it will be oversampled at most `oversampling` times.


## Colab examples
Minimal-ish example:

Expand Down
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ packages = find:
python_requires = >=3.6
install_requires =
torch
transformers
transformers[torch]
datasets
evaluate
lazy_load
Expand All @@ -26,6 +26,7 @@ install_requires =
seqeval
magicattr
sentencepiece
tasksource

[options.packages.find]
where = src
2 changes: 1 addition & 1 deletion src/tasknet/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ class default:
task: dataset["test"]
for task, dataset in self.processed_tasks.items()
}
# We preventstrainer from automatically evaluating on each dataset:
# We prevent Trainer from automatically evaluating on each dataset:
# transformers.Trainer recognizes eval_dataset instances of "dict"
# But we use a custom "evaluate" function so that we can use different metrics for each task
self.eval_dataset = MappingProxyType(self.eval_dataset)
Expand Down
198 changes: 126 additions & 72 deletions src/tasknet/utils.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,32 @@
from datasets import DatasetDict, Dataset, load_dataset
from easydict import EasyDict as edict
import copy
import functools
from tqdm.auto import tqdm
from datasets import concatenate_datasets

import funcy as fc
import torch
import magicattr
import torch
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from easydict import EasyDict as edict
from tqdm.auto import tqdm
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
TextClassificationPipeline,
)


class NoTqdm:
    """Context manager that disables tqdm progress bars inside the block.

    On entry, ``tqdm.__init__`` is replaced by a wrapper that passes
    ``disable=True`` by default. On exit, the ``__init__`` that was installed
    when the block was entered is put back, so repeated or nested use does not
    pile up wrappers on the class.

    The patch is applied to the ``tqdm`` class itself, so it affects every
    progress bar created while the block is active, not only those created by
    the code directly inside the ``with`` statement.
    """

    def __enter__(self):
        # Remember exactly what was installed so that __exit__ can restore it.
        self._original_init = tqdm.__init__
        tqdm.__init__ = functools.partialmethod(self._original_init, disable=True)

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Restore rather than re-wrap with disable=False: re-wrapping would
        # leave one extra partialmethod layer on the class after every use.
        tqdm.__init__ = self._original_init


def train_validation_test_split(dataset, train_ratio=0.8, val_test_ratio=0.5, seed=0):
train_testvalid = dataset.train_test_split(test_size=1 - train_ratio, seed=seed)
test_valid = train_testvalid["test"].train_test_split(test_size=val_test_ratio, seed=seed)
test_valid = train_testvalid["test"].train_test_split(
test_size=val_test_ratio, seed=seed
)
dataset = DatasetDict(
train=train_testvalid["train"],
validation=test_valid["test"],
Expand All @@ -25,32 +35,39 @@ def train_validation_test_split(dataset, train_ratio=0.8, val_test_ratio=0.5, se
return dataset


def load_dataset_sample(*args, n=1000):
    """Load a small in-memory sample of every split of a dataset.

    The dataset is opened with ``load_dataset(*args, streaming=True)``; each
    split is shuffled, its first ``n`` rows are taken and materialized into a
    regular ``Dataset``.

    Args:
        *args: Positional arguments forwarded to ``load_dataset``.
        n: Number of rows to take from each split.

    Returns:
        DatasetDict: One sampled ``Dataset`` per split of the loaded dataset.
    """
    streamed = load_dataset(*args, streaming=True)
    sampled = {}
    for split in streamed:
        rows = list(streamed[split].shuffle().take(n))
        sampled[split] = Dataset.from_list(rows)
    return DatasetDict(sampled)


def to_dict(x):
    """Convert a mapping or an arbitrary object into an ``EasyDict``.

    Objects exposing ``items`` are passed to ``edict`` directly. Any other
    object is converted by collecting every attribute listed by ``dir(x)``
    whose name does not start with a double underscore.
    """
    if not hasattr(x, "items"):
        public_attributes = {
            name: getattr(x, name) for name in dir(x) if not name.startswith("__")
        }
        return edict(public_attributes)
    return edict(x)


def deep_copy_cache(function):
    """Memoize ``function`` and hand out deep copies of the cached result.

    The wrapped function is called once per distinct combination of
    positional and keyword arguments. Every call, including the first one,
    returns a ``copy.deepcopy`` of the stored result, so callers can mutate
    what they receive without altering the cache.

    Calls whose arguments are not hashable cannot be cached; they are
    forwarded to ``function`` directly and its result is returned as is.

    Args:
        function: The callable to memoize.

    Returns:
        A wrapper with the same call signature as ``function``.
    """
    memo = {}

    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        # Keyword arguments are part of the key so that calls differing only
        # in keywords do not share a cache entry.
        key = (args, tuple(sorted(kwargs.items())))
        try:
            cached = key in memo
        except TypeError:
            # Unhashable arguments: skip the cache instead of failing.
            return function(*args, **kwargs)
        if not cached:
            memo[key] = function(*args, **kwargs)
        return copy.deepcopy(memo[key])

    return wrapper


def shallow_copy_A_to_B(A, B):
"""Shallow copy (=parameter sharing) A into B
https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427"""
https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427
"""

def rsetattr(obj, attr, val):
pre, _, post = attr.rpartition(".")
Expand All @@ -66,39 +83,43 @@ def _getattr(obj, attr):
rsetattr(B, nb, rgetattr(A, na))
return A, B


def normalize_label(label):
    """Normalize a class label into a canonical lowercase form.

    The label is converted to ``str`` and lowercased, ``-`` and spaces become
    ``_``, and a few synonyms are unified: ``entailed`` -> ``entailment``,
    ``non_`` -> ``not_``, ``duplicate`` -> ``equivalent``, ``neg`` ->
    ``negative`` and ``pos`` -> ``positive``.

    The ``neg``/``pos`` abbreviations are only expanded when they are not
    already followed by the rest of the full word, so ``"negative"`` and
    ``"positive"`` are left intact instead of becoming ``"negativeative"``
    and ``"positiveitive"``.

    Args:
        label: Any value; it is converted with ``str`` first.

    Returns:
        str: The normalized label.
    """
    import re

    label = str(label).lower()
    label = label.replace("-", "_")
    label = label.replace(" ", "_")
    label = label.replace("entailed", "entailment")
    label = label.replace("non_", "not_")
    label = label.replace("duplicate", "equivalent")
    # Negative lookaheads keep already-expanded words unchanged.
    label = re.sub(r"neg(?!ative)", "negative", label)
    label = re.sub(r"pos(?!itive)", "positive", label)
    return label


def merge_tasks(tasks, names):
    """Merge tasks whose names contain one of ``names`` into grouped tasks.

    For each task, the first entry of ``names`` that is a substring of
    ``task.name`` selects its group. When the train split has columns whose
    name contains ``"choice"``, the number of such columns is appended to the
    group name (``"<name>-<n_choices>"``). Datasets of tasks falling in the
    same group are combined split by split with ``concatenate_datasets``.

    Matching tasks are modified in place: their ``dataset`` and ``name``
    attributes are overwritten. For each group, the last matching task is the
    one kept, and it carries the combined dataset.

    Args:
        tasks: Task objects exposing ``name`` and ``dataset`` attributes.
        names: Group names searched for inside each task name.

    Returns:
        list: The tasks that matched no name, in their original order,
        followed by one task per group.
    """
    group_datasets = {}
    group_tasks = {}
    absorbed = []
    for index, task in tqdm(enumerate(tasks)):
        matches = [candidate for candidate in names if candidate in task.name]
        if not matches:
            continue
        group = matches[0]
        choice_columns = [
            column
            for column in task.dataset["train"].features.keys()
            if "choice" in column
        ]
        if choice_columns:
            group = f"{group}-{len(choice_columns)}"
        if group in group_datasets:
            combined = fc.merge_with(
                concatenate_datasets, group_datasets[group], task.dataset
            )
            task.dataset = DatasetDict(combined)
        group_datasets[group] = task.dataset
        task.name = group
        group_tasks[group] = task
        absorbed.append(index)
    untouched = [task for index, task in enumerate(tasks) if index not in absorbed]
    return untouched + list(group_tasks.values())



def nested_children(m: torch.nn.Module):
children = dict(m.named_children())
output = {}
Expand All @@ -107,62 +128,95 @@ def nested_children(m: torch.nn.Module):
else:
for name, child in children.items():
if name.isnumeric():
name=f'[{name}]'
name = f"[{name}]"
try:
output[name] = nested_children(child)
except TypeError:
output[name] = nested_children(child)
return output


def convert(d):
    """Yield the flattened key paths of a nested dictionary.

    Keys of nested dictionaries are joined with ``.``; a ``.`` directly
    followed by ``[`` is dropped so that bracketed keys read as indexing
    (``"a"`` then ``"[0]"`` gives ``"a[0]"``). Only keys whose value is not a
    dictionary produce a path, so an empty nested dictionary yields nothing.
    """
    for key, value in d.items():
        if not isinstance(value, dict):
            yield key
            continue
        for sub_path in convert(value):
            yield f"{key}.{sub_path}".replace(".[", "[")

def search_module(m, name, mode="attr", lowercase=True):
    """Find submodule paths of ``m`` matching ``name``.

    Candidate paths come from ``convert(nested_children(m))``.

    Args:
        m: The module to search.
        name: Substring to look for.
        mode: ``"attr"`` matches ``name`` against the path itself;
            ``"class"`` matches it against the class name of the object
            found at that path via ``magicattr.get``.
        lowercase: When true, both sides of the comparison are lowercased.

    Returns:
        list: The matching paths.

    Raises:
        ValueError: If ``mode`` is neither ``"attr"`` nor ``"class"``.
    """
    paths = convert(nested_children(m))

    def normalize(text):
        return text.lower() if lowercase else text

    def class_name_at(path):
        return magicattr.get(m, path).__class__.__name__

    needle = normalize(name)
    if mode == "attr":
        return [path for path in paths if needle in normalize(path)]
    if mode != "class":
        raise ValueError('mode must be "attr" or "class"')
    return [path for path in paths if needle in normalize(class_name_at(path))]


def load_pipeline(
    model_name: str,
    task_name: str,
    adapt_task_embedding: bool = True,
    multilingual: bool = False,
    device: int = -1,
    return_all_scores: bool = False,
) -> TextClassificationPipeline:
    """Load a text classification pipeline for a model adapted to one task.

    The sequence classification model and tokenizer are loaded from
    ``model_name``; the adapter is loaded from ``model_name`` with ``"-nli"``
    removed and ``"-adapters"`` appended. The adapter then adapts the model to
    ``task_name``, and the label names of the task's train split are stored in
    ``model.config.id2label``.

    Args:
        model_name (str): Name of the model to be loaded.
        task_name (str): Name of the tasksource task the pipeline is for.
        adapt_task_embedding (bool, optional): When true, the adapter's task
            embedding (``adapter.Z[task_index]``) is added in place to the
            word-embedding row of the tokenizer's CLS token. Defaults to True.
        multilingual (bool, optional): Forwarded to ``tasksource.load_task``.
            Forced to True when ``"mdeberta"`` appears in ``model_name``.
            Defaults to False.
        device (int, optional): The device to run the pipeline on (-1 for CPU,
            >= 0 for GPU ids). Defaults to -1.
        return_all_scores (bool, optional): Forwarded unchanged to
            ``TextClassificationPipeline``. Defaults to False.

    Returns:
        TextClassificationPipeline: Loaded text classification pipeline.

    Raises:
        ImportError: If ``tasksource`` cannot be imported.
    """
    if multilingual or "mdeberta" in model_name:
        multilingual = True

    from .models import Adapter

    try:
        import tasksource
    except ImportError as err:
        # Only a failed import means "not installed"; any other error raised
        # while importing tasksource is left to propagate unchanged.
        raise ImportError("Requires tasksource.\n pip install tasksource") from err
    task = tasksource.load_task(task_name, multilingual=multilingual)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, ignore_mismatched_sizes=True
    )
    adapter = Adapter.from_pretrained(model_name.replace("-nli", "") + "-adapters")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = adapter.adapt_model_to_task(model, task_name)
    model.config.id2label = task["train"].features["labels"]._int2str

    task_index = adapter.config.tasks.index(task_name)

    if adapt_task_embedding:
        # NOTE(review): accesses ``model.deberta``, so this presumably only
        # works for DeBERTa-style models -- confirm before using other
        # architectures.
        with torch.no_grad():
            model.deberta.embeddings.word_embeddings.weight[
                tokenizer.cls_token_id
            ] += adapter.Z[task_index]

    pipe = TextClassificationPipeline(
        model=model,
        tokenizer=tokenizer,
        device=device,
        return_all_scores=return_all_scores,
    )
    return pipe

0 comments on commit ff952e0

Please sign in to comment.