
Commit 33a038b
Remove data caching
it's almost never been used
Mehrad0711 committed Feb 28, 2022
1 parent 251b086 commit 33a038b
Showing 19 changed files with 81 additions and 144 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -122,7 +122,7 @@ genienlp run-paraphrase --model_name_or_path <model_dir> --temperature 0.3 --rep
Use the following command for training/ finetuning an NMT model:

```bash
-genienlp train --train_tasks almond_translate --data <data_directory> --train_languages <src_lang> --eval_languages <tgt_lang> --no_commit --train_iterations <iterations> --preserve_case --save <save_dir> --exist_ok --skip_cache --model TransformerSeq2Seq --pretrained_model <hf_model_name>
+genienlp train --train_tasks almond_translate --data <data_directory> --train_languages <src_lang> --eval_languages <tgt_lang> --no_commit --train_iterations <iterations> --preserve_case --save <save_dir> --exist_ok --model TransformerSeq2Seq --pretrained_model <hf_model_name>
```

We currently support MarianMT, MBART, MT5, and M2M100 models.<br>
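For instance, a MarianMT finetuning run might look like the following sketch (the Helsinki-NLP/opus-mt-en-de model id and the en→de language pair are only illustrative choices, not requirements):

```bash
genienlp train --train_tasks almond_translate --data <data_directory> --train_languages en --eval_languages de --no_commit --train_iterations <iterations> --preserve_case --save <save_dir> --exist_ok --model TransformerSeq2Seq --pretrained_model Helsinki-NLP/opus-mt-en-de
```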
@@ -131,7 +131,7 @@ To save a pretrained model in genienlp format without any finetuning, set train_
To produce translations for an eval/ test set run the following command:

```bash
-genienlp predict --tasks almond_translate --data <data_directory> --pred_languages <src_lang> --pred_tgt_languages <tgt_lang> --path <path_to_saved_model> --eval_dir <eval_dir> --skip_cache --val_batch_size 4000 --evaluate <valid/test> --overwrite --silent
+genienlp predict --tasks almond_translate --data <data_directory> --pred_languages <src_lang> --pred_tgt_languages <tgt_lang> --path <path_to_saved_model> --eval_dir <eval_dir> --val_batch_size 4000 --evaluate <valid/test> --overwrite --silent
```

If your dataset is a document or contains long examples, pass `--translate_example_split` to break the examples down into individual sentences before translation for better results. <br>
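As a sketch of that long-input case, this is the predict command above with the extra flag appended (same placeholders as before):

```bash
genienlp predict --tasks almond_translate --data <data_directory> --pred_languages <src_lang> --pred_tgt_languages <tgt_lang> --path <path_to_saved_model> --eval_dir <eval_dir> --val_batch_size 4000 --evaluate <valid/test> --overwrite --silent --translate_example_split
```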
5 changes: 0 additions & 5 deletions genienlp/arguments.py
@@ -67,7 +67,6 @@ def parse_argv(parser):
parser.add_argument('--data', default='.data/', type=str, help='where to load data from.')
parser.add_argument('--save', required=True, type=str, help='where to save results.')
parser.add_argument('--embeddings', default='.embeddings/', type=str, help='where to save embeddings.')
-parser.add_argument('--cache', default='.cache/', type=str, help='where to save cached files')

parser.add_argument(
'--train_languages',
@@ -409,10 +408,6 @@ def parse_argv(parser):
help='Ignore all conditions and use fast version of huggingface tokenizer',
)

-parser.add_argument('--skip_cache', action='store_true', help='whether to use existing cached splits or generate new ones')
-parser.add_argument(
-'--cache_input_data', action='store_true', help='Cache examples from input data for faster subsequent trainings'
-)
parser.add_argument('--use_curriculum', action='store_true', help='Use curriculum learning')
parser.add_argument(
'--aux_dataset', default='', type=str, help='path to auxiliary dataset (ignored if curriculum is not used)'
4 changes: 0 additions & 4 deletions genienlp/predict.py
@@ -100,9 +100,7 @@ def parse_argv(parser):
parser.add_argument('--overwrite', action='store_true', help='whether to overwrite previously written predictions')
parser.add_argument('--silent', action='store_true', help='whether to print predictions to stdout')

-parser.add_argument('--skip_cache', action='store_true', help='whether use exisiting cached splits or generate new ones')
parser.add_argument('--eval_dir', type=str, required=True, help='use this directory to store eval results')
-parser.add_argument('--cache', default='.cache', type=str, help='where to save cached files')
parser.add_argument('--subsample', default=20000000, type=int, help='subsample the eval/test datasets')

parser.add_argument(
@@ -331,9 +329,7 @@ def prepare_data(args):

kwargs.update(
{
-'skip_cache': args.skip_cache,
'subsample': args.subsample,
-'cached_path': os.path.join(args.cache, task.name),
'all_dirs': task_languages,
'num_workers': args.num_workers,
'src_lang': src_lang,
9 changes: 0 additions & 9 deletions genienlp/run_bootleg.py
@@ -48,7 +48,6 @@ def parse_argv(parser):
parser.add_argument('--save', required=True, type=str, help='where to save results.')
parser.add_argument('--embeddings', default='.embeddings/', type=str, help='where to save embeddings.')
parser.add_argument('--data', default='.data/', type=str, help='where to load data from.')
-parser.add_argument('--cache', default='.cache/', type=str, help='where to save cached files')

parser.add_argument(
'--train_languages',
@@ -177,11 +176,6 @@ def parse_argv(parser):
'--exist_ok', action='store_true', help='Ok if the save directory already exists, i.e. overwrite is ok'
)

-parser.add_argument('--skip_cache', action='store_true', help='whether to use existing cached splits or generate new ones')
-parser.add_argument(
-'--cache_input_data', action='store_true', help='Cache examples from input data for faster subsequent trainings'
-)

# token classification task args
parser.add_argument('--num_labels', type=int, help='num_labels for classification tasks')
parser.add_argument('--crossner_domains', nargs='+', type=str, help='domains to use for CrossNER task')
@@ -197,8 +191,6 @@ def bootleg_dump_entities(args, logger):

bootleg_shared_kwargs = {
'subsample': args.subsample,
-'skip_cache': args.skip_cache,
-'cache_input_data': args.cache_input_data,
'num_workers': args.num_workers,
'all_dirs': args.train_src_languages,
'crossner_domains': args.crossner_domains,
@@ -212,7 +204,6 @@ def bootleg_dump_entities(args, logger):

kwargs = {'train': None, 'validation': None, 'test': None}
kwargs.update(bootleg_shared_kwargs)
-kwargs['cached_path'] = os.path.join(args.cache, task.name)
for split in args.bootleg_data_splits:
if split == 'train':
del kwargs['train'] # deleting keys means use the default file name
109 changes: 46 additions & 63 deletions genienlp/tasks/almond_dataset.py
@@ -32,8 +32,6 @@
import multiprocessing as mp
import os

-import torch

from ..data_utils.almond_utils import chunk_file, create_examples_from_file
from .base_dataset import Split
from .generic_dataset import CQA
@@ -48,73 +46,58 @@ class AlmondDataset(CQA):

def __init__(self, path, *, make_example, **kwargs):

-# TODO fix cache_path for multilingual task
subsample = kwargs.get('subsample')
-cached_path = kwargs.get('cached_path')
-
-skip_cache = kwargs.get('skip_cache', True)
-cache_input_data = kwargs.get('cache_input_data', False)
num_workers = kwargs.get('num_workers', 0)

-cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample))
dir_name = os.path.basename(os.path.dirname(path))

-if os.path.exists(cache_name) and not skip_cache:
-logger.info(f'Loading cached data from {cache_name}')
-examples = torch.load(cache_name)
+n = 0
+with open(path, 'r', encoding='utf-8') as fp:
+for line in fp:
+n += 1
+
+max_examples = min(n, subsample) if subsample is not None else n
+if num_workers > 0:
+num_processes = min(num_workers, int(mp.cpu_count()))
+logger.info(f'Using {num_processes} workers...')
+chunk_size = int(math.ceil(max_examples / num_processes))
+num_chunks = int(math.ceil(max_examples / chunk_size))
+
+base_path, extension = path.rsplit('.', 1)
+
+chunk_file_paths = [f'{base_path}_{chunk_id}.tsv' for chunk_id in range(num_chunks)]
+chunk_file(path, chunk_file_paths, chunk_size, num_chunks)
+num_processes = min(num_processes, num_chunks)
+
+with mp.Pool(processes=num_processes) as pool:
+process_args = [
+{
+'in_file': chunk_file_paths[i],
+'chunk_size': chunk_size,
+'dir_name': dir_name,
+'example_batch_size': 1,
+'make_process_example': make_example,
+'kwargs': kwargs,
+}
+for i in range(num_chunks)
+]
+results = pool.map(create_examples_from_file, process_args)
+
+# merge all results
+examples = [item for sublist in results for item in sublist]
+
+for file in chunk_file_paths:
+os.remove(file)
else:
-n = 0
-with open(path, 'r', encoding='utf-8') as fp:
-for line in fp:
-n += 1
-
-max_examples = min(n, subsample) if subsample is not None else n
-if num_workers > 0:
-num_processes = min(num_workers, int(mp.cpu_count()))
-logger.info(f'Using {num_processes} workers...')
-chunk_size = int(math.ceil(max_examples / num_processes))
-num_chunks = int(math.ceil(max_examples / chunk_size))
-
-base_path, extension = path.rsplit('.', 1)
-
-chunk_file_paths = [f'{base_path}_{chunk_id}.tsv' for chunk_id in range(num_chunks)]
-chunk_file(path, chunk_file_paths, chunk_size, num_chunks)
-num_processes = min(num_processes, num_chunks)
-
-with mp.Pool(processes=num_processes) as pool:
-process_args = [
-{
-'in_file': chunk_file_paths[i],
-'chunk_size': chunk_size,
-'dir_name': dir_name,
-'example_batch_size': 1,
-'make_process_example': make_example,
-'kwargs': kwargs,
-}
-for i in range(num_chunks)
-]
-results = pool.map(create_examples_from_file, process_args)
-
-# merge all results
-examples = [item for sublist in results for item in sublist]
-
-for file in chunk_file_paths:
-os.remove(file)
-else:
-process_args = {
-'in_file': path,
-'chunk_size': max_examples,
-'dir_name': dir_name,
-'example_batch_size': 1,
-'make_process_example': make_example,
-'kwargs': kwargs,
-}
-examples = create_examples_from_file(process_args)
-
-if cache_input_data:
-os.makedirs(os.path.dirname(cache_name), exist_ok=True)
-logger.info(f'Caching data to {cache_name}')
-torch.save(examples, cache_name)
+process_args = {
+'in_file': path,
+'chunk_size': max_examples,
+'dir_name': dir_name,
+'example_batch_size': 1,
+'make_process_example': make_example,
+'kwargs': kwargs,
+}
+examples = create_examples_from_file(process_args)

super().__init__(examples, **kwargs)

31 changes: 10 additions & 21 deletions genienlp/tasks/generic_dataset.py
@@ -33,7 +33,6 @@
import os
from typing import Iterable

-import torch
import ujson
from datasets import load_dataset

@@ -97,27 +96,17 @@ def __init__(self, examples, sort_key_fn=input_then_output_len, batch_size_fn=al
class JSON(CQA):
name = 'json'

-def __init__(self, path, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs):
-cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample))
+def __init__(self, path, subsample=None, lower=False, **kwargs):

examples = []
-if os.path.exists(cache_name) and not skip_cache:
-logger.info(f'Loading cached data from {cache_name}')
-examples = torch.load(cache_name)
-else:
-with open(os.path.expanduser(path)) as f:
-lines = f.readlines()
-for line in lines:
-ex = json.loads(line)
-context, question, answer = ex['context'], ex['question'], ex['answer']
-examples.append(
-Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower)
-)
-if subsample is not None and len(examples) >= subsample:
-break
-os.makedirs(os.path.dirname(cache_name), exist_ok=True)
-logger.info(f'Caching data to {cache_name}')
-torch.save(examples, cache_name)
+with open(os.path.expanduser(path)) as f:
+lines = f.readlines()
+for line in lines:
+ex = json.loads(line)
+context, question, answer = ex['context'], ex['question'], ex['answer']
+examples.append(Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower))
+if subsample is not None and len(examples) >= subsample:
+break

super(JSON, self).__init__(examples, **kwargs)

@@ -235,7 +224,7 @@ class OODDataset(CQA):
name = 'ood'
is_sequence_classification = True

-def __init__(self, path, lower=False, cached_path=None, skip_cache=False, **kwargs):
+def __init__(self, path, lower=False, **kwargs):
examples = []
question = 'Is this sentence in-domain or out-domain?'

13 changes: 0 additions & 13 deletions genienlp/tasks/hf_dataset.py
@@ -28,10 +28,8 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import logging
-import os

import datasets
-import torch
from datasets import load_dataset

from ..tasks.generic_dataset import CQA
@@ -46,24 +44,13 @@ class HFDataset(CQA):
def __init__(self, data, make_example, **kwargs):

subsample = kwargs.get('subsample')
-skip_cache = kwargs.pop('kwargs', True)

-cache_name = os.path.join(os.path.dirname(data.cache_files[0]['filename']), data.split._name, str(subsample))
examples = []

-if os.path.exists(cache_name) and not skip_cache:
-logger.info(f'Loading cached data from {cache_name}')
-examples = torch.load(cache_name)
for ex in data:
examples.append(make_example(ex, **kwargs))

if subsample is not None and len(examples) >= subsample:
break

-os.makedirs(os.path.dirname(cache_name), exist_ok=True)
-logger.info(f'Caching data to {cache_name}')
-torch.save(examples, cache_name)

super().__init__(examples, **kwargs)

@classmethod
4 changes: 0 additions & 4 deletions genienlp/train.py
@@ -87,8 +87,6 @@ def prepare_data(args, logger):

train_eval_shared_kwargs = {
'subsample': args.subsample,
-'skip_cache': args.skip_cache,
-'cache_input_data': args.cache_input_data,
'num_workers': args.num_workers,
}

Expand All @@ -99,7 +97,6 @@ def prepare_data(args, logger):
kwargs['train'] = args.train_set_name
kwargs.update(train_eval_shared_kwargs)
kwargs['all_dirs'] = args.train_src_languages
-kwargs['cached_path'] = os.path.join(args.cache, task.name)
kwargs['crossner_domains'] = args.crossner_domains
if args.use_curriculum:
kwargs['curriculum'] = True
@@ -144,7 +141,6 @@ def prepare_data(args, logger):
kwargs['validation'] = args.eval_set_name
kwargs.update(train_eval_shared_kwargs)
kwargs['all_dirs'] = args.eval_src_languages
-kwargs['cached_path'] = os.path.join(args.cache, task.name)
kwargs['crossner_domains'] = args.crossner_domains
kwargs['hf_test_overfit'] = args.hf_test_overfit

4 changes: 2 additions & 2 deletions tests/test_NED.sh
@@ -15,10 +15,10 @@ for hparams in \
do

# train
-genienlp train --train_tasks almond_dialogue_nlu --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --bootleg_output_dir $SRCDIR/dataset/thingpedia_99/bootleg/ --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit --do_ned --min_entity_len 2 --max_entity_len 4 $hparams
+genienlp train --train_tasks almond_dialogue_nlu --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --bootleg_output_dir $SRCDIR/dataset/thingpedia_99/bootleg/ --exist_ok --embeddings $EMBEDDING_DIR --no_commit --do_ned --min_entity_len 2 --max_entity_len 4 $hparams

# greedy prediction
-genienlp predict --tasks almond_dialogue_nlu --evaluate valid --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --embeddings $EMBEDDING_DIR --skip_cache
+genienlp predict --tasks almond_dialogue_nlu --evaluate valid --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --embeddings $EMBEDDING_DIR

# check if result file exists
if test ! -f $workdir/model_$i/eval_results/valid/almond_dialogue_nlu.tsv ; then
4 changes: 2 additions & 2 deletions tests/test_calibration.sh
@@ -9,10 +9,10 @@ for hparams in \
do

# train
-genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit
+genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --embeddings $EMBEDDING_DIR --no_commit

# greedy prediction
-genienlp predict --tasks almond --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR --skip_cache --save_confidence_features --confidence_feature_path $workdir/model_$i/confidences.pkl --mc_dropout_num 10
+genienlp predict --tasks almond --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR --save_confidence_features --confidence_feature_path $workdir/model_$i/confidences.pkl --mc_dropout_num 10

# check if confidence file exists
if test ! -f $workdir/model_$i/confidences.pkl ; then
2 changes: 1 addition & 1 deletion tests/test_cuda.sh
@@ -9,7 +9,7 @@ for hparams in \
do

# train
-genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 2 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit
+genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 2 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --embeddings $EMBEDDING_DIR --no_commit

# generate a long sequence
long_sequence=''