
Commit 33a038b
Remove data caching
it's almost never been used
Mehrad0711 committed Feb 28, 2022
1 parent 251b086 commit 33a038b
Showing 19 changed files with 81 additions and 144 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -122,7 +122,7 @@ genienlp run-paraphrase --model_name_or_path <model_dir> --temperature 0.3 --rep
Use the following command for training/ finetuning an NMT model:

```bash
-genienlp train --train_tasks almond_translate --data <data_directory> --train_languages <src_lang> --eval_languages <tgt_lang> --no_commit --train_iterations <iterations> --preserve_case --save <save_dir> --exist_ok --skip_cache --model TransformerSeq2Seq --pretrained_model <hf_model_name>
+genienlp train --train_tasks almond_translate --data <data_directory> --train_languages <src_lang> --eval_languages <tgt_lang> --no_commit --train_iterations <iterations> --preserve_case --save <save_dir> --exist_ok --model TransformerSeq2Seq --pretrained_model <hf_model_name>
```

We currently support MarianMT, MBART, MT5, and M2M100 models.<br>
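For instance, a MarianMT finetuning run might look like the following sketch (the Helsinki-NLP/opus-mt-en-de model id and the en→de language pair are only illustrative choices, not requirements):

```bash
genienlp train --train_tasks almond_translate --data <data_directory> --train_languages en --eval_languages de --no_commit --train_iterations <iterations> --preserve_case --save <save_dir> --exist_ok --model TransformerSeq2Seq --pretrained_model Helsinki-NLP/opus-mt-en-de
```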
@@ -131,7 +131,7 @@ To save a pretrained model in genienlp format without any finetuning, set train_
To produce translations for an eval/ test set run the following command:

```bash
-genienlp predict --tasks almond_translate --data <data_directory> --pred_languages <src_lang> --pred_tgt_languages <tgt_lang> --path <path_to_saved_model> --eval_dir <eval_dir> --skip_cache --val_batch_size 4000 --evaluate <valid/test> --overwrite --silent
+genienlp predict --tasks almond_translate --data <data_directory> --pred_languages <src_lang> --pred_tgt_languages <tgt_lang> --path <path_to_saved_model> --eval_dir <eval_dir> --val_batch_size 4000 --evaluate <valid/test> --overwrite --silent
```

If your dataset is a document or contains long examples, pass `--translate_example_split` to break the examples down into individual sentences before translation for better results. <br>
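As a sketch of that long-input case, this is the predict command above with the extra flag appended (same placeholders as before):

```bash
genienlp predict --tasks almond_translate --data <data_directory> --pred_languages <src_lang> --pred_tgt_languages <tgt_lang> --path <path_to_saved_model> --eval_dir <eval_dir> --val_batch_size 4000 --evaluate <valid/test> --overwrite --silent --translate_example_split
```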
5 changes: 0 additions & 5 deletions genienlp/arguments.py
@@ -67,7 +67,6 @@ def parse_argv(parser):
parser.add_argument('--data', default='.data/', type=str, help='where to load data from.')
parser.add_argument('--save', required=True, type=str, help='where to save results.')
parser.add_argument('--embeddings', default='.embeddings/', type=str, help='where to save embeddings.')
-parser.add_argument('--cache', default='.cache/', type=str, help='where to save cached files')

parser.add_argument(
'--train_languages',
@@ -409,10 +408,6 @@ def parse_argv(parser):
help='Ignore all conditions and use fast version of huggingface tokenizer',
)

-parser.add_argument('--skip_cache', action='store_true', help='whether to use existing cached splits or generate new ones')
-parser.add_argument(
-'--cache_input_data', action='store_true', help='Cache examples from input data for faster subsequent trainings'
-)
parser.add_argument('--use_curriculum', action='store_true', help='Use curriculum learning')
parser.add_argument(
'--aux_dataset', default='', type=str, help='path to auxiliary dataset (ignored if curriculum is not used)'
4 changes: 0 additions & 4 deletions genienlp/predict.py
@@ -100,9 +100,7 @@ def parse_argv(parser):
parser.add_argument('--overwrite', action='store_true', help='whether to overwrite previously written predictions')
parser.add_argument('--silent', action='store_true', help='whether to print predictions to stdout')

-parser.add_argument('--skip_cache', action='store_true', help='whether use exisiting cached splits or generate new ones')
parser.add_argument('--eval_dir', type=str, required=True, help='use this directory to store eval results')
-parser.add_argument('--cache', default='.cache', type=str, help='where to save cached files')
parser.add_argument('--subsample', default=20000000, type=int, help='subsample the eval/test datasets')

parser.add_argument(
@@ -331,9 +329,7 @@ def prepare_data(args):

kwargs.update(
{
-'skip_cache': args.skip_cache,
'subsample': args.subsample,
-'cached_path': os.path.join(args.cache, task.name),
'all_dirs': task_languages,
'num_workers': args.num_workers,
'src_lang': src_lang,
9 changes: 0 additions & 9 deletions genienlp/run_bootleg.py
@@ -48,7 +48,6 @@ def parse_argv(parser):
parser.add_argument('--save', required=True, type=str, help='where to save results.')
parser.add_argument('--embeddings', default='.embeddings/', type=str, help='where to save embeddings.')
parser.add_argument('--data', default='.data/', type=str, help='where to load data from.')
-parser.add_argument('--cache', default='.cache/', type=str, help='where to save cached files')

parser.add_argument(
'--train_languages',
@@ -177,11 +176,6 @@ def parse_argv(parser):
'--exist_ok', action='store_true', help='Ok if the save directory already exists, i.e. overwrite is ok'
)

-parser.add_argument('--skip_cache', action='store_true', help='whether to use existing cached splits or generate new ones')
-parser.add_argument(
-'--cache_input_data', action='store_true', help='Cache examples from input data for faster subsequent trainings'
-)

# token classification task args
parser.add_argument('--num_labels', type=int, help='num_labels for classification tasks')
parser.add_argument('--crossner_domains', nargs='+', type=str, help='domains to use for CrossNER task')
@@ -197,8 +191,6 @@ def bootleg_dump_entities(args, logger):

bootleg_shared_kwargs = {
'subsample': args.subsample,
-'skip_cache': args.skip_cache,
-'cache_input_data': args.cache_input_data,
'num_workers': args.num_workers,
'all_dirs': args.train_src_languages,
'crossner_domains': args.crossner_domains,
@@ -212,7 +204,6 @@ def bootleg_dump_entities(args, logger):

kwargs = {'train': None, 'validation': None, 'test': None}
kwargs.update(bootleg_shared_kwargs)
-kwargs['cached_path'] = os.path.join(args.cache, task.name)
for split in args.bootleg_data_splits:
if split == 'train':
del kwargs['train'] # deleting keys means use the default file name
109 changes: 46 additions & 63 deletions genienlp/tasks/almond_dataset.py
@@ -32,8 +32,6 @@
import multiprocessing as mp
import os

-import torch

from ..data_utils.almond_utils import chunk_file, create_examples_from_file
from .base_dataset import Split
from .generic_dataset import CQA
@@ -48,73 +46,58 @@ class AlmondDataset(CQA):

def __init__(self, path, *, make_example, **kwargs):

-# TODO fix cache_path for multilingual task
subsample = kwargs.get('subsample')
-cached_path = kwargs.get('cached_path')
-
-skip_cache = kwargs.get('skip_cache', True)
-cache_input_data = kwargs.get('cache_input_data', False)
num_workers = kwargs.get('num_workers', 0)

-cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample))
dir_name = os.path.basename(os.path.dirname(path))

-if os.path.exists(cache_name) and not skip_cache:
-logger.info(f'Loading cached data from {cache_name}')
-examples = torch.load(cache_name)
+n = 0
+with open(path, 'r', encoding='utf-8') as fp:
+for line in fp:
+n += 1
+
+max_examples = min(n, subsample) if subsample is not None else n
+if num_workers > 0:
+num_processes = min(num_workers, int(mp.cpu_count()))
+logger.info(f'Using {num_processes} workers...')
+chunk_size = int(math.ceil(max_examples / num_processes))
+num_chunks = int(math.ceil(max_examples / chunk_size))
+
+base_path, extension = path.rsplit('.', 1)
+
+chunk_file_paths = [f'{base_path}_{chunk_id}.tsv' for chunk_id in range(num_chunks)]
+chunk_file(path, chunk_file_paths, chunk_size, num_chunks)
+num_processes = min(num_processes, num_chunks)
+
+with mp.Pool(processes=num_processes) as pool:
+process_args = [
+{
+'in_file': chunk_file_paths[i],
+'chunk_size': chunk_size,
+'dir_name': dir_name,
+'example_batch_size': 1,
+'make_process_example': make_example,
+'kwargs': kwargs,
+}
+for i in range(num_chunks)
+]
+results = pool.map(create_examples_from_file, process_args)
+
+# merge all results
+examples = [item for sublist in results for item in sublist]
+
+for file in chunk_file_paths:
+os.remove(file)
else:
-n = 0
-with open(path, 'r', encoding='utf-8') as fp:
-for line in fp:
-n += 1
-
-max_examples = min(n, subsample) if subsample is not None else n
-if num_workers > 0:
-num_processes = min(num_workers, int(mp.cpu_count()))
-logger.info(f'Using {num_processes} workers...')
-chunk_size = int(math.ceil(max_examples / num_processes))
-num_chunks = int(math.ceil(max_examples / chunk_size))
-
-base_path, extension = path.rsplit('.', 1)
-
-chunk_file_paths = [f'{base_path}_{chunk_id}.tsv' for chunk_id in range(num_chunks)]
-chunk_file(path, chunk_file_paths, chunk_size, num_chunks)
-num_processes = min(num_processes, num_chunks)
-
-with mp.Pool(processes=num_processes) as pool:
-process_args = [
-{
-'in_file': chunk_file_paths[i],
-'chunk_size': chunk_size,
-'dir_name': dir_name,
-'example_batch_size': 1,
-'make_process_example': make_example,
-'kwargs': kwargs,
-}
-for i in range(num_chunks)
-]
-results = pool.map(create_examples_from_file, process_args)
-
-# merge all results
-examples = [item for sublist in results for item in sublist]
-
-for file in chunk_file_paths:
-os.remove(file)
-else:
-process_args = {
-'in_file': path,
-'chunk_size': max_examples,
-'dir_name': dir_name,
-'example_batch_size': 1,
-'make_process_example': make_example,
-'kwargs': kwargs,
-}
-examples = create_examples_from_file(process_args)
-
-if cache_input_data:
-os.makedirs(os.path.dirname(cache_name), exist_ok=True)
-logger.info(f'Caching data to {cache_name}')
-torch.save(examples, cache_name)
+process_args = {
+'in_file': path,
+'chunk_size': max_examples,
+'dir_name': dir_name,
+'example_batch_size': 1,
+'make_process_example': make_example,
+'kwargs': kwargs,
+}
+examples = create_examples_from_file(process_args)

super().__init__(examples, **kwargs)

31 changes: 10 additions & 21 deletions genienlp/tasks/generic_dataset.py
@@ -33,7 +33,6 @@
import os
from typing import Iterable

-import torch
import ujson
from datasets import load_dataset

@@ -97,27 +96,17 @@ def __init__(self, examples, sort_key_fn=input_then_output_len, batch_size_fn=al
class JSON(CQA):
name = 'json'

-def __init__(self, path, subsample=None, lower=False, cached_path=None, skip_cache=False, **kwargs):
-cache_name = os.path.join(cached_path, os.path.basename(path), str(subsample))
+def __init__(self, path, subsample=None, lower=False, **kwargs):

examples = []
-if os.path.exists(cache_name) and not skip_cache:
-logger.info(f'Loading cached data from {cache_name}')
-examples = torch.load(cache_name)
-else:
-with open(os.path.expanduser(path)) as f:
-lines = f.readlines()
-for line in lines:
-ex = json.loads(line)
-context, question, answer = ex['context'], ex['question'], ex['answer']
-examples.append(
-Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower)
-)
-if subsample is not None and len(examples) >= subsample:
-break
-os.makedirs(os.path.dirname(cache_name), exist_ok=True)
-logger.info(f'Caching data to {cache_name}')
-torch.save(examples, cache_name)
+with open(os.path.expanduser(path)) as f:
+lines = f.readlines()
+for line in lines:
+ex = json.loads(line)
+context, question, answer = ex['context'], ex['question'], ex['answer']
+examples.append(Example.from_raw(make_example_id(self, len(examples)), context, question, answer, lower=lower))
+if subsample is not None and len(examples) >= subsample:
+break

super(JSON, self).__init__(examples, **kwargs)

@@ -235,7 +224,7 @@ class OODDataset(CQA):
name = 'ood'
is_sequence_classification = True

-def __init__(self, path, lower=False, cached_path=None, skip_cache=False, **kwargs):
+def __init__(self, path, lower=False, **kwargs):
examples = []
question = 'Is this sentence in-domain or out-domain?'

13 changes: 0 additions & 13 deletions genienlp/tasks/hf_dataset.py
@@ -28,10 +28,8 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import logging
-import os

import datasets
-import torch
from datasets import load_dataset

from ..tasks.generic_dataset import CQA
@@ -46,24 +44,13 @@ class HFDataset(CQA):
def __init__(self, data, make_example, **kwargs):

subsample = kwargs.get('subsample')
-skip_cache = kwargs.pop('kwargs', True)

-cache_name = os.path.join(os.path.dirname(data.cache_files[0]['filename']), data.split._name, str(subsample))
examples = []

-if os.path.exists(cache_name) and not skip_cache:
-logger.info(f'Loading cached data from {cache_name}')
-examples = torch.load(cache_name)
for ex in data:
examples.append(make_example(ex, **kwargs))

if subsample is not None and len(examples) >= subsample:
break

-os.makedirs(os.path.dirname(cache_name), exist_ok=True)
-logger.info(f'Caching data to {cache_name}')
-torch.save(examples, cache_name)

super().__init__(examples, **kwargs)

@classmethod
4 changes: 0 additions & 4 deletions genienlp/train.py
@@ -87,8 +87,6 @@ def prepare_data(args, logger):

train_eval_shared_kwargs = {
'subsample': args.subsample,
-'skip_cache': args.skip_cache,
-'cache_input_data': args.cache_input_data,
'num_workers': args.num_workers,
}

Expand All @@ -99,7 +97,6 @@ def prepare_data(args, logger):
kwargs['train'] = args.train_set_name
kwargs.update(train_eval_shared_kwargs)
kwargs['all_dirs'] = args.train_src_languages
-kwargs['cached_path'] = os.path.join(args.cache, task.name)
kwargs['crossner_domains'] = args.crossner_domains
if args.use_curriculum:
kwargs['curriculum'] = True
@@ -144,7 +141,6 @@ def prepare_data(args, logger):
kwargs['validation'] = args.eval_set_name
kwargs.update(train_eval_shared_kwargs)
kwargs['all_dirs'] = args.eval_src_languages
-kwargs['cached_path'] = os.path.join(args.cache, task.name)
kwargs['crossner_domains'] = args.crossner_domains
kwargs['hf_test_overfit'] = args.hf_test_overfit

4 changes: 2 additions & 2 deletions tests/test_NED.sh
@@ -15,10 +15,10 @@ for hparams in \
do

# train
-genienlp train --train_tasks almond_dialogue_nlu --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --bootleg_output_dir $SRCDIR/dataset/thingpedia_99/bootleg/ --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit --do_ned --min_entity_len 2 --max_entity_len 4 $hparams
+genienlp train --train_tasks almond_dialogue_nlu --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --bootleg_output_dir $SRCDIR/dataset/thingpedia_99/bootleg/ --exist_ok --embeddings $EMBEDDING_DIR --no_commit --do_ned --min_entity_len 2 --max_entity_len 4 $hparams

# greedy prediction
-genienlp predict --tasks almond_dialogue_nlu --evaluate valid --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --embeddings $EMBEDDING_DIR --skip_cache
+genienlp predict --tasks almond_dialogue_nlu --evaluate valid --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --database_dir $SRCDIR/database/ --data $SRCDIR/dataset/thingpedia_99/ --embeddings $EMBEDDING_DIR

# check if result file exists
if test ! -f $workdir/model_$i/eval_results/valid/almond_dialogue_nlu.tsv ; then
4 changes: 2 additions & 2 deletions tests/test_calibration.sh
@@ -9,10 +9,10 @@ for hparams in \
do

# train
-genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit
+genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 6 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --embeddings $EMBEDDING_DIR --no_commit

# greedy prediction
-genienlp predict --tasks almond --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR --skip_cache --save_confidence_features --confidence_feature_path $workdir/model_$i/confidences.pkl --mc_dropout_num 10
+genienlp predict --tasks almond --evaluate test --path $workdir/model_$i --overwrite --eval_dir $workdir/model_$i/eval_results/ --data $SRCDIR/dataset/ --embeddings $EMBEDDING_DIR --save_confidence_features --confidence_feature_path $workdir/model_$i/confidences.pkl --mc_dropout_num 10

# check if confidence file exists
if test ! -f $workdir/model_$i/confidences.pkl ; then
2 changes: 1 addition & 1 deletion tests/test_cuda.sh
@@ -9,7 +9,7 @@ for hparams in \
do

# train
-genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 2 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --skip_cache --embeddings $EMBEDDING_DIR --no_commit
+genienlp train --train_tasks almond --train_batch_tokens 100 --val_batch_size 100 --train_iterations 2 --preserve_case --save_every 2 --log_every 2 --val_every 2 --save $workdir/model_$i --data $SRCDIR/dataset/ $hparams --exist_ok --embeddings $EMBEDDING_DIR --no_commit

# generate a long sequence
long_sequence=''