diff --git a/genienlp/arguments.py b/genienlp/arguments.py index ca9f97a2c..991c6ac4f 100644 --- a/genienlp/arguments.py +++ b/genienlp/arguments.py @@ -534,11 +534,6 @@ def parse_argv(parser): # token classification task args parser.add_argument('--num_labels', type=int, help='num_labels for classification tasks') parser.add_argument('--crossner_domains', nargs='+', type=str, help='domains to use for CrossNER task') - parser.add_argument( - '--hf_test_overfit', - action='store_true', - help='Debugging flag for hf datasets where validation will be performed on train set', - ) parser.add_argument( '--e2e_dialogue_evaluation', diff --git a/genienlp/predict.py b/genienlp/predict.py index 4325c6f23..c372770ba 100644 --- a/genienlp/predict.py +++ b/genienlp/predict.py @@ -315,7 +315,6 @@ def prepare_data(args): if len(args.pred_src_languages) == 1 and len(args.tasks) > 1: args.pred_src_languages *= len(args.tasks) for i, task in enumerate(args.tasks): - task_languages = args.pred_src_languages[i] logger.info(f'Loading {task}') kwargs = {'train': None, 'validation': None, 'test': None} if args.evaluate == 'train': @@ -330,11 +329,9 @@ def prepare_data(args): kwargs.update( { 'subsample': args.subsample, - 'all_dirs': task_languages, 'num_workers': args.num_workers, 'src_lang': src_lang, 'crossner_domains': args.crossner_domains, - 'hf_test_overfit': args.hf_test_overfit, } ) diff --git a/genienlp/run_bootleg.py b/genienlp/run_bootleg.py index 556a3e5e6..175008d34 100644 --- a/genienlp/run_bootleg.py +++ b/genienlp/run_bootleg.py @@ -179,11 +179,6 @@ def parse_argv(parser): # token classification task args parser.add_argument('--num_labels', type=int, help='num_labels for classification tasks') parser.add_argument('--crossner_domains', nargs='+', type=str, help='domains to use for CrossNER task') - parser.add_argument( - '--hf_test_overfit', - action='store_true', - help='Debugging flag for hf datasets where validation will be performed on train set', - ) def bootleg_dump_entities(args, logger): @@ -192,7 +187,6 @@ def bootleg_dump_entities(args, logger): bootleg_shared_kwargs = { 'subsample': args.subsample, 'num_workers': args.num_workers, - 'all_dirs': args.train_src_languages, 'crossner_domains': args.crossner_domains, } diff --git a/genienlp/tasks/generic_dataset.py b/genienlp/tasks/generic_dataset.py index 33a0dfaa6..2721412c8 100644 --- a/genienlp/tasks/generic_dataset.py +++ b/genienlp/tasks/generic_dataset.py @@ -185,16 +185,16 @@ def return_splits(cls, path='.data', train='train', validation='dev', test='test with open(test_path, "r") as fin: test_data = fin.readlines() - # Uncomment for testing - if kwargs.pop("hf_test_overfit", False): - if validation: - validation_path = os.path.join(path, domain, 'train.txt') - with open(validation_path, "r") as fin: - validation_data = fin.readlines() - if test: - test_path = os.path.join(path, domain, 'train.txt') - with open(test_path, "r") as fin: - test_data = fin.readlines() + # Uncomment for debugging + # if True: + # if validation: + # validation_path = os.path.join(path, domain, 'train.txt') + # with open(validation_path, "r") as fin: + # validation_data = fin.readlines() + # if test: + # test_path = os.path.join(path, domain, 'train.txt') + # with open(test_path, "r") as fin: + # test_data = fin.readlines() kwargs['domain'] = domain diff --git a/genienlp/tasks/hf_dataset.py b/genienlp/tasks/hf_dataset.py index 578fce224..fdc6ad355 100644 --- a/genienlp/tasks/hf_dataset.py +++ b/genienlp/tasks/hf_dataset.py @@ -69,14 +69,15 @@ def return_splits(cls, name, root='.data', train='train', validation='validation test_data = load_dataset(name, split='test', cache_dir=root) test_path = test_data.cache_files[0]['filename'] - if kwargs.pop('hf_test_overfit', False): - # override validation/ test data with train data - if validation: - validation_data = load_dataset(name, split='train', cache_dir=root) - validation_path = validation_data.cache_files[0]['filename'] - if test: - test_data = load_dataset(name, split='train', cache_dir=root) - test_path = test_data.cache_files[0]['filename'] + # Uncomment for debugging + # if True: + # # override validation/ test data with train data + # if validation: + # validation_data = load_dataset(name, split='train', cache_dir=root) + # validation_path = validation_data.cache_files[0]['filename'] + # if test: + # test_data = load_dataset(name, split='train', cache_dir=root) + # test_path = test_data.cache_files[0]['filename'] train_data = None if train is None else cls(train_data, **kwargs) validation_data = None if validation is None else cls(validation_data, **kwargs) diff --git a/genienlp/train.py b/genienlp/train.py index 957a5ff00..c71ac23b9 100644 --- a/genienlp/train.py +++ b/genienlp/train.py @@ -96,7 +96,6 @@ def prepare_data(args, logger): kwargs = {'test': None, 'validation': None} kwargs['train'] = args.train_set_name kwargs.update(train_eval_shared_kwargs) - kwargs['all_dirs'] = args.train_src_languages kwargs['crossner_domains'] = args.crossner_domains if args.use_curriculum: kwargs['curriculum'] = True @@ -140,9 +139,7 @@ def prepare_data(args, logger): if args.eval_set_name is not None: kwargs['validation'] = args.eval_set_name kwargs.update(train_eval_shared_kwargs) - kwargs['all_dirs'] = args.eval_src_languages kwargs['crossner_domains'] = args.crossner_domains - kwargs['hf_test_overfit'] = args.hf_test_overfit logger.info(f'Adding {task.name} to validation datasets') splits, paths = task.get_splits(args.data, lower=args.lower, **kwargs) diff --git a/genienlp/util.py b/genienlp/util.py index 5b174ec15..bce36453d 100644 --- a/genienlp/util.py +++ b/genienlp/util.py @@ -862,7 +862,6 @@ def load_config_json(args): 'no_separator', 'num_labels', 'crossner_domains', - 'hf_test_overfit', 'override_valid_metrics', 'eval_src_languages', 'eval_tgt_languages',