Commit ba92661

make release-tag: Merge branch 'master' into stable

katxiao committed May 21, 2021
2 parents 2b06387 + a1f1903 commit ba92661

Showing 14 changed files with 743 additions and 47 deletions.
15 changes: 15 additions & 0 deletions HISTORY.md
@@ -1,5 +1,20 @@
# History

## v0.3.1 - 2021-05-20

This release adds new features to store results and cache contents in an S3 bucket,
as well as a script that collects results from a cache directory and compiles a
single results CSV file. A short usage sketch follows the issue list below.

### Issues closed

* Collect cached results from s3 bucket - [Issue #85](https://github.com/sdv-dev/SDGym/issues/85) by @katxiao
* Store cache contents into an S3 bucket - [Issue #81](https://github.com/sdv-dev/SDGym/issues/81) by @katxiao
* Store SDGym results into an S3 bucket - [Issue #80](https://github.com/sdv-dev/SDGym/issues/80) by @katxiao
* Add a way to collect cached results - [Issue #79](https://github.com/sdv-dev/SDGym/issues/79) by @katxiao
* Allow reading datasets from private s3 bucket - [Issue #74](https://github.com/sdv-dev/SDGym/issues/74) by @katxiao
* Typos in the sdgym.run function docstring documentation - [Issue #69](https://github.com/sdv-dev/SDGym/issues/69) by @sbrugman
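
A minimal sketch of the new workflow (the bucket name and paths are placeholders, and the synthesizer/dataset names are only examples):

```python
import sdgym

# Run the benchmark, caching per-run results in an S3 bucket
# (a local directory also works).
sdgym.run(
    synthesizers=['Identity'],
    datasets=['adult'],
    cache_dir='s3://my-sdgym-bucket/cache',
    aws_key='MY_ACCESS_KEY_ID',
    aws_secret='MY_SECRET_ACCESS_KEY',
)

# Later, compile everything found in the cache into a single results CSV.
sdgym.collect_results(
    's3://my-sdgym-bucket/cache',
    output_file='s3://my-sdgym-bucket/results.csv',
    aws_key='MY_ACCESS_KEY_ID',
    aws_secret='MY_SECRET_ACCESS_KEY',
)
```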

## v0.3.0 - 2021-01-27

Major rework of the SDGym functionality to support a collection of new features:
2 changes: 1 addition & 1 deletion conda/meta.yaml
@@ -1,5 +1,5 @@
{% set name = 'sdgym' %}
-{% set version = '0.3.0' %}
+{% set version = '0.3.1.dev3' %}

package:
name: "{{ name|lower }}"
4 changes: 3 additions & 1 deletion sdgym/__init__.py
@@ -8,10 +8,11 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = '[email protected]'
__license__ = 'MIT'
-__version__ = '0.3.0'
+__version__ = '0.3.1.dev3'

from sdgym import benchmark, results, synthesizers
from sdgym.benchmark import run
from sdgym.collect import collect_results
from sdgym.datasets import load_dataset

__all__ = [
@@ -20,4 +21,5 @@
'results',
'run',
'load_dataset',
'collect_results'
]
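
With this export in place, the collector is available at the package top level. A quick check (assuming the package at this commit is installed):

```python
import sdgym

# Both symbols are provided by this commit.
print(sdgym.__version__)      # 0.3.1.dev3
print(sdgym.collect_results)  # <function collect_results ...>
```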
47 changes: 41 additions & 6 deletions sdgym/__main__.py
@@ -86,12 +86,15 @@ def _run(args):
datasets_path=args.datasets_path,
modalities=args.modalities,
metrics=args.metrics,
bucket=args.bucket,
iterations=args.iterations,
cache_dir=args.cache_dir,
workers=workers,
show_progress=args.progress,
timeout=args.timeout,
output_path=args.output_path,
aws_key=args.aws_key,
aws_secret=args.aws_secret,
)

if args.groupby:
@@ -105,10 +108,12 @@ def _download_datasets(args):
_env_setup(args.logfile, args.verbose)
datasets = args.datasets
if not datasets:
-datasets = sdgym.datasets.get_available_datasets(args.bucket)['name']
+datasets = sdgym.datasets.get_available_datasets(
+    args.bucket, args.aws_key, args.aws_secret)['name']

for dataset in tqdm.tqdm(datasets):
-sdgym.datasets.load_dataset(dataset, args.datasets_path, args.bucket)
+sdgym.datasets.load_dataset(
+    dataset, args.datasets_path, args.bucket, args.aws_key, args.aws_secret)


def _list_downloaded(args):
@@ -118,10 +123,14 @@ def _list_downloaded(args):


def _list_available(args):
-datasets = sdgym.datasets.get_available_datasets(args.bucket)
+datasets = sdgym.datasets.get_available_datasets(args.bucket, args.aws_key, args.aws_secret)
_print_table(datasets, args.sort, args.reverse, {'size': humanfriendly.format_size})


def _collect(args):
sdgym.collect.collect_results(args.input_path, args.output_file, args.aws_key, args.aws_secret)


def _get_parser():
parser = argparse.ArgumentParser(description='SDGym Command Line Interface')
parser.set_defaults(action=None)
@@ -140,6 +149,8 @@ def _get_parser():
help='Synthesizer/s to be benchmarked. Accepts multiple names.')
run.add_argument('-m', '--metrics', nargs='+',
help='Metrics to apply. Accepts multiple names.')
run.add_argument('-b', '--bucket',
help='Bucket from which to download the datasets.')
run.add_argument('-d', '--datasets', nargs='+',
help='Datasets/s to be used. Accepts multiple names.')
run.add_argument('-dp', '--datasets-path',
@@ -163,7 +174,11 @@ def _get_parser():
run.add_argument('-t', '--timeout', type=int,
help='Maximum seconds to run for each dataset.')
run.add_argument('-g', '--groupby', nargs='+',
-help='Group scores leaderboard by the given fields')
+help='Group scores leaderboard by the given fields.')
run.add_argument('-ak', '--aws-key', type=str, required=False,
help='AWS access key ID to use when reading datasets.')
run.add_argument('-as', '--aws-secret', type=str, required=False,
help='AWS secret access key to use when reading datasets.')

# download-datasets
download = action.add_parser('download-datasets', help='Download datasets.')
@@ -178,8 +193,12 @@ def _get_parser():
help='Be verbose. Repeat for increased verbosity.')
download.add_argument('-l', '--logfile', type=str,
help='Name of the log file.')
download.add_argument('-ak', '--aws-key', type=str, required=False,
help='AWS access key ID to use when reading datasets.')
download.add_argument('-as', '--aws-secret', type=str, required=False,
help='AWS secret access key to use when reading datasets.')

-# list-available-datasets
+# list-downloaded-datasets
list_downloaded = action.add_parser('list-downloaded', help='List downloaded datasets.')
list_downloaded.set_defaults(action=_list_downloaded)
list_downloaded.add_argument('-s', '--sort', default='name',
@@ -192,13 +211,29 @@ def _get_parser():
# list-available-datasets
list_available = action.add_parser('list-available',
help='List datasets available for download.')
+list_available.set_defaults(action=_list_available)
list_available.add_argument('-s', '--sort', default='name',
help='Value to sort by (name|size|modality). Defaults to `name`.')
list_available.add_argument('-r', '--reverse', action='store_true',
help='Reverse the order.')
list_available.add_argument('-b', '--bucket',
help='Bucket from which to download the datasets.')
-list_available.set_defaults(action=_list_available)
list_available.add_argument('-ak', '--aws-key', type=str, required=False,
help='AWS access key ID to use when reading datasets.')
list_available.add_argument('-as', '--aws-secret', type=str, required=False,
help='AWS secret access key to use when reading datasets.')

# collect
collect = action.add_parser('collect', help='Collect sdgym results.')
collect.set_defaults(action=_collect)
collect.add_argument('-i', '--input-path', type=str, required=True,
help='Path within which to look for sdgym results.')
collect.add_argument('-o', '--output-file', type=str,
help='Output file containing the collected results.')
collect.add_argument('-ak', '--aws-key', type=str, required=False,
help='AWS access key ID to use when reading results.')
collect.add_argument('-as', '--aws-secret', type=str, required=False,
help='AWS secret access key to use when reading and writing results.')

return parser
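
The handlers above read `args.aws_key` and `args.aws_secret`; argparse derives those attribute names from the `--aws-key`/`--aws-secret` flags by replacing dashes with underscores. A self-contained sketch of that mapping:

```python
import argparse

# Minimal reproduction of the new credential flags (mirrors the parser above).
parser = argparse.ArgumentParser()
parser.add_argument('-ak', '--aws-key', type=str, required=False,
                    help='AWS access key ID to use when reading datasets.')
parser.add_argument('-as', '--aws-secret', type=str, required=False,
                    help='AWS secret access key to use when reading datasets.')

args = parser.parse_args(['-ak', 'MY_KEY_ID', '-as', 'MY_SECRET'])
# Dashes become underscores on the Namespace, which is why the handlers
# can read args.aws_key and args.aws_secret.
print(args.aws_key, args.aws_secret)
```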

37 changes: 25 additions & 12 deletions sdgym/benchmark.py
@@ -17,6 +17,7 @@
from sdgym.errors import SDGymError
from sdgym.metrics import get_metrics
from sdgym.progress import TqdmLogger, progress
from sdgym.s3 import is_s3_path, write_csv, write_file
from sdgym.synthesizers.base import Baseline
from sdgym.utils import (
build_synthesizer, format_exception, get_synthesizers_dict, import_object, used_memory)
@@ -153,7 +154,8 @@ def _run_job(args):
# Reset random seed
np.random.seed()

-synthesizer, metadata, metrics, iteration, cache_dir, timeout, run_id = args
+synthesizer, metadata, metrics, iteration, cache_dir, \
+    timeout, run_id, aws_key, aws_secret = args

name = synthesizer['name']
dataset_name = metadata._metadata['name']
@@ -183,14 +185,16 @@
scores['error'] = output['error']

if cache_dir:
-base_path = str(cache_dir / f'{name}_{dataset_name}_{iteration}_{run_id}')
+cache_dir_name = str(cache_dir)
+base_path = f'{cache_dir_name}/{name}_{dataset_name}_{iteration}_{run_id}'
if scores is not None:
-scores.to_csv(base_path + '_scores.csv', index=False)
+write_csv(scores, f'{base_path}_scores.csv', aws_key, aws_secret)
if 'synthetic_data' in output:
-compress_pickle.dump(output['synthetic_data'], base_path + '.data.gz')
+synthetic_data = compress_pickle.dumps(output['synthetic_data'], compression='gzip')
+write_file(synthetic_data, f'{base_path}.data.gz', aws_key, aws_secret)
if 'exception' in output:
-with open(base_path + '_error.txt', 'w') as error_file:
-    error_file.write(output['exception'])
+exception = output['exception'].encode('utf-8')
+write_file(exception, f'{base_path}_error.txt', aws_key, aws_secret)

return scores
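
The `write_csv` and `write_file` helpers used above come from the new `sdgym/s3.py`, which is among the 14 changed files but not expanded in this view. A minimal sketch of what `is_s3_path` and `write_file` plausibly look like, inferred only from their call sites and assuming boto3 (the real module may differ):

```python
from urllib.parse import urlparse

import boto3


def is_s3_path(path):
    """Return True when the given path points into an S3 bucket."""
    return str(path).startswith('s3://')


def write_file(contents, path, aws_key, aws_secret):
    """Write raw bytes to S3 when given an s3:// path, else to local disk."""
    if is_s3_path(path):
        kwargs = {}
        if aws_key and aws_secret:
            # Otherwise fall back to the default boto3 credential chain.
            kwargs = {
                'aws_access_key_id': aws_key,
                'aws_secret_access_key': aws_secret,
            }

        url = urlparse(path)
        client = boto3.client('s3', **kwargs)
        client.put_object(Bucket=url.netloc, Key=url.path.lstrip('/'), Body=contents)
    else:
        with open(path, 'wb') as output_file:
            output_file.write(contents)
```

`write_csv` would then be a thin wrapper that serializes the DataFrame with `to_csv` and hands the resulting bytes to `write_file`.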

@@ -220,7 +224,7 @@ def _run_on_dask(jobs, verbose):

def run(synthesizers, datasets=None, datasets_path=None, modalities=None, bucket=None,
metrics=None, iterations=1, workers=1, cache_dir=None, show_progress=False,
-timeout=None, output_path=None):
+timeout=None, output_path=None, aws_key=None, aws_secret=None):
"""Run the SDGym benchmark and return a leaderboard.
The ``synthesizers`` object can either be a single synthesizer or an iterable of
@@ -251,7 +255,7 @@ def run(synthesizers, datasets=None, datasets_path=None, modalities=None, bucket
bucket (str):
Name of the bucket from which the datasets must be downloaded if not found locally.
iterations (int):
-Number of iterations to perform over each dataset and synthesizer. Defaults to 3.
+Number of iterations to perform over each dataset and synthesizer. Defaults to 1.
workers (int or str):
If ``workers`` is given as an integer value other than 0 or 1, a multiprocessing
Pool is used to distribute the computation across the indicated number of workers.
@@ -264,7 +268,7 @@
is still running and also recovering results in case the process does not finish
properly. Defaults to ``None``.
show_progress (bool):
-Whether to use tqdm to keep track of the progress. Defaults to ``True``.
+Whether to use tqdm to keep track of the progress. Defaults to ``False``.
timeout (int):
Maximum number of seconds to wait for each dataset to
finish the evaluation process. If not passed, wait until
@@ -273,16 +277,22 @@
If an ``output_path`` is given, the generated leaderboard will be stored in the
indicated path as a CSV file. The given path must be a complete path including
the ``.csv`` filename.
aws_key (str):
If an ``aws_key`` is provided, the given access key ID will be used to read
from the specified bucket.
aws_secret (str):
If an ``aws_secret`` is provided, the given secret access key will be used to read
from the specified bucket.
Returns:
pandas.DataFrame:
A table containing one row per synthesizer + dataset + metric + iteration.
"""
synthesizers = get_synthesizers_dict(synthesizers)
-datasets = get_dataset_paths(datasets, datasets_path, bucket)
+datasets = get_dataset_paths(datasets, datasets_path, bucket, aws_key, aws_secret)
run_id = os.getenv('RUN_ID') or str(uuid.uuid4())[:10]

-if cache_dir:
+if cache_dir and not is_s3_path(cache_dir):
cache_dir = Path(cache_dir)
os.makedirs(cache_dir, exist_ok=True)

@@ -301,6 +311,8 @@
cache_dir,
timeout,
run_id,
aws_key,
aws_secret,
)
jobs.append(args)

@@ -321,7 +333,8 @@ def run(synthesizers, datasets=None, datasets_path=None, modalities=None, bucket
raise SDGymError("No valid Dataset/Synthesizer combination given")

scores = pd.concat(scores)

if output_path:
-scores.to_csv(output_path, index=False)
+write_csv(scores, output_path, aws_key, aws_secret)

return scores
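
Putting the new parameters together, a direct call might look like this (a sketch; the bucket names are placeholders and the synthesizer/dataset names are only examples):

```python
import sdgym

# Benchmark against datasets in a private bucket and write the
# leaderboard CSV directly to S3.
scores = sdgym.run(
    synthesizers=['Uniform', 'Independent'],
    datasets=['adult'],
    iterations=1,
    bucket='my-private-datasets',
    output_path='s3://my-results-bucket/leaderboard.csv',
    aws_key='MY_ACCESS_KEY_ID',
    aws_secret='MY_SECRET_ACCESS_KEY',
)
print(scores.head())
```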
35 changes: 35 additions & 0 deletions sdgym/collect.py
@@ -0,0 +1,35 @@
from sdgym.s3 import read_csv_from_path, write_csv


def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None):
"""Collect the results in the given input directory, and
write all the results into one csv file.
Args:
input_path (str):
The path of the directory that the results files
will be read from.
output_file (str):
If ``output_file`` is provided, the consolidated
results will be written there. Otherwise, they
will be written to ``input_path``/results.csv.
aws_key (str):
If an ``aws_key`` is provided, the given access
key ID will be used to read from and/or write to
any S3 paths.
aws_secret (str):
If an ``aws_secret`` is provided, the given secret
access key will be used to read from and/or write to
any S3 paths.
"""
print(f'Reading results from {input_path}')
scores = read_csv_from_path(input_path, aws_key, aws_secret)
scores = scores.drop_duplicates()

if output_file:
output = output_file
else:
output = f'{input_path}/results.csv'

print(f'Storing results at {output}')
write_csv(scores, output, aws_key, aws_secret)
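
For example, pointing the collector at a cache directory and letting it write the default `results.csv` (the path is a placeholder):

```python
from sdgym.collect import collect_results

# Consolidates the per-run scores CSVs found under the given directory;
# with no output_file, the combined CSV lands at ./sdgym-cache/results.csv.
collect_results('./sdgym-cache')
```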