Commit ba92661

make release-tag: Merge branch 'master' into stable

katxiao committed May 21, 2021
2 parents 2b06387 + a1f1903 commit ba92661

Showing 14 changed files with 743 additions and 47 deletions.
15 changes: 15 additions & 0 deletions HISTORY.md
@@ -1,5 +1,20 @@
# History

## v0.3.1 - 2021-05-20

This release adds new features to store results and cache contents in an S3 bucket,
as well as a script that collects results from a cache directory and compiles a
single results CSV file. A short usage sketch follows the issue list below.

### Issues closed

* Collect cached results from s3 bucket - [Issue #85](https://github.com/sdv-dev/SDGym/issues/85) by @katxiao
* Store cache contents into an S3 bucket - [Issue #81](https://github.com/sdv-dev/SDGym/issues/81) by @katxiao
* Store SDGym results into an S3 bucket - [Issue #80](https://github.com/sdv-dev/SDGym/issues/80) by @katxiao
* Add a way to collect cached results - [Issue #79](https://github.com/sdv-dev/SDGym/issues/79) by @katxiao
* Allow reading datasets from private s3 bucket - [Issue #74](https://github.com/sdv-dev/SDGym/issues/74) by @katxiao
* Typos in the sdgym.run function docstring documentation - [Issue #69](https://github.com/sdv-dev/SDGym/issues/69) by @sbrugman
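
A minimal sketch of the new workflow (the bucket name and paths are placeholders, and the synthesizer/dataset names are only examples):

```python
import sdgym

# Run the benchmark, caching per-run results in an S3 bucket
# (a local directory also works).
sdgym.run(
    synthesizers=['Identity'],
    datasets=['adult'],
    cache_dir='s3://my-sdgym-bucket/cache',
    aws_key='MY_ACCESS_KEY_ID',
    aws_secret='MY_SECRET_ACCESS_KEY',
)

# Later, compile everything found in the cache into a single results CSV.
sdgym.collect_results(
    's3://my-sdgym-bucket/cache',
    output_file='s3://my-sdgym-bucket/results.csv',
    aws_key='MY_ACCESS_KEY_ID',
    aws_secret='MY_SECRET_ACCESS_KEY',
)
```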

## v0.3.0 - 2021-01-27

Major rework of the SDGym functionality to support a collection of new features:
2 changes: 1 addition & 1 deletion conda/meta.yaml
@@ -1,5 +1,5 @@
{% set name = 'sdgym' %}
-{% set version = '0.3.0' %}
+{% set version = '0.3.1.dev3' %}

package:
name: "{{ name|lower }}"
4 changes: 3 additions & 1 deletion sdgym/__init__.py
@@ -8,10 +8,11 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = '[email protected]'
__license__ = 'MIT'
-__version__ = '0.3.0'
+__version__ = '0.3.1.dev3'

from sdgym import benchmark, results, synthesizers
from sdgym.benchmark import run
from sdgym.collect import collect_results
from sdgym.datasets import load_dataset

__all__ = [
@@ -20,4 +21,5 @@
'results',
'run',
'load_dataset',
'collect_results'
]
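
With this export in place, the collector is available at the package top level. A quick check (assuming the package at this commit is installed):

```python
import sdgym

# Both symbols are provided by this commit.
print(sdgym.__version__)      # 0.3.1.dev3
print(sdgym.collect_results)  # <function collect_results ...>
```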
47 changes: 41 additions & 6 deletions sdgym/__main__.py
@@ -86,12 +86,15 @@ def _run(args):
datasets_path=args.datasets_path,
modalities=args.modalities,
metrics=args.metrics,
bucket=args.bucket,
iterations=args.iterations,
cache_dir=args.cache_dir,
workers=workers,
show_progress=args.progress,
timeout=args.timeout,
output_path=args.output_path,
aws_key=args.aws_key,
aws_secret=args.aws_secret,
)

if args.groupby:
@@ -105,10 +108,12 @@ def _download_datasets(args):
_env_setup(args.logfile, args.verbose)
datasets = args.datasets
if not datasets:
-datasets = sdgym.datasets.get_available_datasets(args.bucket)['name']
+datasets = sdgym.datasets.get_available_datasets(
+    args.bucket, args.aws_key, args.aws_secret)['name']

for dataset in tqdm.tqdm(datasets):
-sdgym.datasets.load_dataset(dataset, args.datasets_path, args.bucket)
+sdgym.datasets.load_dataset(
+    dataset, args.datasets_path, args.bucket, args.aws_key, args.aws_secret)


def _list_downloaded(args):
@@ -118,10 +123,14 @@ def _list_downloaded(args):


def _list_available(args):
-datasets = sdgym.datasets.get_available_datasets(args.bucket)
+datasets = sdgym.datasets.get_available_datasets(args.bucket, args.aws_key, args.aws_secret)
_print_table(datasets, args.sort, args.reverse, {'size': humanfriendly.format_size})


def _collect(args):
sdgym.collect.collect_results(args.input_path, args.output_file, args.aws_key, args.aws_secret)


def _get_parser():
parser = argparse.ArgumentParser(description='SDGym Command Line Interface')
parser.set_defaults(action=None)
@@ -140,6 +149,8 @@ def _get_parser():
help='Synthesizer/s to be benchmarked. Accepts multiple names.')
run.add_argument('-m', '--metrics', nargs='+',
help='Metrics to apply. Accepts multiple names.')
run.add_argument('-b', '--bucket',
help='Bucket from which to download the datasets.')
run.add_argument('-d', '--datasets', nargs='+',
help='Datasets/s to be used. Accepts multiple names.')
run.add_argument('-dp', '--datasets-path',
@@ -163,7 +174,11 @@ def _get_parser():
run.add_argument('-t', '--timeout', type=int,
help='Maximum seconds to run for each dataset.')
run.add_argument('-g', '--groupby', nargs='+',
-help='Group scores leaderboard by the given fields')
+help='Group scores leaderboard by the given fields.')
run.add_argument('-ak', '--aws-key', type=str, required=False,
help='AWS access key ID to use when reading datasets.')
run.add_argument('-as', '--aws-secret', type=str, required=False,
help='AWS secret access key to use when reading datasets.')

# download-datasets
download = action.add_parser('download-datasets', help='Download datasets.')
@@ -178,8 +193,12 @@ def _get_parser():
help='Be verbose. Repeat for increased verbosity.')
download.add_argument('-l', '--logfile', type=str,
help='Name of the log file.')
download.add_argument('-ak', '--aws-key', type=str, required=False,
help='AWS access key ID to use when reading datasets.')
download.add_argument('-as', '--aws-secret', type=str, required=False,
help='AWS secret access key to use when reading datasets.')

-# list-available-datasets
+# list-downloaded-datasets
list_downloaded = action.add_parser('list-downloaded', help='List downloaded datasets.')
list_downloaded.set_defaults(action=_list_downloaded)
list_downloaded.add_argument('-s', '--sort', default='name',
@@ -192,13 +211,29 @@ def _get_parser():
# list-available-datasets
list_available = action.add_parser('list-available',
help='List datasets available for download.')
+list_available.set_defaults(action=_list_available)
list_available.add_argument('-s', '--sort', default='name',
help='Value to sort by (name|size|modality). Defaults to `name`.')
list_available.add_argument('-r', '--reverse', action='store_true',
help='Reverse the order.')
list_available.add_argument('-b', '--bucket',
help='Bucket from which to download the datasets.')
-list_available.set_defaults(action=_list_available)
list_available.add_argument('-ak', '--aws-key', type=str, required=False,
help='AWS access key ID to use when reading datasets.')
list_available.add_argument('-as', '--aws-secret', type=str, required=False,
help='AWS secret access key to use when reading datasets.')

# collect
collect = action.add_parser('collect', help='Collect sdgym results.')
collect.set_defaults(action=_collect)
collect.add_argument('-i', '--input-path', type=str, required=True,
help='Path within which to look for sdgym results.')
collect.add_argument('-o', '--output-file', type=str,
help='Output file containing the collected results.')
collect.add_argument('-ak', '--aws-key', type=str, required=False,
help='AWS access key ID to use when reading results.')
collect.add_argument('-as', '--aws-secret', type=str, required=False,
help='AWS secret access key to use when reading and writing results.')

return parser
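
The handlers above read `args.aws_key` and `args.aws_secret`; argparse derives those attribute names from the `--aws-key`/`--aws-secret` flags by replacing dashes with underscores. A self-contained sketch of that mapping:

```python
import argparse

# Minimal reproduction of the new credential flags (mirrors the parser above).
parser = argparse.ArgumentParser()
parser.add_argument('-ak', '--aws-key', type=str, required=False,
                    help='AWS access key ID to use when reading datasets.')
parser.add_argument('-as', '--aws-secret', type=str, required=False,
                    help='AWS secret access key to use when reading datasets.')

args = parser.parse_args(['-ak', 'MY_KEY_ID', '-as', 'MY_SECRET'])
# Dashes become underscores on the Namespace, which is why the handlers
# can read args.aws_key and args.aws_secret.
print(args.aws_key, args.aws_secret)
```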

37 changes: 25 additions & 12 deletions sdgym/benchmark.py
@@ -17,6 +17,7 @@
from sdgym.errors import SDGymError
from sdgym.metrics import get_metrics
from sdgym.progress import TqdmLogger, progress
from sdgym.s3 import is_s3_path, write_csv, write_file
from sdgym.synthesizers.base import Baseline
from sdgym.utils import (
build_synthesizer, format_exception, get_synthesizers_dict, import_object, used_memory)
@@ -153,7 +154,8 @@ def _run_job(args):
# Reset random seed
np.random.seed()

-synthesizer, metadata, metrics, iteration, cache_dir, timeout, run_id = args
+synthesizer, metadata, metrics, iteration, cache_dir, \
+    timeout, run_id, aws_key, aws_secret = args

name = synthesizer['name']
dataset_name = metadata._metadata['name']
@@ -183,14 +185,16 @@
scores['error'] = output['error']

if cache_dir:
-base_path = str(cache_dir / f'{name}_{dataset_name}_{iteration}_{run_id}')
+cache_dir_name = str(cache_dir)
+base_path = f'{cache_dir_name}/{name}_{dataset_name}_{iteration}_{run_id}'
if scores is not None:
-scores.to_csv(base_path + '_scores.csv', index=False)
+write_csv(scores, f'{base_path}_scores.csv', aws_key, aws_secret)
if 'synthetic_data' in output:
-compress_pickle.dump(output['synthetic_data'], base_path + '.data.gz')
+synthetic_data = compress_pickle.dumps(output['synthetic_data'], compression='gzip')
+write_file(synthetic_data, f'{base_path}.data.gz', aws_key, aws_secret)
if 'exception' in output:
-with open(base_path + '_error.txt', 'w') as error_file:
-    error_file.write(output['exception'])
+exception = output['exception'].encode('utf-8')
+write_file(exception, f'{base_path}_error.txt', aws_key, aws_secret)

return scores
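
The `write_csv` and `write_file` helpers used above come from the new `sdgym/s3.py`, which is among the 14 changed files but not expanded in this view. A minimal sketch of what `is_s3_path` and `write_file` plausibly look like, inferred only from their call sites and assuming boto3 (the real module may differ):

```python
from urllib.parse import urlparse

import boto3


def is_s3_path(path):
    """Return True when the given path points into an S3 bucket."""
    return str(path).startswith('s3://')


def write_file(contents, path, aws_key, aws_secret):
    """Write raw bytes to S3 when given an s3:// path, else to local disk."""
    if is_s3_path(path):
        kwargs = {}
        if aws_key and aws_secret:
            # Otherwise fall back to the default boto3 credential chain.
            kwargs = {
                'aws_access_key_id': aws_key,
                'aws_secret_access_key': aws_secret,
            }

        url = urlparse(path)
        client = boto3.client('s3', **kwargs)
        client.put_object(Bucket=url.netloc, Key=url.path.lstrip('/'), Body=contents)
    else:
        with open(path, 'wb') as output_file:
            output_file.write(contents)
```

`write_csv` would then be a thin wrapper that serializes the DataFrame with `to_csv` and hands the resulting bytes to `write_file`.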

@@ -220,7 +224,7 @@ def _run_on_dask(jobs, verbose):

def run(synthesizers, datasets=None, datasets_path=None, modalities=None, bucket=None,
metrics=None, iterations=1, workers=1, cache_dir=None, show_progress=False,
-timeout=None, output_path=None):
+timeout=None, output_path=None, aws_key=None, aws_secret=None):
"""Run the SDGym benchmark and return a leaderboard.
The ``synthesizers`` object can either be a single synthesizer or an iterable of
@@ -251,7 +255,7 @@ def run(synthesizers, datasets=None, datasets_path=None, modalities=None, bucket
bucket (str):
Name of the bucket from which the datasets must be downloaded if not found locally.
iterations (int):
-Number of iterations to perform over each dataset and synthesizer. Defaults to 3.
+Number of iterations to perform over each dataset and synthesizer. Defaults to 1.
workers (int or str):
If ``workers`` is given as an integer value other than 0 or 1, a multiprocessing
Pool is used to distribute the computation across the indicated number of workers.
@@ -264,7 +268,7 @@
is still running and also recovering results in case the process does not finish
properly. Defaults to ``None``.
show_progress (bool):
-Whether to use tqdm to keep track of the progress. Defaults to ``True``.
+Whether to use tqdm to keep track of the progress. Defaults to ``False``.
timeout (int):
Maximum number of seconds to wait for each dataset to
finish the evaluation process. If not passed, wait until
@@ -273,16 +277,22 @@
If an ``output_path`` is given, the generated leaderboard will be stored in the
indicated path as a CSV file. The given path must be a complete path including
the ``.csv`` filename.
aws_key (str):
If an ``aws_key`` is provided, the given access key ID will be used to read
from the specified bucket.
aws_secret (str):
If an ``aws_secret`` is provided, the given secret access key will be used to read
from the specified bucket.
Returns:
pandas.DataFrame:
A table containing one row per synthesizer + dataset + metric + iteration.
"""
synthesizers = get_synthesizers_dict(synthesizers)
-datasets = get_dataset_paths(datasets, datasets_path, bucket)
+datasets = get_dataset_paths(datasets, datasets_path, bucket, aws_key, aws_secret)
run_id = os.getenv('RUN_ID') or str(uuid.uuid4())[:10]

-if cache_dir:
+if cache_dir and not is_s3_path(cache_dir):
cache_dir = Path(cache_dir)
os.makedirs(cache_dir, exist_ok=True)

@@ -301,6 +311,8 @@
cache_dir,
timeout,
run_id,
aws_key,
aws_secret,
)
jobs.append(args)

@@ -321,7 +333,8 @@ def run(synthesizers, datasets=None, datasets_path=None, modalities=None, bucket
raise SDGymError("No valid Dataset/Synthesizer combination given")

scores = pd.concat(scores)

if output_path:
-scores.to_csv(output_path, index=False)
+write_csv(scores, output_path, aws_key, aws_secret)

return scores
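
Putting the new parameters together, a direct call might look like this (a sketch; the bucket names are placeholders and the synthesizer/dataset names are only examples):

```python
import sdgym

# Benchmark against datasets in a private bucket and write the
# leaderboard CSV directly to S3.
scores = sdgym.run(
    synthesizers=['Uniform', 'Independent'],
    datasets=['adult'],
    iterations=1,
    bucket='my-private-datasets',
    output_path='s3://my-results-bucket/leaderboard.csv',
    aws_key='MY_ACCESS_KEY_ID',
    aws_secret='MY_SECRET_ACCESS_KEY',
)
print(scores.head())
```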
35 changes: 35 additions & 0 deletions sdgym/collect.py
@@ -0,0 +1,35 @@
from sdgym.s3 import read_csv_from_path, write_csv


def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None):
"""Collect the results in the given input directory, and
write all the results into one csv file.
Args:
input_path (str):
The path of the directory that the results files
will be read from.
output_file (str):
If ``output_file`` is provided, the consolidated
results will be written there. Otherwise, they
will be written to ``input_path``/results.csv.
aws_key (str):
If an ``aws_key`` is provided, the given access
key ID will be used to read from and/or write to
any S3 paths.
aws_secret (str):
If an ``aws_secret`` is provided, the given secret
access key will be used to read from and/or write to
any S3 paths.
"""
print(f'Reading results from {input_path}')
scores = read_csv_from_path(input_path, aws_key, aws_secret)
scores = scores.drop_duplicates()

if output_file:
output = output_file
else:
output = f'{input_path}/results.csv'

print(f'Storing results at {output}')
write_csv(scores, output, aws_key, aws_secret)
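
For example, pointing the collector at a cache directory and letting it write the default `results.csv` (the path is a placeholder):

```python
from sdgym.collect import collect_results

# Consolidates the per-run scores CSVs found under the given directory;
# with no output_file, the combined CSV lands at ./sdgym-cache/results.csv.
collect_results('./sdgym-cache')
```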