From 10bfc29826bf9271e39572627e7cab4ef81e73ae Mon Sep 17 00:00:00 2001
From: Vasu Jaganath
Date: Mon, 27 Jan 2025 09:52:06 -0500
Subject: [PATCH] add (minimum) pass through flags for Toil

Expose a minimal set of Toil options as `--toil_*` command line flags in
cli.py and forward their values on the Toil command line assembled in
run_local().

---
Review notes (ignored by `git am`, not part of the commit message):

* `--toil_defaulDisk` is renamed to `--toil_defaultDisk` and `--batchLogsDir`
  to `--toil_batchLogsDir`, so that the argparse attribute names match what
  run_local.py reads (`args.toil_defaultDisk`, `args.toil_batchLogsDir`).
* run_local.py now reads `args.toil_defaultMemory`; argparse attribute names
  are case-sensitive, so `args.toil_defaultmemory` raised AttributeError.
* The duplicated INFO in the `--toil_logLevel` help text is removed.
* The line layout of this patch was reconstructed from a whitespace-collapsed
  copy. Blank context lines and indentation are inferred from the hunk
  counts; the second hunk carries only the context that was still visible.
  Verify against the target tree before applying. The post-image blob ids on
  the `index` lines predate these fixes.

 src/sophios/cli.py       | 27 +++++++++++++++++++++++++++
 src/sophios/run_local.py | 33 ++++++++++++++++++++++++++++++++-
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/src/sophios/cli.py b/src/sophios/cli.py
index 17808e36..f4cb9730 100644
--- a/src/sophios/cli.py
+++ b/src/sophios/cli.py
@@ -125,6 +125,33 @@ parser.add_argument('--custom_net', type=str, required=False,
                     help='Passes --custom-net flag to cwltool.')
 
 
+# Toil pass through flags
+parser.add_argument('--toil_batchsystem', type=str, required=False, default='slurm',
+                    help='pass through flag for toil batch system')
+parser.add_argument('--toil_defaultMemory', type=str, required=False, default='10GB',
+                    help='pass through flag for toil default memory')
+parser.add_argument('--toil_defaultDisk', type=str, required=False, default='10Gi',
+                    help='pass through flag for toil default disk usage')
+parser.add_argument('--toil_maxCores', type=int, required=False, default=128,
+                    help='pass through flag for toil max number of cpu core usage')
+parser.add_argument('--toil_maxLocalJobs', type=int, required=False, default=128,
+                    help='pass through flag for toil max number of local jobs')
+parser.add_argument('--toil_slurmArgs', type=str, required=False,
+                    default='--export=ALL --partition=normal_cpu --error=serror.txt',
+                    help='pass through flag for toil slurm arguments')
+parser.add_argument('--toil_defaultCores', type=int, required=False, default=2,
+                    help='pass through flag for toil default number of cores per job')
+parser.add_argument('--toil_logLevel', type=str, required=False, default='CRITICAL',
+                    help='pass through flag for toil log level (OFF, CRITICAL, ERROR, WARN, INFO, DEBUG)')
+parser.add_argument('--toil_clusterStats', type=str, required=False, default='clusterStats.json',
+                    help='pass through flag for toil dumping cluster stats')
+parser.add_argument('--toil_coordinationDir', type=str, required=False, default=str(Path.cwd()),
+                    help='pass through flag for toil coordinationDir')
+parser.add_argument('--toil_workDir', type=str, required=False, default=str(Path.cwd()),
+                    help='pass through flag for toil workDir')
+parser.add_argument('--toil_batchLogsDir', type=str, required=False, default=str(Path.cwd() / 'slurmlogs'),
+                    help='pass through flag for toil dumping directory for batch logs')
+
 def get_args(yaml_path: str = '', suppliedargs: list[str] = []) -> argparse.Namespace:
     """This is used to get mock command line arguments, default + suppled args
 
diff --git a/src/sophios/run_local.py b/src/sophios/run_local.py
index a4244361..5b68bc2f 100644
--- a/src/sophios/run_local.py
+++ b/src/sophios/run_local.py
@@ -250,7 +250,38 @@ def run_local(args: argparse.Namespace, rose_tree: RoseTree, cachedir: Optional[
         date_time = now.strftime("%Y%m%d%H%M%S")
         cmd += ['--outdir', f'outdir_toil_{yaml_stem}_{date_time}',
                 '--jobStore', f'file:./jobStore_{yaml_stem}',  # NOTE: This is the equivalent of --cachedir
+                '--batchSystem', args.toil_batchsystem,
+                # The default amount of memory to request for a job (in bytes), by default is 2^31 = 2 gigabytes
+                # Not applied to the CWL jobs. The CWL CLTs should define RamMin in the ResourceRequirement section
+                '--defaultMemory', args.toil_defaultMemory,
+                '--defaultDisk', args.toil_defaultDisk,
+                # Not giving users these choices is better?
+                '--statePollingWait', '0',  # See https://github.com/DataBiosphere/toil/pull/4471
+                '--retryCount', '0',  # There appear to be random errors due to the Panasas network file system.
+                # Number of times to retry a failing job before giving
+                # up and labeling job failed. default=0
+                '--maxCores', str(args.toil_maxCores),
+                '--maxLocalJobs', str(args.toil_maxLocalJobs),
+                '--disableAutoDeployment', 'True',
+                # Should auto-deployment of the user script be deactivated?
+                # If True, the user script/package should be present
+                # at the same location on all workers. Default = False.
+                '--stats',
+                # Records statistics about the toil workflow to be used by 'toil stats'.
+                '--clusterStats', args.toil_clusterStats,
+                '--workDir', args.toil_workDir,  # "This directory needs to exist on all machines running jobs."
+                # i.e. /run/user/$UID/coorddir This is a local /tmpfs (in-memory) NOT NFS
+                # "Absolute path to directory where Toil will keep state and lock files."
+                '--coordinationDir', args.toil_coordinationDir,
+                '--disableCaching',  # THIS NEEDS MORE DATA
+                '--disableProgress',  # disable the progress bar in the terminal, saves UI cycle
                 # TODO: Check --clean, --cleanWorkDir, --restart
-                '--clean', 'always',  # This effectively disables caching, but is reproducible
+                # NOTE: --clean was 'always' before this change; it is now 'never' (see below).
+                '--cleanWorkDir', 'never',
+                '--clean', 'never',
+                '--slurmArgs', args.toil_slurmArgs,
+                '--defaultCores', str(args.toil_defaultCores),
+                '--logLevel', args.toil_logLevel,  # for debugging
+                '--batchLogsDir', args.toil_batchLogsDir,
                 f'autogenerated/{yaml_stem}.cwl', f'autogenerated/{yaml_stem}_inputs.yml']
         cmdline = ' '.join(cmd)