From b5f4c495d549e700eefdb0e547536181e0293ab0 Mon Sep 17 00:00:00 2001 From: Michael Hucka Date: Wed, 23 Jan 2019 08:55:59 -0800 Subject: [PATCH] Don't use multiple processes for small bags Turns out that if you hand bagit.make_bag() a number of processes > 1, it will always create a multiprocess pool of that size, and creating that pool is slow. The new code checks that the number of items in a bag directory is at least the number of processes requested before it uses multiple processes. --- eprints2bags/__main__.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/eprints2bags/__main__.py b/eprints2bags/__main__.py index 8133446..f893d29 100755 --- a/eprints2bags/__main__.py +++ b/eprints2bags/__main__.py @@ -539,10 +539,18 @@ def password(prompt): return sys.stdin.readline().rstrip() -def bag_and_archive(directory, action, archive_fmt, procs, xml, api_url, say): +def bag_and_archive(directory, action, archive_fmt, processes, xml, url, say): # If xml != None, we're dealing with a record, else the top-level directory. if action != 'none': say.info('Making bag out of {}', directory) + # Don't use large # of processes b/c creating the process pool is + # expensive. If procs = 32 and most of our records have only 1-2 + # files, make_bag() will still create a pool of 32 each time. The + # following tries to balance things out for the most common case. + # Note: this uses listdir to avoid walking down the directory tree, + # but if a given entry is the root of a large subdirectory, then this + # may fail to use multiple processes when it would be good to do so. + procs = 1 if len(os.listdir(directory)) < processes else processes bag = bagit.make_bag(directory, checksums = _BAG_CHECKSUMS, processes = procs) if xml != None: # The official_url field is not always present in the record. 
@@ -555,7 +563,7 @@ def bag_and_archive(directory, action, archive_fmt, procs, xml, api_url, say): bag.info['External-Description'] = 'Single EPrints record and associated document files' else: # Case: the overall bag for the whole directory - bag.info['External-Identifier'] = api_url + bag.info['External-Identifier'] = url bag.info['External-Description'] = 'Collection of EPrints records and their associated document files' bag.save() if __debug__: log('Verifying bag {}', bag.path) @@ -564,7 +572,7 @@ def bag_and_archive(directory, action, archive_fmt, procs, xml, api_url, say): if action == 'bag-and-archive': archive_file = directory + archive_extension(archive_fmt) say.info('Making archive file {}', archive_file) - comments = file_comments(bag) if xml != None else dir_comments(bag, api_url) + comments = file_comments(bag) if xml != None else dir_comments(bag, url) create_archive(archive_file, archive_fmt, directory, comments) if __debug__: log('Verifying archive file {}', archive_file) verify_archive(archive_file, archive_fmt) @@ -590,14 +598,14 @@ def file_comments(bag): return text -def dir_comments(bag, api_url): +def dir_comments(bag, url): text = '~ '*35 text += '\n' text += 'About this ZIP archive file:\n' text += '\n' text += 'This archive contains a directory of files organized in BagIt v{} format.\n'.format(bag.version) text += 'The data files are the contents of EPrints records obtained from\n' - text += api_url + text += url text += '\n' text += software_comments() text += '\n'