Skip to content

Commit

Permalink
Don't use multiple processes for small bags
Browse files Browse the repository at this point in the history
It turns out that if you pass bagit.make_bag() a process count greater
than 1, it will always create a multiprocess pool of that size, and
creating that pool is slow. The new code checks that the number of items
in a bag directory exceeds the number of processes requested before it
uses multiple processes.
  • Loading branch information
mhucka committed Jan 23, 2019
1 parent a54ba64 commit b5f4c49
Showing 1 changed file with 13 additions and 5 deletions.
18 changes: 13 additions & 5 deletions eprints2bags/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,10 +539,18 @@ def password(prompt):
return sys.stdin.readline().rstrip()


def bag_and_archive(directory, action, archive_fmt, procs, xml, api_url, say):
def bag_and_archive(directory, action, archive_fmt, processes, xml, url, say):
# If xml != None, we're dealing with a record, else the top-level directory.
if action != 'none':
say.info('Making bag out of {}', directory)
# Don't use large # of processes b/c creating the process pool is
# expensive. If procs = 32 and most of our records have only 1-2
# files, make_bag() will still create a pool of 32 each time. The
# following tries to balance things out for the most common case.
# Note: this uses listdir to avoid walking down the directory tree,
# but if a given entry is the root of a large subdirectory, then this
# may fail to use multiple processes when it would be good to do so.
procs = 1 if len(os.listdir(directory)) < processes else processes
bag = bagit.make_bag(directory, checksums = _BAG_CHECKSUMS, processes = procs)
if xml != None:
# The official_url field is not always present in the record.
Expand All @@ -555,7 +563,7 @@ def bag_and_archive(directory, action, archive_fmt, procs, xml, api_url, say):
bag.info['External-Description'] = 'Single EPrints record and associated document files'
else:
# Case: the overall bag for the whole directory
bag.info['External-Identifier'] = api_url
bag.info['External-Identifier'] = url
bag.info['External-Description'] = 'Collection of EPrints records and their associated document files'
bag.save()
if __debug__: log('Verifying bag {}', bag.path)
Expand All @@ -564,7 +572,7 @@ def bag_and_archive(directory, action, archive_fmt, procs, xml, api_url, say):
if action == 'bag-and-archive':
archive_file = directory + archive_extension(archive_fmt)
say.info('Making archive file {}', archive_file)
comments = file_comments(bag) if xml != None else dir_comments(bag, api_url)
comments = file_comments(bag) if xml != None else dir_comments(bag, url)
create_archive(archive_file, archive_fmt, directory, comments)
if __debug__: log('Verifying archive file {}', archive_file)
verify_archive(archive_file, archive_fmt)
Expand All @@ -590,14 +598,14 @@ def file_comments(bag):
return text


def dir_comments(bag, api_url):
def dir_comments(bag, url):
text = '~ '*35
text += '\n'
text += 'About this ZIP archive file:\n'
text += '\n'
text += 'This archive contains a directory of files organized in BagIt v{} format.\n'.format(bag.version)
text += 'The data files are the contents of EPrints records obtained from\n'
text += api_url
text += url
text += '\n'
text += software_comments()
text += '\n'
Expand Down

0 comments on commit b5f4c49

Please sign in to comment.