Skip to content

Commit

Permalink
support linking files in place instead of copying
Browse files Browse the repository at this point in the history
This allows users to manage collections of large WARC files without
duplicating space. Hardlinks are used instead of symlinks to reflect
the original mechanism, where the file is copied (so it can be safely
removed from the source). If we used symlinks, we would break that
expectation, which could lead to data loss.

Conversely, hardlinks can lead to data loss as well. For example, pywb
could somehow edit the file, which would modify the original as
well. But we assume here pywb does not modify the file, and each side
of the hardlink can have its own permissions to ensure this (or not)
as well.

Closes: webrecorder#408
  • Loading branch information
anarcat committed Nov 13, 2018
1 parent 1b151b7 commit 22f20e6
Showing 1 changed file with 13 additions and 4 deletions.
17 changes: 13 additions & 4 deletions pywb/manager/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,17 +108,24 @@ def _assert_coll_exists(self):
'To create a new collection, run\n\n{1} init {0}')
raise IOError(msg.format(self.coll_name, sys.argv[0]))

def add_warcs(self, warcs):
def add_warcs(self, warcs, hardlink=False):
if not os.path.isdir(self.archive_dir):
raise IOError('Directory {0} does not exist'.
format(self.archive_dir))

full_paths = []
for filename in warcs:
filename = os.path.abspath(filename)
shutil.copy2(filename, self.archive_dir)
if hardlink:
os.link(filename, os.path.join(self.archive_dir,
os.path.basename(filename)))
else:
shutil.copy2(filename, self.archive_dir)
full_paths.append(os.path.join(self.archive_dir, filename))
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
logging.info('%s %s to %s',
hardlink and 'Linked' or 'Copied',
filename,
self.archive_dir)

self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE)

Expand Down Expand Up @@ -357,12 +364,14 @@ def do_list(r):
# Add Warcs
def do_add(r):
m = CollectionsManager(r.coll_name)
m.add_warcs(r.files)
m.add_warcs(r.files, r.hardlink)

addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex'
addwarc = subparsers.add_parser('add', help=addwarc_help)
addwarc.add_argument('coll_name')
addwarc.add_argument('files', nargs='+')
addwarc.add_argument('--hardlink', '-l', action='store_true',
help='hardlink files into storage instead of copying')
addwarc.set_defaults(func=do_add)

# Reindex All
Expand Down

0 comments on commit 22f20e6

Please sign in to comment.