From 08ec302c56d7463c3bff4817a60d6063eba93741 Mon Sep 17 00:00:00 2001
From: Michael Rossol
Date: Tue, 21 Apr 2020 09:41:51 -0600
Subject: [PATCH 1/5] hsload -> mode x to a

load_file -> check if obj exists before creating
---
 h5pyd/_apps/hsload.py  |  8 ++++----
 h5pyd/_apps/utillib.py | 23 ++++++++++++++---------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/h5pyd/_apps/hsload.py b/h5pyd/_apps/hsload.py
index 1f04a712..b64a05fb 100755
--- a/h5pyd/_apps/hsload.py
+++ b/h5pyd/_apps/hsload.py
@@ -257,7 +257,7 @@ def main():
         if h5py.version.hdf5_version_tuple[0] != 1 or h5py.version.hdf5_version_tuple[1] != 10 or h5py.version.hdf5_version_tuple[2] < 6:
             sys.stderr.write("link option requires hdf5 lib version 1.10.6 or higher")
             sys.exit(1)
-
+
     try:
@@ -286,7 +286,7 @@ def main():
             if not S3FS_IMPORT:
                 sys.stderr.write("Install S3FS package to load s3 files")
                 sys.exit(1)
-
+
             if not s3:
                 s3 = s3fs.S3FileSystem()
             try:
@@ -297,7 +297,7 @@ def main():
         else:
             if dataload == "link":
                 if op.isabs(src_file):
-                    sys.stderr.write("source file must s3path (for HSDS using S3 storage) or relative path from server root directory (for HSDS using posix storage)")
+                    sys.stderr.write("source file must s3path (for HSDS using S3 storage) or relative path from server root directory (for HSDS using posix storage)")
                     sys.exit(1)
                 s3path = src_file
             else:
@@ -315,7 +315,7 @@ def main():
         endpoint = cfg["hs_endpoint"]
         bucket = cfg["hs_bucket"]
 
-        fout = h5pyd.File(tgt, 'x', endpoint=endpoint, username=username, password=password, bucket=bucket)
+        fout = h5pyd.File(tgt, 'a', endpoint=endpoint, username=username, password=password, bucket=bucket)
     except IOError as ioe:
         if ioe.errno == 404:
             logging.error("Domain: {} not found".format(tgt))
diff --git a/h5pyd/_apps/utillib.py b/h5pyd/_apps/utillib.py
index f3255da0..7d1f55d3 100755
--- a/h5pyd/_apps/utillib.py
+++ b/h5pyd/_apps/utillib.py
@@ -261,6 +261,7 @@ def copy_array(src_arr, ctx):
 
 #----------------------------------------------------------------------------------
 def copy_attribute(desobj, name, srcobj, ctx):
+
     msg = "creating attribute {} in {}".format(name, srcobj.name)
     logging.debug(msg)
     if ctx["verbose"]:
@@ -627,16 +628,20 @@ def load_file(fin, fout, verbose=False, dataload="ingest", s3path=None, deflate=
         copy_attribute(fout, ga, fin, ctx)
 
     def object_create_helper(name, obj):
-        class_name = obj.__class__.__name__
-
-        if class_name in ("Dataset", "Table"):
-            create_dataset(obj, ctx)
-        elif class_name == "Group":
-            create_group(obj, ctx)
-        elif class_name == "Datatype":
-            create_datatype(obj, ctx)
+        if name in fout:
+            logger.warning('{} already exists and will be skipped'
+                           .format(name))
         else:
-            logging.error("no handler for object class: {}".format(type(obj)))
+            class_name = obj.__class__.__name__
+
+            if class_name in ("Dataset", "Table"):
+                create_dataset(obj, ctx)
+            elif class_name == "Group":
+                create_group(obj, ctx)
+            elif class_name == "Datatype":
+                create_datatype(obj, ctx)
+            else:
+                logging.error("no handler for object class: {}".format(type(obj)))
 
     def object_link_helper(name, obj):
         class_name = obj.__class__.__name__
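Patch 1 swaps the h5pyd.File open mode from 'x' (create, fail if the target
domain exists) to 'a' (open if it exists, create otherwise), and pairs it with
the new `name in fout` check so objects already present in the domain are
skipped rather than recreated. A minimal sketch of the behavior this enables
(the domain path and group name are hypothetical, not taken from the patch):

    import h5pyd

    # mode 'x' would raise if the domain already exists; 'a' opens or creates
    # it, so a second run against the same domain can pick up where it left off
    with h5pyd.File("/home/myuser/data.h5", "a") as fout:
        if "g1" not in fout:         # the existence test object_create_helper now performs
            fout.create_group("g1")  # only created on the first run
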
From 542c90e321df7b0cc1e62689c55e1f7a210c676a Mon Sep 17 00:00:00 2001
From: Michael Rossol
Date: Wed, 27 May 2020 15:21:16 -0600
Subject: [PATCH 2/5] add append option

consolidate object helpers into a single function to allow skipping
existing objects
---
 h5pyd/_apps/hsload.py  | 17 +++++---
 h5pyd/_apps/utillib.py | 92 ++++++++++++++++++------------------
 2 files changed, 51 insertions(+), 58 deletions(-)

diff --git a/h5pyd/_apps/hsload.py b/h5pyd/_apps/hsload.py
index b64a05fb..050fc21b 100755
--- a/h5pyd/_apps/hsload.py
+++ b/h5pyd/_apps/hsload.py
@@ -98,6 +98,7 @@ def usage():
     print(" -e | --endpoint :: The HDF Server endpoint, e.g. http://hsdshdflab.hdfgroup.org")
     print(" -u | --user :: User name credential")
     print(" -p | --password :: Password credential")
+    print(" -a | --append :: Flag to append to an existing HDF Server domain")
     print(" -c | --conf :: A credential and config file")
     print(" -z[n] :: apply compression filter to any non-compressed datasets, n: [0-9]")
     print(" --cnf-eg :: Print a config file and then exit")
@@ -135,6 +136,7 @@ def main():
     logfname=None
     ipvfam=None
     s3 = None # s3fs instance
+    mode = ''
 
     src_files = []
     argn = 1
@@ -147,8 +149,10 @@ def main():
             sys.stderr.write("options must precede source files")
             usage()
             sys.exit(-1)
+
         if len(sys.argv) > argn + 1:
-            val = sys.argv[argn+1]
+            val = sys.argv[argn + 1]
+
         if arg in ("-v", "--verbose"):
             verbose = True
             argn += 1
@@ -200,6 +204,9 @@ def main():
         elif arg in ("-p", "--password"):
             cfg["hs_password"] = val
             argn += 2
+        elif arg in ("-a", "--append"):
+            mode = 'a'
+            argn += 1
         elif arg == '--cnf-eg':
             print_config_example()
             sys.exit(0)
@@ -263,15 +270,15 @@ def main():
 
     for src_file in src_files:
         # check if this is a non local file, if it is remote (http, etc...) stage it first then insert it into hsds
-        src_file_chk = urlparse(src_file)
+        src_file_chk = urlparse(src_file)
         logging.debug(src_file_chk)
 
         if src_file_chk.scheme == 'http' or src_file_chk.scheme == 'https' or src_file_chk.scheme == 'ftp':
             src_file = stage_file(src_file, netfam=ipvfam)
-            if src_file == None:
+            if src_file is None:
                 continue
             istmp = True
-            logging.info('temp source data: '+str(src_file))
+            logging.info('temp source data: ' + str(src_file))
         else:
             istmp = False
@@ -315,7 +322,7 @@ def main():
         endpoint = cfg["hs_endpoint"]
         bucket = cfg["hs_bucket"]
 
-        fout = h5pyd.File(tgt, 'a', endpoint=endpoint, username=username, password=password, bucket=bucket)
+        fout = h5pyd.File(tgt, mode, endpoint=endpoint, username=username, password=password, bucket=bucket)
     except IOError as ioe:
         if ioe.errno == 404:
             logging.error("Domain: {} not found".format(tgt))
diff --git a/h5pyd/_apps/utillib.py b/h5pyd/_apps/utillib.py
index 7d1f55d3..9918af57 100755
--- a/h5pyd/_apps/utillib.py
+++ b/h5pyd/_apps/utillib.py
@@ -532,8 +532,8 @@ def create_links(gsrc, gdes, ctx):
                 gdes[title] = des_obj
             else:
                 # TBD - in hdf5 1.10 it seems that two references to the same object
-                # can return different id's. This will cause HDF5 files with
-                # multilinks to not load correctly
+                # can return different id's. This will cause HDF5 files with
+                # multilinks to not load correctly
                 msg = "could not find map item to src id: {}".format(src_obj_id_hash)
                 logging.warn(msg)
                 if ctx["verbose"]:
@@ -622,83 +622,69 @@ def load_file(fin, fout, verbose=False, dataload="ingest", s3path=None, deflate=
     ctx["s3path"] = s3path
     ctx["srcid_desobj_map"] = {}
 
-    # create any root attributes
     for ga in fin.attrs:
         copy_attribute(fout, ga, fin, ctx)
 
-    def object_create_helper(name, obj):
+    # create root soft/external links
+    create_links(fin, fout, ctx)
+
+    def object_helper(name, obj):
         if name in fout:
             logger.warning('{} already exists and will be skipped'
                            .format(name))
         else:
             class_name = obj.__class__.__name__
-
             if class_name in ("Dataset", "Table"):
-                create_dataset(obj, ctx)
+                dset = self.create_dataset(name, obj)
+
+                if dset is not None:
+                    for da in obj.attrs:
+                        self.copy_attribute(dset, da, obj)
+
+                    if dataload == "ingest":
+                        logging.debug("object_copy_helper for object: {}".format(obj.name))
+                        if ctx["dataload"] == "link":
+                            logging.info("skip datacopy for link reference")
+                        else:
+                            logging.debug("calling write_dataset for dataset: {}".format(obj.name))
+                            tgt = fout[obj.name]
+                            write_dataset(obj, tgt, ctx)
+
             elif class_name == "Group":
-                create_group(obj, ctx)
+                grp = self.create_group(name, obj)
+
+                if grp is not None:
+                    for ga in obj.attrs:
+                        self.copy_attribute(grp, ga, obj)
+
+                    # create any soft/external links
+                    logging.debug("object_link_helper for object: {}".format(obj.name))
+                    fout = ctx["fout"]
+                    grp = fout[name]
+                    create_links(obj, grp, ctx)
             elif class_name == "Datatype":
-                create_datatype(obj, ctx)
+                self.create_datatype(obj)
             else:
-                logging.error("no handler for object class: {}".format(type(obj)))
-
-    def object_link_helper(name, obj):
-        class_name = obj.__class__.__name__
-        logging.debug("object_link_helper for object: {}".format(obj.name))
-        if class_name == "Group":
-            # create any soft/external links
-            fout = ctx["fout"]
-            grp = fout[name]
-            create_links(obj, grp, ctx)
-
-    def object_copy_helper(name, obj):
-        class_name = obj.__class__.__name__
-        logging.debug("object_copy_helper for object: {}".format(obj.name))
-        if class_name in ("Dataset", "Table"):
-            if ctx["dataload"] == "link":
-                logging.info("skip datacopy for link reference")
-            else:
-                logging.debug("calling write_dataset for dataset: {}".format(obj.name))
-                tgt = fout[obj.name]
-                write_dataset(obj, tgt, ctx)
-        elif class_name == "Group":
-            logging.debug("skip copy for group: {}".format(obj.name))
-        elif class_name == "Datatype":
-            logging.debug("skip copy for datatype: {}".format(obj.name))
-        else:
-            logging.error("no handler for object class: {}".format(type(obj)))
-
-    def object_attribute_helper(name, obj):
-        tgt = fout[obj.name]
-        for ga in obj.attrs:
-            copy_attribute(tgt, ga, obj, ctx)
+                logger.error("no handler for object class: {}"
+                             .format(type(obj)))
 
     # build a rough map of the file using the internal function above
-    logging.info("creating target objects")
-    fin.visititems(object_create_helper)
-    # copy over any attributes
-    logging.info("creating target attributes")
-    fin.visititems(object_attribute_helper)
-    # create soft/external links (and hardlinks not already created)
-    create_links(fin, fout, ctx) # create root soft/external links
-    fin.visititems(object_link_helper)
-
+    logging.info("creating target objects and attributes")
     if dataload == "ingest":
         # copy dataset data
        logging.info("copying dataset data")
-        fin.visititems(object_copy_helper)
-    else:
-        logging.info("skipping dataset data copy (dataload is None or 'link')")
+
+    fin.visititems(object_helper)
 
     # Fully flush the h5py handle.
     fout.close()
 
     # close up the source domain, see reason(s) for this below
     fin.close()
 
-    msg="load_file complete"
+    msg = "load_file complete"
     logging.info(msg)
     if verbose:
         print(msg)

From a3212a960ac1174d7de0b4f6ad9b3c44be5a92ea Mon Sep 17 00:00:00 2001
From: Michael Rossol
Date: Mon, 15 Jun 2020 11:57:18 -0600
Subject: [PATCH 3/5] fix default mode='w'

---
 h5pyd/_apps/hsload.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/h5pyd/_apps/hsload.py b/h5pyd/_apps/hsload.py
index 438bcb92..1e823159 100755
--- a/h5pyd/_apps/hsload.py
+++ b/h5pyd/_apps/hsload.py
@@ -152,7 +152,7 @@ def main():
     logfname=None
     ipvfam=None
     s3 = None # s3fs instance
-    mode = ''
+    mode = 'w'
 
     src_files = []
     argn = 1

From c05af35194410c4a7347c0de172b16084b59f6f3 Mon Sep 17 00:00:00 2001
From: Michael Rossol
Date: Tue, 16 Jun 2020 08:43:48 -0600
Subject: [PATCH 4/5] fix object_helper bugs

---
 h5pyd/_apps/utillib.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/h5pyd/_apps/utillib.py b/h5pyd/_apps/utillib.py
index 3394bd6e..6ff47fe7 100755
--- a/h5pyd/_apps/utillib.py
+++ b/h5pyd/_apps/utillib.py
@@ -437,9 +437,8 @@ def create_dataset(dobj, ctx):
     except (IOError, TypeError, KeyError) as e:
         msg = "ERROR: failed to create dataset: {}".format(str(e))
         logging.error(msg)
-        print(msg)
-        return
 
+    return dset
 # create_dataset

#----------------------------------------------------------------------------------
@@ -583,6 +582,8 @@ def create_group(gobj, ctx):
         logging.debug("adding group id {} to {} in srcid_desobj_map".format(gobj.id.id, grp))
         srcid_desobj_map[gobj.id.__hash__()] = grp
 
+    return grp
+
 # create_group

#----------------------------------------------------------------------------------
@@ -642,11 +643,11 @@ def object_helper(name, obj):
         else:
             class_name = obj.__class__.__name__
             if class_name in ("Dataset", "Table"):
-                dset = self.create_dataset(name, obj)
+                dset = create_dataset(obj, ctx)
 
                 if dset is not None:
                     for da in obj.attrs:
-                        self.copy_attribute(dset, da, obj)
+                        copy_attribute(dset, da, obj, ctx)
 
                     if dataload == "ingest":
                         logging.debug("object_copy_helper for object: {}".format(obj.name))
@@ -658,11 +659,11 @@
                            write_dataset(obj, tgt, ctx)
 
             elif class_name == "Group":
-                grp = self.create_group(name, obj)
+                grp = create_group(obj, ctx)
 
                 if grp is not None:
                     for ga in obj.attrs:
-                        self.copy_attribute(grp, ga, obj)
+                        copy_attribute(grp, ga, obj, ctx)
 
                     # create any soft/external links
                     logging.debug("object_link_helper for object: {}".format(obj.name))
@@ -670,7 +671,7 @@
                     grp = fout[name]
                     create_links(obj, grp, ctx)
             elif class_name == "Datatype":
-                self.create_datatype(obj)
+                create_datatype(obj, ctx)
             else:
                 logger.error("no handler for object class: {}"
                              .format(type(obj)))

From 4d0f45a4632b909953ee6bcdbe3832f87b50de1a Mon Sep 17 00:00:00 2001
From: Michael Rossol
Date: Tue, 16 Jun 2020 10:08:12 -0600
Subject: [PATCH 5/5] fix fout error

---
 h5pyd/_apps/utillib.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/h5pyd/_apps/utillib.py b/h5pyd/_apps/utillib.py
index 6ff47fe7..32c3b7d3 100755
--- a/h5pyd/_apps/utillib.py
+++ b/h5pyd/_apps/utillib.py
@@ -637,6 +637,7 @@ def load_file(fin, fout, verbose=False, dataload="ingest", s3path=None, deflate=
     create_links(fin, fout, ctx)
 
     def object_helper(name, obj):
+        fout = ctx['fout']
         if name in fout:
             logger.warning('{} already exists and will be skipped'
                            .format(name))
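With the full series applied, hsload writes a fresh domain by default (mode
'w') and appends to an existing one when -a/--append is given (mode 'a'),
while load_file makes a single visititems() pass with object_helper, skipping
any object whose name already exists in the target. A minimal sketch of that
load path under the patched code (file and domain names are hypothetical;
per the diff, load_file closes both handles itself):

    import h5py
    import h5pyd
    from h5pyd._apps.utillib import load_file

    fin = h5py.File("data2.h5", "r")                # local HDF5 source
    fout = h5pyd.File("/home/myuser/data.h5", "a")  # 'a' is what -a/--append now passes
    load_file(fin, fout, verbose=True, dataload="ingest")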